aco/ra: move variables from affinity register to avoid waitcnt

If we don't use this affinity register, we're likely to end up moving the
temporary later anyway. If the temporary is the destination of a memory
instruction, moving it later is probably more expensive than just copying
the blocking variables now.

fossil-db (navi31):
Totals from 504 (0.63% of 79825) affected shaders:
Instrs: 4108284 -> 4109026 (+0.02%); split: -0.01%, +0.03%
CodeSize: 21226764 -> 21229764 (+0.01%); split: -0.01%, +0.02%
Latency: 26931635 -> 26806989 (-0.46%); split: -0.47%, +0.00%
InvThroughput: 8443520 -> 8439235 (-0.05%); split: -0.06%, +0.01%
VClause: 99209 -> 99314 (+0.11%); split: -0.00%, +0.11%
SClause: 85089 -> 85085 (-0.00%)
Copies: 340323 -> 340993 (+0.20%); split: -0.06%, +0.26%
Branches: 117225 -> 117209 (-0.01%); split: -0.02%, +0.00%
VALU: 2421859 -> 2422529 (+0.03%); split: -0.01%, +0.04%
SALU: 503465 -> 503470 (+0.00%); split: -0.00%, +0.00%

fossil-db (navi21):
Totals from 582 (0.73% of 79825) affected shaders:
Instrs: 3714908 -> 3714990 (+0.00%); split: -0.02%, +0.02%
CodeSize: 19977880 -> 19973076 (-0.02%); split: -0.04%, +0.01%
VGPRs: 40480 -> 40496 (+0.04%)
Latency: 26028895 -> 25772711 (-0.98%); split: -0.99%, +0.00%
InvThroughput: 9827389 -> 9818194 (-0.09%); split: -0.10%, +0.01%
VClause: 103702 -> 103815 (+0.11%); split: -0.02%, +0.13%
SClause: 90861 -> 90857 (-0.00%)
Copies: 335276 -> 335992 (+0.21%); split: -0.09%, +0.30%
Branches: 123912 -> 123897 (-0.01%); split: -0.02%, +0.00%
VALU: 2466032 -> 2466748 (+0.03%); split: -0.01%, +0.04%
SALU: 533658 -> 533667 (+0.00%); split: -0.00%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38262>
This commit is contained in:
Rhys Perry 2025-11-13 11:10:26 +00:00 committed by Marge Bot
parent 681ec4cba7
commit 310f588f92

View file

@ -1214,14 +1214,9 @@ find_vars(ra_ctx& ctx, const RegisterFile& reg_file, const PhysRegInterval reg_i
return vars;
}
/* collect variables from a register area and clear reg_file
* variables are sorted in decreasing size and
* increasing assigned register
*/
std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
void
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, std::vector<unsigned>& ids)
{
std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
std::sort(ids.begin(), ids.end(),
[&](unsigned a, unsigned b)
{
@ -1235,6 +1230,17 @@ collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_inte
assignment& var = ctx.assignments[id];
reg_file.clear(var.reg, var.rc);
}
}
/* collect variables from a register area and clear reg_file
 * variables are sorted in decreasing size and
 * increasing assigned register
 */
std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   /* Gather every variable assigned inside the interval, then delegate to the
    * overload that sorts the ids and clears their registers from reg_file. */
   auto collected = find_vars(ctx, reg_file, reg_interval);
   collect_vars(ctx, reg_file, collected);
   return collected;
}
@ -1982,6 +1988,49 @@ should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file)
return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size;
}
/* Try to assign the register that temp has an affinity to.
 *
 * Returns the affinity register on success, or an empty optional if the
 * affinity register cannot (or should not) be used. For a definition
 * (operand_index == -1) whose affinity register is currently blocked, this may
 * move the blocking variables elsewhere by appending parallelcopies — but only
 * when every blocking variable is cheaper to move than this temporary
 * (compared via assignment weights).
 */
std::optional<PhysReg>
get_reg_affinity(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
                 std::vector<parallelcopy>& parallelcopies, aco_ptr<Instruction>& instr,
                 int operand_index, assignment& affinity)
{
   /* check if the target register is blocked */
   if (operand_index == -1 && reg_file.test(affinity.reg, temp.bytes())) {
      /* It is cheaper to just assign a different register. */
      if (ctx.assignments[temp.id()].weight == 0)
         return {};

      const PhysRegInterval def_regs{PhysReg(affinity.reg.reg()), temp.size()};
      std::vector<unsigned> vars = find_vars(ctx, reg_file, def_regs);

      /* Bail if the cost of moving the blocking var is likely more expensive
       * than assigning a different register.
       */
      if (std::any_of(vars.begin(), vars.end(), [&](unsigned id) -> bool
                      { return ctx.assignments[id].weight >= ctx.assignments[temp.id()].weight; }))
         return {};

      RegisterFile tmp_file(reg_file);
      collect_vars(ctx, tmp_file, vars);

      /* re-enable the killed operands, so that we don't move the blocking vars there */
      if (!is_phi(instr))
         tmp_file.fill_killed_operands(instr.get());

      /* create parallelcopy to move blocking vars */
      std::vector<parallelcopy> pc;
      /* Stage the copies into the local vector so that a failed attempt leaves
       * the caller's parallelcopies untouched; commit only on success. */
      if (get_reg_specified(ctx, tmp_file, temp.regClass(), instr, affinity.reg, operand_index) &&
          get_regs_for_copies(ctx, tmp_file, pc, vars, instr, def_regs)) {
         parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
         return affinity.reg;
      }
   } else if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg,
                                operand_index)) {
      return affinity.reg;
   }
   return {};
}
PhysReg
get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
std::vector<parallelcopy>& parallelcopies, aco_ptr<Instruction>& instr,
@ -2008,11 +2057,15 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
}
}
std::optional<PhysReg> res;
if (ctx.assignments[temp.id()].affinity) {
assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
if (affinity.assigned) {
if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg, operand_index))
return affinity.reg;
res =
get_reg_affinity(ctx, reg_file, temp, parallelcopies, instr, operand_index, affinity);
if (res)
return *res;
}
}
if (ctx.assignments[temp.id()].precolor_affinity) {
@ -2021,8 +2074,6 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
return ctx.assignments[temp.id()].reg;
}
std::optional<PhysReg> res;
if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
res = get_reg_vector(ctx, reg_file, temp, instr, operand_index);
if (res)