From 310f588f9260b2e0497c5ccdda7107b80670f3dd Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 13 Nov 2025 11:10:26 +0000 Subject: [PATCH] aco/ra: move variables from affinity register to avoid waitcnt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we don't use this affinity register, we're likely to end up moving the temporary later. If it's a memory instruction destination, that's probably more expensive than just copying the blocking variables. fossil-db (navi31): Totals from 504 (0.63% of 79825) affected shaders: Instrs: 4108284 -> 4109026 (+0.02%); split: -0.01%, +0.03% CodeSize: 21226764 -> 21229764 (+0.01%); split: -0.01%, +0.02% Latency: 26931635 -> 26806989 (-0.46%); split: -0.47%, +0.00% InvThroughput: 8443520 -> 8439235 (-0.05%); split: -0.06%, +0.01% VClause: 99209 -> 99314 (+0.11%); split: -0.00%, +0.11% SClause: 85089 -> 85085 (-0.00%) Copies: 340323 -> 340993 (+0.20%); split: -0.06%, +0.26% Branches: 117225 -> 117209 (-0.01%); split: -0.02%, +0.00% VALU: 2421859 -> 2422529 (+0.03%); split: -0.01%, +0.04% SALU: 503465 -> 503470 (+0.00%); split: -0.00%, +0.00% fossil-db (navi21): Totals from 582 (0.73% of 79825) affected shaders: Instrs: 3714908 -> 3714990 (+0.00%); split: -0.02%, +0.02% CodeSize: 19977880 -> 19973076 (-0.02%); split: -0.04%, +0.01% VGPRs: 40480 -> 40496 (+0.04%) Latency: 26028895 -> 25772711 (-0.98%); split: -0.99%, +0.00% InvThroughput: 9827389 -> 9818194 (-0.09%); split: -0.10%, +0.01% VClause: 103702 -> 103815 (+0.11%); split: -0.02%, +0.13% SClause: 90861 -> 90857 (-0.00%) Copies: 335276 -> 335992 (+0.21%); split: -0.09%, +0.30% Branches: 123912 -> 123897 (-0.01%); split: -0.02%, +0.00% VALU: 2466032 -> 2466748 (+0.03%); split: -0.01%, +0.04% SALU: 533658 -> 533667 (+0.00%); split: -0.00%, +0.00% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_register_allocation.cpp | 73 +++++++++++++++++--- 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index ed3fb487038..5e8bed33284 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1214,14 +1214,9 @@ find_vars(ra_ctx& ctx, const RegisterFile& reg_file, const PhysRegInterval reg_i return vars; } -/* collect variables from a register area and clear reg_file - * variables are sorted in decreasing size and - * increasing assigned register - */ -std::vector -collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) +void +collect_vars(ra_ctx& ctx, RegisterFile& reg_file, std::vector& ids) { - std::vector ids = find_vars(ctx, reg_file, reg_interval); std::sort(ids.begin(), ids.end(), [&](unsigned a, unsigned b) { @@ -1235,6 +1230,17 @@ collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_inte assignment& var = ctx.assignments[id]; reg_file.clear(var.reg, var.rc); } +} + +/* collect variables from a register area and clear reg_file + * variables are sorted in decreasing size and + * increasing assigned register + */ +std::vector +collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) +{ + std::vector ids = find_vars(ctx, reg_file, reg_interval); + collect_vars(ctx, reg_file, ids); return ids; } @@ -1982,6 +1988,49 @@ should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file) return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size; } +std::optional +get_reg_affinity(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, + std::vector& parallelcopies, aco_ptr& instr, + int operand_index, assignment& affinity) +{ + /* check if the target register is blocked */ + if (operand_index == -1 && reg_file.test(affinity.reg, temp.bytes())) { + /* It is cheaper to just assign a different register. */ + if (ctx.assignments[temp.id()].weight == 0) + return {}; + + const PhysRegInterval def_regs{PhysReg(affinity.reg.reg()), temp.size()}; + std::vector vars = find_vars(ctx, reg_file, def_regs); + + /* Bail if the cost of moving the blocking var is likely more expensive + * than assigning a different register. + */ + if (std::any_of(vars.begin(), vars.end(), [&](unsigned id) -> bool + { return ctx.assignments[id].weight >= ctx.assignments[temp.id()].weight; })) + return {}; + + RegisterFile tmp_file(reg_file); + collect_vars(ctx, tmp_file, vars); + + /* re-enable the killed operands, so that we don't move the blocking vars there */ + if (!is_phi(instr)) + tmp_file.fill_killed_operands(instr.get()); + + /* create parallelcopy to move blocking vars */ + std::vector pc; + if (get_reg_specified(ctx, tmp_file, temp.regClass(), instr, affinity.reg, operand_index) && + get_regs_for_copies(ctx, tmp_file, parallelcopies, vars, instr, def_regs)) { + parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); + return affinity.reg; + } + } else if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg, + operand_index)) { + return affinity.reg; + } + + return {}; +} + PhysReg get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, std::vector& parallelcopies, aco_ptr& instr, @@ -2008,11 +2057,15 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, } } + std::optional res; + if (ctx.assignments[temp.id()].affinity) { assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity]; if (affinity.assigned) { - if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg, operand_index)) - return affinity.reg; + res = + get_reg_affinity(ctx, reg_file, temp, parallelcopies, instr, operand_index, affinity); + if (res) + return *res; } } if (ctx.assignments[temp.id()].precolor_affinity) { @@ -2021,8 +2074,6 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, return ctx.assignments[temp.id()].reg; } - std::optional res; - if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) { res = get_reg_vector(ctx, reg_file, temp, instr, operand_index); if (res)