From 563b0b347a50366f4a75f2638d966e42addda7af Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Mon, 15 Sep 2025 12:30:58 +0200 Subject: [PATCH] ir3: don't create merge sets for subreg moves There are multiple places where RA assumes merge sets are either all-full or all-half registers. Creating merge sets for subreg moves mixes full and half registers which may lead to RA failures. Fix this by not creating merge sets for subreg moves anymore. Instead, we manually try to allocate a subreg move's src for its dst when selecting a register during RA, similar to how ALU/SFU instructions try to reuse their srcs. Totals: Instrs: 363174291 -> 363175216 (+0.00%); split: -0.00%, +0.00% CodeSize: 922975364 -> 922977230 (+0.00%); split: -0.00%, +0.00% NOPs: 47652421 -> 47652444 (+0.00%); split: -0.00%, +0.00% MOVs: 15652959 -> 15653065 (+0.00%); split: -0.00%, +0.00% COVs: 4097203 -> 4097052 (-0.00%); split: -0.01%, +0.00% (ss): 7806025 -> 7806183 (+0.00%); split: -0.00%, +0.00% (sy): 3981862 -> 3981855 (-0.00%); split: -0.00%, +0.00% (ss)-stall: 26612057 -> 26612789 (+0.00%); split: -0.00%, +0.00% (sy)-stall: 111568786 -> 111568721 (-0.00%); split: -0.00%, +0.00% STPs: 345796 -> 345792 (-0.00%) LDPs: 191118 -> 191111 (-0.00%) Preamble Instrs: 160491915 -> 160492355 (+0.00%); split: -0.00%, +0.00% Last helper: 116587870 -> 116588273 (+0.00%); split: -0.00%, +0.00% Cat0: 53288367 -> 53288384 (+0.00%); split: -0.00%, +0.00% Cat1: 20954383 -> 20954336 (-0.00%); split: -0.00%, +0.00% Cat2: 155294307 -> 155295252 (+0.00%); split: -0.00%, +0.00% Cat6: 4623070 -> 4623059 (-0.00%) Cat7: 9302363 -> 9302384 (+0.00%); split: -0.00%, +0.00% Totals from 979 (0.07% of 1352016) affected shaders: Instrs: 1324850 -> 1325775 (+0.07%); split: -0.07%, +0.14% CodeSize: 2596114 -> 2597980 (+0.07%); split: -0.04%, +0.11% NOPs: 330197 -> 330220 (+0.01%); split: -0.23%, +0.24% MOVs: 62592 -> 62698 (+0.17%); split: -0.35%, +0.52% COVs: 49011 -> 48860 (-0.31%); split: -0.62%, +0.31% (ss): 35671 -> 35829 (+0.44%); split: -0.28%, +0.73% (sy): 18936 -> 18929 (-0.04%); split: -0.13%, +0.09% (ss)-stall: 157929 -> 158661 (+0.46%); split: -0.36%, +0.82% (sy)-stall: 543371 -> 543306 (-0.01%); split: -0.20%, +0.19% STPs: 2741 -> 2737 (-0.15%) LDPs: 3022 -> 3015 (-0.23%) Preamble Instrs: 322588 -> 323028 (+0.14%); split: -0.01%, +0.14% Last helper: 298996 -> 299399 (+0.13%); split: -0.05%, +0.19% Cat0: 361575 -> 361592 (+0.00%); split: -0.21%, +0.22% Cat1: 111733 -> 111686 (-0.04%); split: -0.45%, +0.41% Cat2: 487366 -> 488311 (+0.19%); split: -0.04%, +0.23% Cat6: 21239 -> 21228 (-0.05%) Cat7: 37170 -> 37191 (+0.06%); split: -0.06%, +0.12% Signed-off-by: Job Noorman Fixes: c757b22c5fa ("ir3: add subreg move optimization") Part-of: --- src/freedreno/ir3/ir3_merge_regs.c | 14 ----------- src/freedreno/ir3/ir3_ra.c | 38 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/freedreno/ir3/ir3_merge_regs.c b/src/freedreno/ir3/ir3_merge_regs.c index cd4309387eb..dba68af0d37 100644 --- a/src/freedreno/ir3/ir3_merge_regs.c +++ b/src/freedreno/ir3/ir3_merge_regs.c @@ -384,19 +384,6 @@ aggressive_coalesce_collect(struct ir3_liveness *live, } } -static void -aggressive_coalesce_subreg_move(struct ir3_liveness *live, - struct ir3_instruction *instr) -{ - enum ir3_subreg_move subreg_move = ir3_is_subreg_move(instr); - - if (subreg_move != IR3_SUBREG_MOVE_NONE && - (instr->dsts[0]->flags & IR3_REG_SSA)) { - unsigned offset = subreg_move == IR3_SUBREG_MOVE_LOWER ? 0 : 1; - try_merge_defs(live, instr->srcs[0]->def, instr->dsts[0], offset); - } -} - static void aggressive_coalesce_rpt(struct ir3_liveness *live, struct ir3_instruction *instr) @@ -618,7 +605,6 @@ ir3_aggressive_coalesce(struct ir3_liveness *live, aggressive_coalesce_parallel_copy(live, instr); break; default: - aggressive_coalesce_subreg_move(live, instr); break; } } diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 4aa2d70cc5f..8c0a22ddecf 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -1384,6 +1384,34 @@ try_allocate_src(struct ra_ctx *ctx, struct ra_file *file, return ~0; } +static physreg_t +try_allocate_src_subreg(struct ra_ctx *ctx, struct ra_file *file, + struct ir3_register *reg, + enum ir3_subreg_move subreg_move) +{ + assert(subreg_move != IR3_SUBREG_MOVE_NONE); + + /* Subreg moves always write a half register. */ + assert(reg_elem_size(reg) == 1); + + struct ir3_register *src = reg->instr->srcs[0]; + if (!ra_reg_is_src(src) || ra_get_file(ctx, src) != file) + return ~0; + + unsigned offset = subreg_move == IR3_SUBREG_MOVE_LOWER ? 0 : 1; + struct ra_interval *src_interval = ra_interval_get(ctx, src->def); + physreg_t src_physreg = ra_interval_get_physreg(src_interval) + offset; + unsigned file_size = reg_file_size(file, reg); + unsigned size = reg_size(reg); + + if (src_physreg + size <= file_size && + get_reg_specified(ctx, file, reg, src_physreg, false)) { + return src_physreg; + } + + return ~0; +} + static bool rpt_has_unique_merge_set(struct ir3_instruction *instr) { @@ -1460,6 +1488,16 @@ get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg) } } + /* For subreg moves (see ir3_is_subreg_move), try to allocate half of their + * full src for their dst. If this succeeds, the instruction can be removed. + */ + enum ir3_subreg_move subreg_move = ir3_is_subreg_move(reg->instr); + if (subreg_move != IR3_SUBREG_MOVE_NONE) { + physreg_t src_reg = try_allocate_src_subreg(ctx, file, reg, subreg_move); + if (src_reg != (physreg_t)~0) + return src_reg; + } + /* For ALU and SFU instructions, if the src reg is avail to pick, use it. * Because this doesn't introduce unnecessary dependencies, and it * potentially avoids needing (ss) syncs for write after read hazards for