From c093efb65e80057ffd2cae4376d44e73227707e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Thu, 14 May 2026 22:22:10 +0200 Subject: [PATCH] r300: pack immediates more aggressively to avoid running out of constant slots After removing the TGSI layer, load_const values will be emitted directly as RC immediates without the scalar packing that tgsi_ureg used to do. This can push fragment shaders past the 32-slot hardware limit on R3xx/R4xx. Swap the order of the "dead constants" and "dataflow swizzles" passes so that constant compaction runs before swizzle legalization, giving the legalization pass an accurate slot count to work with. In rc_remove_unused_constants, state constants are now placed before immediates so that the slot-budget estimate accounts for them. Repeated values within a vec-used immediate are always deduplicated. In addition, aggressive packing is enabled for all vertex programs and for R3xx/R4xx fragment programs whose slot budget is tight: subsequent vec immediates are then merged into already-placed slots by matching values and filling free channels. Very small win on R5xx and very small hit on R3xx/R4xx (due to the smaller number of legal swizzles). 
Shader-db RV530: total cycles in shared programs: 191452 -> 191425 (-0.01%) cycles in affected programs: 5168 -> 5141 (-0.52%) helped: 24 HURT: 10 total temps in shared programs: 17046 -> 17037 (-0.05%) temps in affected programs: 201 -> 192 (-4.48%) helped: 11 HURT: 5 total consts in shared programs: 94033 -> 94030 (<.01%) consts in affected programs: 277 -> 274 (-1.08%) helped: 5 HURT: 5 total instructions in shared programs: 128840 -> 128823 (-0.01%) instructions in affected programs: 3588 -> 3571 (-0.47%) helped: 25 HURT: 12 RV410: total cycles in shared programs: 176230 -> 176270 (0.02%) cycles in affected programs: 20598 -> 20638 (0.19%) helped: 51 HURT: 66 total temps in shared programs: 19655 -> 19650 (-0.03%) temps in affected programs: 1310 -> 1305 (-0.38%) helped: 37 HURT: 25 total instructions in shared programs: 119346 -> 119379 (0.03%) instructions in affected programs: 13884 -> 13917 (0.24%) helped: 58 HURT: 65 total consts in shared programs: 86146 -> 86412 (0.31%) consts in affected programs: 3093 -> 3359 (8.60%) helped: 8 HURT: 182 Assisted-by: Claude Sonnet 4.6 Part-of: --- .../drivers/r300/compiler/r3xx_fragprog.c | 2 +- .../r300/compiler/radeon_remove_constants.c | 152 +++++++++++++++--- 2 files changed, 130 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/r3xx_fragprog.c b/src/gallium/drivers/r300/compiler/r3xx_fragprog.c index b338cc452a6..740e57a7afd 100644 --- a/src/gallium/drivers/r300/compiler/r3xx_fragprog.c +++ b/src/gallium/drivers/r300/compiler/r3xx_fragprog.c @@ -176,8 +176,8 @@ r3xx_compile_fragment_program(struct r300_fragment_program_compiler *c) {"convert rgb<->alpha", 1, opt, rc_convert_rgb_alpha, NULL}, {"dataflow optimize", 1, opt, rc_optimize, NULL}, {"inline literals", 1, is_r500 && opt, rc_inline_literals, NULL}, - {"dataflow swizzles", 1, 1, rc_dataflow_swizzles, NULL}, {"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table}, + {"dataflow swizzles", 1, 1, 
rc_dataflow_swizzles, NULL}, {"dataflow presubtract", 1, opt, rc_local_transform, opt_presubtract}, {"pair translate", 1, 1, rc_pair_translate, NULL}, {"pair scheduling", 1, 1, rc_pair_schedule, &opt}, diff --git a/src/gallium/drivers/r300/compiler/radeon_remove_constants.c b/src/gallium/drivers/r300/compiler/radeon_remove_constants.c index 5e92329f7f2..a8b09604f7f 100644 --- a/src/gallium/drivers/r300/compiler/radeon_remove_constants.c +++ b/src/gallium/drivers/r300/compiler/radeon_remove_constants.c @@ -7,6 +7,7 @@ #include #include #include "util/bitscan.h" +#include "radeon_code.h" #include "radeon_dataflow.h" struct const_remap_state { @@ -96,21 +97,111 @@ place_immediate_in_free_slot(struct const_remap_state *s, unsigned i) assert(util_bitcount(s->is_used_as_vector[i]) > 1); unsigned count = s->new_constants.Count; - + bool remapped = count != i; s->new_constants.Constants[count] = s->constants[i]; - s->new_constants.Constants[count].UseMask = s->is_used_as_vector[i]; + s->new_constants.Constants[count].UseMask = 0; + + /* Deduplicate repeated values within the immediate, leaving + * free channels for later merging via try_merge_vec_immediate. 
*/ for (unsigned chan = 0; chan < 4; chan++) { - if (s->constants[i].UseMask & 1 << chan & s->is_used_as_vector[i]) { + if (!(s->is_used_as_vector[i] & (1 << chan))) + continue; + float val = s->constants[i].u.Immediate[chan]; + bool found = false; + for (unsigned slot_chan = 0; slot_chan < 4; slot_chan++) { + if ((s->new_constants.Constants[count].UseMask & (1 << slot_chan)) && + s->new_constants.Constants[count].u.Immediate[slot_chan] == val) { + s->inv_remap_table[i].index[chan] = count; + s->inv_remap_table[i].swizzle[chan] = slot_chan; + remapped |= slot_chan != chan; + found = true; + break; + } + } + if (!found) { + unsigned new_chan = ffs(~s->new_constants.Constants[count].UseMask) - 1; + s->new_constants.Constants[count].u.Immediate[new_chan] = val; + s->new_constants.Constants[count].UseMask |= (1 << new_chan); s->inv_remap_table[i].index[chan] = count; - s->inv_remap_table[i].swizzle[chan] = chan; + s->inv_remap_table[i].swizzle[chan] = new_chan; + remapped |= new_chan != chan; } } - if (count != i) { + if (remapped) s->is_identity = false; - } s->new_constants.Count++; } +/* Try to merge vec-used immediate i into an already-placed immediate slot: reuse channels that hold an equal value and claim free channels for the rest. + * Returns true once i is remapped into a slot; returns false, with s unmodified, if no placed slot can hold every vec-used channel. */ +static bool +try_merge_vec_immediate(struct const_remap_state *s, unsigned i) +{ + uint8_t vec_mask = s->is_used_as_vector[i]; + + for (unsigned j = 0; j < s->new_constants.Count; j++) { + if (s->new_constants.Constants[j].Type != RC_CONSTANT_IMMEDIATE) + continue; + + /* Work on local copies of the slot's UseMask and values so that a failed attempt leaves the slot untouched. 
*/ + uint8_t new_chan[4]; + uint8_t slot_used = s->new_constants.Constants[j].UseMask; + float slot_vals[4]; + memcpy(slot_vals, s->new_constants.Constants[j].u.Immediate, sizeof(slot_vals)); + + bool ok = true; + for (unsigned chan = 0; chan < 4; chan++) { + new_chan[chan] = 4; + if (!(vec_mask & (1 << chan))) + continue; + + float val = s->constants[i].u.Immediate[chan]; + + /* First look for an existing (or tentatively placed) channel with + * the same value. Floats are compared with ==, so -0.0 matches 0.0 and NaN never matches. */ + bool found = false; + for (unsigned slot_chan = 0; slot_chan < 4; slot_chan++) { + if ((slot_used & (1 << slot_chan)) && slot_vals[slot_chan] == val) { + new_chan[chan] = slot_chan; + found = true; + break; + } + } + if (!found) { + /* Put the value in the lowest free channel; ffs() yields an index above 3 when all four channels are taken. */ + uint8_t free_chan = ffs(~slot_used) - 1; + if (free_chan > 3) { + ok = false; + break; + } + + new_chan[chan] = free_chan; + slot_vals[free_chan] = val; + slot_used |= (1 << free_chan); + } + } + + if (!ok) + continue; + + /* Commit: write only channels not yet in the slot's UseMask, then point every vec-used channel of i at slot j. */ + for (unsigned chan = 0; chan < 4; chan++) { + if (!(vec_mask & (1 << chan))) + continue; + if (!(s->new_constants.Constants[j].UseMask & (1 << new_chan[chan]))) { + s->new_constants.Constants[j].u.Immediate[new_chan[chan]] = + s->constants[i].u.Immediate[chan]; + s->new_constants.Constants[j].UseMask |= (1 << new_chan[chan]); + } + s->inv_remap_table[i].index[chan] = j; + s->inv_remap_table[i].swizzle[chan] = new_chan[chan]; + } + s->is_identity = false; + return true; + } + return false; +} + static void try_merge_constants_external(struct const_remap_state *s, unsigned i) { @@ -202,7 +293,7 @@ rc_remove_unused_constants(struct radeon_compiler *c, void *user) } } - /* Now iterate over scalarar externals and put them into empty slots. */ + /* Now iterate over scalar externals and put them into empty slots. 
*/ for (unsigned i = 0; i < c->Program.Constants.Count; i++) { if (constants[i].Type != RC_CONSTANT_EXTERNAL) continue; @@ -210,21 +301,45 @@ rc_remove_unused_constants(struct radeon_compiler *c, void *user) try_merge_constants_external(s, i); } - /* Now put immediates which are used as vectors. */ + /* Place state constants before immediates so the immediate-packing budget + * accounts for state slots that cannot be packed with immediates. */ + for (unsigned i = 0; i < c->Program.Constants.Count; i++) { + if (constants[i].Type != RC_CONSTANT_STATE) + continue; + if (util_bitcount(s->constants[i].UseMask) > 0) { + place_constant_in_free_slot(s, i); + } + } + + /* Count vec-used immediates to estimate whether aggressive packing (specifically + * packing which can produce invalid swizzles) is needed. Vertex programs always pack aggressively; non-vertex programs with is_r500 set never do. */ + unsigned num_vec_imm = 0; + for (unsigned i = 0; i < c->Program.Constants.Count; i++) { + if (constants[i].Type == RC_CONSTANT_IMMEDIATE && + util_bitcount(s->constants[i].UseMask) > 0 && + util_bitcount(s->is_used_as_vector[i]) > 0) + num_vec_imm++; + } + bool aggressive = c->type == RC_VERTEX_PROGRAM || + (!c->is_r500 && + s->new_constants.Count + num_vec_imm > R300_PFS_NUM_CONST_REGS); + + /* Place vec-used immediates first. place_immediate_in_free_slot deduplicates + * repeated values within the immediate, leaving free channels in the new slot. + * When aggressive is set, subsequent vec immediates can then merge into those free channels via + * try_merge_vec_immediate, naturally building a shared value palette. */ for (unsigned i = 0; i < c->Program.Constants.Count; i++) { if (constants[i].Type == RC_CONSTANT_IMMEDIATE && util_bitcount(s->constants[i].UseMask) > 0 && util_bitcount(s->is_used_as_vector[i]) > 0) { + if (aggressive && try_merge_vec_immediate(s, i)) + continue; place_immediate_in_free_slot(s, i); } } - /* Now walk over scalar immediates and try to: - * a) check for duplicates, - * b) find free slot. 
- * All of this is already done by rc_constants_add_immediate_scalar, - * so just use it. - */ + /* Scalar-only channels fill the remaining free channels of already-placed + * slots or create new ones via rc_constants_add_immediate_scalar. */ for (unsigned i = 0; i < c->Program.Constants.Count; i++) { if (constants[i].Type != RC_CONSTANT_IMMEDIATE) continue; @@ -240,15 +355,6 @@ rc_remove_unused_constants(struct radeon_compiler *c, void *user) } } - /* Finally place state constants. */ - for (unsigned i = 0; i < c->Program.Constants.Count; i++) { - if (constants[i].Type != RC_CONSTANT_STATE) - continue; - if (util_bitcount(s->constants[i].UseMask) > 0) { - place_constant_in_free_slot(s, i); - } - } - /* is_identity ==> new_count == old_count * !is_identity ==> new_count < old_count */ assert(!((s->has_rel_addr || !c->remove_unused_constants) && s->are_externals_remapped));