r300: pack immediates more aggressively to avoid running out of constant slots

After removing the TGSI layer, load_const values will be emitted directly
as RC immediates without the scalar packing that tgsi_ureg used to do.
This can push fragment shaders past the 32-slot hardware limit on R3xx/R4xx.

Swap dead_constants and dataflow_swizzles pass order so constant
compaction runs before swizzle legalization, giving the legalization
pass an accurate slot count to work with.

In rc_remove_unused_constants, when the slot budget is tight on
R3xx/R4xx, enable aggressive packing for vec-used immediates.
Deduplicate repeated values within an immediate and merge subsequent vec
immediates into existing slots by matching values and filling free
channels.

Very small win on R5xx and very small hit on R3xx/R4xx (due to smaller
amount of legal swizzles).

Shader-db RV530:
total cycles in shared programs: 191452 -> 191425 (-0.01%)
cycles in affected programs: 5168 -> 5141 (-0.52%)
helped: 24
HURT: 10
total temps in shared programs: 17046 -> 17037 (-0.05%)
temps in affected programs: 201 -> 192 (-4.48%)
helped: 11
HURT: 5
total consts in shared programs: 94033 -> 94030 (<.01%)
consts in affected programs: 277 -> 274 (-1.08%)
helped: 5
HURT: 5
total instructions in shared programs: 128840 -> 128823 (-0.01%)
instructions in affected programs: 3588 -> 3571 (-0.47%)
helped: 25
HURT: 12

RV410:
total cycles in shared programs: 176230 -> 176270 (0.02%)
cycles in affected programs: 20598 -> 20638 (0.19%)
helped: 51
HURT: 66
total temps in shared programs: 19655 -> 19650 (-0.03%)
temps in affected programs: 1310 -> 1305 (-0.38%)
helped: 37
HURT: 25
total instructions in shared programs: 119346 -> 119379 (0.03%)
instructions in affected programs: 13884 -> 13917 (0.24%)
helped: 58
HURT: 65
total consts in shared programs: 86146 -> 86412 (0.31%)
consts in affected programs: 3093 -> 3359 (8.60%)
helped: 8
HURT: 182

Assisted-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41618>
This commit is contained in:
Pavel Ondračka 2026-05-14 22:22:10 +02:00
parent 2a9473e7ea
commit c093efb65e
2 changed files with 130 additions and 24 deletions

View file

@ -176,8 +176,8 @@ r3xx_compile_fragment_program(struct r300_fragment_program_compiler *c)
{"convert rgb<->alpha", 1, opt, rc_convert_rgb_alpha, NULL},
{"dataflow optimize", 1, opt, rc_optimize, NULL},
{"inline literals", 1, is_r500 && opt, rc_inline_literals, NULL},
{"dataflow swizzles", 1, 1, rc_dataflow_swizzles, NULL},
{"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table},
{"dataflow swizzles", 1, 1, rc_dataflow_swizzles, NULL},
{"dataflow presubtract", 1, opt, rc_local_transform, opt_presubtract},
{"pair translate", 1, 1, rc_pair_translate, NULL},
{"pair scheduling", 1, 1, rc_pair_schedule, &opt},

View file

@ -7,6 +7,7 @@
#include <stdbool.h>
#include <stdlib.h>
#include "util/bitscan.h"
#include "radeon_code.h"
#include "radeon_dataflow.h"
struct const_remap_state {
@ -96,21 +97,111 @@ place_immediate_in_free_slot(struct const_remap_state *s, unsigned i)
assert(util_bitcount(s->is_used_as_vector[i]) > 1);
unsigned count = s->new_constants.Count;
bool remapped = count != i;
s->new_constants.Constants[count] = s->constants[i];
s->new_constants.Constants[count].UseMask = s->is_used_as_vector[i];
s->new_constants.Constants[count].UseMask = 0;
/* Deduplicate repeated values within the immediate, leaving
* free channels for later merging via try_merge_vec_immediate. */
for (unsigned chan = 0; chan < 4; chan++) {
if (s->constants[i].UseMask & 1 << chan & s->is_used_as_vector[i]) {
if (!(s->is_used_as_vector[i] & (1 << chan)))
continue;
float val = s->constants[i].u.Immediate[chan];
bool found = false;
for (unsigned slot_chan = 0; slot_chan < 4; slot_chan++) {
if ((s->new_constants.Constants[count].UseMask & (1 << slot_chan)) &&
s->new_constants.Constants[count].u.Immediate[slot_chan] == val) {
s->inv_remap_table[i].index[chan] = count;
s->inv_remap_table[i].swizzle[chan] = slot_chan;
remapped |= slot_chan != chan;
found = true;
break;
}
}
if (!found) {
unsigned new_chan = ffs(~s->new_constants.Constants[count].UseMask) - 1;
s->new_constants.Constants[count].u.Immediate[new_chan] = val;
s->new_constants.Constants[count].UseMask |= (1 << new_chan);
s->inv_remap_table[i].index[chan] = count;
s->inv_remap_table[i].swizzle[chan] = chan;
s->inv_remap_table[i].swizzle[chan] = new_chan;
remapped |= new_chan != chan;
}
}
if (count != i) {
if (remapped)
s->is_identity = false;
}
s->new_constants.Count++;
}
/* Try to merge a vec-used immediate into an already-placed slot by matching
* values and filling free channels. */
static bool
try_merge_vec_immediate(struct const_remap_state *s, unsigned i)
{
uint8_t vec_mask = s->is_used_as_vector[i];
for (unsigned j = 0; j < s->new_constants.Count; j++) {
if (s->new_constants.Constants[j].Type != RC_CONSTANT_IMMEDIATE)
continue;
/* Work on a local copy so we don't corrupt state on failure. */
uint8_t new_chan[4];
uint8_t slot_used = s->new_constants.Constants[j].UseMask;
float slot_vals[4];
memcpy(slot_vals, s->new_constants.Constants[j].u.Immediate, sizeof(slot_vals));
bool ok = true;
for (unsigned chan = 0; chan < 4; chan++) {
new_chan[chan] = 4;
if (!(vec_mask & (1 << chan)))
continue;
float val = s->constants[i].u.Immediate[chan];
/* First look for an existing (or tentatively placed) channel with
* the same value. */
bool found = false;
for (unsigned slot_chan = 0; slot_chan < 4; slot_chan++) {
if ((slot_used & (1 << slot_chan)) && slot_vals[slot_chan] == val) {
new_chan[chan] = slot_chan;
found = true;
break;
}
}
if (!found) {
/* Put the value in a free channel. */
uint8_t free_chan = ffs(~slot_used) - 1;
if (free_chan > 3) {
ok = false;
break;
}
new_chan[chan] = free_chan;
slot_vals[free_chan] = val;
slot_used |= (1 << free_chan);
}
}
if (!ok)
continue;
/* Write newly-claimed channels and update remap tables. */
for (unsigned chan = 0; chan < 4; chan++) {
if (!(vec_mask & (1 << chan)))
continue;
if (!(s->new_constants.Constants[j].UseMask & (1 << new_chan[chan]))) {
s->new_constants.Constants[j].u.Immediate[new_chan[chan]] =
s->constants[i].u.Immediate[chan];
s->new_constants.Constants[j].UseMask |= (1 << new_chan[chan]);
}
s->inv_remap_table[i].index[chan] = j;
s->inv_remap_table[i].swizzle[chan] = new_chan[chan];
}
s->is_identity = false;
return true;
}
return false;
}
static void
try_merge_constants_external(struct const_remap_state *s, unsigned i)
{
@ -202,7 +293,7 @@ rc_remove_unused_constants(struct radeon_compiler *c, void *user)
}
}
/* Now iterate over scalarar externals and put them into empty slots. */
/* Now iterate over scalar externals and put them into empty slots. */
for (unsigned i = 0; i < c->Program.Constants.Count; i++) {
if (constants[i].Type != RC_CONSTANT_EXTERNAL)
continue;
@ -210,21 +301,45 @@ rc_remove_unused_constants(struct radeon_compiler *c, void *user)
try_merge_constants_external(s, i);
}
/* Now put immediates which are used as vectors. */
/* Place state constants before immediates so the immediate-packing budget
* accounts for state slots that cannot be packed with immediates. */
for (unsigned i = 0; i < c->Program.Constants.Count; i++) {
if (constants[i].Type != RC_CONSTANT_STATE)
continue;
if (util_bitcount(s->constants[i].UseMask) > 0) {
place_constant_in_free_slot(s, i);
}
}
/* Count vec-used immediates to estimate whether aggressive packing (specifically
* packing which can produce invalid swizzles) is needed. */
unsigned num_vec_imm = 0;
for (unsigned i = 0; i < c->Program.Constants.Count; i++) {
if (constants[i].Type == RC_CONSTANT_IMMEDIATE &&
util_bitcount(s->constants[i].UseMask) > 0 &&
util_bitcount(s->is_used_as_vector[i]) > 0)
num_vec_imm++;
}
bool aggressive = c->type == RC_VERTEX_PROGRAM ||
(!c->is_r500 &&
s->new_constants.Count + num_vec_imm > R300_PFS_NUM_CONST_REGS);
/* Place vec-used immediates first. Place_immediate_in_free_slot deduplicates
* repeated values within the immediate, leaving free channels in the new slot.
* Subsequent vec immediates can then merge into those free channels via
* try_merge_vec_immediate, naturally building a shared value palette. */
for (unsigned i = 0; i < c->Program.Constants.Count; i++) {
if (constants[i].Type == RC_CONSTANT_IMMEDIATE &&
util_bitcount(s->constants[i].UseMask) > 0 &&
util_bitcount(s->is_used_as_vector[i]) > 0) {
if (aggressive && try_merge_vec_immediate(s, i))
continue;
place_immediate_in_free_slot(s, i);
}
}
/* Now walk over scalar immediates and try to:
* a) check for duplicates,
* b) find free slot.
* All of this is already done by rc_constants_add_immediate_scalar,
* so just use it.
*/
/* Scalar-only channels fill the remaining free channels of already-placed
* slots or create new ones via rc_constants_add_immediate_scalar. */
for (unsigned i = 0; i < c->Program.Constants.Count; i++) {
if (constants[i].Type != RC_CONSTANT_IMMEDIATE)
continue;
@ -240,15 +355,6 @@ rc_remove_unused_constants(struct radeon_compiler *c, void *user)
}
}
/* Finally place state constants. */
for (unsigned i = 0; i < c->Program.Constants.Count; i++) {
if (constants[i].Type != RC_CONSTANT_STATE)
continue;
if (util_bitcount(s->constants[i].UseMask) > 0) {
place_constant_in_free_slot(s, i);
}
}
/* is_identity ==> new_count == old_count
* !is_identity ==> new_count < old_count */
assert(!((s->has_rel_addr || !c->remove_unused_constants) && s->are_externals_remapped));