ir3: Improve spilling of NIR vars to scratch.

Previously, we would spill at the NIR level any temp array over 16 vec4s.
This had two problems:

1) We wouldn't spill for the worst-case scenario: a MAD accessing a dst
array and 3 different src arrays (which all get fully unspilled, rather
than just reloading the specific reg in the operand).  Such a shader would
fail to register allocate, though we haven't seen this in practice.

2) We would spill vec4[17] and larger arrays that weren't necessary to get
the shader to register allocate.  This occurred on an FS in Stray that
had a vec4[24] array and just 4 vec4s of register pressure other than the
array.

Instead, use NIR scratch spilling only when the worst-case set of vars
referenced by a single instruction would overflow GPR space.  This takes
the shader in Stray from 11ms to 0.5ms, by eliminating all spilling and
leaving the array in GPRs.  On the other hand, if leaving the arrays
unspilled in NIR causes spilling in ir3 instead, the amount of spilling
may increase, because ir3 spills/reloads operate on the whole array.
However, shader-db shows the effect is very small in terms of the number
of shaders affected, and overwhelmingly positive for spills:

MaxWaves: 22522470 -> 22520664 (-0.01%)
Instrs: 396093281 -> 396122221 (+0.01%); split: -0.00%, +0.01%
STPs: 218915 -> 182907 (-16.45%)
LDPs: 155374 -> 153364 (-1.29%); split: -2.79%, +1.50%

Totals from 496 (0.03% of 1561298) affected shaders:
MaxWaves: 3792 -> 1986 (-47.63%)
Instrs: 441224 -> 470164 (+6.56%); split: -0.00%, +6.57%
CodeSize: 926164 -> 976734 (+5.46%); split: -0.05%, +5.52%
NOPs: 58896 -> 52765 (-10.41%); split: -14.95%, +4.60%
MOVs: 16314 -> 57901 (+254.92%)
COVs: 3293 -> 5146 (+56.27%)
Full: 12876 -> 23632 (+83.54%)
(ss): 18613 -> 11573 (-37.82%); split: -47.53%, +9.71%
(sy): 2539 -> 2505 (-1.34%); split: -10.75%, +9.41%
(ss)-stall: 40682 -> 26413 (-35.07%); split: -47.90%, +12.80%
(sy)-stall: 147862 -> 117004 (-20.87%); split: -37.65%, +16.69%
STPs: 38566 -> 2558 (-93.37%)
LDPs: 5060 -> 3050 (-39.72%); split: -85.77%, +45.93%
Cat0: 65593 -> 59487 (-9.31%); split: -13.42%, +4.15%
Cat1: 19667 -> 63105 (+220.87%)
Cat2: 155958 -> 157879 (+1.23%); split: -0.05%, +1.28%
Cat6: 105228 -> 94910 (-9.81%); split: -12.36%, +2.54%
Cat7: 2480 -> 2485 (+0.20%); split: -0.08%, +0.28%
Subgroup size: 31872 -> 31744 (-0.40%)

The primary application impacted in shader-db is gfxbench aztec ruins.  A
quick test of it showed no significant performance improvement (n=3).
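
The heuristic above boils down to a check along these lines (an
illustrative sketch only, with invented names and byte-based units; the
real logic lives in ir3_vars_to_scratch_cb in the diff below):

   /* An instruction can reference at most one dst array and three src
    * arrays, and ir3 unspills whole arrays, so the four largest candidate
    * arrays plus a vec4 of headroom for non-array operands must fit in the
    * register file for the arrays to stay in GPRs.
    */
   static bool
   arrays_fit_in_gprs(const unsigned *sizes_bytes /* sorted ascending */,
                      unsigned count, unsigned reg_file_bytes)
   {
      unsigned worst_case = 16; /* reserve a vec4 for non-array operands */
      for (unsigned i = (count > 4) ? count - 4 : 0; i < count; i++)
         worst_case += sizes_bytes[i];
      return worst_case <= reg_file_bytes;
   }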

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37245>
Emma Anholt 2025-09-05 16:29:31 -07:00 committed by Marge Bot
parent 0d9428736b
commit c00ebca5c4


@@ -7,6 +7,7 @@
  */
 #include "util/u_debug.h"
+#include "util/u_dynarray.h"
 #include "util/u_math.h"
 #include "ir3_compiler.h"
@@ -277,31 +278,6 @@ ir3_lower_bit_size(const nir_instr *instr, UNUSED void *data)
    return 0;
 }
 
-static void
-ir3_get_variable_size_align_bytes(const glsl_type *type, unsigned *size, unsigned *align)
-{
-   switch (type->base_type) {
-   case GLSL_TYPE_ARRAY:
-   case GLSL_TYPE_INTERFACE:
-   case GLSL_TYPE_STRUCT:
-      glsl_size_align_handle_array_and_structs(type, ir3_get_variable_size_align_bytes,
-                                               size, align);
-      break;
-   case GLSL_TYPE_UINT8:
-   case GLSL_TYPE_INT8:
-      /* 8-bit values are handled through 16-bit half-registers, so the resulting size
-       * and alignment value has to be doubled to reflect the actual variable size
-       * requirement.
-       */
-      *size = 2 * glsl_get_components(type);
-      *align = 2;
-      break;
-   default:
-      glsl_get_natural_size_align_bytes(type, size, align);
-      break;
-   }
-}
-
 #define OPT(nir, pass, ...)                                                   \
    ({                                                                         \
       bool this_progress = false;                                             \
@@ -1114,6 +1090,174 @@ atomic_supported(const nir_instr * instr, const void * data)
    return nir_instr_as_intrinsic(instr)->def.bit_size != 64;
 }
 
+/**
+ * Like glsl_get_natural_size_align_bytes, but for ir3 RA, where all <32-bit
+ * components are stored in half regs.
+ */
+static void
+ir3_get_ra_size_align_bytes(const glsl_type *type, unsigned *size, unsigned *align)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_UINT8:
+   case GLSL_TYPE_INT8:
+   case GLSL_TYPE_UINT16:
+   case GLSL_TYPE_INT16:
+   case GLSL_TYPE_FLOAT16:
+   case GLSL_TYPE_BFLOAT16:
+      *size = 2 * glsl_get_components(type);
+      *align = 2;
+      break;
+   case GLSL_TYPE_FLOAT_E4M3FN:
+   case GLSL_TYPE_FLOAT_E5M2:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64: {
+      unsigned N = glsl_get_bit_size(type) / 8;
+      *size = N * glsl_get_components(type);
+      *align = N;
+      break;
+   }
+   case GLSL_TYPE_ARRAY:
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_STRUCT:
+      glsl_size_align_handle_array_and_structs(type,
+                                               ir3_get_ra_size_align_bytes,
+                                               size, align);
+      break;
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_TEXTURE:
+   case GLSL_TYPE_IMAGE:
+      /* Bindless samplers and images. */
+      *size = 8;
+      *align = 8;
+      break;
+   case GLSL_TYPE_COOPERATIVE_MATRIX:
+   case GLSL_TYPE_ATOMIC_UINT:
+   case GLSL_TYPE_SUBROUTINE:
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+      UNREACHABLE("type does not have a natural size");
+   }
+}
+
+static int
+variable_size_sort(const void *a, const void *b)
+{
+   const nir_variable *var_a = *(const nir_variable **)a;
+   const nir_variable *var_b = *(const nir_variable **)b;
+   uint32_t size_a, align_a;
+   ir3_get_ra_size_align_bytes(var_a->type, &size_a, &align_a);
+   uint32_t size_b, align_b;
+   ir3_get_ra_size_align_bytes(var_b->type, &size_b, &align_b);
+   return size_a - size_b;
+}
+
+/* Filters out variables from the set that might go to ir3 RA, in order to
+ * avoid exceeding the limit of register pressure in a single instruction.
+ *
+ * A single instruction could require up to 4 array vars to be fully loaded in
+ * GPR space: 1 destination and 3 src operands (since we reload full arrays
+ * when unspilling).
+ */
+static void
+ir3_filter_vars_to_scratch_single_instr_limit(struct set *set, uint32_t limit,
+                                              bool limit_for_half)
+{
+   struct util_dynarray candidate_nonspilled;
+   util_dynarray_init(&candidate_nonspilled, NULL);
+
+   /* Create an array of vars to potentially not spill, sorted by increasing
+    * size.
+    */
+   set_foreach(set, entry) {
+      const nir_variable *var = entry->key;
+
+      /* If it's definitely a 32/64-bit array that will be stored in full regs,
+       * then don't consider it while we're limiting for half-reg accesses.
+       * This is conservative when we can't figure out the array type, but
+       * thanks to struct splitting we always successfully determine it on
+       * fossil-db.
+       */
+      if (glsl_type_is_array_or_matrix(var->type)) {
+         const struct glsl_type *elem_type = glsl_without_array_or_matrix(var->type);
+         if (limit_for_half && glsl_type_is_vector_or_scalar(elem_type) &&
+             glsl_get_bit_size(elem_type) > 16) {
+            continue;
+         }
+      }
+
+      util_dynarray_append(&candidate_nonspilled, const nir_variable *, var);
+   }
+
+   qsort(util_dynarray_begin(&candidate_nonspilled),
+         util_dynarray_num_elements(&candidate_nonspilled, const nir_variable *),
+         sizeof(nir_variable *), variable_size_sort);
+
+   /* Loop removing variables from the set of variables to not spill, until the
+    * worst-case set of variables remaining fits under the limit.
+    */
+   for (;;) {
+      int last =
+         util_dynarray_num_elements(&candidate_nonspilled, nir_variable *) - 1;
+      uint32_t total_size = 0;
+      for (int i = last; i >= MAX2(last - 3, 0); i--) {
+         nir_variable *var =
+            *util_dynarray_element(&candidate_nonspilled, nir_variable *, i);
+         uint32_t size, align;
+         ir3_get_ra_size_align_bytes(var->type, &size, &align);
+         total_size += size;
+      }
+      if (total_size <= limit)
+         break;
+
+      nir_variable *var =
+         util_dynarray_pop(&candidate_nonspilled, nir_variable *);
+      _mesa_set_remove_key(set, var);
+   }
+
+   util_dynarray_fini(&candidate_nonspilled);
+}
+
+static void
+ir3_vars_to_scratch_cb(struct set *set, void *data)
+{
+   struct ir3_pressure *limit_pressure = data;
+
+   struct set *nonspilled = _mesa_pointer_set_create(NULL);
+   set_foreach(set, entry) {
+      _mesa_set_add(nonspilled, entry->key);
+   }
+
+   /* Filter for the half vars first, which may let the full limit (which
+    * considers all vars) succeed on vars it wouldn't otherwise.
+    *
+    * We decrement the limit for the array's sizes by a vec4's size, because an
+    * instruction will likely have non-array sources that also need to be
+    * present, so we can't have the whole register file taken up by an array.
+    */
+   ir3_filter_vars_to_scratch_single_instr_limit(
+      nonspilled, (limit_pressure->half * 2) - 16, true);
+   ir3_filter_vars_to_scratch_single_instr_limit(
+      nonspilled, (limit_pressure->full * 2) - 16, false);
+
+   set_foreach(set, entry) {
+      const nir_variable *var = entry->key;
+      if (_mesa_set_search(nonspilled, var))
+         _mesa_set_remove_key(set, var);
+   }
+
+   _mesa_set_destroy(nonspilled, NULL);
+}
+
 /**
  * Filters the real_wavesize that was set based on API requirements, to an
  * appopriate value given hardware limits and the NIR shader we get.
@@ -1249,9 +1393,10 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so,
     * expensive.
     */
    if (so->compiler->has_pvtmem) {
-      progress |= OPT(s, nir_lower_vars_to_scratch,
-                      16 * 16 /* bytes */,
-                      ir3_get_variable_size_align_bytes, glsl_get_natural_size_align_bytes);
+      struct ir3_pressure limit_pressure = ir3_ra_get_reg_file_limits(so);
+      progress |=
+         OPT(s, nir_lower_vars_to_scratch_global,
+             glsl_get_natural_size_align_bytes, ir3_vars_to_scratch_cb, &limit_pressure);
    }
 
    /* Lower scratch writemasks */