ir3: Move load/store vectorization to finalize

Some frontends such as rusticl and turnip call the optimization loop
before choosing the shared memory layout, in order to be able to delete
variables that turn out to be unused. This means that we can't vectorize
them until after the first run of the optimization loop. Other drivers
also seem to do something similar.

This also has the benefit that by delaying vectorization of UBOs until
after they are lowered from derefs, we don't insert casts which remove
the ability of nir_lower_explicit_io to insert a range, which was
blocking the pushing of vectorized indirect UBO loads. This has a
significant positive impact on fossil-db (see the totals below).

Only doing vectorization later exposes a bug where vectorization could
change the bitsize after we used it to determine which descriptor to
use. It happened to work before because vectorization was usually done
early. To fix it, move adjusting the descriptor to a new pass that
happens after finalizing.

Totals:
MaxWaves: 2249140 -> 2281068 (+1.42%); split: +1.43%, -0.01%
Instrs: 49624230 -> 49143117 (-0.97%); split: -1.14%, +0.17%
CodeSize: 103796862 -> 104143744 (+0.33%); split: -0.98%, +1.31%
NOPs: 8489860 -> 8512218 (+0.26%); split: -1.55%, +1.81%
MOVs: 1531650 -> 1574911 (+2.82%); split: -1.37%, +4.20%
Full: 1814334 -> 1748906 (-3.61%); split: -3.64%, +0.03%
(ss): 1155395 -> 1128249 (-2.35%); split: -3.48%, +1.13%
(sy): 608650 -> 567972 (-6.68%); split: -7.32%, +0.64%
(ss)-stall: 4352550 -> 4340473 (-0.28%); split: -2.08%, +1.80%
(sy)-stall: 17852259 -> 16943647 (-5.09%); split: -6.25%, +1.16%
STPs: 24568 -> 24215 (-1.44%)
LDPs: 37799 -> 37468 (-0.88%)
Early-preamble: 115698 -> 113694 (-1.73%); split: +0.17%, -1.90%
Cat0: 9345228 -> 9367782 (+0.24%); split: -1.41%, +1.65%
Cat1: 2445265 -> 2549122 (+4.25%); split: -0.81%, +5.06%
Cat2: 18704736 -> 18377519 (-1.75%); split: -1.76%, +0.01%
Cat3: 14210303 -> 14130558 (-0.56%); split: -0.56%, +0.00%
Cat4: 1346895 -> 1346462 (-0.03%); split: -0.03%, +0.00%
Cat5: 1420418 -> 1420417 (-0.00%); split: -0.07%, +0.07%
Cat6: 745590 -> 549358 (-26.32%); split: -26.66%, +0.34%
Cat7: 1405795 -> 1401899 (-0.28%); split: -0.96%, +0.68%

Totals from 79089 (48.19% of 164134) affected shaders:
MaxWaves: 947648 -> 979576 (+3.37%); split: +3.40%, -0.03%
Instrs: 38664140 -> 38183027 (-1.24%); split: -1.47%, +0.22%
CodeSize: 80179110 -> 80525992 (+0.43%); split: -1.27%, +1.70%
NOPs: 6880907 -> 6903265 (+0.32%); split: -1.91%, +2.23%
MOVs: 1183855 -> 1227116 (+3.65%); split: -1.78%, +5.43%
Full: 1107056 -> 1041628 (-5.91%); split: -5.96%, +0.05%
(ss): 939342 -> 912196 (-2.89%); split: -4.28%, +1.39%
(sy): 457959 -> 417281 (-8.88%); split: -9.73%, +0.85%
(ss)-stall: 3664495 -> 3652418 (-0.33%); split: -2.47%, +2.14%
(sy)-stall: 12266805 -> 11358193 (-7.41%); split: -9.10%, +1.69%

STPs: 7494 -> 7141 (-4.71%)
LDPs: 7050 -> 6719 (-4.70%)
Early-preamble: 46339 -> 44335 (-4.32%); split: +0.43%, -4.75%
Cat0: 7548630 -> 7571184 (+0.30%); split: -1.75%, +2.05%
Cat1: 1823872 -> 1927729 (+5.69%); split: -1.09%, +6.78%
Cat2: 14767716 -> 14440499 (-2.22%); split: -2.22%, +0.01%
Cat3: 10630582 -> 10550837 (-0.75%); split: -0.75%, +0.00%
Cat4: 1150090 -> 1149657 (-0.04%); split: -0.04%, +0.00%
Cat5: 1068913 -> 1068912 (-0.00%); split: -0.09%, +0.09%
Cat6: 554910 -> 358678 (-35.36%); split: -35.82%, +0.45%
Cat7: 1119427 -> 1115531 (-0.35%); split: -1.20%, +0.86%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34441>
This commit is contained in:
Connor Abbott 2025-04-09 13:32:37 -04:00 committed by Marge Bot
parent 2f93137308
commit 9977c4d682
2 changed files with 75 additions and 26 deletions

View file

@ -380,14 +380,6 @@ ir3_optimize_loop(struct ir3_compiler *compiler,
};
progress |= OPT(s, nir_opt_offsets, &offset_options);
nir_load_store_vectorize_options vectorize_opts = {
.modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform,
.callback = ir3_nir_should_vectorize_mem,
.robust_modes = options->robust_modes,
.cb_data = compiler,
};
progress |= OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);
if (lower_flrp != 0) {
if (OPT(s, nir_lower_flrp, lower_flrp, false /* always_precise */)) {
OPT(s, nir_opt_constant_folding);
@ -652,7 +644,20 @@ ir3_finalize_nir(struct ir3_compiler *compiler,
bool idiv_progress = OPT(s, nir_opt_idiv_const, 8);
idiv_progress |= OPT(s, nir_lower_idiv, &idiv_options);
if (idiv_progress)
/* Do load/store vectorization after the first opt loop to give us a chance
* to optimize lowered SSBO pointers. Without the first opt loop every
* SSBO load/store with a different pointer looks like it has a different
* descriptor, even when it doesn't.
*/
nir_load_store_vectorize_options vectorize_opts = {
.modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform,
.callback = ir3_nir_should_vectorize_mem,
.robust_modes = options->robust_modes,
.cb_data = compiler,
};
bool vectorize_progress = OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);
if (idiv_progress || vectorize_progress)
ir3_optimize_loop(compiler, options, s);
OPT(s, nir_remove_dead_variables, nir_var_function_temp, NULL);

View file

@ -310,23 +310,6 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
}
}
/* Descriptor index has to be adjusted in the following cases:
* - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
* loads -- next-index descriptor will be able to do that;
* - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
* storage accesses of that size.
*/
if ((dev->physical_device->info->a6xx.storage_16bit &&
!dev->physical_device->info->a6xx.has_isam_v &&
intrin->intrinsic == nir_intrinsic_load_ssbo &&
(nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
intrin->def.bit_size > 16) ||
(dev->physical_device->info->a7xx.storage_8bit &&
((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
(intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
}
nir_def *results[MAX_SETS] = { NULL };
if (nir_scalar_is_const(scalar_idx)) {
@ -1099,6 +1082,62 @@ tu_nir_lower_fdm(nir_shader *shader, const struct lower_fdm_options *options)
lower_fdm_instr, (void *)options);
}
/* Per-intrinsic callback for the SSBO descriptor lowering pass.
 *
 * cb_data is the struct tu_device whose hardware info decides whether the
 * access has to use the descriptor at the next index.  When it does, the
 * buffer source of the intrinsic is replaced by a new bindless_resource_ir3
 * whose index is the original index plus one, in the same descriptor set.
 *
 * Returns true if the intrinsic was rewritten, false if it was left alone.
 */
static bool
lower_ssbo_descriptor_instr(nir_builder *b, nir_intrinsic_instr *intrin,
                            void *cb_data)
{
   struct tu_device *dev = (struct tu_device *)cb_data;

   /* Descriptor index has to be adjusted in the following cases:
    *  - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
    *    loads -- next-index descriptor will be able to do that;
    *  - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
    *    storage accesses of that size.
    */
   const bool wide_isam_load =
      dev->physical_device->info->a6xx.storage_16bit &&
      !dev->physical_device->info->a6xx.has_isam_v &&
      intrin->intrinsic == nir_intrinsic_load_ssbo &&
      (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
      intrin->def.bit_size > 16;

   bool byte_access = false;
   if (!wide_isam_load && dev->physical_device->info->a7xx.storage_8bit) {
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ssbo:
         byte_access = intrin->def.bit_size == 8;
         break;
      case nir_intrinsic_store_ssbo:
         byte_access = intrin->src[0].ssa->bit_size == 8;
         break;
      default:
         break;
      }
   }

   if (!wide_isam_load && !byte_access)
      return false;

   /* store_ssbo has the value first, so its buffer is the second source. */
   const unsigned buffer_src =
      intrin->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;

   /* The buffer source is expected to come straight from a
    * bindless_resource_ir3 intrinsic; its first source is the descriptor
    * index that gets bumped.
    */
   nir_def *old_buffer = intrin->src[buffer_src].ssa;
   assert(old_buffer->parent_instr->type == nir_instr_type_intrinsic);
   nir_intrinsic_instr *bindless =
      nir_instr_as_intrinsic(old_buffer->parent_instr);
   assert(bindless->intrinsic == nir_intrinsic_bindless_resource_ir3);

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *next_idx = nir_iadd_imm(b, bindless->src[0].ssa, 1);
   nir_def *new_buffer =
      nir_bindless_resource_ir3(b, 32, next_idx,
                                .desc_set = nir_intrinsic_desc_set(bindless));
   nir_src_rewrite(&intrin->src[buffer_src], new_buffer);

   return true;
}
/* Run lower_ssbo_descriptor_instr over every intrinsic in the shader, handing
 * it the device as callback data.  Returns whatever
 * nir_shader_intrinsics_pass reports.
 */
static bool
tu_nir_lower_ssbo_descriptor(nir_shader *shader,
                             struct tu_device *dev)
{
   void *cb_data = (void *)dev;
   const bool progress =
      nir_shader_intrinsics_pass(shader, lower_ssbo_descriptor_instr,
                                 nir_metadata_control_flow, cb_data);

   return progress;
}
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
@ -2620,6 +2659,11 @@ tu_shader_create(struct tu_device *dev,
ir3_finalize_nir(dev->compiler, &nir_options, nir);
/* This has to happen after finalizing, so that we know the final bitsize
* after vectorizing.
*/
NIR_PASS(_, nir, tu_nir_lower_ssbo_descriptor, dev);
const struct ir3_shader_options options = {
.api_wavesize = key->api_wavesize,
.real_wavesize = key->real_wavesize,