ac/nir/ngg: Improve reuse of position value.

Instead of hand-rolled code, use nir_scalar and its
helper functions to reuse the position value.
This results in more copies, which are mitigated by the
copy propagation from the previous commit.

This helps eliminate some instructions, especially VMEM loads
in the deferred part of NGG culling shaders: the position values
calculated by the non-deferred part can be reused there instead
of being reloaded.
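
For context, the nir_scalar based rewrite boils down to the sketch
below. This is only an illustration, not code from the commit: the
helper name reuse_pos_component is made up here, while store_val,
position_value_var and store_pos_component follow the names used in
the patch, and the NIR builder API from nir_builder.h is assumed.

/* For one component of the position store, chase through mov/vec
 * instructions to the scalar that defines it, then replace all uses of
 * that scalar with the matching channel of the saved position value.
 */
static void
reuse_pos_component(nir_builder *b, nir_def *store_val, unsigned comp,
                    nir_variable *position_value_var,
                    unsigned store_pos_component)
{
   nir_scalar val = nir_scalar_chase_movs(nir_get_scalar(store_val, comp));
   if (val.def->parent_instr->type == nir_instr_type_load_const)
      return;

   /* Reload the saved position value right after the defining instruction. */
   b->cursor = nir_after_instr_and_phis(val.def->parent_instr);
   nir_def *pos_val = nir_load_var(b, position_value_var);
   nir_scalar rep = nir_get_scalar(pos_val, store_pos_component + comp);

   /* NIR can only rewrite the uses of a whole def, so rebuild the vector
    * with just the matching component swapped out.
    */
   nir_def *channels[NIR_MAX_VEC_COMPONENTS] = {0};
   for (unsigned c = 0; c < val.def->num_components; ++c) {
      nir_scalar src = c == val.comp ? rep : nir_get_scalar(val.def, c);
      channels[c] = nir_channel(b, src.def, src.comp);
   }
   nir_def *replacement = nir_vec(b, channels, val.def->num_components);
   nir_def_rewrite_uses_after(val.def, replacement, replacement->parent_instr);
}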

Fossil DB stats on Navi 21:

Totals from 2472 (3.11% of 79377) affected shaders:
MaxWaves: 78748 -> 78772 (+0.03%)
Instrs: 636342 -> 633739 (-0.41%); split: -0.45%, +0.04%
CodeSize: 3444740 -> 3427172 (-0.51%); split: -0.53%, +0.02%
VGPRs: 62552 -> 62176 (-0.60%)
Latency: 2025711 -> 2019449 (-0.31%); split: -0.73%, +0.42%
InvThroughput: 221140 -> 221946 (+0.36%); split: -0.12%, +0.49%
VClause: 5443 -> 5278 (-3.03%); split: -3.20%, +0.17%
SClause: 8369 -> 8302 (-0.80%); split: -0.82%, +0.02%
Copies: 102435 -> 101652 (-0.76%); split: -0.87%, +0.11%
PreSGPRs: 63714 -> 63533 (-0.28%)
PreVGPRs: 48555 -> 48392 (-0.34%)
VALU: 242165 -> 241457 (-0.29%); split: -0.33%, +0.04%
SALU: 197656 -> 197482 (-0.09%); split: -0.10%, +0.01%
VMEM: 7746 -> 7571 (-2.26%)
SMEM: 10822 -> 10730 (-0.85%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22073>
Author: Timur Kristóf, 2025-03-20 14:42:30 +01:00 (committed by Marge Bot)
Parent: f7a160d501
Commit: 1e7d28a82e

@@ -400,42 +400,29 @@ remove_culling_shader_outputs(nir_shader *culling_shader, lower_ngg_nogs_state *
 }
 
 static void
-rewrite_uses_to_var(nir_builder *b, nir_def *old_def, nir_variable *replacement_var, unsigned replacement_var_channel)
+replace_scalar_component_uses(nir_builder *b, nir_scalar old, nir_scalar rep)
 {
-   if (old_def->parent_instr->type == nir_instr_type_load_const)
+   if (old.def->parent_instr->type == nir_instr_type_load_const)
       return;
 
-   b->cursor = nir_after_instr(old_def->parent_instr);
-   if (b->cursor.instr->type == nir_instr_type_phi)
-      b->cursor = nir_after_phis(old_def->parent_instr->block);
+   assert(old.def->bit_size == rep.def->bit_size);
 
-   nir_def *pos_val_rep = nir_load_var(b, replacement_var);
-   nir_def *replacement = nir_channel(b, pos_val_rep, replacement_var_channel);
-   if (old_def->num_components > 1) {
-      /* old_def uses a swizzled vector component.
-       * There is no way to replace the uses of just a single vector component,
-       * so instead create a new vector and replace all uses of the old vector.
-       */
-      nir_def *old_def_elements[NIR_MAX_VEC_COMPONENTS] = {0};
-      for (unsigned j = 0; j < old_def->num_components; ++j)
-         old_def_elements[j] = nir_channel(b, old_def, j);
-      replacement = nir_vec(b, old_def_elements, old_def->num_components);
+   nir_def *dst[NIR_MAX_VEC_COMPONENTS] = {0};
+   for (unsigned dst_comp = 0; dst_comp < old.def->num_components; ++dst_comp) {
+      nir_scalar old_dst = nir_get_scalar(old.def, dst_comp);
+      nir_scalar new_dst = dst_comp == old.comp ? rep : old_dst;
+      dst[dst_comp] = nir_channel(b, new_dst.def, new_dst.comp);
    }
-   nir_def_rewrite_uses_after(old_def, replacement, replacement->parent_instr);
+   nir_def *replacement = nir_vec(b, dst, old.def->num_components);
+   nir_def_rewrite_uses_after(old.def, replacement, replacement->parent_instr);
 }
 
 static bool
-remove_extra_pos_output(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
+apply_repacked_pos_output(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
 {
    lower_ngg_nogs_state *s = (lower_ngg_nogs_state *) state;
 
    /* These are not allowed in VS / TES */
    assert(intrin->intrinsic != nir_intrinsic_store_per_vertex_output &&
           intrin->intrinsic != nir_intrinsic_load_per_vertex_input);
 
    /* We are only interested in output stores now */
    if (intrin->intrinsic != nir_intrinsic_store_output)
       return false;
@@ -443,8 +430,6 @@ remove_extra_pos_output(nir_builder *b, nir_intrinsic_instr *intrin, void *state
    if (io_sem.location != VARYING_SLOT_POS)
       return false;
 
-   b->cursor = nir_before_instr(&intrin->instr);
-
    /* In case other outputs use what we calculated for pos,
     * try to avoid calculating it again by rewriting the usages
     * of the store components here.
@@ -452,47 +437,21 @@ remove_extra_pos_output(nir_builder *b, nir_intrinsic_instr *intrin, void *state
    nir_def *store_val = intrin->src[0].ssa;
    unsigned store_pos_component = nir_intrinsic_component(intrin);
 
    nir_instr_remove(&intrin->instr);
 
+   for (unsigned comp = 0; comp < store_val->num_components; ++comp) {
+      nir_scalar val = nir_scalar_chase_movs(nir_get_scalar(store_val, comp));
+      b->cursor = nir_after_instr_and_phis(val.def->parent_instr);
+      nir_def *reloaded = nir_load_var(b, s->position_value_var);
-   if (store_val->parent_instr->type == nir_instr_type_alu) {
-      nir_alu_instr *alu = nir_instr_as_alu(store_val->parent_instr);
-      if (nir_op_is_vec_or_mov(alu->op)) {
-         /* Output store uses a vector, we can easily rewrite uses of each vector element. */
-         unsigned num_vec_src = 0;
-         if (alu->op == nir_op_mov)
-            num_vec_src = 1;
-         else if (alu->op == nir_op_vec2)
-            num_vec_src = 2;
-         else if (alu->op == nir_op_vec3)
-            num_vec_src = 3;
-         else if (alu->op == nir_op_vec4)
-            num_vec_src = 4;
-         assert(num_vec_src);
-
-         /* Remember the current components whose uses we wish to replace.
-          * This is needed because rewriting one source can affect the others too.
-          */
-         nir_def *vec_comps[NIR_MAX_VEC_COMPONENTS] = {0};
-         for (unsigned i = 0; i < num_vec_src; i++)
-            vec_comps[i] = alu->src[i].src.ssa;
-         for (unsigned i = 0; i < num_vec_src; i++)
-            rewrite_uses_to_var(b, vec_comps[i], s->position_value_var, store_pos_component + i);
-      } else {
-         rewrite_uses_to_var(b, store_val, s->position_value_var, store_pos_component);
-      }
-   } else {
-      rewrite_uses_to_var(b, store_val, s->position_value_var, store_pos_component);
+      replace_scalar_component_uses(b, val, nir_get_scalar(reloaded, store_pos_component + comp));
    }
 
    return true;
 }
 
 static void
-remove_extra_pos_outputs(nir_shader *shader, lower_ngg_nogs_state *s)
+apply_repacked_pos_outputs(nir_shader *shader, lower_ngg_nogs_state *s)
 {
-   nir_shader_intrinsics_pass(shader, remove_extra_pos_output,
+   nir_shader_intrinsics_pass(shader, apply_repacked_pos_output,
                               nir_metadata_control_flow, s);
 }
@@ -1809,18 +1768,8 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
       /* Replace uniforms. */
       apply_reusable_variables(b, &state);
 
-      /* Remove the redundant position output. */
-      remove_extra_pos_outputs(shader, &state);
-
-      /* After looking at the performance in apps eg. Doom Eternal, and The Witcher 3,
-       * it seems that it's best to put the position export always at the end, and
-       * then let ACO schedule it up (slightly) only when early prim export is used.
-       */
-      b->cursor = nir_after_cf_list(&if_es_thread->then_list);
-      nir_def *pos_val = nir_load_var(b, state.position_value_var);
-      for (int i = 0; i < 4; i++)
-         state.out.outputs[VARYING_SLOT_POS][i] = nir_channel(b, pos_val, i);
+      /* Reuse the position value calculated in the non-deferred shader part. */
+      apply_repacked_pos_outputs(shader, &state);
    }
 
    /* Gather outputs data and types */