nir/loop_analyze: determine for all ALU whether it can be constant-folded

Totals from 16 (0.02% of 79839) affected shaders: (Navi48)

MaxWaves: 512 -> 464 (-9.38%)
Instrs: 11821 -> 17205 (+45.55%)
CodeSize: 60536 -> 86644 (+43.13%)
VGPRs: 732 -> 804 (+9.84%)
Latency: 68411 -> 39349 (-42.48%)
InvThroughput: 14217 -> 9306 (-34.54%)
VClause: 223 -> 302 (+35.43%)
SClause: 262 -> 317 (+20.99%)
Copies: 961 -> 696 (-27.58%); split: -39.23%, +11.65%
Branches: 182 -> 158 (-13.19%); split: -29.67%, +16.48%
PreSGPRs: 1210 -> 945 (-21.90%); split: -29.42%, +7.52%
PreVGPRs: 647 -> 633 (-2.16%)
VALU: 5112 -> 10857 (+112.38%)
SALU: 3215 -> 2335 (-27.37%); split: -30.67%, +3.30%
VMEM: 228 -> 349 (+53.07%)
SMEM: 567 -> 549 (-3.17%); split: -3.70%, +0.53%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38659>
This commit is contained in:
Daniel Schürmann 2025-11-25 11:18:37 +01:00 committed by Marge Bot
parent 1d6f85a207
commit 4997d8fb1b
2 changed files with 71 additions and 42 deletions

View file

@ -48,21 +48,19 @@ get_loop_var(nir_def *value, loop_info_state *state)
return NULL;
}
/* If a condition is a comparision between a constant and
* a basic induction variable we know that it will be eliminated once
* the loop is unrolled.
/* If an instruction only depends on basic induction variables
* and constants, we know that it will be eliminated once the
* loop is unrolled.
*/
static bool
condition_can_constant_fold(loop_info_state *state, nir_scalar cond_scalar)
is_const_after_unrolling(loop_info_state *state, nir_def *def)
{
nir_scalar lhs = nir_scalar_chase_alu_src(cond_scalar, 0);
nir_scalar rhs = nir_scalar_chase_alu_src(cond_scalar, 1);
nir_instr *instr = nir_def_instr(def);
if (instr->pass_flags == 0)
return false;
if (nir_scalar_is_const(lhs) && get_loop_var(rhs.def, state))
return true;
if (nir_scalar_is_const(rhs) && get_loop_var(lhs.def, state))
return true;
return false;
/* The pass flags are only correct within the loop. */
return instr->block->index >= nir_loop_first_block(state->loop)->index;
}
/** Calculate an estimated cost in number of instructions
@ -87,29 +85,15 @@ instr_cost(loop_info_state *state, nir_instr *instr,
const nir_op_info *info = &nir_op_infos[alu->op];
unsigned cost = 1;
if (nir_op_is_selection(alu->op)) {
bool can_constant_fold = true;
for (unsigned i = 0; can_constant_fold && i < alu->def.num_components; i++) {
nir_scalar cond_scalar = nir_scalar_chase_alu_src(nir_get_scalar(&alu->def, i), 0);
can_constant_fold &= nir_is_terminator_condition_with_two_inputs(cond_scalar) &&
condition_can_constant_fold(state, cond_scalar);
}
/* Check if this instruction can be constant-folded after unrolling. */
if (is_const_after_unrolling(state, &alu->def))
return 0;
if (nir_op_is_selection(alu->op) && is_const_after_unrolling(state, alu->src[0].src.ssa)) {
/* If the condition can be constant folded after the loop is unrolled,
* so can the selection.
*/
if (can_constant_fold)
return 0;
} else if (nir_alu_instr_is_comparison(alu) &&
nir_op_infos[alu->op].num_inputs == 2) {
bool can_constant_fold = true;
for (unsigned i = 0; can_constant_fold && i < alu->def.num_components; i++) {
nir_scalar cond_scalar = nir_get_scalar(&alu->def, i);
can_constant_fold &= condition_can_constant_fold(state, cond_scalar);
}
if (can_constant_fold)
return 0;
return 0;
} else if (nir_op_is_vec_or_mov(alu->op)) {
/* movs and vecs are likely free. */
return 0;
@ -1386,6 +1370,57 @@ force_unroll_heuristics(loop_info_state *state, nir_block *block)
return false;
}
static void
gather_constant_fold_info(loop_info_state *state, nir_instr *instr)
{
instr->pass_flags = 0;
/* Loop induction variables with constant initializer and constant
* update source get constant-folded when the loop is being unrolled.
*/
if (instr->type == nir_instr_type_phi &&
instr->block == nir_loop_first_block(state->loop)) {
nir_loop_induction_variable *var = get_loop_var(nir_instr_def(instr), state);
instr->pass_flags = var && nir_def_is_const(var->init_src->ssa) &&
nir_def_is_const(var->update_src->src.ssa);
}
if (instr->type != nir_instr_type_alu)
return;
/* ALU instruction which only depend on constants and constant-foldable
* sources, can also be constant-folded.
*/
nir_alu_instr *alu = nir_instr_as_alu(instr);
for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
if(!nir_src_is_const(alu->src[i].src) &&
!is_const_after_unrolling(state, alu->src[i].src.ssa))
return;
}
instr->pass_flags = 1;
}
static void
gather_unroll_heuristic_info(loop_info_state *state, const nir_shader_compiler_options *options)
{
nir_foreach_block_in_cf_node(block, &state->loop->cf_node) {
/* Calculate instruction cost. */
nir_foreach_instr(instr, block) {
gather_constant_fold_info(state, instr);
state->loop->info->instr_cost += instr_cost(state, instr, options);
}
if (state->loop->info->force_unroll)
continue;
if (force_unroll_heuristics(state, block)) {
state->loop->info->force_unroll = true;
}
}
}
static void
get_loop_info(loop_info_state *state, nir_function_impl *impl)
{
@ -1413,18 +1448,7 @@ get_loop_info(loop_info_state *state, nir_function_impl *impl)
impl->function->shader->info.float_controls_execution_mode,
impl->function->shader->options->max_unroll_iterations);
nir_foreach_block_in_cf_node(block, &state->loop->cf_node) {
nir_foreach_instr(instr, block) {
state->loop->info->instr_cost += instr_cost(state, instr, options);
}
if (state->loop->info->force_unroll)
continue;
if (force_unroll_heuristics(state, block)) {
state->loop->info->force_unroll = true;
}
}
gather_unroll_heuristic_info(state, options);
}
static void
@ -1484,6 +1508,7 @@ nir_loop_analyze_impl(nir_function_impl *impl,
bool force_unroll_sampler_indirect)
{
struct hash_table *range_ht = _mesa_pointer_hash_table_create(NULL);
nir_metadata_require(impl, nir_metadata_block_index);
foreach_list_typed(nir_cf_node, node, node, &impl->body)
process_loops(node, indirect_mask, force_unroll_sampler_indirect, range_ht);

View file

@ -106,3 +106,7 @@ dEQP-VK.memory.mapping.dedicated_alloc.buffer.full.variable.implicit_unmap
dEQP-VK.memory.mapping.dedicated_alloc.buffer.full.variable.implicit_unmap_map2
dEQP-VK.memory.mapping.dedicated_alloc.image.full.variable.implicit_unmap
dEQP-VK.memory.mapping.dedicated_alloc.image.full.variable.implicit_unmap_map2
# Generates too many immediates due to loop unrolling
KHR-GLES31.core.geometry_shader.limits.max_output_components
KHR-GL46.geometry_shader.limits.max_output_components