From f4812dc11d48204091260914d481784eb55a75eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?=
Date: Tue, 31 Mar 2026 10:51:59 +0200
Subject: [PATCH] nir/opt_constant_folding: constant-fold op(bcsel(), #c) -> bcsel(.., #c1, #c2)

for all ALU instructions except fneg instead of using nir_opt_algebraic
for a small subset.

Totals from 17711 (8.49% of 208640) affected shaders: (Navi48)
MaxWaves: 364391 -> 364397 (+0.00%); split: +0.01%, -0.01%
Instrs: 33873994 -> 33780398 (-0.28%); split: -0.31%, +0.03%
CodeSize: 198627596 -> 198259724 (-0.19%); split: -0.23%, +0.05%
VGPRs: 1435516 -> 1435144 (-0.03%); split: -0.04%, +0.02%
SpillSGPRs: 652827 -> 654577 (+0.27%); split: -0.00%, +0.27%
SpillVGPRs: 594840 -> 593598 (-0.21%); split: -0.28%, +0.07%
Scratch: 31791360 -> 31543552 (-0.78%)
Latency: 417824569 -> 415881858 (-0.46%); split: -0.48%, +0.02%
InvThroughput: 80376232 -> 80307996 (-0.08%); split: -0.10%, +0.01%
VClause: 557238 -> 554770 (-0.44%); split: -0.50%, +0.06%
SClause: 688297 -> 688125 (-0.02%); split: -0.04%, +0.02%
Copies: 3571756 -> 3566704 (-0.14%); split: -0.44%, +0.29%
Branches: 628710 -> 628576 (-0.02%); split: -0.07%, +0.05%
PreSGPRs: 1100316 -> 1103478 (+0.29%); split: -0.02%, +0.30%
PreVGPRs: 1132139 -> 1128765 (-0.30%); split: -0.30%, +0.00%
VALU: 18944830 -> 18912030 (-0.17%); split: -0.20%, +0.03%
SALU: 4363054 -> 4342748 (-0.47%); split: -0.57%, +0.10%
VMEM: 1894420 -> 1891754 (-0.14%); split: -0.19%, +0.05%
SMEM: 1073860 -> 1073741 (-0.01%); split: -0.01%, +0.00%
VOPD: 1734659 -> 1735718 (+0.06%); split: +0.20%, -0.14%

Reviewed-by: Georg Lehmann
Part-of:
---
 src/compiler/nir/nir_opt_constant_folding.c | 81 +++++++++++++++++++--
 src/compiler/nir/nir_search.c               | 10 +++
 2 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c
index 134ec171fd8..8526e175ce9 100644
--- a/src/compiler/nir/nir_opt_constant_folding.c
+++ b/src/compiler/nir/nir_opt_constant_folding.c
@@ -55,10 +55,41 @@ const_value_for_alu(nir_builder *b, nir_alu_instr *alu, unsigned bit_size,
                         dest);
 }
 
+static bool
+is_bcsel_with_two_constants(nir_alu_instr *bcsel)
+{
+   return bcsel && bcsel->op == nir_op_bcsel &&
+          bcsel->def.num_components == 1 &&
+          bcsel->src[0].swizzle[0] == 0 &&
+          bcsel->src[0].src.ssa->num_components == 1 &&
+          nir_src_is_const(bcsel->src[1].src) &&
+          nir_src_is_const(bcsel->src[2].src);
+}
+
+static bool
+should_fold_bcsel(nir_alu_instr *alu)
+{
+   /* Don't fold bcsel if the resulting bit size is larger than 32 bit
+    * as these commonly require two instructions.
+    */
+   if (alu->def.bit_size > 32)
+      return false;
+
+   /* Don't fight with nir_lower_load_const_to_scalar. */
+   if (nir_op_is_vec_or_mov(alu->op))
+      return false;
+
+   /* Make an exception for fneg, because in many cases it can be
+    * folded with the next instruction.
+    */
+   return alu->op != nir_op_fneg;
+}
+
 nir_def *
 nir_try_constant_fold_alu(nir_builder *b, nir_alu_instr *alu)
 {
    nir_const_value src[NIR_ALU_MAX_INPUTS][NIR_MAX_VEC_COMPONENTS];
+   nir_def *bcsel = NULL;
 
    /* In the case that any outputs/inputs have unsized types, then we need to
     * guess the bit-size. In this case, the validator ensures that all
@@ -79,19 +110,57 @@ nir_try_constant_fold_alu(nir_builder *b, nir_alu_instr *alu)
          bit_size = alu->src[i].src.ssa->bit_size;
 
       nir_load_const_instr *load_const = nir_src_as_load_const(alu->src[i].src);
-      if (!load_const)
-         return NULL;
 
-      for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(alu, i);
-           j++) {
-         src[i][j] = load_const->value[alu->src[i].swizzle[j]];
+      if (load_const) {
+         for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(alu, i); j++)
+            src[i][j] = load_const->value[alu->src[i].swizzle[j]];
+         continue;
+      }
+
+      /* Check if the source is a bcsel with two constants. */
+      nir_alu_instr *bcsel_alu = nir_src_as_alu(alu->src[i].src);
+      if (should_fold_bcsel(alu) && is_bcsel_with_two_constants(bcsel_alu)) {
+         /* If there are multiple bcsel sources, they must use the same condition. */
+         if (bcsel && bcsel_alu->src[0].src.ssa != bcsel)
+            return NULL;
+
+         bcsel = bcsel_alu->src[0].src.ssa;
+
+         /* Use the first bcsel constant (src[1]) for the then-value fold. */
+         load_const = nir_src_as_load_const(bcsel_alu->src[1].src);
+         for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(alu, i); j++)
+            src[i][j] = load_const->value[bcsel_alu->src[1].swizzle[alu->src[i].swizzle[j]]];
+      } else {
+         return NULL;
       }
    }
 
    if (bit_size == 0)
       bit_size = 32;
 
-   return const_value_for_alu(b, alu, bit_size, src);
+   /* If all sources are constant, we can fold the ALU. */
+   if (!bcsel)
+      return const_value_for_alu(b, alu, bit_size, src);
+
+   /* At least one source is a bcsel with two constants. Fold the ALU twice
+    * and create a new bcsel, selecting between the folded values.
+    */
+   nir_def *then_const = const_value_for_alu(b, alu, bit_size, src);
+
+   /* Switch every bcsel source to its second constant (src[2]) for the else-value fold. */
+   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
+      nir_alu_instr *bcsel_alu = nir_src_as_alu(alu->src[i].src);
+      if (!bcsel_alu)
+         continue;
+
+      nir_load_const_instr *load_const = nir_src_as_load_const(bcsel_alu->src[2].src);
+      for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(alu, i); j++) {
+         src[i][j] = load_const->value[bcsel_alu->src[2].swizzle[alu->src[i].swizzle[j]]];
+      }
+   }
+   nir_def *else_const = const_value_for_alu(b, alu, bit_size, src);
+
+   return nir_bcsel(b, bcsel, then_const, else_const);
 }
 
 static nir_const_value *
diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index 403e5fc08ae..bddfd7f365d 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -25,6 +25,7 @@
 #include <inttypes.h>
 #include "util/half_float.h"
 #include "nir_builder.h"
+#include "nir_opcodes.h"
 #include "nir_worklist.h"
 
 /* This should be the same as nir_search_max_comm_ops in nir_algebraic.py. */
@@ -441,6 +442,15 @@ construct_value(nir_builder *build,
       if (const_expr) {
          nir_instr_free(&alu->instr);
          def = const_expr;
+         if (nir_def_is_alu(def)) {
+            /* The instruction got folded into bcsel of two constants. */
+            nir_alu_instr *bcsel = nir_def_as_alu(def);
+            assert(bcsel->op == nir_op_bcsel);
+            util_dynarray_append_typed(state->states, uint16_t, 0);
+            nir_algebraic_automaton(nir_src_parent_instr(&bcsel->src[1].src), state->states, state->pass_op_table);
+            util_dynarray_append_typed(state->states, uint16_t, 0);
+            nir_algebraic_automaton(nir_src_parent_instr(&bcsel->src[2].src), state->states, state->pass_op_table);
+         }
       } else {
          nir_builder_instr_insert(build, &alu->instr);
       }