Merge branch 'nir-licm2' into 'main'

nir/opt_licm: use nir_instr_can_speculate, hoist from multiple levels of nested loops, add filter_block & filter_instr callbacks, add tests

See merge request mesa/mesa!38823
This commit is contained in:
Marek Olšák 2025-12-20 00:51:05 +00:00
commit 40e90e227c
8 changed files with 383 additions and 39 deletions

View file

@ -1050,3 +1050,17 @@ ac_nir_opt_vectorize_cb(const nir_instr *instr, const void *data)
return target_width;
}
bool
ac_nir_opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit, unsigned num_dst_bits,
                                unsigned num_all_src_bits, nir_loop *loop)
{
   /* Instructions that execute on every iteration, and anything that isn't
    * ALU, are always allowed to be hoisted.
    */
   if (instr_dominates_exit || instr->type != nir_instr_type_alu)
      return true;

   /* Speculated ALU: only hoist when the result is at least 64 bits smaller
    * than the combined sources, i.e. hoisting shrinks the values kept live
    * across the loop. This heuristic reduces spilling. Note that while this
    * only seems to apply to ALU, any ALU that's hoisted potentially enables
    * hoisting intrinsics using it, so this really affects all instructions.
    */
   return num_dst_bits + 64 < num_all_src_bits;
}

View file

@ -447,6 +447,10 @@ ac_nir_allow_offset_wrap_cb(nir_intrinsic_instr *instr, const void *data);
bool
ac_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu);
bool
ac_nir_opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit, unsigned num_dst_bits,
unsigned num_all_src_bits, nir_loop *loop);
uint8_t
ac_nir_opt_vectorize_cb(const nir_instr *instr, const void *data);

View file

@ -368,7 +368,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
nir_move_options sink_opts = nir_move_const_undef | nir_move_copies | nir_dont_move_byte_word_vecs;
if (!stage->key.optimisations_disabled) {
NIR_PASS(_, stage->nir, nir_opt_licm);
NIR_PASS(_, stage->nir, nir_opt_licm, ac_nir_opt_licm_filter_instr_cb);
if (stage->stage == MESA_SHADER_VERTEX) {
/* Always load all VS inputs at the top to eliminate needless VMEM->s_wait->VMEM sequences.

View file

@ -425,6 +425,7 @@ if with_tests
'tests/mod_analysis_tests.cpp',
'tests/negative_equal_tests.cpp',
'tests/opt_if_tests.cpp',
'tests/opt_licm_tests.cpp',
'tests/opt_loop_tests.cpp',
'tests/opt_peephole_select.cpp',
'tests/opt_shrink_vectors_tests.cpp',

View file

@ -4746,6 +4746,18 @@ nir_block *nir_cf_node_cf_tree_prev(nir_cf_node *node);
block != nir_cf_node_cf_tree_prev(node); \
block = prev, prev = nir_block_cf_tree_prev(block))
/* Returns the block immediately preceding the loop in CF-tree order, i.e.
 * the loop's preheader — the target block for hoisted instructions.
 */
static inline nir_block *
nir_loop_predecessor_block(nir_loop *loop)
{
   return nir_cf_node_cf_tree_prev(&loop->cf_node);
}
/* Returns the block immediately following the loop in CF-tree order — the
 * block control reaches after leaving the loop (used for dominance checks).
 */
static inline nir_block *
nir_loop_successor_block(nir_loop *loop)
{
   return nir_cf_node_cf_tree_next(&loop->cf_node);
}
/* If the following CF node is an if, this function returns that if.
* Otherwise, it returns NULL.
*/
@ -6458,7 +6470,14 @@ bool nir_opt_large_constants(nir_shader *shader,
glsl_type_size_align_func size_align,
unsigned threshold);
bool nir_opt_licm(nir_shader *shader);
typedef bool (*nir_opt_licm_filter_instr_cb)(nir_instr *instr,
bool instr_dominates_exit,
unsigned num_dst_bits,
unsigned num_all_src_bits,
nir_loop *loop);
bool nir_opt_licm(nir_shader *shader,
nir_opt_licm_filter_instr_cb filter_instr);
bool nir_opt_loop(nir_shader *shader);
bool nir_opt_loop_unroll(nir_shader *shader);

View file

@ -5,30 +5,69 @@
#include "nir.h"
/* Traversal state shared by the LICM walk over a shader. */
typedef struct {
   /* Optional driver callback that can veto hoisting of an individual
    * instruction (e.g. for register-pressure reasons); may be NULL.
    */
   nir_opt_licm_filter_instr_cb filter_instr;
   /* Loop currently being optimized, or NULL when outside any loop. */
   nir_loop *loop;
   /* Whether the block being visited dominates the loop's successor block
    * (i.e. its instructions run on every iteration that completes).
    */
   bool current_block_dominates_exit;
   /* Total bit count of all sources of the instruction under test; reset
    * before each nir_foreach_src walk, accumulated by defined_before_loop.
    */
   unsigned num_all_src_bits;
} licm_state;
/* nir_foreach_src callback: accumulates the total source bit count and
 * returns true iff this source's def is created before the current loop.
 *
 * NOTE(review): this diff view interleaves removed and added lines — the
 * `loop_preheader_idx` lines are the pre-change variant.
 */
static bool
defined_before_loop(nir_src *src, void *state)
defined_before_loop(nir_src *src, void *_state)
{
   unsigned *loop_preheader_idx = state;
   return nir_def_block(src->ssa)->index <= *loop_preheader_idx;
   licm_state *state = (licm_state *)_state;
   /* Sum bits over all sources for the filter_instr heuristic. */
   state->num_all_src_bits += src->ssa->bit_size * src->ssa->num_components;
   /* The current instruction is loop-invariant only if its sources are before
    * the loop.
    */
   return nir_def_block(src->ssa)->index <=
          nir_loop_predecessor_block(state->loop)->index;
}
/* Returns whether instr may be hoisted out of state->loop: all sources must
 * be defined before the loop, the move must be safe (it either always
 * executes and is reorderable, or is speculatable), and the optional
 * filter_instr callback must approve it.
 *
 * NOTE(review): removed diff lines (the `loop_preheader_idx` variant and the
 * old intrinsic handling) are interleaved with the added ones in this view.
 */
static bool
is_instr_loop_invariant(nir_instr *instr, unsigned loop_preheader_idx)
is_instr_loop_invariant(nir_instr *instr, licm_state *state)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
   case nir_instr_type_undef:
      /* Constants and undefs are trivially invariant. */
      return true;
   case nir_instr_type_intrinsic:
      if (!nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
         return false;
      FALLTHROUGH;
   case nir_instr_type_alu:
   case nir_instr_type_tex:
   case nir_instr_type_deref:
      return nir_foreach_src(instr, defined_before_loop, &loop_preheader_idx);
   case nir_instr_type_intrinsic: {
      /* An instruction can be hoisted if it either dominates the exit (i.e.
       * it always executes) and is reorderable, or is speculatable.
       */
      if (state->current_block_dominates_exit) {
         if (instr->type == nir_instr_type_intrinsic &&
             !nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
            return false;
      } else {
         if (!nir_instr_can_speculate(instr))
            return false;
      }
      /* Reset the accumulator; defined_before_loop adds each source's bits. */
      state->num_all_src_bits = 0;
      bool invariant = nir_foreach_src(instr, defined_before_loop, state);
      if (!invariant)
         return false;
      /* Give the driver a chance to veto this hoist. */
      if (state->filter_instr) {
         nir_def *def = nir_instr_def(instr);
         if (!state->filter_instr(instr, state->current_block_dominates_exit,
                                  def->bit_size * def->num_components,
                                  state->num_all_src_bits, state->loop))
            return false;
      }
      return true;
   }
   case nir_instr_type_phi:
   case nir_instr_type_call:
@ -39,13 +78,17 @@ is_instr_loop_invariant(nir_instr *instr, unsigned loop_preheader_idx)
}
/* Hoists every loop-invariant instruction in block to the preheader of
 * state->loop; returns whether anything moved.
 *
 * NOTE(review): the `preheader`-parameter lines are the removed diff variant.
 */
static bool
visit_block(nir_block *block, nir_block *preheader)
visit_block(nir_block *block, licm_state *state)
{
   /* Instructions in a block dominating the loop's successor execute on
    * every completed iteration, so they don't need to be speculatable.
    */
   state->current_block_dominates_exit =
      nir_block_dominates(block, nir_loop_successor_block(state->loop));
   bool progress = false;
   nir_foreach_instr_safe(instr, block) {
      if (is_instr_loop_invariant(instr, preheader->index)) {
      if (is_instr_loop_invariant(instr, state)) {
         nir_instr_remove(instr);
         /* Re-insert at the end of the loop's preheader. */
         nir_instr_insert_after_block(preheader, instr);
         nir_instr_insert_after_block(nir_loop_predecessor_block(state->loop),
                                      instr);
         progress = true;
      }
   }
@ -80,40 +123,62 @@ should_optimize_loop(nir_loop *loop)
}
/* Recursively walks a CF list, doing LICM for state->loop (the loop whose
 * body/continue list is being visited). Nested loops are visited innermost
 * first, so instructions can be hoisted across multiple loop levels in a
 * single pass invocation.
 *
 * NOTE(review): removed diff lines (preheader/exit parameters, the old
 * block-visiting comment, and the old nir_cf_node_loop case) are interleaved
 * with the added lines in this view.
 */
static bool
visit_cf_list(struct exec_list *list, nir_block *preheader, nir_block *exit)
visit_cf_list(struct exec_list *list, licm_state *state)
{
   bool progress = false;
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block: {
         /* (removed comment) By only visiting blocks which dominate the loop
          * exit, we ensure that we don't speculatively hoist any instructions
          * which otherwise might not be executed.
          *
          * Note, that the proper check would be whether this block
          * postdominates the loop preheader.
          */
         nir_cf_node *next = nir_cf_node_next(node);
         bool optimize_loop = false;
         /* If the next CF node is a loop that we optimize, visit it first
          * before visiting its predecessor block, so that any instructions
          * hoisted from this (potentially nested) loop are then considered
          * for hoisting from the outer loop as well. The goal is to hoist
          * instructions across all levels of nested loops.
          */
         if (next && next->type == nir_cf_node_loop) {
            nir_loop *inner_loop = nir_cf_node_as_loop(next);
            optimize_loop = should_optimize_loop(inner_loop);
            if (optimize_loop) {
               /* Temporarily make the nested loop the LICM target. */
               nir_loop *outer_loop = state->loop;
               state->loop = inner_loop;
               progress |= visit_cf_list(&inner_loop->body, state);
               progress |= visit_cf_list(&inner_loop->continue_list, state);
               state->loop = outer_loop;
            }
         }
         /* Visit the block. */
         nir_block *block = nir_cf_node_as_block(node);
         if (exit && nir_block_dominates(block, exit))
            progress |= visit_block(block, preheader);
         /* state->loop is NULL at the top level, where there is no loop to
          * hoist out of.
          */
         if (state->loop)
            progress |= visit_block(block, state);
         if (next && next->type == nir_cf_node_loop && !optimize_loop) {
            nir_loop *loop = nir_cf_node_as_loop(next);
            /* We treat this loop like any other block, so we don't do LICM
             * from it per se, but if this loop is nested inside another
             * loop, we still do LICM for the outer loop.
             */
            progress |= visit_cf_list(&loop->body, state);
            progress |= visit_cf_list(&loop->continue_list, state);
         }
         break;
      }
      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);
         progress |= visit_cf_list(&nif->then_list, preheader, exit);
         progress |= visit_cf_list(&nif->else_list, preheader, exit);
         progress |= visit_cf_list(&nif->then_list, state);
         progress |= visit_cf_list(&nif->else_list, state);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         bool opt = should_optimize_loop(loop);
         nir_block *inner_preheader = opt ? nir_cf_node_cf_tree_prev(node) : preheader;
         nir_block *inner_exit = opt ? nir_cf_node_cf_tree_next(node) : exit;
         progress |= visit_cf_list(&loop->body, inner_preheader, inner_exit);
         progress |= visit_cf_list(&loop->continue_list, inner_preheader, inner_exit);
      case nir_cf_node_loop:
         /* All loops are handled when handling their predecessor block. */
         break;
      }
      case nir_cf_node_function:
         UNREACHABLE("NIR LICM: Unsupported cf_node type.");
      }
@ -123,17 +188,19 @@ visit_cf_list(struct exec_list *list, nir_block *preheader, nir_block *exit)
}
/* Pass entry point: runs loop-invariant code motion over every function in
 * the shader. filter_instr (may be NULL) lets the caller veto individual
 * hoists.
 *
 * NOTE(review): the no-callback signature and the old visit/progress calls
 * are removed diff lines interleaved below.
 */
bool
nir_opt_licm(nir_shader *shader)
nir_opt_licm(nir_shader *shader, nir_opt_licm_filter_instr_cb filter_instr)
{
   licm_state state = {filter_instr};
   bool progress = false;
   nir_foreach_function_impl(impl, shader) {
      /* Block indices and dominance are needed by the invariance checks. */
      nir_metadata_require(impl, nir_metadata_block_index |
                                 nir_metadata_dominance);
      bool impl_progress = visit_cf_list(&impl->body, NULL, NULL);
      progress |= nir_progress(impl_progress, impl,
                               nir_metadata_block_index | nir_metadata_dominance);
      /* Top level: not inside any loop yet. */
      state.loop = NULL;
      progress |= nir_progress(visit_cf_list(&impl->body, &state), impl,
                               nir_metadata_control_flow);
   }
   return progress;

View file

@ -0,0 +1,231 @@
/* Copyright 2025 Advanced Micro Devices, Inc.
* SPDX-License-Identifier: MIT
*/
#include "nir_test.h"
/* gtest fixture for nir_opt_licm: builds a compute shader containing a loop,
 * places one candidate instruction (r) inside it, runs the pass, and checks
 * whether r was hoisted to the loop's preheader.
 */
class nir_opt_licm_test : public nir_test {
protected:
   nir_opt_licm_test()
      : nir_test::nir_test("nir_opt_licm_test", MESA_SHADER_COMPUTE)
   {
   }
   nir_loop *loop;            /* outermost loop under test */
   nir_block *original_block; /* in-loop block where r is created */
   nir_def *x, *y, *z, *r;    /* x/y/z: pre-loop loads; r: candidate def */
   bool expect_progress;      /* whether the test expects r to be hoisted */
   bool insert_after_break;   /* emit the break before r (true) or after it */
   void test_init();
   void test_finish(nir_opt_licm_filter_instr_cb filter_instr);
};
/* Emits three loads before the loop (x, y, z), opens the loop, and — when
 * insert_after_break is set — emits the break first so the instruction the
 * test adds afterwards does not dominate the loop exit.
 */
void
nir_opt_licm_test::test_init()
{
   x = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
   y = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
   z = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
   loop = nir_push_loop(b);
   if (insert_after_break)
      nir_break_if(b, nir_undef(b, 1, 1));
   /* Remember where in-loop instructions land, for the no-progress check. */
   original_block = nir_loop_last_block(loop);
}
/* Filter callback mirroring the register-pressure heuristic: only allow a
 * hoist when the destination is no wider than all sources combined.
 */
static bool
filter_using_dst_src_bits(nir_instr *instr, bool instr_dominates_exit,
                          unsigned num_dst_bits, unsigned num_all_src_bits,
                          nir_loop *loop)
{
   if (num_dst_bits > num_all_src_bits)
      return false;
   return true;
}
/* Closes the loop (emitting the break last when insert_after_break is
 * false), runs nir_opt_licm with the given filter, and asserts that r either
 * moved to the loop's preheader or stayed in its original block, according
 * to expect_progress.
 */
void
nir_opt_licm_test::test_finish(nir_opt_licm_filter_instr_cb filter_instr)
{
   if (!insert_after_break)
      nir_break_if(b, nir_undef(b, 1, 1));
   nir_pop_loop(b, loop);
   nir_validate_shader(b->shader, NULL);
   bool progress = false;
   NIR_PASS(progress, b->shader, nir_opt_licm, filter_instr);
   if (expect_progress) {
      ASSERT_TRUE(progress);
      ASSERT_EQ(nir_def_instr(r)->block, nir_loop_predecessor_block(loop));
   } else {
      ASSERT_FALSE(progress);
      ASSERT_EQ(nir_def_instr(r)->block, original_block);
   }
}
TEST_F(nir_opt_licm_test, hoist_alu_unary)
{
   /* A unary ALU op on a loop-invariant value must be hoisted. */
   expect_progress = true;
   insert_after_break = true;
   test_init();
   r = nir_ineg(b, x);
   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, hoist_alu_binary)
{
   /* A binary ALU op whose operands are both loop-invariant must be hoisted. */
   expect_progress = true;
   insert_after_break = true;
   test_init();
   r = nir_iadd(b, x, y);
   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, skip_alu_u2u64)
{
   /* 32->64-bit widening: sizeof(dst) > sizeof(all srcs), so the filter
    * declines to hoist — that would increase register usage of the whole
    * loop.
    */
   expect_progress = false;
   insert_after_break = true;
   test_init();
   r = nir_u2u64(b, x);
   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, skip_load_ssbo_no_flags_before_break)
{
   /* An SSBO load without access flags is not reorderable, so it stays put
    * even though it executes on every iteration.
    */
   expect_progress = false;
   insert_after_break = false;
   test_init();
   r = nir_load_ssbo(b, 1, 32, x, y);
   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, hoist_load_ssbo_reorderable_before_break)
{
   /* Runs on every iteration and is marked reorderable -> hoistable. */
   expect_progress = true;
   insert_after_break = false;
   test_init();
   r = nir_load_ssbo(b, 1, 32, x, y);
   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
                            (gl_access_qualifier)ACCESS_CAN_REORDER);
   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, skip_load_ssbo_reorderable)
{
   /* After the break the load would be speculated; being reorderable alone
    * is not enough to allow that.
    */
   expect_progress = false;
   insert_after_break = true;
   test_init();
   r = nir_load_ssbo(b, 1, 32, x, y);
   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
                            (gl_access_qualifier)ACCESS_CAN_REORDER);
   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, skip_load_ssbo_speculatable)
{
   /* ACCESS_CAN_SPECULATE by itself must not be enough to hoist the load
    * past the break.
    */
   expect_progress = false;
   insert_after_break = true;
   test_init();
   r = nir_load_ssbo(b, 1, 32, x, y);
   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
                            (gl_access_qualifier)ACCESS_CAN_SPECULATE);
   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, hoist_load_ssbo_reorderable_speculatable)
{
   /* With both flags set, the speculated load may be hoisted. */
   expect_progress = true;
   insert_after_break = true;
   test_init();
   r = nir_load_ssbo(b, 1, 32, x, y);
   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
                            (gl_access_qualifier)(ACCESS_CAN_REORDER |
                                                  ACCESS_CAN_SPECULATE));
   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, hoist_alu_2_nested_loops)
{
   /* An invariant ALU op inside a nested loop must be hoisted out of both
    * loop levels, all the way to the outer loop's preheader.
    */
   expect_progress = true;
   insert_after_break = true;
   test_init();

   nir_loop *inner = nir_push_loop(b);
   nir_break_if(b, nir_undef(b, 1, 1));
   r = nir_ineg(b, x);
   nir_pop_loop(b, inner);

   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, hoist_alu_6_nested_loops)
{
   /* Six levels of nesting: the invariant op must still reach the outermost
    * preheader in a single pass invocation.
    */
   expect_progress = true;
   insert_after_break = true;
   test_init();

   nir_loop *inner[5];
   for (unsigned i = 0; i < ARRAY_SIZE(inner); i++) {
      inner[i] = nir_push_loop(b);
      nir_break_if(b, nir_undef(b, 1, 1));
   }
   r = nir_ineg(b, x);
   for (int i = ARRAY_SIZE(inner) - 1; i >= 0; i--)
      nir_pop_loop(b, inner[i]);

   test_finish(filter_using_dst_src_bits);
}
TEST_F(nir_opt_licm_test, skip_tex)
{
   /* A texture fetch placed after the break would be speculated; without
    * can_speculate it must not be hoisted.
    */
   expect_progress = false;
   insert_after_break = true;
   test_init();

   nir_tex_builder tb = {0};
   tb.coord = x;
   tb.texture_handle = y;
   tb.dest_type = nir_type_uint32;
   r = nir_build_tex_struct(b, nir_texop_tex, tb);

   test_finish(NULL);
}
TEST_F(nir_opt_licm_test, hoist_tex_before_break)
{
   /* A texture fetch that executes on every iteration may be hoisted even
    * without the can_speculate flag.
    */
   expect_progress = true;
   insert_after_break = false;
   test_init();

   nir_tex_builder tb = {0};
   tb.coord = x;
   tb.texture_handle = y;
   tb.dest_type = nir_type_uint32;
   r = nir_build_tex_struct(b, nir_texop_tex, tb);

   test_finish(NULL);
}
TEST_F(nir_opt_licm_test, hoist_tex_speculatable)
{
   /* With can_speculate set, the fetch may be hoisted past the break. */
   expect_progress = true;
   insert_after_break = true;
   test_init();

   nir_tex_builder tb = {0};
   tb.coord = x;
   tb.texture_handle = y;
   tb.can_speculate = true;
   tb.dest_type = nir_type_uint32;
   r = nir_build_tex_struct(b, nir_texop_tex, tb);

   test_finish(NULL);
}

View file

@ -786,6 +786,14 @@ static bool robustness_filter(const nir_intrinsic_instr *intr,
return false;
}
/* LICM filter: only hoist instructions that execute on every loop
 * iteration; never speculate.
 */
static bool
opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit,
                         unsigned num_dst_bits, unsigned num_all_src_bits,
                         nir_loop *loop)
{
   if (!instr_dominates_exit)
      return false;
   return true;
}
/**
* \brief Lowers a NIR shader.
*
@ -806,7 +814,7 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data)
NIR_PASS(_, nir, nir_lower_memory_model);
NIR_PASS(_, nir, nir_opt_licm);
NIR_PASS(_, nir, nir_opt_licm, opt_licm_filter_instr_cb);
NIR_PASS(_, nir, nir_lower_memcpy);