diff --git a/src/amd/common/nir/ac_nir.c b/src/amd/common/nir/ac_nir.c
index b652dfcf996..e3b460647fe 100644
--- a/src/amd/common/nir/ac_nir.c
+++ b/src/amd/common/nir/ac_nir.c
@@ -1050,3 +1050,17 @@ ac_nir_opt_vectorize_cb(const nir_instr *instr, const void *data)
 
    return target_width;
 }
+
+bool
+ac_nir_opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit, unsigned num_dst_bits,
+                                unsigned num_all_src_bits, nir_loop *loop)
+{
+   /* This heuristic reduces spilling. While the check itself only applies to
+    * ALU, any hoisted ALU potentially enables hoisting the intrinsics that
+    * use it, so it effectively affects all instruction types.
+    */
+   if (!instr_dominates_exit && instr->type == nir_instr_type_alu)
+      return num_dst_bits + 64 < num_all_src_bits;
+
+   return true;
+}
diff --git a/src/amd/common/nir/ac_nir.h b/src/amd/common/nir/ac_nir.h
index 2ab3e10a1f5..223349a713f 100644
--- a/src/amd/common/nir/ac_nir.h
+++ b/src/amd/common/nir/ac_nir.h
@@ -447,6 +447,10 @@ ac_nir_allow_offset_wrap_cb(nir_intrinsic_instr *instr, const void *data);
 
 bool
 ac_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu);
 
+bool
+ac_nir_opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit, unsigned num_dst_bits,
+                                unsigned num_all_src_bits, nir_loop *loop);
+
 uint8_t
 ac_nir_opt_vectorize_cb(const nir_instr *instr, const void *data);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 3f39e28e01e..00ece257739 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -368,7 +368,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
    nir_move_options sink_opts = nir_move_const_undef | nir_move_copies | nir_dont_move_byte_word_vecs;
 
    if (!stage->key.optimisations_disabled) {
-      NIR_PASS(_, stage->nir, nir_opt_licm);
+      NIR_PASS(_, stage->nir, nir_opt_licm, ac_nir_opt_licm_filter_instr_cb);
 
       if (stage->stage == MESA_SHADER_VERTEX) {
          /* Always load all VS inputs at the top to eliminate needless VMEM->s_wait->VMEM sequences.
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index e13cd1df407..7c2b89ec6ea 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -425,6 +425,7 @@ if with_tests
     'tests/mod_analysis_tests.cpp',
     'tests/negative_equal_tests.cpp',
     'tests/opt_if_tests.cpp',
+    'tests/opt_licm_tests.cpp',
     'tests/opt_loop_tests.cpp',
     'tests/opt_peephole_select.cpp',
     'tests/opt_shrink_vectors_tests.cpp',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 7209d28e853..647452432ae 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4746,6 +4746,18 @@ nir_block *nir_cf_node_cf_tree_prev(nir_cf_node *node);
         block != nir_cf_node_cf_tree_prev(node);                            \
         block = prev, prev = nir_block_cf_tree_prev(block))
 
+static inline nir_block *
+nir_loop_predecessor_block(nir_loop *loop)
+{
+   return nir_cf_node_cf_tree_prev(&loop->cf_node);
+}
+
+static inline nir_block *
+nir_loop_successor_block(nir_loop *loop)
+{
+   return nir_cf_node_cf_tree_next(&loop->cf_node);
+}
+
 /* If the following CF node is an if, this function returns that if.
  * Otherwise, it returns NULL.
  */
@@ -6458,7 +6470,14 @@
 bool nir_opt_large_constants(nir_shader *shader, glsl_type_size_align_func size_align, unsigned threshold);
 
-bool nir_opt_licm(nir_shader *shader);
+typedef bool (*nir_opt_licm_filter_instr_cb)(nir_instr *instr,
+                                             bool instr_dominates_exit,
+                                             unsigned num_dst_bits,
+                                             unsigned num_all_src_bits,
+                                             nir_loop *loop);
+
+bool nir_opt_licm(nir_shader *shader,
+                  nir_opt_licm_filter_instr_cb filter_instr);
 
 bool nir_opt_loop(nir_shader *shader);
 
 bool nir_opt_loop_unroll(nir_shader *shader);
diff --git a/src/compiler/nir/nir_opt_licm.c b/src/compiler/nir/nir_opt_licm.c
index 8ede208b4cc..0b5c2d35ce9 100644
--- a/src/compiler/nir/nir_opt_licm.c
+++ b/src/compiler/nir/nir_opt_licm.c
@@ -5,30 +5,69 @@
 
 #include "nir.h"
 
+typedef struct {
+   nir_opt_licm_filter_instr_cb filter_instr;
+
+   nir_loop *loop;
+   bool current_block_dominates_exit;
+   unsigned num_all_src_bits;
+} licm_state;
+
 static bool
-defined_before_loop(nir_src *src, void *state)
+defined_before_loop(nir_src *src, void *_state)
 {
-   unsigned *loop_preheader_idx = state;
-   return nir_def_block(src->ssa)->index <= *loop_preheader_idx;
+   licm_state *state = (licm_state *)_state;
+
+   state->num_all_src_bits += src->ssa->bit_size * src->ssa->num_components;
+
+   /* The current instruction is loop-invariant only if all of its sources
+    * are defined before the loop.
+    */
+   return nir_def_block(src->ssa)->index <=
+          nir_loop_predecessor_block(state->loop)->index;
 }
 
 static bool
-is_instr_loop_invariant(nir_instr *instr, unsigned loop_preheader_idx)
+is_instr_loop_invariant(nir_instr *instr, licm_state *state)
 {
    switch (instr->type) {
    case nir_instr_type_load_const:
    case nir_instr_type_undef:
       return true;
 
-   case nir_instr_type_intrinsic:
-      if (!nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
-         return false;
-      FALLTHROUGH;
    case nir_instr_type_alu:
    case nir_instr_type_tex:
    case nir_instr_type_deref:
-      return nir_foreach_src(instr, defined_before_loop, &loop_preheader_idx);
+   case nir_instr_type_intrinsic: {
+      /* An instruction can be hoisted if it dominates the loop exit (i.e.
+       * it is always executed) and is reorderable, or if it is speculatable.
+       */
+      if (state->current_block_dominates_exit) {
+         if (instr->type == nir_instr_type_intrinsic &&
+             !nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
+            return false;
+      } else {
+         if (!nir_instr_can_speculate(instr))
+            return false;
+      }
+
+      state->num_all_src_bits = 0;
+
+      bool invariant = nir_foreach_src(instr, defined_before_loop, state);
+      if (!invariant)
+         return false;
+
+      if (state->filter_instr) {
+         nir_def *def = nir_instr_def(instr);
+
+         if (!state->filter_instr(instr, state->current_block_dominates_exit,
+                                  def->bit_size * def->num_components,
+                                  state->num_all_src_bits, state->loop))
+            return false;
+      }
+
+      return true;
+   }
 
    case nir_instr_type_phi:
    case nir_instr_type_call:
@@ -39,13 +78,17 @@ is_instr_loop_invariant(nir_instr *instr, unsigned loop_preheader_idx)
 }
 
 static bool
-visit_block(nir_block *block, nir_block *preheader)
+visit_block(nir_block *block, licm_state *state)
 {
+   state->current_block_dominates_exit =
+      nir_block_dominates(block, nir_loop_successor_block(state->loop));
+
    bool progress = false;
    nir_foreach_instr_safe(instr, block) {
-      if (is_instr_loop_invariant(instr, preheader->index)) {
+      if (is_instr_loop_invariant(instr, state)) {
          nir_instr_remove(instr);
-         nir_instr_insert_after_block(preheader, instr);
+         nir_instr_insert_after_block(nir_loop_predecessor_block(state->loop),
+                                      instr);
          progress = true;
       }
    }
@@ -80,40 +123,62 @@ should_optimize_loop(nir_loop *loop)
 }
 
 static bool
-visit_cf_list(struct exec_list *list, nir_block *preheader, nir_block *exit)
+visit_cf_list(struct exec_list *list, licm_state *state)
 {
    bool progress = false;
 
    foreach_list_typed(nir_cf_node, node, node, list) {
       switch (node->type) {
       case nir_cf_node_block: {
-         /* By only visiting blocks which dominate the loop exit, we
-          * ensure that we don't speculatively hoist any instructions
-          * which otherwise might not be executed.
-          *
-          * Note, that the proper check would be whether this block
-          * postdominates the loop preheader.
+         nir_cf_node *next = nir_cf_node_next(node);
+         bool optimize_loop = false;
+
+         /* If the next CF node is a loop that we optimize, visit that loop
+          * first, before this block (its predecessor), so that any
+          * instructions hoisted out of the (potentially nested) loop are
+          * then considered for hoisting out of the outer loop as well. The
+          * goal is to hoist instructions across all levels of nested loops.
          */
+         if (next && next->type == nir_cf_node_loop) {
+            nir_loop *inner_loop = nir_cf_node_as_loop(next);
+            optimize_loop = should_optimize_loop(inner_loop);
+
+            if (optimize_loop) {
+               nir_loop *outer_loop = state->loop;
+
+               state->loop = inner_loop;
+               progress |= visit_cf_list(&inner_loop->body, state);
+               progress |= visit_cf_list(&inner_loop->continue_list, state);
+               state->loop = outer_loop;
+            }
+         }
+
+         /* Visit the block. */
          nir_block *block = nir_cf_node_as_block(node);
-         if (exit && nir_block_dominates(block, exit))
-            progress |= visit_block(block, preheader);
+         if (state->loop)
+            progress |= visit_block(block, state);
+
+         if (next && next->type == nir_cf_node_loop && !optimize_loop) {
+            nir_loop *loop = nir_cf_node_as_loop(next);
+
+            /* We don't hoist anything out of this loop itself, but if it
+             * is nested inside another loop that we are optimizing, its
+             * instructions can still be hoisted out of that outer loop.
+             */
+            progress |= visit_cf_list(&loop->body, state);
+            progress |= visit_cf_list(&loop->continue_list, state);
+         }
          break;
       }
 
       case nir_cf_node_if: {
          nir_if *nif = nir_cf_node_as_if(node);
-         progress |= visit_cf_list(&nif->then_list, preheader, exit);
-         progress |= visit_cf_list(&nif->else_list, preheader, exit);
+         progress |= visit_cf_list(&nif->then_list, state);
+         progress |= visit_cf_list(&nif->else_list, state);
          break;
       }
 
-      case nir_cf_node_loop: {
-         nir_loop *loop = nir_cf_node_as_loop(node);
-         bool opt = should_optimize_loop(loop);
-         nir_block *inner_preheader = opt ? nir_cf_node_cf_tree_prev(node) : preheader;
-         nir_block *inner_exit = opt ? nir_cf_node_cf_tree_next(node) : exit;
-         progress |= visit_cf_list(&loop->body, inner_preheader, inner_exit);
-         progress |= visit_cf_list(&loop->continue_list, inner_preheader, inner_exit);
+      case nir_cf_node_loop:
+         /* All loops are handled while visiting their predecessor block. */
          break;
-      }
 
      case nir_cf_node_function:
         UNREACHABLE("NIR LICM: Unsupported cf_node type.");
@@ -123,17 +188,19 @@ visit_cf_list(struct exec_list *list, nir_block *preheader, nir_block *exit)
 }
 
 bool
-nir_opt_licm(nir_shader *shader)
+nir_opt_licm(nir_shader *shader, nir_opt_licm_filter_instr_cb filter_instr)
 {
+   licm_state state = {filter_instr};
    bool progress = false;
 
    nir_foreach_function_impl(impl, shader) {
      nir_metadata_require(impl, nir_metadata_block_index |
                                 nir_metadata_dominance);
 
-      bool impl_progress = visit_cf_list(&impl->body, NULL, NULL);
-      progress |= nir_progress(impl_progress, impl,
-                               nir_metadata_block_index | nir_metadata_dominance);
+      state.loop = NULL;
+
+      progress |= nir_progress(visit_cf_list(&impl->body, &state), impl,
+                               nir_metadata_control_flow);
    }
 
    return progress;
diff --git a/src/compiler/nir/tests/opt_licm_tests.cpp b/src/compiler/nir/tests/opt_licm_tests.cpp
new file mode 100644
index 00000000000..f8b70394f3a
--- /dev/null
+++ b/src/compiler/nir/tests/opt_licm_tests.cpp
@@ -0,0 +1,231 @@
+/* Copyright 2025 Advanced Micro Devices, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "nir_test.h"
+
+class nir_opt_licm_test : public nir_test {
+protected:
+   nir_opt_licm_test()
+      : nir_test::nir_test("nir_opt_licm_test", MESA_SHADER_COMPUTE)
+   {
+   }
+
+   nir_loop *loop;
+   nir_block *original_block;
+   nir_def *x, *y, *z, *r;
+   bool expect_progress;
+   bool insert_after_break;
+
+   void test_init();
+   void test_finish(nir_opt_licm_filter_instr_cb filter_instr);
+};
+
+void
+nir_opt_licm_test::test_init()
+{
+   x = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
+   y = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
+   z = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
+
+   loop = nir_push_loop(b);
+   if (insert_after_break)
+      nir_break_if(b, nir_undef(b, 1, 1));
+   original_block = nir_loop_last_block(loop);
+}
+
+static bool
+filter_using_dst_src_bits(nir_instr *instr, bool instr_dominates_exit,
+                          unsigned num_dst_bits, unsigned num_all_src_bits,
+                          nir_loop *loop)
+{
+   return num_dst_bits <= num_all_src_bits;
+}
+
+void
+nir_opt_licm_test::test_finish(nir_opt_licm_filter_instr_cb filter_instr)
+{
+   if (!insert_after_break)
+      nir_break_if(b, nir_undef(b, 1, 1));
+   nir_pop_loop(b, loop);
+   nir_validate_shader(b->shader, NULL);
+
+   bool progress = false;
+   NIR_PASS(progress, b->shader, nir_opt_licm, filter_instr);
+
+   if (expect_progress) {
+      ASSERT_TRUE(progress);
+      ASSERT_EQ(nir_def_instr(r)->block, nir_loop_predecessor_block(loop));
+   } else {
+      ASSERT_FALSE(progress);
+      ASSERT_EQ(nir_def_instr(r)->block, original_block);
+   }
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_unary)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_ineg(b, x);
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_binary)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_iadd(b, x, y);
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_alu_u2u64)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_u2u64(b, x);
+
+   /* If sizeof(dst) > sizeof(all srcs), the filter declines to hoist because
+    * hoisting would increase register usage across the whole loop.
+    */
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_load_ssbo_no_flags_before_break)
+{
+   this->insert_after_break = false;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_load_ssbo_reorderable_before_break)
+{
+   this->insert_after_break = false;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_REORDER));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_load_ssbo_reorderable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_REORDER));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_load_ssbo_speculatable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_SPECULATE));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_load_ssbo_reorderable_speculatable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_REORDER |
+                                                  ACCESS_CAN_SPECULATE));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_2_nested_loops)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_loop *nested_loop = nir_push_loop(b);
+   {
+      nir_break_if(b, nir_undef(b, 1, 1));
+      r = nir_ineg(b, x);
+   }
+   nir_pop_loop(b, nested_loop);
+
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_6_nested_loops)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_loop *nested_loops[5];
+
+   for (unsigned i = 0; i < ARRAY_SIZE(nested_loops); i++) {
+      nested_loops[i] = nir_push_loop(b);
+      nir_break_if(b, nir_undef(b, 1, 1));
+   }
+
+   r = nir_ineg(b, x);
+
+   for (int i = ARRAY_SIZE(nested_loops) - 1; i >= 0; i--)
+      nir_pop_loop(b, nested_loops[i]);
+
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_tex)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+
+   nir_tex_builder fields = {0};
+   fields.coord = x;
+   fields.texture_handle = y;
+   fields.dest_type = nir_type_uint32;
+
+   r = nir_build_tex_struct(b, nir_texop_tex, fields);
+   this->test_finish(NULL);
+}
+
+TEST_F(nir_opt_licm_test, hoist_tex_before_break)
+{
+   this->insert_after_break = false;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_tex_builder fields = {0};
+   fields.coord = x;
+   fields.texture_handle = y;
+   fields.dest_type = nir_type_uint32;
+
+   r = nir_build_tex_struct(b, nir_texop_tex, fields);
+   this->test_finish(NULL);
+}
+
+TEST_F(nir_opt_licm_test, hoist_tex_speculatable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_tex_builder fields = {0};
+   fields.coord = x;
+   fields.texture_handle = y;
+   fields.can_speculate = true;
+   fields.dest_type = nir_type_uint32;
+
+   r = nir_build_tex_struct(b, nir_texop_tex, fields);
+   this->test_finish(NULL);
+}
diff --git a/src/imagination/pco/pco_nir.c b/src/imagination/pco/pco_nir.c
index c674119fc94..4ef6fa9461d 100644
--- a/src/imagination/pco/pco_nir.c
+++ b/src/imagination/pco/pco_nir.c
@@ -786,6 +786,14 @@ static bool robustness_filter(const nir_intrinsic_instr *intr,
    return false;
 }
 
+static bool
+opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit,
+                         unsigned num_dst_bits, unsigned num_all_src_bits,
+                         nir_loop *loop)
+{
+   return instr_dominates_exit;
+}
+
 /**
  * \brief Lowers a NIR shader.
  *
@@ -806,7 +814,7 @@
 
    NIR_PASS(_, nir, nir_lower_memory_model);
 
-   NIR_PASS(_, nir, nir_opt_licm);
+   NIR_PASS(_, nir, nir_opt_licm, opt_licm_filter_instr_cb);
 
    NIR_PASS(_, nir, nir_lower_memcpy);
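
Usage sketch (not part of the patch): with the new signature, a driver may pass NULL to keep the pass's built-in legality checks with no extra filtering, since nir_opt_licm only invokes the callback when it is non-NULL. The callback name below is hypothetical; the size comparison mirrors filter_using_dst_src_bits from the tests above, and the parameter semantics follow nir_opt_licm.c (num_dst_bits is the destination's bit_size times num_components, num_all_src_bits is the same quantity summed over all sources).

static bool
example_licm_filter_cb(nir_instr *instr, bool instr_dominates_exit,
                       unsigned num_dst_bits, unsigned num_all_src_bits,
                       nir_loop *loop)
{
   /* Hoisting keeps the destination live across the whole loop, so only
    * hoist when the result is no larger than what its (loop-invariant)
    * sources already keep live.
    */
   return num_dst_bits <= num_all_src_bits;
}

   /* In the driver's lowering pipeline: */
   NIR_PASS(_, nir, nir_opt_licm, example_licm_filter_cb);
   /* Or, to accept every legally hoistable instruction: */
   NIR_PASS(_, nir, nir_opt_licm, NULL);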