diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 3f39e28e01e..87dd1f79c12 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -606,6 +606,35 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
       NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons);
    }
 
+   if (gfx_level >= GFX12) {
+      /* loadcnt */
+      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
+               nir_move_tex_load | nir_move_tex_load_fragment_mask |
+               nir_move_load_image | nir_move_load_image_fragment_mask |
+               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
+               nir_move_load_buffer_amd | nir_move_only_divergent);
+
+      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
+      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
+               nir_move_tex_sample | nir_move_tex_lod);
+   } else {
+      /* vmcnt */
+      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
+               nir_move_tex_sample | nir_move_tex_lod |
+               nir_move_tex_load | nir_move_tex_load_fragment_mask |
+               nir_move_load_image | nir_move_load_image_fragment_mask |
+               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
+               nir_move_load_buffer_amd | nir_move_only_divergent);
+   }
+
+   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
+    * order can help the backend scheduler)
+    */
+   NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
+            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);
+
+   NIR_PASS(_, stage->nir, nir_opt_group_loads, nir_group_same_resource_only, 16);
+
    stage->info.nir_shared_size = stage->nir->info.shared_size;
 }
 
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index e13cd1df407..33b9746d909 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -281,6 +281,7 @@ else
     'nir_opt_memcpy.c',
     'nir_opt_move.c',
     'nir_opt_move_discards_to_top.c',
+    'nir_opt_move_reorder_loads.c',
     'nir_opt_move_to_top.c',
     'nir_opt_mqsad.c',
     'nir_opt_non_uniform_access.c',
@@ -423,6 +424,7 @@ if with_tests
     'tests/lower_discard_if_tests.cpp',
     'tests/minimize_call_live_states_test.cpp',
     'tests/mod_analysis_tests.cpp',
+    'tests/move_reorder_loads_tests.cpp',
     'tests/negative_equal_tests.cpp',
     'tests/opt_if_tests.cpp',
     'tests/opt_loop_tests.cpp',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 7209d28e853..c4ea3adbbfa 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -6501,6 +6501,9 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options);
 
 bool nir_opt_move(nir_shader *shader, nir_move_options options);
 
+unsigned nir_get_closest_use_instr_index(nir_instr *instr);
+bool nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options);
+
 typedef struct nir_opt_offsets_options {
    /** nir_load_uniform max base offset */
    uint32_t uniform_max;
diff --git a/src/compiler/nir/nir_opt_move_reorder_loads.c b/src/compiler/nir/nir_opt_move_reorder_loads.c
new file mode 100644
index 00000000000..5c8efc82c2e
--- /dev/null
+++ b/src/compiler/nir/nir_opt_move_reorder_loads.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* This pass moves (sinks / reorders) loads to make them execute in the order
+ * their results are used. The loads are moved immediately after the load whose
+ * use occurs sooner, which is the minimum distance necessary to move loads to
+ * get them in the desired order. It doesn't move loads that don't need to be
+ * moved, and it doesn't move loads between blocks, but it reorders loads within
+ * a block even if their uses are outside the block.
+ *
+ * Such moves reduce live ranges for the load results, but increase live ranges
+ * for the load srcs.
+ *
+ * Before:                  After:
+ *    %0 = load                %0 = load
+ *    ...                      ...
+ *    %1 = load ─┐
+ *    ...        │             ...
+ *    %2 = load ─│──┐
+ *    ...        │  │          ...
+ *    %3 = load  │  │          %3 = load
+ *               │  └─>        %2 = load
+ *               └────>        %1 = load
+ *    ...                      ...
+ *    use %0                   use %0
+ *    ...                      ...
+ *    use %3                   use %3
+ *    ...                      ...
+ *    use %2                   use %2
+ *    ...                      ...
+ *    use %1                   use %1
+ *
+ * This is useful for hw that uses a load counter to wait for the N-th previous
+ * load before a use. Executing loads in the order the results are used allows
+ * the hw to wait only for the oldest load at any given time.
+ *
+ * If the hw has multiple load counters for different kinds of loads, it's
+ * recommended to call this pass separately for each such counter using
+ * different options.
+ */
+
+#include "nir.h"
+#include "util/u_dynarray.h"
+
+unsigned
+nir_get_closest_use_instr_index(nir_instr *instr)
+{
+   unsigned closest_use_instr_index = UINT_MAX;
+
+   nir_foreach_use_including_if(src, nir_instr_def(instr)) {
+      unsigned this_use =
+         nir_src_is_if(src) ?
+            nir_if_first_then_block(nir_src_parent_if(src))->start_ip :
+            nir_src_parent_instr(src)->index;
+
+      closest_use_instr_index = MIN2(closest_use_instr_index, this_use);
+   }
+
+   /* This will fail only if instr has no use. */
+   assert(closest_use_instr_index != UINT_MAX &&
+          "dead code shouldn't be present");
+   return closest_use_instr_index;
+}
+
+typedef struct load_info {
+   nir_instr *instr;
+   unsigned closest_use_instr_index;
+} load_info;
+
+static int
+compare_closest_use(const void *a, const void *b)
+{
+   return ((load_info*)a)->closest_use_instr_index -
+          ((load_info*)b)->closest_use_instr_index;
+}
+
+static bool
+process_block(nir_block *block, nir_move_options options,
+              struct util_dynarray *scratch)
+{
+   util_dynarray_clear(scratch);
+   bool sorted = true;
+
+   /* Gather all loads that we want to reorder. */
+   nir_foreach_instr(instr, block) {
+      if (nir_can_move_instr(instr, options)) {
+         unsigned closest_use_instr_index = nir_get_closest_use_instr_index(instr);
+
+         load_info info = {
+            .instr = instr,
+            .closest_use_instr_index = closest_use_instr_index,
+         };
+
+         /* If the previous load has its closest use after the closest use of
+          * the current load, they must be reordered.
+          */
+         if (util_dynarray_num_elements(scratch, load_info) &&
+             util_dynarray_last_ptr(scratch, load_info)->closest_use_instr_index >
+             closest_use_instr_index)
+            sorted = false;
+
+         util_dynarray_append(scratch, load_info, info);
+      }
+   }
+
+   /* Exit if the loads are already sorted. */
+   if (sorted)
+      return false;
+
+   bool progress = false;
+   unsigned num_loads = util_dynarray_num_elements(scratch, load_info);
+   load_info *loads = util_dynarray_element(scratch, load_info, 0);
+
+   /* Sort loads by the position of their use. This only sorts the gathered
+    * loads in the array, which is necessary to determine their order.
+    */
+   qsort(loads, num_loads, sizeof(load_info), compare_closest_use);
+
+   /* Sink loads that should be later. */
+   for (unsigned i = 1; i < num_loads; i++) {
+      load_info *prev = &loads[i - 1];
+      load_info *cur = &loads[i];
+
+      /* Check whether qsort did its job. */
+      assert(prev->closest_use_instr_index <= cur->closest_use_instr_index);
+
+      /* If prev should be before cur in the shader, but prev is after cur
+       * in the shader, sink cur after prev.
+       */
+      if (prev->closest_use_instr_index < cur->closest_use_instr_index &&
+          prev->instr->index > cur->instr->index) {
+         nir_instr_move(nir_after_instr(prev->instr), cur->instr);
+         /* Set the position of cur to where we moved it. */
+         cur->instr->index = prev->instr->index;
+         progress = true;
+      }
+   }
+
+   return progress;
+}
+
+bool
+nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options)
+{
+   /* Reject unexpected flags. */
+   assert(!(options & ~(nir_move_tex_sample |
+                        nir_move_tex_load |
+                        nir_move_tex_load_fragment_mask |
+                        nir_move_tex_lod |
+                        nir_move_tex_query |
+                        nir_move_load_image |
+                        nir_move_load_image_fragment_mask |
+                        nir_move_query_image |
+                        nir_move_load_input |
+                        nir_move_load_global |
+                        nir_move_load_ubo |
+                        nir_move_load_ssbo |
+                        nir_move_load_buffer_amd |
+                        nir_move_only_convergent |
+                        nir_move_only_divergent)));
+   bool any_progress = false;
+
+   struct util_dynarray scratch;
+   util_dynarray_init(&scratch, NULL);
+
+   nir_foreach_function_impl(impl, nir) {
+      bool progress = false;
+
+      nir_metadata_require(impl, nir_metadata_instr_index |
+                           (options & (nir_move_only_convergent |
+                                       nir_move_only_divergent) ?
+                            nir_metadata_divergence : 0));
+
+      nir_foreach_block(block, impl) {
+         progress |= process_block(block, options, &scratch);
+      }
+
+      any_progress |= nir_progress(progress, impl,
+                                   nir_metadata_control_flow |
+                                   nir_metadata_divergence);
+   }
+
+   util_dynarray_fini(&scratch);
+   return any_progress;
+}
diff --git a/src/compiler/nir/tests/move_reorder_loads_tests.cpp b/src/compiler/nir/tests/move_reorder_loads_tests.cpp
new file mode 100644
index 00000000000..6679ffd4a71
--- /dev/null
+++ b/src/compiler/nir/tests/move_reorder_loads_tests.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <gtest/gtest.h>
+
+#include "nir.h"
+#include "nir_builder.h"
+
+namespace {
+
+class nir_move_reorder_test : public ::testing::Test {
+protected:
+   nir_move_reorder_test()
+   {
+      glsl_type_singleton_init_or_ref();
+   }
+
+   ~nir_move_reorder_test()
+   {
+      if (HasFailure()) {
+         printf("\nShader from the failed test:\n\n");
+         nir_print_shader(nir, stdout);
+      }
+   }
+
+   nir_shader *nir;
+};
+
+TEST_F(nir_move_reorder_test, ssbo)
+{
+   nir_shader_compiler_options options = {0};
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, &options, "n");
+   nir_def *undef = nir_undef(&b, 1, 32);
+   nir_def *loads[1000];
+   const unsigned num_loads = ARRAY_SIZE(loads);
+
+   this->nir = b.shader;
+
+   /* Insert loads. */
+   for (unsigned i = 0; i < num_loads; i++) {
+      loads[i] = nir_load_ssbo(&b, 1, 32, undef, undef);
+      nir_intrinsic_set_access(nir_instr_as_intrinsic(loads[i]->parent_instr),
+                               ACCESS_CAN_REORDER);
+   }
+
+   srand(0x54987321);
+
+   /* Permute the loads in the array using Fisher–Yates shuffle. */
+   for (unsigned i = 0; i < num_loads - 2; i++) {
+      unsigned j = i + rand() % (num_loads - i);
+      assert(j < num_loads);
+
+      nir_def *tmp = loads[i];
+      loads[i] = loads[j];
+      loads[j] = tmp;
+   }
+
+   /* Generate uses that use the loads in the permuted order. */
+   for (unsigned i = 0; i < num_loads; i++)
+      nir_ineg(&b, loads[i]);
+
+   NIR_PASS(_, b.shader, nir_opt_move_reorder_loads, nir_move_load_ssbo);
+
+   nir_metadata_require(b.impl, nir_metadata_instr_index);
+
+   /* Verify that the loads are sorted in the block by the position of their
+    * closest use.
+    */
+   unsigned prev_load_closest_use = 0;
+
+   nir_foreach_instr(instr, nir_start_block(b.impl)) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      unsigned closest_use = nir_get_closest_use_instr_index(instr);
+
+      if (prev_load_closest_use) {
+         ASSERT_LT(prev_load_closest_use, closest_use);
+      }
+
+      prev_load_closest_use = closest_use;
+   }
+}
+
+}
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 728984304c5..747cf95abfa 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1042,6 +1042,33 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *
    /* This must be done after si_nir_late_opts() because it may generate vec const. */
    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
 
+   if (sel->screen->info.gfx_level >= GFX12) {
+      /* loadcnt */
+      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
+               nir_move_tex_load | nir_move_tex_load_fragment_mask |
+               nir_move_load_image | nir_move_load_image_fragment_mask |
+               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
+               nir_move_load_buffer_amd | nir_move_only_divergent);
+
+      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
+      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
+               nir_move_tex_sample | nir_move_tex_lod);
+   } else {
+      /* vmcnt */
+      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
+               nir_move_tex_sample | nir_move_tex_lod |
+               nir_move_tex_load | nir_move_tex_load_fragment_mask |
+               nir_move_load_image | nir_move_load_image_fragment_mask |
+               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
+               nir_move_load_buffer_amd | nir_move_only_divergent);
+   }
+
+   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
+    * order can help the backend scheduler)
+    */
+   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
+            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);
+
    /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
     * 200 is tuned for Viewperf. It should be done last.
     */
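
Editor's note, not part of the patch above: the following is a minimal, hypothetical sketch of how another NIR backend with a single VMEM-style load counter might wire up the new pass, mirroring the pre-GFX12 "vmcnt" path in the two hunks above. The helper name, the `nir` parameter, and the exact flag set are illustrative assumptions, not Mesa code.

#include "nir.h"

/* Hypothetical backend hook; mirrors the pre-GFX12 "vmcnt" calls above.
 * Assumes `nir` is the shader being compiled, late in the backend's
 * optimization pipeline. */
static void
reorder_loads_for_single_counter(nir_shader *nir)
{
   /* One call covers every divergent load kind tracked by the single
    * load counter, so the hardware only ever waits for the oldest load. */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_tex_sample | nir_move_tex_lod |
            nir_move_tex_load | nir_move_load_image |
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
            nir_move_only_divergent);

   /* Convergent (SMEM-style) loads are ordered in a separate call so the
    * backend scheduler also sees them in use order. */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
            nir_move_only_convergent);
}

As in the patch, hardware with separate sample and load counters would instead split the first call per counter, the way the GFX12 branch does for loadcnt and samplecnt.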