Merge branch 'nir-move-reorder-loads' into 'main'

Draft: nir: add new pass nir_opt_move_reorder_loads for ACO

See merge request mesa/mesa!36244
Marek Olšák 2025-12-20 00:49:04 +00:00
commit 8ea3a1ed60
6 changed files with 335 additions and 0 deletions


@@ -606,6 +606,35 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons);
}
if (gfx_level >= GFX12) {
/* loadcnt */
NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
nir_move_tex_load | nir_move_tex_load_fragment_mask |
nir_move_load_image | nir_move_load_image_fragment_mask |
nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
nir_move_load_buffer_amd | nir_move_only_divergent);
/* samplecnt (these flags are unaffected by nir_move_only_divergent) */
NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
nir_move_tex_sample | nir_move_tex_lod);
} else {
/* vmcnt */
NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
nir_move_tex_sample | nir_move_tex_lod |
nir_move_tex_load | nir_move_tex_load_fragment_mask |
nir_move_load_image | nir_move_load_image_fragment_mask |
nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
nir_move_load_buffer_amd | nir_move_only_divergent);
}
/* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
* order can help the backend scheduler)
*/
NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);
NIR_PASS(_, stage->nir, nir_opt_group_loads, nir_group_same_resource_only, 16);
stage->info.nir_shared_size = stage->nir->info.shared_size;
}


@@ -281,6 +281,7 @@ else
'nir_opt_memcpy.c',
'nir_opt_move.c',
'nir_opt_move_discards_to_top.c',
'nir_opt_move_reorder_loads.c',
'nir_opt_move_to_top.c',
'nir_opt_mqsad.c',
'nir_opt_non_uniform_access.c',
@@ -423,6 +424,7 @@ if with_tests
'tests/lower_discard_if_tests.cpp',
'tests/minimize_call_live_states_test.cpp',
'tests/mod_analysis_tests.cpp',
'tests/move_reorder_loads_tests.cpp',
'tests/negative_equal_tests.cpp',
'tests/opt_if_tests.cpp',
'tests/opt_loop_tests.cpp',


@@ -6501,6 +6501,9 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options);
bool nir_opt_move(nir_shader *shader, nir_move_options options);
unsigned nir_get_closest_use_instr_index(nir_instr *instr);
bool nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options);
typedef struct nir_opt_offsets_options {
/** nir_load_uniform max base offset */
uint32_t uniform_max;


@@ -0,0 +1,187 @@
/*
* Copyright 2025 Advanced Micro Devices, Inc.
* SPDX-License-Identifier: MIT
*/
/* This pass moves (sinks / reorders) loads to make them execute in the order
 * their results are used. Each out-of-order load is moved to immediately after
 * the load that should precede it (the one whose result is used earlier),
 * which is the minimum distance necessary to put the loads in the desired
 * order. Loads that are already in the right place are not moved, and loads
 * are never moved between blocks, but loads within a block are reordered even
 * if their uses are outside the block.
*
* Such moves reduce live ranges for the load results, but increase live ranges
* for the load srcs.
*
*    Before:          After:
*    %0 = load        %0 = load
*    ...              ...
*    %1 = load
*    ...              ...
*    %2 = load
*    ...              ...
*    %3 = load        %3 = load
*                   > %2 = load
*                   > %1 = load
*    ...              ...
*    use %0           use %0
*    ...              ...
*    use %3           use %3
*    ...              ...
*    use %2           use %2
*    ...              ...
*    use %1           use %1
*
* This is useful for hw that uses a load counter to wait for the N-th previous
* load before a use. Executing loads in the order the results are used allows
* the hw to wait only for the oldest load at any given time.
*
* If the hw has multiple load counters for different kinds of loads, it's
* recommended to call this pass separately for each such counter using
* different options.
*/
#include "nir.h"
#include "util/u_dynarray.h"
unsigned
nir_get_closest_use_instr_index(nir_instr *instr)
{
unsigned closest_use_instr_index = UINT_MAX;
nir_foreach_use_including_if(src, nir_instr_def(instr)) {
unsigned this_use =
nir_src_is_if(src) ?
nir_if_first_then_block(nir_src_parent_if(src))->start_ip :
nir_src_parent_instr(src)->index;
closest_use_instr_index = MIN2(closest_use_instr_index, this_use);
}
/* This will fail only if instr has no use. */
assert(closest_use_instr_index != UINT_MAX &&
"dead code shouldn't be present");
return closest_use_instr_index;
}
typedef struct load_info {
nir_instr *instr;
unsigned closest_use_instr_index;
} load_info;
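
/* qsort comparator: sort loads in ascending order of their closest use. */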
static int
compare_closest_use(const void *a, const void *b)
{
return ((load_info*)a)->closest_use_instr_index -
((load_info*)b)->closest_use_instr_index;
}
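
/* Reorder candidate loads within a single block so that they appear in the
 * same order as their closest uses. Returns true if any load was moved.
 */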
static bool
process_block(nir_block *block, nir_move_options options,
struct util_dynarray *scratch)
{
util_dynarray_clear(scratch);
bool sorted = true;
/* Gather all loads that we want to reorder. */
nir_foreach_instr(instr, block) {
if (nir_can_move_instr(instr, options)) {
unsigned closest_use_instr_index = nir_get_closest_use_instr_index(instr);
load_info info = {
.instr = instr,
.closest_use_instr_index = closest_use_instr_index,
};
/* If the previous load has its closest use after the closest use of
* the current load, they must be reordered.
*/
if (util_dynarray_num_elements(scratch, load_info) &&
util_dynarray_last_ptr(scratch, load_info)->closest_use_instr_index >
closest_use_instr_index)
sorted = false;
util_dynarray_append(scratch, load_info, info);
}
}
/* Exit if the loads are already sorted. */
if (sorted)
return false;
bool progress = false;
unsigned num_loads = util_dynarray_num_elements(scratch, load_info);
load_info *loads = util_dynarray_element(scratch, load_info, 0);
/* Sort loads by the position of their use. This only sorts the gathered
* loads in the array, which is necessary to determine their order.
*/
qsort(loads, num_loads, sizeof(load_info), compare_closest_use);
/* Sink loads that should be later. */
for (unsigned i = 1; i < num_loads; i++) {
load_info *prev = &loads[i - 1];
load_info *cur = &loads[i];
/* Check whether qsort did its job. */
assert(prev->closest_use_instr_index <= cur->closest_use_instr_index);
/* If prev should be before cur in the shader, but prev is after cur
* in the shader, sink cur after prev.
*/
if (prev->closest_use_instr_index < cur->closest_use_instr_index &&
prev->instr->index > cur->instr->index) {
nir_instr_move(nir_after_instr(prev->instr), cur->instr);
/* Set the position of cur to where we moved it. */
cur->instr->index = prev->instr->index;
progress = true;
}
}
return progress;
}
bool
nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options)
{
/* Reject unexpected flags. */
assert(!(options & ~(nir_move_tex_sample |
nir_move_tex_load |
nir_move_tex_load_fragment_mask |
nir_move_tex_lod |
nir_move_tex_query |
nir_move_load_image |
nir_move_load_image_fragment_mask |
nir_move_query_image |
nir_move_load_input |
nir_move_load_global |
nir_move_load_ubo |
nir_move_load_ssbo |
nir_move_load_buffer_amd |
nir_move_only_convergent |
nir_move_only_divergent)));
bool any_progress = false;
struct util_dynarray scratch;
util_dynarray_init(&scratch, NULL);
nir_foreach_function_impl(impl, nir) {
bool progress = false;
nir_metadata_require(impl, nir_metadata_instr_index |
(options & (nir_move_only_convergent |
nir_move_only_divergent) ?
nir_metadata_divergence : 0));
nir_foreach_block(block, impl) {
progress |= process_block(block, options, &scratch);
}
any_progress |= nir_progress(progress, impl,
nir_metadata_control_flow |
nir_metadata_divergence);
}
util_dynarray_fini(&scratch);
return any_progress;
}


@@ -0,0 +1,87 @@
/*
* Copyright 2025 Advanced Micro Devices, Inc.
* SPDX-License-Identifier: MIT
*/
#include <gtest/gtest.h>
#include "nir.h"
#include "nir_builder.h"
namespace {
class nir_move_reorder_test : public ::testing::Test {
protected:
nir_move_reorder_test()
{
glsl_type_singleton_init_or_ref();
}
~nir_move_reorder_test()
{
if (HasFailure()) {
printf("\nShader from the failed test:\n\n");
nir_print_shader(nir, stdout);
}
}
nir_shader *nir;
};
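
/* Create many reorderable SSBO loads, use their results in a random order,
 * run the pass, and verify that the loads end up sorted in the block by the
 * position of their closest use.
 */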
TEST_F(nir_move_reorder_test, ssbo)
{
nir_shader_compiler_options options = {0};
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, &options, "n");
nir_def *undef = nir_undef(&b, 1, 32);
nir_def *loads[1000];
const unsigned num_loads = ARRAY_SIZE(loads);
this->nir = b.shader;
/* Insert loads. */
for (unsigned i = 0; i < num_loads; i++) {
loads[i] = nir_load_ssbo(&b, 1, 32, undef, undef);
nir_intrinsic_set_access(nir_instr_as_intrinsic(loads[i]->parent_instr),
ACCESS_CAN_REORDER);
}
srand(0x54987321);
/* Permute the loads in the array using a Fisher-Yates shuffle. */
for (unsigned i = 0; i < num_loads - 2; i++) {
unsigned j = i + rand() % (num_loads - i);
assert(j < num_loads);
nir_def *tmp = loads[i];
loads[i] = loads[j];
loads[j] = tmp;
}
/* Generate uses that use the loads in the permuted order. */
for (unsigned i = 0; i < num_loads; i++)
nir_ineg(&b, loads[i]);
NIR_PASS(_, b.shader, nir_opt_move_reorder_loads, nir_move_load_ssbo);
nir_metadata_require(b.impl, nir_metadata_instr_index);
/* Verify that the loads are sorted in the block by the position of their
* closest use.
*/
unsigned prev_load_closest_use = 0;
nir_foreach_instr(instr, nir_start_block(b.impl)) {
if (instr->type != nir_instr_type_intrinsic)
continue;
unsigned closest_use = nir_get_closest_use_instr_index(instr);
if (prev_load_closest_use) {
ASSERT_LT(prev_load_closest_use, closest_use);
}
prev_load_closest_use = closest_use;
}
}
}


@@ -1042,6 +1042,33 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *
/* This must be done after si_nir_late_opts() because it may generate vec const. */
NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
if (sel->screen->info.gfx_level >= GFX12) {
/* loadcnt */
NIR_PASS(_, nir, nir_opt_move_reorder_loads,
nir_move_tex_load | nir_move_tex_load_fragment_mask |
nir_move_load_image | nir_move_load_image_fragment_mask |
nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
nir_move_load_buffer_amd | nir_move_only_divergent);
/* samplecnt (these flags are unaffected by nir_move_only_divergent) */
NIR_PASS(_, nir, nir_opt_move_reorder_loads,
nir_move_tex_sample | nir_move_tex_lod);
} else {
/* vmcnt */
NIR_PASS(_, nir, nir_opt_move_reorder_loads,
nir_move_tex_sample | nir_move_tex_lod |
nir_move_tex_load | nir_move_tex_load_fragment_mask |
nir_move_load_image | nir_move_load_image_fragment_mask |
nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
nir_move_load_buffer_amd | nir_move_only_divergent);
}
/* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
* order can help the backend scheduler)
*/
NIR_PASS(_, nir, nir_opt_move_reorder_loads,
nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);
/* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
* 200 is tuned for Viewperf. It should be done last.
*/