Mirror of https://gitlab.freedesktop.org/mesa/mesa.git, synced 2025-12-20 05:10:11 +01:00
Merge branch 'nir-move-reorder-loads' into 'main'

Draft: nir: add new pass nir_opt_move_reorder_loads for ACO

See merge request mesa/mesa!36244

Commit 8ea3a1ed60
6 changed files with 335 additions and 0 deletions
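In short, the new pass reorders loads within a basic block so that they execute in the order their results are first used, and backends are expected to run it once per hardware wait counter, each time with the load kinds that counter tracks. Below is a condensed sketch of that usage pattern, distilled from the RADV hunk further down; the wrapper function name is made up here and the exact option masks are a per-backend choice.

/* Sketch only: one nir_opt_move_reorder_loads call per wait counter,
 * mirroring the GFX12 part of the RADV change below. */
static void
reorder_loads_for_gfx12(nir_shader *nir)
{
   /* loadcnt: divergent memory loads, ordered by first use. */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
            nir_move_only_divergent);

   /* samplecnt: texture sampling (unaffected by nir_move_only_divergent). */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_tex_sample | nir_move_tex_lod);
}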
@@ -606,6 +606,35 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
      NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons);
   }

   if (gfx_level >= GFX12) {
      /* loadcnt */
      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);

      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod);
   } else {
      /* vmcnt */
      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod |
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);
   }

   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
    * order can help the backend scheduler)
    */
   NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);

   NIR_PASS(_, stage->nir, nir_opt_group_loads, nir_group_same_resource_only, 16);

   stage->info.nir_shared_size = stage->nir->info.shared_size;
}
@@ -281,6 +281,7 @@ else
  'nir_opt_memcpy.c',
  'nir_opt_move.c',
  'nir_opt_move_discards_to_top.c',
  'nir_opt_move_reorder_loads.c',
  'nir_opt_move_to_top.c',
  'nir_opt_mqsad.c',
  'nir_opt_non_uniform_access.c',
@@ -423,6 +424,7 @@ if with_tests
    'tests/lower_discard_if_tests.cpp',
    'tests/minimize_call_live_states_test.cpp',
    'tests/mod_analysis_tests.cpp',
    'tests/move_reorder_loads_tests.cpp',
    'tests/negative_equal_tests.cpp',
    'tests/opt_if_tests.cpp',
    'tests/opt_loop_tests.cpp',
@@ -6501,6 +6501,9 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options);

bool nir_opt_move(nir_shader *shader, nir_move_options options);

unsigned nir_get_closest_use_instr_index(nir_instr *instr);
bool nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options);

typedef struct nir_opt_offsets_options {
   /** nir_load_uniform max base offset */
   uint32_t uniform_max;
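nir_get_closest_use_instr_index() is exported alongside the pass because the unit test below also uses it. To illustrate what it returns, here is a hedged sketch (not part of this MR; the name distance_to_first_use is hypothetical, and it assumes nir_metadata_instr_index has already been required on the impl and that the instruction has at least one use):

/* Hypothetical illustration: how many instruction indices separate a def
 * from its closest (first) use. */
static unsigned
distance_to_first_use(nir_instr *instr)
{
   return nir_get_closest_use_instr_index(instr) - instr->index;
}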
src/compiler/nir/nir_opt_move_reorder_loads.c (new file, 187 lines)
@@ -0,0 +1,187 @@
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 * SPDX-License-Identifier: MIT
 */

/* This pass moves (sinks / reorders) loads to make them execute in the order
 * their results are used. The loads are moved immediately after the load whose
 * use occurs sooner, which is the minimum distance necessary to move loads to
 * get them in the desired order. It doesn't move loads that don't need to be
 * moved, and it doesn't move loads between blocks, but it reorders loads within
 * a block even if their uses are outside the block.
 *
 * Such moves reduce live ranges for the load results, but increase live ranges
 * for the load srcs.
 *
 * Before:              After:
 *    %0 = load            %0 = load
 *    ...                  ...
 *    %1 = load ─┐
 *    ...        │         ...
 *    %2 = load ─│──┐
 *    ...        │  │      ...
 *    %3 = load  │  │      %3 = load
 *               │  └─>    %2 = load
 *               └────>    %1 = load
 *    ...                  ...
 *    use %0               use %0
 *    ...                  ...
 *    use %3               use %3
 *    ...                  ...
 *    use %2               use %2
 *    ...                  ...
 *    use %1               use %1
 *
 * This is useful for hw that uses a load counter to wait for the N-th previous
 * load before a use. Executing loads in the order the results are used allows
 * the hw to wait only for the oldest load at any given time.
 *
 * If the hw has multiple load counters for different kinds of loads, it's
 * recommended to call this pass separately for each such counter using
 * different options.
 */

#include "nir.h"
#include "util/u_dynarray.h"

unsigned
nir_get_closest_use_instr_index(nir_instr *instr)
{
   unsigned closest_use_instr_index = UINT_MAX;

   nir_foreach_use_including_if(src, nir_instr_def(instr)) {
      unsigned this_use =
         nir_src_is_if(src) ?
            nir_if_first_then_block(nir_src_parent_if(src))->start_ip :
            nir_src_parent_instr(src)->index;

      closest_use_instr_index = MIN2(closest_use_instr_index, this_use);
   }

   /* This will fail only if instr has no use. */
   assert(closest_use_instr_index != UINT_MAX &&
          "dead code shouldn't be present");
   return closest_use_instr_index;
}

typedef struct load_info {
   nir_instr *instr;
   unsigned closest_use_instr_index;
} load_info;

static int
compare_closest_use(const void *a, const void *b)
{
   return ((load_info*)a)->closest_use_instr_index -
          ((load_info*)b)->closest_use_instr_index;
}

static bool
process_block(nir_block *block, nir_move_options options,
              struct util_dynarray *scratch)
{
   util_dynarray_clear(scratch);
   bool sorted = true;

   /* Gather all loads that we want to reorder. */
   nir_foreach_instr(instr, block) {
      if (nir_can_move_instr(instr, options)) {
         unsigned closest_use_instr_index = nir_get_closest_use_instr_index(instr);

         load_info info = {
            .instr = instr,
            .closest_use_instr_index = closest_use_instr_index,
         };

         /* If the previous load has its closest use after the closest use of
          * the current load, they must be reordered.
          */
         if (util_dynarray_num_elements(scratch, load_info) &&
             util_dynarray_last_ptr(scratch, load_info)->closest_use_instr_index >
             closest_use_instr_index)
            sorted = false;

         util_dynarray_append(scratch, load_info, info);
      }
   }

   /* Exit if the loads are already sorted. */
   if (sorted)
      return false;

   bool progress = false;
   unsigned num_loads = util_dynarray_num_elements(scratch, load_info);
   load_info *loads = util_dynarray_element(scratch, load_info, 0);

   /* Sort loads by the position of their use. This only sorts the gathered
    * loads in the array, which is necessary to determine their order.
    */
   qsort(loads, num_loads, sizeof(load_info), compare_closest_use);

   /* Sink loads that should be later. */
   for (unsigned i = 1; i < num_loads; i++) {
      load_info *prev = &loads[i - 1];
      load_info *cur = &loads[i];

      /* Check whether qsort did its job. */
      assert(prev->closest_use_instr_index <= cur->closest_use_instr_index);

      /* If prev should be before cur in the shader, but prev is after cur
       * in the shader, sink cur after prev.
       */
      if (prev->closest_use_instr_index < cur->closest_use_instr_index &&
          prev->instr->index > cur->instr->index) {
         nir_instr_move(nir_after_instr(prev->instr), cur->instr);
         /* Set the position of cur to where we moved it. */
         cur->instr->index = prev->instr->index;
         progress = true;
      }
   }

   return progress;
}

bool
nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options)
{
   /* Reject unexpected flags. */
   assert(!(options & ~(nir_move_tex_sample |
                        nir_move_tex_load |
                        nir_move_tex_load_fragment_mask |
                        nir_move_tex_lod |
                        nir_move_tex_query |
                        nir_move_load_image |
                        nir_move_load_image_fragment_mask |
                        nir_move_query_image |
                        nir_move_load_input |
                        nir_move_load_global |
                        nir_move_load_ubo |
                        nir_move_load_ssbo |
                        nir_move_load_buffer_amd |
                        nir_move_only_convergent |
                        nir_move_only_divergent)));
   bool any_progress = false;

   struct util_dynarray scratch;
   util_dynarray_init(&scratch, NULL);

   nir_foreach_function_impl(impl, nir) {
      bool progress = false;

      nir_metadata_require(impl, nir_metadata_instr_index |
                                 (options & (nir_move_only_convergent |
                                             nir_move_only_divergent) ?
                                     nir_metadata_divergence : 0));

      nir_foreach_block(block, impl) {
         progress |= process_block(block, options, &scratch);
      }

      any_progress |= nir_progress(progress, impl,
                                   nir_metadata_control_flow |
                                   nir_metadata_divergence);
   }

   util_dynarray_fini(&scratch);
   return any_progress;
}
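To make the sort-and-sink rule in process_block() concrete outside of NIR, here is a small self-contained toy model (illustrative only: the struct, positions, and first-use indices below are made up and are not part of this MR). It reproduces the Before/After diagram from the header comment: three loads emitted in one order but first used in the opposite order.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for load_info. */
typedef struct {
   const char *name;
   unsigned block_pos;   /* current position of the load in the block */
   unsigned closest_use; /* instruction index of its closest use */
} toy_load;

/* Same ordering criterion as compare_closest_use(), written without the
 * unsigned-subtraction shortcut. */
static int
cmp_closest_use(const void *a, const void *b)
{
   const toy_load *la = (const toy_load *)a, *lb = (const toy_load *)b;
   return (la->closest_use > lb->closest_use) - (la->closest_use < lb->closest_use);
}

int
main(void)
{
   /* %1, %2, %3 are loaded in this order but first used in reverse order. */
   toy_load loads[] = {
      { "%1", 2, 90 },
      { "%2", 4, 70 },
      { "%3", 6, 50 },
   };
   const unsigned num = sizeof(loads) / sizeof(loads[0]);

   /* Sort by closest use, then sink each load right after its predecessor
    * in the sorted order if it currently sits before that predecessor. */
   qsort(loads, num, sizeof(loads[0]), cmp_closest_use);

   for (unsigned i = 1; i < num; i++) {
      toy_load *prev = &loads[i - 1], *cur = &loads[i];

      if (prev->closest_use < cur->closest_use && prev->block_pos > cur->block_pos) {
         printf("sink %s right after %s\n", cur->name, prev->name);
         cur->block_pos = prev->block_pos;
      }
   }
   return 0;
}

Running this prints "sink %2 right after %3" followed by "sink %1 right after %2", which is exactly the "After" column of the diagram above.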
src/compiler/nir/tests/move_reorder_loads_tests.cpp (new file, 87 lines)
@@ -0,0 +1,87 @@
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 * SPDX-License-Identifier: MIT
 */

#include <gtest/gtest.h>

#include "nir.h"
#include "nir_builder.h"

namespace {

class nir_move_reorder_test : public ::testing::Test {
protected:
   nir_move_reorder_test()
   {
      glsl_type_singleton_init_or_ref();
   }

   ~nir_move_reorder_test()
   {
      if (HasFailure()) {
         printf("\nShader from the failed test:\n\n");
         nir_print_shader(nir, stdout);
      }
   }

   nir_shader *nir;
};

TEST_F(nir_move_reorder_test, ssbo)
{
   nir_shader_compiler_options options = {0};
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, &options, "n");
   nir_def *undef = nir_undef(&b, 1, 32);
   nir_def *loads[1000];
   const unsigned num_loads = ARRAY_SIZE(loads);

   this->nir = b.shader;

   /* Insert loads. */
   for (unsigned i = 0; i < num_loads; i++) {
      loads[i] = nir_load_ssbo(&b, 1, 32, undef, undef);
      nir_intrinsic_set_access(nir_instr_as_intrinsic(loads[i]->parent_instr),
                               ACCESS_CAN_REORDER);
   }

   srand(0x54987321);

   /* Permute the loads in the array using Fisher–Yates shuffle. */
   for (unsigned i = 0; i < num_loads - 2; i++) {
      unsigned j = i + rand() % (num_loads - i);
      assert(j < num_loads);

      nir_def *tmp = loads[i];
      loads[i] = loads[j];
      loads[j] = tmp;
   }

   /* Generate uses that use the loads in the permuted order. */
   for (unsigned i = 0; i < num_loads; i++)
      nir_ineg(&b, loads[i]);

   NIR_PASS(_, b.shader, nir_opt_move_reorder_loads, nir_move_load_ssbo);

   nir_metadata_require(b.impl, nir_metadata_instr_index);

   /* Verify that the loads are sorted in the block by the position of their
    * closest use.
    */
   unsigned prev_load_closest_use = 0;

   nir_foreach_instr(instr, nir_start_block(b.impl)) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      unsigned closest_use = nir_get_closest_use_instr_index(instr);

      if (prev_load_closest_use) {
         ASSERT_LT(prev_load_closest_use, closest_use);
      }

      prev_load_closest_use = closest_use;
   }
}

}
@@ -1042,6 +1042,33 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *
   /* This must be done after si_nir_late_opts() because it may generate vec const. */
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);

   if (sel->screen->info.gfx_level >= GFX12) {
      /* loadcnt */
      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);

      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod);
   } else {
      /* vmcnt */
      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod |
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);
   }

   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
    * order can help the backend scheduler)
    */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);

   /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
    * 200 is tuned for Viewperf. It should be done last.
    */