Merge branch 'nir-move-reorder-loads' into 'main'

Draft: nir: add new pass nir_opt_move_reorder_loads for ACO

See merge request mesa/mesa!36244
Commit 8ea3a1ed60, 6 changed files with 335 additions and 0 deletions
@@ -606,6 +606,35 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
      NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons);
   }

   if (gfx_level >= GFX12) {
      /* loadcnt */
      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);

      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod);
   } else {
      /* vmcnt */
      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod |
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);
   }

   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
    * order can help the backend scheduler)
    */
   NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);

   NIR_PASS(_, stage->nir, nir_opt_group_loads, nir_group_same_resource_only, 16);

   stage->info.nir_shared_size = stage->nir->info.shared_size;
}
@@ -281,6 +281,7 @@ else
   'nir_opt_memcpy.c',
   'nir_opt_move.c',
   'nir_opt_move_discards_to_top.c',
   'nir_opt_move_reorder_loads.c',
   'nir_opt_move_to_top.c',
   'nir_opt_mqsad.c',
   'nir_opt_non_uniform_access.c',
@@ -423,6 +424,7 @@ if with_tests
      'tests/lower_discard_if_tests.cpp',
      'tests/minimize_call_live_states_test.cpp',
      'tests/mod_analysis_tests.cpp',
      'tests/move_reorder_loads_tests.cpp',
      'tests/negative_equal_tests.cpp',
      'tests/opt_if_tests.cpp',
      'tests/opt_loop_tests.cpp',
@@ -6501,6 +6501,9 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options);

bool nir_opt_move(nir_shader *shader, nir_move_options options);

unsigned nir_get_closest_use_instr_index(nir_instr *instr);
bool nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options);

typedef struct nir_opt_offsets_options {
   /** nir_load_uniform max base offset */
   uint32_t uniform_max;
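The hunk above exports two entry points: the pass itself and the helper that returns the instruction index of a def's nearest use. A minimal usage sketch of that helper, not part of this merge request: print_load_use_positions is a hypothetical name, and the loop assumes dead loads were already removed, since nir_get_closest_use_instr_index asserts when a def has no uses.

#include <stdio.h>
#include "nir.h"

static void
print_load_use_positions(nir_function_impl *impl)
{
   /* The helper compares nir_instr::index values, so the instr_index
    * metadata must be valid (the new pass and its test require it too).
    */
   nir_metadata_require(impl, nir_metadata_instr_index);

   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic ||
             nir_instr_as_intrinsic(instr)->intrinsic != nir_intrinsic_load_ssbo)
            continue;

         printf("load at index %u, closest use at index %u\n",
                instr->index, nir_get_closest_use_instr_index(instr));
      }
   }
}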
src/compiler/nir/nir_opt_move_reorder_loads.c (new file, 187 lines)
@@ -0,0 +1,187 @@
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 * SPDX-License-Identifier: MIT
 */

/* This pass moves (sinks / reorders) loads to make them execute in the order
 * their results are used. The loads are moved immediately after the load whose
 * use occurs sooner, which is the minimum distance necessary to move loads to
 * get them in the desired order. It doesn't move loads that don't need to be
 * moved, and it doesn't move loads between blocks, but it reorders loads within
 * a block even if their uses are outside the block.
 *
 * Such moves reduce live ranges for the load results, but increase live ranges
 * for the load srcs.
 *
 * Before:              After:
 *   %0 = load          %0 = load
 *   ...                ...
 *   %1 = load ─┐
 *   ...        │       ...
 *   %2 = load ─│──┐
 *   ...        │  │    ...
 *   %3 = load  │  │    %3 = load
 *              │  └─>  %2 = load
 *              └────>  %1 = load
 *   ...                ...
 *   use %0             use %0
 *   ...                ...
 *   use %3             use %3
 *   ...                ...
 *   use %2             use %2
 *   ...                ...
 *   use %1             use %1
 *
 * This is useful for hw that uses a load counter to wait for the N-th previous
 * load before a use. Executing loads in the order the results are used allows
 * the hw to wait only for the oldest load at any given time.
 *
 * If the hw has multiple load counters for different kinds of loads, it's
 * recommended to call this pass separately for each such counter using
 * different options.
 */

#include "nir.h"
#include "util/u_dynarray.h"

unsigned
nir_get_closest_use_instr_index(nir_instr *instr)
{
   unsigned closest_use_instr_index = UINT_MAX;

   nir_foreach_use_including_if(src, nir_instr_def(instr)) {
      unsigned this_use =
         nir_src_is_if(src) ?
            nir_if_first_then_block(nir_src_parent_if(src))->start_ip :
            nir_src_parent_instr(src)->index;

      closest_use_instr_index = MIN2(closest_use_instr_index, this_use);
   }

   /* This will fail only if instr has no use. */
   assert(closest_use_instr_index != UINT_MAX &&
          "dead code shouldn't be present");
   return closest_use_instr_index;
}

typedef struct load_info {
   nir_instr *instr;
   unsigned closest_use_instr_index;
} load_info;

static int
compare_closest_use(const void *a, const void *b)
{
   return ((load_info*)a)->closest_use_instr_index -
          ((load_info*)b)->closest_use_instr_index;
}

static bool
process_block(nir_block *block, nir_move_options options,
              struct util_dynarray *scratch)
{
   util_dynarray_clear(scratch);
   bool sorted = true;

   /* Gather all loads that we want to reorder. */
   nir_foreach_instr(instr, block) {
      if (nir_can_move_instr(instr, options)) {
         unsigned closest_use_instr_index = nir_get_closest_use_instr_index(instr);

         load_info info = {
            .instr = instr,
            .closest_use_instr_index = closest_use_instr_index,
         };

         /* If the previous load has its closest use after the closest use of
          * the current load, they must be reordered.
          */
         if (util_dynarray_num_elements(scratch, load_info) &&
             util_dynarray_last_ptr(scratch, load_info)->closest_use_instr_index >
             closest_use_instr_index)
            sorted = false;

         util_dynarray_append(scratch, load_info, info);
      }
   }

   /* Exit if the loads are already sorted. */
   if (sorted)
      return false;

   bool progress = false;
   unsigned num_loads = util_dynarray_num_elements(scratch, load_info);
   load_info *loads = util_dynarray_element(scratch, load_info, 0);

   /* Sort loads by the position of their use. This only sorts the gathered
    * loads in the array, which is necessary to determine their order.
    */
   qsort(loads, num_loads, sizeof(load_info), compare_closest_use);

   /* Sink loads that should be later. */
   for (unsigned i = 1; i < num_loads; i++) {
      load_info *prev = &loads[i - 1];
      load_info *cur = &loads[i];

      /* Check whether qsort did its job. */
      assert(prev->closest_use_instr_index <= cur->closest_use_instr_index);

      /* If prev should be before cur in the shader, but prev is after cur
       * in the shader, sink cur after prev.
       */
      if (prev->closest_use_instr_index < cur->closest_use_instr_index &&
          prev->instr->index > cur->instr->index) {
         nir_instr_move(nir_after_instr(prev->instr), cur->instr);
         /* Set the position of cur to where we moved it. */
         cur->instr->index = prev->instr->index;
         progress = true;
      }
   }

   return progress;
}

bool
nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options)
{
   /* Reject unexpected flags. */
   assert(!(options & ~(nir_move_tex_sample |
                        nir_move_tex_load |
                        nir_move_tex_load_fragment_mask |
                        nir_move_tex_lod |
                        nir_move_tex_query |
                        nir_move_load_image |
                        nir_move_load_image_fragment_mask |
                        nir_move_query_image |
                        nir_move_load_input |
                        nir_move_load_global |
                        nir_move_load_ubo |
                        nir_move_load_ssbo |
                        nir_move_load_buffer_amd |
                        nir_move_only_convergent |
                        nir_move_only_divergent)));
   bool any_progress = false;

   struct util_dynarray scratch;
   util_dynarray_init(&scratch, NULL);

   nir_foreach_function_impl(impl, nir) {
      bool progress = false;

      nir_metadata_require(impl, nir_metadata_instr_index |
                                 (options & (nir_move_only_convergent |
                                             nir_move_only_divergent) ?
                                     nir_metadata_divergence : 0));

      nir_foreach_block(block, impl) {
         progress |= process_block(block, options, &scratch);
      }

      any_progress |= nir_progress(progress, impl,
                                   nir_metadata_control_flow |
                                   nir_metadata_divergence);
   }

   util_dynarray_fini(&scratch);
   return any_progress;
}
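The pass's header comment motivates the reordering with hardware that waits on a load counter for the N-th previous load. The standalone C sketch below, which is not part of this merge request, models that behavior under the assumption that loads retire in issue order, so waiting for one load also waits for everything issued before it; the names (count_over_waiting, NUM_LOADS) are made up for illustration. It shows that consuming results in issue order never forces a load to complete before its result is needed, while consuming them in reverse order stalls on every outstanding load at the very first use.

#include <stdio.h>
#include <stdbool.h>

#define NUM_LOADS 4

/* Total number of loads forced to complete before their results are needed,
 * for a given order in which the results are consumed. Loads are issued in
 * the order 0, 1, ..., NUM_LOADS-1 and retire in that order, so waiting for
 * load i also waits for every load issued before it.
 */
static unsigned
count_over_waiting(const unsigned use_order[NUM_LOADS])
{
   bool needed[NUM_LOADS] = {false};
   bool completed[NUM_LOADS] = {false};
   unsigned over_waits = 0;

   for (unsigned u = 0; u < NUM_LOADS; u++) {
      unsigned load = use_order[u];
      needed[load] = true;

      for (unsigned i = 0; i <= load; i++) {
         if (!completed[i]) {
            completed[i] = true;
            if (!needed[i])
               over_waits++; /* completed before anything needed its result */
         }
      }
   }
   return over_waits;
}

int
main(void)
{
   const unsigned in_issue_order[NUM_LOADS] = {0, 1, 2, 3};
   const unsigned reversed[NUM_LOADS] = {3, 2, 1, 0};

   /* Prints 0: each wait only covers the oldest outstanding load. */
   printf("uses in issue order:   %u loads over-waited\n",
          count_over_waiting(in_issue_order));
   /* Prints 3: the first use already waits for all four loads. */
   printf("uses in reverse order: %u loads over-waited\n",
          count_over_waiting(reversed));
   return 0;
}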
src/compiler/nir/tests/move_reorder_loads_tests.cpp (new file, 87 lines)
@@ -0,0 +1,87 @@
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 * SPDX-License-Identifier: MIT
 */

#include <gtest/gtest.h>

#include "nir.h"
#include "nir_builder.h"

namespace {

class nir_move_reorder_test : public ::testing::Test {
protected:
   nir_move_reorder_test()
   {
      glsl_type_singleton_init_or_ref();
   }

   ~nir_move_reorder_test()
   {
      if (HasFailure()) {
         printf("\nShader from the failed test:\n\n");
         nir_print_shader(nir, stdout);
      }
   }

   nir_shader *nir;
};

TEST_F(nir_move_reorder_test, ssbo)
{
   nir_shader_compiler_options options = {0};
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, &options, "n");
   nir_def *undef = nir_undef(&b, 1, 32);
   nir_def *loads[1000];
   const unsigned num_loads = ARRAY_SIZE(loads);

   this->nir = b.shader;

   /* Insert loads. */
   for (unsigned i = 0; i < num_loads; i++) {
      loads[i] = nir_load_ssbo(&b, 1, 32, undef, undef);
      nir_intrinsic_set_access(nir_instr_as_intrinsic(loads[i]->parent_instr),
                               ACCESS_CAN_REORDER);
   }

   srand(0x54987321);

   /* Permute the loads in the array using Fisher–Yates shuffle. */
   for (unsigned i = 0; i < num_loads - 2; i++) {
      unsigned j = i + rand() % (num_loads - i);
      assert(j < num_loads);

      nir_def *tmp = loads[i];
      loads[i] = loads[j];
      loads[j] = tmp;
   }

   /* Generate uses that use the loads in the permuted order. */
   for (unsigned i = 0; i < num_loads; i++)
      nir_ineg(&b, loads[i]);

   NIR_PASS(_, b.shader, nir_opt_move_reorder_loads, nir_move_load_ssbo);

   nir_metadata_require(b.impl, nir_metadata_instr_index);

   /* Verify that the loads are sorted in the block by the position of their
    * closest use.
    */
   unsigned prev_load_closest_use = 0;

   nir_foreach_instr(instr, nir_start_block(b.impl)) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      unsigned closest_use = nir_get_closest_use_instr_index(instr);

      if (prev_load_closest_use) {
         ASSERT_LT(prev_load_closest_use, closest_use);
      }

      prev_load_closest_use = closest_use;
   }
}

}
@@ -1042,6 +1042,33 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *
   /* This must be done after si_nir_late_opts() because it may generate vec const. */
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);

   if (sel->screen->info.gfx_level >= GFX12) {
      /* loadcnt */
      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);

      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod);
   } else {
      /* vmcnt */
      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod |
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);
   }

   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
    * order can help the backend scheduler)
    */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);

   /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
    * 200 is tuned for Viewperf. It should be done last.
    */