Mirror of https://gitlab.freedesktop.org/mesa/mesa.git, synced 2025-12-20 05:10:11 +01:00
Merge branch 'nir-move-reorder-loads' into 'main'

Draft: nir: add new pass nir_opt_move_reorder_loads for ACO

See merge request mesa/mesa!36244

Commit 8ea3a1ed60
6 changed files with 335 additions and 0 deletions
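In short, the new pass reorders loads within a basic block so that they execute in the order their results are first used, and backends are expected to run it once per hardware wait counter, each time with the load kinds that counter tracks. Below is a condensed sketch of that usage pattern, distilled from the RADV hunk further down; the wrapper function name is made up here and the exact option masks are a per-backend choice.

/* Sketch only: one nir_opt_move_reorder_loads call per wait counter,
 * mirroring the GFX12 part of the RADV change below. */
static void
reorder_loads_for_gfx12(nir_shader *nir)
{
   /* loadcnt: divergent memory loads, ordered by first use. */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
            nir_move_only_divergent);

   /* samplecnt: texture sampling (unaffected by nir_move_only_divergent). */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_tex_sample | nir_move_tex_lod);
}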
@@ -606,6 +606,35 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
      NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons);
   }

   if (gfx_level >= GFX12) {
      /* loadcnt */
      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);

      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod);
   } else {
      /* vmcnt */
      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod |
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);
   }

   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
    * order can help the backend scheduler)
    */
   NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);

   NIR_PASS(_, stage->nir, nir_opt_group_loads, nir_group_same_resource_only, 16);

   stage->info.nir_shared_size = stage->nir->info.shared_size;
}
@@ -281,6 +281,7 @@ else
  'nir_opt_memcpy.c',
  'nir_opt_move.c',
  'nir_opt_move_discards_to_top.c',
  'nir_opt_move_reorder_loads.c',
  'nir_opt_move_to_top.c',
  'nir_opt_mqsad.c',
  'nir_opt_non_uniform_access.c',
@@ -423,6 +424,7 @@ if with_tests
    'tests/lower_discard_if_tests.cpp',
    'tests/minimize_call_live_states_test.cpp',
    'tests/mod_analysis_tests.cpp',
    'tests/move_reorder_loads_tests.cpp',
    'tests/negative_equal_tests.cpp',
    'tests/opt_if_tests.cpp',
    'tests/opt_loop_tests.cpp',
@@ -6501,6 +6501,9 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options);

bool nir_opt_move(nir_shader *shader, nir_move_options options);

unsigned nir_get_closest_use_instr_index(nir_instr *instr);
bool nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options);

typedef struct nir_opt_offsets_options {
   /** nir_load_uniform max base offset */
   uint32_t uniform_max;
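nir_get_closest_use_instr_index() is exported alongside the pass because the unit test below also uses it. To illustrate what it returns, here is a hedged sketch (not part of this MR; the name distance_to_first_use is hypothetical, and it assumes nir_metadata_instr_index has already been required on the impl and that the instruction has at least one use):

/* Hypothetical illustration: how many instruction indices separate a def
 * from its closest (first) use. */
static unsigned
distance_to_first_use(nir_instr *instr)
{
   return nir_get_closest_use_instr_index(instr) - instr->index;
}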
src/compiler/nir/nir_opt_move_reorder_loads.c (new file, 187 lines)
@@ -0,0 +1,187 @@
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 * SPDX-License-Identifier: MIT
 */

/* This pass moves (sinks / reorders) loads to make them execute in the order
 * their results are used. The loads are moved immediately after the load whose
 * use occurs sooner, which is the minimum distance necessary to move loads to
 * get them in the desired order. It doesn't move loads that don't need to be
 * moved, and it doesn't move loads between blocks, but it reorders loads within
 * a block even if their uses are outside the block.
 *
 * Such moves reduce live ranges for the load results, but increase live ranges
 * for the load srcs.
 *
 * Before:              After:
 *    %0 = load            %0 = load
 *    ...                  ...
 *    %1 = load ─┐
 *    ...        │         ...
 *    %2 = load ─│──┐
 *    ...        │  │      ...
 *    %3 = load  │  │      %3 = load
 *               │  └─>    %2 = load
 *               └────>    %1 = load
 *    ...                  ...
 *    use %0               use %0
 *    ...                  ...
 *    use %3               use %3
 *    ...                  ...
 *    use %2               use %2
 *    ...                  ...
 *    use %1               use %1
 *
 * This is useful for hw that uses a load counter to wait for the N-th previous
 * load before a use. Executing loads in the order the results are used allows
 * the hw to wait only for the oldest load at any given time.
 *
 * If the hw has multiple load counters for different kinds of loads, it's
 * recommended to call this pass separately for each such counter using
 * different options.
 */

#include "nir.h"
#include "util/u_dynarray.h"

unsigned
nir_get_closest_use_instr_index(nir_instr *instr)
{
   unsigned closest_use_instr_index = UINT_MAX;

   nir_foreach_use_including_if(src, nir_instr_def(instr)) {
      unsigned this_use =
         nir_src_is_if(src) ?
            nir_if_first_then_block(nir_src_parent_if(src))->start_ip :
            nir_src_parent_instr(src)->index;

      closest_use_instr_index = MIN2(closest_use_instr_index, this_use);
   }

   /* This will fail only if instr has no use. */
   assert(closest_use_instr_index != UINT_MAX &&
          "dead code shouldn't be present");
   return closest_use_instr_index;
}

typedef struct load_info {
   nir_instr *instr;
   unsigned closest_use_instr_index;
} load_info;

static int
compare_closest_use(const void *a, const void *b)
{
   return ((load_info*)a)->closest_use_instr_index -
          ((load_info*)b)->closest_use_instr_index;
}

static bool
process_block(nir_block *block, nir_move_options options,
              struct util_dynarray *scratch)
{
   util_dynarray_clear(scratch);
   bool sorted = true;

   /* Gather all loads that we want to reorder. */
   nir_foreach_instr(instr, block) {
      if (nir_can_move_instr(instr, options)) {
         unsigned closest_use_instr_index = nir_get_closest_use_instr_index(instr);

         load_info info = {
            .instr = instr,
            .closest_use_instr_index = closest_use_instr_index,
         };

         /* If the previous load has its closest use after the closest use of
          * the current load, they must be reordered.
          */
         if (util_dynarray_num_elements(scratch, load_info) &&
             util_dynarray_last_ptr(scratch, load_info)->closest_use_instr_index >
             closest_use_instr_index)
            sorted = false;

         util_dynarray_append(scratch, load_info, info);
      }
   }

   /* Exit if the loads are already sorted. */
   if (sorted)
      return false;

   bool progress = false;
   unsigned num_loads = util_dynarray_num_elements(scratch, load_info);
   load_info *loads = util_dynarray_element(scratch, load_info, 0);

   /* Sort loads by the position of their use. This only sorts the gathered
    * loads in the array, which is necessary to determine their order.
    */
   qsort(loads, num_loads, sizeof(load_info), compare_closest_use);

   /* Sink loads that should be later. */
   for (unsigned i = 1; i < num_loads; i++) {
      load_info *prev = &loads[i - 1];
      load_info *cur = &loads[i];

      /* Check whether qsort did its job. */
      assert(prev->closest_use_instr_index <= cur->closest_use_instr_index);

      /* If prev should be before cur in the shader, but prev is after cur
       * in the shader, sink cur after prev.
       */
      if (prev->closest_use_instr_index < cur->closest_use_instr_index &&
          prev->instr->index > cur->instr->index) {
         nir_instr_move(nir_after_instr(prev->instr), cur->instr);
         /* Set the position of cur to where we moved it. */
         cur->instr->index = prev->instr->index;
         progress = true;
      }
   }

   return progress;
}

bool
nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options)
{
   /* Reject unexpected flags. */
   assert(!(options & ~(nir_move_tex_sample |
                        nir_move_tex_load |
                        nir_move_tex_load_fragment_mask |
                        nir_move_tex_lod |
                        nir_move_tex_query |
                        nir_move_load_image |
                        nir_move_load_image_fragment_mask |
                        nir_move_query_image |
                        nir_move_load_input |
                        nir_move_load_global |
                        nir_move_load_ubo |
                        nir_move_load_ssbo |
                        nir_move_load_buffer_amd |
                        nir_move_only_convergent |
                        nir_move_only_divergent)));
   bool any_progress = false;

   struct util_dynarray scratch;
   util_dynarray_init(&scratch, NULL);

   nir_foreach_function_impl(impl, nir) {
      bool progress = false;

      nir_metadata_require(impl, nir_metadata_instr_index |
                                 (options & (nir_move_only_convergent |
                                             nir_move_only_divergent) ?
                                     nir_metadata_divergence : 0));

      nir_foreach_block(block, impl) {
         progress |= process_block(block, options, &scratch);
      }

      any_progress |= nir_progress(progress, impl,
                                   nir_metadata_control_flow |
                                   nir_metadata_divergence);
   }

   util_dynarray_fini(&scratch);
   return any_progress;
}
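To make the sort-and-sink rule in process_block() concrete outside of NIR, here is a small self-contained toy model (illustrative only: the struct, positions, and first-use indices below are made up and are not part of this MR). It reproduces the Before/After diagram from the header comment: three loads emitted in one order but first used in the opposite order.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for load_info. */
typedef struct {
   const char *name;
   unsigned block_pos;   /* current position of the load in the block */
   unsigned closest_use; /* instruction index of its closest use */
} toy_load;

/* Same ordering criterion as compare_closest_use(), written without the
 * unsigned-subtraction shortcut. */
static int
cmp_closest_use(const void *a, const void *b)
{
   const toy_load *la = (const toy_load *)a, *lb = (const toy_load *)b;
   return (la->closest_use > lb->closest_use) - (la->closest_use < lb->closest_use);
}

int
main(void)
{
   /* %1, %2, %3 are loaded in this order but first used in reverse order. */
   toy_load loads[] = {
      { "%1", 2, 90 },
      { "%2", 4, 70 },
      { "%3", 6, 50 },
   };
   const unsigned num = sizeof(loads) / sizeof(loads[0]);

   /* Sort by closest use, then sink each load right after its predecessor
    * in the sorted order if it currently sits before that predecessor. */
   qsort(loads, num, sizeof(loads[0]), cmp_closest_use);

   for (unsigned i = 1; i < num; i++) {
      toy_load *prev = &loads[i - 1], *cur = &loads[i];

      if (prev->closest_use < cur->closest_use && prev->block_pos > cur->block_pos) {
         printf("sink %s right after %s\n", cur->name, prev->name);
         cur->block_pos = prev->block_pos;
      }
   }
   return 0;
}

Running this prints "sink %2 right after %3" followed by "sink %1 right after %2", which is exactly the "After" column of the diagram above.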
src/compiler/nir/tests/move_reorder_loads_tests.cpp (new file, 87 lines)
@@ -0,0 +1,87 @@
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 * SPDX-License-Identifier: MIT
 */

#include <gtest/gtest.h>

#include "nir.h"
#include "nir_builder.h"

namespace {

class nir_move_reorder_test : public ::testing::Test {
protected:
   nir_move_reorder_test()
   {
      glsl_type_singleton_init_or_ref();
   }

   ~nir_move_reorder_test()
   {
      if (HasFailure()) {
         printf("\nShader from the failed test:\n\n");
         nir_print_shader(nir, stdout);
      }
   }

   nir_shader *nir;
};

TEST_F(nir_move_reorder_test, ssbo)
{
   nir_shader_compiler_options options = {0};
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, &options, "n");
   nir_def *undef = nir_undef(&b, 1, 32);
   nir_def *loads[1000];
   const unsigned num_loads = ARRAY_SIZE(loads);

   this->nir = b.shader;

   /* Insert loads. */
   for (unsigned i = 0; i < num_loads; i++) {
      loads[i] = nir_load_ssbo(&b, 1, 32, undef, undef);
      nir_intrinsic_set_access(nir_instr_as_intrinsic(loads[i]->parent_instr),
                               ACCESS_CAN_REORDER);
   }

   srand(0x54987321);

   /* Permute the loads in the array using Fisher–Yates shuffle. */
   for (unsigned i = 0; i < num_loads - 2; i++) {
      unsigned j = i + rand() % (num_loads - i);
      assert(j < num_loads);

      nir_def *tmp = loads[i];
      loads[i] = loads[j];
      loads[j] = tmp;
   }

   /* Generate uses that use the loads in the permuted order. */
   for (unsigned i = 0; i < num_loads; i++)
      nir_ineg(&b, loads[i]);

   NIR_PASS(_, b.shader, nir_opt_move_reorder_loads, nir_move_load_ssbo);

   nir_metadata_require(b.impl, nir_metadata_instr_index);

   /* Verify that the loads are sorted in the block by the position of their
    * closest use.
    */
   unsigned prev_load_closest_use = 0;

   nir_foreach_instr(instr, nir_start_block(b.impl)) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      unsigned closest_use = nir_get_closest_use_instr_index(instr);

      if (prev_load_closest_use) {
         ASSERT_LT(prev_load_closest_use, closest_use);
      }

      prev_load_closest_use = closest_use;
   }
}

}
@@ -1042,6 +1042,33 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *
   /* This must be done after si_nir_late_opts() because it may generate vec const. */
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);

   if (sel->screen->info.gfx_level >= GFX12) {
      /* loadcnt */
      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);

      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod);
   } else {
      /* vmcnt */
      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
               nir_move_tex_sample | nir_move_tex_lod |
               nir_move_tex_load | nir_move_tex_load_fragment_mask |
               nir_move_load_image | nir_move_load_image_fragment_mask |
               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
               nir_move_load_buffer_amd | nir_move_only_divergent);
   }

   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
    * order can help the backend scheduler)
    */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);

   /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
    * 200 is tuned for Viewperf. It should be done last.
    */