diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 3f39e28e01e..87dd1f79c12 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -606,6 +606,35 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
       NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons);
    }
 
+   if (gfx_level >= GFX12) {
+      /* loadcnt */
+      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
+               nir_move_tex_load | nir_move_tex_load_fragment_mask |
+               nir_move_load_image | nir_move_load_image_fragment_mask |
+               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
+               nir_move_load_buffer_amd | nir_move_only_divergent);
+
+      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
+      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
+               nir_move_tex_sample | nir_move_tex_lod);
+   } else {
+      /* vmcnt */
+      NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
+               nir_move_tex_sample | nir_move_tex_lod |
+               nir_move_tex_load | nir_move_tex_load_fragment_mask |
+               nir_move_load_image | nir_move_load_image_fragment_mask |
+               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
+               nir_move_load_buffer_amd | nir_move_only_divergent);
+   }
+
+   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
+    * order can help the backend scheduler)
+    */
+   NIR_PASS(_, stage->nir, nir_opt_move_reorder_loads,
+            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);
+
+   NIR_PASS(_, stage->nir, nir_opt_group_loads, nir_group_same_resource_only, 16);
+
    stage->info.nir_shared_size = stage->nir->info.shared_size;
 }
 
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index e13cd1df407..33b9746d909 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -281,6 +281,7 @@ else
     'nir_opt_memcpy.c',
     'nir_opt_move.c',
     'nir_opt_move_discards_to_top.c',
+    'nir_opt_move_reorder_loads.c',
     'nir_opt_move_to_top.c',
     'nir_opt_mqsad.c',
     'nir_opt_non_uniform_access.c',
@@ -423,6 +424,7 @@ if with_tests
     'tests/lower_discard_if_tests.cpp',
     'tests/minimize_call_live_states_test.cpp',
     'tests/mod_analysis_tests.cpp',
+    'tests/move_reorder_loads_tests.cpp',
     'tests/negative_equal_tests.cpp',
     'tests/opt_if_tests.cpp',
     'tests/opt_loop_tests.cpp',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 7209d28e853..c4ea3adbbfa 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -6501,6 +6501,9 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options);
 
 bool nir_opt_move(nir_shader *shader, nir_move_options options);
 
+unsigned nir_get_closest_use_instr_index(nir_instr *instr);
+bool nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options);
+
 typedef struct nir_opt_offsets_options {
    /** nir_load_uniform max base offset */
    uint32_t uniform_max;
diff --git a/src/compiler/nir/nir_opt_move_reorder_loads.c b/src/compiler/nir/nir_opt_move_reorder_loads.c
new file mode 100644
index 00000000000..5c8efc82c2e
--- /dev/null
+++ b/src/compiler/nir/nir_opt_move_reorder_loads.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* This pass moves (sinks / reorders) loads to make them execute in the order
+ * their results are used. The loads are moved immediately after the load whose
+ * use occurs sooner, which is the minimum distance necessary to move loads to
+ * get them in the desired order. It doesn't move loads that don't need to be
+ * moved, and it doesn't move loads between blocks, but it reorders loads within
+ * a block even if their uses are outside the block.
+ *
+ * Such moves reduce live ranges for the load results, but increase live ranges
+ * for the load srcs.
+ *
+ * Before:                  After:
+ *    %0 = load                %0 = load
+ *    ...                      ...
+ *    %1 = load ─┐
+ *    ...        │             ...
+ *    %2 = load ─│──┐
+ *    ...        │  │          ...
+ *    %3 = load  │  │          %3 = load
+ *               │  └─>        %2 = load
+ *               └────>        %1 = load
+ *    ...                      ...
+ *    use %0                   use %0
+ *    ...                      ...
+ *    use %3                   use %3
+ *    ...                      ...
+ *    use %2                   use %2
+ *    ...                      ...
+ *    use %1                   use %1
+ *
+ * This is useful for hw that uses a load counter to wait for the N-th previous
+ * load before a use. Executing loads in the order the results are used allows
+ * the hw to wait only for the oldest load at any given time.
+ *
+ * If the hw has multiple load counters for different kinds of loads, it's
+ * recommended to call this pass separately for each such counter using
+ * different options.
+ */
+
+#include "nir.h"
+#include "util/u_dynarray.h"
+
+unsigned
+nir_get_closest_use_instr_index(nir_instr *instr)
+{
+   unsigned closest_use_instr_index = UINT_MAX;
+
+   nir_foreach_use_including_if(src, nir_instr_def(instr)) {
+      unsigned this_use =
+         nir_src_is_if(src) ?
+            nir_if_first_then_block(nir_src_parent_if(src))->start_ip :
+            nir_src_parent_instr(src)->index;
+
+      closest_use_instr_index = MIN2(closest_use_instr_index, this_use);
+   }
+
+   /* This will fail only if instr has no use. */
+   assert(closest_use_instr_index != UINT_MAX &&
+          "dead code shouldn't be present");
+   return closest_use_instr_index;
+}
+
+typedef struct load_info {
+   nir_instr *instr;
+   unsigned closest_use_instr_index;
+} load_info;
+
+static int
+compare_closest_use(const void *a, const void *b)
+{
+   return ((load_info*)a)->closest_use_instr_index -
+          ((load_info*)b)->closest_use_instr_index;
+}
+
+static bool
+process_block(nir_block *block, nir_move_options options,
+              struct util_dynarray *scratch)
+{
+   util_dynarray_clear(scratch);
+   bool sorted = true;
+
+   /* Gather all loads that we want to reorder. */
+   nir_foreach_instr(instr, block) {
+      if (nir_can_move_instr(instr, options)) {
+         unsigned closest_use_instr_index = nir_get_closest_use_instr_index(instr);
+
+         load_info info = {
+            .instr = instr,
+            .closest_use_instr_index = closest_use_instr_index,
+         };
+
+         /* If the previous load has its closest use after the closest use of
+          * the current load, they must be reordered.
+          */
+         if (util_dynarray_num_elements(scratch, load_info) &&
+             util_dynarray_last_ptr(scratch, load_info)->closest_use_instr_index >
+             closest_use_instr_index)
+            sorted = false;
+
+         util_dynarray_append(scratch, load_info, info);
+      }
+   }
+
+   /* Exit if the loads are already sorted. */
+   if (sorted)
+      return false;
+
+   bool progress = false;
+   unsigned num_loads = util_dynarray_num_elements(scratch, load_info);
+   load_info *loads = util_dynarray_element(scratch, load_info, 0);
+
+   /* Sort loads by the position of their use. This only sorts the gathered
+    * loads in the array, which is necessary to determine their order.
+    */
+   qsort(loads, num_loads, sizeof(load_info), compare_closest_use);
+
+   /* Sink loads that should be later. */
+   for (unsigned i = 1; i < num_loads; i++) {
+      load_info *prev = &loads[i - 1];
+      load_info *cur = &loads[i];
+
+      /* Check whether qsort did its job. */
+      assert(prev->closest_use_instr_index <= cur->closest_use_instr_index);
+
+      /* If prev should be before cur in the shader, but prev is after cur
+       * in the shader, sink cur after prev.
+       */
+      if (prev->closest_use_instr_index < cur->closest_use_instr_index &&
+          prev->instr->index > cur->instr->index) {
+         nir_instr_move(nir_after_instr(prev->instr), cur->instr);
+         /* Set the position of cur to where we moved it. */
+         cur->instr->index = prev->instr->index;
+         progress = true;
+      }
+   }
+
+   return progress;
+}
+
+bool
+nir_opt_move_reorder_loads(nir_shader *nir, nir_move_options options)
+{
+   /* Reject unexpected flags. */
+   assert(!(options & ~(nir_move_tex_sample |
+                        nir_move_tex_load |
+                        nir_move_tex_load_fragment_mask |
+                        nir_move_tex_lod |
+                        nir_move_tex_query |
+                        nir_move_load_image |
+                        nir_move_load_image_fragment_mask |
+                        nir_move_query_image |
+                        nir_move_load_input |
+                        nir_move_load_global |
+                        nir_move_load_ubo |
+                        nir_move_load_ssbo |
+                        nir_move_load_buffer_amd |
+                        nir_move_only_convergent |
+                        nir_move_only_divergent)));
+   bool any_progress = false;
+
+   struct util_dynarray scratch;
+   util_dynarray_init(&scratch, NULL);
+
+   nir_foreach_function_impl(impl, nir) {
+      bool progress = false;
+
+      nir_metadata_require(impl, nir_metadata_instr_index |
+                           (options & (nir_move_only_convergent |
+                                       nir_move_only_divergent) ?
+                            nir_metadata_divergence : 0));
+
+      nir_foreach_block(block, impl) {
+         progress |= process_block(block, options, &scratch);
+      }
+
+      any_progress |= nir_progress(progress, impl,
+                                   nir_metadata_control_flow |
+                                   nir_metadata_divergence);
+   }
+
+   util_dynarray_fini(&scratch);
+   return any_progress;
+}
diff --git a/src/compiler/nir/tests/move_reorder_loads_tests.cpp b/src/compiler/nir/tests/move_reorder_loads_tests.cpp
new file mode 100644
index 00000000000..6679ffd4a71
--- /dev/null
+++ b/src/compiler/nir/tests/move_reorder_loads_tests.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <gtest/gtest.h>
+
+#include "nir.h"
+#include "nir_builder.h"
+
+namespace {
+
+class nir_move_reorder_test : public ::testing::Test {
+protected:
+   nir_move_reorder_test()
+   {
+      glsl_type_singleton_init_or_ref();
+   }
+
+   ~nir_move_reorder_test()
+   {
+      if (HasFailure()) {
+         printf("\nShader from the failed test:\n\n");
+         nir_print_shader(nir, stdout);
+      }
+   }
+
+   nir_shader *nir;
+};
+
+TEST_F(nir_move_reorder_test, ssbo)
+{
+   nir_shader_compiler_options options = {0};
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, &options, "n");
+   nir_def *undef = nir_undef(&b, 1, 32);
+   nir_def *loads[1000];
+   const unsigned num_loads = ARRAY_SIZE(loads);
+
+   this->nir = b.shader;
+
+   /* Insert loads. */
+   for (unsigned i = 0; i < num_loads; i++) {
+      loads[i] = nir_load_ssbo(&b, 1, 32, undef, undef);
+      nir_intrinsic_set_access(nir_instr_as_intrinsic(loads[i]->parent_instr),
+                               ACCESS_CAN_REORDER);
+   }
+
+   srand(0x54987321);
+
+   /* Permute the loads in the array using Fisher–Yates shuffle. */
+   for (unsigned i = 0; i < num_loads - 2; i++) {
+      unsigned j = i + rand() % (num_loads - i);
+      assert(j < num_loads);
+
+      nir_def *tmp = loads[i];
+      loads[i] = loads[j];
+      loads[j] = tmp;
+   }
+
+   /* Generate uses that use the loads in the permuted order. */
+   for (unsigned i = 0; i < num_loads; i++)
+      nir_ineg(&b, loads[i]);
+
+   NIR_PASS(_, b.shader, nir_opt_move_reorder_loads, nir_move_load_ssbo);
+
+   nir_metadata_require(b.impl, nir_metadata_instr_index);
+
+   /* Verify that the loads are sorted in the block by the position of their
+    * closest use.
+    */
+   unsigned prev_load_closest_use = 0;
+
+   nir_foreach_instr(instr, nir_start_block(b.impl)) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      unsigned closest_use = nir_get_closest_use_instr_index(instr);
+
+      if (prev_load_closest_use) {
+         ASSERT_LT(prev_load_closest_use, closest_use);
+      }
+
+      prev_load_closest_use = closest_use;
+   }
+}
+
+}
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 728984304c5..747cf95abfa 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1042,6 +1042,33 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *
    /* This must be done after si_nir_late_opts() because it may generate vec const. */
    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
 
+   if (sel->screen->info.gfx_level >= GFX12) {
+      /* loadcnt */
+      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
+               nir_move_tex_load | nir_move_tex_load_fragment_mask |
+               nir_move_load_image | nir_move_load_image_fragment_mask |
+               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
+               nir_move_load_buffer_amd | nir_move_only_divergent);
+
+      /* samplecnt (these flags are unaffected by nir_move_only_divergent) */
+      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
+               nir_move_tex_sample | nir_move_tex_lod);
+   } else {
+      /* vmcnt */
+      NIR_PASS(_, nir, nir_opt_move_reorder_loads,
+               nir_move_tex_sample | nir_move_tex_lod |
+               nir_move_tex_load | nir_move_tex_load_fragment_mask |
+               nir_move_load_image | nir_move_load_image_fragment_mask |
+               nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
+               nir_move_load_buffer_amd | nir_move_only_divergent);
+   }
+
+   /* lgkmcnt/kmcnt (even though SMEM can finish out of order, putting the loads in the optimal
+    * order can help the backend scheduler)
+    */
+   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
+            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo | nir_move_only_convergent);
+
    /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
     * 200 is tuned for Viewperf. It should be done last.
     */
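
Editor's note, not part of the patch above: the following is a minimal, hypothetical sketch of how another NIR backend with a single VMEM-style load counter might wire up the new pass, mirroring the pre-GFX12 "vmcnt" path in the two hunks above. The helper name, the `nir` parameter, and the exact flag set are illustrative assumptions, not Mesa code.

#include "nir.h"

/* Hypothetical backend hook; mirrors the pre-GFX12 "vmcnt" calls above.
 * Assumes `nir` is the shader being compiled, late in the backend's
 * optimization pipeline. */
static void
reorder_loads_for_single_counter(nir_shader *nir)
{
   /* One call covers every divergent load kind tracked by the single
    * load counter, so the hardware only ever waits for the oldest load. */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_tex_sample | nir_move_tex_lod |
            nir_move_tex_load | nir_move_load_image |
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
            nir_move_only_divergent);

   /* Convergent (SMEM-style) loads are ordered in a separate call so the
    * backend scheduler also sees them in use order. */
   NIR_PASS(_, nir, nir_opt_move_reorder_loads,
            nir_move_load_global | nir_move_load_ubo | nir_move_load_ssbo |
            nir_move_only_convergent);
}

As in the patch, hardware with separate sample and load counters would instead split the first call per counter, the way the GFX12 branch does for loadcnt and samplecnt.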