From 9c21279c6007e270caf3e892378af3afcaf22847 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 3 Dec 2025 23:00:48 -0500
Subject: [PATCH 1/9] nir/opt_licm: add a private state structure for the pass

The structure will grow in later commits.

The major change is that the preheader and exit blocks are replaced
by tracking just the innermost optimized nir_loop * and getting the
predecessor and successor blocks out of it.
---
 src/compiler/nir/nir.h          | 12 ++++++
 src/compiler/nir/nir_opt_licm.c | 71 ++++++++++++++++++++++-----------
 2 files changed, 59 insertions(+), 24 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index b12223909e6..88e61b72664 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4731,6 +4731,18 @@ nir_block *nir_cf_node_cf_tree_prev(nir_cf_node *node);
         block != nir_cf_node_cf_tree_prev(node);               \
         block = prev, prev = nir_block_cf_tree_prev(block))
 
+static inline nir_block *
+nir_loop_predecessor_block(nir_loop *loop)
+{
+   return nir_cf_node_cf_tree_prev(&loop->cf_node);
+}
+
+static inline nir_block *
+nir_loop_successor_block(nir_loop *loop)
+{
+   return nir_cf_node_cf_tree_next(&loop->cf_node);
+}
+
 /* If the following CF node is an if, this function returns that if.
  * Otherwise, it returns NULL.
  */
diff --git a/src/compiler/nir/nir_opt_licm.c b/src/compiler/nir/nir_opt_licm.c
index 8ede208b4cc..c3236df0d0f 100644
--- a/src/compiler/nir/nir_opt_licm.c
+++ b/src/compiler/nir/nir_opt_licm.c
@@ -5,15 +5,24 @@
 
 #include "nir.h"
 
+typedef struct {
+   nir_loop *loop;
+} licm_state;
+
 static bool
-defined_before_loop(nir_src *src, void *state)
+defined_before_loop(nir_src *src, void *_state)
 {
-   unsigned *loop_preheader_idx = state;
-   return nir_def_block(src->ssa)->index <= *loop_preheader_idx;
+   licm_state *state = (licm_state *)_state;
+
+   /* The current instruction is loop-invariant only if its sources are before
+    * the loop.
+    */
+   return nir_def_block(src->ssa)->index <=
+          nir_loop_predecessor_block(state->loop)->index;
 }
 
 static bool
-is_instr_loop_invariant(nir_instr *instr, unsigned loop_preheader_idx)
+is_instr_loop_invariant(nir_instr *instr, licm_state *state)
 {
    switch (instr->type) {
    case nir_instr_type_load_const:
@@ -28,7 +37,7 @@ is_instr_loop_invariant(nir_instr *instr, unsigned loop_preheader_idx)
    case nir_instr_type_alu:
    case nir_instr_type_tex:
    case nir_instr_type_deref:
-      return nir_foreach_src(instr, defined_before_loop, &loop_preheader_idx);
+      return nir_foreach_src(instr, defined_before_loop, state);
 
    case nir_instr_type_phi:
    case nir_instr_type_call:
@@ -39,13 +48,16 @@ is_instr_loop_invariant(nir_instr *instr, unsigned loop_preheader_idx)
 }
 
 static bool
-visit_block(nir_block *block, nir_block *preheader)
+visit_block(nir_block *block, licm_state *state)
 {
+   assert(state->loop);
+
    bool progress = false;
    nir_foreach_instr_safe(instr, block) {
-      if (is_instr_loop_invariant(instr, preheader->index)) {
+      if (is_instr_loop_invariant(instr, state)) {
          nir_instr_remove(instr);
-         nir_instr_insert_after_block(preheader, instr);
+         nir_instr_insert_after_block(nir_loop_predecessor_block(state->loop),
+                                      instr);
          progress = true;
       }
    }
@@ -80,38 +92,47 @@ should_optimize_loop(nir_loop *loop)
 }
 
 static bool
-visit_cf_list(struct exec_list *list, nir_block *preheader, nir_block *exit)
+visit_cf_list(struct exec_list *list, licm_state *state)
 {
    bool progress = false;
 
    foreach_list_typed(nir_cf_node, node, node, list) {
       switch (node->type) {
       case nir_cf_node_block: {
-         /* By only visiting blocks which dominate the loop exit, we
-          * ensure that we don't speculatively hoist any instructions
+         /* By only visiting blocks which dominate the block after the loop,
+          * we ensure that we don't speculatively hoist any instructions
           * which otherwise might not be executed.
           *
           * Note, that the proper check would be whether this block
-          * postdominates the loop preheader.
+          * postdominates the block before the loop.
           */
          nir_block *block = nir_cf_node_as_block(node);
-         if (exit && nir_block_dominates(block, exit))
-            progress |= visit_block(block, preheader);
+         if (state->loop &&
+             nir_block_dominates(block, nir_loop_successor_block(state->loop)))
+            progress |= visit_block(block, state);
          break;
       }
       case nir_cf_node_if: {
          nir_if *nif = nir_cf_node_as_if(node);
-         progress |= visit_cf_list(&nif->then_list, preheader, exit);
-         progress |= visit_cf_list(&nif->else_list, preheader, exit);
+         progress |= visit_cf_list(&nif->then_list, state);
+         progress |= visit_cf_list(&nif->else_list, state);
          break;
       }
       case nir_cf_node_loop: {
-         nir_loop *loop = nir_cf_node_as_loop(node);
-         bool opt = should_optimize_loop(loop);
-         nir_block *inner_preheader = opt ? nir_cf_node_cf_tree_prev(node) : preheader;
-         nir_block *inner_exit = opt ? nir_cf_node_cf_tree_next(node) : exit;
-         progress |= visit_cf_list(&loop->body, inner_preheader, inner_exit);
-         progress |= visit_cf_list(&loop->continue_list, inner_preheader, inner_exit);
+         nir_loop *inner_loop = nir_cf_node_as_loop(node);
+         nir_loop *outer_loop = state->loop;
+
+         /* If we don't optimize this loop, we treat it like a block, so we
+          * don't do LICM from it per se, but if this loop is nested inside
+          * another loop that's optimized, we still do LICM from this CF list
+          * for the outer loop.
+          */
+         if (should_optimize_loop(inner_loop))
+            state->loop = inner_loop;
+
+         progress |= visit_cf_list(&inner_loop->body, state);
+         progress |= visit_cf_list(&inner_loop->continue_list, state);
+         state->loop = outer_loop;
          break;
       }
       case nir_cf_node_function:
@@ -125,14 +146,16 @@ visit_cf_list(struct exec_list *list, nir_block *preheader, nir_block *exit)
 bool
 nir_opt_licm(nir_shader *shader)
 {
+   licm_state state = {0};
    bool progress = false;
 
    nir_foreach_function_impl(impl, shader) {
       nir_metadata_require(impl, nir_metadata_block_index |
                                     nir_metadata_dominance);
 
-      bool impl_progress = visit_cf_list(&impl->body, NULL, NULL);
-      progress |= nir_progress(impl_progress, impl,
+      state.loop = NULL;
+
+      progress |= nir_progress(visit_cf_list(&impl->body, &state), impl,
                                nir_metadata_block_index | nir_metadata_dominance);
    }
 

From e9a330b98bfa6ed83e233d9da920f3e1211ea631 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 4 Dec 2025 17:34:38 -0500
Subject: [PATCH 2/9] nir/opt_licm: use nir_metadata_control_flow

---
 src/compiler/nir/nir_opt_licm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_opt_licm.c b/src/compiler/nir/nir_opt_licm.c
index c3236df0d0f..48e69c33257 100644
--- a/src/compiler/nir/nir_opt_licm.c
+++ b/src/compiler/nir/nir_opt_licm.c
@@ -156,7 +156,7 @@ nir_opt_licm(nir_shader *shader)
       state.loop = NULL;
 
       progress |= nir_progress(visit_cf_list(&impl->body, &state), impl,
-                               nir_metadata_block_index | nir_metadata_dominance);
+                               nir_metadata_control_flow);
    }
 
    return progress;

From cf5cb2898bce3ccaabeefec743f48dc09c4986a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 4 Dec 2025 00:21:59 -0500
Subject: [PATCH 3/9] nir/opt_licm: hoist instructions across multiple levels
 of nested loops

radv gfx12:

Totals:
Instrs: 42861311 -> 42861476 (+0.00%); split: -0.00%, +0.00%
CodeSize: 227917476 -> 227918160 (+0.00%); split: -0.00%, +0.00%
Latency: 265381068 -> 265373506 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 42954018 -> 42952350 (-0.00%)
VClause: 819026 -> 819024 (-0.00%)
SClause: 1210348 -> 1210293 (-0.00%)
Copies: 2919525 -> 2919597 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 2889432 -> 2889406 (-0.00%)
VALU: 23757371 -> 23757377 (+0.00%); split: -0.00%, +0.00%
SALU: 5981417 -> 5981485 (+0.00%); split: -0.00%, +0.00%
VOPD: 8966 -> 8964 (-0.02%)
---
 src/compiler/nir/nir_opt_licm.c | 52 +++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/src/compiler/nir/nir_opt_licm.c b/src/compiler/nir/nir_opt_licm.c
index 48e69c33257..a926b9b6342 100644
--- a/src/compiler/nir/nir_opt_licm.c
+++ b/src/compiler/nir/nir_opt_licm.c
@@ -99,6 +99,29 @@ visit_cf_list(struct exec_list *list, licm_state *state)
    foreach_list_typed(nir_cf_node, node, node, list) {
       switch (node->type) {
       case nir_cf_node_block: {
+         nir_cf_node *next = nir_cf_node_next(node);
+         bool optimize_loop = false;
+
+         /* If the next CF node is a loop that we optimize, visit it first
+          * before visiting its predecessor block, so that any instructions
+          * hoisted from this (potentially nested) loop are then considered
+          * for hoisting from the outer loop as well. The goal is to hoist
+          * instructions across all levels of nested loops.
+          */
+         if (next && next->type == nir_cf_node_loop) {
+            nir_loop *inner_loop = nir_cf_node_as_loop(next);
+            optimize_loop = should_optimize_loop(inner_loop);
+
+            if (optimize_loop) {
+               nir_loop *outer_loop = state->loop;
+
+               state->loop = inner_loop;
+               progress |= visit_cf_list(&inner_loop->body, state);
+               progress |= visit_cf_list(&inner_loop->continue_list, state);
+               state->loop = outer_loop;
+            }
+         }
+
          /* By only visiting blocks which dominate the block after the loop,
           * we ensure that we don't speculatively hoist any instructions
           * which otherwise might not be executed.
@@ -110,6 +133,17 @@ visit_cf_list(struct exec_list *list, licm_state *state)
          if (state->loop &&
              nir_block_dominates(block, nir_loop_successor_block(state->loop)))
             progress |= visit_block(block, state);
+
+         if (next && next->type == nir_cf_node_loop && !optimize_loop) {
+            nir_loop *loop = nir_cf_node_as_loop(next);
+
+            /* We treat this loop like any other block, so we don't do LICM
+             * from it per se, but if this loop is nested inside another
+             * loop, we still do LICM for the outer loop.
+             */
+            progress |= visit_cf_list(&loop->body, state);
+            progress |= visit_cf_list(&loop->continue_list, state);
+         }
          break;
       }
       case nir_cf_node_if: {
@@ -118,23 +152,9 @@ visit_cf_list(struct exec_list *list, licm_state *state)
          progress |= visit_cf_list(&nif->else_list, state);
          break;
       }
-      case nir_cf_node_loop: {
-         nir_loop *inner_loop = nir_cf_node_as_loop(node);
-         nir_loop *outer_loop = state->loop;
-
-         /* If we don't optimize this loop, we treat it like a block, so we
-          * don't do LICM from it per se, but if this loop is nested inside
-          * another loop that's optimized, we still do LICM from this CF list
-          * for the outer loop.
-          */
-         if (should_optimize_loop(inner_loop))
-            state->loop = inner_loop;
-
-         progress |= visit_cf_list(&inner_loop->body, state);
-         progress |= visit_cf_list(&inner_loop->continue_list, state);
-         state->loop = outer_loop;
+      case nir_cf_node_loop:
+         /* All loops are handled when handling their predecessor block. */
          break;
-      }
       case nir_cf_node_function:
          UNREACHABLE("NIR LICM: Unsupported cf_node type.");
       }

From 904623bf02960db29e3c3a28df20d0ed79f3e1cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 4 Dec 2025 17:21:37 -0500
Subject: [PATCH 4/9] nir/opt_licm: add filter_block callback for the current
 behavior

The nir_opt_licm_filter_blocks_dominating_exit callback keeps the current
behavior, but newer users won't use it.
---
 src/amd/vulkan/radv_pipeline.c  |  2 +-
 src/compiler/nir/nir.h          |  7 ++++++-
 src/compiler/nir/nir_opt_licm.c | 29 +++++++++++++++++++----------
 src/imagination/pco/pco_nir.c   |  2 +-
 4 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 979b94c53be..6e697b55b36 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -368,7 +368,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
    nir_move_options sink_opts = nir_move_const_undef | nir_move_copies | nir_dont_move_byte_word_vecs;
 
    if (!stage->key.optimisations_disabled) {
-      NIR_PASS(_, stage->nir, nir_opt_licm);
+      NIR_PASS(_, stage->nir, nir_opt_licm, nir_opt_licm_filter_blocks_dominating_exit);
 
       if (stage->stage == MESA_SHADER_VERTEX) {
          /* Always load all VS inputs at the top to eliminate needless VMEM->s_wait->VMEM sequences.
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 88e61b72664..52b9d98de72 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -6412,7 +6412,12 @@ bool nir_opt_large_constants(nir_shader *shader,
                              glsl_type_size_align_func size_align,
                              unsigned threshold);
 
-bool nir_opt_licm(nir_shader *shader);
+typedef bool (*nir_opt_licm_filter_block_cb)(nir_block *block, nir_loop *loop);
+
+bool nir_opt_licm_filter_blocks_dominating_exit(nir_block *block,
+                                                nir_loop *loop);
+bool nir_opt_licm(nir_shader *shader,
+                  nir_opt_licm_filter_block_cb filter_block);
 bool nir_opt_loop(nir_shader *shader);
 
 bool nir_opt_loop_unroll(nir_shader *shader);
diff --git a/src/compiler/nir/nir_opt_licm.c b/src/compiler/nir/nir_opt_licm.c
index a926b9b6342..625189029b1 100644
--- a/src/compiler/nir/nir_opt_licm.c
+++ b/src/compiler/nir/nir_opt_licm.c
@@ -6,6 +6,8 @@
 #include "nir.h"
 
 typedef struct {
+   nir_opt_licm_filter_block_cb filter_block;
+
    nir_loop *loop;
 } licm_state;
 
@@ -122,16 +124,10 @@ visit_cf_list(struct exec_list *list, licm_state *state)
             }
          }
 
-         /* By only visiting blocks which dominate the block after the loop,
-          * we ensure that we don't speculatively hoist any instructions
-          * which otherwise might not be executed.
-          *
-          * Note, that the proper check would be whether this block
-          * postdominates the block before the loop.
-          */
+         /* Visit the block. */
          nir_block *block = nir_cf_node_as_block(node);
          if (state->loop &&
-             nir_block_dominates(block, nir_loop_successor_block(state->loop)))
+             (!state->filter_block || state->filter_block(block, state->loop)))
             progress |= visit_block(block, state);
 
          if (next && next->type == nir_cf_node_loop && !optimize_loop) {
@@ -164,9 +160,22 @@ visit_cf_list(struct exec_list *list, licm_state *state)
 }
 
 bool
-nir_opt_licm(nir_shader *shader)
+nir_opt_licm_filter_blocks_dominating_exit(nir_block *block, nir_loop *loop)
 {
-   licm_state state = {0};
+   /* TODO: This is deprecated because nir_instr_can_speculate is a better way
+    * to handle speculative execution.
+    *
+    * By only visiting blocks which dominate the block after the loop,
+    * we ensure that we don't speculatively hoist any instructions
+    * which otherwise might not be executed.
+    */
+   return nir_block_dominates(block, nir_loop_successor_block(loop));
+}
+
+bool
+nir_opt_licm(nir_shader *shader, nir_opt_licm_filter_block_cb filter_block)
+{
+   licm_state state = {filter_block};
    bool progress = false;
 
    nir_foreach_function_impl(impl, shader) {
diff --git a/src/imagination/pco/pco_nir.c b/src/imagination/pco/pco_nir.c
index 3b9ce60196b..9b81309edfc 100644
--- a/src/imagination/pco/pco_nir.c
+++ b/src/imagination/pco/pco_nir.c
@@ -808,7 +808,7 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data)
 
    NIR_PASS(_, nir, nir_lower_memory_model);
 
-   NIR_PASS(_, nir, nir_opt_licm);
+   NIR_PASS(_, nir, nir_opt_licm, nir_opt_licm_filter_blocks_dominating_exit);
 
    NIR_PASS(_, nir, nir_lower_memcpy);
 

From e6838e106a75c7b690ca371aabbd65282a7e3372 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 4 Dec 2025 17:37:27 -0500
Subject: [PATCH 5/9] nir/opt_licm: use nir_instr_can_speculate

---
 src/compiler/nir/nir_opt_licm.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/compiler/nir/nir_opt_licm.c b/src/compiler/nir/nir_opt_licm.c
index 625189029b1..4c204fd5f28 100644
--- a/src/compiler/nir/nir_opt_licm.c
+++ b/src/compiler/nir/nir_opt_licm.c
@@ -9,6 +9,7 @@ typedef struct {
    nir_opt_licm_filter_block_cb filter_block;
 
    nir_loop *loop;
+   bool current_block_dominates_exit;
 } licm_state;
 
 static bool
@@ -31,15 +32,28 @@ is_instr_loop_invariant(nir_instr *instr, licm_state *state)
    case nir_instr_type_undef:
       return true;
 
-   case nir_instr_type_intrinsic:
-      if (!nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
-         return false;
-      FALLTHROUGH;
-
    case nir_instr_type_alu:
    case nir_instr_type_tex:
    case nir_instr_type_deref:
-      return nir_foreach_src(instr, defined_before_loop, state);
+   case nir_instr_type_intrinsic: {
+      /* An instruction can be hoisted if it either dominates the exit (i.e.
+       * it always executes) and is reorderable, or is speculatable.
+       */
+      if (state->current_block_dominates_exit) {
+         if (instr->type == nir_instr_type_intrinsic &&
+             !nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
+            return false;
+      } else {
+         if (!nir_instr_can_speculate(instr))
+            return false;
+      }
+
+      bool invariant = nir_foreach_src(instr, defined_before_loop, state);
+      if (!invariant)
+         return false;
+
+      return true;
+   }
 
    case nir_instr_type_phi:
    case nir_instr_type_call:
@@ -52,7 +66,8 @@ is_instr_loop_invariant(nir_instr *instr, licm_state *state)
 static bool
 visit_block(nir_block *block, licm_state *state)
 {
-   assert(state->loop);
+   state->current_block_dominates_exit =
+      nir_block_dominates(block, nir_loop_successor_block(state->loop));
 
    bool progress = false;
    nir_foreach_instr_safe(instr, block) {

From 75b9ba74efa97aa717af26d883340ea2dee70665 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 4 Dec 2025 17:38:56 -0500
Subject: [PATCH 6/9] nir/opt_licm: add filter_instr callback

The extra parameters are for a heuristic to control register usage,
e.g. if num_dst_bits <= num_all_src_bits, register usage likely doesn't
increase.
---
 src/amd/vulkan/radv_pipeline.c  |  2 +-
 src/compiler/nir/nir.h          |  8 +++++++-
 src/compiler/nir/nir_opt_licm.c | 20 ++++++++++++++++++--
 src/imagination/pco/pco_nir.c   |  2 +-
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 6e697b55b36..96ddaa986e0 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -368,7 +368,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
    nir_move_options sink_opts = nir_move_const_undef | nir_move_copies | nir_dont_move_byte_word_vecs;
 
    if (!stage->key.optimisations_disabled) {
-      NIR_PASS(_, stage->nir, nir_opt_licm, nir_opt_licm_filter_blocks_dominating_exit);
+      NIR_PASS(_, stage->nir, nir_opt_licm, nir_opt_licm_filter_blocks_dominating_exit, NULL);
 
       if (stage->stage == MESA_SHADER_VERTEX) {
          /* Always load all VS inputs at the top to eliminate needless VMEM->s_wait->VMEM sequences.
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 52b9d98de72..98ea6ca4b09 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -6413,11 +6413,17 @@ bool nir_opt_large_constants(nir_shader *shader,
                              unsigned threshold);
 
 typedef bool (*nir_opt_licm_filter_block_cb)(nir_block *block, nir_loop *loop);
+typedef bool (*nir_opt_licm_filter_instr_cb)(nir_instr *instr,
+                                             bool instr_dominates_exit,
+                                             unsigned num_dst_bits,
+                                             unsigned num_all_src_bits,
+                                             nir_loop *loop);
 
 bool nir_opt_licm_filter_blocks_dominating_exit(nir_block *block,
                                                 nir_loop *loop);
 bool nir_opt_licm(nir_shader *shader,
-                  nir_opt_licm_filter_block_cb filter_block);
+                  nir_opt_licm_filter_block_cb filter_block,
+                  nir_opt_licm_filter_instr_cb filter_instr);
 bool nir_opt_loop(nir_shader *shader);
 
 bool nir_opt_loop_unroll(nir_shader *shader);
diff --git a/src/compiler/nir/nir_opt_licm.c b/src/compiler/nir/nir_opt_licm.c
index 4c204fd5f28..758847ee6f6 100644
--- a/src/compiler/nir/nir_opt_licm.c
+++ b/src/compiler/nir/nir_opt_licm.c
@@ -7,9 +7,11 @@
 
 typedef struct {
    nir_opt_licm_filter_block_cb filter_block;
+   nir_opt_licm_filter_instr_cb filter_instr;
 
    nir_loop *loop;
    bool current_block_dominates_exit;
+   unsigned num_all_src_bits;
 } licm_state;
 
 static bool
@@ -17,6 +19,8 @@ defined_before_loop(nir_src *src, void *_state)
 {
    licm_state *state = (licm_state *)_state;
 
+   state->num_all_src_bits += src->ssa->bit_size * src->ssa->num_components;
+
    /* The current instruction is loop-invariant only if its sources are before
     * the loop.
     */
@@ -48,10 +52,21 @@ is_instr_loop_invariant(nir_instr *instr, licm_state *state)
             return false;
       }
 
+      state->num_all_src_bits = 0;
+
       bool invariant = nir_foreach_src(instr, defined_before_loop, state);
       if (!invariant)
          return false;
 
+      if (state->filter_instr) {
+         nir_def *def = nir_instr_def(instr);
+
+         if (!state->filter_instr(instr, state->current_block_dominates_exit,
+                                  def->bit_size * def->num_components,
+                                  state->num_all_src_bits, state->loop))
+            return false;
+      }
+
       return true;
    }
 
@@ -188,9 +203,10 @@ nir_opt_licm_filter_blocks_dominating_exit(nir_block *block, nir_loop *loop)
 }
 
 bool
-nir_opt_licm(nir_shader *shader, nir_opt_licm_filter_block_cb filter_block)
+nir_opt_licm(nir_shader *shader, nir_opt_licm_filter_block_cb filter_block,
+             nir_opt_licm_filter_instr_cb filter_instr)
 {
-   licm_state state = {filter_block};
+   licm_state state = {filter_block, filter_instr};
    bool progress = false;
 
    nir_foreach_function_impl(impl, shader) {
diff --git a/src/imagination/pco/pco_nir.c b/src/imagination/pco/pco_nir.c
index 9b81309edfc..46016319423 100644
--- a/src/imagination/pco/pco_nir.c
+++ b/src/imagination/pco/pco_nir.c
@@ -808,7 +808,7 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data)
 
    NIR_PASS(_, nir, nir_lower_memory_model);
 
-   NIR_PASS(_, nir, nir_opt_licm, nir_opt_licm_filter_blocks_dominating_exit);
+   NIR_PASS(_, nir, nir_opt_licm, nir_opt_licm_filter_blocks_dominating_exit, NULL);
 
    NIR_PASS(_, nir, nir_lower_memcpy);
 

From 1b48e814228c6200ad086e0aea7a5aba42417642 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 4 Dec 2025 16:46:12 -0500
Subject: [PATCH 7/9] nir/tests: add nir_opt_licm tests

---
 src/compiler/nir/meson.build              |   1 +
 src/compiler/nir/tests/opt_licm_tests.cpp | 231 ++++++++++++++++++++++
 2 files changed, 232 insertions(+)
 create mode 100644 src/compiler/nir/tests/opt_licm_tests.cpp

diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index 49c28177fe7..61abe2fedd5 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -425,6 +425,7 @@ if with_tests
         'tests/mod_analysis_tests.cpp',
         'tests/negative_equal_tests.cpp',
         'tests/opt_if_tests.cpp',
+        'tests/opt_licm_tests.cpp',
         'tests/opt_loop_tests.cpp',
         'tests/opt_peephole_select.cpp',
         'tests/opt_shrink_vectors_tests.cpp',
diff --git a/src/compiler/nir/tests/opt_licm_tests.cpp b/src/compiler/nir/tests/opt_licm_tests.cpp
new file mode 100644
index 00000000000..f669f48dd8d
--- /dev/null
+++ b/src/compiler/nir/tests/opt_licm_tests.cpp
@@ -0,0 +1,231 @@
+/* Copyright 2025 Advanced Micro Devices, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "nir_test.h"
+
+class nir_opt_licm_test : public nir_test {
+protected:
+   nir_opt_licm_test()
+      : nir_test::nir_test("nir_opt_licm_test", MESA_SHADER_COMPUTE)
+   {
+   }
+
+   nir_loop *loop;
+   nir_block *original_block;
+   nir_def *x, *y, *z, *r;
+   bool expect_progress;
+   bool insert_after_break;
+
+   void test_init();
+   void test_finish(nir_opt_licm_filter_instr_cb filter_instr);
+};
+
+void
+nir_opt_licm_test::test_init()
+{
+   x = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
+   y = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
+   z = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
+
+   loop = nir_push_loop(b);
+   if (insert_after_break)
+      nir_break_if(b, nir_undef(b, 1, 1));
+   original_block = nir_loop_last_block(loop);
+}
+
+static bool
+filter_using_dst_src_bits(nir_instr *instr, bool instr_dominates_exit,
+                          unsigned num_dst_bits, unsigned num_all_src_bits,
+                          nir_loop *loop)
+{
+   return num_dst_bits <= num_all_src_bits;
+}
+
+void
+nir_opt_licm_test::test_finish(nir_opt_licm_filter_instr_cb filter_instr)
+{
+   if (!insert_after_break)
+      nir_break_if(b, nir_undef(b, 1, 1));
+   nir_pop_loop(b, loop);
+   nir_validate_shader(b->shader, NULL);
+
+   bool progress = false;
+   NIR_PASS(progress, b->shader, nir_opt_licm, NULL, filter_instr);
+
+   if (expect_progress) {
+      ASSERT_TRUE(progress);
+      ASSERT_EQ(nir_def_instr(r)->block, nir_loop_predecessor_block(loop));
+   } else {
+      ASSERT_FALSE(progress);
+      ASSERT_EQ(nir_def_instr(r)->block, original_block);
+   }
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_unary)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_ineg(b, x);
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_binary)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_iadd(b, x, y);
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_alu_u2u64)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_u2u64(b, x);
+
+   /* If sizeof(dst) > sizeof(all srcs), the default behavior is not to hoist
+    * because that would increase register usage of the whole loop.
+    */
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_load_ssbo_no_flags_before_break)
+{
+   this->insert_after_break = false;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_load_ssbo_reorderable_before_break)
+{
+   this->insert_after_break = false;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_REORDER));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_load_ssbo_reorderable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_REORDER));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_load_ssbo_speculatable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_SPECULATE));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_load_ssbo_reorderable_speculatable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_REORDER |
+                                                  ACCESS_CAN_SPECULATE));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_2_nested_loops)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_loop *nested_loop = nir_push_loop(b);
+   {
+      nir_break_if(b, nir_undef(b, 1, 1));
+      r = nir_ineg(b, x);
+   }
+   nir_pop_loop(b, nested_loop);
+
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_6_nested_loops)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_loop *nested_loops[5];
+
+   for (unsigned i = 0; i < ARRAY_SIZE(nested_loops); i++) {
+      nested_loops[i] = nir_push_loop(b);
+      nir_break_if(b, nir_undef(b, 1, 1));
+   }
+
+   r = nir_ineg(b, x);
+
+   for (int i = ARRAY_SIZE(nested_loops) - 1; i >= 0; i--)
+      nir_pop_loop(b, nested_loops[i]);
+
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_tex)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+
+   nir_tex_builder fields = {0};
+   fields.coord = x;
+   fields.texture_handle = y;
+   fields.dest_type = nir_type_uint32;
+
+   r = nir_build_tex_struct(b, nir_texop_tex, fields);
+   this->test_finish(NULL);
+}
+
+TEST_F(nir_opt_licm_test, hoist_tex_before_break)
+{
+   this->insert_after_break = false;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_tex_builder fields = {0};
+   fields.coord = x;
+   fields.texture_handle = y;
+   fields.dest_type = nir_type_uint32;
+
+   r = nir_build_tex_struct(b, nir_texop_tex, fields);
+   this->test_finish(NULL);
+}
+
+TEST_F(nir_opt_licm_test, hoist_tex_speculatable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_tex_builder fields = {0};
+   fields.coord = x;
+   fields.texture_handle = y;
+   fields.can_speculate = true;
+   fields.dest_type = nir_type_uint32;
+
+   r = nir_build_tex_struct(b, nir_texop_tex, fields);
+   this->test_finish(NULL);
+}

From ce2383c6dfb438101ee9b5bfa093b7d661d06a33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 4 Dec 2025 20:45:07 -0500
Subject: [PATCH 8/9] radv: enable hoisting speculatable instructions

We could hoist more at the cost of more spilling, or we need new passes
and utilities to control and reduce spilling.

gfx12:

Totals:
MaxWaves: 2385110 -> 2385088 (-0.00%); split: +0.00%, -0.00%
Instrs: 42861476 -> 42853162 (-0.02%); split: -0.03%, +0.01%
CodeSize: 227918160 -> 227864192 (-0.02%); split: -0.03%, +0.01%
VGPRs: 3519596 -> 3519704 (+0.00%); split: -0.00%, +0.00%
SpillSGPRs: 10191 -> 10475 (+2.79%); split: -0.28%, +3.07%
SpillVGPRs: 1807 -> 1824 (+0.94%)
Scratch: 7154432 -> 7156992 (+0.04%)
Latency: 265373506 -> 265370350 (-0.00%); split: -0.03%, +0.02%
InvThroughput: 42952350 -> 42938844 (-0.03%); split: -0.05%, +0.02%
VClause: 819024 -> 819050 (+0.00%); split: -0.00%, +0.01%
SClause: 1210293 -> 1210321 (+0.00%); split: -0.00%, +0.00%
Copies: 2919597 -> 2913239 (-0.22%); split: -0.32%, +0.11%
Branches: 824395 -> 824537 (+0.02%); split: -0.00%, +0.02%
PreSGPRs: 2889406 -> 2893773 (+0.15%); split: -0.00%, +0.15%
PreVGPRs: 2312710 -> 2313112 (+0.02%); split: -0.00%, +0.02%
VALU: 23757377 -> 23757583 (+0.00%); split: -0.00%, +0.00%
SALU: 5981485 -> 5974039 (-0.12%); split: -0.17%, +0.04%
VMEM: 1590721 -> 1590755 (+0.00%)
---
 src/amd/common/nir/ac_nir.c    | 14 ++++++++++++++
 src/amd/common/nir/ac_nir.h    |  4 ++++
 src/amd/vulkan/radv_pipeline.c |  2 +-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/nir/ac_nir.c b/src/amd/common/nir/ac_nir.c
index 51c3e2af69b..fd2eca4dfc9 100644
--- a/src/amd/common/nir/ac_nir.c
+++ b/src/amd/common/nir/ac_nir.c
@@ -1049,3 +1049,17 @@ ac_nir_opt_vectorize_cb(const nir_instr *instr, const void *data)
 
    return target_width;
 }
+
+bool
+ac_nir_opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit, unsigned num_dst_bits,
+                                unsigned num_all_src_bits, nir_loop *loop)
+{
+   /* This heuristic reduces spilling. Note that while this only seems to apply to ALU, any ALU
+    * that's hoisted potentially enables hoisting intrinsics using it, so this really affects
+    * all instructions.
+    */
+   if (!instr_dominates_exit && instr->type == nir_instr_type_alu)
+      return num_dst_bits + 64 < num_all_src_bits;
+
+   return true;
+}
diff --git a/src/amd/common/nir/ac_nir.h b/src/amd/common/nir/ac_nir.h
index 6f6dd3105ff..4b397409723 100644
--- a/src/amd/common/nir/ac_nir.h
+++ b/src/amd/common/nir/ac_nir.h
@@ -447,6 +447,10 @@ ac_nir_allow_offset_wrap_cb(nir_intrinsic_instr *instr, const void *data);
 bool
 ac_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu);
 
+bool
+ac_nir_opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit, unsigned num_dst_bits,
+                                unsigned num_all_src_bits, nir_loop *loop);
+
 uint8_t
 ac_nir_opt_vectorize_cb(const nir_instr *instr, const void *data);
 
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 96ddaa986e0..d7cee0649b7 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -368,7 +368,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
    nir_move_options sink_opts = nir_move_const_undef | nir_move_copies | nir_dont_move_byte_word_vecs;
 
    if (!stage->key.optimisations_disabled) {
-      NIR_PASS(_, stage->nir, nir_opt_licm, nir_opt_licm_filter_blocks_dominating_exit, NULL);
+      NIR_PASS(_, stage->nir, nir_opt_licm, NULL, ac_nir_opt_licm_filter_instr_cb);
 
       if (stage->stage == MESA_SHADER_VERTEX) {
          /* Always load all VS inputs at the top to eliminate needless VMEM->s_wait->VMEM sequences.

From c34d84852b59a9b17a12c987dbac5abb1f97343d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 5 Dec 2025 12:24:09 -0500
Subject: [PATCH 9/9] nir/opt_licm: remove the block_filter callback

For PCO, the instr_filter callback can do the same thing.
---
 src/amd/vulkan/radv_pipeline.c            |  2 +-
 src/compiler/nir/nir.h                    |  4 ----
 src/compiler/nir/nir_opt_licm.c           | 22 +++-------------------
 src/compiler/nir/tests/opt_licm_tests.cpp |  2 +-
 src/imagination/pco/pco_nir.c             | 10 +++++++++-
 5 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index d7cee0649b7..6373bf4dbda 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -368,7 +368,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
    nir_move_options sink_opts = nir_move_const_undef | nir_move_copies | nir_dont_move_byte_word_vecs;
 
    if (!stage->key.optimisations_disabled) {
-      NIR_PASS(_, stage->nir, nir_opt_licm, NULL, ac_nir_opt_licm_filter_instr_cb);
+      NIR_PASS(_, stage->nir, nir_opt_licm, ac_nir_opt_licm_filter_instr_cb);
 
       if (stage->stage == MESA_SHADER_VERTEX) {
          /* Always load all VS inputs at the top to eliminate needless VMEM->s_wait->VMEM sequences.
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 98ea6ca4b09..6c2791b592e 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -6412,17 +6412,13 @@ bool nir_opt_large_constants(nir_shader *shader,
                              glsl_type_size_align_func size_align,
                              unsigned threshold);
 
-typedef bool (*nir_opt_licm_filter_block_cb)(nir_block *block, nir_loop *loop);
 typedef bool (*nir_opt_licm_filter_instr_cb)(nir_instr *instr,
                                              bool instr_dominates_exit,
                                              unsigned num_dst_bits,
                                              unsigned num_all_src_bits,
                                              nir_loop *loop);
 
-bool nir_opt_licm_filter_blocks_dominating_exit(nir_block *block,
-                                                nir_loop *loop);
 bool nir_opt_licm(nir_shader *shader,
-                  nir_opt_licm_filter_block_cb filter_block,
                   nir_opt_licm_filter_instr_cb filter_instr);
 bool nir_opt_loop(nir_shader *shader);
 
diff --git a/src/compiler/nir/nir_opt_licm.c b/src/compiler/nir/nir_opt_licm.c
index 758847ee6f6..0b5c2d35ce9 100644
--- a/src/compiler/nir/nir_opt_licm.c
+++ b/src/compiler/nir/nir_opt_licm.c
@@ -6,7 +6,6 @@
 #include "nir.h"
 
 typedef struct {
-   nir_opt_licm_filter_block_cb filter_block;
    nir_opt_licm_filter_instr_cb filter_instr;
 
    nir_loop *loop;
@@ -156,8 +155,7 @@ visit_cf_list(struct exec_list *list, licm_state *state)
 
          /* Visit the block. */
          nir_block *block = nir_cf_node_as_block(node);
-         if (state->loop &&
-             (!state->filter_block || state->filter_block(block, state->loop)))
+         if (state->loop)
             progress |= visit_block(block, state);
 
          if (next && next->type == nir_cf_node_loop && !optimize_loop) {
@@ -190,23 +188,9 @@ visit_cf_list(struct exec_list *list, licm_state *state)
 }
 
 bool
-nir_opt_licm_filter_blocks_dominating_exit(nir_block *block, nir_loop *loop)
+nir_opt_licm(nir_shader *shader, nir_opt_licm_filter_instr_cb filter_instr)
 {
-   /* TODO: This is deprecated because nir_instr_can_speculate is a better way
-    * to handle speculative execution.
-    *
-    * By only visiting blocks which dominate the block after the loop,
-    * we ensure that we don't speculatively hoist any instructions
-    * which otherwise might not be executed.
-    */
-   return nir_block_dominates(block, nir_loop_successor_block(loop));
-}
-
-bool
-nir_opt_licm(nir_shader *shader, nir_opt_licm_filter_block_cb filter_block,
-             nir_opt_licm_filter_instr_cb filter_instr)
-{
-   licm_state state = {filter_block, filter_instr};
+   licm_state state = {filter_instr};
    bool progress = false;
 
    nir_foreach_function_impl(impl, shader) {
diff --git a/src/compiler/nir/tests/opt_licm_tests.cpp b/src/compiler/nir/tests/opt_licm_tests.cpp
index f669f48dd8d..f8b70394f3a 100644
--- a/src/compiler/nir/tests/opt_licm_tests.cpp
+++ b/src/compiler/nir/tests/opt_licm_tests.cpp
@@ -51,7 +51,7 @@ nir_opt_licm_test::test_finish(nir_opt_licm_filter_instr_cb filter_instr)
    nir_validate_shader(b->shader, NULL);
 
    bool progress = false;
-   NIR_PASS(progress, b->shader, nir_opt_licm, NULL, filter_instr);
+   NIR_PASS(progress, b->shader, nir_opt_licm, filter_instr);
 
    if (expect_progress) {
       ASSERT_TRUE(progress);
diff --git a/src/imagination/pco/pco_nir.c b/src/imagination/pco/pco_nir.c
index 46016319423..3ccd9d1adf7 100644
--- a/src/imagination/pco/pco_nir.c
+++ b/src/imagination/pco/pco_nir.c
@@ -788,6 +788,14 @@ static bool robustness_filter(const nir_intrinsic_instr *intr,
    return false;
 }
 
+static bool
+opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit,
+                         unsigned num_dst_bits, unsigned num_all_src_bits,
+                         nir_loop *loop)
+{
+   return instr_dominates_exit;
+}
+
 /**
  * \brief Lowers a NIR shader.
  *
@@ -808,7 +816,7 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data)
 
    NIR_PASS(_, nir, nir_lower_memory_model);
 
-   NIR_PASS(_, nir, nir_opt_licm, nir_opt_licm_filter_blocks_dominating_exit, NULL);
+   NIR_PASS(_, nir, nir_opt_licm, opt_licm_filter_instr_cb);
 
    NIR_PASS(_, nir, nir_lower_memcpy);