diff --git a/src/amd/common/nir/ac_nir.c b/src/amd/common/nir/ac_nir.c
index b652dfcf996..e3b460647fe 100644
--- a/src/amd/common/nir/ac_nir.c
+++ b/src/amd/common/nir/ac_nir.c
@@ -1050,3 +1050,17 @@ ac_nir_opt_vectorize_cb(const nir_instr *instr, const void *data)
 
    return target_width;
 }
+
+bool
+ac_nir_opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit, unsigned num_dst_bits,
+                                unsigned num_all_src_bits, nir_loop *loop)
+{
+   /* This heuristic reduces spilling. While the check itself only applies to
+    * ALU, any hoisted ALU potentially enables hoisting the intrinsics that
+    * use it, so it effectively affects all instruction types.
+    */
+   if (!instr_dominates_exit && instr->type == nir_instr_type_alu)
+      return num_dst_bits + 64 < num_all_src_bits;
+
+   return true;
+}
diff --git a/src/amd/common/nir/ac_nir.h b/src/amd/common/nir/ac_nir.h
index 2ab3e10a1f5..223349a713f 100644
--- a/src/amd/common/nir/ac_nir.h
+++ b/src/amd/common/nir/ac_nir.h
@@ -447,6 +447,10 @@ ac_nir_allow_offset_wrap_cb(nir_intrinsic_instr *instr, const void *data);
 
 bool
 ac_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu);
 
+bool
+ac_nir_opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit, unsigned num_dst_bits,
+                                unsigned num_all_src_bits, nir_loop *loop);
+
 uint8_t
 ac_nir_opt_vectorize_cb(const nir_instr *instr, const void *data);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 3f39e28e01e..00ece257739 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -368,7 +368,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
    nir_move_options sink_opts = nir_move_const_undef | nir_move_copies | nir_dont_move_byte_word_vecs;
 
    if (!stage->key.optimisations_disabled) {
-      NIR_PASS(_, stage->nir, nir_opt_licm);
+      NIR_PASS(_, stage->nir, nir_opt_licm, ac_nir_opt_licm_filter_instr_cb);
 
       if (stage->stage == MESA_SHADER_VERTEX) {
          /* Always load all VS inputs at the top to eliminate needless VMEM->s_wait->VMEM sequences.
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index e13cd1df407..7c2b89ec6ea 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -425,6 +425,7 @@ if with_tests
     'tests/mod_analysis_tests.cpp',
     'tests/negative_equal_tests.cpp',
     'tests/opt_if_tests.cpp',
+    'tests/opt_licm_tests.cpp',
     'tests/opt_loop_tests.cpp',
     'tests/opt_peephole_select.cpp',
     'tests/opt_shrink_vectors_tests.cpp',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 7209d28e853..647452432ae 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4746,6 +4746,18 @@ nir_block *nir_cf_node_cf_tree_prev(nir_cf_node *node);
         block != nir_cf_node_cf_tree_prev(node);                            \
         block = prev, prev = nir_block_cf_tree_prev(block))
 
+static inline nir_block *
+nir_loop_predecessor_block(nir_loop *loop)
+{
+   return nir_cf_node_cf_tree_prev(&loop->cf_node);
+}
+
+static inline nir_block *
+nir_loop_successor_block(nir_loop *loop)
+{
+   return nir_cf_node_cf_tree_next(&loop->cf_node);
+}
+
 /* If the following CF node is an if, this function returns that if.
  * Otherwise, it returns NULL.
  */
@@ -6458,7 +6470,14 @@
 bool nir_opt_large_constants(nir_shader *shader, glsl_type_size_align_func size_align, unsigned threshold);
 
-bool nir_opt_licm(nir_shader *shader);
+typedef bool (*nir_opt_licm_filter_instr_cb)(nir_instr *instr,
+                                             bool instr_dominates_exit,
+                                             unsigned num_dst_bits,
+                                             unsigned num_all_src_bits,
+                                             nir_loop *loop);
+
+bool nir_opt_licm(nir_shader *shader,
+                  nir_opt_licm_filter_instr_cb filter_instr);
 
 bool nir_opt_loop(nir_shader *shader);
 
 bool nir_opt_loop_unroll(nir_shader *shader);
diff --git a/src/compiler/nir/nir_opt_licm.c b/src/compiler/nir/nir_opt_licm.c
index 8ede208b4cc..0b5c2d35ce9 100644
--- a/src/compiler/nir/nir_opt_licm.c
+++ b/src/compiler/nir/nir_opt_licm.c
@@ -5,30 +5,69 @@
 
 #include "nir.h"
 
+typedef struct {
+   nir_opt_licm_filter_instr_cb filter_instr;
+
+   nir_loop *loop;
+   bool current_block_dominates_exit;
+   unsigned num_all_src_bits;
+} licm_state;
+
 static bool
-defined_before_loop(nir_src *src, void *state)
+defined_before_loop(nir_src *src, void *_state)
 {
-   unsigned *loop_preheader_idx = state;
-   return nir_def_block(src->ssa)->index <= *loop_preheader_idx;
+   licm_state *state = (licm_state *)_state;
+
+   state->num_all_src_bits += src->ssa->bit_size * src->ssa->num_components;
+
+   /* The current instruction is loop-invariant only if all of its sources
+    * are defined before the loop.
+    */
+   return nir_def_block(src->ssa)->index <=
+          nir_loop_predecessor_block(state->loop)->index;
 }
 
 static bool
-is_instr_loop_invariant(nir_instr *instr, unsigned loop_preheader_idx)
+is_instr_loop_invariant(nir_instr *instr, licm_state *state)
 {
    switch (instr->type) {
    case nir_instr_type_load_const:
    case nir_instr_type_undef:
       return true;
 
-   case nir_instr_type_intrinsic:
-      if (!nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
-         return false;
-      FALLTHROUGH;
    case nir_instr_type_alu:
    case nir_instr_type_tex:
    case nir_instr_type_deref:
-      return nir_foreach_src(instr, defined_before_loop, &loop_preheader_idx);
+   case nir_instr_type_intrinsic: {
+      /* An instruction can be hoisted if it dominates the loop exit (i.e.
+       * it is always executed) and is reorderable, or if it is speculatable.
+       */
+      if (state->current_block_dominates_exit) {
+         if (instr->type == nir_instr_type_intrinsic &&
+             !nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
+            return false;
+      } else {
+         if (!nir_instr_can_speculate(instr))
+            return false;
+      }
+
+      state->num_all_src_bits = 0;
+
+      bool invariant = nir_foreach_src(instr, defined_before_loop, state);
+      if (!invariant)
+         return false;
+
+      if (state->filter_instr) {
+         nir_def *def = nir_instr_def(instr);
+
+         if (!state->filter_instr(instr, state->current_block_dominates_exit,
+                                  def->bit_size * def->num_components,
+                                  state->num_all_src_bits, state->loop))
+            return false;
+      }
+
+      return true;
+   }
 
    case nir_instr_type_phi:
    case nir_instr_type_call:
@@ -39,13 +78,17 @@ is_instr_loop_invariant(nir_instr *instr, unsigned loop_preheader_idx)
 }
 
 static bool
-visit_block(nir_block *block, nir_block *preheader)
+visit_block(nir_block *block, licm_state *state)
 {
+   state->current_block_dominates_exit =
+      nir_block_dominates(block, nir_loop_successor_block(state->loop));
+
    bool progress = false;
    nir_foreach_instr_safe(instr, block) {
-      if (is_instr_loop_invariant(instr, preheader->index)) {
+      if (is_instr_loop_invariant(instr, state)) {
          nir_instr_remove(instr);
-         nir_instr_insert_after_block(preheader, instr);
+         nir_instr_insert_after_block(nir_loop_predecessor_block(state->loop),
+                                      instr);
          progress = true;
       }
    }
@@ -80,40 +123,62 @@ should_optimize_loop(nir_loop *loop)
 }
 
 static bool
-visit_cf_list(struct exec_list *list, nir_block *preheader, nir_block *exit)
+visit_cf_list(struct exec_list *list, licm_state *state)
 {
    bool progress = false;
 
    foreach_list_typed(nir_cf_node, node, node, list) {
       switch (node->type) {
       case nir_cf_node_block: {
-         /* By only visiting blocks which dominate the loop exit, we
-          * ensure that we don't speculatively hoist any instructions
-          * which otherwise might not be executed.
-          *
-          * Note, that the proper check would be whether this block
-          * postdominates the loop preheader.
+         nir_cf_node *next = nir_cf_node_next(node);
+         bool optimize_loop = false;
+
+         /* If the next CF node is a loop that we optimize, visit that loop
+          * first, before this block (its predecessor), so that any
+          * instructions hoisted out of the (potentially nested) loop are
+          * then considered for hoisting out of the outer loop as well. The
+          * goal is to hoist instructions across all levels of nested loops.
          */
+         if (next && next->type == nir_cf_node_loop) {
+            nir_loop *inner_loop = nir_cf_node_as_loop(next);
+            optimize_loop = should_optimize_loop(inner_loop);
+
+            if (optimize_loop) {
+               nir_loop *outer_loop = state->loop;
+
+               state->loop = inner_loop;
+               progress |= visit_cf_list(&inner_loop->body, state);
+               progress |= visit_cf_list(&inner_loop->continue_list, state);
+               state->loop = outer_loop;
+            }
+         }
+
+         /* Visit the block. */
          nir_block *block = nir_cf_node_as_block(node);
-         if (exit && nir_block_dominates(block, exit))
-            progress |= visit_block(block, preheader);
+         if (state->loop)
+            progress |= visit_block(block, state);
+
+         if (next && next->type == nir_cf_node_loop && !optimize_loop) {
+            nir_loop *loop = nir_cf_node_as_loop(next);
+
+            /* We don't hoist anything out of this loop itself, but if it
+             * is nested inside another loop that we are optimizing, its
+             * instructions can still be hoisted out of that outer loop.
+             */
+            progress |= visit_cf_list(&loop->body, state);
+            progress |= visit_cf_list(&loop->continue_list, state);
+         }
          break;
       }
 
       case nir_cf_node_if: {
          nir_if *nif = nir_cf_node_as_if(node);
-         progress |= visit_cf_list(&nif->then_list, preheader, exit);
-         progress |= visit_cf_list(&nif->else_list, preheader, exit);
+         progress |= visit_cf_list(&nif->then_list, state);
+         progress |= visit_cf_list(&nif->else_list, state);
          break;
       }
 
-      case nir_cf_node_loop: {
-         nir_loop *loop = nir_cf_node_as_loop(node);
-         bool opt = should_optimize_loop(loop);
-         nir_block *inner_preheader = opt ? nir_cf_node_cf_tree_prev(node) : preheader;
-         nir_block *inner_exit = opt ? nir_cf_node_cf_tree_next(node) : exit;
-         progress |= visit_cf_list(&loop->body, inner_preheader, inner_exit);
-         progress |= visit_cf_list(&loop->continue_list, inner_preheader, inner_exit);
+      case nir_cf_node_loop:
+         /* All loops are handled while visiting their predecessor block. */
          break;
-      }
 
      case nir_cf_node_function:
         UNREACHABLE("NIR LICM: Unsupported cf_node type.");
@@ -123,17 +188,19 @@ visit_cf_list(struct exec_list *list, nir_block *preheader, nir_block *exit)
 }
 
 bool
-nir_opt_licm(nir_shader *shader)
+nir_opt_licm(nir_shader *shader, nir_opt_licm_filter_instr_cb filter_instr)
 {
+   licm_state state = {filter_instr};
    bool progress = false;
 
    nir_foreach_function_impl(impl, shader) {
      nir_metadata_require(impl, nir_metadata_block_index |
                                 nir_metadata_dominance);
 
-      bool impl_progress = visit_cf_list(&impl->body, NULL, NULL);
-      progress |= nir_progress(impl_progress, impl,
-                               nir_metadata_block_index | nir_metadata_dominance);
+      state.loop = NULL;
+
+      progress |= nir_progress(visit_cf_list(&impl->body, &state), impl,
+                               nir_metadata_control_flow);
    }
 
    return progress;
diff --git a/src/compiler/nir/tests/opt_licm_tests.cpp b/src/compiler/nir/tests/opt_licm_tests.cpp
new file mode 100644
index 00000000000..f8b70394f3a
--- /dev/null
+++ b/src/compiler/nir/tests/opt_licm_tests.cpp
@@ -0,0 +1,231 @@
+/* Copyright 2025 Advanced Micro Devices, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "nir_test.h"
+
+class nir_opt_licm_test : public nir_test {
+protected:
+   nir_opt_licm_test()
+      : nir_test::nir_test("nir_opt_licm_test", MESA_SHADER_COMPUTE)
+   {
+   }
+
+   nir_loop *loop;
+   nir_block *original_block;
+   nir_def *x, *y, *z, *r;
+   bool expect_progress;
+   bool insert_after_break;
+
+   void test_init();
+   void test_finish(nir_opt_licm_filter_instr_cb filter_instr);
+};
+
+void
+nir_opt_licm_test::test_init()
+{
+   x = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
+   y = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
+   z = nir_load_global(b, 1, 32, nir_undef(b, 1, 64));
+
+   loop = nir_push_loop(b);
+   if (insert_after_break)
+      nir_break_if(b, nir_undef(b, 1, 1));
+   original_block = nir_loop_last_block(loop);
+}
+
+static bool
+filter_using_dst_src_bits(nir_instr *instr, bool instr_dominates_exit,
+                          unsigned num_dst_bits, unsigned num_all_src_bits,
+                          nir_loop *loop)
+{
+   return num_dst_bits <= num_all_src_bits;
+}
+
+void
+nir_opt_licm_test::test_finish(nir_opt_licm_filter_instr_cb filter_instr)
+{
+   if (!insert_after_break)
+      nir_break_if(b, nir_undef(b, 1, 1));
+   nir_pop_loop(b, loop);
+   nir_validate_shader(b->shader, NULL);
+
+   bool progress = false;
+   NIR_PASS(progress, b->shader, nir_opt_licm, filter_instr);
+
+   if (expect_progress) {
+      ASSERT_TRUE(progress);
+      ASSERT_EQ(nir_def_instr(r)->block, nir_loop_predecessor_block(loop));
+   } else {
+      ASSERT_FALSE(progress);
+      ASSERT_EQ(nir_def_instr(r)->block, original_block);
+   }
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_unary)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_ineg(b, x);
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_binary)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_iadd(b, x, y);
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_alu_u2u64)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_u2u64(b, x);
+
+   /* If sizeof(dst) > sizeof(all srcs), the filter declines to hoist because
+    * hoisting would increase register usage across the whole loop.
+    */
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_load_ssbo_no_flags_before_break)
+{
+   this->insert_after_break = false;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_load_ssbo_reorderable_before_break)
+{
+   this->insert_after_break = false;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_REORDER));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_load_ssbo_reorderable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_REORDER));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_load_ssbo_speculatable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_SPECULATE));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_load_ssbo_reorderable_speculatable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+   r = nir_load_ssbo(b, 1, 32, x, y);
+   nir_intrinsic_set_access(nir_def_as_intrinsic(r),
+                            (gl_access_qualifier)(ACCESS_CAN_REORDER |
+                                                  ACCESS_CAN_SPECULATE));
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_2_nested_loops)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_loop *nested_loop = nir_push_loop(b);
+   {
+      nir_break_if(b, nir_undef(b, 1, 1));
+      r = nir_ineg(b, x);
+   }
+   nir_pop_loop(b, nested_loop);
+
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, hoist_alu_6_nested_loops)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_loop *nested_loops[5];
+
+   for (unsigned i = 0; i < ARRAY_SIZE(nested_loops); i++) {
+      nested_loops[i] = nir_push_loop(b);
+      nir_break_if(b, nir_undef(b, 1, 1));
+   }
+
+   r = nir_ineg(b, x);
+
+   for (int i = ARRAY_SIZE(nested_loops) - 1; i >= 0; i--)
+      nir_pop_loop(b, nested_loops[i]);
+
+   this->test_finish(filter_using_dst_src_bits);
+}
+
+TEST_F(nir_opt_licm_test, skip_tex)
+{
+   this->insert_after_break = true;
+   this->expect_progress = false;
+   this->test_init();
+
+   nir_tex_builder fields = {0};
+   fields.coord = x;
+   fields.texture_handle = y;
+   fields.dest_type = nir_type_uint32;
+
+   r = nir_build_tex_struct(b, nir_texop_tex, fields);
+   this->test_finish(NULL);
+}
+
+TEST_F(nir_opt_licm_test, hoist_tex_before_break)
+{
+   this->insert_after_break = false;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_tex_builder fields = {0};
+   fields.coord = x;
+   fields.texture_handle = y;
+   fields.dest_type = nir_type_uint32;
+
+   r = nir_build_tex_struct(b, nir_texop_tex, fields);
+   this->test_finish(NULL);
+}
+
+TEST_F(nir_opt_licm_test, hoist_tex_speculatable)
+{
+   this->insert_after_break = true;
+   this->expect_progress = true;
+   this->test_init();
+
+   nir_tex_builder fields = {0};
+   fields.coord = x;
+   fields.texture_handle = y;
+   fields.can_speculate = true;
+   fields.dest_type = nir_type_uint32;
+
+   r = nir_build_tex_struct(b, nir_texop_tex, fields);
+   this->test_finish(NULL);
+}
diff --git a/src/imagination/pco/pco_nir.c b/src/imagination/pco/pco_nir.c
index c674119fc94..4ef6fa9461d 100644
--- a/src/imagination/pco/pco_nir.c
+++ b/src/imagination/pco/pco_nir.c
@@ -786,6 +786,14 @@ static bool robustness_filter(const nir_intrinsic_instr *intr,
    return false;
 }
 
+static bool
+opt_licm_filter_instr_cb(nir_instr *instr, bool instr_dominates_exit,
+                         unsigned num_dst_bits, unsigned num_all_src_bits,
+                         nir_loop *loop)
+{
+   return instr_dominates_exit;
+}
+
 /**
  * \brief Lowers a NIR shader.
  *
@@ -806,7 +814,7 @@
 
    NIR_PASS(_, nir, nir_lower_memory_model);
 
-   NIR_PASS(_, nir, nir_opt_licm);
+   NIR_PASS(_, nir, nir_opt_licm, opt_licm_filter_instr_cb);
 
    NIR_PASS(_, nir, nir_lower_memcpy);
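
Usage sketch (not part of the patch): with the new signature, a driver may pass NULL to keep the pass's built-in legality checks with no extra filtering, since nir_opt_licm only invokes the callback when it is non-NULL. The callback name below is hypothetical; the size comparison mirrors filter_using_dst_src_bits from the tests above, and the parameter semantics follow nir_opt_licm.c (num_dst_bits is the destination's bit_size times num_components, num_all_src_bits is the same quantity summed over all sources).

static bool
example_licm_filter_cb(nir_instr *instr, bool instr_dominates_exit,
                       unsigned num_dst_bits, unsigned num_all_src_bits,
                       nir_loop *loop)
{
   /* Hoisting keeps the destination live across the whole loop, so only
    * hoist when the result is no larger than what its (loop-invariant)
    * sources already keep live.
    */
   return num_dst_bits <= num_all_src_bits;
}

   /* In the driver's lowering pipeline: */
   NIR_PASS(_, nir, nir_opt_licm, example_licm_filter_cb);
   /* Or, to accept every legally hoistable instruction: */
   NIR_PASS(_, nir, nir_opt_licm, NULL);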