diff --git a/src/intel/compiler/brw_load_reg.cpp b/src/intel/compiler/brw_load_reg.cpp
new file mode 100644
index 00000000000..bd058968c23
--- /dev/null
+++ b/src/intel/compiler/brw_load_reg.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+#include "brw_shader.h"
+#include "brw_cfg.h"
+#include "brw_analysis.h"
+#include "brw_builder.h"
+
+/* Duplicated from brw_def_analysis::fully_defines. */
+static bool
+fully_defines(const brw_shader &s, brw_inst *inst)
+{
+   return s.alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
+          !inst->is_partial_write();
+}
+
+bool
+brw_insert_load_reg(brw_shader &s)
+{
+   bool progress = false;
+
+   const brw_def_analysis &defs = s.def_analysis.require();
+
+   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
+      /* These should not exist yet. */
+      assert(inst->opcode != SHADER_OPCODE_LOAD_REG);
+
+      /* These opcodes may have the right source and destination patterns to
+       * have their sources replaced by load_reg, but these instructions are
+       * special and / or wierd. They should not be modified.
+       */
+      if (inst->opcode == SHADER_OPCODE_UNDEF ||
+          inst->opcode == BRW_OPCODE_DPAS) {
+         continue;
+      }
+
+      /* If the destination is non-VGRF adding load_reg instructions will not
+       * help. If the destination is already SSA, nothing needs to be done.
+       */
+      if (inst->dst.file != VGRF || defs.get(inst->dst) != NULL)
+         continue;
+
+      /* If there is a source that would cause def_analysis::update_for_reads
+       * to mark the def as invalid, adding load_reg for the sources will not
+       * help.
+       */
+      if (inst->reads_accumulator_implicitly())
+         continue;
+
+      bool bad_source = false;
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == ARF &&
+             (inst->src[i].nr == BRW_ARF_ADDRESS ||
+              inst->src[i].nr == BRW_ARF_ACCUMULATOR ||
+              inst->src[i].nr == BRW_ARF_FLAG)) {
+            bad_source = true;
+            break;
+         }
+      }
+
+      if (bad_source)
+         continue;
+
+      /* If the instruction does not fully define the destination, adding
+       * load_reg instructions will not help.
+       */
+      if (!fully_defines(s, inst))
+         continue;
+
+      if (inst->exec_size < 8)
+         continue;
+
+      assert(inst->exec_size == 8 || inst->exec_size == 16 ||
+             inst->exec_size == 32);
+
+      const unsigned mask = (inst->exec_size / 8) - 1;
+
+      /* Replace any non-SSA sources with load_reg of the source. */
+      const brw_builder bld = brw_builder(inst);
+      for (int i = 0; i < inst->sources; i++) {
+         /* LOAD_REG only operates on VGRF sources. If the source is not VGRF,
+          * skip it.
+          */
+         if (inst->src[i].file != VGRF)
+            continue;
+
+         /* The source is already a def, so don't add a LOAD_REG. */
+         if (defs.get(inst->src[i]) != NULL)
+            continue;
+
+         /* Cases of stride != 1 are difficult to handle correctly. For
+          * example, when stride is 0, the source may have been written by
+          * NoMask instruction that cannot be seen from here. In this case,
+          * emitting a non-NoMask LOAD_REG may not actually copy the value
+          * that the instruction is trying to read.
+          *
+          * This means that is_scalar sources in larger exec sizes are not
+          * handled. Since enough information is available in the source, this
+          * could be added later.
+          */
+         if (inst->src[i].stride != 1)
+            continue;
+
+         /* If the size of the VGRF allocation is not an even multiple of
+          * the SIMD size, don't emit a load_reg. This can occur for sparse
+          * texture loads. These will have SIMD-size values for the texture
+          * data and a single SIMD1 register for the residency information.
+          */
+         if ((s.alloc.sizes[inst->src[i].nr] & mask) != 0)
+            continue;
+
+         brw_reg_type t =
+            brw_type_with_size(BRW_TYPE_UD,
+                               brw_type_size_bits(inst->src[i].type));
+         brw_reg old_src = brw_vgrf(inst->src[i].nr, t);
+         brw_reg new_src;
+
+         /* Since the sources of a LOAD_REG will likely not be defs,
+          * none of the existing optimizations passes will eliminate
+          * redundant LOAD_REG instructions. Search back though this
+          * block to find a LOAD_REG of the same value to avoid emitting
+          * too many redundant instructions.
+          */
+         foreach_inst_in_block_reverse_starting_from(brw_inst, scan_inst, inst) {
+            if (scan_inst->dst.file == old_src.file &&
+                scan_inst->dst.nr == old_src.nr) {
+               break;
+            }
+
+            if (scan_inst->opcode == SHADER_OPCODE_LOAD_REG &&
+                scan_inst->exec_size == inst->exec_size &&
+                scan_inst->force_writemask_all == inst->force_writemask_all &&
+                old_src.equals(scan_inst->src[0])) {
+               new_src = scan_inst->dst;
+               break;
+            }
+         }
+
+         if (new_src.file == BAD_FILE)
+            new_src = bld.LOAD_REG(old_src);
+
+         inst->src[i].nr = new_src.nr;
+         progress = true;
+      }
+   }
+
+   if (progress)
+      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
+                            BRW_DEPENDENCY_VARIABLES);
+
+   return progress;
+}
+
+bool
+brw_lower_load_reg(brw_shader &s)
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
+      if (inst->opcode == SHADER_OPCODE_LOAD_REG) {
+         const brw_builder ibld = brw_builder(inst);
+
+         const unsigned bytes = inst->size_written;
+         const unsigned type_bytes = brw_type_size_bytes(inst->dst.type);
+         const unsigned bytes_per_mov = inst->exec_size * type_bytes;
+
+         for (unsigned i = 0; i < bytes; i += bytes_per_mov) {
+            ibld.MOV(byte_offset(inst->dst, i),
+                     byte_offset(inst->src[0], i));
+         }
+
+         inst->remove();
+         progress = true;
+      }
+   }
+
+   if (progress)
+      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
+                            BRW_DEPENDENCY_VARIABLES);
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_opt.cpp b/src/intel/compiler/brw_opt.cpp
index 5dd4d833cd2..ad35c4f7637 100644
--- a/src/intel/compiler/brw_opt.cpp
+++ b/src/intel/compiler/brw_opt.cpp
@@ -53,6 +53,14 @@ brw_optimize(brw_shader &s)
 
    OPT(brw_opt_eliminate_find_live_channel);
 
+   /* Add load_reg instructions before the main optimization loop to get more
+    * defs available in those passes. Do it after the preceeding few pre-loop
+    * passes so that it hopefully has less work to do. Having it here versus
+    * before the call to opt_dce made some difference, but it was mostly
+    * noise.
+    */
+   OPT(brw_insert_load_reg);
+
    /* Track how much non-SSA at this point. */
    {
       const brw_def_analysis &defs = s.def_analysis.require();
@@ -85,6 +93,12 @@ brw_optimize(brw_shader &s)
    if (OPT(brw_opt_combine_convergent_txf))
       OPT(brw_opt_copy_propagation_defs);
 
+   if (OPT(brw_lower_load_reg)) {
+      OPT(brw_opt_copy_propagation);
+      OPT(brw_opt_register_coalesce);
+      OPT(brw_opt_dead_code_eliminate);
+   }
+
    if (OPT(brw_lower_pack)) {
       OPT(brw_opt_register_coalesce);
       OPT(brw_opt_dead_code_eliminate);
diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h
index ebe55bd9dbf..d6f85638eaf 100644
--- a/src/intel/compiler/brw_shader.h
+++ b/src/intel/compiler/brw_shader.h
@@ -353,3 +353,6 @@ unsigned brw_get_lowered_simd_width(const brw_shader *shader,
 
 brw_reg brw_allocate_vgrf(brw_shader &s, brw_reg_type type, unsigned count);
 brw_reg brw_allocate_vgrf_units(brw_shader &s, unsigned units_of_REGSIZE);
+
+bool brw_insert_load_reg(brw_shader &s);
+bool brw_lower_load_reg(brw_shader &s);
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index d2d803a2232..5a7cd723ea3 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -56,6 +56,7 @@ libintel_compiler_brw_files = files(
   'brw_inst.cpp',
   'brw_inst.h',
   'brw_isa_info.h',
+  'brw_load_reg.cpp',
   'brw_lower.cpp',
   'brw_lower_dpas.cpp',
   'brw_lower_integer_multiplication.cpp',
@@ -212,6 +213,7 @@ if with_tests
         'test_eu_validate.cpp',
         'test_helpers.cpp',
         'test_helpers.h',
+        'test_insert_load_reg.cpp',
         'test_lower_scoreboard.cpp',
         'test_opt_cmod_propagation.cpp',
         'test_opt_combine_constants.cpp',
diff --git a/src/intel/compiler/test_insert_load_reg.cpp b/src/intel/compiler/test_insert_load_reg.cpp
new file mode 100644
index 00000000000..6aaa91fc878
--- /dev/null
+++ b/src/intel/compiler/test_insert_load_reg.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright © 2025 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "test_helpers.h"
+#include "brw_builder.h"
+
+class insert_load_reg_test : public brw_shader_pass_test {};
+
+TEST_F(insert_load_reg_test, basic)
+{
+   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
+   brw_builder exp = make_shader(MESA_SHADER_FRAGMENT, 16);
+
+   brw_reg dst = vgrf(bld, exp, BRW_TYPE_F);
+   brw_reg src = vgrf(bld, exp, BRW_TYPE_F);
+
+   bld.ADD(dst, src, brw_imm_f(1.0));
+
+   EXPECT_PROGRESS(brw_insert_load_reg, bld);
+
+   exp.ADD(dst, exp.LOAD_REG(src), brw_imm_f(1.0));
+
+   EXPECT_SHADERS_MATCH(bld, exp);
+}
+
+TEST_F(insert_load_reg_test, already_defs)
+{
+   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
+
+   brw_reg dst0 = vgrf(bld, BRW_TYPE_F);
+   brw_reg dst1 = vgrf(bld, BRW_TYPE_F);
+   brw_reg src0 = retype(brw_vec16_reg(FIXED_GRF, 2, 0), BRW_TYPE_F);
+
+   /* The first ADD will produce a def due its FIXED_GRF and IMM sources. The
+    * second ADD will also produces a def due to its def and IMM
+    * sources. brw_insert_load_reg shouldn't do anything.
+    */
+   bld.ADD(dst0, src0, brw_imm_f(1.0));
+   bld.ADD(dst1, dst0, brw_imm_f(1.0));
+
+   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
+}
+
+TEST_F(insert_load_reg_test, stride_0)
+{
+   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
+
+   brw_reg dst = vgrf(bld, BRW_TYPE_F);
+   brw_reg src = component(vgrf(bld, BRW_TYPE_F), 0);
+
+   ASSERT_EQ(src.stride, 0);
+   bld.ADD(dst, src, brw_imm_f(1.0));
+
+   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
+}
+
+TEST_F(insert_load_reg_test, stride_2)
+{
+   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
+
+   brw_reg dst = vgrf(bld, BRW_TYPE_D);
+   brw_reg src = subscript(vgrf(bld, BRW_TYPE_D), BRW_TYPE_W, 0);
+
+   ASSERT_EQ(src.stride, 2);
+   bld.ADD(dst, src, brw_imm_d(1));
+
+   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
+}
+
+TEST_F(insert_load_reg_test, is_scalar)
+{
+   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
+   brw_builder xbld = bld.scalar_group();
+
+   brw_reg dst = vgrf(bld, BRW_TYPE_F);
+   brw_reg src = vgrf(xbld, BRW_TYPE_F);
+
+   /* Currently, is_scalar cases are treated the same as other stride=0
+    * cases. This does not need to be the case, and it may (should!) be
+    * changed in the future. Split this out as a separate test.
+    */
+   src.is_scalar = true;
+
+   bld.ADD(dst, component(src, 0), brw_imm_f(1.0));
+
+   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
+}
+
+TEST_F(insert_load_reg_test, emit_load_reg_once)
+{
+   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
+   brw_builder exp = make_shader(MESA_SHADER_FRAGMENT, 16);
+
+   brw_reg dst0 = vgrf(bld, exp, BRW_TYPE_F);
+   brw_reg dst1 = vgrf(bld, exp, BRW_TYPE_F);
+   brw_reg src = vgrf(bld, exp, BRW_TYPE_F);
+
+   /* Since both instructions use the same source, only one LOAD_REG should be
+    * generated.
+    */
+   bld.ADD(dst0, src, brw_imm_f(1.0));
+   bld.ADD(dst1, src, brw_imm_f(2.0));
+
+   EXPECT_PROGRESS(brw_insert_load_reg, bld);
+
+   brw_reg dst2 = exp.LOAD_REG(src);
+   exp.ADD(dst0, dst2, brw_imm_f(1.0));
+   exp.ADD(dst1, dst2, brw_imm_f(2.0));
+
+   EXPECT_SHADERS_MATCH(bld, exp);
+}
+
+TEST_F(insert_load_reg_test, no_mask)
+{
+   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
+   brw_builder exp = make_shader(MESA_SHADER_FRAGMENT, 16);
+
+   brw_reg dst0 = vgrf(bld, exp, BRW_TYPE_F);
+   brw_reg dst1 = vgrf(bld, exp, BRW_TYPE_F);
+   brw_reg src0 = vgrf(bld, exp, BRW_TYPE_F);
+
+   bld.ADD(dst0, src0, brw_imm_f(1.0));
+   bld.exec_all().ADD(dst1, src0, brw_imm_f(2.0));
+
+   EXPECT_PROGRESS(brw_insert_load_reg, bld);
+
+   brw_reg src1 = exp.LOAD_REG(src0);
+   exp.ADD(dst0, src1, brw_imm_f(1.0));
+   brw_reg src2 = exp.exec_all().LOAD_REG(src0);
+   exp.exec_all().ADD(dst1, src2, brw_imm_f(2.0));
+
+   EXPECT_SHADERS_MATCH(bld, exp);
+}
+
+TEST_F(insert_load_reg_test, odd_size)
+{
+   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
+
+   brw_reg dst = vgrf(bld, BRW_TYPE_D);
+   brw_reg src = vgrf(bld, BRW_TYPE_D, 3, 8);
+
+   /* The register allocation size is 3 SIMD8 units. Since that is not an even
+    * multiple of the exec size, it would be very difficult to generate a
+    * correct LOAD_REG. This should be skipped.
+    */
+   bld.ADD(dst, src, brw_imm_d(1));
+
+   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
+}