diff --git a/src/intel/compiler/brw/brw_builder.h b/src/intel/compiler/brw/brw_builder.h
index b9c973f6dee..13540785288 100644
--- a/src/intel/compiler/brw/brw_builder.h
+++ b/src/intel/compiler/brw/brw_builder.h
@@ -638,6 +638,26 @@ public:
       return emit(SHADER_OPCODE_SEND, SEND_NUM_SRCS)->as_send();
    }
 
+   /**
+    * Emit a SHADER_OPCODE_LSC_FILL (scratch fill) with FILL_NUM_SRCS sources,
+    * returned as a brw_scratch_inst so the caller can set its fields.
+    */
+   brw_scratch_inst *
+   FILL() const
+   {
+      return emit(SHADER_OPCODE_LSC_FILL, FILL_NUM_SRCS)->as_scratch();
+   }
+
+   /**
+    * Emit a SHADER_OPCODE_LSC_SPILL (scratch spill) with SPILL_NUM_SRCS
+    * sources, returned as a brw_scratch_inst so the caller can set its fields.
+    */
+   brw_scratch_inst *
+   SPILL() const
+   {
+      return emit(SHADER_OPCODE_LSC_SPILL, SPILL_NUM_SRCS)->as_scratch();
+   }
+
    brw_urb_inst *
    URB_WRITE(const brw_reg srcs[], unsigned num_srcs) const
    {
diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h
index da7c3812562..74d3b62d39a 100644
--- a/src/intel/compiler/brw/brw_eu_defines.h
+++ b/src/intel/compiler/brw/brw_eu_defines.h
@@ -549,6 +549,9 @@ enum ENUM_PACKED opcode {
     * Acts as a scheduling barrier.
     */
    SHADER_OPCODE_LOAD_REG,
+
+   SHADER_OPCODE_LSC_FILL,
+   SHADER_OPCODE_LSC_SPILL,
 };
 
 enum send_srcs {
@@ -714,6 +717,31 @@ enum interpolator_logical_srcs {
    INTERP_NUM_SRCS
 };
 
+/**
+ * Source indices for SHADER_OPCODE_LSC_SPILL.  When the instruction is
+ * lowered, PAYLOAD1 and PAYLOAD2 become the corresponding SEND payloads.
+ */
+enum spill_srcs {
+   /** Register used for the address in scratch space. */
+   SPILL_SRC_PAYLOAD1,
+
+   /** Register to be spilled. */
+   SPILL_SRC_PAYLOAD2,
+
+   SPILL_NUM_SRCS
+};
+
+/**
+ * Source indices for SHADER_OPCODE_LSC_FILL.  When the instruction is
+ * lowered, PAYLOAD1 becomes the first SEND payload.
+ */
+enum fill_srcs {
+   /** Register used for the address in scratch space. */
+   FILL_SRC_PAYLOAD1,
+
+   FILL_NUM_SRCS
+};
+
 enum brw_reduce_op {
    BRW_REDUCE_OP_ADD,
    BRW_REDUCE_OP_MUL,
diff --git a/src/intel/compiler/brw/brw_inst.cpp b/src/intel/compiler/brw/brw_inst.cpp
index ac30b1936f0..0829d3565d2 100644
--- a/src/intel/compiler/brw/brw_inst.cpp
+++ b/src/intel/compiler/brw/brw_inst.cpp
@@ -232,6 +232,10 @@ brw_inst_kind_for_opcode(enum opcode opcode)
    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
       return BRW_KIND_LOGICAL;
 
+   case SHADER_OPCODE_LSC_FILL:
+   case SHADER_OPCODE_LSC_SPILL:
+      return BRW_KIND_SCRATCH;
+
    default:
       return BRW_KIND_BASE;
    }
@@ -302,6 +306,12 @@ brw_inst::is_payload(unsigned arg) const
    case SHADER_OPCODE_SEND:
       return arg >= SEND_SRC_PAYLOAD1;
 
+   case SHADER_OPCODE_LSC_FILL:
+      return arg == FILL_SRC_PAYLOAD1;
+
+   case SHADER_OPCODE_LSC_SPILL:
+      return arg == SPILL_SRC_PAYLOAD1 || arg == SPILL_SRC_PAYLOAD2;
+
    case SHADER_OPCODE_SEND_GATHER:
       return arg >= SEND_GATHER_SRC_SCALAR;
 
@@ -551,6 +561,25 @@ brw_inst::size_read(const struct intel_device_info *devinfo, int arg) const
       }
       break;
 
+   case SHADER_OPCODE_LSC_FILL:
+      if (arg == FILL_SRC_PAYLOAD1) {
+         return lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
+                                 as_scratch()->use_transpose ? 1 : exec_size) *
+                REG_SIZE;
+      }
+      break;
+
+   case SHADER_OPCODE_LSC_SPILL:
+      if (arg == SPILL_SRC_PAYLOAD1) {
+         assert(!as_scratch()->use_transpose);
+
+         return lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, exec_size) *
+                REG_SIZE;
+      } else if (arg == SPILL_SRC_PAYLOAD2) {
+         return src[arg].component_size(exec_size);
+      }
+      break;
+
    case SHADER_OPCODE_SEND_GATHER:
       if (arg >= SEND_GATHER_SRC_PAYLOAD) {
          /* SEND_GATHER is Xe3+, so no need to pass devinfo around. */
@@ -940,6 +969,7 @@ brw_inst::has_side_effects() const
       return as_send()->has_side_effects;
 
    case BRW_OPCODE_SYNC:
+   case SHADER_OPCODE_LSC_SPILL:
    case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
    case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
    case SHADER_OPCODE_MEMORY_FENCE:
@@ -965,6 +995,7 @@ brw_inst::is_volatile() const
    switch (opcode) {
    case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
    case SHADER_OPCODE_LOAD_REG:
+   case SHADER_OPCODE_LSC_FILL:
       return true;
    case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
       return as_mem()->flags & MEMORY_FLAG_VOLATILE_ACCESS;
diff --git a/src/intel/compiler/brw/brw_inst.h b/src/intel/compiler/brw/brw_inst.h
index 4e9d7a12cd8..c38f95489fe 100644
--- a/src/intel/compiler/brw/brw_inst.h
+++ b/src/intel/compiler/brw/brw_inst.h
@@ -50,6 +50,7 @@ enum ENUM_PACKED brw_inst_kind {
    BRW_KIND_LOAD_PAYLOAD,
    BRW_KIND_URB,
    BRW_KIND_FB_WRITE,
+   BRW_KIND_SCRATCH,
 };
 
 brw_inst_kind brw_inst_kind_for_opcode(enum opcode opcode);
@@ -82,6 +83,7 @@ struct brw_inst : brw_exec_node {
    KIND_HELPERS(as_load_payload, brw_load_payload_inst, BRW_KIND_LOAD_PAYLOAD);
    KIND_HELPERS(as_urb, brw_urb_inst, BRW_KIND_URB);
    KIND_HELPERS(as_fb_write, brw_fb_write_inst, BRW_KIND_FB_WRITE);
+   KIND_HELPERS(as_scratch, brw_scratch_inst, BRW_KIND_SCRATCH);
 
 #undef KIND_HELPERS
 
@@ -370,6 +372,23 @@ struct brw_fb_write_inst : brw_inst {
    bool last_rt;
 };
 
+/**
+ * Instruction kind (BRW_KIND_SCRATCH) used by SHADER_OPCODE_LSC_FILL and
+ * SHADER_OPCODE_LSC_SPILL, which brw_lower_fill_and_spill() later turns
+ * into SEND instructions.
+ */
+struct brw_scratch_inst : brw_inst {
+   /** Offset in scratch space for the load or store. */
+   unsigned offset;
+
+   /**
+    * Should an LSC transpose message be used for the fill?
+    *
+    * Currently this must be false for spills.
+    */
+   bool use_transpose;
+};
+
 /**
  * Make the execution of \p inst dependent on the evaluation of a possibly
  * inverted predicate.
diff --git a/src/intel/compiler/brw/brw_lower_fill_spill.cpp b/src/intel/compiler/brw/brw_lower_fill_spill.cpp
new file mode 100644
index 00000000000..994a5199ebd
--- /dev/null
+++ b/src/intel/compiler/brw/brw_lower_fill_spill.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright 2025 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "brw_shader.h"
+#include "brw_builder.h"
+
+/**
+ * Build the extended descriptor used by the spill/fill SEND messages.
+ *
+ * The value is computed in the address register sub-region
+ * BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC from bits 31:10 of
+ * brw_vec1_grf(0, 5) (read as UD).  On verx10 >= 200 it is shifted right by
+ * 4; otherwise BRW_SFID_UGM is ORed in and, for spills (\p unspill false),
+ * so is brw_message_ex_desc() for \p reg_size.
+ *
+ * All instructions are emitted through \p bld.uniform().
+ */
+static brw_reg
+build_ex_desc(const brw_builder &bld, unsigned reg_size, bool unspill)
+{
+   /* Use a different area of the address register than what is used in
+    * brw_lower_logical_sends.cpp (brw_address_reg(2)) so we don't have
+    * interactions between the spill/fill instructions and the other send
+    * messages.
+    */
+   brw_reg ex_desc = bld.vaddr(BRW_TYPE_UD,
+                               BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC);
+
+   brw_builder ubld = bld.uniform();
+
+   ubld.AND(ex_desc,
+            retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
+            brw_imm_ud(INTEL_MASK(31, 10)));
+
+   const intel_device_info *devinfo = bld.shader->devinfo;
+   if (devinfo->verx10 >= 200) {
+      ubld.SHR(ex_desc, ex_desc, brw_imm_ud(4));
+   } else {
+      if (unspill) {
+         ubld.OR(ex_desc, ex_desc, brw_imm_ud(BRW_SFID_UGM));
+      } else {
+         ubld.OR(ex_desc,
+                 ex_desc,
+                 brw_imm_ud(brw_message_ex_desc(devinfo, reg_size) | BRW_SFID_UGM));
+      }
+   }
+
+   return ex_desc;
+}
+
+/**
+ * Replace a SHADER_OPCODE_LSC_FILL with an equivalent UGM load SEND.
+ *
+ * The fill's destination and address payload are forwarded unchanged.  When
+ * use_transpose is set, the SEND is emitted through a uniform builder and
+ * uses a transposed message loading reg_size * 8 channels.
+ *
+ * NOTE(review): \p s is currently unused.
+ */
+static void
+brw_lower_lsc_fill(const intel_device_info *devinfo, brw_shader &s,
+                   brw_inst *inst)
+{
+   assert(devinfo->verx10 >= 125);
+
+   const brw_builder bld(inst);
+   brw_reg dst = inst->dst;
+   brw_reg offset = inst->src[FILL_SRC_PAYLOAD1];
+
+   const unsigned reg_size = inst->dst.component_size(inst->exec_size) /
+                             REG_SIZE;
+   brw_reg ex_desc = build_ex_desc(bld, reg_size, true);
+
+   /* LSC is limited to SIMD16 (SIMD32 on Xe2) load/store but we can
+    * load more using transpose messages.
+    */
+   const bool use_transpose = inst->as_scratch()->use_transpose;
+   const brw_builder ubld = use_transpose ? bld.uniform() : bld;
+
+   uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
+                                LSC_ADDR_SURFTYPE_SS,
+                                LSC_ADDR_SIZE_A32,
+                                LSC_DATA_SIZE_D32,
+                                use_transpose ? reg_size * 8 : 1 /* num_channels */,
+                                use_transpose,
+                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
+
+
+   brw_send_inst *unspill_inst = ubld.SEND();
+   unspill_inst->dst = dst;
+
+   unspill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0);
+   unspill_inst->src[SEND_SRC_EX_DESC] = ex_desc;
+   unspill_inst->src[SEND_SRC_PAYLOAD1] = offset;
+   unspill_inst->src[SEND_SRC_PAYLOAD2] = brw_reg();
+
+   unspill_inst->sfid = BRW_SFID_UGM;
+   unspill_inst->header_size = 0;
+   unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
+                                         unspill_inst->exec_size);
+   unspill_inst->ex_mlen = 0;
+   unspill_inst->size_written =
+      lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
+   unspill_inst->has_side_effects = false;
+   unspill_inst->is_volatile = true;
+
+   /* Fill in the real descriptor now that mlen and size_written are set
+    * (src[0] is presumably SEND_SRC_DESC, initialized to 0 above).
+    */
+   unspill_inst->src[0] = brw_imm_ud(
+      desc |
+      brw_message_desc(devinfo,
+                       unspill_inst->mlen,
+                       unspill_inst->size_written / REG_SIZE,
+                       unspill_inst->header_size));
+
+   /* The SEND must write and read exactly what the FILL declared. */
+   assert(unspill_inst->size_written == inst->size_written);
+   assert(unspill_inst->size_read(devinfo, SEND_SRC_PAYLOAD1) == inst->size_read(devinfo, FILL_SRC_PAYLOAD1));
+
+   inst->remove();
+}
+
+/**
+ * Replace a SHADER_OPCODE_LSC_SPILL with an equivalent UGM store SEND.
+ *
+ * SPILL_SRC_PAYLOAD1 (address) and SPILL_SRC_PAYLOAD2 (data) become the two
+ * SEND payloads.  Transposed spills are not supported (asserted below).
+ */
+static void
+brw_lower_lsc_spill(const intel_device_info *devinfo, brw_inst *inst)
+{
+   assert(devinfo->verx10 >= 125);
+
+   const brw_builder bld(inst);
+   brw_reg offset = inst->src[SPILL_SRC_PAYLOAD1];
+   brw_reg src = inst->src[SPILL_SRC_PAYLOAD2];
+
+   const unsigned reg_size = src.component_size(bld.dispatch_width()) /
+                             REG_SIZE;
+
+   assert(!inst->as_scratch()->use_transpose);
+
+   const brw_reg ex_desc = build_ex_desc(bld, reg_size, false);
+
+   brw_send_inst *spill_inst = bld.SEND();
+
+   spill_inst->src[SEND_SRC_DESC]     = brw_imm_ud(0);
+   spill_inst->src[SEND_SRC_EX_DESC]  = ex_desc;
+   spill_inst->src[SEND_SRC_PAYLOAD1] = offset;
+   spill_inst->src[SEND_SRC_PAYLOAD2] = src;
+
+   spill_inst->sfid = BRW_SFID_UGM;
+   /* NOTE(review): the cache-control argument below uses the LOAD variant
+    * although the opcode is LSC_OP_STORE; confirm that this is intended.
+    */
+   uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
+                                LSC_ADDR_SURFTYPE_SS,
+                                LSC_ADDR_SIZE_A32,
+                                LSC_DATA_SIZE_D32,
+                                1 /* num_channels */,
+                                false /* transpose */,
+                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
+   spill_inst->header_size = 0;
+   spill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
+                                       bld.dispatch_width());
+   spill_inst->ex_mlen = reg_size;
+   spill_inst->size_written = 0;
+   spill_inst->has_side_effects = true;
+   spill_inst->is_volatile = false;
+
+   spill_inst->src[0] = brw_imm_ud(
+      desc |
+      brw_message_desc(devinfo,
+                       spill_inst->mlen,
+                       spill_inst->size_written / REG_SIZE,
+                       spill_inst->header_size));
+
+   assert(spill_inst->size_written == inst->size_written);
+   assert(spill_inst->size_read(devinfo, SEND_SRC_PAYLOAD1) == inst->size_read(devinfo, SPILL_SRC_PAYLOAD1));
+   assert(spill_inst->size_read(devinfo, SEND_SRC_PAYLOAD2) == inst->size_read(devinfo, SPILL_SRC_PAYLOAD2));
+
+   inst->remove();
+}
+
+/**
+ * Lower every SHADER_OPCODE_LSC_FILL and SHADER_OPCODE_LSC_SPILL in \p s to
+ * SEND instructions.
+ *
+ * \return true if any instruction was lowered, in which case the
+ * instruction and variable analyses are invalidated.
+ */
+bool
+brw_lower_fill_and_spill(brw_shader &s)
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
+      switch (inst->opcode) {
+      case SHADER_OPCODE_LSC_FILL:
+         brw_lower_lsc_fill(s.devinfo, s, inst);
+         progress = true;
+         break;
+
+      case SHADER_OPCODE_LSC_SPILL:
+         brw_lower_lsc_spill(s.devinfo, inst);
+         progress = true;
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   if (progress)
+      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
+                            BRW_DEPENDENCY_VARIABLES);
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw/brw_opt_cse.cpp b/src/intel/compiler/brw/brw_opt_cse.cpp
index 3aa30446074..b0d8e768b0a 100644
--- a/src/intel/compiler/brw/brw_opt_cse.cpp
+++ b/src/intel/compiler/brw/brw_opt_cse.cpp
@@ -473,6 +473,9 @@ hash_inst(const void *v)
    case BRW_KIND_BASE:
       /* Nothing else to do. */
       break;
+
+   case BRW_KIND_SCRATCH:
+      UNREACHABLE("Spill and fills should not exist yet.");
    }
 
    if (inst->opcode == BRW_OPCODE_MAD) {
diff --git a/src/intel/compiler/brw/brw_print.cpp b/src/intel/compiler/brw/brw_print.cpp
index 737bd0886ca..1da137306bd 100644
--- a/src/intel/compiler/brw/brw_print.cpp
+++ b/src/intel/compiler/brw/brw_print.cpp
@@ -269,6 +269,11 @@ brw_instruction_name(const struct brw_isa_info *isa, const brw_inst *inst)
 
    case SHADER_OPCODE_FLOW:
       return "flow";
+
+   case SHADER_OPCODE_LSC_FILL:
+      return "fill_lsc";
+   case SHADER_OPCODE_LSC_SPILL:
+      return "spill_lsc";
    }
 
    UNREACHABLE("not reached");
diff --git a/src/intel/compiler/brw/brw_reg_allocate.cpp b/src/intel/compiler/brw/brw_reg_allocate.cpp
index fdb31af96d9..bd3d3bd9b7e 100644
--- a/src/intel/compiler/brw/brw_reg_allocate.cpp
+++ b/src/intel/compiler/brw/brw_reg_allocate.cpp
@@ -296,8 +296,6 @@ private:
 
    bool build_interference_graph(bool allow_spilling);
 
-   brw_reg build_ex_desc(const brw_builder &bld, unsigned reg_size, bool unspill);
-
    brw_reg build_lane_offsets(const brw_builder &bld,
                               uint32_t spill_offset, int ip);
    brw_reg build_single_offset(const brw_builder &bld,
@@ -505,6 +503,10 @@ brw_inst_has_source_and_destination_hazard(const struct intel_device_info *devin
        * be overly conservative.
        */
       return inst->as_dpas()->rcount > 1;
+
+   case SHADER_OPCODE_LSC_FILL:
+      return false;
+
    default:
       /* The SIMD16 compressed instruction
        *
@@ -631,6 +633,13 @@ brw_reg_alloc::setup_inst_interference(const brw_inst *inst)
       ra_add_node_interference(g,
          first_vgrf_node + inst->src[SEND_SRC_PAYLOAD1].nr,
          first_vgrf_node + inst->src[SEND_SRC_PAYLOAD2].nr);
+   } else if (inst->opcode == SHADER_OPCODE_LSC_SPILL &&
+              inst->src[SPILL_SRC_PAYLOAD1].file == VGRF &&
+              inst->src[SPILL_SRC_PAYLOAD2].file == VGRF &&
+              inst->src[SPILL_SRC_PAYLOAD1].nr != inst->src[SPILL_SRC_PAYLOAD2].nr) {
+      ra_add_node_interference(g,
+         first_vgrf_node + inst->src[SPILL_SRC_PAYLOAD1].nr,
+         first_vgrf_node + inst->src[SPILL_SRC_PAYLOAD2].nr);
    }
 
    /* When we do send-from-GRF for FB writes, we need to ensure that the last
@@ -774,43 +783,6 @@ brw_reg_alloc::build_single_offset(const brw_builder &bld, uint32_t spill_offset
    return offset;
 }
 
-brw_reg
-brw_reg_alloc::build_ex_desc(const brw_builder &bld, unsigned reg_size, bool unspill)
-{
-   /* Use a different area of the address register than what is used in
-    * brw_lower_logical_sends.c (brw_address_reg(2)) so we don't have
-    * interactions between the spill/fill instructions and the other send
-    * messages.
-    */
-   brw_reg ex_desc = bld.vaddr(BRW_TYPE_UD,
-                               BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC);
-
-   brw_builder ubld = bld.uniform();
-
-   brw_inst *inst = ubld.AND(ex_desc,
-                             retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
-                             brw_imm_ud(INTEL_MASK(31, 10)));
-   _mesa_set_add(spill_insts, inst);
-
-   const intel_device_info *devinfo = bld.shader->devinfo;
-   if (devinfo->verx10 >= 200) {
-      inst = ubld.SHR(ex_desc, ex_desc, brw_imm_ud(4));
-      _mesa_set_add(spill_insts, inst);
-   } else {
-      if (unspill) {
-         inst = ubld.OR(ex_desc, ex_desc, brw_imm_ud(BRW_SFID_UGM));
-         _mesa_set_add(spill_insts, inst);
-      } else {
-         inst = ubld.OR(ex_desc,
-                        ex_desc,
-                        brw_imm_ud(brw_message_ex_desc(devinfo, reg_size) | BRW_SFID_UGM));
-         _mesa_set_add(spill_insts, inst);
-      }
-   }
-
-   return ex_desc;
-}
-
 brw_reg
 brw_reg_alloc::build_lane_offsets(const brw_builder &bld, uint32_t spill_offset, int ip)
 {
@@ -905,7 +877,6 @@ brw_reg_alloc::emit_unspill(const brw_builder &bld,
    for (unsigned i = 0; i < DIV_ROUND_UP(count, reg_size); i++) {
       ++stats->fill_count;
 
-      brw_send_inst *unspill_inst;
       if (devinfo->verx10 >= 125) {
          /* LSC is limited to SIMD16 (SIMD32 on Xe2) load/store but we can
           * load more using transpose messages.
@@ -921,46 +892,26 @@ brw_reg_alloc::emit_unspill(const brw_builder &bld,
             offset = build_lane_offsets(ubld, spill_offset, ip);
          }
 
-         uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
-                                      LSC_ADDR_SURFTYPE_SS,
-                                      LSC_ADDR_SIZE_A32,
-                                      LSC_DATA_SIZE_D32,
-                                      use_transpose ? reg_size * 8 : 1 /* num_channels */,
-                                      use_transpose,
-                                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
-
-         const brw_reg ex_desc_reg = build_ex_desc(bld, reg_size, true);
-
-         unspill_inst = ubld.SEND();
+         const bool exec_all = use_transpose || bld.has_writemask_all();
+         brw_scratch_inst *unspill_inst = bld.exec_all(exec_all).FILL();
          unspill_inst->dst = dst;
 
-         unspill_inst->src[SEND_SRC_DESC]     = brw_imm_ud(0);
-         unspill_inst->src[SEND_SRC_EX_DESC]  = ex_desc_reg;
-         unspill_inst->src[SEND_SRC_PAYLOAD1] = offset;
-         unspill_inst->src[SEND_SRC_PAYLOAD2] = brw_reg();
+         unspill_inst->src[FILL_SRC_PAYLOAD1] = offset;
 
-         unspill_inst->sfid = BRW_SFID_UGM;
-         unspill_inst->header_size = 0;
-         unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
-                                               unspill_inst->exec_size);
-         unspill_inst->ex_mlen = 0;
+         unspill_inst->offset = spill_offset;
+         unspill_inst->use_transpose = use_transpose;
          unspill_inst->size_written =
             lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
-         unspill_inst->has_side_effects = false;
-         unspill_inst->is_volatile = true;
+         assert(unspill_inst->size_written == (reg_size * REG_SIZE));
 
-         unspill_inst->src[0] = brw_imm_ud(
-            desc |
-            brw_message_desc(devinfo,
-                             unspill_inst->mlen,
-                             unspill_inst->size_written / REG_SIZE,
-                             unspill_inst->header_size));
+         _mesa_set_add(spill_insts, unspill_inst);
+         assert(unspill_inst->force_writemask_all || count % reg_size == 0);
       } else {
          brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
 
          const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
 
-         unspill_inst = bld.SEND();
+         brw_send_inst *unspill_inst = bld.SEND();
          unspill_inst->dst = dst;
 
          unspill_inst->src[SEND_SRC_DESC]     = brw_imm_ud(0);
@@ -983,9 +934,10 @@ brw_reg_alloc::emit_unspill(const brw_builder &bld,
                              unspill_inst->mlen,
                              unspill_inst->size_written / REG_SIZE,
                              unspill_inst->header_size));
+
+         _mesa_set_add(spill_insts, unspill_inst);
+         assert(unspill_inst->force_writemask_all || count % reg_size == 0);
       }
-      _mesa_set_add(spill_insts, unspill_inst);
-      assert(unspill_inst->force_writemask_all || count % reg_size == 0);
 
       dst.offset += reg_size * REG_SIZE;
       spill_offset += reg_size * REG_SIZE;
@@ -1005,48 +957,26 @@ brw_reg_alloc::emit_spill(const brw_builder &bld,
    for (unsigned i = 0; i < DIV_ROUND_UP(count, reg_size); i++) {
       ++stats->spill_count;
 
-      brw_send_inst *spill_inst;
       if (devinfo->verx10 >= 125) {
          brw_reg offset = build_lane_offsets(bld, spill_offset, ip);
 
-         const brw_reg ex_desc_reg = build_ex_desc(bld, reg_size, false);
-
-         spill_inst = bld.SEND();
+         brw_scratch_inst *spill_inst = bld.SPILL();
          spill_inst->dst = bld.null_reg_f();
 
-         spill_inst->src[SEND_SRC_DESC]     = brw_imm_ud(0);
-         spill_inst->src[SEND_SRC_EX_DESC]  = ex_desc_reg;
-         spill_inst->src[SEND_SRC_PAYLOAD1] = offset;
-         spill_inst->src[SEND_SRC_PAYLOAD2] = src;
+         spill_inst->src[SPILL_SRC_PAYLOAD1] = offset;
+         spill_inst->src[SPILL_SRC_PAYLOAD2] = src;
 
-         spill_inst->sfid = BRW_SFID_UGM;
-         uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
-                                      LSC_ADDR_SURFTYPE_SS,
-                                      LSC_ADDR_SIZE_A32,
-                                      LSC_DATA_SIZE_D32,
-                                      1 /* num_channels */,
-                                      false /* transpose */,
-                                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
-         spill_inst->header_size = 0;
-         spill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
-                                             bld.dispatch_width());
-         spill_inst->ex_mlen = reg_size;
-         spill_inst->size_written = 0;
-         spill_inst->has_side_effects = true;
-         spill_inst->is_volatile = false;
+         spill_inst->offset = spill_offset;
+         spill_inst->use_transpose = false;
 
-         spill_inst->src[0] = brw_imm_ud(
-            desc |
-            brw_message_desc(devinfo,
-                             spill_inst->mlen,
-                             spill_inst->size_written / REG_SIZE,
-                             spill_inst->header_size));
+         _mesa_set_add(spill_insts, spill_inst);
+         assert(spill_inst->force_writemask_all || count % reg_size == 0);
       } else {
          brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
 
          const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
 
-         spill_inst = bld.SEND();
+         brw_send_inst *spill_inst = bld.SEND();
          spill_inst->dst = bld.null_reg_f();
 
          spill_inst->src[SEND_SRC_DESC]     = brw_imm_ud(0);
@@ -1072,9 +1002,10 @@ brw_reg_alloc::emit_spill(const brw_builder &bld,
                              spill_inst->header_size));
          spill_inst->src[1] = brw_imm_ud(
             brw_message_ex_desc(devinfo, spill_inst->ex_mlen));
+
+         _mesa_set_add(spill_insts, spill_inst);
+         assert(spill_inst->force_writemask_all || count % reg_size == 0);
       }
-      _mesa_set_add(spill_insts, spill_inst);
-      assert(spill_inst->force_writemask_all || count % reg_size == 0);
 
       src.offset += reg_size * REG_SIZE;
       spill_offset += reg_size * REG_SIZE;
diff --git a/src/intel/compiler/brw/brw_shader.cpp b/src/intel/compiler/brw/brw_shader.cpp
index 4a5a8965222..d5f091566ac 100644
--- a/src/intel/compiler/brw/brw_shader.cpp
+++ b/src/intel/compiler/brw/brw_shader.cpp
@@ -1277,6 +1277,10 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
 
    s.debug_optimizer(nir, "post_ra_alloc", iteration, pass_num);
 
+   if (s.spilled_any_registers) {
+      OPT(brw_lower_fill_and_spill);
+   }
+
    OPT(brw_opt_bank_conflicts);
    OPT_V(brw_schedule_instructions_post_ra);
 
diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h
index e03a38b1cee..b53b4f86bb4 100644
--- a/src/intel/compiler/brw/brw_shader.h
+++ b/src/intel/compiler/brw/brw_shader.h
@@ -328,6 +328,7 @@ bool brw_lower_constant_loads(brw_shader &s);
 bool brw_lower_csel(brw_shader &s);
 bool brw_lower_derivatives(brw_shader &s);
 bool brw_lower_dpas(brw_shader &s);
+bool brw_lower_fill_and_spill(brw_shader &s);
 bool brw_lower_find_live_channel(brw_shader &s);
 bool brw_lower_indirect_mov(brw_shader &s);
 bool brw_lower_integer_multiplication(brw_shader &s);
diff --git a/src/intel/compiler/brw/meson.build b/src/intel/compiler/brw/meson.build
index 0d2e8f28dde..cb44c62083e 100644
--- a/src/intel/compiler/brw/meson.build
+++ b/src/intel/compiler/brw/meson.build
@@ -43,6 +43,7 @@ libintel_compiler_brw_files = files(
   'brw_load_reg.cpp',
   'brw_lower.cpp',
   'brw_lower_dpas.cpp',
+  'brw_lower_fill_spill.cpp',
   'brw_lower_integer_multiplication.cpp',
   'brw_lower_logical_sends.cpp',
   'brw_lower_pack.cpp',