brw: Add fill and spill opcodes for LSC platforms

These opcodes are emitted during register allocation instead of the
scratch reads and writes that were previously emitted. These
instructions carry additional information (i.e., the instruction
encodes the scratch offset) that enables optimizations to be added
later.

The fill and spill opcodes are lowered to scratch reads and writes
shortly after register allocation. Eventually this lowering may gain
some optimizations (e.g., reusing previous address calculations for
successive spills).
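
To illustrate the point about encoding the offset (not part of this
commit): a minimal, standalone C++ sketch of the kind of optimization
this enables, such as noticing back-to-back spills to adjacent scratch
offsets that could share an address calculation. All types and names
below are simplified stand-ins for illustration, not Mesa's real IR.

#include <cstdio>
#include <vector>

/* Simplified stand-ins for the real IR types (hypothetical). */
enum Opcode { LSC_FILL, LSC_SPILL, OTHER };

struct ScratchInst {
   Opcode op;
   unsigned offset;   /* scratch offset, as on brw_scratch_inst::offset */
   unsigned size;     /* bytes spilled or filled */
};

/* Because the offset is encoded on the instruction rather than buried
 * in an address computation, a later pass can detect spills to
 * adjacent offsets.
 */
static bool adjacent_spills(const ScratchInst &a, const ScratchInst &b)
{
   return a.op == LSC_SPILL && b.op == LSC_SPILL &&
          b.offset == a.offset + a.size;
}

int main()
{
   std::vector<ScratchInst> insts = {
      { LSC_SPILL, 0,  32 },
      { LSC_SPILL, 32, 32 },  /* adjacent: could reuse the address calc */
      { LSC_FILL,  0,  32 },
   };

   for (size_t i = 0; i + 1 < insts.size(); i++) {
      if (adjacent_spills(insts[i], insts[i + 1]))
         printf("spills %zu and %zu could share an address calculation\n",
                i, i + 1);
   }
   return 0;
}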

v2: Add brw_scratch_inst::offset instead of storing it as a
source. Suggested by Lionel.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37827>
Ian Romanick, 2025-06-17 16:35:49 -07:00 (committed by Marge Bot)
parent 2215003d95
commit b7f5285ad3
11 changed files with 306 additions and 103 deletions

View file

@@ -638,6 +638,18 @@ public:
return emit(SHADER_OPCODE_SEND, SEND_NUM_SRCS)->as_send();
}
brw_scratch_inst *
FILL() const
{
return emit(SHADER_OPCODE_LSC_FILL, FILL_NUM_SRCS)->as_scratch();
}
brw_scratch_inst *
SPILL() const
{
return emit(SHADER_OPCODE_LSC_SPILL, SPILL_NUM_SRCS)->as_scratch();
}
brw_urb_inst *
URB_WRITE(const brw_reg srcs[], unsigned num_srcs) const
{

View file

@@ -549,6 +549,9 @@ enum ENUM_PACKED opcode {
* Acts as a scheduling barrier.
*/
SHADER_OPCODE_LOAD_REG,
SHADER_OPCODE_LSC_FILL,
SHADER_OPCODE_LSC_SPILL,
};
enum send_srcs {
@@ -714,6 +717,23 @@ enum interpolator_logical_srcs {
INTERP_NUM_SRCS
};
enum spill_srcs {
/** Register used for the address in scratch space. */
SPILL_SRC_PAYLOAD1,
/** Register to be spilled. */
SPILL_SRC_PAYLOAD2,
SPILL_NUM_SRCS
};
enum fill_srcs {
/** Register used for the address in scratch space. */
FILL_SRC_PAYLOAD1,
FILL_NUM_SRCS
};
enum brw_reduce_op {
BRW_REDUCE_OP_ADD,
BRW_REDUCE_OP_MUL,

View file

@@ -232,6 +232,10 @@ brw_inst_kind_for_opcode(enum opcode opcode)
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
return BRW_KIND_LOGICAL;
case SHADER_OPCODE_LSC_FILL:
case SHADER_OPCODE_LSC_SPILL:
return BRW_KIND_SCRATCH;
default:
return BRW_KIND_BASE;
}
@@ -302,6 +306,12 @@ brw_inst::is_payload(unsigned arg) const
case SHADER_OPCODE_SEND:
return arg >= SEND_SRC_PAYLOAD1;
case SHADER_OPCODE_LSC_FILL:
return arg == FILL_SRC_PAYLOAD1;
case SHADER_OPCODE_LSC_SPILL:
return arg == SPILL_SRC_PAYLOAD1 || arg == SPILL_SRC_PAYLOAD2;
case SHADER_OPCODE_SEND_GATHER:
return arg >= SEND_GATHER_SRC_SCALAR;
@@ -551,6 +561,25 @@ brw_inst::size_read(const struct intel_device_info *devinfo, int arg) const
}
break;
case SHADER_OPCODE_LSC_FILL:
if (arg == FILL_SRC_PAYLOAD1) {
return lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
as_scratch()->use_transpose ? 1 : exec_size) *
REG_SIZE;
}
break;
case SHADER_OPCODE_LSC_SPILL:
if (arg == SPILL_SRC_PAYLOAD1) {
assert(!as_scratch()->use_transpose);
return lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, exec_size) *
REG_SIZE;
} else if (arg == SPILL_SRC_PAYLOAD2) {
return src[arg].component_size(exec_size);
}
break;
case SHADER_OPCODE_SEND_GATHER:
if (arg >= SEND_GATHER_SRC_PAYLOAD) {
/* SEND_GATHER is Xe3+, so no need to pass devinfo around. */
@@ -940,6 +969,7 @@ brw_inst::has_side_effects() const
return as_send()->has_side_effects;
case BRW_OPCODE_SYNC:
case SHADER_OPCODE_LSC_SPILL:
case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
case SHADER_OPCODE_MEMORY_FENCE:
@@ -965,6 +995,7 @@ brw_inst::is_volatile() const
switch (opcode) {
case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
case SHADER_OPCODE_LOAD_REG:
case SHADER_OPCODE_LSC_FILL:
return true;
case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
return as_mem()->flags & MEMORY_FLAG_VOLATILE_ACCESS;

View file

@@ -50,6 +50,7 @@ enum ENUM_PACKED brw_inst_kind {
BRW_KIND_LOAD_PAYLOAD,
BRW_KIND_URB,
BRW_KIND_FB_WRITE,
BRW_KIND_SCRATCH,
};
brw_inst_kind brw_inst_kind_for_opcode(enum opcode opcode);
@@ -82,6 +83,7 @@ struct brw_inst : brw_exec_node {
KIND_HELPERS(as_load_payload, brw_load_payload_inst, BRW_KIND_LOAD_PAYLOAD);
KIND_HELPERS(as_urb, brw_urb_inst, BRW_KIND_URB);
KIND_HELPERS(as_fb_write, brw_fb_write_inst, BRW_KIND_FB_WRITE);
KIND_HELPERS(as_scratch, brw_scratch_inst, BRW_KIND_SCRATCH);
#undef KIND_HELPERS
@@ -370,6 +372,18 @@ struct brw_fb_write_inst : brw_inst {
bool last_rt;
};
struct brw_scratch_inst : brw_inst {
/** Offset in scratch space for the load or store. */
unsigned offset;
/**
* Should an LSC transpose message be used for the fill?
*
* Currently this must be false for spills.
*/
bool use_transpose;
};
/**
* Make the execution of \p inst dependent on the evaluation of a possibly
* inverted predicate.

View file

@@ -0,0 +1,181 @@
/*
* Copyright 2025 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_shader.h"
#include "brw_builder.h"
static brw_reg
build_ex_desc(const brw_builder &bld, unsigned reg_size, bool unspill)
{
/* Use a different area of the address register than what is used in
* brw_lower_logical_sends.c (brw_address_reg(2)) so we don't have
* interactions between the spill/fill instructions and the other send
* messages.
*/
brw_reg ex_desc = bld.vaddr(BRW_TYPE_UD,
BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC);
brw_builder ubld = bld.uniform();
ubld.AND(ex_desc,
retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 10)));
const intel_device_info *devinfo = bld.shader->devinfo;
if (devinfo->verx10 >= 200) {
ubld.SHR(ex_desc, ex_desc, brw_imm_ud(4));
} else {
if (unspill) {
ubld.OR(ex_desc, ex_desc, brw_imm_ud(BRW_SFID_UGM));
} else {
ubld.OR(ex_desc,
ex_desc,
brw_imm_ud(brw_message_ex_desc(devinfo, reg_size) | BRW_SFID_UGM));
}
}
return ex_desc;
}
static void
brw_lower_lsc_fill(const intel_device_info *devinfo, brw_shader &s,
brw_inst *inst)
{
assert(devinfo->verx10 >= 125);
const brw_builder bld(inst);
brw_reg dst = inst->dst;
brw_reg offset = inst->src[FILL_SRC_PAYLOAD1];
const unsigned reg_size = inst->dst.component_size(inst->exec_size) /
REG_SIZE;
brw_reg ex_desc = build_ex_desc(bld, reg_size, true);
/* LSC is limited to SIMD16 (SIMD32 on Xe2) load/store but we can
* load more using transpose messages.
*/
const bool use_transpose = inst->as_scratch()->use_transpose;
const brw_builder ubld = use_transpose ? bld.uniform() : bld;
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
LSC_ADDR_SURFTYPE_SS,
LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32,
use_transpose ? reg_size * 8 : 1 /* num_channels */,
use_transpose,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
brw_send_inst *unspill_inst = ubld.SEND();
unspill_inst->dst = dst;
unspill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0);
unspill_inst->src[SEND_SRC_EX_DESC] = ex_desc;
unspill_inst->src[SEND_SRC_PAYLOAD1] = offset;
unspill_inst->src[SEND_SRC_PAYLOAD2] = brw_reg();
unspill_inst->sfid = BRW_SFID_UGM;
unspill_inst->header_size = 0;
unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
unspill_inst->exec_size);
unspill_inst->ex_mlen = 0;
unspill_inst->size_written =
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
unspill_inst->has_side_effects = false;
unspill_inst->is_volatile = true;
unspill_inst->src[0] = brw_imm_ud(
desc |
brw_message_desc(devinfo,
unspill_inst->mlen,
unspill_inst->size_written / REG_SIZE,
unspill_inst->header_size));
assert(unspill_inst->size_written == inst->size_written);
assert(unspill_inst->size_read(devinfo, SEND_SRC_PAYLOAD1) == inst->size_read(devinfo, FILL_SRC_PAYLOAD1));
inst->remove();
}
static void
brw_lower_lsc_spill(const intel_device_info *devinfo, brw_inst *inst)
{
assert(devinfo->verx10 >= 125);
const brw_builder bld(inst);
brw_reg offset = inst->src[SPILL_SRC_PAYLOAD1];
brw_reg src = inst->src[SPILL_SRC_PAYLOAD2];
const unsigned reg_size = src.component_size(bld.dispatch_width()) /
REG_SIZE;
assert(!inst->as_scratch()->use_transpose);
const brw_reg ex_desc = build_ex_desc(bld, reg_size, false);
brw_send_inst *spill_inst = bld.SEND();
spill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0);
spill_inst->src[SEND_SRC_EX_DESC] = ex_desc;
spill_inst->src[SEND_SRC_PAYLOAD1] = offset;
spill_inst->src[SEND_SRC_PAYLOAD2] = src;
spill_inst->sfid = BRW_SFID_UGM;
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
LSC_ADDR_SURFTYPE_SS,
LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32,
1 /* num_channels */,
false /* transpose */,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
spill_inst->header_size = 0;
spill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
bld.dispatch_width());
spill_inst->ex_mlen = reg_size;
spill_inst->size_written = 0;
spill_inst->has_side_effects = true;
spill_inst->is_volatile = false;
spill_inst->src[0] = brw_imm_ud(
desc |
brw_message_desc(devinfo,
spill_inst->mlen,
spill_inst->size_written / REG_SIZE,
spill_inst->header_size));
assert(spill_inst->size_written == inst->size_written);
assert(spill_inst->size_read(devinfo, SEND_SRC_PAYLOAD1) == inst->size_read(devinfo, SPILL_SRC_PAYLOAD1));
assert(spill_inst->size_read(devinfo, SEND_SRC_PAYLOAD2) == inst->size_read(devinfo, SPILL_SRC_PAYLOAD2));
inst->remove();
}
bool
brw_lower_fill_and_spill(brw_shader &s)
{
bool progress = false;
foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
switch (inst->opcode) {
case SHADER_OPCODE_LSC_FILL:
brw_lower_lsc_fill(s.devinfo, s, inst);
progress = true;
break;
case SHADER_OPCODE_LSC_SPILL:
brw_lower_lsc_spill(s.devinfo, inst);
progress = true;
break;
default:
break;
}
}
if (progress)
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
BRW_DEPENDENCY_VARIABLES);
return progress;
}
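
For reference, a standalone sketch of the extended-descriptor math that
build_ex_desc above performs on the scratch surface-state pointer,
assuming a raw r0.5 value. SFID_UGM and the mask helper are
placeholders standing in for Mesa's BRW_SFID_UGM and INTEL_MASK; the
spill-only brw_message_ex_desc() bits are elided.

#include <cstdint>
#include <cstdio>

static const uint32_t SFID_UGM = 0xf;  /* placeholder value */

/* Analogue of INTEL_MASK(hi, lo): set bits hi..lo. */
static uint32_t mask_bits(unsigned hi, unsigned lo)
{
   return (~0u >> (31 - hi + lo)) << lo;
}

/* The scratch base lives in r0.5 bits 31:10. Xe2+ (verx10 >= 200)
 * shifts it into ex-desc position; earlier LSC parts OR in the SFID
 * (and, for spills, the extra message length bits, omitted here).
 */
static uint32_t build_ex_desc_value(uint32_t r0_5, int verx10, bool unspill)
{
   uint32_t ex_desc = r0_5 & mask_bits(31, 10);
   if (verx10 >= 200)
      return ex_desc >> 4;
   (void)unspill;
   return ex_desc | SFID_UGM;
}

int main()
{
   printf("ex_desc = 0x%08x\n", build_ex_desc_value(0x12345fffu, 125, true));
   return 0;
}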

View file

@@ -473,6 +473,9 @@ hash_inst(const void *v)
case BRW_KIND_BASE:
/* Nothing else to do. */
break;
case BRW_KIND_SCRATCH:
UNREACHABLE("Spill and fills should not exist yet.");
}
if (inst->opcode == BRW_OPCODE_MAD) {

View file

@@ -269,6 +269,11 @@ brw_instruction_name(const struct brw_isa_info *isa, const brw_inst *inst)
case SHADER_OPCODE_FLOW:
return "flow";
case SHADER_OPCODE_LSC_FILL:
return "fill_lsc";
case SHADER_OPCODE_LSC_SPILL:
return "spill_lsc";
}
UNREACHABLE("not reached");

View file

@@ -296,8 +296,6 @@ private:
bool build_interference_graph(bool allow_spilling);
brw_reg build_ex_desc(const brw_builder &bld, unsigned reg_size, bool unspill);
brw_reg build_lane_offsets(const brw_builder &bld,
uint32_t spill_offset, int ip);
brw_reg build_single_offset(const brw_builder &bld,
@@ -505,6 +503,10 @@ brw_inst_has_source_and_destination_hazard(const struct intel_device_info *devin
* be overly conservative.
*/
return inst->as_dpas()->rcount > 1;
case SHADER_OPCODE_LSC_FILL:
return false;
default:
/* The SIMD16 compressed instruction
*
@@ -631,6 +633,13 @@ brw_reg_alloc::setup_inst_interference(const brw_inst *inst)
ra_add_node_interference(g,
first_vgrf_node + inst->src[SEND_SRC_PAYLOAD1].nr,
first_vgrf_node + inst->src[SEND_SRC_PAYLOAD2].nr);
} else if (inst->opcode == SHADER_OPCODE_LSC_SPILL &&
inst->src[SPILL_SRC_PAYLOAD1].file == VGRF &&
inst->src[SPILL_SRC_PAYLOAD2].file == VGRF &&
inst->src[SPILL_SRC_PAYLOAD1].nr != inst->src[SPILL_SRC_PAYLOAD2].nr) {
ra_add_node_interference(g,
first_vgrf_node + inst->src[SPILL_SRC_PAYLOAD1].nr,
first_vgrf_node + inst->src[SPILL_SRC_PAYLOAD2].nr);
}
/* When we do send-from-GRF for FB writes, we need to ensure that the last
@@ -774,43 +783,6 @@ brw_reg_alloc::build_single_offset(const brw_builder &bld, uint32_t spill_offset
return offset;
}
brw_reg
brw_reg_alloc::build_ex_desc(const brw_builder &bld, unsigned reg_size, bool unspill)
{
/* Use a different area of the address register than what is used in
* brw_lower_logical_sends.c (brw_address_reg(2)) so we don't have
* interactions between the spill/fill instructions and the other send
* messages.
*/
brw_reg ex_desc = bld.vaddr(BRW_TYPE_UD,
BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC);
brw_builder ubld = bld.uniform();
brw_inst *inst = ubld.AND(ex_desc,
retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 10)));
_mesa_set_add(spill_insts, inst);
const intel_device_info *devinfo = bld.shader->devinfo;
if (devinfo->verx10 >= 200) {
inst = ubld.SHR(ex_desc, ex_desc, brw_imm_ud(4));
_mesa_set_add(spill_insts, inst);
} else {
if (unspill) {
inst = ubld.OR(ex_desc, ex_desc, brw_imm_ud(BRW_SFID_UGM));
_mesa_set_add(spill_insts, inst);
} else {
inst = ubld.OR(ex_desc,
ex_desc,
brw_imm_ud(brw_message_ex_desc(devinfo, reg_size) | BRW_SFID_UGM));
_mesa_set_add(spill_insts, inst);
}
}
return ex_desc;
}
brw_reg
brw_reg_alloc::build_lane_offsets(const brw_builder &bld, uint32_t spill_offset, int ip)
{
@@ -905,7 +877,6 @@ brw_reg_alloc::emit_unspill(const brw_builder &bld,
for (unsigned i = 0; i < DIV_ROUND_UP(count, reg_size); i++) {
++stats->fill_count;
brw_send_inst *unspill_inst;
if (devinfo->verx10 >= 125) {
/* LSC is limited to SIMD16 (SIMD32 on Xe2) load/store but we can
* load more using transpose messages.
@@ -921,46 +892,26 @@ brw_reg_alloc::emit_unspill(const brw_builder &bld,
offset = build_lane_offsets(ubld, spill_offset, ip);
}
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
LSC_ADDR_SURFTYPE_SS,
LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32,
use_transpose ? reg_size * 8 : 1 /* num_channels */,
use_transpose,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
const brw_reg ex_desc_reg = build_ex_desc(bld, reg_size, true);
unspill_inst = ubld.SEND();
const bool exec_all = use_transpose || bld.has_writemask_all();
brw_scratch_inst *unspill_inst = bld.exec_all(exec_all).FILL();
unspill_inst->dst = dst;
unspill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0);
unspill_inst->src[SEND_SRC_EX_DESC] = ex_desc_reg;
unspill_inst->src[SEND_SRC_PAYLOAD1] = offset;
unspill_inst->src[SEND_SRC_PAYLOAD2] = brw_reg();
unspill_inst->src[FILL_SRC_PAYLOAD1] = offset;
unspill_inst->sfid = BRW_SFID_UGM;
unspill_inst->header_size = 0;
unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
unspill_inst->exec_size);
unspill_inst->ex_mlen = 0;
unspill_inst->offset = spill_offset;
unspill_inst->use_transpose = use_transpose;
unspill_inst->size_written =
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
unspill_inst->has_side_effects = false;
unspill_inst->is_volatile = true;
assert(unspill_inst->size_written == (reg_size * REG_SIZE));
unspill_inst->src[0] = brw_imm_ud(
desc |
brw_message_desc(devinfo,
unspill_inst->mlen,
unspill_inst->size_written / REG_SIZE,
unspill_inst->header_size));
_mesa_set_add(spill_insts, unspill_inst);
assert(unspill_inst->force_writemask_all || count % reg_size == 0);
} else {
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
unspill_inst = bld.SEND();
brw_send_inst *unspill_inst = bld.SEND();
unspill_inst->dst = dst;
unspill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0);
@@ -983,9 +934,10 @@ brw_reg_alloc::emit_unspill(const brw_builder &bld,
unspill_inst->mlen,
unspill_inst->size_written / REG_SIZE,
unspill_inst->header_size));
_mesa_set_add(spill_insts, unspill_inst);
assert(unspill_inst->force_writemask_all || count % reg_size == 0);
}
_mesa_set_add(spill_insts, unspill_inst);
assert(unspill_inst->force_writemask_all || count % reg_size == 0);
dst.offset += reg_size * REG_SIZE;
spill_offset += reg_size * REG_SIZE;
@@ -1005,48 +957,26 @@ brw_reg_alloc::emit_spill(const brw_builder &bld,
for (unsigned i = 0; i < DIV_ROUND_UP(count, reg_size); i++) {
++stats->spill_count;
brw_send_inst *spill_inst;
if (devinfo->verx10 >= 125) {
brw_reg offset = build_lane_offsets(bld, spill_offset, ip);
const brw_reg ex_desc_reg = build_ex_desc(bld, reg_size, false);
spill_inst = bld.SEND();
brw_scratch_inst *spill_inst = bld.SPILL();
spill_inst->dst = bld.null_reg_f();
spill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0);
spill_inst->src[SEND_SRC_EX_DESC] = ex_desc_reg;
spill_inst->src[SEND_SRC_PAYLOAD1] = offset;
spill_inst->src[SEND_SRC_PAYLOAD2] = src;
spill_inst->src[SPILL_SRC_PAYLOAD1] = offset;
spill_inst->src[SPILL_SRC_PAYLOAD2] = src;
spill_inst->sfid = BRW_SFID_UGM;
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
LSC_ADDR_SURFTYPE_SS,
LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32,
1 /* num_channels */,
false /* transpose */,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
spill_inst->header_size = 0;
spill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
bld.dispatch_width());
spill_inst->ex_mlen = reg_size;
spill_inst->size_written = 0;
spill_inst->has_side_effects = true;
spill_inst->is_volatile = false;
spill_inst->offset = spill_offset;
spill_inst->use_transpose = false;
spill_inst->src[0] = brw_imm_ud(
desc |
brw_message_desc(devinfo,
spill_inst->mlen,
spill_inst->size_written / REG_SIZE,
spill_inst->header_size));
_mesa_set_add(spill_insts, spill_inst);
assert(spill_inst->force_writemask_all || count % reg_size == 0);
} else {
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
spill_inst = bld.SEND();
brw_send_inst *spill_inst = bld.SEND();
spill_inst->dst = bld.null_reg_f();
spill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0);
@@ -1072,9 +1002,10 @@ brw_reg_alloc::emit_spill(const brw_builder &bld,
spill_inst->header_size));
spill_inst->src[1] = brw_imm_ud(
brw_message_ex_desc(devinfo, spill_inst->ex_mlen));
_mesa_set_add(spill_insts, spill_inst);
assert(spill_inst->force_writemask_all || count % reg_size == 0);
}
_mesa_set_add(spill_insts, spill_inst);
assert(spill_inst->force_writemask_all || count % reg_size == 0);
src.offset += reg_size * REG_SIZE;
spill_offset += reg_size * REG_SIZE;

View file

@@ -1277,6 +1277,10 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
s.debug_optimizer(nir, "post_ra_alloc", iteration, pass_num);
if (s.spilled_any_registers) {
OPT(brw_lower_fill_and_spill);
}
OPT(brw_opt_bank_conflicts);
OPT_V(brw_schedule_instructions_post_ra);

View file

@@ -328,6 +328,7 @@ bool brw_lower_constant_loads(brw_shader &s);
bool brw_lower_csel(brw_shader &s);
bool brw_lower_derivatives(brw_shader &s);
bool brw_lower_dpas(brw_shader &s);
bool brw_lower_fill_and_spill(brw_shader &s);
bool brw_lower_find_live_channel(brw_shader &s);
bool brw_lower_indirect_mov(brw_shader &s);
bool brw_lower_integer_multiplication(brw_shader &s);

View file

@@ -43,6 +43,7 @@ libintel_compiler_brw_files = files(
'brw_load_reg.cpp',
'brw_lower.cpp',
'brw_lower_dpas.cpp',
'brw_lower_fill_spill.cpp',
'brw_lower_integer_multiplication.cpp',
'brw_lower_logical_sends.cpp',
'brw_lower_pack.cpp',