brw: Fix Xe2 spilling code to limit to SIMD32 rather than SIMD16

LSC can do native SIMD32 messages on Xe2. Cuts spill/fills on Lunarlake: - q2rtx-rt-pipeline: -20.83% / -16.85% - Borderlands 3 DX12: -18.26% / -2.09% - Cyberpunk 2077: -2.18% / -0.11% Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32986>
2026-01-06 08:50:09 +01:00 · 2025-01-10 01:30:22 -08:00 · 2025-01-10 01:30:22 -08:00 · 894393470a
commit 894393470a
parent 8ac7802ac8
1 changed files with 17 additions and 7 deletions
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@ -353,9 +353,9 @@ namespace {
   unsigned
   spill_max_size(const fs_visitor *s)
   {
-      /* LSC is limited to SIMD16 sends */
+      /* LSC is limited to SIMD16 sends (SIMD32 on Xe2) */
      if (s->devinfo->has_lsc)
-         return 2;
+         return 2 * reg_unit(s->devinfo);

      /* FINISHME - On Gfx7+ it should be possible to avoid this limit
       *            altogether by spilling directly from the temporary GRF
@ -726,8 +726,7 @@ fs_reg_alloc::build_ex_desc(const fs_builder &bld, unsigned reg_size, bool unspi
 brw_reg
 fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip)
 {
-   /* LSC messages are limited to SIMD16 */
-   assert(bld.dispatch_width() <= 16);
+   assert(bld.dispatch_width() <= 16 * reg_unit(bld.shader->devinfo));

   const fs_builder ubld = bld.exec_all();
   const unsigned reg_count = ubld.dispatch_width() / 8;
@ -755,6 +754,15 @@ fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, i
   inst = ubld.SHL(offset, offset, brw_imm_ud(2));
   _mesa_set_add(spill_insts, inst);

+   /* Build offsets in the upper 16 lanes of SIMD32 */
+   if (ubld.dispatch_width() > 16) {
+      inst = ubld.group(16, 0).ADD(
+         byte_offset(offset, 2 * REG_SIZE),
+         byte_offset(offset, 0),
+         brw_imm_ud(16 << 2));
+      _mesa_set_add(spill_insts, inst);
+   }
+
   /* Add the base offset */
   if (spill_offset) {
      inst = ubld.ADD(offset, offset, brw_imm_ud(spill_offset));
@ -805,10 +813,12 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,

      fs_inst *unspill_inst;
      if (devinfo->verx10 >= 125) {
-         /* LSC is limited to SIMD16 load/store but we can load more using
-          * transpose messages.
+         /* LSC is limited to SIMD16 (SIMD32 on Xe2) load/store but we can
+          * load more using transpose messages.
          */
-         const bool use_transpose = bld.dispatch_width() > 16 || bld.has_writemask_all();
+         const bool use_transpose =
+            bld.dispatch_width() > 16 * reg_unit(devinfo) ||
+            bld.has_writemask_all();
         const fs_builder ubld = use_transpose ? bld.exec_all().group(1, 0) : bld;
         brw_reg offset;
         if (use_transpose) {