From 0b99c88337907715bf43714c061a4ca380847fd0 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 8 Apr 2026 15:20:54 -0700 Subject: [PATCH] nir, brw: lower scratch in NIR This will let us share a common scratch swizzling between brw and jay. Changes by Ken: - Use an immediate SIMD width when known so we don't need to re-lower - Switch to load_simd_width_intel because it may not match info->api_subgroup_size on Vulkan without VK_EXT_subgroup_size_control - Stop using DWord Scattered Write messages for scratch. These take an offset in DWords, and our offsets are now always in bytes. This also means that we no longer create MEMORY_OPCODE_* IR with inconsistent units of either bytes or dwords. Yikes. We use byte scattered messages now. fossil-db stats on Battlemage: Instrs: 500477504 -> 500450056 (-0.01%); split: -0.01%, +0.00% CodeSize: 7807432368 -> 7806786192 (-0.01%); split: -0.01%, +0.00% Cycle count: 62404008370 -> 62398437734 (-0.01%); split: -0.01%, +0.00% Fill count: 546690 -> 546695 (+0.00%); split: -0.00%, +0.00% Max live registers: 141257956 -> 141258100 (+0.00%); split: -0.00%, +0.00% Non SSA regs after NIR: 72350283 -> 72336544 (-0.02%) Totals from 99 (0.01% of 1581969) affected shaders: Instrs: 366593 -> 339145 (-7.49%); split: -7.58%, +0.09% CodeSize: 6425936 -> 5779760 (-10.06%); split: -10.06%, +0.00% Cycle count: 2412009876 -> 2406439240 (-0.23%); split: -0.26%, +0.03% Fill count: 19675 -> 19680 (+0.03%); split: -0.02%, +0.04% Max live registers: 17600 -> 17744 (+0.82%); split: -0.09%, +0.91% Non SSA regs after NIR: 37894 -> 24155 (-36.26%) Signed-off-by: Alyssa Rosenzweig Signed-off-by: Kenneth Graunke Part-of: --- src/compiler/nir/nir_divergence_analysis.c | 1 + src/compiler/nir/nir_intrinsics.py | 7 ++ src/compiler/nir/nir_lower_io.c | 3 + src/intel/compiler/brw/brw_from_nir.cpp | 102 ++---------------- .../compiler/brw/brw_lower_logical_sends.cpp | 10 +- src/intel/compiler/brw/brw_nir.c | 84 ++++++++++++++- 6 files changed, 106 insertions(+), 101 deletions(-) diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 32ebac13524..d51cd55c87a 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -892,6 +892,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_is_helper_invocation: case nir_intrinsic_load_scratch: case nir_intrinsic_load_scratch_nv: + case nir_intrinsic_load_scratch_intel: case nir_intrinsic_deref_atomic: case nir_intrinsic_deref_atomic_swap: case nir_intrinsic_ssbo_atomic: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 264ea5c1d7c..0a2b9a26f47 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2660,6 +2660,13 @@ store("ssbo_block_intel", [-1, 1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET]) # src[] = { value, offset }. store("shared_block_intel", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET]) +# These offsets are into per-subgroup scratch memory, rather than the per-lane +# offsets the standard NIR intrinsics use. +# src[] = { offset }. +load("scratch_intel", [1], [ACCESS], [CAN_ELIMINATE]) +# src[] = { value, offset }. +store("scratch_intel", [1], []) + # src[] = { address }. 
load("global_constant_uniform_block_intel", [1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET, BASE], [CAN_ELIMINATE, CAN_REORDER]) diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index b64bb4bd567..6bb281e6cba 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -992,6 +992,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr) case nir_intrinsic_load_global_nv: case nir_intrinsic_load_scratch: case nir_intrinsic_load_scratch_nv: + case nir_intrinsic_load_scratch_intel: case nir_intrinsic_load_fs_input_interp_deltas: case nir_intrinsic_shared_atomic: case nir_intrinsic_shared_atomic_nv: @@ -1043,6 +1044,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr) case nir_intrinsic_store_urb_lsc_intel: case nir_intrinsic_store_scratch: case nir_intrinsic_store_scratch_nv: + case nir_intrinsic_store_scratch_intel: case nir_intrinsic_ssbo_atomic: case nir_intrinsic_ssbo_atomic_swap: case nir_intrinsic_ldc_nv: @@ -1189,6 +1191,7 @@ nir_get_io_data_src_number(const nir_intrinsic_instr *intr) case nir_intrinsic_store_global_nv: case nir_intrinsic_store_scratch: case nir_intrinsic_store_scratch_nv: + case nir_intrinsic_store_scratch_intel: case nir_intrinsic_store_raw_output_pan: case nir_intrinsic_store_combined_output_pan: case nir_intrinsic_store_tile_pan: diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index 96697e7664d..986ead9f8df 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -4561,82 +4561,6 @@ get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw_builder &bld, return bld.emit_uniformize(retype(surf_index, type)); } -/** - * The offsets we get from NIR act as if each SIMD channel has it's own blob - * of contiguous space. However, if we actually place each SIMD channel in - * it's own space, we end up with terrible cache performance because each SIMD - * channel accesses a different cache line even when they're all accessing the - * same byte offset. To deal with this problem, we swizzle the address using - * a simple algorithm which ensures that any time a SIMD message reads or - * writes the same address, it's all in the same cache line. We have to keep - * the bottom two bits fixed so that we can read/write up to a dword at a time - * and the individual element is contiguous. We do this by splitting the - * address as follows: - * - * 31 4-6 2 0 - * +-------------------------------+------------+----------+ - * | Hi address bits | chan index | addr low | - * +-------------------------------+------------+----------+ - * - * In other words, the bottom two address bits stay, and the top 30 get - * shifted up so that we can stick the SIMD channel index in the middle. This - * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit - * at the same logical offset, the scratch read/write instruction acts on - * continuous elements and we get good cache locality. - */ -static brw_reg -swizzle_nir_scratch_addr(nir_to_brw_state &ntb, - const brw_builder &bld, - const nir_src &nir_addr_src, - bool in_dwords) -{ - brw_shader &s = ntb.s; - - const brw_reg chan_index = bld.LOAD_SUBGROUP_INVOCATION(); - const unsigned chan_index_bits = ffs(s.dispatch_width) - 1; - - if (nir_src_is_const(nir_addr_src)) { - unsigned nir_addr = nir_src_as_uint(nir_addr_src); - if (in_dwords) { - /* In this case, we know the address is aligned to a DWORD and we want - * the final address in DWORDs. 
- */ - return bld.OR(chan_index, - brw_imm_ud(nir_addr << (chan_index_bits - 2))); - } else { - /* This case is substantially more annoying because we have to pay - * attention to those pesky two bottom bits. - */ - unsigned addr_hi = (nir_addr & ~0x3u) << chan_index_bits; - unsigned addr_lo = (nir_addr & 0x3u); - - return bld.OR(bld.SHL(chan_index, brw_imm_ud(2)), - brw_imm_ud(addr_lo | addr_hi)); - } - } - - const brw_reg nir_addr = - retype(get_nir_src(ntb, nir_addr_src, 0), BRW_TYPE_UD); - - if (in_dwords) { - /* In this case, we know the address is aligned to a DWORD and we want - * the final address in DWORDs. - */ - return bld.OR(bld.SHL(nir_addr, brw_imm_ud(chan_index_bits - 2)), - chan_index); - } else { - /* This case substantially more annoying because we have to pay - * attention to those pesky two bottom bits. - */ - brw_reg chan_addr = bld.SHL(chan_index, brw_imm_ud(2)); - brw_reg addr_bits = - bld.OR(bld.AND(nir_addr, brw_imm_ud(0x3u)), - bld.SHL(bld.AND(nir_addr, brw_imm_ud(~0x3u)), - brw_imm_ud(chan_index_bits))); - return bld.OR(addr_bits, chan_addr); - } -} - static unsigned choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords) { @@ -4919,6 +4843,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, case nir_intrinsic_global_atomic_swap: case nir_intrinsic_load_scratch: case nir_intrinsic_store_scratch: + case nir_intrinsic_load_scratch_intel: + case nir_intrinsic_store_scratch_intel: case nir_intrinsic_load_shader_indirect_data_intel: brw_from_nir_emit_memory_access(ntb, bld, xbld, instr); break; @@ -6098,8 +6024,8 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, no_mask_handle = true; break; } - case nir_intrinsic_load_scratch: - case nir_intrinsic_store_scratch: { + case nir_intrinsic_load_scratch_intel: + case nir_intrinsic_store_scratch_intel: { mode = MEMORY_MODE_SCRATCH; const nir_src &addr = instr->src[is_store ? 1 : 0]; @@ -6113,25 +6039,17 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, if (devinfo->ver >= 20 || intel_has_extended_bindless(devinfo)) bind = ubld.SHR(bind, brw_imm_ud(4)); - /* load_scratch / store_scratch cannot be is_scalar yet. */ - assert(xbld.dispatch_width() == bld.dispatch_width()); - srcs[MEMORY_LOGICAL_BINDING] = component(bind, 0); - srcs[MEMORY_LOGICAL_ADDRESS] = - swizzle_nir_scratch_addr(ntb, bld, addr, false); } else { - unsigned bit_size = - is_store ? nir_src_bit_size(instr->src[0]) : instr->def.bit_size; - bool dword_aligned = alignment >= 4 && bit_size == 32; - - /* load_scratch / store_scratch cannot be is_scalar yet. */ - assert(xbld.dispatch_width() == bld.dispatch_width()); - binding_type = LSC_ADDR_SURFTYPE_FLAT; - srcs[MEMORY_LOGICAL_ADDRESS] = - swizzle_nir_scratch_addr(ntb, bld, addr, dword_aligned); } + /* load_scratch / store_scratch cannot be is_scalar yet. 
*/ + assert(xbld.dispatch_width() == bld.dispatch_width()); + + srcs[MEMORY_LOGICAL_ADDRESS] = + retype(get_nir_src(ntb, addr, 0), BRW_TYPE_UD); + if (is_store) ++s.shader_stats.spill_count; else diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp index d4c9e29b1f2..a2618eb1788 100644 --- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp @@ -1425,9 +1425,9 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) 8 * lsc_data_size_bytes(data_size); const bool byte_scattered = - data_bit_size < 32 || (alignment != 0 && alignment < 4); - const bool dword_scattered = !byte_scattered && mode == MEMORY_MODE_SCRATCH; - const bool surface_access = !byte_scattered && !dword_scattered && !block; + data_bit_size < 32 || (alignment != 0 && alignment < 4) || + mode == MEMORY_MODE_SCRATCH; + const bool surface_access = !byte_scattered && !block; /* SLM block reads must use the 16B-aligned OWord Block Read messages, * as the unaligned message doesn't exist for SLM. @@ -1550,7 +1550,6 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) desc = brw_dp_oword_block_rw_desc(devinfo, false, components, !has_dest); } else if (addr_size == LSC_ADDR_SIZE_A64) { assert(binding_type == LSC_ADDR_SURFTYPE_FLAT); - assert(!dword_scattered); sfid = BRW_SFID_HDC1; @@ -1595,9 +1594,6 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) } else if (byte_scattered) { desc = brw_dp_byte_scattered_rw_desc(devinfo, mem->exec_size, data_bit_size, !has_dest); - } else if (dword_scattered) { - desc = brw_dp_dword_scattered_rw_desc(devinfo, mem->exec_size, - !has_dest); } else { desc = brw_dp_untyped_surface_rw_desc(devinfo, mem->exec_size, components, !has_dest); diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c index 50cca73644c..767e833a788 100644 --- a/src/intel/compiler/brw/brw_nir.c +++ b/src/intel/compiler/brw/brw_nir.c @@ -3,7 +3,6 @@ * SPDX-License-Identifier: MIT */ -#include "intel_nir.h" #include "brw_nir.h" #include "brw_private.h" #include "brw_sampler.h" @@ -11,6 +10,82 @@ #include "compiler/nir/nir_builder.h" #include "dev/intel_debug.h" #include "util/sparse_bitset.h" +#include "intel_nir.h" +#include "nir.h" +#include "nir_builder_opcodes.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" + +/* + * Intel scratch swizzling can be described with the formula: + * + * (SIMD width * round_down(offset_B, stride_B)) + + * (lane * stride_B) + + * (offset_B % stride_B) + */ +static nir_def * +swizzle_scratch(nir_builder *b, + nir_def *offset_B, + unsigned stride_B, + unsigned align_B) +{ + struct shader_info *info = &b->shader->info; + + assert(util_is_power_of_two_nonzero(stride_B)); + assert(util_is_power_of_two_nonzero(align_B)); + + nir_def *trailing_B = NULL; + if (align_B < stride_B) { + trailing_B = nir_umod_imm(b, offset_B, stride_B); + offset_B = nir_iand_imm(b, offset_B, ~(stride_B - 1)); + } + + nir_def *simd_width = info->min_subgroup_size == info->max_subgroup_size ? + nir_imm_int(b, info->max_subgroup_size) : + nir_load_simd_width_intel(b); + + nir_def *simd_offs_B = nir_imul(b, simd_width, offset_B); + + nir_def *lane = nir_load_subgroup_invocation(b); + nir_def *lane_offs_B = nir_imul_imm(b, lane, stride_B); + nir_def *swizzled_B = nir_iadd(b, simd_offs_B, lane_offs_B); + + return trailing_B ? 
nir_iadd(b, swizzled_B, trailing_B) : swizzled_B; +} + +static bool +lower_scratch(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + b->cursor = nir_before_instr(&intr->instr); + b->constant_fold_alu = true; + + unsigned stride = 4 /* TODO */; + + if (intr->intrinsic == nir_intrinsic_load_scratch) { + nir_def *val = + nir_load_scratch_intel(b, intr->def.num_components, intr->def.bit_size, + swizzle_scratch(b, intr->src[0].ssa, stride, + nir_intrinsic_align(intr)), + .access = nir_intrinsic_access(intr)); + nir_def_replace(&intr->def, val); + } else if (intr->intrinsic == nir_intrinsic_store_scratch) { + nir_store_scratch_intel(b, intr->src[0].ssa, + swizzle_scratch(b, intr->src[1].ssa, stride, + nir_intrinsic_align(intr))); + nir_instr_remove(&intr->instr); + } else { + return false; + } + + return true; +} + +static bool +intel_nir_lower_scratch(nir_shader *nir) +{ + return nir_shader_intrinsics_pass(nir, lower_scratch, + nir_metadata_control_flow, NULL); +} /** * Returns the minimum number of vec4 elements needed to pack a type. @@ -2493,6 +2568,10 @@ brw_vectorize_lower_mem_access(brw_pass_tracker *pt) OPT(nir_opt_algebraic); OPT(nir_opt_cse); + if (pt->nir->scratch_size) { + OPT(intel_nir_lower_scratch); + } + /* Do this after the vectorization & brw_nir_rebase_const_offset_ubo_loads * so that we maximize the offset put into the messages. */ @@ -3139,6 +3218,7 @@ lsc_op_for_nir_intrinsic(const nir_intrinsic_instr *intrin) case nir_intrinsic_load_ssbo_uniform_block_intel: case nir_intrinsic_load_ubo_uniform_block_intel: case nir_intrinsic_load_scratch: + case nir_intrinsic_load_scratch_intel: case nir_intrinsic_load_shader_indirect_data_intel: return LSC_OP_LOAD; @@ -3149,7 +3229,7 @@ lsc_op_for_nir_intrinsic(const nir_intrinsic_instr *intrin) case nir_intrinsic_store_global_block_intel: case nir_intrinsic_store_shared_block_intel: case nir_intrinsic_store_ssbo_block_intel: - case nir_intrinsic_store_scratch: + case nir_intrinsic_store_scratch_intel: return LSC_OP_STORE; case nir_intrinsic_image_load:
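
A quick way to sanity-check the new swizzling is to evaluate the formula from the brw_nir.c comment on a few concrete addresses. The sketch below is illustrative only and is not part of the patch; the helper name swizzle_scratch_addr is made up, and it assumes a SIMD16 dispatch with the dword stride (stride_B = 4) that lower_scratch currently hardcodes.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the formula from the brw_nir.c comment:
 *
 *   (SIMD width * round_down(offset_B, stride_B)) +
 *   (lane * stride_B) +
 *   (offset_B % stride_B)
 */
static uint32_t
swizzle_scratch_addr(uint32_t offset_B, uint32_t lane,
                     uint32_t simd_width, uint32_t stride_B)
{
   uint32_t aligned_B  = offset_B & ~(stride_B - 1); /* round_down(offset_B, stride_B) */
   uint32_t trailing_B = offset_B & (stride_B - 1);  /* offset_B % stride_B */

   return simd_width * aligned_B + lane * stride_B + trailing_B;
}

int
main(void)
{
   /* All 16 lanes of a SIMD16 shader reading logical offset 8 land in one
    * contiguous 64-byte block: lane 0 at byte 128, lane 15 at byte 188. */
   assert(swizzle_scratch_addr(8, 0, 16, 4) == 128);
   assert(swizzle_scratch_addr(8, 15, 16, 4) == 188);

   /* The next logical dword (offset 12) starts the next 64-byte block. */
   assert(swizzle_scratch_addr(12, 0, 16, 4) == 192);

   printf("scratch swizzle checks passed\n");
   return 0;
}

Because the swizzled result is always a byte address, the HDC fallback path can no longer be handed dword-based offsets, which is why the DWord Scattered messages are dropped in favor of Byte Scattered ones above.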