mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 02:38:04 +02:00
nir, brw: lower scratch in NIR
This will let us share a common scratch swizzling between brw and jay. Changes by Ken: - Use an immediate SIMD width when known so we don't need to re-lower - Switch to load_simd_width_intel because it may not match info->api_subgroup_size on Vulkan without VK_EXT_subgroup_size_control - Stop using DWord Scattered Write messages for scratch. These take an offset in DWords, and our offsets are now always in bytes. This also means that we no longer create MEMORY_OPCODE_* IR with inconsistent units of either bytes or dwords. Yikes. We use byte scattered messages now. fossil-db stats on Battlemage: Instrs: 500477504 -> 500450056 (-0.01%); split: -0.01%, +0.00% CodeSize: 7807432368 -> 7806786192 (-0.01%); split: -0.01%, +0.00% Cycle count: 62404008370 -> 62398437734 (-0.01%); split: -0.01%, +0.00% Fill count: 546690 -> 546695 (+0.00%); split: -0.00%, +0.00% Max live registers: 141257956 -> 141258100 (+0.00%); split: -0.00%, +0.00% Non SSA regs after NIR: 72350283 -> 72336544 (-0.02%) Totals from 99 (0.01% of 1581969) affected shaders: Instrs: 366593 -> 339145 (-7.49%); split: -7.58%, +0.09% CodeSize: 6425936 -> 5779760 (-10.06%); split: -10.06%, +0.00% Cycle count: 2412009876 -> 2406439240 (-0.23%); split: -0.26%, +0.03% Fill count: 19675 -> 19680 (+0.03%); split: -0.02%, +0.04% Max live registers: 17600 -> 17744 (+0.82%); split: -0.09%, +0.91% Non SSA regs after NIR: 37894 -> 24155 (-36.26%) Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40843>
This commit is contained in:
parent
140616d26a
commit
0b99c88337
6 changed files with 106 additions and 101 deletions
|
|
@ -892,6 +892,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|||
case nir_intrinsic_is_helper_invocation:
|
||||
case nir_intrinsic_load_scratch:
|
||||
case nir_intrinsic_load_scratch_nv:
|
||||
case nir_intrinsic_load_scratch_intel:
|
||||
case nir_intrinsic_deref_atomic:
|
||||
case nir_intrinsic_deref_atomic_swap:
|
||||
case nir_intrinsic_ssbo_atomic:
|
||||
|
|
|
|||
|
|
@ -2660,6 +2660,13 @@ store("ssbo_block_intel", [-1, 1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET])
|
|||
# src[] = { value, offset }.
store("shared_block_intel", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET])

# Per-subgroup scratch access.  Unlike the standard NIR scratch intrinsics,
# whose offsets are per-lane, these offsets address a single blob of scratch
# shared by the whole subgroup (the backend swizzles lane offsets into this
# space before emitting them).
# src[] = { offset }.
load("scratch_intel", [1], [ACCESS], [CAN_ELIMINATE])
# src[] = { value, offset }.
store("scratch_intel", [1], [])

# src[] = { address }.
load("global_constant_uniform_block_intel", [1],
     [ACCESS, ALIGN_MUL, ALIGN_OFFSET, BASE], [CAN_ELIMINATE, CAN_REORDER])
|
||||
|
|
|
|||
|
|
@ -992,6 +992,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
|
|||
case nir_intrinsic_load_global_nv:
|
||||
case nir_intrinsic_load_scratch:
|
||||
case nir_intrinsic_load_scratch_nv:
|
||||
case nir_intrinsic_load_scratch_intel:
|
||||
case nir_intrinsic_load_fs_input_interp_deltas:
|
||||
case nir_intrinsic_shared_atomic:
|
||||
case nir_intrinsic_shared_atomic_nv:
|
||||
|
|
@ -1043,6 +1044,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
|
|||
case nir_intrinsic_store_urb_lsc_intel:
|
||||
case nir_intrinsic_store_scratch:
|
||||
case nir_intrinsic_store_scratch_nv:
|
||||
case nir_intrinsic_store_scratch_intel:
|
||||
case nir_intrinsic_ssbo_atomic:
|
||||
case nir_intrinsic_ssbo_atomic_swap:
|
||||
case nir_intrinsic_ldc_nv:
|
||||
|
|
@ -1189,6 +1191,7 @@ nir_get_io_data_src_number(const nir_intrinsic_instr *intr)
|
|||
case nir_intrinsic_store_global_nv:
|
||||
case nir_intrinsic_store_scratch:
|
||||
case nir_intrinsic_store_scratch_nv:
|
||||
case nir_intrinsic_store_scratch_intel:
|
||||
case nir_intrinsic_store_raw_output_pan:
|
||||
case nir_intrinsic_store_combined_output_pan:
|
||||
case nir_intrinsic_store_tile_pan:
|
||||
|
|
|
|||
|
|
@ -4561,82 +4561,6 @@ get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw_builder &bld,
|
|||
return bld.emit_uniformize(retype(surf_index, type));
|
||||
}
|
||||
|
||||
/**
|
||||
* The offsets we get from NIR act as if each SIMD channel has it's own blob
|
||||
* of contiguous space. However, if we actually place each SIMD channel in
|
||||
* it's own space, we end up with terrible cache performance because each SIMD
|
||||
* channel accesses a different cache line even when they're all accessing the
|
||||
* same byte offset. To deal with this problem, we swizzle the address using
|
||||
* a simple algorithm which ensures that any time a SIMD message reads or
|
||||
* writes the same address, it's all in the same cache line. We have to keep
|
||||
* the bottom two bits fixed so that we can read/write up to a dword at a time
|
||||
* and the individual element is contiguous. We do this by splitting the
|
||||
* address as follows:
|
||||
*
|
||||
* 31 4-6 2 0
|
||||
* +-------------------------------+------------+----------+
|
||||
* | Hi address bits | chan index | addr low |
|
||||
* +-------------------------------+------------+----------+
|
||||
*
|
||||
* In other words, the bottom two address bits stay, and the top 30 get
|
||||
* shifted up so that we can stick the SIMD channel index in the middle. This
|
||||
* way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
|
||||
* at the same logical offset, the scratch read/write instruction acts on
|
||||
* continuous elements and we get good cache locality.
|
||||
*/
|
||||
static brw_reg
|
||||
swizzle_nir_scratch_addr(nir_to_brw_state &ntb,
|
||||
const brw_builder &bld,
|
||||
const nir_src &nir_addr_src,
|
||||
bool in_dwords)
|
||||
{
|
||||
brw_shader &s = ntb.s;
|
||||
|
||||
const brw_reg chan_index = bld.LOAD_SUBGROUP_INVOCATION();
|
||||
const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
|
||||
|
||||
if (nir_src_is_const(nir_addr_src)) {
|
||||
unsigned nir_addr = nir_src_as_uint(nir_addr_src);
|
||||
if (in_dwords) {
|
||||
/* In this case, we know the address is aligned to a DWORD and we want
|
||||
* the final address in DWORDs.
|
||||
*/
|
||||
return bld.OR(chan_index,
|
||||
brw_imm_ud(nir_addr << (chan_index_bits - 2)));
|
||||
} else {
|
||||
/* This case is substantially more annoying because we have to pay
|
||||
* attention to those pesky two bottom bits.
|
||||
*/
|
||||
unsigned addr_hi = (nir_addr & ~0x3u) << chan_index_bits;
|
||||
unsigned addr_lo = (nir_addr & 0x3u);
|
||||
|
||||
return bld.OR(bld.SHL(chan_index, brw_imm_ud(2)),
|
||||
brw_imm_ud(addr_lo | addr_hi));
|
||||
}
|
||||
}
|
||||
|
||||
const brw_reg nir_addr =
|
||||
retype(get_nir_src(ntb, nir_addr_src, 0), BRW_TYPE_UD);
|
||||
|
||||
if (in_dwords) {
|
||||
/* In this case, we know the address is aligned to a DWORD and we want
|
||||
* the final address in DWORDs.
|
||||
*/
|
||||
return bld.OR(bld.SHL(nir_addr, brw_imm_ud(chan_index_bits - 2)),
|
||||
chan_index);
|
||||
} else {
|
||||
/* This case substantially more annoying because we have to pay
|
||||
* attention to those pesky two bottom bits.
|
||||
*/
|
||||
brw_reg chan_addr = bld.SHL(chan_index, brw_imm_ud(2));
|
||||
brw_reg addr_bits =
|
||||
bld.OR(bld.AND(nir_addr, brw_imm_ud(0x3u)),
|
||||
bld.SHL(bld.AND(nir_addr, brw_imm_ud(~0x3u)),
|
||||
brw_imm_ud(chan_index_bits)));
|
||||
return bld.OR(addr_bits, chan_addr);
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned
|
||||
choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords)
|
||||
{
|
||||
|
|
@ -4919,6 +4843,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
|||
case nir_intrinsic_global_atomic_swap:
|
||||
case nir_intrinsic_load_scratch:
|
||||
case nir_intrinsic_store_scratch:
|
||||
case nir_intrinsic_load_scratch_intel:
|
||||
case nir_intrinsic_store_scratch_intel:
|
||||
case nir_intrinsic_load_shader_indirect_data_intel:
|
||||
brw_from_nir_emit_memory_access(ntb, bld, xbld, instr);
|
||||
break;
|
||||
|
|
@ -6098,8 +6024,8 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
|
|||
no_mask_handle = true;
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_scratch:
|
||||
case nir_intrinsic_store_scratch: {
|
||||
case nir_intrinsic_load_scratch_intel:
|
||||
case nir_intrinsic_store_scratch_intel: {
|
||||
mode = MEMORY_MODE_SCRATCH;
|
||||
|
||||
const nir_src &addr = instr->src[is_store ? 1 : 0];
|
||||
|
|
@ -6113,25 +6039,17 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
|
|||
if (devinfo->ver >= 20 || intel_has_extended_bindless(devinfo))
|
||||
bind = ubld.SHR(bind, brw_imm_ud(4));
|
||||
|
||||
/* load_scratch / store_scratch cannot be is_scalar yet. */
|
||||
assert(xbld.dispatch_width() == bld.dispatch_width());
|
||||
|
||||
srcs[MEMORY_LOGICAL_BINDING] = component(bind, 0);
|
||||
srcs[MEMORY_LOGICAL_ADDRESS] =
|
||||
swizzle_nir_scratch_addr(ntb, bld, addr, false);
|
||||
} else {
|
||||
unsigned bit_size =
|
||||
is_store ? nir_src_bit_size(instr->src[0]) : instr->def.bit_size;
|
||||
bool dword_aligned = alignment >= 4 && bit_size == 32;
|
||||
|
||||
/* load_scratch / store_scratch cannot be is_scalar yet. */
|
||||
assert(xbld.dispatch_width() == bld.dispatch_width());
|
||||
|
||||
binding_type = LSC_ADDR_SURFTYPE_FLAT;
|
||||
srcs[MEMORY_LOGICAL_ADDRESS] =
|
||||
swizzle_nir_scratch_addr(ntb, bld, addr, dword_aligned);
|
||||
}
|
||||
|
||||
/* load_scratch / store_scratch cannot be is_scalar yet. */
|
||||
assert(xbld.dispatch_width() == bld.dispatch_width());
|
||||
|
||||
srcs[MEMORY_LOGICAL_ADDRESS] =
|
||||
retype(get_nir_src(ntb, addr, 0), BRW_TYPE_UD);
|
||||
|
||||
if (is_store)
|
||||
++s.shader_stats.spill_count;
|
||||
else
|
||||
|
|
|
|||
|
|
@ -1425,9 +1425,9 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
|
|||
8 * lsc_data_size_bytes(data_size);
|
||||
|
||||
const bool byte_scattered =
|
||||
data_bit_size < 32 || (alignment != 0 && alignment < 4);
|
||||
const bool dword_scattered = !byte_scattered && mode == MEMORY_MODE_SCRATCH;
|
||||
const bool surface_access = !byte_scattered && !dword_scattered && !block;
|
||||
data_bit_size < 32 || (alignment != 0 && alignment < 4) ||
|
||||
mode == MEMORY_MODE_SCRATCH;
|
||||
const bool surface_access = !byte_scattered && !block;
|
||||
|
||||
/* SLM block reads must use the 16B-aligned OWord Block Read messages,
|
||||
* as the unaligned message doesn't exist for SLM.
|
||||
|
|
@ -1550,7 +1550,6 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
|
|||
desc = brw_dp_oword_block_rw_desc(devinfo, false, components, !has_dest);
|
||||
} else if (addr_size == LSC_ADDR_SIZE_A64) {
|
||||
assert(binding_type == LSC_ADDR_SURFTYPE_FLAT);
|
||||
assert(!dword_scattered);
|
||||
|
||||
sfid = BRW_SFID_HDC1;
|
||||
|
||||
|
|
@ -1595,9 +1594,6 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
|
|||
} else if (byte_scattered) {
|
||||
desc = brw_dp_byte_scattered_rw_desc(devinfo, mem->exec_size,
|
||||
data_bit_size, !has_dest);
|
||||
} else if (dword_scattered) {
|
||||
desc = brw_dp_dword_scattered_rw_desc(devinfo, mem->exec_size,
|
||||
!has_dest);
|
||||
} else {
|
||||
desc = brw_dp_untyped_surface_rw_desc(devinfo, mem->exec_size,
|
||||
components, !has_dest);
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "intel_nir.h"
|
||||
#include "brw_nir.h"
|
||||
#include "brw_private.h"
|
||||
#include "brw_sampler.h"
|
||||
|
|
@ -11,6 +10,82 @@
|
|||
#include "compiler/nir/nir_builder.h"
|
||||
#include "dev/intel_debug.h"
|
||||
#include "util/sparse_bitset.h"
|
||||
#include "intel_nir.h"
|
||||
#include "nir.h"
|
||||
#include "nir_builder_opcodes.h"
|
||||
#include "nir_intrinsics.h"
|
||||
#include "nir_intrinsics_indices.h"
|
||||
|
||||
/*
|
||||
* Intel scratch swizzling can be described with the formula:
|
||||
*
|
||||
* (SIMD width * round_down(offset_B, stride_B)) +
|
||||
* (lane * stride_B) +
|
||||
* (offset_B % stride_B)
|
||||
*/
|
||||
static nir_def *
|
||||
swizzle_scratch(nir_builder *b,
|
||||
nir_def *offset_B,
|
||||
unsigned stride_B,
|
||||
unsigned align_B)
|
||||
{
|
||||
struct shader_info *info = &b->shader->info;
|
||||
|
||||
assert(util_is_power_of_two_nonzero(stride_B));
|
||||
assert(util_is_power_of_two_nonzero(align_B));
|
||||
|
||||
nir_def *trailing_B = NULL;
|
||||
if (align_B < stride_B) {
|
||||
trailing_B = nir_umod_imm(b, offset_B, stride_B);
|
||||
offset_B = nir_iand_imm(b, offset_B, ~(stride_B - 1));
|
||||
}
|
||||
|
||||
nir_def *simd_width = info->min_subgroup_size == info->max_subgroup_size ?
|
||||
nir_imm_int(b, info->max_subgroup_size) :
|
||||
nir_load_simd_width_intel(b);
|
||||
|
||||
nir_def *simd_offs_B = nir_imul(b, simd_width, offset_B);
|
||||
|
||||
nir_def *lane = nir_load_subgroup_invocation(b);
|
||||
nir_def *lane_offs_B = nir_imul_imm(b, lane, stride_B);
|
||||
nir_def *swizzled_B = nir_iadd(b, simd_offs_B, lane_offs_B);
|
||||
|
||||
return trailing_B ? nir_iadd(b, swizzled_B, trailing_B) : swizzled_B;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_scratch(nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intr->instr);
|
||||
b->constant_fold_alu = true;
|
||||
|
||||
unsigned stride = 4 /* TODO */;
|
||||
|
||||
if (intr->intrinsic == nir_intrinsic_load_scratch) {
|
||||
nir_def *val =
|
||||
nir_load_scratch_intel(b, intr->def.num_components, intr->def.bit_size,
|
||||
swizzle_scratch(b, intr->src[0].ssa, stride,
|
||||
nir_intrinsic_align(intr)),
|
||||
.access = nir_intrinsic_access(intr));
|
||||
nir_def_replace(&intr->def, val);
|
||||
} else if (intr->intrinsic == nir_intrinsic_store_scratch) {
|
||||
nir_store_scratch_intel(b, intr->src[0].ssa,
|
||||
swizzle_scratch(b, intr->src[1].ssa, stride,
|
||||
nir_intrinsic_align(intr)));
|
||||
nir_instr_remove(&intr->instr);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
intel_nir_lower_scratch(nir_shader *nir)
|
||||
{
|
||||
return nir_shader_intrinsics_pass(nir, lower_scratch,
|
||||
nir_metadata_control_flow, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the minimum number of vec4 elements needed to pack a type.
|
||||
|
|
@ -2493,6 +2568,10 @@ brw_vectorize_lower_mem_access(brw_pass_tracker *pt)
|
|||
OPT(nir_opt_algebraic);
|
||||
OPT(nir_opt_cse);
|
||||
|
||||
if (pt->nir->scratch_size) {
|
||||
OPT(intel_nir_lower_scratch);
|
||||
}
|
||||
|
||||
/* Do this after the vectorization & brw_nir_rebase_const_offset_ubo_loads
|
||||
* so that we maximize the offset put into the messages.
|
||||
*/
|
||||
|
|
@ -3139,6 +3218,7 @@ lsc_op_for_nir_intrinsic(const nir_intrinsic_instr *intrin)
|
|||
case nir_intrinsic_load_ssbo_uniform_block_intel:
|
||||
case nir_intrinsic_load_ubo_uniform_block_intel:
|
||||
case nir_intrinsic_load_scratch:
|
||||
case nir_intrinsic_load_scratch_intel:
|
||||
case nir_intrinsic_load_shader_indirect_data_intel:
|
||||
return LSC_OP_LOAD;
|
||||
|
||||
|
|
@ -3149,7 +3229,7 @@ lsc_op_for_nir_intrinsic(const nir_intrinsic_instr *intrin)
|
|||
case nir_intrinsic_store_global_block_intel:
|
||||
case nir_intrinsic_store_shared_block_intel:
|
||||
case nir_intrinsic_store_ssbo_block_intel:
|
||||
case nir_intrinsic_store_scratch:
|
||||
case nir_intrinsic_store_scratch_intel:
|
||||
return LSC_OP_STORE;
|
||||
|
||||
case nir_intrinsic_image_load:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue