intel/brw: Blockify convergent load_shared on Gfx11-12 as well

Gfx11-12 can support SLM block loads via OWord Block Load messages (notably, the aligned version, not the unaligned version). A while back we deleted the SHADER_OPCODE_OWORD_BLOCK_READ opcode. Rather than bring it back, we continue using UNALIGNED_OWORD_BLOCK_READ for SLM block access (like we do for SSBOs) but switch it over to the aligned variant when lowering logical sends. We do ensure the alignment is at least 16B, however. This is ugly, but it's probably not worth bringing back a whole extra opcode for a legacy HDC block load quirk.

References: BSpec 47652 and 1689
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9960
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29429>
2026-05-06 09:28:07 +02:00 · 2024-05-24 16:06:12 -07:00 · 2024-05-24 16:06:12 -07:00 · fbe0f8d36d
commit fbe0f8d36d
parent 3b1b2d9e6d
3 changed files with 20 additions and 3 deletions
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@ -6544,6 +6544,9 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
            get_nir_buffer_intrinsic_index(ntb, bld, instr);
      } else {
         srcs[SURFACE_LOGICAL_SRC_SURFACE] = fs_reg(brw_imm_ud(GFX7_BTI_SLM));
+
+         /* SLM has to use aligned OWord Block Read messages on pre-LSC HW. */
+         assert(devinfo->has_lsc || nir_intrinsic_align(instr) >= 16);
      }

      const unsigned total_dwords = ALIGN(instr->num_components,
--- a/src/intel/compiler/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw_lower_logical_sends.cpp
@ -1930,8 +1930,14 @@ lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)

   const bool has_side_effects = inst->has_side_effects();

+   /* SLM block reads must use the 16B-aligned OWord Block Read messages,
+    * as the unaligned message doesn't exist for SLM.  However, we still
+    * use SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL in that case
+    * (to avoid adding more opcodes), but only emit it with 16B alignment.
+    */
   const bool align_16B =
-      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
+      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL ||
+      (surface.file == IMM && surface.ud == GFX7_BTI_SLM);

   const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;

--- a/src/intel/compiler/intel_nir_blockify_uniform_loads.c
+++ b/src/intel/compiler/intel_nir_blockify_uniform_loads.c
@ -69,8 +69,8 @@ intel_nir_blockify_uniform_loads_instr(nir_builder *b,
      return true;

   case nir_intrinsic_load_shared:
-      /* Block loads on shared memory are not supported before the LSC. */
-      if (!devinfo->has_lsc)
+      /* Block loads on shared memory are not supported before Icelake. */
+      if (devinfo->ver < 11)
         return false;

      if (nir_src_is_divergent(intrin->src[0]))
@ -79,6 +79,14 @@ intel_nir_blockify_uniform_loads_instr(nir_builder *b,
      if (intrin->def.bit_size != 32)
         return false;

+      /* Without the LSC, we must use the aligned OWord Block Load message,
+       * which needs a 16B-aligned offset and at least one OWord (4 dwords).
+       */
+      if (!devinfo->has_lsc &&
+          (intrin->def.num_components < 4 ||
+           nir_intrinsic_align(intrin) < 16))
+         return false;
+
      intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
      return true;