From fbe0f8d36d62bb11158ec50de5275bb26e0a6bbc Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 24 May 2024 16:06:12 -0700
Subject: [PATCH] intel/brw: Blockify convergent load_shared on Gfx11-12 as
 well

Gfx11-12 can support SLM block loads via OWord Block Read messages
(notably, the aligned version, not the unaligned version).

A while back we deleted the SHADER_OPCODE_OWORD_BLOCK_READ opcode.
Rather than bring it back, we continue using UNALIGNED_OWORD_BLOCK_READ
for SLM block access (like we do for SSBOs) but switch it over to the
aligned variant when lowering logical sends.  We do ensure the alignment
is at least 16B, however.  This is ugly, but it's probably not worth
bringing back a whole extra opcode for a legacy HDC block load quirk.

References: BSpec 47652 and 1689
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9960
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29429>
---
 src/intel/compiler/brw_fs_nir.cpp                    |  3 +++
 src/intel/compiler/brw_lower_logical_sends.cpp       |  8 +++++++-
 .../compiler/intel_nir_blockify_uniform_loads.c      | 12 ++++++++++--
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 44f150aa166..5505787af40 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -6544,6 +6544,9 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
             get_nir_buffer_intrinsic_index(ntb, bld, instr);
       } else {
          srcs[SURFACE_LOGICAL_SRC_SURFACE] = fs_reg(brw_imm_ud(GFX7_BTI_SLM));
+
+         /* SLM has to use aligned OWord Block Read messages on pre-LSC HW. */
+         assert(devinfo->has_lsc || nir_intrinsic_align(instr) >= 16);
       }
 
       const unsigned total_dwords = ALIGN(instr->num_components,
diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp
index e3caebd8e13..c28a40c18bb 100644
--- a/src/intel/compiler/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw_lower_logical_sends.cpp
@@ -1930,8 +1930,14 @@ lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
 
    const bool has_side_effects = inst->has_side_effects();
 
+   /* SLM block reads must use the 16B-aligned OWord Block Read messages,
+    * as the unaligned message doesn't exist for SLM.  We still use
+    * SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL in that case (to
+    * avoid adding more opcodes), but it is only emitted with 16B alignment.
+    */
    const bool align_16B =
-      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
+      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL ||
+      (surface.file == IMM && surface.ud == GFX7_BTI_SLM);
 
    const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
 
diff --git a/src/intel/compiler/intel_nir_blockify_uniform_loads.c b/src/intel/compiler/intel_nir_blockify_uniform_loads.c
index 2ad0a117a34..40dd87ebec6 100644
--- a/src/intel/compiler/intel_nir_blockify_uniform_loads.c
+++ b/src/intel/compiler/intel_nir_blockify_uniform_loads.c
@@ -69,8 +69,8 @@ intel_nir_blockify_uniform_loads_instr(nir_builder *b,
       return true;
 
    case nir_intrinsic_load_shared:
-      /* Block loads on shared memory are not supported before the LSC. */
-      if (!devinfo->has_lsc)
+      /* Block loads on shared memory are not supported before Icelake. */
+      if (devinfo->ver < 11)
          return false;
 
       if (nir_src_is_divergent(intrin->src[0]))
@@ -79,6 +79,14 @@ intel_nir_blockify_uniform_loads_instr(nir_builder *b,
       if (intrin->def.bit_size != 32)
          return false;
 
+      /* Without the LSC, we have to use OWord Block Read messages (the
+       * aligned variant, which requires OWord-aligned offsets).
+       */
+      if (!devinfo->has_lsc &&
+          (intrin->def.num_components < 4 ||
+           nir_intrinsic_align(intrin) < 16))
+         return false;
+
       intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
       return true;