From 71ca8529c5ffa54787f30e6e8b1f9f2971ff649e Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 14 Aug 2024 19:21:30 -0700
Subject: [PATCH] intel/brw/gfx12.5+: Fix IR of sub-dword atomic LSC
 operations.

Until now we were emitting logical atomic instructions with a packed
destination region for sub-dword LSC atomics, along the lines of:

> untyped_atomic_logical(32) dst<1>:HF, ...

However, these instructions use an LSC data size D16U32, which means
that the 16b data on the return payload is expanded to 32b by the LSC
shared function, so we were lying to the compiler about the location
of the individual channels on the return payload, its execution
masking, etc.  This is why the hacks that manually set the
'inst->size_written' of the instruction were required.

In some cases this worked, but any non-trivial manipulation of the
instruction destination by lowering or optimization passes could have
led to corruption, as has been reproduced in deqp-vk during
lower_simd_width() for shaders that use 16-bit atomics in SIMD32
dispatch mode.

Note that LSC sub-dword reads aren't affected by this because they use
raw UD destinations and specify the actual bit size of the operation
datatype as the immediate SURFACE_LOGICAL_SRC_IMM_ARG, which doesn't
work for atomic operations since that immediate specifies the atomic
opcode.

Instead, have the logical operation implement the behavior of 16-bit
destinations correctly rather than silently replacing the 16-bit
region with an inconsistent 32-bit region.  This is done by emitting
the MOV instructions used to pack the data from the UD temporary into
the packed destination from the lower_logical_sends() pass instead of
from the NIR translation pass.

Fixes: 43169dbbe5f96 ("intel/compiler: Support 16 bit float ops")
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30683>
---
 src/intel/compiler/brw_fs_nir.cpp             | 53 ++-----------------
 .../compiler/brw_lower_logical_sends.cpp      | 39 ++++++++++++--
 2 files changed, 38 insertions(+), 54 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 606f457a785..6fee10058cd 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -8165,32 +8165,9 @@ fs_nir_emit_surface_atomic(nir_to_brw_state &ntb, const fs_builder &bld,
    }
    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
 
-   fs_inst *inst;
-   unsigned size_written = 0;
    /* Emit the actual atomic operation */
-   switch (instr->def.bit_size) {
-      case 16: {
-         brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
-         inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
-                         retype(dest32, dest.type),
-                         srcs, SURFACE_LOGICAL_NUM_SRCS);
-         size_written = dest32.component_size(inst->exec_size);
-         bld.MOV(retype(dest, BRW_TYPE_UW), dest32);
-         break;
-      }
-
-      case 32:
-      case 64:
-         inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
-                         dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
-         size_written = dest.component_size(inst->exec_size);
-         break;
-      default:
-         unreachable("Unsupported bit size");
-   }
-
-   assert(size_written);
-   inst->size_written = size_written * instr->def.num_components;
+   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, dest, srcs,
+            SURFACE_LOGICAL_NUM_SRCS);
 }
 
 static void
@@ -8224,30 +8201,8 @@ fs_nir_emit_global_atomic(nir_to_brw_state &ntb, const fs_builder &bld,
    srcs[A64_LOGICAL_ARG] = brw_imm_ud(op);
    srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
 
-   fs_inst *inst;
-   unsigned size_written = 0;
-   switch (instr->def.bit_size) {
-   case 16: {
-      brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
-      inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
-                      retype(dest32, dest.type),
-                      srcs, A64_LOGICAL_NUM_SRCS);
-      size_written = dest32.component_size(inst->exec_size);
-      bld.MOV(retype(dest, BRW_TYPE_UW), dest32);
-      break;
-   }
-   case 32:
-   case 64:
-      inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
-                      srcs, A64_LOGICAL_NUM_SRCS);
-      size_written = dest.component_size(inst->exec_size);
-      break;
-   default:
-      unreachable("Unsupported bit size");
-   }
-
-   assert(size_written);
-   inst->size_written = size_written * instr->def.num_components;
+   bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
+            srcs, A64_LOGICAL_NUM_SRCS);
 }
 
 static void
diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp
index 5819c4cf282..b6e272c03ec 100644
--- a/src/intel/compiler/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw_lower_logical_sends.cpp
@@ -1653,7 +1653,8 @@ lsc_bits_to_data_size(unsigned bit_size)
 }
 
 static void
-lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
+lower_lsc_surface_logical_send(bblock_t *block, const fs_builder &bld,
+                               fs_inst *inst)
 {
    const brw_compiler *compiler = bld.shader->compiler;
    const intel_device_info *devinfo = bld.shader->devinfo;
@@ -1810,6 +1811,20 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
    inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
                        compiler->extended_bindless_surface_offset;
 
+   /* Messages with destination datatypes narrower than a dword use a
+    * D*32 LSC data size, update the destination to use a temporary of
+    * the raw (UD) return payload datatype.
+    */
+   if (dst_sz < 4) {
+      assert(lsc_data_size_bytes(lsc_bits_to_data_size(dst_sz * 8)) == 4);
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      const brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
+      const brw_reg_type t = brw_int_type(dst_sz, false);
+      bld.at(block, inst->next).MOV(retype(inst->dst, t), dest32);
+      inst->dst = dest32;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+   }
+
    inst->resize_sources(4);
 
    if (non_bindless) {
@@ -2032,7 +2047,7 @@ emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
 }
 
 static void
-lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
+lower_lsc_a64_logical_send(bblock_t *block, const fs_builder &bld, fs_inst *inst)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
 
@@ -2144,6 +2159,20 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
    inst->send_has_side_effects = has_side_effects;
    inst->send_is_volatile = !has_side_effects;
 
+   /* Messages with destination datatypes narrower than a dword use a
+    * D*32 LSC data size, update the destination to use a temporary of
+    * the raw (UD) return payload datatype.
+    */
+   if (dst_sz < 4) {
+      assert(lsc_data_size_bytes(lsc_bits_to_data_size(dst_sz * 8)) == 4);
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      const brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
+      const brw_reg_type t = brw_int_type(dst_sz, false);
+      bld.at(block, inst->next).MOV(retype(inst->dst, t), dest32);
+      inst->dst = dest32;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+   }
+
    /* Set up SFID and descriptors */
    inst->sfid = GFX12_SFID_UGM;
    inst->resize_sources(4);
@@ -2805,7 +2834,7 @@ brw_fs_lower_logical_sends(fs_visitor &s)
       case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
       case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
          if (devinfo->has_lsc)
-            lower_lsc_surface_logical_send(ibld, inst);
+            lower_lsc_surface_logical_send(block, ibld, inst);
          else
             lower_surface_logical_send(ibld, inst);
          break;
@@ -2814,7 +2843,7 @@ brw_fs_lower_logical_sends(fs_visitor &s)
       case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
       case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
          devinfo->ver >= 20 && devinfo->has_lsc ?
-            lower_lsc_surface_logical_send(ibld, inst) :
+            lower_lsc_surface_logical_send(block, ibld, inst) :
             lower_surface_logical_send(ibld, inst);
          break;
 
@@ -2836,7 +2865,7 @@ brw_fs_lower_logical_sends(fs_visitor &s)
       case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
       case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
          if (devinfo->has_lsc) {
-            lower_lsc_a64_logical_send(ibld, inst);
+            lower_lsc_a64_logical_send(block, ibld, inst);
             break;
          }
          lower_a64_logical_send(ibld, inst);