intel/brw/gfx12.5+: Fix IR of sub-dword atomic LSC operations.

Until now we were emitting logical atomic instructions with a packed destination region for sub-dword LSC atomics, along the lines of: > untyped_atomic_logical(32) dst<1>:HF, ... However, these instructions use an LSC data size of D16U32, which means that the 16b data on the return payload is expanded to 32b by the LSC shared function, so we were lying to the compiler about the location of the individual channels on the return payload, its execution masking, etc. This is why the hacks that manually set the 'inst->size_written' of the instruction were required. In some cases this worked, but any non-trivial manipulation of the instruction destination by lowering or optimization passes could have led to corruption, as has been reproduced in deqp-vk during lower_simd_width() for shaders that use 16-bit atomics in SIMD32 dispatch mode. Note that LSC sub-dword reads aren't affected by this because they use raw UD destinations and specify the actual bit size of the operation datatype as the immediate SURFACE_LOGICAL_SRC_IMM_ARG, which doesn't work for atomic operations since that immediate specifies the atomic opcode. To fix this, have the logical operation implement the behavior of 16-bit destinations correctly rather than silently replacing the 16-bit region with an inconsistent 32-bit region -- this is done by emitting the MOV instructions used to pack the data from the UD temporary into the packed destination from the lower_logical_sends() pass instead of from the NIR translation pass. Fixes: 43169dbbe5 ("intel/compiler: Support 16 bit float ops") Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30683>
2026-05-07 07:08:04 +02:00 · 2024-08-14 19:21:30 -07:00 · 2024-08-14 19:21:30 -07:00 · 71ca8529c5
commit 71ca8529c5
parent 7cbe8c390d
2 changed files with 38 additions and 54 deletions
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@ -8165,32 +8165,9 @@ fs_nir_emit_surface_atomic(nir_to_brw_state &ntb, const fs_builder &bld,
   }
   srcs[SURFACE_LOGICAL_SRC_DATA] = data;

-   fs_inst *inst;
-   unsigned size_written = 0;
   /* Emit the actual atomic operation */
-   switch (instr->def.bit_size) {
-      case 16: {
-         brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
-         inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
-                         retype(dest32, dest.type),
-                         srcs, SURFACE_LOGICAL_NUM_SRCS);
-         size_written = dest32.component_size(inst->exec_size);
-         bld.MOV(retype(dest, BRW_TYPE_UW), dest32);
-         break;
-      }
-
-      case 32:
-      case 64:
-         inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
-                         dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
-         size_written = dest.component_size(inst->exec_size);
-         break;
-      default:
-         unreachable("Unsupported bit size");
-   }
-
-   assert(size_written);
-   inst->size_written = size_written * instr->def.num_components;
+   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, dest, srcs,
+            SURFACE_LOGICAL_NUM_SRCS);
 }

 static void
@ -8224,30 +8201,8 @@ fs_nir_emit_global_atomic(nir_to_brw_state &ntb, const fs_builder &bld,
   srcs[A64_LOGICAL_ARG] = brw_imm_ud(op);
   srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);

-   fs_inst *inst;
-   unsigned size_written = 0;
-   switch (instr->def.bit_size) {
-   case 16: {
-      brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
-      inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
-                      retype(dest32, dest.type),
-                      srcs, A64_LOGICAL_NUM_SRCS);
-      size_written = dest32.component_size(inst->exec_size);
-      bld.MOV(retype(dest, BRW_TYPE_UW), dest32);
-      break;
-   }
-   case 32:
-   case 64:
-      inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
-                      srcs, A64_LOGICAL_NUM_SRCS);
-      size_written = dest.component_size(inst->exec_size);
-      break;
-   default:
-      unreachable("Unsupported bit size");
-   }
-
-   assert(size_written);
-   inst->size_written = size_written * instr->def.num_components;
+   bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
+            srcs, A64_LOGICAL_NUM_SRCS);
 }

 static void
--- a/src/intel/compiler/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw_lower_logical_sends.cpp
@ -1653,7 +1653,8 @@ lsc_bits_to_data_size(unsigned bit_size)
 }

 static void
-lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
+lower_lsc_surface_logical_send(bblock_t *block, const fs_builder &bld,
+                               fs_inst *inst)
 {
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
@ -1810,6 +1811,20 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
   inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
                       compiler->extended_bindless_surface_offset;

+   /* Messages with destination datatypes narrower than a dword use a
+    * D*32 LSC data size, update the destination to use a temporary of
+    * the raw (UD) return payload datatype.
+    */
+   if (dst_sz < 4) {
+      assert(lsc_data_size_bytes(lsc_bits_to_data_size(dst_sz * 8)) == 4);
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      const brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
+      const brw_reg_type t = brw_int_type(dst_sz, false);
+      bld.at(block, inst->next).MOV(retype(inst->dst, t), dest32);
+      inst->dst = dest32;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+   }
+
   inst->resize_sources(4);

   if (non_bindless) {
@ -2032,7 +2047,7 @@ emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
 }

 static void
-lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
+lower_lsc_a64_logical_send(bblock_t *block, const fs_builder &bld, fs_inst *inst)
 {
   const intel_device_info *devinfo = bld.shader->devinfo;

@ -2144,6 +2159,20 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

+   /* Messages with destination datatypes narrower than a dword use a
+    * D*32 LSC data size, update the destination to use a temporary of
+    * the raw (UD) return payload datatype.
+    */
+   if (dst_sz < 4) {
+      assert(lsc_data_size_bytes(lsc_bits_to_data_size(dst_sz * 8)) == 4);
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      const brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
+      const brw_reg_type t = brw_int_type(dst_sz, false);
+      bld.at(block, inst->next).MOV(retype(inst->dst, t), dest32);
+      inst->dst = dest32;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+   }
+
   /* Set up SFID and descriptors */
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(4);
@ -2805,7 +2834,7 @@ brw_fs_lower_logical_sends(fs_visitor &s)
      case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
         if (devinfo->has_lsc)
-            lower_lsc_surface_logical_send(ibld, inst);
+            lower_lsc_surface_logical_send(block, ibld, inst);
         else
            lower_surface_logical_send(ibld, inst);
         break;
@ -2814,7 +2843,7 @@ brw_fs_lower_logical_sends(fs_visitor &s)
      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         devinfo->ver >= 20 && devinfo->has_lsc ?
-            lower_lsc_surface_logical_send(ibld, inst) :
+            lower_lsc_surface_logical_send(block, ibld, inst) :
            lower_surface_logical_send(ibld, inst);
         break;

@ -2836,7 +2865,7 @@ brw_fs_lower_logical_sends(fs_visitor &s)
      case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
-            lower_lsc_a64_logical_send(ibld, inst);
+            lower_lsc_a64_logical_send(block, ibld, inst);
            break;
         }
         lower_a64_logical_send(ibld, inst);