From 71ca8529c5ffa54787f30e6e8b1f9f2971ff649e Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Wed, 14 Aug 2024 19:21:30 -0700 Subject: [PATCH] intel/brw/gfx12.5+: Fix IR of sub-dword atomic LSC operations. Previously, we were emitting logical atomic instructions with a packed destination region for sub-dword LSC atomics, along the lines of: > untyped_atomic_logical(32) dst<1>:HF, ... However, these instructions use an LSC data size D16U32, which means that the 16b data on the return payload is expanded to 32b by the LSC shared function, so we were lying to the compiler about the location of the individual channels on the return payload, its execution masking, etc. This is why the hacks that manually set the 'inst->size_written' of the instruction were required. In some cases this worked, but any non-trivial manipulation of the instruction destination by lowering or optimization passes could have led to corruption, as has been reproduced in deqp-vk during lower_simd_width() for shaders that use 16-bit atomics in SIMD32 dispatch mode. Note that LSC sub-dword reads aren't affected by this because they use raw UD destinations and specify the actual bit size of the operation datatype as the immediate SURFACE_LOGICAL_SRC_IMM_ARG, which doesn't work for atomic operations since that immediate specifies the atomic opcode. Instead, have the logical operation implement the behavior of 16-bit destinations correctly, rather than silently replacing the 16-bit region with an inconsistent 32-bit region. This is done by emitting the MOV instructions used to pack the data from the UD temporary into the packed destination from the lower_logical_sends() pass instead of from the NIR translation pass. 
Fixes: 43169dbbe5f96 ("intel/compiler: Support 16 bit float ops") Reviewed-by: Sagar Ghuge Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 53 ++----------------- .../compiler/brw_lower_logical_sends.cpp | 39 ++++++++++++-- 2 files changed, 38 insertions(+), 54 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 606f457a785..6fee10058cd 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -8165,32 +8165,9 @@ fs_nir_emit_surface_atomic(nir_to_brw_state &ntb, const fs_builder &bld, } srcs[SURFACE_LOGICAL_SRC_DATA] = data; - fs_inst *inst; - unsigned size_written = 0; /* Emit the actual atomic operation */ - switch (instr->def.bit_size) { - case 16: { - brw_reg dest32 = bld.vgrf(BRW_TYPE_UD); - inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, - retype(dest32, dest.type), - srcs, SURFACE_LOGICAL_NUM_SRCS); - size_written = dest32.component_size(inst->exec_size); - bld.MOV(retype(dest, BRW_TYPE_UW), dest32); - break; - } - - case 32: - case 64: - inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, - dest, srcs, SURFACE_LOGICAL_NUM_SRCS); - size_written = dest.component_size(inst->exec_size); - break; - default: - unreachable("Unsupported bit size"); - } - - assert(size_written); - inst->size_written = size_written * instr->def.num_components; + bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, dest, srcs, + SURFACE_LOGICAL_NUM_SRCS); } static void @@ -8224,30 +8201,8 @@ fs_nir_emit_global_atomic(nir_to_brw_state &ntb, const fs_builder &bld, srcs[A64_LOGICAL_ARG] = brw_imm_ud(op); srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); - fs_inst *inst; - unsigned size_written = 0; - switch (instr->def.bit_size) { - case 16: { - brw_reg dest32 = bld.vgrf(BRW_TYPE_UD); - inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, - retype(dest32, dest.type), - srcs, A64_LOGICAL_NUM_SRCS); - size_written = dest32.component_size(inst->exec_size); - 
bld.MOV(retype(dest, BRW_TYPE_UW), dest32); - break; - } - case 32: - case 64: - inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest, - srcs, A64_LOGICAL_NUM_SRCS); - size_written = dest.component_size(inst->exec_size); - break; - default: - unreachable("Unsupported bit size"); - } - - assert(size_written); - inst->size_written = size_written * instr->def.num_components; + bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest, + srcs, A64_LOGICAL_NUM_SRCS); } static void diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index 5819c4cf282..b6e272c03ec 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -1653,7 +1653,8 @@ lsc_bits_to_data_size(unsigned bit_size) } static void -lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst) +lower_lsc_surface_logical_send(bblock_t *block, const fs_builder &bld, + fs_inst *inst) { const brw_compiler *compiler = bld.shader->compiler; const intel_device_info *devinfo = bld.shader->devinfo; @@ -1810,6 +1811,20 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst) inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS && compiler->extended_bindless_surface_offset; + /* Messages with destination datatypes narrower than a dword use a + * D*32 LSC data size, update the destination to use a temporary of + * the raw (UD) return payload datatype. 
+ */ + if (dst_sz < 4) { + assert(lsc_data_size_bytes(lsc_bits_to_data_size(dst_sz * 8)) == 4); + assert(inst->size_written == inst->dst.component_size(inst->exec_size)); + const brw_reg dest32 = bld.vgrf(BRW_TYPE_UD); + const brw_reg_type t = brw_int_type(dst_sz, false); + bld.at(block, inst->next).MOV(retype(inst->dst, t), dest32); + inst->dst = dest32; + inst->size_written = inst->dst.component_size(inst->exec_size); + } + inst->resize_sources(4); if (non_bindless) { @@ -2032,7 +2047,7 @@ emit_fragment_mask(const fs_builder &bld, fs_inst *inst) } static void -lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst) +lower_lsc_a64_logical_send(bblock_t *block, const fs_builder &bld, fs_inst *inst) { const intel_device_info *devinfo = bld.shader->devinfo; @@ -2144,6 +2159,20 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst) inst->send_has_side_effects = has_side_effects; inst->send_is_volatile = !has_side_effects; + /* Messages with destination datatypes narrower than a dword use a + * D*32 LSC data size, update the destination to use a temporary of + * the raw (UD) return payload datatype. 
+ */ + if (dst_sz < 4) { + assert(lsc_data_size_bytes(lsc_bits_to_data_size(dst_sz * 8)) == 4); + assert(inst->size_written == inst->dst.component_size(inst->exec_size)); + const brw_reg dest32 = bld.vgrf(BRW_TYPE_UD); + const brw_reg_type t = brw_int_type(dst_sz, false); + bld.at(block, inst->next).MOV(retype(inst->dst, t), dest32); + inst->dst = dest32; + inst->size_written = inst->dst.component_size(inst->exec_size); + } + /* Set up SFID and descriptors */ inst->sfid = GFX12_SFID_UGM; inst->resize_sources(4); @@ -2805,7 +2834,7 @@ brw_fs_lower_logical_sends(fs_visitor &s) case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: if (devinfo->has_lsc) - lower_lsc_surface_logical_send(ibld, inst); + lower_lsc_surface_logical_send(block, ibld, inst); else lower_surface_logical_send(ibld, inst); break; @@ -2814,7 +2843,7 @@ brw_fs_lower_logical_sends(fs_visitor &s) case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: devinfo->ver >= 20 && devinfo->has_lsc ? - lower_lsc_surface_logical_send(ibld, inst) : + lower_lsc_surface_logical_send(block, ibld, inst) : lower_surface_logical_send(ibld, inst); break; @@ -2836,7 +2865,7 @@ brw_fs_lower_logical_sends(fs_visitor &s) case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: if (devinfo->has_lsc) { - lower_lsc_a64_logical_send(ibld, inst); + lower_lsc_a64_logical_send(block, ibld, inst); break; } lower_a64_logical_send(ibld, inst);