intel/brw/gfx12.5+: Fix IR of sub-dword atomic LSC operations.

Until now we were emitting logical atomic instructions with a packed
destination region for sub-dword LSC atomics, along the lines of:

> untyped_atomic_logical(32) dst<1>:HF, ...

However, these instructions use an LSC data size D16U32, which means
that the 16b data on the return payload is expanded to 32b by the LSC
shared function, so we were lying to the compiler about the location
of the individual channels on the return payload, its execution
masking, etc.  This is why the hacks that manually set the
'inst->size_written' of the instruction were required.

In some cases this worked, but any non-trivial manipulation of the
instruction destination by lowering or optimization passes could have
led to corruption, as has been reproduced in deqp-vk during
lower_simd_width() for shaders that use 16-bit atomics in SIMD32
dispatch mode.

Note that LSC sub-dword reads aren't affected by this because they use
raw UD destinations and specify the actual bit size of the operation
datatype as the immediate SURFACE_LOGICAL_SRC_IMM_ARG, which doesn't
work for atomic operations since that immediate specifies the atomic
opcode.

Fix this by having the logical operation implement the behavior of
16-bit destinations correctly, rather than silently replacing the
16-bit region with an inconsistent 32-bit region.  This is done by
emitting the MOV instructions used to pack the data from the UD
temporary into the packed destination from the lower_logical_sends()
pass instead of from the NIR translation pass.

Fixes: 43169dbbe5 ("intel/compiler: Support 16 bit float ops")
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30683>
This commit is contained in:
Francisco Jerez 2024-08-14 19:21:30 -07:00 committed by Marge Bot
parent 7cbe8c390d
commit 71ca8529c5
2 changed files with 38 additions and 54 deletions

View file

@ -8165,32 +8165,9 @@ fs_nir_emit_surface_atomic(nir_to_brw_state &ntb, const fs_builder &bld,
}
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
fs_inst *inst;
unsigned size_written = 0;
/* Emit the actual atomic operation */
switch (instr->def.bit_size) {
case 16: {
brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
retype(dest32, dest.type),
srcs, SURFACE_LOGICAL_NUM_SRCS);
size_written = dest32.component_size(inst->exec_size);
bld.MOV(retype(dest, BRW_TYPE_UW), dest32);
break;
}
case 32:
case 64:
inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
size_written = dest.component_size(inst->exec_size);
break;
default:
unreachable("Unsupported bit size");
}
assert(size_written);
inst->size_written = size_written * instr->def.num_components;
bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, dest, srcs,
SURFACE_LOGICAL_NUM_SRCS);
}
static void
@ -8224,30 +8201,8 @@ fs_nir_emit_global_atomic(nir_to_brw_state &ntb, const fs_builder &bld,
srcs[A64_LOGICAL_ARG] = brw_imm_ud(op);
srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
fs_inst *inst;
unsigned size_written = 0;
switch (instr->def.bit_size) {
case 16: {
brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
retype(dest32, dest.type),
srcs, A64_LOGICAL_NUM_SRCS);
size_written = dest32.component_size(inst->exec_size);
bld.MOV(retype(dest, BRW_TYPE_UW), dest32);
break;
}
case 32:
case 64:
inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
srcs, A64_LOGICAL_NUM_SRCS);
size_written = dest.component_size(inst->exec_size);
break;
default:
unreachable("Unsupported bit size");
}
assert(size_written);
inst->size_written = size_written * instr->def.num_components;
bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
srcs, A64_LOGICAL_NUM_SRCS);
}
static void

View file

@ -1653,7 +1653,8 @@ lsc_bits_to_data_size(unsigned bit_size)
}
static void
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
lower_lsc_surface_logical_send(bblock_t *block, const fs_builder &bld,
fs_inst *inst)
{
const brw_compiler *compiler = bld.shader->compiler;
const intel_device_info *devinfo = bld.shader->devinfo;
@ -1810,6 +1811,20 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
compiler->extended_bindless_surface_offset;
/* Messages with destination datatypes narrower than a dword use a
* D*32 LSC data size, update the destination to use a temporary of
* the raw (UD) return payload datatype.
*/
if (dst_sz < 4) {
assert(lsc_data_size_bytes(lsc_bits_to_data_size(dst_sz * 8)) == 4);
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
const brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
const brw_reg_type t = brw_int_type(dst_sz, false);
bld.at(block, inst->next).MOV(retype(inst->dst, t), dest32);
inst->dst = dest32;
inst->size_written = inst->dst.component_size(inst->exec_size);
}
inst->resize_sources(4);
if (non_bindless) {
@ -2032,7 +2047,7 @@ emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
}
static void
lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
lower_lsc_a64_logical_send(bblock_t *block, const fs_builder &bld, fs_inst *inst)
{
const intel_device_info *devinfo = bld.shader->devinfo;
@ -2144,6 +2159,20 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
inst->send_has_side_effects = has_side_effects;
inst->send_is_volatile = !has_side_effects;
/* Messages with destination datatypes narrower than a dword use a
* D*32 LSC data size, update the destination to use a temporary of
* the raw (UD) return payload datatype.
*/
if (dst_sz < 4) {
assert(lsc_data_size_bytes(lsc_bits_to_data_size(dst_sz * 8)) == 4);
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
const brw_reg dest32 = bld.vgrf(BRW_TYPE_UD);
const brw_reg_type t = brw_int_type(dst_sz, false);
bld.at(block, inst->next).MOV(retype(inst->dst, t), dest32);
inst->dst = dest32;
inst->size_written = inst->dst.component_size(inst->exec_size);
}
/* Set up SFID and descriptors */
inst->sfid = GFX12_SFID_UGM;
inst->resize_sources(4);
@ -2805,7 +2834,7 @@ brw_fs_lower_logical_sends(fs_visitor &s)
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
if (devinfo->has_lsc)
lower_lsc_surface_logical_send(ibld, inst);
lower_lsc_surface_logical_send(block, ibld, inst);
else
lower_surface_logical_send(ibld, inst);
break;
@ -2814,7 +2843,7 @@ brw_fs_lower_logical_sends(fs_visitor &s)
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
devinfo->ver >= 20 && devinfo->has_lsc ?
lower_lsc_surface_logical_send(ibld, inst) :
lower_lsc_surface_logical_send(block, ibld, inst) :
lower_surface_logical_send(ibld, inst);
break;
@ -2836,7 +2865,7 @@ brw_fs_lower_logical_sends(fs_visitor &s)
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
if (devinfo->has_lsc) {
lower_lsc_a64_logical_send(ibld, inst);
lower_lsc_a64_logical_send(block, ibld, inst);
break;
}
lower_a64_logical_send(ibld, inst);