From c3325b22d852584d1b4cef944cfaf9e2801131a4 Mon Sep 17 00:00:00 2001 From: Simon Perretta Date: Thu, 8 May 2025 18:19:43 +0100 Subject: [PATCH] pco: image atomics support Signed-off-by: Simon Perretta Acked-by: Erik Faye-Lund Part-of: --- src/compiler/nir/nir_divergence_analysis.c | 1 + src/compiler/nir/nir_intrinsics.py | 5 ++ src/imagination/pco/pco_map.py | 31 +++++++++ src/imagination/pco/pco_nir_tex.c | 73 +++++++++++++++++++--- src/imagination/pco/pco_nir_vk.c | 8 ++- src/imagination/pco/pco_ops.py | 1 + src/imagination/pco/pco_trans_nir.c | 42 +++++++++++++ 7 files changed, 150 insertions(+), 11 deletions(-) diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 00c9047c411..81a285b6e6c 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -859,6 +859,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_global_atomic_swap_agx: case nir_intrinsic_global_atomic_2x32: case nir_intrinsic_global_atomic_swap_2x32: + case nir_intrinsic_global_atomic_pco: case nir_intrinsic_atomic_counter_add: case nir_intrinsic_atomic_counter_min: case nir_intrinsic_atomic_counter_max: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index df0b4cedd6e..c529a95b2a3 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -895,6 +895,10 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0, # # AGX global variants take a 64-bit base address plus a 32-bit offset in words. # The offset is sign-extended or zero-extended based on the SIGN_EXTEND index. +# +# PCO global variants use a vec3 for the memory address and data, where component X +# has the low 32 address bits, component Y has the high 32 address bits, and component Z +# has the data parameter. 
intrinsic("deref_atomic", src_comp=[-1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) intrinsic("ssbo_atomic", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP, OFFSET_SHIFT]) @@ -904,6 +908,7 @@ intrinsic("global_atomic", src_comp=[1, 1], dest_comp=1, indices=[ATOMIC_OP]) intrinsic("global_atomic_2x32", src_comp=[2, 1], dest_comp=1, indices=[ATOMIC_OP]) intrinsic("global_atomic_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic_agx", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMIC_OP, SIGN_EXTEND]) +intrinsic("global_atomic_pco", src_comp=[3], dest_comp=1, indices=[ATOMIC_OP], bit_sizes=[32]) intrinsic("deref_atomic_swap", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) intrinsic("ssbo_atomic_swap", src_comp=[-1, 1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP, OFFSET_SHIFT]) diff --git a/src/imagination/pco/pco_map.py b/src/imagination/pco/pco_map.py index 0e498222933..c71979e02bd 100644 --- a/src/imagination/pco/pco_map.py +++ b/src/imagination/pco/pco_map.py @@ -1595,6 +1595,18 @@ encode_map(O_REV, op_ref_maps=[('0', ['ft2'], ['s2'])] ) +encode_map(O_SHUFFLE, + encodings=[ + (I_PHASE0_SRC, [ + ('count_src', 'ft2'), + ('count_op', 'byp'), + ('bitmask_src_op', 'byp'), + ('shift1_op', 'shfl') + ]) + ], + op_ref_maps=[('0', ['ft2'], ['s2', 's1'])] +) + encode_map(O_LOGICAL, encodings=[ (I_PHASE1, [ @@ -3283,6 +3295,25 @@ group_map(O_REV, dests=[('w[1]', ('0', DEST(0)), 'ft2')] ) +group_map(O_SHUFFLE, + hdr=(I_IGRP_HDR_BITWISE, [ + ('opcnt', 'p0'), + ('olchk', OM_OLCHK), + ('w1p', True), + ('w0p', False), + ('cc', OM_EXEC_CND), + ('end', OM_END), + ('atom', OM_ATOM), + ('rpt', OM_RPT) + ]), + enc_ops=[('0', O_SHUFFLE)], + srcs=[ + ('s[1]', ('0', SRC(1)), 's1'), + ('s[2]', ('0', SRC(0)), 's2') + ], + dests=[('w[1]', ('0', DEST(0)), 'ft2')] +) + group_map(O_LOGICAL, hdr=(I_IGRP_HDR_BITWISE, [ ('opcnt', ['p0', 'p1']), diff --git a/src/imagination/pco/pco_nir_tex.c b/src/imagination/pco/pco_nir_tex.c 
index c9aa5ee2835..c419b234ecc 100644 --- a/src/imagination/pco/pco_nir_tex.c +++ b/src/imagination/pco/pco_nir_tex.c @@ -797,7 +797,7 @@ static nir_def *lower_image(nir_builder *b, nir_instr *instr, void *cb_data) bool is_cube_array = image_dim == GLSL_SAMPLER_DIM_CUBE && is_array; - nir_def *lod; + nir_def *lod = NULL; switch (intr->intrinsic) { case nir_intrinsic_image_deref_load: lod = intr->src[3].ssa; @@ -811,6 +811,10 @@ static nir_def *lower_image(nir_builder *b, nir_instr *instr, void *cb_data) lod = intr->src[1].ssa; break; + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: + break; + default: UNREACHABLE(""); } @@ -862,9 +866,11 @@ static nir_def *lower_image(nir_builder *b, nir_instr *instr, void *cb_data) return nir_vec(b, size_comps, intr->def.num_components); } - nir_alu_type type = intr->intrinsic == nir_intrinsic_image_deref_load - ? nir_intrinsic_dest_type(intr) - : nir_intrinsic_src_type(intr); + nir_alu_type type = nir_type_invalid; + if (intr->intrinsic == nir_intrinsic_image_deref_load) + type = nir_intrinsic_dest_type(intr); + else if (intr->intrinsic == nir_intrinsic_image_deref_store) + type = nir_intrinsic_src_type(intr); bool msaa = image_dim == GLSL_SAMPLER_DIM_MS || image_dim == GLSL_SAMPLER_DIM_SUBPASS_MS; @@ -1050,24 +1056,70 @@ static nir_def *lower_image(nir_builder *b, nir_instr *instr, void *cb_data) } } - unsigned smp_desc = ia ? 
PCO_IA_SAMPLER : PCO_POINT_SAMPLER; - nir_def *tex_state = nir_load_tex_state_pco(b, ROGUE_NUM_TEXSTATE_DWORDS, elem, .desc_set = desc_set, .binding = binding); + unsigned num_coord_comps = nir_image_intrinsic_coord_components(intr); + if (coords) + coords = nir_trim_vector(b, coords, num_coord_comps); + + if (intr->intrinsic == nir_intrinsic_image_deref_atomic || + intr->intrinsic == nir_intrinsic_image_deref_atomic_swap) { + assert(image_dim == GLSL_SAMPLER_DIM_2D); + assert(!is_array); + + assert(util_format_is_plain(format)); + assert(util_format_is_pure_integer(format)); + + assert(util_format_get_nr_components(format) == 1); + assert(util_format_get_blockwidth(format) == 1); + assert(util_format_get_blockheight(format) == 1); + assert(util_format_get_blockdepth(format) == 1); + assert(util_format_get_blocksize(format) == sizeof(uint32_t)); + + nir_def *tex_state_word[] = { + [0] = nir_channel(b, tex_state, 0), + [1] = nir_channel(b, tex_state, 1), + [2] = nir_channel(b, tex_state, 2), + [3] = nir_channel(b, tex_state, 3), + }; + + nir_def *base_addr_lo; + nir_def *base_addr_hi; + unpack_base_addr(b, tex_state_word, &base_addr_lo, &base_addr_hi); + + /* Calculate the twiddled (bit-interleaved) offset. */ + nir_def *x = nir_i2i16(b, nir_channel(b, coords, 0)); + nir_def *y = nir_i2i16(b, nir_channel(b, coords, 1)); + nir_def *twiddled_offset = nir_interleave(b, y, x); + twiddled_offset = + nir_imul_imm(b, twiddled_offset, util_format_get_blocksize(format)); + + /* Offset the address by the co-ordinates. */ + nir_def *addr = + nir_uadd64_32(b, base_addr_lo, base_addr_hi, twiddled_offset); + + nir_def *addr_lo = nir_channel(b, addr, 0); + nir_def *addr_hi = nir_channel(b, addr, 1); + nir_def *data = intr->src[3].ssa; + + nir_def *addr_data = nir_vec3(b, addr_lo, addr_hi, data); + + return nir_global_atomic_pco(b, + addr_data, + .atomic_op = nir_intrinsic_atomic_op(intr)); + } + + unsigned smp_desc = ia ? 
PCO_IA_SAMPLER : PCO_POINT_SAMPLER; nir_def *smp_state = nir_load_smp_state_pco(b, ROGUE_NUM_TEXSTATE_DWORDS, nir_imm_int(b, 0), .desc_set = smp_desc, .binding = smp_desc); - unsigned num_coord_comps = nir_image_intrinsic_coord_components(intr); - if (coords) - coords = nir_trim_vector(b, coords, num_coord_comps); - /* Special case, override buffers to be 2D. */ if (image_dim == GLSL_SAMPLER_DIM_BUF) { image_dim = GLSL_SAMPLER_DIM_2D; @@ -1200,6 +1252,7 @@ static bool is_image(const nir_instr *instr, UNUSED const void *cb_data) switch (intr->intrinsic) { case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic: case nir_intrinsic_image_deref_size: return true; diff --git a/src/imagination/pco/pco_nir_vk.c b/src/imagination/pco/pco_nir_vk.c index 8245beeedc3..12137e29373 100644 --- a/src/imagination/pco/pco_nir_vk.c +++ b/src/imagination/pco/pco_nir_vk.c @@ -205,7 +205,9 @@ lower_image_derefs(nir_builder *b, nir_intrinsic_instr *intr, pco_data *data) /* Sampler not needed for on-chip input attachments. */ data->common.uses.ia_sampler = true; - } else { + } else if (intr->intrinsic == nir_intrinsic_image_deref_load || + intr->intrinsic == nir_intrinsic_image_deref_store) { + /* Sampler not needed for other types of image accesses. 
*/ data->common.uses.point_sampler = true; } @@ -240,6 +242,8 @@ static nir_def *lower_vk(nir_builder *b, nir_instr *instr, void *cb_data) case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: case nir_intrinsic_image_deref_size: return lower_image_derefs(b, intr, data); @@ -279,6 +283,8 @@ static bool is_vk(const nir_instr *instr, UNUSED const void *cb_data) case nir_intrinsic_load_vulkan_descriptor: case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: case nir_intrinsic_image_deref_size: return true; diff --git a/src/imagination/pco/pco_ops.py b/src/imagination/pco/pco_ops.py index eb772a09429..8d0e589a87c 100644 --- a/src/imagination/pco/pco_ops.py +++ b/src/imagination/pco/pco_ops.py @@ -411,6 +411,7 @@ O_MOVI32 = hw_op('movi32', OM_ALU, 1, 1) O_CBS = hw_op('cbs', OM_ALU, 1, 1) O_FTB = hw_op('ftb', OM_ALU, 1, 1) O_REV = hw_op('rev', OM_ALU, 1, 1) +O_SHUFFLE = hw_op('shuffle', OM_ALU, 1, 2) O_LOGICAL = hw_op('logical', OM_ALU + [OM_LOGIOP], 1, 4) O_SHIFT = hw_op('shift', OM_ALU + [OM_SHIFTOP], 1, 3) diff --git a/src/imagination/pco/pco_trans_nir.c b/src/imagination/pco/pco_trans_nir.c index b0b5bbedf15..587bc3882b4 100644 --- a/src/imagination/pco/pco_trans_nir.c +++ b/src/imagination/pco/pco_trans_nir.c @@ -1165,6 +1165,35 @@ static pco_instr *trans_atomic_buffer(trans_ctx *tctx, UNREACHABLE(""); } +static pco_instr *trans_global_atomic_buffer(trans_ctx *tctx, + nir_intrinsic_instr *intr, + pco_ref dest, + pco_ref addr_data) +{ + enum pco_atom_op atom_op = to_atom_op(nir_intrinsic_atomic_op(intr)); + /* Should have been lowered. 
*/ + assert(atom_op != PCO_ATOM_OP_CMPXCHG); + + ASSERTED unsigned chans = pco_ref_get_chans(dest); + unsigned bits = pco_ref_get_bits(dest); + + assert(chans == 1); + + switch (bits) { + case 32: + return pco_atomic(&tctx->b, + dest, + pco_ref_drc(PCO_DRC_0), + addr_data, + .atom_op = atom_op); + + default: + break; + } + + UNREACHABLE(""); +} + static inline enum pco_reg_class sys_val_to_reg_class(gl_system_value sys_val, mesa_shader_stage stage) { @@ -1593,6 +1622,10 @@ static pco_instr *trans_intr(trans_ctx *tctx, nir_intrinsic_instr *intr) instr = trans_atomic_buffer(tctx, intr, dest, src[1], src[2]); break; + case nir_intrinsic_global_atomic_pco: + instr = trans_global_atomic_buffer(tctx, intr, dest, src[0]); + break; + /* Vertex sysvals. */ case nir_intrinsic_load_vertex_id: case nir_intrinsic_load_instance_id: @@ -2571,6 +2604,10 @@ static pco_instr *trans_alu(trans_ctx *tctx, nir_alu_instr *alu) instr = pco_rev(&tctx->b, dest, src[0]); break; + case nir_op_interleave: + instr = pco_shuffle(&tctx->b, dest, src[0], src[1]); + break; + case nir_op_f2i32: instr = pco_pck(&tctx->b, dest, @@ -2599,6 +2636,11 @@ static pco_instr *trans_alu(trans_ctx *tctx, nir_alu_instr *alu) .pck_fmt = PCO_PCK_FMT_F16F16); break; + /* Just consume/treat as 32-bit for now. */ + case nir_op_i2i16: + instr = pco_mov(&tctx->b, pco_ref_bits(dest, 32), src[0]); + break; + case nir_op_f2i32_rtne: instr = pco_pck(&tctx->b, dest, src[0], .pck_fmt = PCO_PCK_FMT_S32); break;