pco: fully support Vulkan 1.2 image atomics

Signed-off-by: Simon Perretta <simon.perretta@imgtec.com>
Acked-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37512>
This commit is contained in:
Simon Perretta 2025-09-03 15:01:31 +01:00 committed by Marge Bot
parent 08e3740e07
commit 6dc5e1e109
7 changed files with 222 additions and 19 deletions

View file

@ -918,6 +918,7 @@ intrinsic("global_atomic_swap", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMI
intrinsic("global_atomic_swap_2x32", src_comp=[2, 1, 1], dest_comp=1, indices=[ATOMIC_OP])
intrinsic("global_atomic_swap_amd", src_comp=[1, 1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("global_atomic_swap_agx", src_comp=[1, 1, 1, 1], dest_comp=1, indices=[ATOMIC_OP, SIGN_EXTEND])
intrinsic("global_atomic_swap_pco", src_comp=[4], dest_comp=1, indices=[ATOMIC_OP], bit_sizes=[32])
def system_value(name, dest_comp, indices=[], bit_sizes=[32], can_reorder=True):
flags = [CAN_ELIMINATE, CAN_REORDER] if can_reorder else [CAN_ELIMINATE]

View file

@ -837,7 +837,6 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data)
NIR_PASS(_, nir, pco_nir_lower_vk, data);
NIR_PASS(_, nir, pco_nir_lower_io);
NIR_PASS(_, nir, pco_nir_lower_atomics, data);
NIR_PASS(_, nir, nir_opt_constant_folding);
@ -873,6 +872,7 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data)
NIR_PASS(_, nir, pco_nir_lower_clip_cull_vars);
NIR_PASS(_, nir, pco_nir_lower_images, data);
NIR_PASS(_, nir, pco_nir_lower_atomics, data);
NIR_PASS(_,
nir,
nir_lower_tex,

View file

@ -101,19 +101,42 @@ static nir_def *lower_atomic(nir_builder *b, nir_instr *instr, void *cb_data)
b->cursor = nir_before_instr(instr);
nir_def *buffer = intr->src[0].ssa;
nir_def *offset = intr->src[1].ssa;
nir_def *value = intr->src[2].ssa;
nir_def *value_swap = intr->src[3].ssa;
if (intr->intrinsic == nir_intrinsic_ssbo_atomic_swap) {
nir_def *buffer = intr->src[0].ssa;
nir_def *offset = intr->src[1].ssa;
nir_def *value = intr->src[2].ssa;
nir_def *value_swap = intr->src[3].ssa;
ASSERTED enum gl_access_qualifier access = nir_intrinsic_access(intr);
ASSERTED unsigned num_components = intr->def.num_components;
ASSERTED unsigned bit_size = intr->def.bit_size;
assert(access == ACCESS_COHERENT);
assert(num_components == 1 && bit_size == 32);
*uses_usclib = true;
return usclib_emu_ssbo_atomic_comp_swap(b,
buffer,
offset,
value,
value_swap);
}
nir_def *addr_data = intr->src[0].ssa;
nir_def *addr_lo = nir_channel(b, addr_data, 0);
nir_def *addr_hi = nir_channel(b, addr_data, 1);
nir_def *value = nir_channel(b, addr_data, 2);
nir_def *value_swap = nir_channel(b, addr_data, 3);
ASSERTED enum gl_access_qualifier access = nir_intrinsic_access(intr);
ASSERTED unsigned num_components = intr->def.num_components;
ASSERTED unsigned bit_size = intr->def.bit_size;
assert(access == ACCESS_COHERENT);
assert(num_components == 1 && bit_size == 32);
*uses_usclib = true;
return usclib_emu_ssbo_atomic_comp_swap(b, buffer, offset, value, value_swap);
return usclib_emu_global_atomic_comp_swap(b,
addr_lo,
addr_hi,
value,
value_swap);
}
/**
@ -129,8 +152,10 @@ static bool is_lowerable_atomic(const nir_instr *instr,
if (instr->type != nir_instr_type_intrinsic)
return false;
return nir_instr_as_intrinsic(instr)->intrinsic ==
nir_intrinsic_ssbo_atomic_swap;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
return intr->intrinsic == nir_intrinsic_ssbo_atomic_swap ||
intr->intrinsic == nir_intrinsic_global_atomic_swap_pco;
}
/**

View file

@ -839,6 +839,7 @@ static nir_def *lower_image(nir_builder *b, nir_instr *instr, void *cb_data)
case nir_intrinsic_image_deref_atomic:
case nir_intrinsic_image_deref_atomic_swap:
lod = nir_imm_int(b, 0);
break;
default:
@ -1158,9 +1159,6 @@ static nir_def *lower_image(nir_builder *b, nir_instr *instr, void *cb_data)
if (intr->intrinsic == nir_intrinsic_image_deref_atomic ||
intr->intrinsic == nir_intrinsic_image_deref_atomic_swap) {
assert(image_dim == GLSL_SAMPLER_DIM_2D);
assert(!is_array);
assert(util_format_is_plain(format));
assert(util_format_is_pure_integer(format));
@ -1170,12 +1168,90 @@ static nir_def *lower_image(nir_builder *b, nir_instr *instr, void *cb_data)
assert(util_format_get_blockdepth(format) == 1);
assert(util_format_get_blocksize(format) == sizeof(uint32_t));
/* Calculate untwiddled offset. */
nir_def *x = nir_i2i16(b, nir_channel(b, coords, 0));
nir_def *y = nir_i2i16(b, nir_channel(b, coords, 1));
nir_def *twiddled_offset = nir_interleave(b, y, x);
twiddled_offset =
nir_imul_imm(b, twiddled_offset, util_format_get_blocksize(format));
if (image_dim == GLSL_SAMPLER_DIM_CUBE) {
image_dim = GLSL_SAMPLER_DIM_2D;
is_array = true;
} else if (image_dim == GLSL_SAMPLER_DIM_BUF) {
image_dim = GLSL_SAMPLER_DIM_2D;
coords = nir_vec2(b,
nir_umod_imm(b, coords, 8192),
nir_udiv_imm(b, coords, 8192));
}
nir_def *twiddled_offset = NULL;
nir_def *array_index = NULL;
switch (image_dim) {
case GLSL_SAMPLER_DIM_1D: {
twiddled_offset = nir_channel(b, coords, 0);
twiddled_offset =
nir_imul_imm(b, twiddled_offset, util_format_get_blocksize(format));
if (is_array)
array_index = nir_channel(b, coords, 1);
break;
}
case GLSL_SAMPLER_DIM_2D: {
/* Calculate untwiddled offset. */
nir_def *x = nir_i2i16(b, nir_channel(b, coords, 0));
nir_def *y = nir_i2i16(b, nir_channel(b, coords, 1));
twiddled_offset = nir_interleave(b, y, x);
twiddled_offset =
nir_imul_imm(b, twiddled_offset, util_format_get_blocksize(format));
if (is_array)
array_index = nir_channel(b, coords, 2);
break;
}
case GLSL_SAMPLER_DIM_3D: {
assert(!is_array);
/* Calculate untwiddled offset. */
nir_def *num_comps = nir_imm_int(b, 3);
nir_def *dim = nir_imm_int(b, image_dim);
nir_def *_is_array = nir_imm_bool(b, is_array);
nir_def *is_image = nir_imm_bool(b, true);
nir_def *size_comps = usclib_tex_state_size(b,
tex_state,
num_comps,
dim,
_is_array,
is_image,
lod);
twiddled_offset = usclib_twiddle3d(b, coords, size_comps);
data->common.uses.usclib = true;
twiddled_offset =
nir_imul_imm(b, twiddled_offset, util_format_get_blocksize(format));
break;
}
default:
UNREACHABLE("");
}
assert(twiddled_offset);
if (is_array) {
assert(array_index);
nir_def *array_max = usclib_tex_state_array_max(b, tex_state);
array_index = nir_uclamp(b, array_index, nir_imm_int(b, 0), array_max);
nir_def *tex_meta = nir_load_tex_meta_pco(b,
PCO_IMAGE_META_COUNT,
elem,
.desc_set = desc_set,
.binding = binding);
nir_def *array_stride =
nir_channel(b, tex_meta, PCO_IMAGE_META_LAYER_SIZE);
nir_def *array_offset = nir_imul(b, array_index, array_stride);
twiddled_offset = nir_iadd(b, twiddled_offset, array_offset);
}
/* Offset the address by the co-ordinates. */
nir_def *base_addr = usclib_tex_state_address(b, tex_state);
@ -1187,6 +1263,19 @@ static nir_def *lower_image(nir_builder *b, nir_instr *instr, void *cb_data)
nir_def *addr_lo = nir_channel(b, addr, 0);
nir_def *addr_hi = nir_channel(b, addr, 1);
if (intr->intrinsic == nir_intrinsic_image_deref_atomic_swap) {
nir_def *compare = intr->src[3].ssa;
nir_def *dma_data = intr->src[4].ssa;
nir_def *addr_data = nir_vec4(b, addr_lo, addr_hi, compare, dma_data);
return nir_global_atomic_swap_pco(b,
addr_data,
.atomic_op =
nir_intrinsic_atomic_op(intr));
}
nir_def *dma_data = intr->src[3].ssa;
nir_def *addr_data = nir_vec3(b, addr_lo, addr_hi, dma_data);
@ -1360,6 +1449,7 @@ static bool is_image(const nir_instr *instr, UNUSED const void *cb_data)
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_store:
case nir_intrinsic_image_deref_atomic:
case nir_intrinsic_image_deref_atomic_swap:
case nir_intrinsic_image_deref_size:
return true;

View file

@ -112,4 +112,6 @@ uint nir_smp_pco(uint16 data,
uint4 smp_state,
uint smp_flags,
uint range);
uint nir_umax(uint a, uint b);
#endif /* PCO_LIBCL_H */

View file

@ -33,6 +33,27 @@ usclib_emu_ssbo_atomic_comp_swap(uint2 ssbo_buffer, uint ssbo_offset, uint compa
return result;
}
/* Emulated atomic compare-and-swap on a 64-bit global address (split into
 * lo/hi 32-bit halves). Returns the value read from memory before the swap
 * was attempted; the store only happens when that value equals `compare`.
 *
 * NOTE(review): the instance-number loop appears to serialize the atomic
 * across all instances running in lockstep — only the instance whose number
 * matches `u` performs the load/store in a given iteration, and the whole
 * sequence is guarded by the atomic-emulation mutex. Confirm against the
 * mutex/instance semantics of the PCO backend.
 */
uint32_t
usclib_emu_global_atomic_comp_swap(uint32_t addr_lo, uint32_t addr_hi, uint compare, uint data)
{
/* Assigned only in the matching-instance iteration; assumes
 * nir_load_instance_num_pco() < ROGUE_MAX_INSTANCES_PER_TASK — TODO confirm.
 */
uint32_t result;
nir_mutex_pco(PCO_MUTEX_ID_ATOMIC_EMU, PCO_MUTEX_OP_LOCK);
for (uint u = 0; u < ROGUE_MAX_INSTANCES_PER_TASK; ++u) {
if (u == nir_load_instance_num_pco()) {
uint2 addr = (uint2)(addr_lo, addr_hi);
/* Read-modify-write: keep the old value unless it matches `compare`. */
uint32_t pre_val = nir_dma_ld_pco(1, addr);
result = pre_val;
uint32_t post_val = (pre_val == compare) ? data : pre_val;
nir_dma_st_pco(false, addr, post_val);
}
}
nir_mutex_pco(PCO_MUTEX_ID_ATOMIC_EMU, PCO_MUTEX_OP_RELEASE);
return result;
}
void
usclib_barrier(uint num_slots, uint counter_offset)
{

View file

@ -11,6 +11,7 @@
#include "csbgen/rogue_texstate.h"
#include "libcl.h"
#include "util/u_math.h"
uint
@ -123,3 +124,66 @@ usclib_tex_lod_dval_post_clamp_resource_to_view_space(uint4 tex_state, uint4 smp
return MAX2(lod_dval_post_clamp, 0.0f);
}
/* TODO: this can probably be optimized with nir_interleave. */
/* Compute the twiddled (Morton-style) byte-element offset of a texel in a
 * 3D image. The image is treated as a grid of 4x4x4 cubes: the cube index
 * is built by interleaving the bits of the cube coordinates (y, x, z per
 * round, dropping a dimension once it collapses to a single cube), then the
 * texel position within the cube is resolved slice-first, with the low two
 * bits of x/y interleaved inside the 4x4 slice.
 */
uint32_t
usclib_twiddle3d(uint3 coords, uint3 size)
{
   /* Dimensions are clamped up to 4 and rounded to powers of two, then
    * divided by 4 to count 4x4x4 cubes per axis.
    */
   uint32_t cubes_w = util_next_power_of_two(nir_umax(size.x, 4)) >> 2;
   uint32_t cubes_h = util_next_power_of_two(nir_umax(size.y, 4)) >> 2;
   uint32_t cubes_d = util_next_power_of_two(nir_umax(size.z, 4)) >> 2;

   /* Coordinates of the containing 4x4x4 cube. */
   uint32_t cube_x = coords.x >> 2;
   uint32_t cube_y = coords.y >> 2;
   uint32_t cube_z = coords.z >> 2;

   uint32_t cube_index = 0;
   uint32_t out_bit = 0;

   /* Interleave cube coordinate bits; each axis contributes one bit per
    * round (order: y, x, z) until that axis has shrunk to one cube.
    */
   for (uint32_t in_bit = 0; cubes_w > 1 || cubes_h > 1 || cubes_d > 1;
        ++in_bit) {
      if (cubes_h > 1) {
         cube_index |= ((cube_y >> in_bit) & 1) << out_bit++;
         cubes_h >>= 1;
      }

      if (cubes_w > 1) {
         cube_index |= ((cube_x >> in_bit) & 1) << out_bit++;
         cubes_w >>= 1;
      }

      if (cubes_d > 1) {
         cube_index |= ((cube_z >> in_bit) & 1) << out_bit++;
         cubes_d >>= 1;
      }
   }

   /* Each cube spans 4*4*4 elements. */
   uint32_t offset = cube_index * (4 * 4 * 4);

   /* Advance to the 4x4 slice within the cube. */
   offset += (4 * 4) * (coords.z & 3);

   /* Interleave the low two bits of x/y within the slice. */
   uint32_t intra = (coords.y & 1) | ((coords.x & 1) << 1) |
                    (((coords.y >> 1) & 1) << 2) |
                    (((coords.x >> 1) & 1) << 3);

   return offset + intra;
}