ir3: ingest global addresses as 64b values from NIR

There are currently two places where we have to handle values that are
logically 64b: 64b atomics and 64b global addresses. For the former, we
ingest the values as 64b from NIR, while the latter uses 2x32b values.
This commit makes things more consistent by using 64b NIR values for
global addresses as well.

Of course, we could also go the other way around and use 2x32b values
everywhere, which would make things consistent as well. Given that ir3
doesn't actually have 64b registers, and 64b values are represented by
collected 2x32b registers, this could actually make more sense.

In the end, both methods are mostly equivalent and it probably doesn't
matter too much one way or the other. However, the reason I have a
slight preference for ingesting things as 64b is that it allows us to
use more of the generic NIR intrinsics, which use 1-component values for
64b addresses or atomic values. This commit already makes
global_atomic(_swap)_ir3 obsolete and I'm planning to create generic
intrinsics to support ldg.a/stg.a as well.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33797>
Authored by Job Noorman on 2025-06-12 12:14:47 +02:00; committed by Marge Bot
parent c91f4b2363
commit 2490ecf5fc
8 changed files with 38 additions and 45 deletions
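
The practical effect is visible at the call sites in the diffs below: where callers of the *_global_ir3 intrinsics used to pass the address as a vec2 of 32-bit components, they now pass a single 64b value, either packing the 2x32b pieces with nir_pack_64_2x32 or emitting a 64b immediate directly. A minimal sketch of the new caller pattern, assuming the NIR builder helpers used elsewhere in this diff; load_dword_at and its base argument are hypothetical and not part of this commit:

#include "nir_builder.h"

/* Sketch: load one 32-bit dword through load_global_ir3 given a base address
 * that is still a vec2 (x = low dword, y = high dword). */
static nir_def *
load_dword_at(nir_builder *b, nir_def *base)
{
   /* Before this commit the vec2 was passed straight through:
    *    return nir_load_global_ir3(b, 1, 32, base, nir_imm_int(b, 0));
    * Now the address source is a single 64b component, so pack it first. */
   return nir_load_global_ir3(b, 1, 32, nir_pack_64_2x32(b, base),
                              nir_imm_int(b, 0));
}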

@@ -1435,11 +1435,11 @@ load("shared_ir3", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 # src[] = { value, address(vec2 of hi+lo uint32_t), offset }.
 # const_index[] = { write_mask, align_mul, align_offset }
-store("global_ir3", [2, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET])
+store("global_ir3", [1, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { address(vec2 of hi+lo uint32_t), offset }.
 # const_index[] = { access, align_mul, align_offset }
 # the alignment applies to the base address
-load("global_ir3", [2, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE], flags=[CAN_ELIMINATE])
+load("global_ir3", [1, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE], flags=[CAN_ELIMINATE])
 # Etnaviv-specific load/glboal intrinsics. They take a 32-bit base address and
 # a 32-bit offset, which doesn't need to be an immediate.

@@ -1489,7 +1489,7 @@ intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
 # IR3-specific intrinsic for ldg.k.
 # base is an offset to apply to the address in bytes, range_base is the
 # const file base in components, range is the amount to copy in vec4's.
-intrinsic("copy_global_to_uniform_ir3", [2], indices=[BASE, RANGE_BASE, RANGE])
+intrinsic("copy_global_to_uniform_ir3", [1], indices=[BASE, RANGE_BASE, RANGE])
 # IR3-specific intrinsic for stsc. Loads from push consts to constant file
 # Should be used in the shader preamble.

@@ -416,8 +416,7 @@ emit_intrinsic_load_global_ir3(struct ir3_context *ctx,
    unsigned dest_components = nir_intrinsic_dest_components(intr);
    struct ir3_instruction *addr, *offset;
-   addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0],
-                      ir3_get_src(ctx, &intr->src[0])[1]);
+   addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0]);
    struct ir3_instruction *load;

@@ -459,8 +458,7 @@ emit_intrinsic_store_global_ir3(struct ir3_context *ctx,
    struct ir3_instruction *value, *addr, *offset;
    unsigned ncomp = nir_intrinsic_src_components(intr, 0);
-   addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[1])[0],
-                      ir3_get_src(ctx, &intr->src[1])[1]);
+   addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[1])[0]);
    value = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp);

@@ -507,8 +505,7 @@ emit_intrinsic_atomic_global(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       type = TYPE_ATOMIC_U64;
    }
-   addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0],
-                      ir3_get_src(ctx, &intr->src[0])[1]);
+   addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0]);
    if (op == nir_atomic_op_cmpxchg) {
       struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[2])[0];

@@ -1246,9 +1246,8 @@ emit_intrinsic_copy_global_to_uniform(struct ir3_context *ctx,
    if (dst_hi)
       a1 = ir3_create_addr1(&ctx->build, dst_hi << 8);
-   struct ir3_instruction *addr_lo = ir3_get_src(ctx, &intr->src[0])[0];
-   struct ir3_instruction *addr_hi = ir3_get_src(ctx, &intr->src[0])[1];
-   struct ir3_instruction *addr = ir3_collect(b, addr_lo, addr_hi);
+   struct ir3_instruction *addr =
+      ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0]);
    struct ir3_instruction *ldg = ir3_LDG_K(b, create_immed(b, dst_lo), 0, addr, 0,
                                            create_immed(b, addr_offset), 0,
                                            create_immed(b, size), 0);

@@ -3268,8 +3267,8 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    case nir_intrinsic_bindless_resource_ir3:
       dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
       break;
-   case nir_intrinsic_global_atomic_ir3:
-   case nir_intrinsic_global_atomic_swap_ir3: {
+   case nir_intrinsic_global_atomic:
+   case nir_intrinsic_global_atomic_swap: {
      dst[0] = ctx->funcs->emit_intrinsic_atomic_global(ctx, intr);
      break;
   }

@@ -569,8 +569,7 @@ lower_shader_clock(struct nir_builder *b, nir_intrinsic_instr *instr, void *data
    nir_push_if(b, nir_elect(b, 1));
    {
       /* ALWAYSON counter is mapped to this address. */
-      nir_def *base_addr =
-         nir_unpack_64_2x32(b, nir_imm_int64(b, uche_trap_base));
+      nir_def *base_addr = nir_imm_int64(b, uche_trap_base);
       /* Reading _LO first presumably latches _HI making the read atomic. */
       nir_def *clock_lo =
          nir_load_global_ir3(b, 1, 32, base_addr, nir_imm_int(b, 0));

@@ -36,8 +36,8 @@ lower_64b_intrinsics_filter(const nir_instr *instr, const void *unused)
    /* skip over ssbo atomics, we'll lower them later */
    if (intr->intrinsic == nir_intrinsic_ssbo_atomic ||
        intr->intrinsic == nir_intrinsic_ssbo_atomic_swap ||
-       intr->intrinsic == nir_intrinsic_global_atomic_ir3 ||
-       intr->intrinsic == nir_intrinsic_global_atomic_swap_ir3)
+       intr->intrinsic == nir_intrinsic_global_atomic ||
+       intr->intrinsic == nir_intrinsic_global_atomic_swap)
       return false;
    if (nir_intrinsic_dest_components(intr) == 0)

@@ -226,8 +226,6 @@ lower_64b_global_filter(const nir_instr *instr, const void *unused)
    case nir_intrinsic_load_global:
    case nir_intrinsic_load_global_constant:
    case nir_intrinsic_store_global:
-   case nir_intrinsic_global_atomic:
-   case nir_intrinsic_global_atomic_swap:
       return true;
    default:
       return false;

@@ -242,26 +240,13 @@ lower_64b_global(nir_builder *b, nir_instr *instr, void *unused)
    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
    bool load = intr->intrinsic != nir_intrinsic_store_global;
-   nir_def *addr64 = intr->src[load ? 0 : 1].ssa;
-   nir_def *addr = nir_unpack_64_2x32(b, addr64);
+   nir_def *addr = intr->src[load ? 0 : 1].ssa;
    /*
    * Note that we can get vec8/vec16 with OpenCL.. we need to split
    * those up into max 4 components per load/store.
    */
-   if (intr->intrinsic == nir_intrinsic_global_atomic) {
-      return nir_global_atomic_ir3(
-         b, intr->def.bit_size, addr,
-         intr->src[1].ssa,
-         .atomic_op = nir_intrinsic_atomic_op(intr));
-   } else if (intr->intrinsic == nir_intrinsic_global_atomic_swap) {
-      return nir_global_atomic_swap_ir3(
-         b, intr->def.bit_size, addr,
-         intr->src[1].ssa, intr->src[2].ssa,
-         .atomic_op = nir_intrinsic_atomic_op(intr));
-   }
    enum gl_access_qualifier access = nir_intrinsic_access(intr);
    if (load) {

@@ -55,6 +55,18 @@ build_local_primitive_id(nir_builder *b, struct state *state)
                      63);
 }

+static nir_def *
+load_tess_param_base(nir_builder *b)
+{
+   return nir_pack_64_2x32(b, nir_load_tess_param_base_ir3(b));
+}
+
+static nir_def *
+load_tess_factor_base(nir_builder *b)
+{
+   return nir_pack_64_2x32(b, nir_load_tess_factor_base_ir3(b));
+}
+
 static bool
 is_tess_levels(gl_varying_slot slot)
 {

@@ -517,7 +529,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
       b->cursor = nir_before_instr(&intr->instr);
-      nir_def *address = nir_load_tess_param_base_ir3(b);
+      nir_def *address = load_tess_param_base(b);
       nir_def *offset = build_per_vertex_offset(
          b, state, intr->src[0].ssa,
          nir_intrinsic_io_semantics(intr).location,

@@ -538,7 +550,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
          util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
       nir_def *value = intr->src[0].ssa;
-      nir_def *address = nir_load_tess_param_base_ir3(b);
+      nir_def *address = load_tess_param_base(b);
       nir_def *offset = build_per_vertex_offset(
          b, state, intr->src[1].ssa,
          nir_intrinsic_io_semantics(intr).location,

@@ -565,11 +577,11 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
      gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
      if (is_tess_levels(location)) {
         assert(intr->def.num_components == 1);
-        address = nir_load_tess_factor_base_ir3(b);
+        address = load_tess_factor_base(b);
         offset = build_tessfactor_base(
            b, location, nir_intrinsic_component(intr), state);
      } else {
-        address = nir_load_tess_param_base_ir3(b);
+        address = load_tess_param_base(b);
         offset = build_patch_offset(b, state, location,
                                     nir_intrinsic_component(intr),
                                     intr->src[0].ssa);

@@ -620,14 +632,14 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                           intr->src[0].ssa,
-                          nir_load_tess_factor_base_ir3(b),
+                          load_tess_factor_base(b),
                           nir_iadd(b, intr->src[1].ssa, offset));
         if (location != VARYING_SLOT_PRIMITIVE_ID) {
            nir_pop_if(b, nif);
         }
      } else {
-        nir_def *address = nir_load_tess_param_base_ir3(b);
+        nir_def *address = load_tess_param_base(b);
         nir_def *offset = build_patch_offset(
            b, state, location, nir_intrinsic_component(intr),
            intr->src[1].ssa);

@@ -729,7 +741,7 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
      b->cursor = nir_before_instr(&intr->instr);
-     nir_def *address = nir_load_tess_param_base_ir3(b);
+     nir_def *address = load_tess_param_base(b);
      nir_def *offset = build_per_vertex_offset(
         b, state, intr->src[0].ssa,
         nir_intrinsic_io_semantics(intr).location,

@@ -755,11 +767,11 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
      gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
      if (is_tess_levels(location)) {
         assert(intr->def.num_components == 1);
-        address = nir_load_tess_factor_base_ir3(b);
+        address = load_tess_factor_base(b);
         offset = build_tessfactor_base(
            b, location, nir_intrinsic_component(intr), state);
      } else {
-        address = nir_load_tess_param_base_ir3(b);
+        address = load_tess_param_base(b);
         offset = build_patch_offset(b, state, location,
                                     nir_intrinsic_component(intr),
                                     intr->src[0].ssa);

@@ -262,7 +262,7 @@ load_tlas(nir_builder *b, nir_def *tlas,
                                 .align_offset = offset);
    } else {
       return nir_load_global_ir3(b, components, 32,
-                                 tlas,
+                                 nir_pack_64_2x32(b, tlas),
                                  nir_iadd_imm(b, nir_imul_imm(b, index, AS_RECORD_SIZE / 4),
                                               offset / 4),
                                  /* The required alignment of the

@@ -742,7 +742,8 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
    }
    val = nir_load_global_ir3(b, intrin->num_components,
                              intrin->def.bit_size,
-                             base_addr, nir_ishr_imm(b, offset, 2),
+                             nir_pack_64_2x32(b, base_addr),
+                             nir_ishr_imm(b, offset, 2),
                              .access =
                                 (enum gl_access_qualifier)(
                                    (enum gl_access_qualifier)(ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER) |