nir: add uniform address to nvidia IO intrinsics

Adding the zero constants has a minor impact on stats:

Totals from 61 (0.01% of 1212873) affected shaders:
CodeSize: 1044720 -> 1047472 (+0.26%); split: -0.00%, +0.27%
Static cycle count: 1198932 -> 1198490 (-0.04%); split: -0.07%, +0.04%
Author: Karol Herbst, 2026-01-29 22:36:17 +01:00 (committed by Karol Herbst)
parent e639aa342d
commit 24b725a5d2
9 changed files with 121 additions and 40 deletions
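
In short: every non-swap NV load/store/atomic intrinsic gains an explicit uniform-address source, and all existing lowering paths seed it with a constant zero. A minimal sketch of the new shape, condensed from the nak_nir_lower_load_store hunk below (surrounding pass boilerplate elided):

    /* load_global -> load_global_nv now carries an explicit uniform address,
     * seeded here with a constant zero */
    nir_def *uaddr = nir_imm_zero(&b, 1, addr->ssa->bit_size);
    nir_def *pred  = nir_imm_bool(&b, true);
    nir_def *res   = nir_load_global_nv(&b, intr->def.num_components,
                                        intr->def.bit_size, addr->ssa, uaddr, pred);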

View file

@@ -5787,11 +5787,13 @@ nir_lower_shader_calls(nir_shader *shader,
                        void *mem_ctx);
 int nir_get_io_offset_src_number(const nir_intrinsic_instr *instr);
+int nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr);
 int nir_get_io_index_src_number(const nir_intrinsic_instr *instr);
 int nir_get_io_data_src_number(const nir_intrinsic_instr *instr);
 int nir_get_io_arrayed_index_src_number(const nir_intrinsic_instr *instr);
 nir_src *nir_get_io_offset_src(nir_intrinsic_instr *instr);
+nir_src *nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr);
 nir_src *nir_get_io_index_src(nir_intrinsic_instr *instr);
 nir_src *nir_get_io_data_src(nir_intrinsic_instr *instr);
 nir_src *nir_get_io_arrayed_index_src(nir_intrinsic_instr *instr);
@@ -5801,7 +5803,6 @@ static inline unsigned
 nir_get_io_base_size_nv(const nir_intrinsic_instr *intr)
 {
    switch (intr->intrinsic) {
-   case nir_intrinsic_global_atomic_nv:
    case nir_intrinsic_global_atomic_swap_nv:
    case nir_intrinsic_shared_atomic_nv:
    case nir_intrinsic_shared_atomic_swap_nv:
@@ -5814,6 +5815,9 @@ nir_get_io_base_size_nv(const nir_intrinsic_instr *intr)
    case nir_intrinsic_store_shared_nv:
    case nir_intrinsic_store_shared_unlock_nv:
       return 24;
+   case nir_intrinsic_global_atomic_nv:
+      /* TODO: SM100+ only has 23 bits for the UGPR + GPR form */
+      return 23;
    case nir_intrinsic_ldc_nv:
    case nir_intrinsic_ldcx_nv:
       return 16;
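
As a caller-side sketch (assumed usage; it mirrors the nir_opt_offsets and nir_validate hunks below), the base size bounds how much a pass may fold into BASE, and the range is unsigned only when every address source is a constant zero:

    unsigned bits = nir_get_io_base_size_nv(intr);
    int32_t min = 0;
    uint32_t max = BITFIELD_MASK(bits);   /* unsigned form: [0, 2^bits) */
    /* addr_is_const_zero/uaddr_is_const_zero are placeholders for the
     * nir_src_is_const + nir_src_as_int checks used further down */
    if (!addr_is_const_zero || !uaddr_is_const_zero) {
       max >>= 1;                         /* signed form: [~max, max] */
       min = ~max;
    }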

View file

@@ -941,7 +941,8 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0,
 # The offset is sign-extended or zero-extended based on the SIGN_EXTEND index.
 #
 # NV variants all come with a 24 bit base, that is unsigned with a constant 0 address,
-# signed otherwise.
+# signed otherwise. Non-swap atomics also come with an additional uniform address source
+# right after the non-uniform memory address.
 #
 # PCO global variants use a vec3 for the memory address and data, where component X
 # has the low 32 address bits, component Y has the high 32 address bits, and component Z
@@ -950,13 +951,13 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0,
 intrinsic("deref_atomic", src_comp=[-1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP])
 intrinsic("ssbo_atomic", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP, OFFSET_SHIFT])
 intrinsic("shared_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
-intrinsic("shared_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV])
+intrinsic("shared_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV])
 intrinsic("task_payload_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
 intrinsic("global_atomic", src_comp=[1, 1], dest_comp=1, indices=[ATOMIC_OP])
 intrinsic("global_atomic_2x32", src_comp=[2, 1], dest_comp=1, indices=[ATOMIC_OP])
 intrinsic("global_atomic_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
 intrinsic("global_atomic_agx", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMIC_OP, SIGN_EXTEND])
-intrinsic("global_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
+intrinsic("global_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
 intrinsic("global_atomic_pco", src_comp=[3], dest_comp=1, indices=[ATOMIC_OP], bit_sizes=[32])
 intrinsic("deref_atomic_swap", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP])
@@ -1920,15 +1921,15 @@ load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flag
 # src[] = { value, address, unsigned 32-bit offset }.
 store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRITE_MASK])
-# src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given,
-# signed otherwise.
+# src[] = { address, uniform_address }. BASE is a 24 bit unsigned offset if a constant 0 address and
+# a constant 0 uniform_address are given, signed otherwise.
 # load_global_nv has an additional boolean input that makes the load return 0 on false.
-load("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
-store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
-load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
-store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET])
-load("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
-store("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
+load("global_nv", [1, 1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
+store("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
+load("scratch_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
+store("scratch_nv", [1, 1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET])
+load("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
+store("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
 # Same as shared_atomic_add, but with GDS. src[] = {store_val, gds_addr, m0}
 intrinsic("gds_atomic_add_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE])
@@ -2942,7 +2943,8 @@ intrinsic("ssa_bar_nv", src_comp=[1])
 intrinsic("cmat_muladd_nv", src_comp=[-1, -1, -1], dest_comp=0, bit_sizes=src2,
           indices=[FLAGS], flags=[CAN_ELIMINATE])
-intrinsic("cmat_load_shared_nv", src_comp=[1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE])
+# src[] = { address, uniform_address }
+intrinsic("cmat_load_shared_nv", src_comp=[1, 1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE])
 # Moves a 8x8 16bit matrix with transposition within a subgroup
 intrinsic("cmat_mov_transpose_nv", src_comp=[2], dest_comp=2, bit_sizes=[16], flags=[CAN_ELIMINATE, CAN_REORDER, SUBGROUP])

View file

@@ -1106,6 +1106,39 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr)
    case nir_intrinsic_bindless_image_##name: \
    case nir_intrinsic_image_heap_##name
 
+/**
+ * Return the uniform offset source number for a load/store intrinsic or -1
+ * if there's no uniform offset.
+ */
+int
+nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_cmat_load_shared_nv:
+   case nir_intrinsic_global_atomic_nv:
+   case nir_intrinsic_load_global_nv:
+   case nir_intrinsic_load_scratch_nv:
+   case nir_intrinsic_load_shared_nv:
+   case nir_intrinsic_shared_atomic_nv:
+      return 1;
+   case nir_intrinsic_store_global_nv:
+   case nir_intrinsic_store_scratch_nv:
+   case nir_intrinsic_store_shared_nv:
+      return 2;
+   default:
+      return -1;
+   }
+}
+
+/**
+ * Return the uniform offset source for a load/store intrinsic.
+ */
+nir_src *
+nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr)
+{
+   const int idx = nir_get_io_uniform_offset_src_number(instr);
+   return idx >= 0 ? &instr->src[idx] : NULL;
+}
+
 /**
  * Return the index or handle source number for a load/store intrinsic or -1
  * if there's no index or handle.
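
A hedged example of combining the two helpers (it mirrors the nir_validate change further down; the unsigned_base name is made up for illustration):

    nir_src src = *nir_get_io_offset_src(intr);
    nir_src *uniform_src = nir_get_io_uniform_offset_src(intr);
    /* BASE is an unsigned immediate only if every address source is constant 0 */
    bool unsigned_base =
       nir_src_is_const(src) && nir_src_as_int(src) == 0 &&
       (!uniform_src ||
        (nir_src_is_const(*uniform_src) && nir_src_as_int(*uniform_src) == 0));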

View file

@@ -193,11 +193,12 @@ try_fold_load_store_nv(nir_builder *b,
    assert(offset_idx >= 0);
 
    nir_src src = intrin->src[offset_idx];
+   nir_src *uniform_src = nir_get_io_uniform_offset_src(intrin);
 
    int32_t min = 0;
    uint32_t max = BITFIELD_MASK(offset_bits);
-   if (!nir_src_is_const(src)) {
+   if (!nir_src_is_const(src) || (uniform_src && !nir_src_is_const(*uniform_src))) {
       max >>= 1;
       min = ~max;
    }
@@ -211,6 +212,11 @@ try_fold_load_store_nv(nir_builder *b,
       return false;
    }
 
+   /* We don't try to fold the offset for the uniform source on purpose,
+    * because we rely on running nir_opt_offsets before moving in the uniform
+    * source. However, we might run this pass again _after_ that, because we
+    * can eliminate a u2u64 on the _non-uniform_ source and therefore might be
+    * able to fold more constants into base. */
    return try_fold_load_store(b, intrin, state, offset_idx, min, max, false);
 }
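
The ordering the new comment relies on, as a hedged pipeline sketch (the driver's real pass order is an assumption; only the relative order matters here):

    /* 1. fold constant offsets into BASE while uniform_address is still zero */
    NIR_PASS(progress, nir, nir_opt_offsets, &offset_opts);
    /* 2. a backend pass moves a subgroup-uniform value into uniform_address */
    /* 3. rerun: a u2u64 on the non-uniform source may now go away, letting
     *    more constants fold into BASE */
    NIR_PASS(progress, nir, nir_opt_offsets, &offset_opts);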

View file

@@ -761,9 +761,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
    case nir_intrinsic_vild_nv: {
       int base = nir_intrinsic_base(instr);
       nir_src src = *nir_get_io_offset_src(instr);
+      nir_src *uniform_src = nir_get_io_uniform_offset_src(instr);
       unsigned const_bits = nir_get_io_base_size_nv(instr);
-      if (nir_src_is_const(src) && nir_src_as_int(src) == 0) {
+      if (nir_src_is_const(src) && nir_src_as_int(src) == 0 &&
+          (!uniform_src || (nir_src_is_const(*uniform_src) && nir_src_as_int(*uniform_src) == 0))) {
          validate_assert(state, base >= 0 && base < BITFIELD_MASK(const_bits));
       } else {
          int32_t max = BITFIELD_MASK(const_bits - 1);
@@ -771,8 +773,14 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
          validate_assert(state, base >= min && base < max);
       }
 
+      if (uniform_src) {
+         validate_assert(state, uniform_src->ssa->bit_size >= src.ssa->bit_size);
+         if (state->impl->valid_metadata & nir_metadata_divergence)
+            validate_assert(state, !uniform_src->ssa->divergent);
+      }
+
       if (instr->intrinsic == nir_intrinsic_load_global_nv) {
-         validate_assert(state, instr->src[1].ssa->bit_size == 1);
+         validate_assert(state, instr->src[2].ssa->bit_size == 1);
       }
       break;
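
Worked out for the common 24-bit case: with both address sources constant zero, the asserts pin BASE to [0, BITFIELD_MASK(24)) = [0, 0xFFFFFF); otherwise max = BITFIELD_MASK(23) = 0x7FFFFF and (per the matching opt pass) min = ~max = -0x800000, so BASE must sit in [-0x800000, 0x7FFFFF).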

View file

@@ -2975,7 +2975,8 @@ impl<'a> ShaderFromNir<'a> {
             nir_intrinsic_global_atomic_nv => {
                 let bit_size = intrin.def.bit_size();
                 let addr = self.get_src(&srcs[0]);
-                let data = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[1]);
+                let data = self.get_src(&srcs[2]);
 
                 let atom_type = self.get_atomic_type(intrin);
                 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
@@ -2992,7 +2993,7 @@ impl<'a> ShaderFromNir<'a> {
                         dst.clone().into()
                     },
                     addr: addr,
-                    uniform_address: Src::ZERO,
+                    uniform_address: uaddr,
                     cmpr: 0.into(),
                     data: data,
                     atom_op: atom_op,
@@ -3220,13 +3221,14 @@ impl<'a> ShaderFromNir<'a> {
                         .get_eviction_priority(intrin.access()),
                 };
                 let addr = self.get_src(&srcs[0]);
-                let pred = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[1]);
+                let pred = self.get_src(&srcs[2]);
                 let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
                 b.push_op(OpLd {
                     dst: dst.clone().into(),
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     pred: pred,
                     offset: intrin.base(),
                     stride: OffsetStride::X1,
@@ -3333,12 +3335,13 @@ impl<'a> ShaderFromNir<'a> {
                     eviction_priority: MemEvictionPriority::Normal,
                 };
                 let addr = self.get_src(&srcs[0]);
+                let uaddr = self.get_src(&srcs[1]);
                 let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
                 b.push_op(OpLd {
                     dst: dst.clone().into(),
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     pred: true.into(),
                     offset: intrin.base(),
                     stride: OffsetStride::X1,
@@ -3357,12 +3360,14 @@ impl<'a> ShaderFromNir<'a> {
                     eviction_priority: MemEvictionPriority::Normal,
                 };
                 let addr = self.get_src(&srcs[0]);
+                let uaddr = self.get_src(&srcs[1]);
                 let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
                 b.push_op(OpLd {
                     dst: dst.clone().into(),
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     pred: true.into(),
                     offset: intrin.base(),
                     stride: intrin.offset_shift_nv().try_into().unwrap(),
@@ -3673,7 +3678,8 @@ impl<'a> ShaderFromNir<'a> {
             nir_intrinsic_shared_atomic_nv => {
                 let bit_size = intrin.def.bit_size();
                 let addr = self.get_src(&srcs[0]);
-                let data = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[1]);
+                let data = self.get_src(&srcs[2]);
 
                 let atom_type = self.get_atomic_type(intrin);
                 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
@@ -3683,7 +3689,7 @@ impl<'a> ShaderFromNir<'a> {
                 b.push_op(OpAtom {
                     dst: dst.clone().into(),
                     addr: addr,
-                    uniform_address: Src::ZERO,
+                    uniform_address: uaddr,
                     cmpr: 0.into(),
                     data: data,
                     atom_op: atom_op,
@@ -3740,10 +3746,11 @@ impl<'a> ShaderFromNir<'a> {
                         .get_eviction_priority(intrin.access()),
                 };
                 let addr = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[2]);
                 b.push_op(OpSt {
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     data: data,
                     offset: intrin.base(),
                     stride: OffsetStride::X1,
@@ -3772,10 +3779,11 @@ impl<'a> ShaderFromNir<'a> {
                     eviction_priority: MemEvictionPriority::Normal,
                 };
                 let addr = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[2]);
                 b.push_op(OpSt {
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     data: data,
                     offset: intrin.base(),
                     stride: OffsetStride::X1,
@@ -3794,10 +3802,11 @@ impl<'a> ShaderFromNir<'a> {
                     eviction_priority: MemEvictionPriority::Normal,
                 };
                 let addr = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[2]);
                 b.push_op(OpSt {
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     data: data,
                     offset: intrin.base(),
                     stride: intrin.offset_shift_nv().try_into().unwrap(),
@@ -3912,12 +3921,13 @@ impl<'a> ShaderFromNir<'a> {
                 };
                 let dst = b.alloc_ssa_vec(RegFile::GPR, comps);
                 let addr = self.get_src(&srcs[0]);
+                let uaddr = self.get_src(&srcs[1]);
                 b.push_op(OpLdsm {
                     dst: dst.clone().into(),
                     mat_size,
                     mat_count,
                     addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     offset: intrin.base(),
                 });
                 self.set_dst(&intrin.def, dst);

View file

@@ -1019,8 +1019,23 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
             continue;
 
          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+         nir_src *addr;
+         switch (intr->intrinsic) {
+         case nir_intrinsic_load_global_bounded:
+         case nir_intrinsic_load_global_constant_bounded: {
+            addr = &intr->src[0];
+            break;
+         }
+         default:
+            addr = nir_get_io_offset_src(intr);
+            break;
+         }
+
+         if (!addr)
+            continue;
         b.cursor = nir_before_instr(instr);
-         nir_src *addr = nir_get_io_offset_src(intr);
+         nir_def *uaddr = nir_imm_zero(&b, 1, addr->ssa->bit_size);
 
          nir_def *res = NULL;
          nir_intrinsic_instr *new = NULL;
@@ -1028,7 +1043,7 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
          case nir_intrinsic_load_global:
          case nir_intrinsic_load_global_constant: {
            nir_def *nir_true = nir_imm_bool(&b, true);
-            res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true);
+            res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr, nir_true);
             break;
          }
          case nir_intrinsic_load_global_bounded:
@@ -1044,32 +1059,32 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
             nir_def *addr = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa));
             nir_def *last_byte = nir_iadd_imm(&b, offset->ssa, load_size - 1);
             nir_def *cond = nir_ult(&b, last_byte, size->ssa);
-            res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, cond);
+            res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, uaddr, cond);
             break;
          }
          case nir_intrinsic_load_scratch:
-            res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
+            res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr);
             break;
          case nir_intrinsic_load_shared:
-            res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
+            res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr);
             break;
          case nir_intrinsic_store_global:
-            new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa);
+            new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa, uaddr);
             break;
          case nir_intrinsic_store_scratch:
-            new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa);
+            new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa, uaddr);
             break;
          case nir_intrinsic_store_shared:
-            new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa);
+            new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa, uaddr);
             break;
          case nir_intrinsic_global_atomic:
-            res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa);
+            res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa);
             break;
          case nir_intrinsic_global_atomic_swap:
             res = nir_global_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa);
             break;
          case nir_intrinsic_shared_atomic:
-            res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa);
+            res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa);
             break;
          case nir_intrinsic_shared_atomic_swap:
            res = nir_shared_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa);

View file

@@ -723,6 +723,7 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr)
    nir_def *base = intr->src[1].ssa;
    offset = nir_u2uN(b, offset, base->bit_size);
    nir_def *addr = nir_iadd(b, base, offset);
+   nir_def *zero = nir_imm_zero(b, addr->num_components, addr->bit_size);
 
    /* flip the layout for B matrices */
    if (desc.use == GLSL_CMAT_USE_B) {
@@ -734,7 +735,7 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr)
    /* Each thread loads 32 bits per matrix */
    assert(length * bit_size == 32 * ldsm_count);
 
-   return nir_cmat_load_shared_nv(b, length, bit_size, addr,
+   return nir_cmat_load_shared_nv(b, length, bit_size, addr, zero,
                                   .num_matrices = ldsm_count,
                                   .matrix_layout = layout);
 }

View file

@@ -56,10 +56,12 @@ lower_ldcx_to_global(nir_builder *b, nir_intrinsic_instr *load,
     * simple less-than check here.
     */
    nir_def *cond = nir_ilt(b, offset, size);
+   nir_def *zero_addr = nir_imm_zero(b, addr->num_components,
+                                     addr->bit_size);
    nir_def *val = nir_load_global_nv(b,
       load->def.num_components, load->def.bit_size,
       nir_iadd(b, addr, nir_u2u64(b, offset)),
-      cond,
+      zero_addr, cond,
      .align_mul = nir_intrinsic_align_mul(load),
      .align_offset = nir_intrinsic_align_offset(load),
      .access = ACCESS_CAN_REORDER,