nir: add uniform address to nvidia IO intrinsics
Adding the zero constants has a minor impact on stats.

Totals from 61 (0.01% of 1212873) affected shaders:
CodeSize: 1044720 -> 1047472 (+0.26%); split: -0.00%, +0.27%
Static cycle count: 1198932 -> 1198490 (-0.04%); split: -0.07%, +0.04%
parent e639aa342d
commit 24b725a5d2
9 changed files with 121 additions and 40 deletions
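All the NV load/store/atomic intrinsics now carry an explicit uniform-address source next to the regular address; callers that have nothing uniform to contribute pass a constant zero, which is what the NAK lowering below does. A minimal sketch of the new builder call (b, addr, num_components and bit_size are placeholders, not names from the patch):

   /* Sketch: emitting the new form of load_global_nv with a constant-zero
    * uniform address, which keeps the 24-bit BASE unsigned just like the
    * old single-address form did. */
   nir_def *uaddr = nir_imm_zero(b, 1, addr->bit_size);
   nir_def *val = nir_load_global_nv(b, num_components, bit_size,
                                     addr, uaddr, nir_imm_bool(b, true));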
@@ -5787,11 +5787,13 @@ nir_lower_shader_calls(nir_shader *shader,
                         void *mem_ctx);
 
 int nir_get_io_offset_src_number(const nir_intrinsic_instr *instr);
+int nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr);
 int nir_get_io_index_src_number(const nir_intrinsic_instr *instr);
 int nir_get_io_data_src_number(const nir_intrinsic_instr *instr);
 int nir_get_io_arrayed_index_src_number(const nir_intrinsic_instr *instr);
 
 nir_src *nir_get_io_offset_src(nir_intrinsic_instr *instr);
+nir_src *nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr);
 nir_src *nir_get_io_index_src(nir_intrinsic_instr *instr);
 nir_src *nir_get_io_data_src(nir_intrinsic_instr *instr);
 nir_src *nir_get_io_arrayed_index_src(nir_intrinsic_instr *instr);
@@ -5801,7 +5803,6 @@ static inline unsigned
 nir_get_io_base_size_nv(const nir_intrinsic_instr *intr)
 {
    switch (intr->intrinsic) {
-   case nir_intrinsic_global_atomic_nv:
    case nir_intrinsic_global_atomic_swap_nv:
    case nir_intrinsic_shared_atomic_nv:
    case nir_intrinsic_shared_atomic_swap_nv:
@@ -5814,6 +5815,9 @@ nir_get_io_base_size_nv(const nir_intrinsic_instr *intr)
    case nir_intrinsic_store_shared_nv:
    case nir_intrinsic_store_shared_unlock_nv:
       return 24;
+   case nir_intrinsic_global_atomic_nv:
+      /* TODO: SM100+ only has 23 bits for the UGPR + GPR form */
+      return 23;
    case nir_intrinsic_ldc_nv:
    case nir_intrinsic_ldcx_nv:
       return 16;
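The bit count returned by nir_get_io_base_size_nv() bounds the immediate BASE index on the intrinsic. A rough sketch of the range it implies, mirroring the nir_validate change further down (the two *_is_const_zero flags are placeholders):

   unsigned const_bits = nir_get_io_base_size_nv(intr);
   bool zero_addrs = addr_is_const_zero && uniform_addr_is_const_zero;
   /* Unsigned when both addresses are constant zero, signed otherwise. */
   int64_t max = zero_addrs ? BITFIELD_MASK(const_bits)        /* e.g. 24 bits: 0xffffff  */
                            : BITFIELD_MASK(const_bits - 1);   /* e.g. 24 bits: 0x7fffff  */
   int64_t min = zero_addrs ? 0 : ~max;                        /* e.g. 24 bits: -0x800000 */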
@@ -941,7 +941,8 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0,
 # The offset is sign-extended or zero-extended based on the SIGN_EXTEND index.
 #
 # NV variants all come with a 24 bit base, that is unsigned with a constant 0 address,
-# signed otherwise.
+# signed otherwise. Non swap atomic also comes with an additional uniform address source
+# right after the non uniform memory address.
 #
 # PCO global variants use a vec3 for the memory address and data, where component X
 # has the low 32 address bits, component Y has the high 32 address bits, and component Z
@@ -950,13 +951,13 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0,
 intrinsic("deref_atomic", src_comp=[-1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP])
 intrinsic("ssbo_atomic", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP, OFFSET_SHIFT])
 intrinsic("shared_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
-intrinsic("shared_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV])
+intrinsic("shared_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV])
 intrinsic("task_payload_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
 intrinsic("global_atomic", src_comp=[1, 1], dest_comp=1, indices=[ATOMIC_OP])
 intrinsic("global_atomic_2x32", src_comp=[2, 1], dest_comp=1, indices=[ATOMIC_OP])
 intrinsic("global_atomic_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
 intrinsic("global_atomic_agx", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMIC_OP, SIGN_EXTEND])
-intrinsic("global_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
+intrinsic("global_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
 intrinsic("global_atomic_pco", src_comp=[3], dest_comp=1, indices=[ATOMIC_OP], bit_sizes=[32])
 
 intrinsic("deref_atomic_swap", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP])
@@ -1920,15 +1921,15 @@ load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flag
 # src[] = { value, address, unsigned 32-bit offset }.
 store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRITE_MASK])
 
-# src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given,
-# signed otherwise.
+# src[] = { address, uniform_address }. BASE is a 24 bit unsigned offset if a constant 0 address and
+# a constant 0 uniform_address is given, signed otherwise.
 # load_global_nv has an additional boolean input that makes the load return 0 on false.
-load("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
-store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
-load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
-store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET])
-load("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
-store("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
+load("global_nv", [1, 1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
+store("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
+load("scratch_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
+store("scratch_nv", [1, 1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET])
+load("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
+store("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
 
 # Same as shared_atomic_add, but with GDS. src[] = {store_val, gds_addr, m0}
 intrinsic("gds_atomic_add_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE])
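Read together with the comment above, the addressing these definitions describe is, conceptually (an assumption about how the parts combine, not code from the patch):

   /* Conceptual only: both register sources plus the immediate BASE.
    * BASE is treated as 24-bit unsigned when address and uniform_address
    * are both constant zero, and as signed otherwise. */
   uint64_t effective_address = address + uniform_address + base;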
@@ -2942,7 +2943,8 @@ intrinsic("ssa_bar_nv", src_comp=[1])
 intrinsic("cmat_muladd_nv", src_comp=[-1, -1, -1], dest_comp=0, bit_sizes=src2,
           indices=[FLAGS], flags=[CAN_ELIMINATE])
 
-intrinsic("cmat_load_shared_nv", src_comp=[1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE])
+# src[] = { address, uniform_address }
+intrinsic("cmat_load_shared_nv", src_comp=[1, 1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE])
 
 # Moves a 8x8 16bit matrix with transposition within a subgroup
 intrinsic("cmat_mov_transpose_nv", src_comp=[2], dest_comp=2, bit_sizes=[16], flags=[CAN_ELIMINATE, CAN_REORDER, SUBGROUP])
@@ -1106,6 +1106,39 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr)
    case nir_intrinsic_bindless_image_##name: \
    case nir_intrinsic_image_heap_##name
 
+/**
+ * Return the uniform offset source number for a load/store intrinsic or -1 if there's no offset.
+ */
+int
+nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_cmat_load_shared_nv:
+   case nir_intrinsic_global_atomic_nv:
+   case nir_intrinsic_load_global_nv:
+   case nir_intrinsic_load_scratch_nv:
+   case nir_intrinsic_load_shared_nv:
+   case nir_intrinsic_shared_atomic_nv:
+      return 1;
+   case nir_intrinsic_store_global_nv:
+   case nir_intrinsic_store_scratch_nv:
+   case nir_intrinsic_store_shared_nv:
+      return 2;
+   default:
+      return -1;
+   }
+}
+
+/**
+ * Return the uniform offset source for a load/store intrinsic.
+ */
+nir_src *
+nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr)
+{
+   const int idx = nir_get_io_uniform_offset_src_number(instr);
+   return idx >= 0 ? &instr->src[idx] : NULL;
+}
+
 /**
  * Return the index or handle source number for a load/store intrinsic or -1
  * if there's no index or handle.
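A small usage sketch for the new helpers (assumed caller code, in the spirit of the nir_opt_offsets and nir_validate changes below); the uniform source may be absent, so the NULL return has to be handled:

   nir_src *uniform_src = nir_get_io_uniform_offset_src(intr);
   bool uniform_is_zero = uniform_src == NULL ||
                          (nir_src_is_const(*uniform_src) &&
                           nir_src_as_int(*uniform_src) == 0);
   /* uniform_is_zero is what decides whether BASE keeps its unsigned range. */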
@@ -193,11 +193,12 @@ try_fold_load_store_nv(nir_builder *b,
 
    assert(offset_idx >= 0);
    nir_src src = intrin->src[offset_idx];
+   nir_src *uniform_src = nir_get_io_uniform_offset_src(intrin);
 
    int32_t min = 0;
    uint32_t max = BITFIELD_MASK(offset_bits);
 
-   if (!nir_src_is_const(src)) {
+   if (!nir_src_is_const(src) || (uniform_src && !nir_src_is_const(*uniform_src))) {
      max >>= 1;
      min = ~max;
    }
@@ -211,6 +212,11 @@ try_fold_load_store_nv(nir_builder *b,
      return false;
    }
 
+   /* We don't try to fold the offset for the uniform source on purpose,
+    * because we rely on running nir_opt_offsets before moving in the uniform
+    * source. However, we might run this pass again _after_ that, because we
+    * can eliminate a u2u64 on the _non uniform_ source and therefore might be
+    * able to fold in more constants into base. */
    return try_fold_load_store(b, intrin, state, offset_idx, min, max, false);
 }
 
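Worked out for a 24-bit window (an illustrative example, not from the patch): with both the offset and the uniform source constant, offsets up to 0xffffff can be folded into BASE; as soon as either one is non-constant, the window is halved to the signed range.

   uint32_t max = BITFIELD_MASK(24);   /* 0x00ffffff */
   int32_t min = 0;
   if (!offset_is_const || !uniform_offset_is_const) {   /* placeholder flags */
      max >>= 1;                       /* 0x007fffff */
      min = ~max;                      /* -0x00800000 */
   }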
@@ -761,9 +761,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
    case nir_intrinsic_vild_nv: {
      int base = nir_intrinsic_base(instr);
      nir_src src = *nir_get_io_offset_src(instr);
+     nir_src *uniform_src = nir_get_io_uniform_offset_src(instr);
      unsigned const_bits = nir_get_io_base_size_nv(instr);
 
-     if (nir_src_is_const(src) && nir_src_as_int(src) == 0) {
+     if (nir_src_is_const(src) && nir_src_as_int(src) == 0 &&
+         (!uniform_src || (nir_src_is_const(*uniform_src) && nir_src_as_int(*uniform_src) == 0))) {
        validate_assert(state, base >= 0 && base < BITFIELD_MASK(const_bits));
      } else {
        int32_t max = BITFIELD_MASK(const_bits - 1);
@@ -771,8 +773,14 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
        validate_assert(state, base >= min && base < max);
      }
 
+     if (uniform_src) {
+        validate_assert(state, uniform_src->ssa->bit_size >= src.ssa->bit_size);
+        if (state->impl->valid_metadata & nir_metadata_divergence)
+           validate_assert(state, !uniform_src->ssa->divergent);
+     }
+
      if (instr->intrinsic == nir_intrinsic_load_global_nv) {
-       validate_assert(state, instr->src[1].ssa->bit_size == 1);
+       validate_assert(state, instr->src[2].ssa->bit_size == 1);
      }
 
      break;
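The validator now also pins down what is allowed in the uniform slot. A sketch of the check a producing pass would have to satisfy before moving a value there (hypothetical helper, assuming divergence metadata is valid):

   static bool
   can_be_uniform_address(const nir_def *candidate, const nir_def *addr)
   {
      /* At least as wide as the non-uniform address, and uniform
       * (non-divergent) across the invocation group. */
      return candidate->bit_size >= addr->bit_size && !candidate->divergent;
   }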
@@ -2975,7 +2975,8 @@ impl<'a> ShaderFromNir<'a> {
             nir_intrinsic_global_atomic_nv => {
                 let bit_size = intrin.def.bit_size();
                 let addr = self.get_src(&srcs[0]);
-                let data = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[1]);
+                let data = self.get_src(&srcs[2]);
                 let atom_type = self.get_atomic_type(intrin);
                 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
 
@@ -2992,7 +2993,7 @@ impl<'a> ShaderFromNir<'a> {
                         dst.clone().into()
                     },
                     addr: addr,
-                    uniform_address: Src::ZERO,
+                    uniform_address: uaddr,
                     cmpr: 0.into(),
                     data: data,
                     atom_op: atom_op,
@@ -3220,13 +3221,14 @@ impl<'a> ShaderFromNir<'a> {
                         .get_eviction_priority(intrin.access()),
                 };
                 let addr = self.get_src(&srcs[0]);
-                let pred = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[1]);
+                let pred = self.get_src(&srcs[2]);
                 let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
 
                 b.push_op(OpLd {
                     dst: dst.clone().into(),
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     pred: pred,
                     offset: intrin.base(),
                     stride: OffsetStride::X1,
@@ -3333,12 +3335,13 @@ impl<'a> ShaderFromNir<'a> {
                     eviction_priority: MemEvictionPriority::Normal,
                 };
                 let addr = self.get_src(&srcs[0]);
+                let uaddr = self.get_src(&srcs[1]);
                 let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
 
                 b.push_op(OpLd {
                     dst: dst.clone().into(),
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     pred: true.into(),
                     offset: intrin.base(),
                     stride: OffsetStride::X1,
@@ -3357,12 +3360,14 @@ impl<'a> ShaderFromNir<'a> {
                     eviction_priority: MemEvictionPriority::Normal,
                 };
                 let addr = self.get_src(&srcs[0]);
+                let uaddr = self.get_src(&srcs[1]);
+
                 let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
 
                 b.push_op(OpLd {
                     dst: dst.clone().into(),
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     pred: true.into(),
                     offset: intrin.base(),
                     stride: intrin.offset_shift_nv().try_into().unwrap(),
@@ -3673,7 +3678,8 @@ impl<'a> ShaderFromNir<'a> {
             nir_intrinsic_shared_atomic_nv => {
                 let bit_size = intrin.def.bit_size();
                 let addr = self.get_src(&srcs[0]);
-                let data = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[1]);
+                let data = self.get_src(&srcs[2]);
                 let atom_type = self.get_atomic_type(intrin);
                 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
 
@@ -3683,7 +3689,7 @@ impl<'a> ShaderFromNir<'a> {
                 b.push_op(OpAtom {
                     dst: dst.clone().into(),
                     addr: addr,
-                    uniform_address: Src::ZERO,
+                    uniform_address: uaddr,
                     cmpr: 0.into(),
                     data: data,
                     atom_op: atom_op,
@@ -3740,10 +3746,11 @@ impl<'a> ShaderFromNir<'a> {
                         .get_eviction_priority(intrin.access()),
                 };
                 let addr = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[2]);
 
                 b.push_op(OpSt {
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     data: data,
                     offset: intrin.base(),
                     stride: OffsetStride::X1,
@@ -3772,10 +3779,11 @@ impl<'a> ShaderFromNir<'a> {
                     eviction_priority: MemEvictionPriority::Normal,
                 };
                 let addr = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[2]);
 
                 b.push_op(OpSt {
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     data: data,
                     offset: intrin.base(),
                     stride: OffsetStride::X1,
@@ -3794,10 +3802,11 @@ impl<'a> ShaderFromNir<'a> {
                     eviction_priority: MemEvictionPriority::Normal,
                 };
                 let addr = self.get_src(&srcs[1]);
+                let uaddr = self.get_src(&srcs[2]);
 
                 b.push_op(OpSt {
                     addr: addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     data: data,
                     offset: intrin.base(),
                     stride: intrin.offset_shift_nv().try_into().unwrap(),
@@ -3912,12 +3921,13 @@ impl<'a> ShaderFromNir<'a> {
                 };
                 let dst = b.alloc_ssa_vec(RegFile::GPR, comps);
                 let addr = self.get_src(&srcs[0]);
+                let uaddr = self.get_src(&srcs[1]);
                 b.push_op(OpLdsm {
                     dst: dst.clone().into(),
                     mat_size,
                     mat_count,
                     addr,
-                    uniform_addr: Src::ZERO,
+                    uniform_addr: uaddr,
                     offset: intrin.base(),
                 });
                 self.set_dst(&intrin.def, dst);
@@ -1019,8 +1019,23 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
            continue;
 
          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         nir_src *addr;
+
+         switch (intr->intrinsic) {
+         case nir_intrinsic_load_global_bounded:
+         case nir_intrinsic_load_global_constant_bounded: {
+            addr = &intr->src[0];
+            break;
+         }
+         default:
+            addr = nir_get_io_offset_src(intr);
+            break;
+         }
+         if (!addr)
+            continue;
+
          b.cursor = nir_before_instr(instr);
-         nir_src *addr = nir_get_io_offset_src(intr);
+         nir_def *uaddr = nir_imm_zero(&b, 1, addr->ssa->bit_size);
          nir_def *res = NULL;
          nir_intrinsic_instr *new = NULL;
 
@@ -1028,7 +1043,7 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
         case nir_intrinsic_load_global:
         case nir_intrinsic_load_global_constant: {
            nir_def *nir_true = nir_imm_bool(&b, true);
-           res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true);
+           res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr, nir_true);
            break;
         }
         case nir_intrinsic_load_global_bounded:
@@ -1044,32 +1059,32 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
            nir_def *addr = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa));
            nir_def *last_byte = nir_iadd_imm(&b, offset->ssa, load_size - 1);
            nir_def *cond = nir_ult(&b, last_byte, size->ssa);
-           res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, cond);
+           res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, uaddr, cond);
            break;
         }
         case nir_intrinsic_load_scratch:
-           res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
+           res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr);
            break;
         case nir_intrinsic_load_shared:
-           res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
+           res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr);
            break;
         case nir_intrinsic_store_global:
-           new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa);
+           new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa, uaddr);
            break;
         case nir_intrinsic_store_scratch:
-           new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa);
+           new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa, uaddr);
            break;
         case nir_intrinsic_store_shared:
-           new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa);
+           new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa, uaddr);
            break;
         case nir_intrinsic_global_atomic:
-           res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa);
+           res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa);
            break;
         case nir_intrinsic_global_atomic_swap:
            res = nir_global_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa);
            break;
         case nir_intrinsic_shared_atomic:
-           res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa);
+           res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa);
            break;
         case nir_intrinsic_shared_atomic_swap:
            res = nir_shared_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa);
@@ -723,6 +723,7 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr)
    nir_def *base = intr->src[1].ssa;
    offset = nir_u2uN(b, offset, base->bit_size);
    nir_def *addr = nir_iadd(b, base, offset);
+   nir_def *zero = nir_imm_zero(b, addr->num_components, addr->bit_size);
 
    /* flip the layout for B matrices */
    if (desc.use == GLSL_CMAT_USE_B) {
@@ -734,7 +735,7 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr)
 
    /* Each thread loads 32 bits per matrix */
    assert(length * bit_size == 32 * ldsm_count);
-   return nir_cmat_load_shared_nv(b, length, bit_size, addr,
+   return nir_cmat_load_shared_nv(b, length, bit_size, addr, zero,
                                   .num_matrices = ldsm_count,
                                   .matrix_layout = layout);
 }
@@ -56,10 +56,12 @@ lower_ldcx_to_global(nir_builder *b, nir_intrinsic_instr *load,
     * simple less-than check here.
     */
    nir_def *cond = nir_ilt(b, offset, size);
+   nir_def *zero_addr = nir_imm_zero(b, addr->num_components,
+                                     addr->bit_size);
    nir_def *val = nir_load_global_nv(b,
                                      load->def.num_components, load->def.bit_size,
                                      nir_iadd(b, addr, nir_u2u64(b, offset)),
-                                     cond,
+                                     zero_addr, cond,
                                      .align_mul = nir_intrinsic_align_mul(load),
                                      .align_offset = nir_intrinsic_align_offset(load),
                                      .access = ACCESS_CAN_REORDER,