nir: add load_deref_transpose_amd

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41653>
This commit is contained in:
Rhys Perry 2026-05-07 16:24:23 +01:00 committed by Marge Bot
parent 6229e89fa8
commit 57498eca83
15 changed files with 62 additions and 7 deletions

View file

@ -2914,6 +2914,12 @@ nir_image_intrinsic_coord_components(const nir_intrinsic_instr *instr)
bool
nir_intrinsic_can_reorder(nir_intrinsic_instr *instr)
{
/* Subgroup operations can't be reordered because they might then read inactive
* invocations. load_global_transpose_amd is an example of one which also has ACCESS.
*/
if (nir_intrinsic_has_semantic(instr, NIR_INTRINSIC_SUBGROUP | NIR_INTRINSIC_QUADGROUP))
return false;
if (nir_intrinsic_has_access(instr)) {
enum gl_access_qualifier access = nir_intrinsic_access(instr);
if (access & ACCESS_VOLATILE)

View file

@ -198,6 +198,7 @@ nir_deref_instr_has_complex_use(nir_deref_instr *deref,
nir_intrinsic_instr *use_intrin = nir_instr_as_intrinsic(use_instr);
switch (use_intrin->intrinsic) {
case nir_intrinsic_load_deref:
case nir_intrinsic_load_deref_transpose_amd:
assert(use_src == &use_intrin->src[0]);
continue;
@ -1535,6 +1536,8 @@ nir_opt_deref_impl(nir_function_impl *impl)
case nir_intrinsic_load_deref:
if (opt_load_vec_deref(&b, intrin))
progress = true;
FALLTHROUGH;
case nir_intrinsic_load_deref_transpose_amd:
if (opt_load_undef_deref(&b, intrin))
progress = true;
break;

View file

@ -1075,6 +1075,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_zs_emit_pan:
case nir_intrinsic_load_return_param_amd:
case nir_intrinsic_load_local_invocation_index_intel:
case nir_intrinsic_load_deref_transpose_amd:
is_divergent = true;
break;

View file

@ -2299,6 +2299,16 @@ intrinsic("strict_wqm_coord_amd", src_comp=[0], dest_comp=0, bit_sizes=[32], ind
intrinsic("cmat_muladd_amd", src_comp=[-1, -1, 0], dest_comp=0, bit_sizes=src2,
indices=[SATURATE, NEG_LO_AMD, NEG_HI_AMD, SRC_BASE_TYPE, SRC_BASE_TYPE2], flags=SUBGROUP_FLAGS)
# Global cooperative matrix load with combined cooperative matrix transpose.
# This corresponds to RDNA4's global_load_tr_b{64,128}. Like typical cooperative matrix operations,
# this has to be in subgroup uniform control flow with all invocations active.
# The definition's component size may be 8-bit or 16-bit and matches the type of matrix to load.
# The result has 8 components (wave32) or 4 components (wave64). The address is ignored for lanes
# 32-63, and the actual address that's loaded from is probably offset from the values in lanes 0-31.
# src[] = { deref }.
intrinsic("load_deref_transpose_amd", bit_sizes=[8, 16], dest_comp=0, src_comp=[1],
indices=[ACCESS], flags=SUBGROUP_FLAGS)
# Get the debug log buffer descriptor.
intrinsic("load_debug_log_desc_amd", bit_sizes=[32], dest_comp=4, flags=[CAN_ELIMINATE, CAN_REORDER])

View file

@ -1482,7 +1482,8 @@ nir_lower_explicit_io_instr(nir_builder *b,
break;
}
case nir_intrinsic_load_deref_block_intel: {
case nir_intrinsic_load_deref_block_intel:
case nir_intrinsic_load_deref_transpose_amd: {
nir_io_offset addr = build_addr(b, intrin, base_addr, addr_format, 0,
align_mul, align_offset);
nir_def *value = build_explicit_io_load(b, intrin, addr, addr_format,
@ -1760,6 +1761,7 @@ nir_lower_explicit_io_impl(nir_function_impl *impl, nir_variable_mode modes,
case nir_intrinsic_store_deref:
case nir_intrinsic_load_deref_block_intel:
case nir_intrinsic_store_deref_block_intel:
case nir_intrinsic_load_deref_transpose_amd:
case nir_intrinsic_deref_atomic:
case nir_intrinsic_deref_atomic_swap: {
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);

View file

@ -79,6 +79,7 @@ get_intrinsic_info(nir_intrinsic_instr *intrin, nir_variable_mode *modes,
*writes = true;
break;
case nir_intrinsic_load_deref:
case nir_intrinsic_load_deref_transpose_amd:
*modes = nir_src_as_deref(intrin->src[0])->modes;
*reads = true;
break;

View file

@ -146,6 +146,7 @@ gather_intrinsic(struct access_state *state, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_deref:
case nir_intrinsic_load_deref_transpose_amd:
case nir_intrinsic_store_deref:
case nir_intrinsic_deref_atomic:
case nir_intrinsic_deref_atomic_swap: {
@ -154,9 +155,11 @@ gather_intrinsic(struct access_state *state, nir_intrinsic_instr *instr)
break;
bool ssbo = nir_deref_mode_is(deref, nir_var_mem_ssbo);
bool is_write = instr->intrinsic != nir_intrinsic_load_deref &&
instr->intrinsic != nir_intrinsic_load_deref_transpose_amd;
gather_buffer_access(state, ssbo ? instr->src[0].ssa : NULL,
instr->intrinsic != nir_intrinsic_store_deref,
instr->intrinsic != nir_intrinsic_load_deref);
is_write);
break;
}
@ -296,6 +299,7 @@ process_intrinsic(struct access_state *state, nir_intrinsic_instr *instr)
false);
case nir_intrinsic_load_deref:
case nir_intrinsic_load_deref_transpose_amd:
case nir_intrinsic_store_deref: {
if (nir_deref_mode_is(nir_src_as_deref(instr->src[0]), nir_var_mem_global))
return update_access(state, instr, nir_var_mem_global, false, true);

View file

@ -212,6 +212,7 @@ nir_opt_acquire_release_barriers_impl(nir_function_impl *impl,
switch (intrin->intrinsic) {
case nir_intrinsic_load_deref:
case nir_intrinsic_load_deref_block_intel:
case nir_intrinsic_load_deref_transpose_amd:
case nir_intrinsic_store_deref:
case nir_intrinsic_store_deref_block_intel:
if (last_atomic) {

View file

@ -344,7 +344,8 @@ combine_stores_block(struct combine_stores_state *state, nir_block *block)
}
case nir_intrinsic_load_deref_block_intel:
case nir_intrinsic_store_deref_block_intel: {
case nir_intrinsic_store_deref_block_intel:
case nir_intrinsic_load_deref_transpose_amd: {
/* Combine all the stores that may alias with the whole variable (or
* cast).
*/

View file

@ -216,6 +216,7 @@ node_is_dead(nir_cf_node *node)
switch (intrin->intrinsic) {
case nir_intrinsic_load_deref:
case nir_intrinsic_load_deref_transpose_amd:
case nir_intrinsic_load_ssbo:
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_bounded:
@ -230,7 +231,8 @@ node_is_dead(nir_cf_node *node)
* Consider only loads that the result can be affected by other
* invocations.
*/
if (intrin->intrinsic == nir_intrinsic_load_deref) {
if (intrin->intrinsic == nir_intrinsic_load_deref ||
intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd) {
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
if (!nir_deref_mode_may_be(deref, nir_var_mem_ssbo |
nir_var_mem_shared |

View file

@ -195,6 +195,12 @@ remove_dead_write_vars_local(nir_shader *shader, nir_block *block,
break;
}
case nir_intrinsic_load_deref_transpose_amd: {
nir_deref_instr *src = nir_src_as_deref(intrin->src[0]);
clear_unused_for_modes(unused_writes, src->modes);
break;
}
case nir_intrinsic_store_deref: {
nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);

View file

@ -89,6 +89,7 @@ get_info(nir_intrinsic_op op)
LOAD(nir_var_mem_ssbo, ssbo, 0, 1, -1, 1)
STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0, 1)
LOAD(0, deref, -1, -1, 0, 1)
INFO(0, load_deref_transpose_amd, true, -1, -1, 0, -1, 1)
STORE(0, deref, -1, -1, 0, 1, 1)
LOAD(nir_var_mem_shared, shared, -1, 0, -1, 1)
STORE(nir_var_mem_shared, shared, -1, 1, -1, 0, 1)
@ -1241,7 +1242,7 @@ bindings_different_restrict(nir_shader *shader, struct entry *a, struct entry *b
((a_access | b_access) & ACCESS_RESTRICT);
}
static int64_t
static bool
may_alias_internal(struct entry *a, struct entry *b, uint32_t a_offset, uint32_t b_offset)
{
/* use adjacency information */
@ -1252,7 +1253,10 @@ may_alias_internal(struct entry *a, struct entry *b, uint32_t a_offset, uint32_t
int64_t diff = get_offset_diff(a, b) + b_offset - a_offset;
struct entry *first = diff < 0 ? b : a;
unsigned size = get_bit_size(first) / 8u * first->num_components;
if (first->intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd)
return true;
uint64_t size = get_bit_size(first) / 8u * first->num_components;
return llabs(diff) < size;
}

View file

@ -830,7 +830,8 @@ is_indirect_load(nir_instr *instr)
return true;
}
if (intrin->intrinsic == nir_intrinsic_load_global)
if (intrin->intrinsic == nir_intrinsic_load_global ||
intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd)
return true;
if (intrin->intrinsic == nir_intrinsic_load_deref ||

View file

@ -178,6 +178,7 @@ propagate_invariant_instr(nir_instr *instr, struct set *invariants, uint8_t *var
break;
case nir_intrinsic_load_deref:
case nir_intrinsic_load_deref_transpose_amd:
if (def_is_invariant(&intrin->def, invariants))
add_var(nir_src_as_deref(intrin->src[0]), invariants, var_invariant);
break;

View file

@ -745,6 +745,18 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
}
break;
case nir_intrinsic_load_deref_transpose_amd: {
nir_deref_instr *src = nir_src_as_deref(instr->src[0]);
assert(src);
unsigned disallow_access = ACCESS_ATOMIC | ACCESS_SKIP_HELPERS | ACCESS_SMEM_AMD;
validate_assert(state, !(nir_intrinsic_access(instr) & disallow_access));
validate_assert(state, glsl_type_is_scalar(src->type));
validate_assert(state, instr->num_components == 8 || instr->num_components == 4);
dest_bit_size = glsl_get_bit_size(src->type);
src_bit_sizes[0] = 64;
break;
}
case nir_intrinsic_global_atomic_nv:
case nir_intrinsic_global_atomic_swap_nv:
case nir_intrinsic_shared_atomic_nv: