From 57498eca832645e5eb7bb1bbec47bf371865d02e Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 7 May 2026 16:24:23 +0100 Subject: [PATCH] nir: add load_deref_transpose_amd Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/compiler/nir/nir.c | 6 ++++++ src/compiler/nir/nir_deref.c | 3 +++ src/compiler/nir/nir_divergence_analysis.c | 1 + src/compiler/nir/nir_intrinsics.py | 10 ++++++++++ src/compiler/nir/nir_lower_explicit_io.c | 4 +++- src/compiler/nir/nir_lower_memory_model.c | 1 + src/compiler/nir/nir_opt_access.c | 6 +++++- src/compiler/nir/nir_opt_barriers.c | 1 + src/compiler/nir/nir_opt_combine_stores.c | 3 ++- src/compiler/nir/nir_opt_dead_cf.c | 4 +++- src/compiler/nir/nir_opt_dead_write_vars.c | 6 ++++++ src/compiler/nir/nir_opt_load_store_vectorize.c | 8 ++++++-- src/compiler/nir/nir_opt_loop_unroll.c | 3 ++- src/compiler/nir/nir_propagate_invariant.c | 1 + src/compiler/nir/nir_validate.c | 12 ++++++++++++ 15 files changed, 62 insertions(+), 7 deletions(-) diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c index d15a3b58b87..2c3d401a89c 100644 --- a/src/compiler/nir/nir.c +++ b/src/compiler/nir/nir.c @@ -2914,6 +2914,12 @@ nir_image_intrinsic_coord_components(const nir_intrinsic_instr *instr) bool nir_intrinsic_can_reorder(nir_intrinsic_instr *instr) { + /* Subgroup operations can't be reordered because they might then read inactive + * invocations. load_global_transpose_amd is an example of one which also has ACCESS. + */ + if (nir_intrinsic_has_semantic(instr, NIR_INTRINSIC_SUBGROUP | NIR_INTRINSIC_QUADGROUP)) + return false; + if (nir_intrinsic_has_access(instr)) { enum gl_access_qualifier access = nir_intrinsic_access(instr); if (access & ACCESS_VOLATILE) diff --git a/src/compiler/nir/nir_deref.c b/src/compiler/nir/nir_deref.c index 4750b992997..860a0f84822 100644 --- a/src/compiler/nir/nir_deref.c +++ b/src/compiler/nir/nir_deref.c @@ -198,6 +198,7 @@ nir_deref_instr_has_complex_use(nir_deref_instr *deref, nir_intrinsic_instr *use_intrin = nir_instr_as_intrinsic(use_instr); switch (use_intrin->intrinsic) { case nir_intrinsic_load_deref: + case nir_intrinsic_load_deref_transpose_amd: assert(use_src == &use_intrin->src[0]); continue; @@ -1535,6 +1536,8 @@ nir_opt_deref_impl(nir_function_impl *impl) case nir_intrinsic_load_deref: if (opt_load_vec_deref(&b, intrin)) progress = true; + FALLTHROUGH; + case nir_intrinsic_load_deref_transpose_amd: if (opt_load_undef_deref(&b, intrin)) progress = true; break; diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index c386b901e5a..4c2f10a3b12 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -1075,6 +1075,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_zs_emit_pan: case nir_intrinsic_load_return_param_amd: case nir_intrinsic_load_local_invocation_index_intel: + case nir_intrinsic_load_deref_transpose_amd: is_divergent = true; break; diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 08b4fb059d7..56313301719 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2299,6 +2299,16 @@ intrinsic("strict_wqm_coord_amd", src_comp=[0], dest_comp=0, bit_sizes=[32], ind intrinsic("cmat_muladd_amd", src_comp=[-1, -1, 0], dest_comp=0, bit_sizes=src2, indices=[SATURATE, NEG_LO_AMD, NEG_HI_AMD, SRC_BASE_TYPE, SRC_BASE_TYPE2], flags=SUBGROUP_FLAGS) +# Global cooperative matrix load with combined cooperative matrix transpose. +# This corresponds to RDNA4's global_load_tr_b{64,128}. Like typical cooperative matrix operations, +# this has to be in subgroup uniform control flow with all invocations active. +# The definition's component size may be 8-bit or 16-bit and matches the type of matrix to load. +# The result has 8 components (wave32) or 4 components (wave64). The address is ignored for lanes +# 32-63, and the actual address that's loaded from is probably offset from the values in lanes 0-31. +# src[] = { address }. +intrinsic("load_deref_transpose_amd", bit_sizes=[8, 16], dest_comp=0, src_comp=[1], + indices=[ACCESS], flags=SUBGROUP_FLAGS) + # Get the debug log buffer descriptor. intrinsic("load_debug_log_desc_amd", bit_sizes=[32], dest_comp=4, flags=[CAN_ELIMINATE, CAN_REORDER]) diff --git a/src/compiler/nir/nir_lower_explicit_io.c b/src/compiler/nir/nir_lower_explicit_io.c index 642ae6c16ac..81966dac8ee 100644 --- a/src/compiler/nir/nir_lower_explicit_io.c +++ b/src/compiler/nir/nir_lower_explicit_io.c @@ -1482,7 +1482,8 @@ nir_lower_explicit_io_instr(nir_builder *b, break; } - case nir_intrinsic_load_deref_block_intel: { + case nir_intrinsic_load_deref_block_intel: + case nir_intrinsic_load_deref_transpose_amd: { nir_io_offset addr = build_addr(b, intrin, base_addr, addr_format, 0, align_mul, align_offset); nir_def *value = build_explicit_io_load(b, intrin, addr, addr_format, @@ -1760,6 +1761,7 @@ nir_lower_explicit_io_impl(nir_function_impl *impl, nir_variable_mode modes, case nir_intrinsic_store_deref: case nir_intrinsic_load_deref_block_intel: case nir_intrinsic_store_deref_block_intel: + case nir_intrinsic_load_deref_transpose_amd: case nir_intrinsic_deref_atomic: case nir_intrinsic_deref_atomic_swap: { nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); diff --git a/src/compiler/nir/nir_lower_memory_model.c b/src/compiler/nir/nir_lower_memory_model.c index 4ddece7f67b..4851aaa940a 100644 --- a/src/compiler/nir/nir_lower_memory_model.c +++ b/src/compiler/nir/nir_lower_memory_model.c @@ -79,6 +79,7 @@ get_intrinsic_info(nir_intrinsic_instr *intrin, nir_variable_mode *modes, *writes = true; break; case nir_intrinsic_load_deref: + case nir_intrinsic_load_deref_transpose_amd: *modes = nir_src_as_deref(intrin->src[0])->modes; *reads = true; break; diff --git a/src/compiler/nir/nir_opt_access.c b/src/compiler/nir/nir_opt_access.c index 1c08d902070..2228e653447 100644 --- a/src/compiler/nir/nir_opt_access.c +++ b/src/compiler/nir/nir_opt_access.c @@ -146,6 +146,7 @@ gather_intrinsic(struct access_state *state, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_deref: + case nir_intrinsic_load_deref_transpose_amd: case nir_intrinsic_store_deref: case nir_intrinsic_deref_atomic: case nir_intrinsic_deref_atomic_swap: { @@ -154,9 +155,11 @@ gather_intrinsic(struct access_state *state, nir_intrinsic_instr *instr) break; bool ssbo = nir_deref_mode_is(deref, nir_var_mem_ssbo); + bool is_write = instr->intrinsic != nir_intrinsic_load_deref && + instr->intrinsic != nir_intrinsic_load_deref_transpose_amd; gather_buffer_access(state, ssbo ? instr->src[0].ssa : NULL, instr->intrinsic != nir_intrinsic_store_deref, - instr->intrinsic != nir_intrinsic_load_deref); + is_write); break; } @@ -296,6 +299,7 @@ process_intrinsic(struct access_state *state, nir_intrinsic_instr *instr) false); case nir_intrinsic_load_deref: + case nir_intrinsic_load_deref_transpose_amd: case nir_intrinsic_store_deref: { if (nir_deref_mode_is(nir_src_as_deref(instr->src[0]), nir_var_mem_global)) return update_access(state, instr, nir_var_mem_global, false, true); diff --git a/src/compiler/nir/nir_opt_barriers.c b/src/compiler/nir/nir_opt_barriers.c index 812c731809d..7f7fc527dc1 100644 --- a/src/compiler/nir/nir_opt_barriers.c +++ b/src/compiler/nir/nir_opt_barriers.c @@ -212,6 +212,7 @@ nir_opt_acquire_release_barriers_impl(nir_function_impl *impl, switch (intrin->intrinsic) { case nir_intrinsic_load_deref: case nir_intrinsic_load_deref_block_intel: + case nir_intrinsic_load_deref_transpose_amd: case nir_intrinsic_store_deref: case nir_intrinsic_store_deref_block_intel: if (last_atomic) { diff --git a/src/compiler/nir/nir_opt_combine_stores.c b/src/compiler/nir/nir_opt_combine_stores.c index e4048ff7a72..d69141e50b4 100644 --- a/src/compiler/nir/nir_opt_combine_stores.c +++ b/src/compiler/nir/nir_opt_combine_stores.c @@ -344,7 +344,8 @@ combine_stores_block(struct combine_stores_state *state, nir_block *block) } case nir_intrinsic_load_deref_block_intel: - case nir_intrinsic_store_deref_block_intel: { + case nir_intrinsic_store_deref_block_intel: + case nir_intrinsic_load_deref_transpose_amd: { /* Combine all the stores that may alias with the whole variable (or * cast). */ diff --git a/src/compiler/nir/nir_opt_dead_cf.c b/src/compiler/nir/nir_opt_dead_cf.c index 3fe0f524c34..daefd2e8448 100644 --- a/src/compiler/nir/nir_opt_dead_cf.c +++ b/src/compiler/nir/nir_opt_dead_cf.c @@ -216,6 +216,7 @@ node_is_dead(nir_cf_node *node) switch (intrin->intrinsic) { case nir_intrinsic_load_deref: + case nir_intrinsic_load_deref_transpose_amd: case nir_intrinsic_load_ssbo: case nir_intrinsic_load_global: case nir_intrinsic_load_global_bounded: @@ -230,7 +231,8 @@ node_is_dead(nir_cf_node *node) * Consider only loads that the result can be affected by other * invocations. */ - if (intrin->intrinsic == nir_intrinsic_load_deref) { + if (intrin->intrinsic == nir_intrinsic_load_deref || + intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd) { nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); if (!nir_deref_mode_may_be(deref, nir_var_mem_ssbo | nir_var_mem_shared | diff --git a/src/compiler/nir/nir_opt_dead_write_vars.c b/src/compiler/nir/nir_opt_dead_write_vars.c index efb948101b3..4249b6f5319 100644 --- a/src/compiler/nir/nir_opt_dead_write_vars.c +++ b/src/compiler/nir/nir_opt_dead_write_vars.c @@ -195,6 +195,12 @@ remove_dead_write_vars_local(nir_shader *shader, nir_block *block, break; } + case nir_intrinsic_load_deref_transpose_amd: { + nir_deref_instr *src = nir_src_as_deref(intrin->src[0]); + clear_unused_for_modes(unused_writes, src->modes); + break; + } + case nir_intrinsic_store_deref: { nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]); diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c index c0e4c9e4ff4..9a2ccb3c376 100644 --- a/src/compiler/nir/nir_opt_load_store_vectorize.c +++ b/src/compiler/nir/nir_opt_load_store_vectorize.c @@ -89,6 +89,7 @@ get_info(nir_intrinsic_op op) LOAD(nir_var_mem_ssbo, ssbo, 0, 1, -1, 1) STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0, 1) LOAD(0, deref, -1, -1, 0, 1) + INFO(0, load_deref_transpose_amd, true, -1, -1, 0, -1, 1) STORE(0, deref, -1, -1, 0, 1, 1) LOAD(nir_var_mem_shared, shared, -1, 0, -1, 1) STORE(nir_var_mem_shared, shared, -1, 1, -1, 0, 1) @@ -1241,7 +1242,7 @@ bindings_different_restrict(nir_shader *shader, struct entry *a, struct entry *b ((a_access | b_access) & ACCESS_RESTRICT); } -static int64_t +static bool may_alias_internal(struct entry *a, struct entry *b, uint32_t a_offset, uint32_t b_offset) { /* use adjacency information */ @@ -1252,7 +1253,10 @@ may_alias_internal(struct entry *a, struct entry *b, uint32_t a_offset, uint32_t int64_t diff = get_offset_diff(a, b) + b_offset - a_offset; struct entry *first = diff < 0 ? b : a; - unsigned size = get_bit_size(first) / 8u * first->num_components; + if (first->intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd) + return true; + + uint64_t size = get_bit_size(first) / 8u * first->num_components; return llabs(diff) < size; } diff --git a/src/compiler/nir/nir_opt_loop_unroll.c b/src/compiler/nir/nir_opt_loop_unroll.c index df3a2767f6f..eaecfc61678 100644 --- a/src/compiler/nir/nir_opt_loop_unroll.c +++ b/src/compiler/nir/nir_opt_loop_unroll.c @@ -830,7 +830,8 @@ is_indirect_load(nir_instr *instr) return true; } - if (intrin->intrinsic == nir_intrinsic_load_global) + if (intrin->intrinsic == nir_intrinsic_load_global || + intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd) return true; if (intrin->intrinsic == nir_intrinsic_load_deref || diff --git a/src/compiler/nir/nir_propagate_invariant.c b/src/compiler/nir/nir_propagate_invariant.c index 7d914b1578e..b9f02ada708 100644 --- a/src/compiler/nir/nir_propagate_invariant.c +++ b/src/compiler/nir/nir_propagate_invariant.c @@ -178,6 +178,7 @@ propagate_invariant_instr(nir_instr *instr, struct set *invariants, uint8_t *var break; case nir_intrinsic_load_deref: + case nir_intrinsic_load_deref_transpose_amd: if (def_is_invariant(&intrin->def, invariants)) add_var(nir_src_as_deref(intrin->src[0]), invariants, var_invariant); break; diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index 2717e1a005f..d2a55cef86b 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -745,6 +745,18 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) } break; + case nir_intrinsic_load_deref_transpose_amd: { + nir_deref_instr *src = nir_src_as_deref(instr->src[0]); + assert(src); + unsigned disallow_access = ACCESS_ATOMIC | ACCESS_SKIP_HELPERS | ACCESS_SMEM_AMD; + validate_assert(state, !(nir_intrinsic_access(instr) & disallow_access)); + validate_assert(state, glsl_type_is_scalar(src->type)); + validate_assert(state, instr->num_components == 8 || instr->num_components == 4); + dest_bit_size = glsl_get_bit_size(src->type); + src_bit_sizes[0] = 64; + break; + } + case nir_intrinsic_global_atomic_nv: case nir_intrinsic_global_atomic_swap_nv: case nir_intrinsic_shared_atomic_nv: