From b982e71084d71a53005d82fb683542c99449abb8 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 13 May 2026 10:32:23 +0100 Subject: [PATCH] nir: add load_global_transpose_amd Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/amd/common/nir/ac_nir_opt_flip_if_for_mem_loads.c | 1 + src/compiler/nir/nir_divergence_analysis.c | 1 + src/compiler/nir/nir_intrinsics.py | 2 ++ src/compiler/nir/nir_lower_amul.c | 1 + src/compiler/nir/nir_lower_explicit_io.c | 7 +++++++ src/compiler/nir/nir_lower_io.c | 1 + src/compiler/nir/nir_lower_memory_model.c | 1 + src/compiler/nir/nir_opt_dead_cf.c | 1 + src/compiler/nir/nir_opt_load_store_vectorize.c | 5 ++++- src/compiler/nir/nir_opt_loop_unroll.c | 1 + src/compiler/nir/nir_validate.c | 8 ++++++++ 11 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/amd/common/nir/ac_nir_opt_flip_if_for_mem_loads.c b/src/amd/common/nir/ac_nir_opt_flip_if_for_mem_loads.c index 2b7753feb68..d3cb2d29ea6 100644 --- a/src/amd/common/nir/ac_nir_opt_flip_if_for_mem_loads.c +++ b/src/amd/common/nir/ac_nir_opt_flip_if_for_mem_loads.c @@ -91,6 +91,7 @@ is_vmem_or_lds_load(nir_def *def, unsigned depth, unsigned begin, unsigned end) case nir_intrinsic_load_global: case nir_intrinsic_load_global_constant: case nir_intrinsic_load_global_amd: + case nir_intrinsic_load_global_transpose_amd: case nir_intrinsic_load_scratch: case nir_intrinsic_load_shared: case nir_intrinsic_load_shared2_amd: diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 4c2f10a3b12..0a0bede25f2 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -1075,6 +1075,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_zs_emit_pan: case nir_intrinsic_load_return_param_amd: case nir_intrinsic_load_local_invocation_index_intel: + case nir_intrinsic_load_global_transpose_amd: case nir_intrinsic_load_deref_transpose_amd: is_divergent = true; break; diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 56313301719..ea587db9454 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2308,6 +2308,8 @@ intrinsic("cmat_muladd_amd", src_comp=[-1, -1, 0], dest_comp=0, bit_sizes=src2, # src[] = { address }. intrinsic("load_deref_transpose_amd", bit_sizes=[8, 16], dest_comp=0, src_comp=[1], indices=[ACCESS], flags=SUBGROUP_FLAGS) +intrinsic("load_global_transpose_amd", bit_sizes=[8, 16], dest_comp=0, src_comp=[1], + indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=SUBGROUP_FLAGS) # Get the debug log buffer descriptor. intrinsic("load_debug_log_desc_amd", bit_sizes=[32], dest_comp=4, flags=[CAN_ELIMINATE, CAN_REORDER]) diff --git a/src/compiler/nir/nir_lower_amul.c b/src/compiler/nir/nir_lower_amul.c index ff38ac1a536..d707cee8e13 100644 --- a/src/compiler/nir/nir_lower_amul.c +++ b/src/compiler/nir/nir_lower_amul.c @@ -153,6 +153,7 @@ lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr) case nir_intrinsic_global_atomic_swap: case nir_intrinsic_load_global_constant: case nir_intrinsic_load_global: + case nir_intrinsic_load_global_transpose_amd: case nir_intrinsic_load_pixel_local: /* just assume that 24b is not sufficient: */ lower_large_src(&intr->src[0], state); diff --git a/src/compiler/nir/nir_lower_explicit_io.c b/src/compiler/nir/nir_lower_explicit_io.c index 81966dac8ee..8e39751567e 100644 --- a/src/compiler/nir/nir_lower_explicit_io.c +++ b/src/compiler/nir/nir_lower_explicit_io.c @@ -726,6 +726,13 @@ build_explicit_io_load(nir_builder *b, nir_intrinsic_instr *intrin, } break; + case nir_intrinsic_load_deref_transpose_amd: + if (mode != nir_var_mem_global) + UNREACHABLE("Unsupported explicit IO variable mode"); + assert(addr_format == nir_address_format_64bit_global); + op = nir_intrinsic_load_global_transpose_amd; + break; + default: UNREACHABLE("Invalid intrinsic"); } diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 286db823baf..acd5642dac7 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -991,6 +991,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr) case nir_intrinsic_load_global_constant: case nir_intrinsic_load_global_etna: case nir_intrinsic_load_global_nv: + case nir_intrinsic_load_global_transpose_amd: case nir_intrinsic_load_scratch: case nir_intrinsic_load_scratch_nv: case nir_intrinsic_load_scratch_intel: diff --git a/src/compiler/nir/nir_lower_memory_model.c b/src/compiler/nir/nir_lower_memory_model.c index 4851aaa940a..91d45fa4bbb 100644 --- a/src/compiler/nir/nir_lower_memory_model.c +++ b/src/compiler/nir/nir_lower_memory_model.c @@ -65,6 +65,7 @@ get_intrinsic_info(nir_intrinsic_instr *intrin, nir_variable_mode *modes, *writes = true; break; case nir_intrinsic_load_global: + case nir_intrinsic_load_global_transpose_amd: *modes = nir_var_mem_global; *reads = true; break; diff --git a/src/compiler/nir/nir_opt_dead_cf.c b/src/compiler/nir/nir_opt_dead_cf.c index daefd2e8448..7d17157ced6 100644 --- a/src/compiler/nir/nir_opt_dead_cf.c +++ b/src/compiler/nir/nir_opt_dead_cf.c @@ -221,6 +221,7 @@ node_is_dead(nir_cf_node *node) case nir_intrinsic_load_global: case nir_intrinsic_load_global_bounded: case nir_intrinsic_load_global_nv: + case nir_intrinsic_load_global_transpose_amd: case nir_intrinsic_load_ssbo_intel: case nir_intrinsic_load_ssbo_ir3: /* If there's a memory barrier after the loop, a load might be diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c index 9a2ccb3c376..fc416c49bda 100644 --- a/src/compiler/nir/nir_opt_load_store_vectorize.c +++ b/src/compiler/nir/nir_opt_load_store_vectorize.c @@ -96,6 +96,7 @@ get_info(nir_intrinsic_op op) INFO(nir_var_mem_shared, load_shared2_amd, true, -1, 0, -1, -1, 1); INFO(nir_var_mem_shared, store_shared2_amd, true, -1, 1, -1, 0, 1) LOAD(nir_var_mem_global, global, -1, 0, -1, 1) + INFO(nir_var_mem_global, load_global_transpose_amd, true, -1, 0, -1, -1, 1) STORE(nir_var_mem_global, global, -1, 1, -1, 0, 1) LOAD(nir_var_mem_global, global_constant, -1, 0, -1, 1) LOAD(nir_var_mem_task_payload, task_payload, -1, 0, -1, 1) @@ -1253,8 +1254,10 @@ may_alias_internal(struct entry *a, struct entry *b, uint32_t a_offset, uint32_t int64_t diff = get_offset_diff(a, b) + b_offset - a_offset; struct entry *first = diff < 0 ? b : a; - if (first->intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd) + if (first->intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd || + first->intrin->intrinsic == nir_intrinsic_load_global_transpose_amd) { return true; + } uint64_t size = get_bit_size(first) / 8u * first->num_components; return llabs(diff) < size; diff --git a/src/compiler/nir/nir_opt_loop_unroll.c b/src/compiler/nir/nir_opt_loop_unroll.c index eaecfc61678..4bf9d7abd19 100644 --- a/src/compiler/nir/nir_opt_loop_unroll.c +++ b/src/compiler/nir/nir_opt_loop_unroll.c @@ -831,6 +831,7 @@ is_indirect_load(nir_instr *instr) } if (intrin->intrinsic == nir_intrinsic_load_global || + intrin->intrinsic == nir_intrinsic_load_global_transpose_amd || intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd) return true; diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index d2a55cef86b..a60c224e18a 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -757,6 +757,14 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) break; } + case nir_intrinsic_load_global_transpose_amd: { + unsigned disallow_access = ACCESS_ATOMIC | ACCESS_SKIP_HELPERS | ACCESS_SMEM_AMD; + validate_assert(state, !(nir_intrinsic_access(instr) & disallow_access)); + validate_assert(state, instr->num_components == 8 || instr->num_components == 4); + src_bit_sizes[0] = 64; + break; + } + case nir_intrinsic_global_atomic_nv: case nir_intrinsic_global_atomic_swap_nv: case nir_intrinsic_shared_atomic_nv: