nir: add load_global_transpose_amd

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41653>
This commit is contained in:
Rhys Perry 2026-05-13 10:32:23 +01:00 committed by Marge Bot
parent 57498eca83
commit b982e71084
11 changed files with 28 additions and 1 deletions

View file

@ -91,6 +91,7 @@ is_vmem_or_lds_load(nir_def *def, unsigned depth, unsigned begin, unsigned end)
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
case nir_intrinsic_load_global_amd:
case nir_intrinsic_load_global_transpose_amd:
case nir_intrinsic_load_scratch:
case nir_intrinsic_load_shared:
case nir_intrinsic_load_shared2_amd:

View file

@ -1075,6 +1075,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_zs_emit_pan:
case nir_intrinsic_load_return_param_amd:
case nir_intrinsic_load_local_invocation_index_intel:
case nir_intrinsic_load_global_transpose_amd:
case nir_intrinsic_load_deref_transpose_amd:
is_divergent = true;
break;

View file

@ -2308,6 +2308,8 @@ intrinsic("cmat_muladd_amd", src_comp=[-1, -1, 0], dest_comp=0, bit_sizes=src2,
# src[] = { address }.
intrinsic("load_deref_transpose_amd", bit_sizes=[8, 16], dest_comp=0, src_comp=[1],
indices=[ACCESS], flags=SUBGROUP_FLAGS)
# Counterpart of load_deref_transpose_amd that additionally carries alignment
# information (ALIGN_MUL / ALIGN_OFFSET) alongside the ACCESS flags.
# src[] = { address }.
intrinsic("load_global_transpose_amd", bit_sizes=[8, 16], dest_comp=0, src_comp=[1],
indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=SUBGROUP_FLAGS)
# Get the debug log buffer descriptor.
intrinsic("load_debug_log_desc_amd", bit_sizes=[32], dest_comp=4, flags=[CAN_ELIMINATE, CAN_REORDER])

View file

@ -153,6 +153,7 @@ lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr)
case nir_intrinsic_global_atomic_swap:
case nir_intrinsic_load_global_constant:
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_transpose_amd:
case nir_intrinsic_load_pixel_local:
/* just assume that 24b is not sufficient: */
lower_large_src(&intr->src[0], state);

View file

@ -726,6 +726,13 @@ build_explicit_io_load(nir_builder *b, nir_intrinsic_instr *intrin,
}
break;
case nir_intrinsic_load_deref_transpose_amd:
/* Transposed deref loads are only handled for global memory; any other
 * variable mode is treated as unreachable.
 */
if (mode != nir_var_mem_global)
UNREACHABLE("Unsupported explicit IO variable mode");
/* The lowered intrinsic is only emitted for 64-bit global addresses. */
assert(addr_format == nir_address_format_64bit_global);
op = nir_intrinsic_load_global_transpose_amd;
break;
default:
UNREACHABLE("Invalid intrinsic");
}

View file

@ -991,6 +991,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
case nir_intrinsic_load_global_constant:
case nir_intrinsic_load_global_etna:
case nir_intrinsic_load_global_nv:
case nir_intrinsic_load_global_transpose_amd:
case nir_intrinsic_load_scratch:
case nir_intrinsic_load_scratch_nv:
case nir_intrinsic_load_scratch_intel:

View file

@ -65,6 +65,7 @@ get_intrinsic_info(nir_intrinsic_instr *intrin, nir_variable_mode *modes,
*writes = true;
break;
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_transpose_amd:
*modes = nir_var_mem_global;
*reads = true;
break;

View file

@ -221,6 +221,7 @@ node_is_dead(nir_cf_node *node)
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_bounded:
case nir_intrinsic_load_global_nv:
case nir_intrinsic_load_global_transpose_amd:
case nir_intrinsic_load_ssbo_intel:
case nir_intrinsic_load_ssbo_ir3:
/* If there's a memory barrier after the loop, a load might be

View file

@ -96,6 +96,7 @@ get_info(nir_intrinsic_op op)
INFO(nir_var_mem_shared, load_shared2_amd, true, -1, 0, -1, -1, 1);
INFO(nir_var_mem_shared, store_shared2_amd, true, -1, 1, -1, 0, 1)
LOAD(nir_var_mem_global, global, -1, 0, -1, 1)
INFO(nir_var_mem_global, load_global_transpose_amd, true, -1, 0, -1, -1, 1)
STORE(nir_var_mem_global, global, -1, 1, -1, 0, 1)
LOAD(nir_var_mem_global, global_constant, -1, 0, -1, 1)
LOAD(nir_var_mem_task_payload, task_payload, -1, 0, -1, 1)
@ -1253,8 +1254,10 @@ may_alias_internal(struct entry *a, struct entry *b, uint32_t a_offset, uint32_t
int64_t diff = get_offset_diff(a, b) + b_offset - a_offset;
struct entry *first = diff < 0 ? b : a;
if (first->intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd)
if (first->intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd ||
first->intrin->intrinsic == nir_intrinsic_load_global_transpose_amd) {
return true;
}
uint64_t size = get_bit_size(first) / 8u * first->num_components;
return llabs(diff) < size;

View file

@ -831,6 +831,7 @@ is_indirect_load(nir_instr *instr)
}
if (intrin->intrinsic == nir_intrinsic_load_global ||
intrin->intrinsic == nir_intrinsic_load_global_transpose_amd ||
intrin->intrinsic == nir_intrinsic_load_deref_transpose_amd)
return true;

View file

@ -757,6 +757,14 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
break;
}
case nir_intrinsic_load_global_transpose_amd: {
/* Access flags that must not be set on a transposed global load. */
unsigned disallow_access = ACCESS_ATOMIC | ACCESS_SKIP_HELPERS | ACCESS_SMEM_AMD;
validate_assert(state, !(nir_intrinsic_access(instr) & disallow_access));
/* Only 4- or 8-component results are accepted. */
validate_assert(state, instr->num_components == 8 || instr->num_components == 4);
/* Expected bit size of source 0 (the address); presumably checked against
 * the actual source later in this function -- not visible in this hunk.
 */
src_bit_sizes[0] = 64;
break;
}
case nir_intrinsic_global_atomic_nv:
case nir_intrinsic_global_atomic_swap_nv:
case nir_intrinsic_shared_atomic_nv: