nir: add nvidia IO intrinsics

Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39525>
This commit is contained in:
Karol Herbst 2025-09-16 22:58:18 +02:00 committed by Marge Bot
parent 24073b66fa
commit e779538ad2
6 changed files with 79 additions and 0 deletions

View file

@ -5595,6 +5595,27 @@ nir_src *nir_get_io_index_src(nir_intrinsic_instr *instr);
nir_src *nir_get_io_arrayed_index_src(nir_intrinsic_instr *instr);
nir_src *nir_get_shader_call_payload_src(nir_intrinsic_instr *call);
static inline unsigned
nir_get_io_base_size_nv(const nir_intrinsic_instr *intr)
{
   /* Returns the width in bits of the BASE index carried by an
    * NVIDIA-specific memory intrinsic.  Every NV variant currently
    * defined (global/shared/scratch loads, stores, and atomics) uses a
    * 24-bit base.
    */
   switch (intr->intrinsic) {
   case nir_intrinsic_load_global_nv:
   case nir_intrinsic_store_global_nv:
   case nir_intrinsic_global_atomic_nv:
   case nir_intrinsic_global_atomic_swap_nv:
   case nir_intrinsic_load_shared_nv:
   case nir_intrinsic_store_shared_nv:
   case nir_intrinsic_shared_atomic_nv:
   case nir_intrinsic_shared_atomic_swap_nv:
   case nir_intrinsic_load_scratch_nv:
   case nir_intrinsic_store_scratch_nv:
      return 24;
   default:
      UNREACHABLE("unknown nvidia intrinsic");
      /* Unreached; keeps compilers that don't see through UNREACHABLE()
       * from warning about a missing return value.
       */
      return -1;
   }
}
bool nir_is_shared_access(nir_intrinsic_instr *intr);
bool nir_is_output_load(nir_intrinsic_instr *intr);
bool nir_is_input_load(nir_intrinsic_instr *intr);

View file

@ -612,6 +612,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_shared:
case nir_intrinsic_load_shared_ir3:
case nir_intrinsic_load_shared_nv:
is_divergent = src_divergent(instr->src[0], state) ||
(options & nir_divergence_uniform_load_tears);
break;
@ -619,6 +620,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_2x32:
case nir_intrinsic_load_global_ir3:
case nir_intrinsic_load_global_nv:
case nir_intrinsic_load_deref: {
if (load_may_tear(state, instr)) {
is_divergent = true;
@ -880,6 +882,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_helper_invocation:
case nir_intrinsic_is_helper_invocation:
case nir_intrinsic_load_scratch:
case nir_intrinsic_load_scratch_nv:
case nir_intrinsic_deref_atomic:
case nir_intrinsic_deref_atomic_swap:
case nir_intrinsic_ssbo_atomic:
@ -894,6 +897,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_bindless_image_atomic_swap:
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_swap:
case nir_intrinsic_shared_atomic_nv:
case nir_intrinsic_shared_atomic_swap_nv:
case nir_intrinsic_task_payload_atomic:
case nir_intrinsic_task_payload_atomic_swap:
case nir_intrinsic_global_atomic:
@ -905,6 +910,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_global_atomic_swap_agx:
case nir_intrinsic_global_atomic_2x32:
case nir_intrinsic_global_atomic_swap_2x32:
case nir_intrinsic_global_atomic_nv:
case nir_intrinsic_global_atomic_swap_nv:
case nir_intrinsic_global_atomic_pco:
case nir_intrinsic_atomic_counter_add:
case nir_intrinsic_atomic_counter_min:

View file

@ -905,6 +905,9 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0,
# AGX global variants take a 64-bit base address plus a 32-bit offset in words.
# The offset is sign-extended or zero-extended based on the SIGN_EXTEND index.
#
# NV variants all carry a 24-bit BASE index, which is interpreted as unsigned
# when the address source is a constant 0, and as signed otherwise.
#
# PCO global variants use a vec3 for the memory address and data, where component X
# has the low 32 address bits, component Y has the high 32 address bits, and component Z
# has the data parameter.
@ -912,21 +915,25 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0,
intrinsic("deref_atomic", src_comp=[-1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP])
intrinsic("ssbo_atomic", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP, OFFSET_SHIFT])
intrinsic("shared_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("shared_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("task_payload_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("global_atomic", src_comp=[1, 1], dest_comp=1, indices=[ATOMIC_OP])
intrinsic("global_atomic_2x32", src_comp=[2, 1], dest_comp=1, indices=[ATOMIC_OP])
intrinsic("global_atomic_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("global_atomic_agx", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMIC_OP, SIGN_EXTEND])
intrinsic("global_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("global_atomic_pco", src_comp=[3], dest_comp=1, indices=[ATOMIC_OP], bit_sizes=[32])
intrinsic("deref_atomic_swap", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP])
intrinsic("ssbo_atomic_swap", src_comp=[-1, 1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP, OFFSET_SHIFT])
intrinsic("shared_atomic_swap", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("shared_atomic_swap_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("task_payload_atomic_swap", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("global_atomic_swap", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMIC_OP])
intrinsic("global_atomic_swap_2x32", src_comp=[2, 1, 1], dest_comp=1, indices=[ATOMIC_OP])
intrinsic("global_atomic_swap_amd", src_comp=[1, 1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("global_atomic_swap_agx", src_comp=[1, 1, 1, 1], dest_comp=1, indices=[ATOMIC_OP, SIGN_EXTEND])
intrinsic("global_atomic_swap_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP])
intrinsic("global_atomic_swap_pco", src_comp=[4], dest_comp=1, indices=[ATOMIC_OP], bit_sizes=[32])
def system_value(name, dest_comp, indices=[], bit_sizes=[32], can_reorder=True):
@ -1825,6 +1832,15 @@ load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flag
# src[] = { value, address, unsigned 32-bit offset }.
store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRITE_MASK])
# src[] = { address }. BASE is a 24-bit offset: unsigned if the address is a
# constant 0, signed otherwise.
load("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET])
load("shared_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
store("shared_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
# Same as shared_atomic_add, but with GDS. src[] = {store_val, gds_addr, m0}
intrinsic("gds_atomic_add_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE])

View file

@ -979,6 +979,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
case nir_intrinsic_load_output:
case nir_intrinsic_load_pixel_local:
case nir_intrinsic_load_shared:
case nir_intrinsic_load_shared_nv:
case nir_intrinsic_load_task_payload:
case nir_intrinsic_load_uniform:
case nir_intrinsic_load_constant:
@ -988,16 +989,22 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
case nir_intrinsic_load_global_2x32:
case nir_intrinsic_load_global_constant:
case nir_intrinsic_load_global_etna:
case nir_intrinsic_load_global_nv:
case nir_intrinsic_load_scratch:
case nir_intrinsic_load_scratch_nv:
case nir_intrinsic_load_fs_input_interp_deltas:
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_nv:
case nir_intrinsic_shared_atomic_swap:
case nir_intrinsic_shared_atomic_swap_nv:
case nir_intrinsic_task_payload_atomic:
case nir_intrinsic_task_payload_atomic_swap:
case nir_intrinsic_global_atomic:
case nir_intrinsic_global_atomic_2x32:
case nir_intrinsic_global_atomic_nv:
case nir_intrinsic_global_atomic_swap:
case nir_intrinsic_global_atomic_swap_2x32:
case nir_intrinsic_global_atomic_swap_nv:
case nir_intrinsic_load_coefficients_agx:
case nir_intrinsic_load_shared_block_intel:
case nir_intrinsic_load_global_block_intel:
@ -1021,11 +1028,14 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
case nir_intrinsic_store_output:
case nir_intrinsic_store_pixel_local:
case nir_intrinsic_store_shared:
case nir_intrinsic_store_shared_nv:
case nir_intrinsic_store_task_payload:
case nir_intrinsic_store_global:
case nir_intrinsic_store_global_2x32:
case nir_intrinsic_store_global_etna:
case nir_intrinsic_store_global_nv:
case nir_intrinsic_store_scratch:
case nir_intrinsic_store_scratch_nv:
case nir_intrinsic_ssbo_atomic:
case nir_intrinsic_ssbo_atomic_swap:
case nir_intrinsic_ldc_nv:

View file

@ -240,6 +240,7 @@ node_is_dead(nir_cf_node *node)
case nir_intrinsic_load_shared:
case nir_intrinsic_load_shared2_amd:
case nir_intrinsic_load_shared_nv:
case nir_intrinsic_load_output:
case nir_intrinsic_load_pixel_local:
case nir_intrinsic_load_per_vertex_output:

View file

@ -729,6 +729,30 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
}
break;
case nir_intrinsic_global_atomic_nv:
case nir_intrinsic_global_atomic_swap_nv:
case nir_intrinsic_shared_atomic_nv:
case nir_intrinsic_shared_atomic_swap_nv:
case nir_intrinsic_load_global_nv:
case nir_intrinsic_load_scratch_nv:
case nir_intrinsic_load_shared_nv:
case nir_intrinsic_store_global_nv:
case nir_intrinsic_store_scratch_nv:
case nir_intrinsic_store_shared_nv: {
int base = nir_intrinsic_base(instr);
nir_src src = *nir_get_io_offset_src(instr);
unsigned const_bits = nir_get_io_base_size_nv(instr);
if (nir_src_is_const(src) && nir_src_as_int(src) == 0) {
validate_assert(state, base >= 0 && base < BITFIELD_MASK(const_bits));
} else {
int32_t max = BITFIELD_MASK(const_bits - 1);
int32_t min = ~BITFIELD_MASK(const_bits - 1);
validate_assert(state, base >= min && base < max);
}
break;
}
default:
break;
}