From e0bcab953def3f03d884c6371706663f72b160c0 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Sat, 7 Sep 2024 14:22:11 +0200 Subject: [PATCH] nir: add amd shared append/consume Reviewed-by: Rhys Perry Reviewed-by: Alyssa Rosenzweig Part-of: --- src/compiler/nir/nir_divergence_analysis.c | 2 ++ src/compiler/nir/nir_intrinsics.py | 5 +++++ src/compiler/nir/nir_lower_multiview.c | 2 ++ src/compiler/nir/nir_opt_load_store_vectorize.c | 2 ++ src/compiler/nir/nir_schedule.c | 2 ++ 5 files changed, 13 insertions(+) diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 9017a23324c..35ccc489415 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -133,6 +133,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_first_invocation: case nir_intrinsic_last_invocation: case nir_intrinsic_load_subgroup_id: + case nir_intrinsic_shared_append_amd: + case nir_intrinsic_shared_consume_amd: /* VS/TES/GS invocations of the same primitive can be in different * subgroups, so subgroup ops are always divergent between vertices of * the same primitive. diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index e07730547ed..26175fdce92 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1535,6 +1535,11 @@ store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRIT # Same as shared_atomic_add, but with GDS. src[] = {store_val, gds_addr, m0} intrinsic("gds_atomic_add_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE]) +# Optimized shared_atomic_add (1/-1) with constant address +# returning the uniform pre-op value for all invocations. +intrinsic("shared_append_amd", src_comp=[], dest_comp=1, bit_sizes=[32], indices=[BASE]) +intrinsic("shared_consume_amd", src_comp=[], dest_comp=1, bit_sizes=[32], indices=[BASE]) + # src[] = { sample_id, num_samples } intrinsic("load_sample_positions_amd", src_comp=[1, 1], dest_comp=2, flags=[CAN_ELIMINATE, CAN_REORDER]) diff --git a/src/compiler/nir/nir_lower_multiview.c b/src/compiler/nir/nir_lower_multiview.c index 144c662669c..b48902cdae6 100644 --- a/src/compiler/nir/nir_lower_multiview.c +++ b/src/compiler/nir/nir_lower_multiview.c @@ -64,6 +64,8 @@ shader_writes_to_memory(nir_shader *shader) case nir_intrinsic_store_shared2_amd: case nir_intrinsic_shared_atomic: case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_shared_append_amd: + case nir_intrinsic_shared_consume_amd: case nir_intrinsic_task_payload_atomic: case nir_intrinsic_task_payload_atomic_swap: case nir_intrinsic_image_deref_store: diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c index 0f87e5ab55a..1ac7ccce676 100644 --- a/src/compiler/nir/nir_opt_load_store_vectorize.c +++ b/src/compiler/nir/nir_opt_load_store_vectorize.c @@ -109,6 +109,8 @@ get_info(nir_intrinsic_op op) INFO(nir_var_mem_ubo, ldcx_nv, false, 0, 1, -1, -1, 1) LOAD(nir_var_uniform, const_ir3, -1, 0, -1, 4) STORE(nir_var_uniform, const_ir3, -1, -1, -1, 0, 4) + INFO(nir_var_mem_shared, shared_append_amd, true, -1, -1, -1, -1, 1) + INFO(nir_var_mem_shared, shared_consume_amd, true, -1, -1, -1, -1, 1) default: break; #undef ATOMIC diff --git a/src/compiler/nir/nir_schedule.c b/src/compiler/nir/nir_schedule.c index 4190a06ac29..1439b024a1c 100644 --- a/src/compiler/nir/nir_schedule.c +++ b/src/compiler/nir/nir_schedule.c @@ -395,6 +395,8 @@ nir_schedule_intrinsic_deps(nir_deps_state *state, case nir_intrinsic_shared_atomic: case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_shared_append_amd: + case nir_intrinsic_shared_consume_amd: case nir_intrinsic_store_shared: case nir_intrinsic_store_shared2_amd: add_write_dep(state, &state->store_shared, n);