From 1ea96a47cdf13cfc833f1f3ddf3106d0ea1ffc47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 2 Jun 2024 15:39:49 -0400 Subject: [PATCH] ac/nir/lower_ngg: use voffset in global_atomic_add for xfb Acked-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Georg Lehmann Part-of: --- src/amd/common/ac_nir_lower_ngg.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index cdd24b5e639..fc5093aed7f 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -1881,7 +1881,9 @@ ngg_build_streamout_buffer_info(nir_builder *b, workgroup_buffer_sizes[buffer] = undef; } - nir_def *buffer_offsets = NULL, *xfb_state_address = NULL; + nir_def *buffer_offsets = NULL; + nir_def *xfb_state_address = nir_load_xfb_state_address_gfx12_amd(b); + nir_def *xfb_voffset = nir_imul_imm(b, tid_in_tg, 8); /* Get current global offset of buffer and increase by amount of * workgroup buffer size. This is an ordered operation sorted by @@ -1929,9 +1931,6 @@ ngg_build_streamout_buffer_info(nir_builder *b, * - The whole structure should be entirely within one 64B block of memory * for performance. (the address bits above 64B should not differ between lanes) */ - nir_def *voffset = nir_imul_imm(b, tid_in_tg, 8); - xfb_state_address = nir_iadd(b, nir_load_xfb_state_address_gfx12_amd(b), - nir_u2u64(b, voffset)); nir_def *buffer_offset_per_lane; /* The gfx12 intrinsic inserts hand-written assembly producing better code than current @@ -1939,8 +1938,8 @@ ngg_build_streamout_buffer_info(nir_builder *b, */ if (use_gfx12_xfb_intrinsic) { buffer_offset_per_lane = - nir_ordered_xfb_counter_add_gfx12_amd(b, nir_load_xfb_state_address_gfx12_amd(b), - voffset, ordered_id, atomic_src); + nir_ordered_xfb_counter_add_gfx12_amd(b, xfb_state_address, xfb_voffset, ordered_id, + atomic_src); } else { /* The NIR version of the above using nir_atomic_op_ordered_add_gfx12_amd. */ enum { NUM_ATOMICS_IN_FLIGHT = 6 }; @@ -1962,7 +1961,7 @@ ngg_build_streamout_buffer_info(nir_builder *b, */ for (unsigned i = 0; i < NUM_ATOMICS_IN_FLIGHT - 1; i++) { nir_store_var(b, result_ring[i], - nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, nir_imm_int(b, 0), + nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, xfb_voffset, .atomic_op = nir_atomic_op_ordered_add_gfx12_amd), 0x1); } @@ -1977,7 +1976,7 @@ ngg_build_streamout_buffer_info(nir_builder *b, /* Issue (or repeat) the atomic. */ nir_store_var(b, result_ring[issue_index], - nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, nir_imm_int(b, 0), + nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, xfb_voffset, .atomic_op = nir_atomic_op_ordered_add_gfx12_amd), 0x1); /* Break if the oldest atomic succeeded in incrementing the offsets. */ @@ -1987,7 +1986,7 @@ ngg_build_streamout_buffer_info(nir_builder *b, /* Debug: Write the vec4 into a shader log ring buffer. */ #if 0 - ac_nir_store_debug_log_amd(b, nir_vec4(b, nir_u2u32(b, nir_load_xfb_state_address_gfx12_amd(b)), + ac_nir_store_debug_log_amd(b, nir_vec4(b, nir_u2u32(b, xfb_state_address), ordered_id, loaded_ordered_id, loaded_dwords_written)); #endif @@ -2027,8 +2026,6 @@ ngg_build_streamout_buffer_info(nir_builder *b, buffer_offsets = nir_vec(b, offset, 4); } nir_pop_if(b, if_4lanes); - - xfb_state_address = nir_if_phi(b, xfb_state_address, nir_undef(b, 1, 64)); buffer_offsets = nir_if_phi(b, buffer_offsets, nir_undef(b, 4, 32)); if_invocation_0 = nir_push_if(b, nir_ieq_imm(b, tid_in_tg, 0)); @@ -2101,10 +2098,8 @@ ngg_build_streamout_buffer_info(nir_builder *b, nir_def *overflow_amount_per_lane = write_values_to_lanes(b, overflow_amount, info->buffers_written); - nir_def *address_per_lane = nir_iadd(b, xfb_state_address, - nir_imm_intN_t(b, 4, 64)); - nir_global_atomic_amd(b, 32, address_per_lane, nir_ineg(b, overflow_amount_per_lane), - nir_imm_int(b, 0), .atomic_op = nir_atomic_op_iadd); + nir_global_atomic_amd(b, 32, xfb_state_address, nir_ineg(b, overflow_amount_per_lane), + xfb_voffset, .base = 4, .atomic_op = nir_atomic_op_iadd); } nir_pop_if(b, if_any_overflow_4_lanes);