ac/nir/lower_ngg: use voffset in global_atomic_add for xfb

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29510>
This commit is contained in:
Marek Olšák 2024-06-02 15:39:49 -04:00 committed by Marge Bot
parent b002564633
commit 1ea96a47cd

View file

@@ -1881,7 +1881,9 @@ ngg_build_streamout_buffer_info(nir_builder *b,
workgroup_buffer_sizes[buffer] = undef; workgroup_buffer_sizes[buffer] = undef;
} }
nir_def *buffer_offsets = NULL, *xfb_state_address = NULL; nir_def *buffer_offsets = NULL;
nir_def *xfb_state_address = nir_load_xfb_state_address_gfx12_amd(b);
nir_def *xfb_voffset = nir_imul_imm(b, tid_in_tg, 8);
/* Get current global offset of buffer and increase by amount of /* Get current global offset of buffer and increase by amount of
* workgroup buffer size. This is an ordered operation sorted by * workgroup buffer size. This is an ordered operation sorted by
@@ -1929,9 +1931,6 @@ ngg_build_streamout_buffer_info(nir_builder *b,
* - The whole structure should be entirely within one 64B block of memory * - The whole structure should be entirely within one 64B block of memory
* for performance. (the address bits above 64B should not differ between lanes) * for performance. (the address bits above 64B should not differ between lanes)
*/ */
nir_def *voffset = nir_imul_imm(b, tid_in_tg, 8);
xfb_state_address = nir_iadd(b, nir_load_xfb_state_address_gfx12_amd(b),
nir_u2u64(b, voffset));
nir_def *buffer_offset_per_lane; nir_def *buffer_offset_per_lane;
/* The gfx12 intrinsic inserts hand-written assembly producing better code than current /* The gfx12 intrinsic inserts hand-written assembly producing better code than current
@@ -1939,8 +1938,8 @@ ngg_build_streamout_buffer_info(nir_builder *b,
*/ */
if (use_gfx12_xfb_intrinsic) { if (use_gfx12_xfb_intrinsic) {
buffer_offset_per_lane = buffer_offset_per_lane =
nir_ordered_xfb_counter_add_gfx12_amd(b, nir_load_xfb_state_address_gfx12_amd(b), nir_ordered_xfb_counter_add_gfx12_amd(b, xfb_state_address, xfb_voffset, ordered_id,
voffset, ordered_id, atomic_src); atomic_src);
} else { } else {
/* The NIR version of the above using nir_atomic_op_ordered_add_gfx12_amd. */ /* The NIR version of the above using nir_atomic_op_ordered_add_gfx12_amd. */
enum { NUM_ATOMICS_IN_FLIGHT = 6 }; enum { NUM_ATOMICS_IN_FLIGHT = 6 };
@@ -1962,7 +1961,7 @@ ngg_build_streamout_buffer_info(nir_builder *b,
*/ */
for (unsigned i = 0; i < NUM_ATOMICS_IN_FLIGHT - 1; i++) { for (unsigned i = 0; i < NUM_ATOMICS_IN_FLIGHT - 1; i++) {
nir_store_var(b, result_ring[i], nir_store_var(b, result_ring[i],
nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, nir_imm_int(b, 0), nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, xfb_voffset,
.atomic_op = nir_atomic_op_ordered_add_gfx12_amd), 0x1); .atomic_op = nir_atomic_op_ordered_add_gfx12_amd), 0x1);
} }
@@ -1977,7 +1976,7 @@ ngg_build_streamout_buffer_info(nir_builder *b,
/* Issue (or repeat) the atomic. */ /* Issue (or repeat) the atomic. */
nir_store_var(b, result_ring[issue_index], nir_store_var(b, result_ring[issue_index],
nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, nir_imm_int(b, 0), nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, xfb_voffset,
.atomic_op = nir_atomic_op_ordered_add_gfx12_amd), 0x1); .atomic_op = nir_atomic_op_ordered_add_gfx12_amd), 0x1);
/* Break if the oldest atomic succeeded in incrementing the offsets. */ /* Break if the oldest atomic succeeded in incrementing the offsets. */
@@ -1987,7 +1986,7 @@ ngg_build_streamout_buffer_info(nir_builder *b,
/* Debug: Write the vec4 into a shader log ring buffer. */ /* Debug: Write the vec4 into a shader log ring buffer. */
#if 0 #if 0
ac_nir_store_debug_log_amd(b, nir_vec4(b, nir_u2u32(b, nir_load_xfb_state_address_gfx12_amd(b)), ac_nir_store_debug_log_amd(b, nir_vec4(b, nir_u2u32(b, xfb_state_address),
ordered_id, loaded_ordered_id, ordered_id, loaded_ordered_id,
loaded_dwords_written)); loaded_dwords_written));
#endif #endif
@@ -2027,8 +2026,6 @@ ngg_build_streamout_buffer_info(nir_builder *b,
buffer_offsets = nir_vec(b, offset, 4); buffer_offsets = nir_vec(b, offset, 4);
} }
nir_pop_if(b, if_4lanes); nir_pop_if(b, if_4lanes);
xfb_state_address = nir_if_phi(b, xfb_state_address, nir_undef(b, 1, 64));
buffer_offsets = nir_if_phi(b, buffer_offsets, nir_undef(b, 4, 32)); buffer_offsets = nir_if_phi(b, buffer_offsets, nir_undef(b, 4, 32));
if_invocation_0 = nir_push_if(b, nir_ieq_imm(b, tid_in_tg, 0)); if_invocation_0 = nir_push_if(b, nir_ieq_imm(b, tid_in_tg, 0));
@@ -2101,10 +2098,8 @@ ngg_build_streamout_buffer_info(nir_builder *b,
nir_def *overflow_amount_per_lane = nir_def *overflow_amount_per_lane =
write_values_to_lanes(b, overflow_amount, info->buffers_written); write_values_to_lanes(b, overflow_amount, info->buffers_written);
nir_def *address_per_lane = nir_iadd(b, xfb_state_address, nir_global_atomic_amd(b, 32, xfb_state_address, nir_ineg(b, overflow_amount_per_lane),
nir_imm_intN_t(b, 4, 64)); xfb_voffset, .base = 4, .atomic_op = nir_atomic_op_iadd);
nir_global_atomic_amd(b, 32, address_per_lane, nir_ineg(b, overflow_amount_per_lane),
nir_imm_int(b, 0), .atomic_op = nir_atomic_op_iadd);
} }
nir_pop_if(b, if_any_overflow_4_lanes); nir_pop_if(b, if_any_overflow_4_lanes);