From e5d9cdb62afffdb4156079544ff7af6cd502393f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 9 May 2026 11:13:33 -0400 Subject: [PATCH] radeonsi/tests: add an ordered append bandwidth test This uses global_atomic_ordered_add_b64 to implement Ordered Append and experimentally measure its memory throughput. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/meson.build | 1 + .../radeonsi/tests/si_test_oa_bandwidth.c | 142 ++++++++++++++++++ src/gallium/drivers/radeonsi/tests/si_tests.c | 4 + src/gallium/drivers/radeonsi/tests/si_tests.h | 1 + .../drivers/radeonsi/tests/si_tests_private.h | 5 + 5 files changed, 153 insertions(+) create mode 100644 src/gallium/drivers/radeonsi/tests/si_test_oa_bandwidth.c diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 9d5071d4052..7177c60bbc1 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -149,6 +149,7 @@ if with_gfx_compute 'gfx/si_nir_mark_divergent_texture_non_uniform.c', 'gfx/si_nir_optim.c', 'tests/si_tests.c', + 'tests/si_test_oa_bandwidth.c', 'tests/si_test_dma_perf.c', 'tests/si_test_image_copy_region.c', 'tests/si_test_vm_fault.c', diff --git a/src/gallium/drivers/radeonsi/tests/si_test_oa_bandwidth.c b/src/gallium/drivers/radeonsi/tests/si_test_oa_bandwidth.c new file mode 100644 index 00000000000..1e8e9215fd9 --- /dev/null +++ b/src/gallium/drivers/radeonsi/tests/si_test_oa_bandwidth.c @@ -0,0 +1,142 @@ +/* Copyright © 2026 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#include "si_tests.h" +#include "si_tests_private.h" +#include "si_pipe.h" +#include "nir_builder.h" +#include "ac_nir_helpers.h" + +void +si_test_oa_bandwidth(struct si_screen *sscreen) +{ + struct pipe_context *ctx = sscreen->b.context_create(&sscreen->b, NULL, 0); + struct si_context *sctx = (struct si_context *)ctx; + assert(sctx->gfx_level >= GFX12); + + /* Only ac_nir_to_llvm implements ordered_add_loop_gfx12_amd. */ + sscreen->use_aco = false; + + struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0); + + const unsigned size = debug_get_num_option("mb", 64) * 1024 * 1024; + struct pipe_shader_buffer sb = {}; + sb.buffer = pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, size); + sb.buffer_size = size; + ctx->set_shader_buffers(ctx, MESA_SHADER_COMPUTE, 0, 1, &sb, 0x1); + + struct pipe_shader_buffer sb2 = {}; + sb2.buffer = pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 8); + sb2.buffer_size = 8; + ctx->set_shader_buffers(ctx, MESA_SHADER_COMPUTE, 1, 1, &sb2, 0x1); + + union pipe_query_result total[5][6] = {}; + + for (unsigned num_dwords_per_thread = 1; num_dwords_per_thread <= 16; + num_dwords_per_thread *= 2) { + for (unsigned block_size = 32; block_size <= 1024; block_size *= 2) { + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sscreen->nir_options, + "ordered_append_test"); + b.shader->info.workgroup_size[0] = block_size; + b.shader->info.workgroup_size[1] = 1; + b.shader->info.workgroup_size[2] = 1; + b.shader->info.num_ssbos = 2; + b.shader->info.shared_size = 4; + + nir_def *local_id = nir_channel(&b, nir_load_local_invocation_id(&b), 0); + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); + nir_def *global_id = ac_get_global_ids(&b, 1, 32); + nir_def *atomic_address = nir_load_ssbo_address(&b, 1, 64, nir_imm_int(&b, 1), nir_imm_int(&b, 0)); + + nir_if *if_tid0 = nir_push_if(&b, nir_ieq(&b, local_id, nir_imm_int(&b, 0))); + + nir_def *ordered_id = nir_iand_imm(&b, wg_id, 0xfff); + nir_def *atomic_src = nir_pack_64_2x32_split(&b, ordered_id, nir_imm_int(&b, 0)); + nir_def *count = nir_ordered_add_loop_gfx12_amd(&b, atomic_address, nir_imm_int(&b, 0), + ordered_id, atomic_src); + + if (SHADER_DEBUG_LOG) { + ac_nir_store_debug_log_amd(&b, nir_vec4(&b, ordered_id, count, + nir_imm_int(&b, 0), nir_imm_int(&b, 0))); + } + + nir_store_shared(&b, nir_imm_int(&b, 0), nir_imm_int(&b, 0)); + nir_pop_if(&b, if_tid0); + + nir_barrier(&b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP, + .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared); + nir_def *loaded_zero = nir_load_shared(&b, 1, 32, nir_imm_int(&b, 0)); + + /* Convert the global thread ID into bytes. */ + for (unsigned i = 0; i < DIV_ROUND_UP(num_dwords_per_thread, 4); i++) { + nir_def *offset = nir_iadd_imm(&b, nir_imul_imm(&b, global_id, 4 * num_dwords_per_thread), i * 16); + offset = nir_iadd(&b, offset, loaded_zero); + nir_store_ssbo(&b, nir_imm_zero(&b, MIN2(4, num_dwords_per_thread), 32), nir_imm_int(&b, 0), offset, + .access = ACCESS_RESTRICT); + } + + void *shader = si_create_shader_state(sctx, b.shader); + ctx->bind_compute_state(ctx, shader); + + unsigned wave_size = ((struct si_compute*)shader)->shader.wave_size; + sctx->cs_max_waves_per_sh = debug_get_num_option("max_wg_per_sa", 8) * (block_size / wave_size); + assert(sctx->cs_max_waves_per_sh); + + struct pipe_grid_info info = {}; + info.block[0] = block_size; + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = size / (info.block[0] * MAX2(1, num_dwords_per_thread) * 4); + info.grid[1] = 1; + info.grid[2] = 1; + + union pipe_query_result result; + const unsigned num_warmup_repeats = SHADER_DEBUG_LOG ? 1 : 5; + const unsigned num_repeats = SHADER_DEBUG_LOG ? 1 : 32; + + for (unsigned i = 0; i < num_warmup_repeats + num_repeats; i++) { + uint32_t clear_value = 0; + si_barrier_before_simple_buffer_op(sctx, 0, sb2.buffer, NULL); + si_clear_buffer(sctx, sb2.buffer, 0, 8, &clear_value, 4, SI_AUTO_SELECT_CLEAR_METHOD, false); + si_barrier_after_simple_buffer_op(sctx, 0, sb2.buffer, NULL); + + if (i >= num_warmup_repeats) + ctx->begin_query(ctx, q); + + ctx->launch_grid(ctx, &info); + + if (i >= num_warmup_repeats) { + ctx->end_query(ctx, q); + ctx->get_query_result(ctx, q, true, &result); + + total[util_logbase2(num_dwords_per_thread)][util_logbase2(block_size) - 5].u64 += result.u64; + } + } + + total[util_logbase2(num_dwords_per_thread)][util_logbase2(block_size) - 5].u64 /= num_repeats; + } + } + + printf("Printing GB/s ordered append store bandwidth.\n"); + printf(" Stored dw/lane, Workgroup sizes\n"); + printf(" ,"); + for (unsigned j = 0; j < 6; j++) { + unsigned wg_size = 32 << j; + unsigned spaces = 4 - (int)(log(wg_size) / log(10)); + for (unsigned a = 0; a < spaces; a++) + printf("_"); + + printf("%u,", wg_size); + } + printf("\n"); + + for (unsigned i = 0; i < 5; i++) { + printf(" %4u| ", 4 << i); + + for (unsigned j = 0; j < 6; j++) + printf("%4u, ", (unsigned)((double)size / total[i][j].u64)); + + printf("\n"); + } +} diff --git a/src/gallium/drivers/radeonsi/tests/si_tests.c b/src/gallium/drivers/radeonsi/tests/si_tests.c index d828c7c7bd0..35ef4abc34c 100644 --- a/src/gallium/drivers/radeonsi/tests/si_tests.c +++ b/src/gallium/drivers/radeonsi/tests/si_tests.c @@ -19,6 +19,7 @@ static const struct debug_named_value test_options[] = { {"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."}, {"dmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"}, {"testmemperf", DBG(TEST_MEM_PERF), "Test map + memcpy perf using the winsys."}, + {"testoa", DBG(TEST_OA_BANDWIDTH), "Test ordered append bandwidth"}, DEBUG_NAMED_VALUE_END /* must be last */ }; @@ -48,6 +49,9 @@ void si_run_tests(struct si_screen *sscreen) if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SHADER))) si_test_vmfault(sscreen, test_flags); + if (test_flags & (DBG(TEST_OA_BANDWIDTH))) + si_test_oa_bandwidth(sscreen); + if (test_flags) exit(0); } diff --git a/src/gallium/drivers/radeonsi/tests/si_tests.h b/src/gallium/drivers/radeonsi/tests/si_tests.h index 1bd672f22f2..bbe35283106 100644 --- a/src/gallium/drivers/radeonsi/tests/si_tests.h +++ b/src/gallium/drivers/radeonsi/tests/si_tests.h @@ -20,6 +20,7 @@ enum DBG_TEST_VMFAULT_SHADER, DBG_TEST_DMA_PERF, DBG_TEST_MEM_PERF, + DBG_TEST_OA_BANDWIDTH, }; struct si_screen; diff --git a/src/gallium/drivers/radeonsi/tests/si_tests_private.h b/src/gallium/drivers/radeonsi/tests/si_tests_private.h index 655bd9578a0..32da73b98e6 100644 --- a/src/gallium/drivers/radeonsi/tests/si_tests_private.h +++ b/src/gallium/drivers/radeonsi/tests/si_tests_private.h @@ -9,6 +9,11 @@ #include +struct si_screen; + +/* si_test_oa_bandwidth.c */ +void si_test_oa_bandwidth(struct si_screen *sscreen); + /* si_test_image_copy_region.c */ void si_test_image_copy_region(struct si_screen *sscreen); void si_test_blit(struct si_screen *sscreen, unsigned test_flags);