mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-04 06:38:19 +02:00
radeonsi/tests: add an ordered append bandwidth test
This uses global_atomic_ordered_add_b64 to implement Ordered Append and experimentally measure its memory throughput. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41452>
This commit is contained in:
parent
addd09fb76
commit
e5d9cdb62a
5 changed files with 153 additions and 0 deletions
|
|
@ -149,6 +149,7 @@ if with_gfx_compute
|
|||
'gfx/si_nir_mark_divergent_texture_non_uniform.c',
|
||||
'gfx/si_nir_optim.c',
|
||||
'tests/si_tests.c',
|
||||
'tests/si_test_oa_bandwidth.c',
|
||||
'tests/si_test_dma_perf.c',
|
||||
'tests/si_test_image_copy_region.c',
|
||||
'tests/si_test_vm_fault.c',
|
||||
|
|
|
|||
142
src/gallium/drivers/radeonsi/tests/si_test_oa_bandwidth.c
Normal file
142
src/gallium/drivers/radeonsi/tests/si_test_oa_bandwidth.c
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
/* Copyright © 2026 Valve Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "si_tests.h"
|
||||
#include "si_tests_private.h"
|
||||
#include "si_pipe.h"
|
||||
#include "nir_builder.h"
|
||||
#include "ac_nir_helpers.h"
|
||||
|
||||
void
|
||||
si_test_oa_bandwidth(struct si_screen *sscreen)
|
||||
{
|
||||
struct pipe_context *ctx = sscreen->b.context_create(&sscreen->b, NULL, 0);
|
||||
struct si_context *sctx = (struct si_context *)ctx;
|
||||
assert(sctx->gfx_level >= GFX12);
|
||||
|
||||
/* Only ac_nir_to_llvm implements ordered_add_loop_gfx12_amd. */
|
||||
sscreen->use_aco = false;
|
||||
|
||||
struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0);
|
||||
|
||||
const unsigned size = debug_get_num_option("mb", 64) * 1024 * 1024;
|
||||
struct pipe_shader_buffer sb = {};
|
||||
sb.buffer = pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, size);
|
||||
sb.buffer_size = size;
|
||||
ctx->set_shader_buffers(ctx, MESA_SHADER_COMPUTE, 0, 1, &sb, 0x1);
|
||||
|
||||
struct pipe_shader_buffer sb2 = {};
|
||||
sb2.buffer = pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 8);
|
||||
sb2.buffer_size = 8;
|
||||
ctx->set_shader_buffers(ctx, MESA_SHADER_COMPUTE, 1, 1, &sb2, 0x1);
|
||||
|
||||
union pipe_query_result total[5][6] = {};
|
||||
|
||||
for (unsigned num_dwords_per_thread = 1; num_dwords_per_thread <= 16;
|
||||
num_dwords_per_thread *= 2) {
|
||||
for (unsigned block_size = 32; block_size <= 1024; block_size *= 2) {
|
||||
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sscreen->nir_options,
|
||||
"ordered_append_test");
|
||||
b.shader->info.workgroup_size[0] = block_size;
|
||||
b.shader->info.workgroup_size[1] = 1;
|
||||
b.shader->info.workgroup_size[2] = 1;
|
||||
b.shader->info.num_ssbos = 2;
|
||||
b.shader->info.shared_size = 4;
|
||||
|
||||
nir_def *local_id = nir_channel(&b, nir_load_local_invocation_id(&b), 0);
|
||||
nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
|
||||
nir_def *global_id = ac_get_global_ids(&b, 1, 32);
|
||||
nir_def *atomic_address = nir_load_ssbo_address(&b, 1, 64, nir_imm_int(&b, 1), nir_imm_int(&b, 0));
|
||||
|
||||
nir_if *if_tid0 = nir_push_if(&b, nir_ieq(&b, local_id, nir_imm_int(&b, 0)));
|
||||
|
||||
nir_def *ordered_id = nir_iand_imm(&b, wg_id, 0xfff);
|
||||
nir_def *atomic_src = nir_pack_64_2x32_split(&b, ordered_id, nir_imm_int(&b, 0));
|
||||
nir_def *count = nir_ordered_add_loop_gfx12_amd(&b, atomic_address, nir_imm_int(&b, 0),
|
||||
ordered_id, atomic_src);
|
||||
|
||||
if (SHADER_DEBUG_LOG) {
|
||||
ac_nir_store_debug_log_amd(&b, nir_vec4(&b, ordered_id, count,
|
||||
nir_imm_int(&b, 0), nir_imm_int(&b, 0)));
|
||||
}
|
||||
|
||||
nir_store_shared(&b, nir_imm_int(&b, 0), nir_imm_int(&b, 0));
|
||||
nir_pop_if(&b, if_tid0);
|
||||
|
||||
nir_barrier(&b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
|
||||
.memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
|
||||
nir_def *loaded_zero = nir_load_shared(&b, 1, 32, nir_imm_int(&b, 0));
|
||||
|
||||
/* Convert the global thread ID into bytes. */
|
||||
for (unsigned i = 0; i < DIV_ROUND_UP(num_dwords_per_thread, 4); i++) {
|
||||
nir_def *offset = nir_iadd_imm(&b, nir_imul_imm(&b, global_id, 4 * num_dwords_per_thread), i * 16);
|
||||
offset = nir_iadd(&b, offset, loaded_zero);
|
||||
nir_store_ssbo(&b, nir_imm_zero(&b, MIN2(4, num_dwords_per_thread), 32), nir_imm_int(&b, 0), offset,
|
||||
.access = ACCESS_RESTRICT);
|
||||
}
|
||||
|
||||
void *shader = si_create_shader_state(sctx, b.shader);
|
||||
ctx->bind_compute_state(ctx, shader);
|
||||
|
||||
unsigned wave_size = ((struct si_compute*)shader)->shader.wave_size;
|
||||
sctx->cs_max_waves_per_sh = debug_get_num_option("max_wg_per_sa", 8) * (block_size / wave_size);
|
||||
assert(sctx->cs_max_waves_per_sh);
|
||||
|
||||
struct pipe_grid_info info = {};
|
||||
info.block[0] = block_size;
|
||||
info.block[1] = 1;
|
||||
info.block[2] = 1;
|
||||
info.grid[0] = size / (info.block[0] * MAX2(1, num_dwords_per_thread) * 4);
|
||||
info.grid[1] = 1;
|
||||
info.grid[2] = 1;
|
||||
|
||||
union pipe_query_result result;
|
||||
const unsigned num_warmup_repeats = SHADER_DEBUG_LOG ? 1 : 5;
|
||||
const unsigned num_repeats = SHADER_DEBUG_LOG ? 1 : 32;
|
||||
|
||||
for (unsigned i = 0; i < num_warmup_repeats + num_repeats; i++) {
|
||||
uint32_t clear_value = 0;
|
||||
si_barrier_before_simple_buffer_op(sctx, 0, sb2.buffer, NULL);
|
||||
si_clear_buffer(sctx, sb2.buffer, 0, 8, &clear_value, 4, SI_AUTO_SELECT_CLEAR_METHOD, false);
|
||||
si_barrier_after_simple_buffer_op(sctx, 0, sb2.buffer, NULL);
|
||||
|
||||
if (i >= num_warmup_repeats)
|
||||
ctx->begin_query(ctx, q);
|
||||
|
||||
ctx->launch_grid(ctx, &info);
|
||||
|
||||
if (i >= num_warmup_repeats) {
|
||||
ctx->end_query(ctx, q);
|
||||
ctx->get_query_result(ctx, q, true, &result);
|
||||
|
||||
total[util_logbase2(num_dwords_per_thread)][util_logbase2(block_size) - 5].u64 += result.u64;
|
||||
}
|
||||
}
|
||||
|
||||
total[util_logbase2(num_dwords_per_thread)][util_logbase2(block_size) - 5].u64 /= num_repeats;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Printing GB/s ordered append store bandwidth.\n");
|
||||
printf(" Stored dw/lane, Workgroup sizes\n");
|
||||
printf(" ,");
|
||||
for (unsigned j = 0; j < 6; j++) {
|
||||
unsigned wg_size = 32 << j;
|
||||
unsigned spaces = 4 - (int)(log(wg_size) / log(10));
|
||||
for (unsigned a = 0; a < spaces; a++)
|
||||
printf("_");
|
||||
|
||||
printf("%u,", wg_size);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
for (unsigned i = 0; i < 5; i++) {
|
||||
printf(" %4u| ", 4 << i);
|
||||
|
||||
for (unsigned j = 0; j < 6; j++)
|
||||
printf("%4u, ", (unsigned)((double)size / total[i][j].u64));
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
|
@ -19,6 +19,7 @@ static const struct debug_named_value test_options[] = {
|
|||
{"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."},
|
||||
{"dmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"},
|
||||
{"testmemperf", DBG(TEST_MEM_PERF), "Test map + memcpy perf using the winsys."},
|
||||
{"testoa", DBG(TEST_OA_BANDWIDTH), "Test ordered append bandwidth"},
|
||||
|
||||
DEBUG_NAMED_VALUE_END /* must be last */
|
||||
};
|
||||
|
|
@ -48,6 +49,9 @@ void si_run_tests(struct si_screen *sscreen)
|
|||
if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SHADER)))
|
||||
si_test_vmfault(sscreen, test_flags);
|
||||
|
||||
if (test_flags & (DBG(TEST_OA_BANDWIDTH)))
|
||||
si_test_oa_bandwidth(sscreen);
|
||||
|
||||
if (test_flags)
|
||||
exit(0);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ enum
|
|||
DBG_TEST_VMFAULT_SHADER,
|
||||
DBG_TEST_DMA_PERF,
|
||||
DBG_TEST_MEM_PERF,
|
||||
DBG_TEST_OA_BANDWIDTH,
|
||||
};
|
||||
|
||||
struct si_screen;
|
||||
|
|
|
|||
|
|
@ -9,6 +9,11 @@
|
|||
|
||||
#include <stdint.h>
|
||||
|
||||
struct si_screen;
|
||||
|
||||
/* si_test_oa_bandwidth.c */
|
||||
void si_test_oa_bandwidth(struct si_screen *sscreen);
|
||||
|
||||
/* si_test_image_copy_region.c */
|
||||
void si_test_image_copy_region(struct si_screen *sscreen);
|
||||
void si_test_blit(struct si_screen *sscreen, unsigned test_flags);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue