mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-04-20 23:00:36 +02:00
radeonsi: update the DMA perf test
- don't test 2 waves/SA - create the compute shader only once per subtest - use only 1 TIME_ELAPSED query per subtest - don't invalidate sL0 (it's not used) - don't invalidate L2 for L2_LRU to test L2 throughput - don't flush the CS after every run - remove unused min/max computation Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7055>
This commit is contained in:
parent
4182fbcb60
commit
7a0162905b
1 changed files with 32 additions and 30 deletions
|
|
@ -45,7 +45,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
|||
struct si_context *sctx = (struct si_context *)ctx;
|
||||
const uint32_t clear_value = 0x12345678;
|
||||
static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
|
||||
static const unsigned cs_waves_per_sh_list[] = {0, 2, 4, 8, 16};
|
||||
static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16};
|
||||
|
||||
#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
|
||||
#define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
|
||||
|
|
@ -142,6 +142,12 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
|||
else
|
||||
printf(" ,");
|
||||
|
||||
void *compute_shader = NULL;
|
||||
if (test_cs) {
|
||||
compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
|
||||
cache_policy == L2_STREAM, is_copy);
|
||||
}
|
||||
|
||||
double score = 0;
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
|
||||
/* Don't test bigger sizes if it's too slow. Print 0. */
|
||||
|
|
@ -152,7 +158,6 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
|||
|
||||
enum pipe_resource_usage dst_usage, src_usage;
|
||||
struct pipe_resource *dst, *src;
|
||||
struct pipe_query *q[NUM_RUNS];
|
||||
unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
|
||||
unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;
|
||||
|
||||
|
|
@ -176,11 +181,17 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
|||
dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
|
||||
src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;
|
||||
|
||||
/* Wait for idle before testing, so that other processes don't mess up the results. */
|
||||
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
|
||||
SI_CONTEXT_FLUSH_AND_INV_CB |
|
||||
SI_CONTEXT_FLUSH_AND_INV_DB;
|
||||
sctx->emit_cache_flush(sctx);
|
||||
|
||||
struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
|
||||
ctx->begin_query(ctx, q);
|
||||
|
||||
/* Run tests. */
|
||||
for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
|
||||
q[iter] = ctx->create_query(ctx, query_type, 0);
|
||||
ctx->begin_query(ctx, q[iter]);
|
||||
|
||||
if (test_cp) {
|
||||
/* CP DMA */
|
||||
if (is_copy) {
|
||||
|
|
@ -210,9 +221,6 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
|||
unsigned num_dwords = size / 4;
|
||||
unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
|
||||
|
||||
void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
|
||||
cache_policy == L2_STREAM, is_copy);
|
||||
|
||||
struct pipe_grid_info info = {};
|
||||
info.block[0] = MIN2(64, num_instructions);
|
||||
info.block[1] = 1;
|
||||
|
|
@ -233,48 +241,39 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
|||
sctx->cs_user_data[i] = clear_value;
|
||||
}
|
||||
|
||||
sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;
|
||||
|
||||
ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
|
||||
ctx->bind_compute_state(ctx, cs);
|
||||
ctx->bind_compute_state(ctx, compute_shader);
|
||||
sctx->cs_max_waves_per_sh = cs_waves_per_sh;
|
||||
|
||||
ctx->launch_grid(ctx, &info);
|
||||
|
||||
ctx->bind_compute_state(ctx, NULL);
|
||||
ctx->delete_compute_state(ctx, cs);
|
||||
sctx->cs_max_waves_per_sh = 0; /* disable the limit */
|
||||
|
||||
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
|
||||
}
|
||||
|
||||
/* Flush L2, so that we don't just test L2 cache performance. */
|
||||
/* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
|
||||
if (!test_sdma) {
|
||||
sctx->flags |= SI_CONTEXT_WB_L2;
|
||||
sctx->flags |= SI_CONTEXT_INV_VCACHE |
|
||||
(cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
|
||||
SI_CONTEXT_CS_PARTIAL_FLUSH;
|
||||
sctx->emit_cache_flush(sctx);
|
||||
}
|
||||
|
||||
ctx->end_query(ctx, q[iter]);
|
||||
ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
|
||||
}
|
||||
|
||||
ctx->end_query(ctx, q);
|
||||
ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
|
||||
|
||||
pipe_resource_reference(&dst, NULL);
|
||||
pipe_resource_reference(&src, NULL);
|
||||
|
||||
/* Get results. */
|
||||
uint64_t min = ~0ull, max = 0, total = 0;
|
||||
|
||||
for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
|
||||
union pipe_query_result result;
|
||||
union pipe_query_result result;
|
||||
|
||||
ctx->get_query_result(ctx, q[iter], true, &result);
|
||||
ctx->destroy_query(ctx, q[iter]);
|
||||
ctx->get_query_result(ctx, q, true, &result);
|
||||
ctx->destroy_query(ctx, q);
|
||||
|
||||
min = MIN2(min, result.u64);
|
||||
max = MAX2(max, result.u64);
|
||||
total += result.u64;
|
||||
}
|
||||
|
||||
score = get_MBps_rate(size, total / (double)NUM_RUNS);
|
||||
score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS);
|
||||
printf("%7.0f ,", score);
|
||||
fflush(stdout);
|
||||
|
||||
|
|
@ -290,6 +289,9 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
|||
r->index = method;
|
||||
}
|
||||
puts("");
|
||||
|
||||
if (compute_shader)
|
||||
ctx->delete_compute_state(ctx, compute_shader);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue