From 434eddd422a751729c2e6558ee058529d5e79858 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Thu, 22 Aug 2024 06:38:01 -0400
Subject: [PATCH] radeonsi: tweak si_test_dma_perf for better experience

Reviewed-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 .../drivers/radeonsi/si_test_dma_perf.c       | 22 ++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
index ec4b5a9c6dc..4859dcef662 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -187,7 +187,7 @@ void si_test_dma_perf(struct si_screen *sscreen)

                /* Don't test large sizes with GTT because it's slow. */
                if ((dst_usage == PIPE_USAGE_STREAM || src_usage == PIPE_USAGE_STREAM) &&
-                   size > 32 * 1024 * 1024) {
+                   size > 16 * 1024 * 1024) {
                   printf("%8s,", "n/a");
                   continue;
                }
@@ -222,18 +222,27 @@ void si_test_dma_perf(struct si_screen *sscreen)
                      }

                      if (is_copy) {
+                        /* CP DMA copies are about as slow as PCIe on GFX6-8. */
+                        if (sctx->gfx_level <= GFX8 && size > 16 * 1024 * 1024) {
+                           success = false;
+                           continue;
+                        }
                         si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
-                                              SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_SHADER, L2_LRU);
+                                              SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_SHADER,
+                                              sctx->gfx_level >= GFX7 ? L2_LRU : L2_BYPASS);
                      } else {
                         /* CP DMA clears must be aligned to 4 bytes. */
-                        if (dst_offset % 4 || size % 4) {
+                        if (dst_offset % 4 || size % 4 ||
+                            /* CP DMA clears are so slow on GFX6-8 that we risk getting a GPU timeout. */
+                            (sctx->gfx_level <= GFX8 && size > 512 * 1024)) {
                            success = false;
                            continue;
                         }
                         assert(clear_value_size == 4);
                         si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, dst_offset, size,
                                                clear_value[0], SI_OP_SYNC_BEFORE_AFTER,
-                                               SI_COHERENCY_SHADER, L2_LRU);
+                                               SI_COHERENCY_SHADER,
+                                               sctx->gfx_level >= GFX7 ? L2_LRU : L2_BYPASS);
                      }
                   } else {
                      /* Compute */
@@ -258,7 +267,10 @@ void si_test_dma_perf(struct si_screen *sscreen)
                ctx->get_query_result(ctx, q, true, &result);
                ctx->destroy_query(ctx, q);

-               if (success) {
+               /* Navi10 and Vega10 sometimes incorrectly return elapsed time of 0 nanoseconds
+                * for very small ops.
+                */
+               if (success && result.u64) {
                   double GB = 1024.0 * 1024.0 * 1024.0;
                   double seconds = result.u64 / (double)NUM_RUNS / (1000.0 * 1000.0 * 1000.0);
                   double GBps = (size / GB) / seconds * (test_flavor == TEST_COPY_VRAM_VRAM ? 2 : 1);