From fa85b4b49ed1771ab7339ccc51ae3de0d366c9df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 16 Jul 2024 03:10:28 -0400 Subject: [PATCH] radeonsi: minor changes at the beginning of si_compute_clear_copy_buffer Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_compute_blit.c | 54 ++++++++++++------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 3546c099ed2..8d9fb7bd9fc 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -263,28 +263,46 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource unsigned dwords_per_thread, bool fail_if_slow) { bool is_copy = src != NULL; + uint32_t tmp_clear_value; - if (src_offset % 4 || dst_offset % 4 || size % 4 || clear_value_size % 4) - return false; + si_improve_sync_flags(sctx, dst, src, &flags); - if (dwords_per_thread) { - /* Validate dwords_per_thread. Only set by the microbenchmark. */ - if (dwords_per_thread > 4) { - assert(!"dwords_per_thread must be <= 4"); - return false; /* invalid value */ - } + if (!is_copy) { + if (util_lower_clearsize_to_dword(clear_value, (int*)&clear_value_size, &tmp_clear_value)) + clear_value = &tmp_clear_value; - if (clear_value_size > dwords_per_thread * 4) { - assert(!"clear_value_size must be <= dwords_per_thread"); - return false; /* invalid value */ - } - } else { + assert(clear_value_size % 4 == 0); + } + + if (!dwords_per_thread) { /* Set default optimal settings. */ - /* Clearing 4 dwords per thread with a 3-dword clear value is slightly faster with big sizes. */ - if (!is_copy && clear_value_size == 12) - dwords_per_thread = size <= 4096 ? 3 : 4; - else - dwords_per_thread = 4; + dwords_per_thread = size <= 64 * 1024 ? 2 : 4; + + if (!is_copy) { + if (clear_value_size == 12) { + /* Clearing 4 dwords per thread with a 3-dword clear value is faster with big sizes. */ + dwords_per_thread = size <= 4096 ? 3 : 4; + } else { + /* dwords_per_thread must be at least the size of the clear value. */ + dwords_per_thread = MAX2(dwords_per_thread, clear_value_size / 4); + } + } + } + + /* Validate dwords_per_thread. */ + if (dwords_per_thread > 4) { + assert(!"dwords_per_thread must be <= 4"); + return false; /* invalid value */ + } + + if (clear_value_size > dwords_per_thread * 4) { + assert(!"clear_value_size must be <= dwords_per_thread"); + return false; /* invalid value */ + } + + if (clear_value_size == 12 && dst_offset % 4) { + assert(!"if clear_value_size == 12, dst_offset must be aligned to 4"); + return false; /* invalid value */ } /* This doesn't fail very often because the only possible fallback is CP DMA, which doesn't