radeonsi: always use the L2 LRU cache policy for faster clears and copies

Waves and CP DMA can finish sooner if L2 doesn't do any evictions, which is hard to predict.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10813>
2026-05-01 14:38:06 +02:00 · 2021-05-07 23:47:23 -04:00 · 2021-05-07 23:47:23 -04:00 · 36e07198a7
commit 36e07198a7
parent 805c785314
3 changed files with 6 additions and 7 deletions
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -62,10 +62,10 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,

   /* Flush caches and wait for idle. */
   if (types & (SI_CLEAR_TYPE_CMASK | SI_CLEAR_TYPE_DCC))
-      sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_STREAM);
+      sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU);

   if (types & SI_CLEAR_TYPE_HTILE)
-      sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_DB_META, L2_STREAM);
+      sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_DB_META, L2_LRU);

   /* Flush caches in case we use compute. */
   sctx->flags |= SI_CONTEXT_INV_VCACHE;
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -27,9 +27,7 @@
 #include "util/format/u_format.h"
 #include "util/format_srgb.h"

-/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
- * and L2_STREAM for src.
- */
+/* Determine the cache policy. */
 static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
                                             uint64_t size)
 {
@@ -37,7 +35,7 @@ static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_co
                                     coher == SI_COHERENCY_DB_META ||
                                     coher == SI_COHERENCY_CP)) ||
       (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
-      return size <= sctx->screen->info.l2_cache_size / 8 ? L2_LRU : L2_STREAM;
+      return L2_LRU; /* it's faster if L2 doesn't evict anything  */

   return L2_BYPASS;
 }
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -64,7 +64,8 @@ extern "C" {
 /* Tunables for compute-based clear_buffer and copy_buffer: */
 #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
 #define SI_COMPUTE_COPY_DW_PER_THREAD  4
-#define SI_COMPUTE_DST_CACHE_POLICY    L2_STREAM
+/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */
+#define SI_COMPUTE_DST_CACHE_POLICY    L2_LRU

 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS  (1 << 0)