radeonsi: always use the L2 LRU cache policy for faster clears and copies

Waves and CP DMA can finish sooner if L2 doesn't have to do any evictions,
and whether evictions will occur is hard to predict.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10813>
This commit is contained in:
Marek Olšák 2021-05-07 23:47:23 -04:00 committed by Marge Bot
parent 805c785314
commit 36e07198a7
3 changed files with 6 additions and 7 deletions

View file

@@ -62,10 +62,10 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
/* Flush caches and wait for idle. */
if (types & (SI_CLEAR_TYPE_CMASK | SI_CLEAR_TYPE_DCC))
sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_STREAM);
sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU);
if (types & SI_CLEAR_TYPE_HTILE)
sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_DB_META, L2_STREAM);
sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_DB_META, L2_LRU);
/* Flush caches in case we use compute. */
sctx->flags |= SI_CONTEXT_INV_VCACHE;

View file

@@ -27,9 +27,7 @@
#include "util/format/u_format.h"
#include "util/format_srgb.h"
/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
* and L2_STREAM for src.
*/
/* Determine the cache policy. */
static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
uint64_t size)
{
@@ -37,7 +35,7 @@ static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_co
coher == SI_COHERENCY_DB_META ||
coher == SI_COHERENCY_CP)) ||
(sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
return size <= sctx->screen->info.l2_cache_size / 8 ? L2_LRU : L2_STREAM;
return L2_LRU; /* it's faster if L2 doesn't evict anything */
return L2_BYPASS;
}

View file

@@ -64,7 +64,8 @@ extern "C" {
/* Tunables for compute-based clear_buffer and copy_buffer: */
#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
#define SI_COMPUTE_COPY_DW_PER_THREAD 4
#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM
/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */
#define SI_COMPUTE_DST_CACHE_POLICY L2_LRU
/* Pipeline & streamout query controls. */
#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)