From e433a57650c1a95e17e05ed58e3069cfb8664db6 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 16 Apr 2025 13:06:11 +0200
Subject: [PATCH] ac,radeonsi: rework computing scratch wavesize and tmpring
 register

Split the wavesize computation from the TMPRING_SIZE register packing so that both helpers can be re-used by RADV.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34549>
---
 src/amd/common/ac_shader_util.c               | 39 ++++++++++---------
 src/amd/common/ac_shader_util.h               |  7 ++--
 src/gallium/drivers/radeonsi/si_compute.c     |  4 +-
 src/gallium/drivers/radeonsi/si_pipe.c        | 19 +++++++--
 src/gallium/drivers/radeonsi/si_pipe.h        |  2 +
 .../drivers/radeonsi/si_state_shaders.cpp     |  3 +-
 6 files changed, 45 insertions(+), 29 deletions(-)

diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c
index 40c854f41c8..ae142488c1a 100644
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -1030,10 +1030,23 @@ uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shif
           (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
 }
 
-/* Return the register value and tune bytes_per_wave to increase scratch performance. */
-void ac_get_scratch_tmpring_size(const struct radeon_info *info,
-                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
-                                 uint32_t *tmpring_size)
+/* Return the scratch wavesize to use for a shader that needs bytes_per_wave bytes. */
+uint32_t
+ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave)
+{
+   /* Zero stays zero: nothing is added when no scratch is requested. */
+   if (!bytes_per_wave)
+      return 0;
+
+   /* OR in one granule so that the number of scratch items becomes odd, which
+    * should spread scratch waves more randomly among memory channels.
+    */
+   return bytes_per_wave | info->scratch_wavesize_granularity;
+}
+
+/* Pack WAVES and WAVESIZE into the SPI_TMPRING_SIZE/COMPUTE_TMPRING_SIZE register value. */
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves,
+                                 unsigned bytes_per_wave, uint32_t *tmpring_size)
 {
    /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors.
     * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE.
@@ -1049,25 +1062,15 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
     * Shaders with SCRATCH_EN=0 don't allocate scratch space.
     */
 
-   /* The LLVM shader backend should be reporting aligned scratch_sizes. */
+   /* The compiler shader backend should be reporting aligned scratch_sizes. */
    assert((bytes_per_wave & BITFIELD_MASK(info->scratch_wavesize_granularity_shift)) == 0 &&
           "scratch size per wave should be aligned");
 
-   /* Add 1 scratch item to make the number of items odd. This should improve scratch
-    * performance by more randomly distributing scratch waves among memory channels.
-    */
-   if (bytes_per_wave)
-      bytes_per_wave |= info->scratch_wavesize_granularity;
-
-   *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
-
-   unsigned max_scratch_waves = info->max_scratch_waves;
    if (info->gfx_level >= GFX11)
-      max_scratch_waves /= info->max_se; /* WAVES is per SE */
+      num_scratch_waves /= info->max_se; /* WAVES is per SE */
 
-   /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
-   *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
-                   S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> info->scratch_wavesize_granularity_shift);
+   *tmpring_size = S_0286E8_WAVES(num_scratch_waves) |
+                   S_0286E8_WAVESIZE(bytes_per_wave >> info->scratch_wavesize_granularity_shift);
 }
 
 /* Convert chip-agnostic memory access flags into hw-specific cache flags.
diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h
index 35759fb60f2..d1936ce2540 100644
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@@ -301,9 +301,10 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
 uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
                         const struct radeon_info *info);
 
-void ac_get_scratch_tmpring_size(const struct radeon_info *info,
-                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
-                                 uint32_t *tmpring_size);
+uint32_t ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave);
+
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves,
+                                 unsigned bytes_per_wave, uint32_t *tmpring_size);
 
 unsigned
 ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 892dd89be56..c70112b843a 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -380,9 +380,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
          simple_mtx_lock(&shader->selector->mutex);
 
       /* Update max_seen_compute_scratch_bytes_per_wave and compute_tmpring_size. */
-      ac_get_scratch_tmpring_size(&sctx->screen->info,
-                                  config->scratch_bytes_per_wave,
-                                  &sctx->max_seen_compute_scratch_bytes_per_wave,
+      si_get_scratch_tmpring_size(sctx, config->scratch_bytes_per_wave,
                                   &sctx->compute_tmpring_size);
 
       if (!si_setup_compute_scratch_buffer(sctx, shader))
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 1a69bd45f89..882b8497b71 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -889,9 +889,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
       goto fail;
 
    /* Initialize compute_tmpring_size. */
-   ac_get_scratch_tmpring_size(&sctx->screen->info, 0,
-                               &sctx->max_seen_compute_scratch_bytes_per_wave,
-                               &sctx->compute_tmpring_size);
+   si_get_scratch_tmpring_size(sctx, 0,  &sctx->compute_tmpring_size);
 
    return &sctx->b;
 fail:
@@ -900,6 +898,21 @@ fail:
    return NULL;
 }
 
+/* Track the largest scratch size seen (compute and gfx separately), return TMPRING_SIZE. */
+void si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave,
+                                 unsigned *spi_tmpring_size)
+{
+   const struct radeon_info *info = &sctx->screen->info;
+   /* Compute has its own high-water mark; it is recognized by its output register. */
+   unsigned *max_seen = spi_tmpring_size == &sctx->compute_tmpring_size
+                           ? &sctx->max_seen_compute_scratch_bytes_per_wave
+                           : &sctx->max_seen_scratch_bytes_per_wave;
+   bytes_per_wave = ac_compute_scratch_wavesize(info, bytes_per_wave);
+   *max_seen = MAX2(*max_seen, bytes_per_wave);
+   /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
+   ac_get_scratch_tmpring_size(info, info->max_scratch_waves, *max_seen, spi_tmpring_size);
+}
+
 static bool si_is_resource_busy(struct pipe_screen *screen, struct pipe_resource *resource,
                                 unsigned usage)
 {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index d4534bb9a57..a4501bda8b2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1622,6 +1622,8 @@ struct ac_llvm_compiler *si_create_llvm_compiler(struct si_screen *sscreen);
 void si_init_aux_async_compute_ctx(struct si_screen *sscreen);
 struct si_context *si_get_aux_context(struct si_aux_context *ctx);
 void si_put_aux_context_flush(struct si_aux_context *ctx);
+void si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave,
+                                 unsigned *spi_tmpring_size);
 void si_destroy_screen(struct pipe_screen *pscreen);
 
 /* si_perfcounters.c */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 9a6bb0fa1fd..6459de467ea 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -4493,8 +4493,7 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
 bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
 {
    unsigned spi_tmpring_size;
-   ac_get_scratch_tmpring_size(&sctx->screen->info, bytes,
-                               &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
+   si_get_scratch_tmpring_size(sctx, bytes, &spi_tmpring_size);
 
    unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave *
                                   sctx->screen->info.max_scratch_waves;