ac,radeonsi: rework computing scratch wavesize and tmpring register
To be re-used by RADV.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34549>
parent d94f8b4460 · commit e433a57650
6 changed files with 45 additions and 29 deletions
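At a glance, the patch splits the old ac_get_scratch_tmpring_size() into two helpers: ac_compute_scratch_wavesize() pads the per-wave scratch size, while ac_get_scratch_tmpring_size() now only packs the WAVES/WAVESIZE register fields from values the caller supplies. Below is a minimal sketch of the resulting call pattern as a driver might use it; the compute_tmpring() wrapper and its bookkeeping pointer are hypothetical and not part of the patch, while MAX2, struct radeon_info and the two ac_* helpers come from the headers the diff touches.

/* Hypothetical caller-side sketch of the reworked helpers; not from the patch. */
static uint32_t compute_tmpring(const struct radeon_info *info,
                                uint32_t scratch_bytes_per_wave,
                                uint32_t *max_seen_bytes_per_wave)
{
   uint32_t tmpring = 0;

   /* Pad the wavesize so the scratch item count is odd (better channel spread). */
   uint32_t wavesize = ac_compute_scratch_wavesize(info, scratch_bytes_per_wave);

   /* Tracking the running maximum is now the caller's job, not the helper's. */
   *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, wavesize);

   /* Pack WAVES/WAVESIZE; the wave count is also chosen by the caller now. */
   ac_get_scratch_tmpring_size(info, info->max_scratch_waves,
                               *max_seen_bytes_per_wave, &tmpring);
   return tmpring;
}

This mirrors the si_get_scratch_tmpring_size() wrapper that the radeonsi part of the diff adds further below.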
@@ -1030,10 +1030,23 @@ uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
           (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
 }
 
-/* Return the register value and tune bytes_per_wave to increase scratch performance. */
-void ac_get_scratch_tmpring_size(const struct radeon_info *info,
-                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
-                                 uint32_t *tmpring_size)
+/* Compute the optimal scratch wavesize. */
+uint32_t
+ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave)
+{
+   /* Add 1 scratch item to make the number of items odd. This should improve
+    * scratch performance by more randomly distributing scratch waves among
+    * memory channels.
+    */
+   if (bytes_per_wave)
+      bytes_per_wave |= info->scratch_wavesize_granularity;
+
+   return bytes_per_wave;
+}
+
+/* Return the scratch register value. */
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves,
+                                 unsigned bytes_per_wave, uint32_t *tmpring_size)
 {
    /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors.
     * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE.
@@ -1049,25 +1062,15 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
     * Shaders with SCRATCH_EN=0 don't allocate scratch space.
     */
 
-   /* The LLVM shader backend should be reporting aligned scratch_sizes. */
+   /* The compiler shader backend should be reporting aligned scratch_sizes. */
    assert((bytes_per_wave & BITFIELD_MASK(info->scratch_wavesize_granularity_shift)) == 0 &&
           "scratch size per wave should be aligned");
 
-   /* Add 1 scratch item to make the number of items odd. This should improve scratch
-    * performance by more randomly distributing scratch waves among memory channels.
-    */
-   if (bytes_per_wave)
-      bytes_per_wave |= info->scratch_wavesize_granularity;
-
-   *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
-
-   unsigned max_scratch_waves = info->max_scratch_waves;
    if (info->gfx_level >= GFX11)
-      max_scratch_waves /= info->max_se; /* WAVES is per SE */
+      num_scratch_waves /= info->max_se; /* WAVES is per SE */
 
-   /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
-   *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
-                   S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> info->scratch_wavesize_granularity_shift);
+   *tmpring_size = S_0286E8_WAVES(num_scratch_waves) |
+                   S_0286E8_WAVESIZE(bytes_per_wave >> info->scratch_wavesize_granularity_shift);
 }
 
 /* Convert chip-agnostic memory access flags into hw-specific cache flags.
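To make the padding in ac_compute_scratch_wavesize() concrete: OR-ing in one granularity's worth of bytes sets the low bit of the item count (the WAVESIZE field in granularity units), so the count becomes odd and scratch waves land more evenly across memory channels. A standalone example, assuming a 256-byte granularity purely for illustration (the real value comes from radeon_info):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Hypothetical granularity for illustration only. */
   const uint32_t granularity_shift = 8;
   const uint32_t granularity = 1u << granularity_shift; /* 256 bytes per scratch item */

   uint32_t bytes_per_wave = 8192; /* 32 items: an even count */
   bytes_per_wave |= granularity;  /* 8448 bytes: 33 items, now odd */

   assert((bytes_per_wave >> granularity_shift) % 2 == 1);
   return 0;
}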
@@ -301,9 +301,10 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
 uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
                         const struct radeon_info *info);
 
-void ac_get_scratch_tmpring_size(const struct radeon_info *info,
-                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
-                                 uint32_t *tmpring_size);
+uint32_t ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave);
+
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves,
+                                 unsigned bytes_per_wave, uint32_t *tmpring_size);
 
 unsigned
 ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
@@ -380,9 +380,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
    simple_mtx_lock(&shader->selector->mutex);
 
    /* Update max_seen_compute_scratch_bytes_per_wave and compute_tmpring_size. */
-   ac_get_scratch_tmpring_size(&sctx->screen->info,
-                               config->scratch_bytes_per_wave,
-                               &sctx->max_seen_compute_scratch_bytes_per_wave,
+   si_get_scratch_tmpring_size(sctx, config->scratch_bytes_per_wave,
                                &sctx->compute_tmpring_size);
 
    if (!si_setup_compute_scratch_buffer(sctx, shader))
@@ -889,9 +889,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
       goto fail;
 
    /* Initialize compute_tmpring_size. */
-   ac_get_scratch_tmpring_size(&sctx->screen->info, 0,
-                               &sctx->max_seen_compute_scratch_bytes_per_wave,
-                               &sctx->compute_tmpring_size);
+   si_get_scratch_tmpring_size(sctx, 0, &sctx->compute_tmpring_size);
 
    return &sctx->b;
 fail:
@@ -900,6 +898,21 @@ fail:
    return NULL;
 }
 
+void
+si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave,
+                            unsigned *spi_tmpring_size)
+{
+   bytes_per_wave = ac_compute_scratch_wavesize(&sctx->screen->info, bytes_per_wave);
+
+   sctx->max_seen_scratch_bytes_per_wave =
+      MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes_per_wave);
+
+   /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
+   ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->screen->info.max_scratch_waves,
+                               sctx->max_seen_scratch_bytes_per_wave,
+                               spi_tmpring_size);
+}
+
 static bool si_is_resource_busy(struct pipe_screen *screen, struct pipe_resource *resource,
                                 unsigned usage)
 {
@@ -1622,6 +1622,8 @@ struct ac_llvm_compiler *si_create_llvm_compiler(struct si_screen *sscreen);
 void si_init_aux_async_compute_ctx(struct si_screen *sscreen);
 struct si_context *si_get_aux_context(struct si_aux_context *ctx);
 void si_put_aux_context_flush(struct si_aux_context *ctx);
+void si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave,
+                                 unsigned *spi_tmpring_size);
 void si_destroy_screen(struct pipe_screen *pscreen);
 
 /* si_perfcounters.c */
@@ -4493,8 +4493,7 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
 bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
 {
    unsigned spi_tmpring_size;
-   ac_get_scratch_tmpring_size(&sctx->screen->info, bytes,
-                               &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
+   si_get_scratch_tmpring_size(sctx, bytes, &spi_tmpring_size);
 
    unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave *
                                   sctx->screen->info.max_scratch_waves;