ac/gpu_info,radv: add scratch_wavesize_granularity info

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34549>
2026-05-05 18:18:06 +02:00 · 2025-04-16 12:18:12 +02:00 · 2025-04-16 12:18:12 +02:00 · d94f8b4460
commit d94f8b4460
parent 82dda21806
5 changed files with 18 additions and 14 deletions
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -1550,6 +1550,10 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
                                       info->has_dedicated_vram &&
                                       info->drm_minor >= 47;

+   /* Compute the scratch WAVESIZE granularity in bytes. */
+   info->scratch_wavesize_granularity_shift = info->gfx_level >= GFX11 ? 8 : 10;
+   info->scratch_wavesize_granularity = BITFIELD_BIT(info->scratch_wavesize_granularity_shift);
+
   /* The maximum number of scratch waves. The number is only a function of the number of CUs.
    * It should be large enough to hold at least 1 threadgroup. Use the minimum per-SA CU count.
    *
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -278,6 +278,8 @@ struct radeon_info {
   uint32_t min_wave64_vgpr_alloc;
   uint32_t max_vgpr_alloc;
   uint32_t wave64_vgpr_alloc_granularity;
+   uint32_t scratch_wavesize_granularity_shift;
+   uint32_t scratch_wavesize_granularity;
   uint32_t max_scratch_waves;
   bool has_scratch_base_registers;

--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -1048,18 +1048,16 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
    *
    * Shaders with SCRATCH_EN=0 don't allocate scratch space.
    */
-   const unsigned size_shift = info->gfx_level >= GFX11 ? 8 : 10;
-   const unsigned min_size_per_wave = BITFIELD_BIT(size_shift);

   /* The LLVM shader backend should be reporting aligned scratch_sizes. */
-   assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 &&
+   assert((bytes_per_wave & BITFIELD_MASK(info->scratch_wavesize_granularity_shift)) == 0 &&
          "scratch size per wave should be aligned");

   /* Add 1 scratch item to make the number of items odd. This should improve scratch
    * performance by more randomly distributing scratch waves among memory channels.
    */
   if (bytes_per_wave)
-      bytes_per_wave |= min_size_per_wave;
+      bytes_per_wave |= info->scratch_wavesize_granularity;

   *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);

@@ -1069,7 +1067,7 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,

   /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
   *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
-                   S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
+                   S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> info->scratch_wavesize_granularity_shift);
 }

 /* Convert chip-agnostic memory access flags into hw-specific cache flags.
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -12391,10 +12391,8 @@ radv_emit_rt_stack_size(struct radv_cmd_buffer *cmd_buffer)
   uint32_t scratch_bytes_per_wave = rt_prolog->config.scratch_bytes_per_wave;
   const uint32_t wave_size = rt_prolog->info.wave_size;

-   /* The hardware register is specified as a multiple of 64 or 256 DWORDS. */
-   const unsigned scratch_alloc_granule = pdev->info.gfx_level >= GFX11 ? 256 : 1024;
-
-   scratch_bytes_per_wave += align(cmd_buffer->state.rt_stack_size * wave_size, scratch_alloc_granule);
+   scratch_bytes_per_wave +=
+      align(cmd_buffer->state.rt_stack_size * wave_size, pdev->info.scratch_wavesize_granularity);

   cmd_buffer->compute_scratch_size_per_wave_needed =
      MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
--- a/src/amd/vulkan/radv_queue.c
+++ b/src/amd/vulkan/radv_queue.c
@@ -507,12 +507,14 @@ radv_emit_graphics_scratch(struct radv_device *device, struct radeon_cmdbuf *cs,
      waves /= gpu_info->max_se;

      radeon_set_context_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3);
-      radeon_emit(S_0286E8_WAVES(waves) | S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, 256)));
+      radeon_emit(S_0286E8_WAVES(waves) |
+                  S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->scratch_wavesize_granularity)));
      radeon_emit(va >> 8);  /* SPI_GFX_SCRATCH_BASE_LO */
      radeon_emit(va >> 40); /* SPI_GFX_SCRATCH_BASE_HI */
   } else {
      radeon_set_context_reg(R_0286E8_SPI_TMPRING_SIZE,
-                             S_0286E8_WAVES(waves) | S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, 1024)));
+                             S_0286E8_WAVES(waves) |
+                                S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->scratch_wavesize_granularity)));
   }

   radeon_end();
@@ -554,9 +556,9 @@ radv_emit_compute_scratch(struct radv_device *device, struct radeon_cmdbuf *cs,
   radeon_emit(scratch_va);
   radeon_emit(rsrc1);

-   radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
-                     S_00B860_WAVES(waves) |
-                        S_00B860_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->gfx_level >= GFX11 ? 256 : 1024)));
+   radeon_set_sh_reg(
+      R_00B860_COMPUTE_TMPRING_SIZE,
+      S_00B860_WAVES(waves) | S_00B860_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->scratch_wavesize_granularity)));

   radeon_end();
 }