ac/gpu_info,radv: add scratch_wavesize_granularity info

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34549>
This commit is contained in:
Samuel Pitoiset 2025-04-16 12:18:12 +02:00 committed by Marge Bot
parent 82dda21806
commit d94f8b4460
5 changed files with 18 additions and 14 deletions

View file

@@ -1550,6 +1550,10 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
info->has_dedicated_vram &&
info->drm_minor >= 47;
/* Compute the scratch WAVESIZE granularity in bytes. */
info->scratch_wavesize_granularity_shift = info->gfx_level >= GFX11 ? 8 : 10;
info->scratch_wavesize_granularity = BITFIELD_BIT(info->scratch_wavesize_granularity_shift);
/* The maximum number of scratch waves. The number is only a function of the number of CUs.
* It should be large enough to hold at least 1 threadgroup. Use the minimum per-SA CU count.
*

View file

@@ -278,6 +278,8 @@ struct radeon_info {
uint32_t min_wave64_vgpr_alloc;
uint32_t max_vgpr_alloc;
uint32_t wave64_vgpr_alloc_granularity;
uint32_t scratch_wavesize_granularity_shift;
uint32_t scratch_wavesize_granularity;
uint32_t max_scratch_waves;
bool has_scratch_base_registers;

View file

@@ -1048,18 +1048,16 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
*
* Shaders with SCRATCH_EN=0 don't allocate scratch space.
*/
const unsigned size_shift = info->gfx_level >= GFX11 ? 8 : 10;
const unsigned min_size_per_wave = BITFIELD_BIT(size_shift);
/* The LLVM shader backend should be reporting aligned scratch_sizes. */
assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 &&
assert((bytes_per_wave & BITFIELD_MASK(info->scratch_wavesize_granularity_shift)) == 0 &&
"scratch size per wave should be aligned");
/* Add 1 scratch item to make the number of items odd. This should improve scratch
* performance by more randomly distributing scratch waves among memory channels.
*/
if (bytes_per_wave)
bytes_per_wave |= min_size_per_wave;
bytes_per_wave |= info->scratch_wavesize_granularity;
*max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
@@ -1069,7 +1067,7 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
/* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
*tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> info->scratch_wavesize_granularity_shift);
}
/* Convert chip-agnostic memory access flags into hw-specific cache flags.

View file

@@ -12391,10 +12391,8 @@ radv_emit_rt_stack_size(struct radv_cmd_buffer *cmd_buffer)
uint32_t scratch_bytes_per_wave = rt_prolog->config.scratch_bytes_per_wave;
const uint32_t wave_size = rt_prolog->info.wave_size;
/* The hardware register is specified as a multiple of 64 or 256 DWORDS. */
const unsigned scratch_alloc_granule = pdev->info.gfx_level >= GFX11 ? 256 : 1024;
scratch_bytes_per_wave += align(cmd_buffer->state.rt_stack_size * wave_size, scratch_alloc_granule);
scratch_bytes_per_wave +=
align(cmd_buffer->state.rt_stack_size * wave_size, pdev->info.scratch_wavesize_granularity);
cmd_buffer->compute_scratch_size_per_wave_needed =
MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);

View file

@@ -507,12 +507,14 @@ radv_emit_graphics_scratch(struct radv_device *device, struct radeon_cmdbuf *cs,
waves /= gpu_info->max_se;
radeon_set_context_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3);
radeon_emit(S_0286E8_WAVES(waves) | S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, 256)));
radeon_emit(S_0286E8_WAVES(waves) |
S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->scratch_wavesize_granularity)));
radeon_emit(va >> 8); /* SPI_GFX_SCRATCH_BASE_LO */
radeon_emit(va >> 40); /* SPI_GFX_SCRATCH_BASE_HI */
} else {
radeon_set_context_reg(R_0286E8_SPI_TMPRING_SIZE,
S_0286E8_WAVES(waves) | S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, 1024)));
S_0286E8_WAVES(waves) |
S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->scratch_wavesize_granularity)));
}
radeon_end();
@@ -554,9 +556,9 @@ radv_emit_compute_scratch(struct radv_device *device, struct radeon_cmdbuf *cs,
radeon_emit(scratch_va);
radeon_emit(rsrc1);
radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
S_00B860_WAVES(waves) |
S_00B860_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->gfx_level >= GFX11 ? 256 : 1024)));
radeon_set_sh_reg(
R_00B860_COMPUTE_TMPRING_SIZE,
S_00B860_WAVES(waves) | S_00B860_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->scratch_wavesize_granularity)));
radeon_end();
}