ac,radeonsi: rework computing scratch wavesize and tmpring register

To be re-used by RADV.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34549>
This commit is contained in:
Samuel Pitoiset 2025-04-16 13:06:11 +02:00 committed by Marge Bot
parent d94f8b4460
commit e433a57650
6 changed files with 45 additions and 29 deletions

View file

@@ -1030,10 +1030,23 @@ uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shif
(((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask); (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
} }
/* Return the register value and tune bytes_per_wave to increase scratch performance. */ /* Compute the optimal scratch wavesize. */
void ac_get_scratch_tmpring_size(const struct radeon_info *info, uint32_t
unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave)
uint32_t *tmpring_size) {
/* Add 1 scratch item to make the number of items odd. This should improve
* scratch performance by more randomly distributing scratch waves among
* memory channels.
*/
if (bytes_per_wave)
bytes_per_wave |= info->scratch_wavesize_granularity;
return bytes_per_wave;
}
/* Return the scratch register value. */
void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves,
unsigned bytes_per_wave, uint32_t *tmpring_size)
{ {
/* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors. /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors.
* WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE. * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE.
@@ -1049,25 +1062,15 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
* Shaders with SCRATCH_EN=0 don't allocate scratch space. * Shaders with SCRATCH_EN=0 don't allocate scratch space.
*/ */
/* The LLVM shader backend should be reporting aligned scratch_sizes. */ /* The compiler shader backend should be reporting aligned scratch_sizes. */
assert((bytes_per_wave & BITFIELD_MASK(info->scratch_wavesize_granularity_shift)) == 0 && assert((bytes_per_wave & BITFIELD_MASK(info->scratch_wavesize_granularity_shift)) == 0 &&
"scratch size per wave should be aligned"); "scratch size per wave should be aligned");
/* Add 1 scratch item to make the number of items odd. This should improve scratch
* performance by more randomly distributing scratch waves among memory channels.
*/
if (bytes_per_wave)
bytes_per_wave |= info->scratch_wavesize_granularity;
*max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
unsigned max_scratch_waves = info->max_scratch_waves;
if (info->gfx_level >= GFX11) if (info->gfx_level >= GFX11)
max_scratch_waves /= info->max_se; /* WAVES is per SE */ num_scratch_waves /= info->max_se; /* WAVES is per SE */
/* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */ *tmpring_size = S_0286E8_WAVES(num_scratch_waves) |
*tmpring_size = S_0286E8_WAVES(max_scratch_waves) | S_0286E8_WAVESIZE(bytes_per_wave >> info->scratch_wavesize_granularity_shift);
S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> info->scratch_wavesize_granularity_shift);
} }
/* Convert chip-agnostic memory access flags into hw-specific cache flags. /* Convert chip-agnostic memory access flags into hw-specific cache flags.

View file

@@ -301,9 +301,10 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift, uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
const struct radeon_info *info); const struct radeon_info *info);
void ac_get_scratch_tmpring_size(const struct radeon_info *info, uint32_t ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave);
unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
uint32_t *tmpring_size); void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves,
unsigned bytes_per_wave, uint32_t *tmpring_size);
unsigned unsigned
ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage, ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,

View file

@@ -380,9 +380,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
simple_mtx_lock(&shader->selector->mutex); simple_mtx_lock(&shader->selector->mutex);
/* Update max_seen_compute_scratch_bytes_per_wave and compute_tmpring_size. */ /* Update max_seen_compute_scratch_bytes_per_wave and compute_tmpring_size. */
ac_get_scratch_tmpring_size(&sctx->screen->info, si_get_scratch_tmpring_size(sctx, config->scratch_bytes_per_wave,
config->scratch_bytes_per_wave,
&sctx->max_seen_compute_scratch_bytes_per_wave,
&sctx->compute_tmpring_size); &sctx->compute_tmpring_size);
if (!si_setup_compute_scratch_buffer(sctx, shader)) if (!si_setup_compute_scratch_buffer(sctx, shader))

View file

@@ -889,9 +889,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
goto fail; goto fail;
/* Initialize compute_tmpring_size. */ /* Initialize compute_tmpring_size. */
ac_get_scratch_tmpring_size(&sctx->screen->info, 0, si_get_scratch_tmpring_size(sctx, 0, &sctx->compute_tmpring_size);
&sctx->max_seen_compute_scratch_bytes_per_wave,
&sctx->compute_tmpring_size);
return &sctx->b; return &sctx->b;
fail: fail:
@@ -900,6 +898,21 @@ fail:
return NULL; return NULL;
} }
void
si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave,
unsigned *spi_tmpring_size)
{
bytes_per_wave = ac_compute_scratch_wavesize(&sctx->screen->info, bytes_per_wave);
sctx->max_seen_scratch_bytes_per_wave =
MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes_per_wave);
/* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->screen->info.max_scratch_waves,
sctx->max_seen_scratch_bytes_per_wave,
spi_tmpring_size);
}
static bool si_is_resource_busy(struct pipe_screen *screen, struct pipe_resource *resource, static bool si_is_resource_busy(struct pipe_screen *screen, struct pipe_resource *resource,
unsigned usage) unsigned usage)
{ {

View file

@@ -1622,6 +1622,8 @@ struct ac_llvm_compiler *si_create_llvm_compiler(struct si_screen *sscreen);
void si_init_aux_async_compute_ctx(struct si_screen *sscreen); void si_init_aux_async_compute_ctx(struct si_screen *sscreen);
struct si_context *si_get_aux_context(struct si_aux_context *ctx); struct si_context *si_get_aux_context(struct si_aux_context *ctx);
void si_put_aux_context_flush(struct si_aux_context *ctx); void si_put_aux_context_flush(struct si_aux_context *ctx);
void si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave,
unsigned *spi_tmpring_size);
void si_destroy_screen(struct pipe_screen *pscreen); void si_destroy_screen(struct pipe_screen *pscreen);
/* si_perfcounters.c */ /* si_perfcounters.c */

View file

@@ -4493,8 +4493,7 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
{ {
unsigned spi_tmpring_size; unsigned spi_tmpring_size;
ac_get_scratch_tmpring_size(&sctx->screen->info, bytes, si_get_scratch_tmpring_size(sctx, bytes, &spi_tmpring_size);
&sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave *
sctx->screen->info.max_scratch_waves; sctx->screen->info.max_scratch_waves;