From e433a57650c1a95e17e05ed58e3069cfb8664db6 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 16 Apr 2025 13:06:11 +0200 Subject: [PATCH] ac,radeonsi: rework computing scratch wavesize and tmpring register To be re-used by RADV. Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/common/ac_shader_util.c | 39 ++++++++++--------- src/amd/common/ac_shader_util.h | 7 ++-- src/gallium/drivers/radeonsi/si_compute.c | 4 +- src/gallium/drivers/radeonsi/si_pipe.c | 19 +++++++-- src/gallium/drivers/radeonsi/si_pipe.h | 2 + .../drivers/radeonsi/si_state_shaders.cpp | 3 +- 6 files changed, 45 insertions(+), 29 deletions(-) diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 40c854f41c8..ae142488c1a 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -1030,10 +1030,23 @@ uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shif (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask); } -/* Return the register value and tune bytes_per_wave to increase scratch performance. */ -void ac_get_scratch_tmpring_size(const struct radeon_info *info, - unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, - uint32_t *tmpring_size) +/* Compute the optimal scratch wavesize. */ +uint32_t +ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave) +{ + /* Add 1 scratch item to make the number of items odd. This should improve + * scratch performance by more randomly distributing scratch waves among + * memory channels. + */ + if (bytes_per_wave) + bytes_per_wave |= info->scratch_wavesize_granularity; + + return bytes_per_wave; +} + +/* Return the scratch register value. */ +void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves, + unsigned bytes_per_wave, uint32_t *tmpring_size) { /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors. * WAVES means NUM_RECORDS. 
WAVESIZE is the size of each element, meaning STRIDE. @@ -1049,25 +1062,15 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info, * Shaders with SCRATCH_EN=0 don't allocate scratch space. */ - /* The LLVM shader backend should be reporting aligned scratch_sizes. */ + /* The compiler shader backend should be reporting aligned scratch_sizes. */ assert((bytes_per_wave & BITFIELD_MASK(info->scratch_wavesize_granularity_shift)) == 0 && "scratch size per wave should be aligned"); - /* Add 1 scratch item to make the number of items odd. This should improve scratch - * performance by more randomly distributing scratch waves among memory channels. - */ - if (bytes_per_wave) - bytes_per_wave |= info->scratch_wavesize_granularity; - - *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave); - - unsigned max_scratch_waves = info->max_scratch_waves; if (info->gfx_level >= GFX11) - max_scratch_waves /= info->max_se; /* WAVES is per SE */ + num_scratch_waves /= info->max_se; /* WAVES is per SE */ - /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */ - *tmpring_size = S_0286E8_WAVES(max_scratch_waves) | - S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> info->scratch_wavesize_granularity_shift); + *tmpring_size = S_0286E8_WAVES(num_scratch_waves) | + S_0286E8_WAVESIZE(bytes_per_wave >> info->scratch_wavesize_granularity_shift); } /* Convert chip-agnostic memory access flags into hw-specific cache flags. 
diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 35759fb60f2..d1936ce2540 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -301,9 +301,10 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift, const struct radeon_info *info); -void ac_get_scratch_tmpring_size(const struct radeon_info *info, - unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, - uint32_t *tmpring_size); +uint32_t ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave); + +void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves, + unsigned bytes_per_wave, uint32_t *tmpring_size); unsigned ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage, diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 892dd89be56..c70112b843a 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -380,9 +380,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute simple_mtx_lock(&shader->selector->mutex); /* Update max_seen_compute_scratch_bytes_per_wave and compute_tmpring_size. */ - ac_get_scratch_tmpring_size(&sctx->screen->info, - config->scratch_bytes_per_wave, - &sctx->max_seen_compute_scratch_bytes_per_wave, + si_get_scratch_tmpring_size(sctx, config->scratch_bytes_per_wave, &sctx->compute_tmpring_size); if (!si_setup_compute_scratch_buffer(sctx, shader)) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 1a69bd45f89..882b8497b71 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -889,9 +889,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign goto fail; /* Initialize compute_tmpring_size. 
*/ - ac_get_scratch_tmpring_size(&sctx->screen->info, 0, - &sctx->max_seen_compute_scratch_bytes_per_wave, - &sctx->compute_tmpring_size); + si_get_scratch_tmpring_size(sctx, 0, &sctx->compute_tmpring_size); return &sctx->b; fail: @@ -900,6 +898,27 @@ fail: return NULL; } +void +si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave, + unsigned *spi_tmpring_size) +{ + /* Compute callers pass &sctx->compute_tmpring_size and previously tracked their own + * maximum; keep it separate from the graphics maximum instead of merging both. + */ + unsigned *max_seen_bytes_per_wave = + spi_tmpring_size == &sctx->compute_tmpring_size ? + &sctx->max_seen_compute_scratch_bytes_per_wave : + &sctx->max_seen_scratch_bytes_per_wave; + + bytes_per_wave = ac_compute_scratch_wavesize(&sctx->screen->info, bytes_per_wave); + + *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave); + + /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */ + ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->screen->info.max_scratch_waves, + *max_seen_bytes_per_wave, spi_tmpring_size); +} + static bool si_is_resource_busy(struct pipe_screen *screen, struct pipe_resource *resource, unsigned usage) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index d4534bb9a57..a4501bda8b2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1622,6 +1622,8 @@ struct ac_llvm_compiler *si_create_llvm_compiler(struct si_screen *sscreen); void si_init_aux_async_compute_ctx(struct si_screen *sscreen); struct si_context *si_get_aux_context(struct si_aux_context *ctx); void si_put_aux_context_flush(struct si_aux_context *ctx); +void si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave, + unsigned *spi_tmpring_size); void si_destroy_screen(struct pipe_screen *pscreen); /* si_perfcounters.c */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 9a6bb0fa1fd..6459de467ea 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4493,8 +4493,7 @@ static bool si_update_scratch_relocs(struct si_context *sctx) bool
si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) { unsigned spi_tmpring_size; - ac_get_scratch_tmpring_size(&sctx->screen->info, bytes, - &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size); + si_get_scratch_tmpring_size(sctx, bytes, &spi_tmpring_size); unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->screen->info.max_scratch_waves;