From f4fac39ded024c11a0a476ed1023638c8d6578b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 2 Nov 2022 14:34:58 -0400 Subject: [PATCH] radeonsi/gfx11: fix compute scratch buffer - WAVES is always per SE Fixes: ba02ed91a60 - ac/gfx11: fix the scratch buffer Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: (cherry picked from commit bdfacd0a24e023515fb7b7fae4a279cff0fbac4e) --- .pick_status.json | 2 +- src/amd/common/ac_shader_util.c | 6 +++--- src/amd/common/ac_shader_util.h | 2 +- src/gallium/drivers/radeonsi/si_compute.c | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index c379501d8d8..0201dacc34a 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -958,7 +958,7 @@ "description": "radeonsi/gfx11: fix compute scratch buffer - WAVES is always per SE", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": "ba02ed91a60839f2a6dc6a89fd9de1144b0788aa" }, diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index cba42f176e6..fc95f4ac008 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -916,7 +916,7 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl } /* Return the register value and tune bytes_per_wave to increase scratch performance. */ -void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute, +void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, uint32_t *tmpring_size) { @@ -949,8 +949,8 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute, *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave); unsigned max_scratch_waves = info->max_scratch_waves; - if (info->gfx_level >= GFX11 && !compute) - max_scratch_waves /= info->num_se; /* WAVES is per SE for SPI_TMPRING_SIZE. */ + if (info->gfx_level >= GFX11) + max_scratch_waves /= info->num_se; /* WAVES is per SE */ /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */ *tmpring_size = S_0286E8_WAVES(max_scratch_waves) | diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 87996654d93..6552bb8fa14 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -166,7 +166,7 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl unsigned value_shift, const struct radeon_info *info, void set_sh_reg(void*, unsigned, uint32_t)); -void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute, +void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, uint32_t *tmpring_size); diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index f59cf3aed81..a4f75e80b0c 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -547,7 +547,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute } unsigned tmpring_size; - ac_get_scratch_tmpring_size(&sctx->screen->info, true, + ac_get_scratch_tmpring_size(&sctx->screen->info, config->scratch_bytes_per_wave, &sctx->max_seen_compute_scratch_bytes_per_wave, &tmpring_size); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index c2c09185f8a..16012344abd 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4054,7 +4054,7 @@ static bool si_update_scratch_relocs(struct si_context *sctx) bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) { unsigned spi_tmpring_size; - ac_get_scratch_tmpring_size(&sctx->screen->info, false, bytes, + ac_get_scratch_tmpring_size(&sctx->screen->info, bytes, &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size); unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave *