ac,radeonsi: rework computing scratch wavesize and tmpring register
To be re-used by RADV.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34549>
parent d94f8b4460 · commit e433a57650
6 changed files with 45 additions and 29 deletions
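At a glance, the patch splits the old ac_get_scratch_tmpring_size() into two helpers: ac_compute_scratch_wavesize() pads the per-wave scratch size, while ac_get_scratch_tmpring_size() now only packs the WAVES/WAVESIZE register fields from values the caller supplies. Below is a minimal sketch of the resulting call pattern as a driver might use it; the compute_tmpring() wrapper and its bookkeeping pointer are hypothetical and not part of the patch, while MAX2, struct radeon_info and the two ac_* helpers come from the headers the diff touches.

/* Hypothetical caller-side sketch of the reworked helpers; not from the patch. */
static uint32_t compute_tmpring(const struct radeon_info *info,
                                uint32_t scratch_bytes_per_wave,
                                uint32_t *max_seen_bytes_per_wave)
{
   uint32_t tmpring = 0;

   /* Pad the wavesize so the scratch item count is odd (better channel spread). */
   uint32_t wavesize = ac_compute_scratch_wavesize(info, scratch_bytes_per_wave);

   /* Tracking the running maximum is now the caller's job, not the helper's. */
   *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, wavesize);

   /* Pack WAVES/WAVESIZE; the wave count is also chosen by the caller now. */
   ac_get_scratch_tmpring_size(info, info->max_scratch_waves,
                               *max_seen_bytes_per_wave, &tmpring);
   return tmpring;
}

This mirrors the si_get_scratch_tmpring_size() wrapper that the radeonsi part of the diff adds further below.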
@@ -1030,10 +1030,23 @@ uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
           (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
 }
 
-/* Return the register value and tune bytes_per_wave to increase scratch performance. */
-void ac_get_scratch_tmpring_size(const struct radeon_info *info,
-                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
-                                 uint32_t *tmpring_size)
+/* Compute the optimal scratch wavesize. */
+uint32_t
+ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave)
+{
+   /* Add 1 scratch item to make the number of items odd. This should improve
+    * scratch performance by more randomly distributing scratch waves among
+    * memory channels.
+    */
+   if (bytes_per_wave)
+      bytes_per_wave |= info->scratch_wavesize_granularity;
+
+   return bytes_per_wave;
+}
+
+/* Return the scratch register value. */
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves,
+                                 unsigned bytes_per_wave, uint32_t *tmpring_size)
 {
    /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors.
     * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE.
@@ -1049,25 +1062,15 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
     * Shaders with SCRATCH_EN=0 don't allocate scratch space.
     */
 
-   /* The LLVM shader backend should be reporting aligned scratch_sizes. */
+   /* The compiler shader backend should be reporting aligned scratch_sizes. */
    assert((bytes_per_wave & BITFIELD_MASK(info->scratch_wavesize_granularity_shift)) == 0 &&
           "scratch size per wave should be aligned");
 
-   /* Add 1 scratch item to make the number of items odd. This should improve scratch
-    * performance by more randomly distributing scratch waves among memory channels.
-    */
-   if (bytes_per_wave)
-      bytes_per_wave |= info->scratch_wavesize_granularity;
-
-   *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
-
-   unsigned max_scratch_waves = info->max_scratch_waves;
    if (info->gfx_level >= GFX11)
-      max_scratch_waves /= info->max_se; /* WAVES is per SE */
+      num_scratch_waves /= info->max_se; /* WAVES is per SE */
 
-   /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
-   *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
-                   S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> info->scratch_wavesize_granularity_shift);
+   *tmpring_size = S_0286E8_WAVES(num_scratch_waves) |
+                   S_0286E8_WAVESIZE(bytes_per_wave >> info->scratch_wavesize_granularity_shift);
 }
 
 /* Convert chip-agnostic memory access flags into hw-specific cache flags.
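To make the padding in ac_compute_scratch_wavesize() concrete: OR-ing in one granularity's worth of bytes sets the low bit of the item count (the WAVESIZE field in granularity units), so the count becomes odd and scratch waves land more evenly across memory channels. A standalone example, assuming a 256-byte granularity purely for illustration (the real value comes from radeon_info):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Hypothetical granularity for illustration only. */
   const uint32_t granularity_shift = 8;
   const uint32_t granularity = 1u << granularity_shift; /* 256 bytes per scratch item */

   uint32_t bytes_per_wave = 8192; /* 32 items: an even count */
   bytes_per_wave |= granularity;  /* 8448 bytes: 33 items, now odd */

   assert((bytes_per_wave >> granularity_shift) % 2 == 1);
   return 0;
}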
@@ -301,9 +301,10 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
 uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
                         const struct radeon_info *info);
 
-void ac_get_scratch_tmpring_size(const struct radeon_info *info,
-                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
-                                 uint32_t *tmpring_size);
+uint32_t ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_wave);
+
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves,
+                                 unsigned bytes_per_wave, uint32_t *tmpring_size);
 
 unsigned
 ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
@@ -380,9 +380,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
    simple_mtx_lock(&shader->selector->mutex);
 
    /* Update max_seen_compute_scratch_bytes_per_wave and compute_tmpring_size. */
-   ac_get_scratch_tmpring_size(&sctx->screen->info,
-                               config->scratch_bytes_per_wave,
-                               &sctx->max_seen_compute_scratch_bytes_per_wave,
+   si_get_scratch_tmpring_size(sctx, config->scratch_bytes_per_wave,
                                &sctx->compute_tmpring_size);
 
    if (!si_setup_compute_scratch_buffer(sctx, shader))
@@ -889,9 +889,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
       goto fail;
 
    /* Initialize compute_tmpring_size. */
-   ac_get_scratch_tmpring_size(&sctx->screen->info, 0,
-                               &sctx->max_seen_compute_scratch_bytes_per_wave,
-                               &sctx->compute_tmpring_size);
+   si_get_scratch_tmpring_size(sctx, 0, &sctx->compute_tmpring_size);
 
    return &sctx->b;
 fail:
@@ -900,6 +898,21 @@ fail:
    return NULL;
 }
 
+void
+si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave,
+                            unsigned *spi_tmpring_size)
+{
+   bytes_per_wave = ac_compute_scratch_wavesize(&sctx->screen->info, bytes_per_wave);
+
+   sctx->max_seen_scratch_bytes_per_wave =
+      MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes_per_wave);
+
+   /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
+   ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->screen->info.max_scratch_waves,
+                               sctx->max_seen_scratch_bytes_per_wave,
+                               spi_tmpring_size);
+}
+
 static bool si_is_resource_busy(struct pipe_screen *screen, struct pipe_resource *resource,
                                 unsigned usage)
 {
@@ -1622,6 +1622,8 @@ struct ac_llvm_compiler *si_create_llvm_compiler(struct si_screen *sscreen);
 void si_init_aux_async_compute_ctx(struct si_screen *sscreen);
 struct si_context *si_get_aux_context(struct si_aux_context *ctx);
 void si_put_aux_context_flush(struct si_aux_context *ctx);
+void si_get_scratch_tmpring_size(struct si_context *sctx, unsigned bytes_per_wave,
+                                 unsigned *spi_tmpring_size);
 void si_destroy_screen(struct pipe_screen *pscreen);
 
 /* si_perfcounters.c */
@@ -4493,8 +4493,7 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
 bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
 {
    unsigned spi_tmpring_size;
-   ac_get_scratch_tmpring_size(&sctx->screen->info, bytes,
-                               &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
+   si_get_scratch_tmpring_size(sctx, bytes, &spi_tmpring_size);
 
    unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave *
                                   sctx->screen->info.max_scratch_waves;