mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-04-29 19:20:42 +02:00
ac/gpu_info,radv: add scratch_wavesize_granularity info
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34549>
This commit is contained in:
parent
82dda21806
commit
d94f8b4460
5 changed files with 18 additions and 14 deletions
|
|
@ -1550,6 +1550,10 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
|
|||
info->has_dedicated_vram &&
|
||||
info->drm_minor >= 47;
|
||||
|
||||
/* Compute the scratch WAVESIZE granularity in bytes. */
|
||||
info->scratch_wavesize_granularity_shift = info->gfx_level >= GFX11 ? 8 : 10;
|
||||
info->scratch_wavesize_granularity = BITFIELD_BIT(info->scratch_wavesize_granularity_shift);
|
||||
|
||||
/* The maximum number of scratch waves. The number is only a function of the number of CUs.
|
||||
* It should be large enough to hold at least 1 threadgroup. Use the minimum per-SA CU count.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -278,6 +278,8 @@ struct radeon_info {
|
|||
uint32_t min_wave64_vgpr_alloc;
|
||||
uint32_t max_vgpr_alloc;
|
||||
uint32_t wave64_vgpr_alloc_granularity;
|
||||
uint32_t scratch_wavesize_granularity_shift;
|
||||
uint32_t scratch_wavesize_granularity;
|
||||
uint32_t max_scratch_waves;
|
||||
bool has_scratch_base_registers;
|
||||
|
||||
|
|
|
|||
|
|
@ -1048,18 +1048,16 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
|
|||
*
|
||||
* Shaders with SCRATCH_EN=0 don't allocate scratch space.
|
||||
*/
|
||||
const unsigned size_shift = info->gfx_level >= GFX11 ? 8 : 10;
|
||||
const unsigned min_size_per_wave = BITFIELD_BIT(size_shift);
|
||||
|
||||
/* The LLVM shader backend should be reporting aligned scratch_sizes. */
|
||||
assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 &&
|
||||
assert((bytes_per_wave & BITFIELD_MASK(info->scratch_wavesize_granularity_shift)) == 0 &&
|
||||
"scratch size per wave should be aligned");
|
||||
|
||||
/* Add 1 scratch item to make the number of items odd. This should improve scratch
|
||||
* performance by more randomly distributing scratch waves among memory channels.
|
||||
*/
|
||||
if (bytes_per_wave)
|
||||
bytes_per_wave |= min_size_per_wave;
|
||||
bytes_per_wave |= info->scratch_wavesize_granularity;
|
||||
|
||||
*max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
|
||||
|
||||
|
|
@ -1069,7 +1067,7 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
|
|||
|
||||
/* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
|
||||
*tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
|
||||
S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
|
||||
S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> info->scratch_wavesize_granularity_shift);
|
||||
}
|
||||
|
||||
/* Convert chip-agnostic memory access flags into hw-specific cache flags.
|
||||
|
|
|
|||
|
|
@ -12391,10 +12391,8 @@ radv_emit_rt_stack_size(struct radv_cmd_buffer *cmd_buffer)
|
|||
uint32_t scratch_bytes_per_wave = rt_prolog->config.scratch_bytes_per_wave;
|
||||
const uint32_t wave_size = rt_prolog->info.wave_size;
|
||||
|
||||
/* The hardware register is specified as a multiple of 64 or 256 DWORDS. */
|
||||
const unsigned scratch_alloc_granule = pdev->info.gfx_level >= GFX11 ? 256 : 1024;
|
||||
|
||||
scratch_bytes_per_wave += align(cmd_buffer->state.rt_stack_size * wave_size, scratch_alloc_granule);
|
||||
scratch_bytes_per_wave +=
|
||||
align(cmd_buffer->state.rt_stack_size * wave_size, pdev->info.scratch_wavesize_granularity);
|
||||
|
||||
cmd_buffer->compute_scratch_size_per_wave_needed =
|
||||
MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
|
||||
|
|
|
|||
|
|
@ -507,12 +507,14 @@ radv_emit_graphics_scratch(struct radv_device *device, struct radeon_cmdbuf *cs,
|
|||
waves /= gpu_info->max_se;
|
||||
|
||||
radeon_set_context_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3);
|
||||
radeon_emit(S_0286E8_WAVES(waves) | S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, 256)));
|
||||
radeon_emit(S_0286E8_WAVES(waves) |
|
||||
S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->scratch_wavesize_granularity)));
|
||||
radeon_emit(va >> 8); /* SPI_GFX_SCRATCH_BASE_LO */
|
||||
radeon_emit(va >> 40); /* SPI_GFX_SCRATCH_BASE_HI */
|
||||
} else {
|
||||
radeon_set_context_reg(R_0286E8_SPI_TMPRING_SIZE,
|
||||
S_0286E8_WAVES(waves) | S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, 1024)));
|
||||
S_0286E8_WAVES(waves) |
|
||||
S_0286E8_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->scratch_wavesize_granularity)));
|
||||
}
|
||||
|
||||
radeon_end();
|
||||
|
|
@ -554,9 +556,9 @@ radv_emit_compute_scratch(struct radv_device *device, struct radeon_cmdbuf *cs,
|
|||
radeon_emit(scratch_va);
|
||||
radeon_emit(rsrc1);
|
||||
|
||||
radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
|
||||
S_00B860_WAVES(waves) |
|
||||
S_00B860_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->gfx_level >= GFX11 ? 256 : 1024)));
|
||||
radeon_set_sh_reg(
|
||||
R_00B860_COMPUTE_TMPRING_SIZE,
|
||||
S_00B860_WAVES(waves) | S_00B860_WAVESIZE(DIV_ROUND_UP(size_per_wave, gpu_info->scratch_wavesize_granularity)));
|
||||
|
||||
radeon_end();
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue