radeonsi: don't use CP DMA on GFX940

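CP DMA has been defeatured on CDNA chips starting with GFX940. Add
radeon_info::has_cp_dma and, when it is false, use compute for buffer
clears and copies and skip CP DMA shader prefetches and DMA shader uploads.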

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30115>
This commit is contained in:
Marek Olšák 2024-07-10 19:41:42 -04:00 committed by Marge Bot
parent b0205a92d9
commit 1fd43bca2c
9 changed files with 17 additions and 10 deletions

View file

@ -1441,8 +1441,9 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
info->pte_fragment_size = alignment_info.size_local;
info->gart_page_size = alignment_info.size_remote;
if (info->gfx_level == GFX6)
info->gfx_ib_pad_with_type2 = true;
info->gfx_ib_pad_with_type2 = info->gfx_level == GFX6;
/* CDNA starting with GFX940 shouldn't use CP DMA. */
info->has_cp_dma = info->has_graphics || info->family < CHIP_GFX940;
if (info->gfx_level >= GFX11 && info->gfx_level < GFX12) {
/* With num_cu = 4 in gfx11 measured power for idle, video playback and observed
@ -1913,6 +1914,7 @@ void ac_print_gpu_info(const struct radeon_info *info, FILE *f)
fprintf(f, "CP info:\n");
fprintf(f, " gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2);
fprintf(f, " has_cp_dma = %i\n", info->has_cp_dma);
fprintf(f, " me_fw_version = %i\n", info->me_fw_version);
fprintf(f, " me_fw_feature = %i\n", info->me_fw_feature);
fprintf(f, " mec_fw_version = %i\n", info->mec_fw_version);

View file

@ -183,6 +183,7 @@ struct radeon_info {
/* CP info. */
bool gfx_ib_pad_with_type2;
bool has_cp_dma;
uint32_t me_fw_version;
uint32_t me_fw_feature;
uint32_t mec_fw_version;

View file

@ -1251,7 +1251,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
}
/* Prefetch the compute shader to L2. */
if (sctx->gfx_level >= GFX7 && prefetch)
if (sctx->gfx_level >= GFX7 && sctx->screen->info.has_cp_dma && prefetch)
si_cp_dma_prefetch(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
if (program->ir_type != PIPE_SHADER_IR_NATIVE)

View file

@ -346,7 +346,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
clear_value_size > 4 ||
/* Use compute if the size is large enough. Always prefer compute on GFX12. */
(clear_value_size == 4 && offset % 4 == 0 &&
(size > compute_min_size || sctx->screen->info.cp_sdma_ge_use_system_memory_scope))))
(!sctx->screen->info.has_cp_dma ||
sctx->screen->info.cp_sdma_ge_use_system_memory_scope || size > compute_min_size))))
method = SI_COMPUTE_CLEAR_METHOD;
if (method == SI_COMPUTE_CLEAR_METHOD) {
@ -403,10 +404,10 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
/* Only use compute for VRAM copies on dGPUs. */
/* TODO: use compute for unaligned big sizes */
if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
si_resource(src)->domains & RADEON_DOMAIN_VRAM &&
dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0 &&
(size > compute_min_size || sctx->screen->info.cp_sdma_ge_use_system_memory_scope)) {
if (dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0 &&
(!sctx->screen->info.has_cp_dma || sctx->screen->info.cp_sdma_ge_use_system_memory_scope ||
(sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size))) {
si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0,
flags, coher);
} else {

View file

@ -51,6 +51,7 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
{
uint32_t header = 0, command = 0;
assert(sctx->screen->info.has_cp_dma);
assert(size <= cp_dma_max_byte_count(sctx));
assert(sctx->gfx_level != GFX6 || cache_policy == L2_BYPASS);

View file

@ -152,7 +152,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
/* Make sure CP DMA is idle at the end of IBs after L2 prefetches
* because the kernel doesn't wait for it. */
if (ctx->gfx_level >= GFX7)
if (ctx->gfx_level >= GFX7 && ctx->screen->info.has_cp_dma)
si_cp_dma_wait_for_idle(ctx, &ctx->gfx_cs);
/* If we use s_sendmsg to set tess factors to all 0 or all 1 instead of writing to the tess

View file

@ -1120,7 +1120,7 @@ static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader
int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
uint64_t scratch_va, int64_t bo_offset)
{
bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) &&
bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && sscreen->info.has_cp_dma &&
sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible &&
bo_offset < 0;

View file

@ -498,6 +498,7 @@ template<amd_gfx_level GFX_VERSION>
static void si_cp_dma_prefetch_inline(struct si_context *sctx, uint64_t address, unsigned size)
{
assert(GFX_VERSION >= GFX7);
assert(sctx->screen->info.has_cp_dma);
if (GFX_VERSION >= GFX11)
size = MIN2(size, 32768 - SI_CPDMA_ALIGNMENT);

View file

@ -554,6 +554,7 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
ws->info.gfx_ib_pad_with_type2 = ws->info.gfx_level <= GFX6 ||
(ws->info.family == CHIP_HAWAII &&
ws->accel_working2 < 3);
ws->info.has_cp_dma = true;
ws->info.tcc_cache_line_size = 64; /* TC L2 line size on GCN */
ws->info.has_bo_metadata = false;
ws->info.has_eqaa_surface_allocator = false;