radeonsi: don't use CP DMA on GFX940

CP DMA has been defeatured on these chips.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30115>
2026-05-05 09:38:07 +02:00 · 2024-07-10 19:41:42 -04:00 · 2024-07-10 19:41:42 -04:00 · 1fd43bca2c
commit 1fd43bca2c
parent b0205a92d9
9 changed files with 17 additions and 10 deletions
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@ -1441,8 +1441,9 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
   info->pte_fragment_size = alignment_info.size_local;
   info->gart_page_size = alignment_info.size_remote;

-   if (info->gfx_level == GFX6)
-      info->gfx_ib_pad_with_type2 = true;
+   info->gfx_ib_pad_with_type2 = info->gfx_level == GFX6;
+   /* CDNA starting with GFX940 shouldn't use CP DMA. */
+   info->has_cp_dma = info->has_graphics || info->family < CHIP_GFX940;

   if (info->gfx_level >= GFX11 && info->gfx_level < GFX12) {
      /* With num_cu = 4 in gfx11 measured power for idle, video playback and observed
@ -1913,6 +1914,7 @@ void ac_print_gpu_info(const struct radeon_info *info, FILE *f)

   fprintf(f, "CP info:\n");
   fprintf(f, "    gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2);
+   fprintf(f, "    has_cp_dma = %i\n", info->has_cp_dma);
   fprintf(f, "    me_fw_version = %i\n", info->me_fw_version);
   fprintf(f, "    me_fw_feature = %i\n", info->me_fw_feature);
   fprintf(f, "    mec_fw_version = %i\n", info->mec_fw_version);
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@ -183,6 +183,7 @@ struct radeon_info {

   /* CP info. */
   bool gfx_ib_pad_with_type2;
+   bool has_cp_dma;
   uint32_t me_fw_version;
   uint32_t me_fw_feature;
   uint32_t mec_fw_version;
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@ -1251,7 +1251,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
   }

   /* Prefetch the compute shader to L2. */
-   if (sctx->gfx_level >= GFX7 && prefetch)
+   if (sctx->gfx_level >= GFX7 && sctx->screen->info.has_cp_dma && prefetch)
      si_cp_dma_prefetch(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);

   if (program->ir_type != PIPE_SHADER_IR_NATIVE)
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@ -346,7 +346,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
           clear_value_size > 4 ||
           /* Use compute if the size is large enough. Always prefer compute on GFX12. */
           (clear_value_size == 4 && offset % 4 == 0 &&
-            (size > compute_min_size || sctx->screen->info.cp_sdma_ge_use_system_memory_scope))))
+            (!sctx->screen->info.has_cp_dma ||
+             sctx->screen->info.cp_sdma_ge_use_system_memory_scope || size > compute_min_size))))
         method = SI_COMPUTE_CLEAR_METHOD;

      if (method == SI_COMPUTE_CLEAR_METHOD) {
@ -403,10 +404,10 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p

   /* Only use compute for VRAM copies on dGPUs. */
   /* TODO: use compute for unaligned big sizes */
-   if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
-       si_resource(src)->domains & RADEON_DOMAIN_VRAM &&
-       dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0 &&
-       (size > compute_min_size || sctx->screen->info.cp_sdma_ge_use_system_memory_scope)) {
+   if (dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0 &&
+       (!sctx->screen->info.has_cp_dma || sctx->screen->info.cp_sdma_ge_use_system_memory_scope ||
+        (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
+         si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size))) {
      si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0,
                                  flags, coher);
   } else {
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@ -51,6 +51,7 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
 {
   uint32_t header = 0, command = 0;

+   assert(sctx->screen->info.has_cp_dma);
   assert(size <= cp_dma_max_byte_count(sctx));
   assert(sctx->gfx_level != GFX6 || cache_policy == L2_BYPASS);

--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@ -152,7 +152,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h

   /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
    * because the kernel doesn't wait for it. */
-   if (ctx->gfx_level >= GFX7)
+   if (ctx->gfx_level >= GFX7 && ctx->screen->info.has_cp_dma)
      si_cp_dma_wait_for_idle(ctx, &ctx->gfx_cs);

   /* If we use s_sendmsg to set tess factors to all 0 or all 1 instead of writing to the tess
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@ -1120,7 +1120,7 @@ static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader
 int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
                               uint64_t scratch_va, int64_t bo_offset)
 {
-   bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) &&
+   bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && sscreen->info.has_cp_dma &&
                     sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible &&
                     bo_offset < 0;

--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@ -498,6 +498,7 @@ template<amd_gfx_level GFX_VERSION>
 static void si_cp_dma_prefetch_inline(struct si_context *sctx, uint64_t address, unsigned size)
 {
   assert(GFX_VERSION >= GFX7);
+   assert(sctx->screen->info.has_cp_dma);

   if (GFX_VERSION >= GFX11)
      size = MIN2(size, 32768 - SI_CPDMA_ALIGNMENT);
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@ -554,6 +554,7 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
   ws->info.gfx_ib_pad_with_type2 = ws->info.gfx_level <= GFX6 ||
                                    (ws->info.family == CHIP_HAWAII &&
                                     ws->accel_working2 < 3);
+   ws->info.has_cp_dma = true;
   ws->info.tcc_cache_line_size = 64; /* TC L2 line size on GCN */
   ws->info.has_bo_metadata = false;
   ws->info.has_eqaa_surface_allocator = false;