ac/nir/meta: tune clear/copy_buffer performance for gfx6-10.3

Finally, old GPUs have optimal clear/copy_buffer performance, but only the top dGPU of each generation gets the best behavior. Other dGPUs might need slightly different conditions. APUs likely need very different conditions.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31082>
2026-05-05 22:38:05 +02:00 · 2024-08-22 05:18:46 -04:00 · 2024-08-22 05:18:46 -04:00 · 0d8fe2d03b
commit 0d8fe2d03b
parent 34bd8427f8
2 changed files with 146 additions and 40 deletions
--- a/src/amd/common/ac_nir_meta.h
+++ b/src/amd/common/ac_nir_meta.h
@@ -168,7 +168,7 @@ struct ac_cs_clear_copy_buffer_info {
   unsigned size;
   unsigned clear_value_size;
   uint32_t clear_value[4];
-   unsigned dwords_per_thread;
+   unsigned dwords_per_thread;   /* Set to 0 to let the code choose the optimal value. */
   bool render_condition_enabled;
   bool dst_is_vram;
   bool src_is_vram;
--- a/src/amd/common/ac_nir_meta_cs_clear_copy_buffer.c
+++ b/src/amd/common/ac_nir_meta_cs_clear_copy_buffer.c
@@ -349,22 +349,156 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op
      assert(clear_value_size % 4 == 0);
   }

-   unsigned dwords_per_thread = info->dwords_per_thread;
-   if (!dwords_per_thread) {
-      /* Set default optimal settings. */
-      dwords_per_thread = info->size <= 64 * 1024 ? 2 : 4;
-
-      if (!is_copy) {
-         if (clear_value_size == 12) {
-            /* Clearing 4 dwords per thread with a 3-dword clear value is faster with big sizes. */
-            dwords_per_thread = info->size <= 4096 ? 3 : 4;
+   /* This doesn't fail very often because the only possible fallback is CP DMA, which doesn't
+    * support the render condition.
+    */
+   if (options->fail_if_slow && !info->render_condition_enabled && options->info->has_cp_dma &&
+       !options->info->cp_sdma_ge_use_system_memory_scope) {
+      switch (options->info->gfx_level) {
+      /* GFX6-8: CP DMA clears are so slow that we risk getting a GPU timeout. CP DMA copies
+       * are also slow but less.
+       */
+      case GFX6:
+         /* Optimal for Tahiti. */
+         if (is_copy) {
+            if (!info->dst_is_vram || !info->src_is_vram ||
+                info->size <= (info->dst_offset % 4 ||
+                               (info->dst_offset == 4 && info->src_offset % 4) ? 32 * 1024 : 16 * 1024))
+               return false;
         } else {
-            /* dwords_per_thread must be at least the size of the clear value. */
-            dwords_per_thread = MAX2(dwords_per_thread, clear_value_size / 4);
+            /* CP DMA only supports dword-aligned clears and small clear values. */
+            if (clear_value_size <= 4 && info->dst_offset % 4 == 0 && info->size % 4 == 0 &&
+                info->dst_is_vram && info->size <= 1024)
+               return false;
         }
+         break;
+
+      case GFX7:
+         /* Optimal for Hawaii. */
+         if (is_copy && info->dst_is_vram && info->src_is_vram && info->size <= 512)
+            return false;
+         break;
+
+      case GFX8:
+         /* Optimal for Tonga. */
+         break;
+
+      case GFX9:
+         /* Optimal for Vega10. */
+         if (is_copy) {
+            if (info->src_is_vram) {
+               if (info->dst_is_vram) {
+                  if (info->size < 4096)
+                     return false;
+               } else {
+                  if (info->size < (info->dst_offset % 64 ? 8192 : 2048))
+                     return false;
+               }
+            } else {
+               /* GTT->VRAM and GTT->GTT. */
+               return false;
+            }
+         } else {
+            /* CP DMA only supports dword-aligned clears and small clear values. */
+            if (clear_value_size <= 4 && info->dst_offset % 4 == 0 && info->size % 4 == 0 &&
+                !info->dst_is_vram && (info->size < 2048 || info->size >= 8 << 20 /* 8 MB */))
+               return false;
+         }
+         break;
+
+      case GFX10:
+      case GFX10_3:
+         /* Optimal for Navi21, Navi10. */
+         break;
+
+      case GFX11:
+      default:
+         /* Optimal for Navi31. */
+         if (is_copy && info->size < 1024 && info->dst_offset % 256 && info->dst_is_vram && info->src_is_vram)
+            return false;
+         break;
+
+      case GFX12:
+         unreachable("cp_sdma_ge_use_system_memory_scope should be true, so we should never get here");
      }
   }

+   unsigned dwords_per_thread = info->dwords_per_thread;
+
+   /* Determine optimal dwords_per_thread for performance. */
+   if (!info->dwords_per_thread) {
+      /* This is a good initial value to start with. */
+      dwords_per_thread = info->size <= 64 * 1024 ? 2 : 4;
+
+      /* Clearing 4 dwords per thread with a 3-dword clear value is faster with big sizes. */
+      if (!is_copy && clear_value_size == 12)
+         dwords_per_thread = info->size <= 4096 ? 3 : 4;
+
+      switch (options->info->gfx_level) {
+      case GFX6:
+         /* Optimal for Tahiti. */
+         if (is_copy) {
+            if (info->dst_is_vram && info->src_is_vram)
+               dwords_per_thread = 2;
+         } else {
+            if (info->dst_is_vram && clear_value_size != 12)
+               dwords_per_thread = info->size <= 128 * 1024 || info->size >= 4 << 20 /* 4MB */ ? 2 : 4;
+
+            if (clear_value_size == 12)
+               dwords_per_thread = info->size <= (info->dst_is_vram ? 256 : 128) * 1024 ? 3 : 4;
+         }
+         break;
+
+      case GFX7:
+         /* Optimal for Hawaii. */
+         if (is_copy) {
+            if (info->dst_is_vram && info->src_is_vram && info->dst_offset % 4 == 0 &&
+                info->size >= 8 << 20 /* 8MB */)
+               dwords_per_thread = 2;
+         } else {
+            if (info->dst_is_vram && clear_value_size != 12)
+               dwords_per_thread = info->size <= 32 * 1024 ? 2 : 4;
+
+            if (clear_value_size == 12)
+               dwords_per_thread = info->size <= 256 * 1024 ? 3 : 4;
+         }
+         break;
+
+      case GFX8:
+         /* Optimal for Tonga. */
+         if (is_copy) {
+            dwords_per_thread = 2;
+         } else {
+            if (clear_value_size == 12 && info->size < (2 << 20) /* 2MB */)
+               dwords_per_thread = 3;
+         }
+         break;
+
+      case GFX9:
+         /* Optimal for Vega10. */
+         if (is_copy && info->src_is_vram && info->dst_is_vram && info->size >= 8 << 20 /* 8 MB */)
+            dwords_per_thread = 2;
+
+         if (!info->dst_is_vram)
+            dwords_per_thread = 2;
+         break;
+
+      case GFX10:
+      case GFX10_3:
+      case GFX11:
+      case GFX12:
+         /* Optimal for Gfx12xx, Navi31, Navi21, Navi10. */
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   /* dwords_per_thread must be at least the size of the clear value. */
+   if (!is_copy)
+      dwords_per_thread = MAX2(dwords_per_thread, clear_value_size / 4);
+
   /* Validate dwords_per_thread. */
   if (dwords_per_thread > 4) {
      assert(!"dwords_per_thread must be <= 4");
@@ -381,34 +515,6 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op
      return false; /* invalid value */
   }

-   /* This doesn't fail very often because the only possible fallback is CP DMA, which doesn't
-    * support the render condition.
-    */
-   if (options->fail_if_slow && !info->render_condition_enabled && options->info->has_cp_dma &&
-       !options->info->cp_sdma_ge_use_system_memory_scope) {
-      switch (options->info->gfx_level) {
-      case GFX11:
-         /* Verified on Navi31. */
-         if (is_copy && info->size < 1024 && info->dst_offset % 256 && info->dst_is_vram && info->src_is_vram)
-            return false;
-         break;
-
-      default:
-         if (is_copy) {
-            /* Only use compute for large VRAM copies on dGPUs. */
-            if (info->size <= 8192 || !options->info->has_dedicated_vram || !info->dst_is_vram ||
-                !info->src_is_vram)
-               return false;
-         } else {
-            /* CP DMA clears are terribly slow with GTT on GFX6-8, which can be encountered with
-             * any buffer due to BO evictions, so never use CP DMA clears on GFX6-8.
-             */
-            if (options->info->gfx_level >= GFX9 && clear_value_size <= 4 && info->size <= 4096)
-               return false;
-         }
-      }
-   }
-
   unsigned dst_align_offset = info->dst_offset % (dwords_per_thread * 4);
   unsigned dst_offset_bound = info->dst_offset - dst_align_offset;
   unsigned src_align_offset = is_copy ? info->src_offset % 4 : 0;