From ec82b4294432aa6da40517655a4f8af2767176a2 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 15 Nov 2023 15:06:12 +0100
Subject: [PATCH] radv: add a missing async compute workaround for
 Tonga/Iceland

After digging into the PAL code again, I figured out that Tonga and
Iceland are both affected by a hw bug related to async compute
dispatches.

The solution is to change the "threadgroup" dimension mode to the
"thread" dimension mode unconditionally.

This should fix a bunch of issues on these GPUs that only show up when
the async compute queue is in use, i.e. without RADV_DEBUG=nocompute.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7551
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6334
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4679
Cc: mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26207>
---
 src/amd/common/ac_gpu_info.c     |  7 +++++++
 src/amd/common/ac_gpu_info.h     |  1 +
 src/amd/vulkan/radv_cmd_buffer.c | 17 ++++++++++++++++-
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index 7d2f0f9eead..b3be20c2e52 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -1245,6 +1245,13 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
                                     info->family == CHIP_BONAIRE ||
                                     info->family == CHIP_KABINI;
 
+   /* HW bug workaround with async compute dispatches when threadgroup > 4096.
+    * The workaround is to change the "threadgroup" dimension mode to "thread"
+    * dimension mode.
+    */
+   info->has_async_compute_threadgroup_bug = info->family == CHIP_ICELAND ||
+                                             info->family == CHIP_TONGA;
+
    /* Support for GFX10.3 was added with F32_ME_FEATURE_VERSION_31 but the
     * feature version wasn't bumped.
     */
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index e2550c1428c..80ef7ac1e7a 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -101,6 +101,7 @@ struct radeon_info {
    bool has_two_planes_iterate256_bug;
    bool has_vgt_flush_ngg_legacy_bug;
    bool has_cs_regalloc_hang_bug;
+   bool has_async_compute_threadgroup_bug;
    bool has_32bit_predication;
    bool has_3d_cube_border_color_mipmap;
    bool has_image_opcodes;
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 99027f982a5..95d7c72f0fb 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -9550,11 +9550,11 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv
          radeon_emit(cs, dispatch_initiator);
       }
    } else {
+      const unsigned *cs_block_size = compute_shader->info.cs.block_size;
       unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
       unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
 
       if (info->unaligned) {
-         const unsigned *cs_block_size = compute_shader->info.cs.block_size;
          unsigned remainder[3];
 
          /* If aligned, these should be an entire block size,
@@ -9619,6 +9619,21 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv
          predicating = false;
       }
 
+      if (cmd_buffer->device->physical_device->rad_info.has_async_compute_threadgroup_bug &&
+          cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
+         for (unsigned i = 0; i < 3; i++) {
+            if (info->unaligned) {
+               /* info->blocks is already in thread dimensions for unaligned dispatches. */
+               blocks[i] = info->blocks[i];
+            } else {
+               /* Force the async compute dispatch to be in "thread" dim mode to workaround a hw bug. */
+               blocks[i] *= cs_block_size[i];
+            }
+
+            dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
+         }
+      }
+
       radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
       radeon_emit(cs, blocks[0]);
       radeon_emit(cs, blocks[1]);