From ec82b4294432aa6da40517655a4f8af2767176a2 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 15 Nov 2023 15:06:12 +0100 Subject: [PATCH] radv: add a missing async compute workaround for Tonga/Iceland After digging into PAL code again, I figured that Tonga/Iceland are both affected by a hw bug related to async compute dispatches. The solution is to change the "threadgroup" dimension mode to the "thread" dimension mode unconditionally. This should fix a bunch of issues related to RADV_DEBUG=nocompute on these GPUs. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7551 Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6334 Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4679 Cc: mesa-stable Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/common/ac_gpu_info.c | 7 +++++++ src/amd/common/ac_gpu_info.h | 1 + src/amd/vulkan/radv_cmd_buffer.c | 17 ++++++++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 7d2f0f9eead..b3be20c2e52 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -1245,6 +1245,13 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, info->family == CHIP_BONAIRE || info->family == CHIP_KABINI; + /* HW bug workaround with async compute dispatches when threadgroup > 4096. + * The workaround is to change the "threadgroup" dimension mode to "thread" + * dimension mode. + */ + info->has_async_compute_threadgroup_bug = info->family == CHIP_ICELAND || + info->family == CHIP_TONGA; + /* Support for GFX10.3 was added with F32_ME_FEATURE_VERSION_31 but the * feature version wasn't bumped. */ diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index e2550c1428c..80ef7ac1e7a 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -101,6 +101,7 @@ struct radeon_info { bool has_two_planes_iterate256_bug; bool has_vgt_flush_ngg_legacy_bug; bool has_cs_regalloc_hang_bug; + bool has_async_compute_threadgroup_bug; bool has_32bit_predication; bool has_3d_cube_border_color_mipmap; bool has_image_opcodes; diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 99027f982a5..95d7c72f0fb 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -9550,11 +9550,11 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv radeon_emit(cs, dispatch_initiator); } } else { + const unsigned *cs_block_size = compute_shader->info.cs.block_size; unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]}; unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]}; if (info->unaligned) { - const unsigned *cs_block_size = compute_shader->info.cs.block_size; unsigned remainder[3]; /* If aligned, these should be an entire block size, @@ -9619,6 +9619,21 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv predicating = false; } + if (cmd_buffer->device->physical_device->rad_info.has_async_compute_threadgroup_bug && + cmd_buffer->qf == RADV_QUEUE_COMPUTE) { + for (unsigned i = 0; i < 3; i++) { + if (info->unaligned) { + /* info->blocks is already in thread dimensions for unaligned dispatches. */ + blocks[i] = info->blocks[i]; + } else { + /* Force the async compute dispatch to be in "thread" dim mode to workaround a hw bug. */ + blocks[i] *= cs_block_size[i]; + } + + dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1); + } + } + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1)); radeon_emit(cs, blocks[0]); radeon_emit(cs, blocks[1]);