From 9931034dca07492b072ce3f4ef6f3bde4d2d0e1d Mon Sep 17 00:00:00 2001 From: Zan Dobersek Date: Sun, 5 Apr 2026 09:26:37 +0200 Subject: [PATCH] tu/a8xx: remove enforced TU_DEBUG_FLUSHALL Remove the TU_DEBUG_FLUSHALL option that was force-enabled for a8xx chips. The problematic CTS cases that required it were failing due to indirect draw commands sourcing draw data from buffers whose content was prepared by compute tasks. Up until a8xx, firmware was managing an implicit wait before any indirect draw parameters were read, with a delayed CP_WAIT_FOR_ME emitted only when necessary or on devices enabling indirect_draw_wfm_quirk due to bugged firmware. That implicit wait is gone on a8xx, so CP_WAIT_FOR_ME should be emitted immediately, which also matches behavior of the proprietary driver. Signed-off-by: Zan Dobersek Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.cc | 28 ++++++++++++++++++++------- src/freedreno/vulkan/tu_device.cc | 2 -- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index ddcc34382a4..d2f7c1728d5 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -6025,6 +6025,7 @@ vk2tu_dst_stage(struct tu_device *dev, return stage; } +template <chip CHIP> static void tu_flush_for_stage(struct tu_cache_state *cache, enum tu_stage src_stage, enum tu_stage dst_stage) @@ -6040,8 +6041,20 @@ tu_flush_for_stage(struct tu_cache_state *cache, cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE; if (dst_stage <= TU_STAGE_BV) { cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_BR; - if (dst_stage == TU_STAGE_BV_CP) - cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME; + + /* Extending on the comment in vk2tu_single_stage(), up to a8xx, + * indirect opcodes rely on an implicit wait before reading indirect + * parameters, which can help avoid emitting CP_WAIT_FOR_ME. Exception + * to this are devices with bugged firmware that enable indirect_draw_wfm_quirk. 
+ * a8xx removes this implicit wait, so CP_WAIT_FOR_ME should be emitted + * without delay, which also matches proprietary driver. + */ + if (dst_stage == TU_STAGE_BV_CP) { + if (CHIP >= A8XX) + cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME; + else + cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME; + } } } } @@ -6426,6 +6439,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, } } +template <chip CHIP> static void tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, const struct tu_subpass_barrier *barrier, @@ -6458,7 +6472,7 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, enum tu_stage src_stage = vk2tu_src_stage(cmd_buffer->device, src_stage_vk); enum tu_stage dst_stage = vk2tu_dst_stage(cmd_buffer->device, dst_stage_vk); - tu_flush_for_stage(cache, src_stage, dst_stage); + tu_flush_for_stage<CHIP>(cache, src_stage, dst_stage); } template <chip CHIP> @@ -6906,7 +6920,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, * gets called. However deferred flushes could have to happen later as part * of the subpass. 
 */ - tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true); + tu_subpass_barrier<CHIP>(cmd, &pass->subpasses[0].start_barrier, true); cmd->state.renderpass_cache.pending_flush_bits = cmd->state.cache.pending_flush_bits; cmd->state.renderpass_cache.flush_bits = 0; @@ -7297,7 +7311,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, tu_cond_exec_end(cs); /* Handle dependencies for the next subpass */ - tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false); + tu_subpass_barrier<CHIP>(cmd, &cmd->state.subpass->start_barrier, false); if (cmd->state.subpass->feedback_invalidate) { cmd->state.renderpass_cache.flush_bits |= @@ -9520,7 +9534,7 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, cmd_buffer->state.cache.pending_flush_bits |= cmd_buffer->state.renderpass_cache.pending_flush_bits; - tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true); + TU_CALLX(cmd_buffer->device, tu_subpass_barrier)(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true); vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments); @@ -9796,7 +9810,7 @@ tu_barrier(struct tu_cmd_buffer *cmd, enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage); enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage); - tu_flush_for_stage(cache, src_stage, dst_stage); + TU_CALLX(cmd->device, tu_flush_for_stage)(cache, src_stage, dst_stage); } VKAPI_ATTR void VKAPI_CALL diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 018fb291860..9b602c66769 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -2779,8 +2779,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, &dispatch_table, &tu_device_entrypoints_a7xx, false); break; case 8: - /* gen8 TODO: */ - tu_env.debug |= TU_DEBUG_FLUSHALL; /* dEQP-VK.draw.\*from_compute\* */ vk_device_dispatch_table_from_entrypoints( &dispatch_table, &tu_device_entrypoints_a8xx, false); }