tu/a8xx: remove enforced TU_DEBUG_FLUSHALL

Remove the TU_DEBUG_FLUSHALL option that was force-enabled for a8xx chips.
The problematic CTS cases that required it were failing due to indirect
draw commands sourcing draw data from buffers whose content was prepared
by compute tasks.

Prior to a8xx, firmware performed an implicit wait before any indirect
draw parameters were read, so a delayed CP_WAIT_FOR_ME was emitted only when
necessary, or on devices that enable indirect_draw_wfm_quirk because of buggy
firmware. That implicit wait is gone on a8xx, so CP_WAIT_FOR_ME should be
emitted immediately, which also matches the behavior of the proprietary driver.

Signed-off-by: Zan Dobersek <zdobersek@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40550>
This commit is contained in:
Zan Dobersek 2026-04-05 09:26:37 +02:00 committed by Marge Bot
parent 36983b50fe
commit 9931034dca
2 changed files with 21 additions and 9 deletions

View file

@ -6025,6 +6025,7 @@ vk2tu_dst_stage(struct tu_device *dev,
return stage;
}
template <chip CHIP>
static void
tu_flush_for_stage(struct tu_cache_state *cache,
enum tu_stage src_stage, enum tu_stage dst_stage)
@ -6040,8 +6041,20 @@ tu_flush_for_stage(struct tu_cache_state *cache,
cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
if (dst_stage <= TU_STAGE_BV) {
cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_BR;
if (dst_stage == TU_STAGE_BV_CP)
cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
/* Expanding on the comment in vk2tu_single_stage(): prior to a8xx,
* indirect opcodes rely on an implicit wait before reading indirect
* parameters, which can help avoid emitting CP_WAIT_FOR_ME. The exception
* is devices with buggy firmware, which enable indirect_draw_wfm_quirk.
* a8xx removes this implicit wait, so CP_WAIT_FOR_ME should be emitted
* without delay, which also matches the proprietary driver.
*/
if (dst_stage == TU_STAGE_BV_CP) {
if (CHIP >= A8XX)
cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
else
cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
}
}
}
}
@ -6426,6 +6439,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
}
}
template <chip CHIP>
static void
tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
const struct tu_subpass_barrier *barrier,
@ -6458,7 +6472,7 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
enum tu_stage src_stage = vk2tu_src_stage(cmd_buffer->device, src_stage_vk);
enum tu_stage dst_stage = vk2tu_dst_stage(cmd_buffer->device, dst_stage_vk);
tu_flush_for_stage(cache, src_stage, dst_stage);
tu_flush_for_stage<CHIP>(cache, src_stage, dst_stage);
}
template <chip CHIP>
@ -6906,7 +6920,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
* gets called. However deferred flushes could have to happen later as part
* of the subpass.
*/
tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
tu_subpass_barrier<CHIP>(cmd, &pass->subpasses[0].start_barrier, true);
cmd->state.renderpass_cache.pending_flush_bits =
cmd->state.cache.pending_flush_bits;
cmd->state.renderpass_cache.flush_bits = 0;
@ -7297,7 +7311,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
tu_cond_exec_end(cs);
/* Handle dependencies for the next subpass */
tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
tu_subpass_barrier<CHIP>(cmd, &cmd->state.subpass->start_barrier, false);
if (cmd->state.subpass->feedback_invalidate) {
cmd->state.renderpass_cache.flush_bits |=
@ -9520,7 +9534,7 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
cmd_buffer->state.cache.pending_flush_bits |=
cmd_buffer->state.renderpass_cache.pending_flush_bits;
tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
TU_CALLX(cmd_buffer->device, tu_subpass_barrier)(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments);
@ -9796,7 +9810,7 @@ tu_barrier(struct tu_cmd_buffer *cmd,
enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
tu_flush_for_stage(cache, src_stage, dst_stage);
TU_CALLX(cmd->device, tu_flush_for_stage)(cache, src_stage, dst_stage);
}
VKAPI_ATTR void VKAPI_CALL

View file

@ -2779,8 +2779,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
&dispatch_table, &tu_device_entrypoints_a7xx, false);
break;
case 8:
/* gen8 TODO: */
tu_env.debug |= TU_DEBUG_FLUSHALL; /* dEQP-VK.draw.\*from_compute\* */
vk_device_dispatch_table_from_entrypoints(
&dispatch_table, &tu_device_entrypoints_a8xx, false);
}