diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index ddcc34382a4..d2f7c1728d5 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -6025,6 +6025,7 @@ vk2tu_dst_stage(struct tu_device *dev,
    return stage;
 }
 
+template <chip CHIP>
 static void
 tu_flush_for_stage(struct tu_cache_state *cache,
                    enum tu_stage src_stage, enum tu_stage dst_stage)
@@ -6040,8 +6041,20 @@ tu_flush_for_stage(struct tu_cache_state *cache,
          cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
       if (dst_stage <= TU_STAGE_BV) {
          cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_BR;
-         if (dst_stage == TU_STAGE_BV_CP)
-            cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
+
+         /* Extending on the comment in vk2tu_single_stage(): up to a8xx,
+          * indirect opcodes rely on an implicit wait before reading indirect
+          * parameters, which can help avoid emitting CP_WAIT_FOR_ME. The
+          * exception is devices with buggy firmware that enable indirect_draw_wfm_quirk.
+          * a8xx removes this implicit wait, so CP_WAIT_FOR_ME should be emitted
+          * without delay, which also matches the proprietary driver.
+          */
+         if (dst_stage == TU_STAGE_BV_CP) {
+            if (CHIP >= A8XX)
+               cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
+            else
+               cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
+         }
       }
    }
 }
@@ -6426,6 +6439,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
    }
 }
 
+template <chip CHIP>
 static void
 tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
                    const struct tu_subpass_barrier *barrier,
@@ -6458,7 +6472,7 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
    enum tu_stage src_stage = vk2tu_src_stage(cmd_buffer->device, src_stage_vk);
    enum tu_stage dst_stage = vk2tu_dst_stage(cmd_buffer->device, dst_stage_vk);
 
-   tu_flush_for_stage(cache, src_stage, dst_stage);
+   tu_flush_for_stage<CHIP>(cache, src_stage, dst_stage);
 }
 
 template <chip CHIP>
@@ -6906,7 +6920,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
     * gets called. However deferred flushes could have to happen later as part
     * of the subpass.
     */
-   tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
+   tu_subpass_barrier<CHIP>(cmd, &pass->subpasses[0].start_barrier, true);
    cmd->state.renderpass_cache.pending_flush_bits =
       cmd->state.cache.pending_flush_bits;
    cmd->state.renderpass_cache.flush_bits = 0;
@@ -7297,7 +7311,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
       tu_cond_exec_end(cs);
 
    /* Handle dependencies for the next subpass */
-   tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
+   tu_subpass_barrier<CHIP>(cmd, &cmd->state.subpass->start_barrier, false);
 
    if (cmd->state.subpass->feedback_invalidate) {
       cmd->state.renderpass_cache.flush_bits |=
@@ -9520,7 +9534,7 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
    cmd_buffer->state.cache.pending_flush_bits |=
       cmd_buffer->state.renderpass_cache.pending_flush_bits;
 
-   tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
+   TU_CALLX(cmd_buffer->device, tu_subpass_barrier)(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
 
    vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments);
 
@@ -9796,7 +9810,7 @@ tu_barrier(struct tu_cmd_buffer *cmd,
    enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
    enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
 
-   tu_flush_for_stage(cache, src_stage, dst_stage);
+   TU_CALLX(cmd->device, tu_flush_for_stage)(cache, src_stage, dst_stage);
 }
 
 VKAPI_ATTR void VKAPI_CALL
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index 018fb291860..9b602c66769 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -2779,8 +2779,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
          &dispatch_table, &tu_device_entrypoints_a7xx, false);
       break;
    case 8:
-      /* gen8 TODO: */
-      tu_env.debug |= TU_DEBUG_FLUSHALL; /* dEQP-VK.draw.\*from_compute\* */
       vk_device_dispatch_table_from_entrypoints(
          &dispatch_table, &tu_device_entrypoints_a8xx, false);
    }
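
A note on the tu_flush_for_stage() hunk above: the split between flush_bits and pending_flush_bits is what makes the pre-a8xx path cheaper. Bits in flush_bits are written out at the next flush point, while bits in pending_flush_bits sit in the cache state until a later consumer actually requires them; before a8xx the implicit wait performed by indirect opcodes usually satisfies the dependency first, so CP_WAIT_FOR_ME rarely reaches the command stream. The sketch below models just that decision; the chip enum, flag values, and cache_state struct are simplified stand-ins, not the driver's real definitions.

   // Minimal model of the branch added in tu_flush_for_stage<CHIP>().
   // All types here are simplified stand-ins, not the turnip definitions.
   #include <cstdint>
   #include <cstdio>

   enum chip { A6XX = 6, A7XX = 7, A8XX = 8 };

   enum : uint32_t {
      TU_CMD_FLAG_WAIT_FOR_BR = 1u << 0,
      TU_CMD_FLAG_WAIT_FOR_ME = 1u << 1,
   };

   struct cache_state {
      uint32_t flush_bits;         /* emitted at the next flush point */
      uint32_t pending_flush_bits; /* deferred until a consumer needs them */
   };

   /* Pre-a8xx: defer CP_WAIT_FOR_ME, since indirect opcodes wait implicitly.
    * a8xx+: the implicit wait is gone, so the flag must be emitted eagerly. */
   template <chip CHIP>
   static void
   flush_for_bv_cp(struct cache_state *cache)
   {
      cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_BR;
      if (CHIP >= A8XX)
         cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
      else
         cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
   }

   int main()
   {
      struct cache_state a7 = {}, a8 = {};
      flush_for_bv_cp<A7XX>(&a7);
      flush_for_bv_cp<A8XX>(&a8);
      printf("a7xx: flush=0x%x pending=0x%x\n", a7.flush_bits, a7.pending_flush_bits);
      printf("a8xx: flush=0x%x pending=0x%x\n", a8.flush_bits, a8.pending_flush_bits);
      return 0;
   }

Running it shows WAIT_FOR_ME landing in pending_flush_bits for a7xx but in flush_bits for a8xx, matching the two branches in the hunk.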
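
The two call sites converted to TU_CALLX sit in code that is not itself templated by chip (the common epilogue of tu_CmdEndRenderPass2 and tu_barrier), so the <CHIP> instantiation has to be picked at runtime from the device. A rough equivalent of that dispatch pattern is sketched below; the device struct and helper function are hypothetical, invented only to illustrate the shape of the dispatch, whose real definition lives in the turnip headers.

   // Hypothetical model of TU_CALLX(dev, flush_for_stage)(args...): switch on
   // the GPU generation and forward to the matching template instantiation.
   #include <cassert>

   enum chip { A6XX = 6, A7XX = 7, A8XX = 8 };

   struct device { chip gen; };  /* stand-in for tu_device's chip lookup */

   template <chip CHIP>
   static void
   flush_for_stage(int src_stage, int dst_stage)
   {
      /* chip-specific body, as in tu_flush_for_stage<CHIP>() */
      (void)src_stage;
      (void)dst_stage;
   }

   static void
   call_flush_for_stage(struct device *dev, int src_stage, int dst_stage)
   {
      switch (dev->gen) {
      case A6XX: flush_for_stage<A6XX>(src_stage, dst_stage); break;
      case A7XX: flush_for_stage<A7XX>(src_stage, dst_stage); break;
      case A8XX: flush_for_stage<A8XX>(src_stage, dst_stage); break;
      default:   assert(!"unknown GPU generation");
      }
   }

   int main()
   {
      struct device dev = { A8XX };
      call_flush_for_stage(&dev, 0, 1);
      return 0;
   }

The payoff is visible in the tu_device.cc hunk: once the correct per-chip CP_WAIT_FOR_ME behavior is wired up, the blanket TU_DEBUG_FLUSHALL workaround for the dEQP-VK.draw.\*from_compute\* failures can be dropped.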