/*
 * Patch section for tu_cmd_buffer.cc and tu_cmd_buffer.h.
 *
 *  - tu7_set_thread_both_patchpoint() (new): emits a CP_THREAD_CONTROL packet
 *    header with a count of 1, then one dword holding
 *    THREAD(BR) | CONCURRENT_BIN_DISABLE, and appends a
 *    TU_CB_CONTROL_TYPE_PATCHPOINT entry to cmd->cb_control_points whose
 *    patch_value is THREAD(BOTH) and whose original_value is the emitted dword.
 *  - vk2tu_dst_stage(): drops `static` and gains a prototype in the header.
 *  - tu_barrier(): gains `bool no_sync`. When it is true the
 *    vk2tu_src_stage()/vk2tu_dst_stage()/tu_flush_for_stage step is skipped;
 *    tu_flush_for_access() still runs.
 *
 * NOTE(review): line breaks and indentation were rebuilt from flattened text;
 * context-line whitespace should be checked against the tree before applying.
 * NOTE(review): the bare `template` context lines look like they lost a
 * parameter list during extraction -- confirm against the tree.
 */
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index ff1766f4197..b11fb17134a 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -457,6 +457,25 @@ tu7_set_thread_br_patchpoint(struct tu_cmd_buffer *cmd,
                   CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
 }
 
+void
+tu7_set_thread_both_patchpoint(struct tu_cmd_buffer *cmd,
+                               struct tu_cs *cs)
+{
+   tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); /* header for one payload dword */
+
+   struct tu_cb_control_point info = {
+      .type = TU_CB_CONTROL_TYPE_PATCHPOINT,
+      .patchpoint = cs->cur, /* taken before the payload is emitted below */
+      .patch_value = CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH),
+      .original_value = CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
+                        CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE,
+   };
+   util_dynarray_append(&cmd->cb_control_points, info);
+
+   tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
+                  CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); /* == .original_value */
+}
+
 /* "Normal" cache flushes outside the renderpass, that don't require any special handling */
 template
 void
@@ -6042,7 +6061,7 @@ vk2tu_src_stage(struct tu_device *dev,
    return stage;
 }
 
-static enum tu_stage
+enum tu_stage
 vk2tu_dst_stage(struct tu_device *dev,
                 VkPipelineStageFlags2 vk_stages)
 {
@@ -9652,7 +9671,8 @@ tu_CmdEndRendering2EXT(VkCommandBuffer commandBuffer,
 void
 tu_barrier(struct tu_cmd_buffer *cmd,
            uint32_t dep_count,
-           const VkDependencyInfo *dep_infos)
+           const VkDependencyInfo *dep_infos,
+           bool no_sync)
 {
    VkPipelineStageFlags2 srcStage = 0;
    VkPipelineStageFlags2 dstStage = 0;
@@ -9838,9 +9858,11 @@ tu_barrier(struct tu_cmd_buffer *cmd,
 
    tu_flush_for_access(cache, src_flags, dst_flags);
 
-   enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
-   enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
-   TU_CALLX(cmd->device, tu_flush_for_stage)(cache, src_stage, dst_stage);
+   if (!no_sync) { /* no_sync callers keep only the access flushes above */
+      enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
+      enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
+      TU_CALLX(cmd->device, tu_flush_for_stage)(cache, src_stage, dst_stage);
+   }
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -9849,7 +9871,7 @@ tu_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
 {
    VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
 
-   tu_barrier(cmd_buffer, 1, pDependencyInfo);
+   tu_barrier(cmd_buffer, 1, pDependencyInfo, false);
 }
 
 template
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index c8ebc46a5d1..e8a4ed000ec 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -215,6 +215,10 @@ enum tu_stage {
    TU_STAGE_BOTTOM,
 };
 
+enum tu_stage
+vk2tu_dst_stage(struct tu_device *dev,
+                VkPipelineStageFlags2 vk_stages); /* defined in tu_cmd_buffer.cc */
+
 enum tu_cmd_flush_bits {
    TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
    TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
@@ -903,7 +907,8 @@ struct tu_vis_stream_patchpoint_cs {
 void
 tu_barrier(struct tu_cmd_buffer *cmd,
            uint32_t dep_count,
-           const VkDependencyInfo *dep_info);
+           const VkDependencyInfo *dep_info,
+           bool no_sync); /* true: skip the tu_flush_for_stage step */
 
 template
 void
@@ -965,6 +970,10 @@ tu7_set_thread_br_patchpoint(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              bool force_disable_cb);
 
+void
+tu7_set_thread_both_patchpoint(struct tu_cmd_buffer *cmd,
+                               struct tu_cs *cs);
+
 /* For bin offsetting we want to do "Euclidean division," where the remainder
  * (i.e. the offset of the bin) is always positive. Unfortunately C/C++
  * remainder and division don't do this, so we have to implement it ourselves.
/*
 * Patch section for tu_event.cc, vk_synchronization.c and
 * vk_synchronization.h.
 *
 *  - tu_CmdSetEvent2(): unless the dependency info carries
 *    VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR, calls tu_barrier() with
 *    no_sync = true and then tu_emit_cache_flush() before tu_write_event().
 *  - tu_CmdWaitEvents2(): becomes a CHIP template. For each event, on A7XX and
 *    newer, the CP_WAIT_REG_MEM is bracketed by
 *    tu7_set_thread_both_patchpoint() / tu7_set_thread_br_patchpoint() when
 *    the translated destination stage is <= TU_STAGE_BV. The trailing
 *    tu_barrier() runs only if at least one element of pDependencyInfos has
 *    the asymmetric flag; each element is tested individually.
 *  - vk_collect_dependency_info_dst_stages() (new): ORs together the
 *    dstStageMask of every memory, buffer and image barrier in one
 *    VkDependencyInfo.
 *
 * NOTE(review): line breaks and indentation were rebuilt from flattened text;
 * context-line whitespace should be checked against the tree before applying.
 * NOTE(review): the `<chip CHIP>` / `<CHIP>` template arguments were missing
 * from the flattened text and are restored on the assumption that extraction
 * stripped them -- confirm against the tree.
 */
diff --git a/src/freedreno/vulkan/tu_event.cc b/src/freedreno/vulkan/tu_event.cc
index 337841d2e37..cdbd89aa23f 100644
--- a/src/freedreno/vulkan/tu_event.cc
+++ b/src/freedreno/vulkan/tu_event.cc
@@ -114,6 +114,13 @@ tu_CmdSetEvent2(VkCommandBuffer commandBuffer,
    VkPipelineStageFlags2 src_stage_mask =
       vk_collect_dependency_info_src_stages(pDependencyInfo);
 
+   if (!(pDependencyInfo->dependencyFlags &
+         VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR)) {
+      tu_barrier(cmd, 1, pDependencyInfo, true);
+      /* Force emit any flushes before the RB_DONE_TS is emitted below. */
+      tu_emit_cache_flush<CHIP>(cmd);
+   }
+
    tu_write_event<CHIP>(cmd, event, src_stage_mask, 1);
 }
 TU_GENX(tu_CmdSetEvent2);
@@ -131,6 +138,7 @@ tu_CmdResetEvent2(VkCommandBuffer commandBuffer,
 }
 TU_GENX(tu_CmdResetEvent2);
 
+template <chip CHIP>
 VKAPI_ATTR void VKAPI_CALL
 tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
                   uint32_t eventCount,
@@ -140,9 +148,33 @@ tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
 
+   bool skip_barrier = true;
+
    for (uint32_t i = 0; i < eventCount; i++) {
       VK_FROM_HANDLE(tu_event, event, pEvents[i]);
 
+      /* Without ASYMMETRIC_EVENT, CmdSetEvent2 got this same dependency info
+       * and already emitted its flushes/invalidates before signalling. If any
+       * waited event is asymmetric, fall back to a full pipeline barrier.
+       */
+      if (pDependencyInfos[i].dependencyFlags &
+          VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR)
+         skip_barrier = false;
+
+      /* If concurrent binning is enabled, and the dstStage includes vertex
+       * stages, make BV also wait for the event.
+       */
+      bool wait_bv = false;
+      if (CHIP >= A7XX) {
+         VkPipelineStageFlags2 dst_stage_mask =
+            vk_collect_dependency_info_dst_stages(&pDependencyInfos[i]);
+         enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dst_stage_mask);
+         if (dst_stage <= TU_STAGE_BV) {
+            wait_bv = true;
+            tu7_set_thread_both_patchpoint(cmd, cs);
+         }
+      }
+
       tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
       tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                      CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
@@ -150,7 +182,12 @@ tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
       tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
       tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
       tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
+
+      if (wait_bv)
+         tu7_set_thread_br_patchpoint(cmd, cs, false);
    }
 
-   tu_barrier(cmd, eventCount, pDependencyInfos);
+   if (!skip_barrier)
+      tu_barrier(cmd, eventCount, pDependencyInfos, false);
 }
+TU_GENX(tu_CmdWaitEvents2);
diff --git a/src/vulkan/runtime/vk_synchronization.c b/src/vulkan/runtime/vk_synchronization.c
index bfd13ab09b8..ea789a16890 100644
--- a/src/vulkan/runtime/vk_synchronization.c
+++ b/src/vulkan/runtime/vk_synchronization.c
@@ -110,6 +110,19 @@ vk_collect_dependency_info_src_stages(const VkDependencyInfo* pDependencyInfo)
    return stages;
 }
 
+VkPipelineStageFlags2
+vk_collect_dependency_info_dst_stages(const VkDependencyInfo* pDependencyInfo)
+{
+   VkPipelineStageFlags2 stages = 0;
+   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
+      stages |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
+   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
+      stages |= pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
+   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
+      stages |= pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
+   return stages;
+}
+
 VKAPI_ATTR void VKAPI_CALL
 vk_common_CmdWriteTimestamp(
    VkCommandBuffer commandBuffer,
diff --git a/src/vulkan/runtime/vk_synchronization.h b/src/vulkan/runtime/vk_synchronization.h
index 5142005b27f..20804bf21d2 100644
--- a/src/vulkan/runtime/vk_synchronization.h
+++ b/src/vulkan/runtime/vk_synchronization.h
@@ -89,6 +89,10 @@ vk_filter_dst_access_flags2(VkPipelineStageFlags2 stages,
 VkPipelineStageFlags2
 vk_collect_dependency_info_src_stages(const VkDependencyInfo* pDependencyInfo);
 
+/** Union all the dstStageMasks on a VkDependencyInfo */
+VkPipelineStageFlags2
+vk_collect_dependency_info_dst_stages(const VkDependencyInfo* pDependencyInfo);
+
 #ifdef __cplusplus
 }
 #endif