diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index 6359f724cf3..1b1b8378536 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -164,6 +164,7 @@ struct panvk_cs_deps { enum mali_cs_condition cond; struct cs_index cond_value; } dst[PANVK_SUBQUEUE_COUNT]; + bool needs_layout_transitions; }; enum panvk_sb_ids { @@ -468,9 +469,16 @@ void panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf, enum panvk_subqueue_id subqueue, struct cs_index scratch_regs); -void panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf, - const VkDependencyInfo *in, - struct panvk_cs_deps *out); +enum panvk_barrier_stage { + PANVK_BARRIER_STAGE_FIRST, + PANVK_BARRIER_STAGE_AFTER_LAYOUT_TRANSITION, +}; + +void panvk_per_arch(add_cs_deps)( + struct panvk_cmd_buffer *cmdbuf, + enum panvk_barrier_stage barrier_stage, + const VkDependencyInfo *in, + struct panvk_cs_deps *out); VkResult panvk_per_arch(cmd_prepare_exec_cmd_for_draws)( struct panvk_cmd_buffer *primary, struct panvk_cmd_buffer *secondary); diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c index eea0832a8ee..cb3ad261b35 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c @@ -480,41 +480,75 @@ collect_cs_deps(struct panvk_cmd_buffer *cmdbuf, static void normalize_dependency(VkPipelineStageFlags2 *src_stages, VkPipelineStageFlags2 *dst_stages, - VkAccessFlags2 *src_access, VkAccessFlags2 *dst_access, - uint32_t src_qfi, uint32_t dst_qfi) + VkPipelineStageFlags2 transition_stages, + VkAccessFlags2 *src_access, + VkAccessFlags2 *dst_access, + VkAccessFlags2 transition_access, + uint32_t src_qfi, uint32_t dst_qfi, + enum panvk_barrier_stage barrier_stage) { + switch (barrier_stage) { + case PANVK_BARRIER_STAGE_FIRST: + if (transition_stages) { + /* We need to do layout transition, so we want to sync 
src with layout + * transition, and then later layout transition with dst. + */ + *dst_stages = transition_stages; + *dst_access = transition_access; + } + break; + case PANVK_BARRIER_STAGE_AFTER_LAYOUT_TRANSITION: + /* If transition_stages is empty, there was no layout transition and so we + * won't be waiting for anything. + */ + *src_stages = transition_stages; + *src_access = transition_access; + break; + } + /* Perform queue family ownership transfer if src and dst are unequal. */ if (src_qfi != dst_qfi) { - /* queue family acquire operation */ - switch (src_qfi) { - case VK_QUEUE_FAMILY_EXTERNAL: - /* no execution dependency and no availability operation */ - *src_stages = VK_PIPELINE_STAGE_2_NONE; - *src_access = VK_ACCESS_2_NONE; - break; - case VK_QUEUE_FAMILY_FOREIGN_EXT: - /* treat the foreign queue as the host */ - *src_stages = VK_PIPELINE_STAGE_2_HOST_BIT; - *src_access = VK_ACCESS_2_HOST_WRITE_BIT; - break; - default: - break; + /* Only normalize if we're actually syncing acquire, and not layout + * transition, with dst. 
+ */ + if (barrier_stage == PANVK_BARRIER_STAGE_FIRST) { + /* queue family acquire operation */ + switch (src_qfi) { + case VK_QUEUE_FAMILY_EXTERNAL: + /* no execution dependency and no availability operation */ + *src_stages = VK_PIPELINE_STAGE_2_NONE; + *src_access = VK_ACCESS_2_NONE; + break; + case VK_QUEUE_FAMILY_FOREIGN_EXT: + /* treat the foreign queue as the host */ + *src_stages = VK_PIPELINE_STAGE_2_HOST_BIT; + *src_access = VK_ACCESS_2_HOST_WRITE_BIT; + break; + default: + break; + } } - /* queue family release operation */ - switch (dst_qfi) { - case VK_QUEUE_FAMILY_EXTERNAL: - /* no execution dependency and no visibility operation */ - *dst_stages = VK_PIPELINE_STAGE_2_NONE; - *dst_access = VK_ACCESS_2_NONE; - break; - case VK_QUEUE_FAMILY_FOREIGN_EXT: - /* treat the foreign queue as the host */ - *dst_stages = VK_PIPELINE_STAGE_2_HOST_BIT; - *dst_access = VK_ACCESS_2_HOST_WRITE_BIT; - break; - default: - break; + /* Only normalize if we're actually syncing the latest of either src or + * layout transition, with release. 
+ */ + if ((barrier_stage == PANVK_BARRIER_STAGE_FIRST && !transition_stages) || + (barrier_stage == PANVK_BARRIER_STAGE_AFTER_LAYOUT_TRANSITION && transition_stages)) { + /* queue family release operation */ + switch (dst_qfi) { + case VK_QUEUE_FAMILY_EXTERNAL: + /* no execution dependency and no visibility operation */ + *dst_stages = VK_PIPELINE_STAGE_2_NONE; + *dst_access = VK_ACCESS_2_NONE; + break; + case VK_QUEUE_FAMILY_FOREIGN_EXT: + /* treat the foreign queue as the host */ + *dst_stages = VK_PIPELINE_STAGE_2_HOST_BIT; + *dst_access = VK_ACCESS_2_HOST_WRITE_BIT; + break; + default: + break; + } } } @@ -527,6 +561,7 @@ normalize_dependency(VkPipelineStageFlags2 *src_stages, void panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf, + enum panvk_barrier_stage barrier_stage, const VkDependencyInfo *in, struct panvk_cs_deps *out) { @@ -536,8 +571,11 @@ panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf, VkPipelineStageFlags2 dst_stages = barrier->dstStageMask; VkAccessFlags2 src_access = barrier->srcAccessMask; VkAccessFlags2 dst_access = barrier->dstAccessMask; - normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access, - VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED); + normalize_dependency(&src_stages, &dst_stages, 0, + &src_access, &dst_access, 0, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + barrier_stage); collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access, out); @@ -549,9 +587,11 @@ panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf, VkPipelineStageFlags2 dst_stages = barrier->dstStageMask; VkAccessFlags2 src_access = barrier->srcAccessMask; VkAccessFlags2 dst_access = barrier->dstAccessMask; - normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access, + normalize_dependency(&src_stages, &dst_stages, 0, + &src_access, &dst_access, 0, barrier->srcQueueFamilyIndex, - barrier->dstQueueFamilyIndex); + barrier->dstQueueFamilyIndex, + barrier_stage); collect_cs_deps(cmdbuf, 
src_stages, dst_stages, src_access, dst_access, out); @@ -563,12 +603,21 @@ panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf, VkPipelineStageFlags2 dst_stages = barrier->dstStageMask; VkAccessFlags2 src_access = barrier->srcAccessMask; VkAccessFlags2 dst_access = barrier->dstAccessMask; - normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access, + VkPipelineStageFlags2 transition_stages; + VkAccessFlags2 transition_access; + panvk_per_arch(transition_image_layout_sync_scope)(barrier, + &transition_stages, &transition_access); + normalize_dependency(&src_stages, &dst_stages, transition_stages, + &src_access, &dst_access, transition_access, barrier->srcQueueFamilyIndex, - barrier->dstQueueFamilyIndex); + barrier->dstQueueFamilyIndex, + barrier_stage); collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access, out); + + if (barrier_stage == PANVK_BARRIER_STAGE_FIRST && transition_stages) + out->needs_layout_transitions = true; } } @@ -693,7 +742,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo) { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); - struct panvk_cs_deps deps = {0}; /* Intra render pass barriers can be skipped iff we're inside a render * pass. 
*/ @@ -701,12 +749,32 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer, (pDependencyInfo->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT)) return; - panvk_per_arch(add_cs_deps)(cmdbuf, pDependencyInfo, &deps); + struct panvk_cs_deps deps = {0}; + + panvk_per_arch(add_cs_deps)(cmdbuf, PANVK_BARRIER_STAGE_FIRST, pDependencyInfo, &deps); if (deps.needs_draw_flush) panvk_per_arch(cmd_flush_draws)(cmdbuf); panvk_per_arch(emit_barrier)(cmdbuf, deps); + + if (deps.needs_layout_transitions) { + for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) { + const VkImageMemoryBarrier2 *barrier = &pDependencyInfo->pImageMemoryBarriers[i]; + + panvk_per_arch(cmd_transition_image_layout)(commandBuffer, barrier); + } + + struct panvk_cs_deps trans_deps = {0}; + + panvk_per_arch(add_cs_deps)( + cmdbuf, PANVK_BARRIER_STAGE_AFTER_LAYOUT_TRANSITION, + pDependencyInfo, &trans_deps); + + assert(!trans_deps.needs_draw_flush); + + panvk_per_arch(emit_barrier)(cmdbuf, trans_deps); + } } #if PAN_ARCH >= 11 diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c index 815d6f9e2ac..fa007d7e691 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c @@ -28,7 +28,7 @@ panvk_per_arch(CmdResetEvent2)(VkCommandBuffer commandBuffer, VkEvent _event, }; struct panvk_cs_deps deps = {0}; - panvk_per_arch(add_cs_deps)(cmdbuf, &info, &deps); + panvk_per_arch(add_cs_deps)(cmdbuf, PANVK_BARRIER_STAGE_FIRST, &info, &deps); for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i); @@ -66,7 +66,7 @@ panvk_per_arch(CmdSetEvent2)(VkCommandBuffer commandBuffer, VkEvent _event, VK_FROM_HANDLE(panvk_event, event, _event); struct panvk_cs_deps deps = {0}; - panvk_per_arch(add_cs_deps)(cmdbuf, pDependencyInfo, &deps); + panvk_per_arch(add_cs_deps)(cmdbuf, PANVK_BARRIER_STAGE_FIRST, pDependencyInfo, &deps); if 
(deps.needs_draw_flush) panvk_per_arch(cmd_flush_draws)(cmdbuf); @@ -106,12 +106,13 @@ panvk_per_arch(CmdSetEvent2)(VkCommandBuffer commandBuffer, VkEvent _event, } static void -cmd_wait_event(struct panvk_cmd_buffer *cmdbuf, struct panvk_event *event, - const VkDependencyInfo *info) +cmd_wait_event(struct panvk_cmd_buffer *cmdbuf, + struct panvk_event *event, const VkDependencyInfo *info, + struct panvk_cs_deps *trans_deps, bool *needs_trans_barrier) { struct panvk_cs_deps deps = {0}; - panvk_per_arch(add_cs_deps)(cmdbuf, info, &deps); + panvk_per_arch(add_cs_deps)(cmdbuf, PANVK_BARRIER_STAGE_FIRST, info, &deps); for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i); @@ -129,6 +130,20 @@ cmd_wait_event(struct panvk_cmd_buffer *cmdbuf, struct panvk_event *event, seqno, sync_addr); } } + + if (deps.needs_layout_transitions) { + for (uint32_t i = 0; i < info->imageMemoryBarrierCount; i++) { + const VkImageMemoryBarrier2 *barrier = &info->pImageMemoryBarriers[i]; + + panvk_per_arch(cmd_transition_image_layout)( + panvk_cmd_buffer_to_handle(cmdbuf), barrier); + } + + panvk_per_arch(add_cs_deps)( + cmdbuf, PANVK_BARRIER_STAGE_AFTER_LAYOUT_TRANSITION, + info, trans_deps); + *needs_trans_barrier = true; + } } VKAPI_ATTR void VKAPI_CALL @@ -138,9 +153,19 @@ panvk_per_arch(CmdWaitEvents2)(VkCommandBuffer commandBuffer, { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + struct panvk_cs_deps trans_deps = {0}; + bool needs_trans_barrier = false; + for (uint32_t i = 0; i < eventCount; i++) { VK_FROM_HANDLE(panvk_event, event, pEvents[i]); + const VkDependencyInfo *info = &pDependencyInfos[i]; - cmd_wait_event(cmdbuf, event, &pDependencyInfos[i]); + cmd_wait_event(cmdbuf, event, info, &trans_deps, &needs_trans_barrier); + } + + if (needs_trans_barrier) { + assert(!trans_deps.needs_draw_flush); + + panvk_per_arch(emit_barrier)(cmdbuf, trans_deps); } } diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c 
b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c index c69c0fcd136..17fd54e90f1 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c @@ -346,6 +346,17 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer, panvk_per_arch(cmd_open_batch)(cmdbuf); } + + for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) { + const VkImageMemoryBarrier2 *barrier = &pDependencyInfo->pImageMemoryBarriers[i]; + + panvk_per_arch(cmd_transition_image_layout)(commandBuffer, barrier); + } + + /* If we had any layout transition dispatches, the batch will be closed at + * this point, therefore establishing the sync between itself and the + * commands that follow. + */ } static void diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_event.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_event.c index 85ff28c114d..d87148546ba 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_event.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_event.c @@ -111,6 +111,19 @@ panvk_per_arch(CmdWaitEvents2)(VkCommandBuffer commandBuffer, for (uint32_t i = 0; i < eventCount; i++) { VK_FROM_HANDLE(panvk_event, event, pEvents[i]); + const VkDependencyInfo *info = &pDependencyInfos[i]; + panvk_add_wait_event_operation(cmdbuf, event); + + for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++) { + const VkImageMemoryBarrier2 *barrier = &info->pImageMemoryBarriers[j]; + + panvk_per_arch(cmd_transition_image_layout)(commandBuffer, barrier); + } + + /* We don't need to do anything here to establish the sync between layout + * transition dispatches and the commands following the barrier. See the + * comment in ./panvk_vX_cmd_buffer.c:CmdPipelineBarrier2 for details. + */ } }