tu: Optimize sync2 event handling in the non-asymmetric case

Previously we always fell back to emitting a pipeline barrier, which
effectively defeats the point of having the event. But with sync2 and the
guarantee that the src/dst dependency infos match, we can instead emit the
flushes before writing the event and actually use the event as intended.
As a bonus, this also allows the BV to run ahead of the BR.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40552>
This commit is contained in:
Connor Abbott 2026-03-20 14:20:38 -04:00 committed by Marge Bot
parent 012d72f2b0
commit 7171c3dd71
5 changed files with 93 additions and 8 deletions

View file

@ -457,6 +457,25 @@ tu7_set_thread_br_patchpoint(struct tu_cmd_buffer *cmd,
CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
}
void
tu7_set_thread_both_patchpoint(struct tu_cmd_buffer *cmd,
struct tu_cs *cs)
{
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
struct tu_cb_control_point info = {
.type = TU_CB_CONTROL_TYPE_PATCHPOINT,
.patchpoint = cs->cur,
.patch_value = CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH),
.original_value = CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE,
};
util_dynarray_append(&cmd->cb_control_points, info);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
}
/* "Normal" cache flushes outside the renderpass, that don't require any special handling */
template <chip CHIP>
void
@ -6042,7 +6061,7 @@ vk2tu_src_stage(struct tu_device *dev,
return stage;
}
static enum tu_stage
enum tu_stage
vk2tu_dst_stage(struct tu_device *dev,
VkPipelineStageFlags2 vk_stages)
{
@ -9652,7 +9671,8 @@ tu_CmdEndRendering2EXT(VkCommandBuffer commandBuffer,
void
tu_barrier(struct tu_cmd_buffer *cmd,
uint32_t dep_count,
const VkDependencyInfo *dep_infos)
const VkDependencyInfo *dep_infos,
bool no_sync)
{
VkPipelineStageFlags2 srcStage = 0;
VkPipelineStageFlags2 dstStage = 0;
@ -9838,9 +9858,11 @@ tu_barrier(struct tu_cmd_buffer *cmd,
tu_flush_for_access(cache, src_flags, dst_flags);
enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
TU_CALLX(cmd->device, tu_flush_for_stage)(cache, src_stage, dst_stage);
if (!no_sync) {
enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
TU_CALLX(cmd->device, tu_flush_for_stage)(cache, src_stage, dst_stage);
}
}
VKAPI_ATTR void VKAPI_CALL
@ -9849,7 +9871,7 @@ tu_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
{
VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
tu_barrier(cmd_buffer, 1, pDependencyInfo);
tu_barrier(cmd_buffer, 1, pDependencyInfo, false);
}
template <chip CHIP>

View file

@ -215,6 +215,10 @@ enum tu_stage {
TU_STAGE_BOTTOM,
};
/* Map a Vulkan destination pipeline stage mask to an enum tu_stage value.
 * Exposed in this header so that callers outside the defining file can use
 * it (the definition is not static).
 */
enum tu_stage
vk2tu_dst_stage(struct tu_device *dev,
VkPipelineStageFlags2 vk_stages);
enum tu_cmd_flush_bits {
TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
@ -903,7 +907,8 @@ struct tu_vis_stream_patchpoint_cs {
void
tu_barrier(struct tu_cmd_buffer *cmd,
uint32_t dep_count,
const VkDependencyInfo *dep_info);
const VkDependencyInfo *dep_info,
bool no_sync);
template <chip CHIP>
void
@ -965,6 +970,10 @@ tu7_set_thread_br_patchpoint(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
bool force_disable_cb);
/* Emit a CP_THREAD_CONTROL packet into cs that selects BR with concurrent
 * binning disabled, and record a patchpoint in cmd->cb_control_points whose
 * patch value selects both threads (CP_SET_THREAD_BOTH).
 */
void
tu7_set_thread_both_patchpoint(struct tu_cmd_buffer *cmd,
struct tu_cs *cs);
/* For bin offsetting we want to do "Euclidean division," where the remainder
* (i.e. the offset of the bin) is always positive. Unfortunately C/C++
* remainder and division don't do this, so we have to implement it ourselves.

View file

@ -114,6 +114,13 @@ tu_CmdSetEvent2(VkCommandBuffer commandBuffer,
VkPipelineStageFlags2 src_stage_mask =
vk_collect_dependency_info_src_stages(pDependencyInfo);
if (!(pDependencyInfo->dependencyFlags &
VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR)) {
tu_barrier(cmd, 1, pDependencyInfo, true);
/* Force emit any flushes before the RB_DONE_TS is emitted below. */
tu_emit_cache_flush<CHIP>(cmd);
}
tu_write_event<CHIP>(cmd, event, src_stage_mask, 1);
}
TU_GENX(tu_CmdSetEvent2);
@ -131,6 +138,7 @@ tu_CmdResetEvent2(VkCommandBuffer commandBuffer,
}
TU_GENX(tu_CmdResetEvent2);
template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
uint32_t eventCount,
@ -140,9 +148,33 @@ tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
bool skip_barrier = true;
for (uint32_t i = 0; i < eventCount; i++) {
VK_FROM_HANDLE(tu_event, event, pEvents[i]);
/* If the dependency info in CmdSetEvent is the same, we can rely on all
* flushes/invalidates landing by the time the event is signalled.
* Otherwise, we have to do a full pipeline barrier.
*/
if (pDependencyInfos->dependencyFlags &
VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR)
skip_barrier = false;
/* If concurrent binning is enabled, and the dstStage includes vertex
* stages, make BV also wait for the event.
*/
bool wait_bv = false;
if (CHIP >= A7XX) {
VkPipelineStageFlags2 dst_stage_mask =
vk_collect_dependency_info_dst_stages(&pDependencyInfos[i]);
enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dst_stage_mask);
if (dst_stage <= TU_STAGE_BV) {
wait_bv = true;
tu7_set_thread_both_patchpoint(cmd, cs);
}
}
tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
@ -150,7 +182,12 @@ tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
if (wait_bv)
tu7_set_thread_br_patchpoint(cmd, cs, false);
}
tu_barrier(cmd, eventCount, pDependencyInfos);
if (!skip_barrier)
tu_barrier(cmd, eventCount, pDependencyInfos, false);
}
TU_GENX(tu_CmdWaitEvents2);

View file

@ -110,6 +110,19 @@ vk_collect_dependency_info_src_stages(const VkDependencyInfo* pDependencyInfo)
return stages;
}
/* OR together the dstStageMask of every memory, buffer memory and image
 * memory barrier in pDependencyInfo. Returns 0 when all three barrier counts
 * are zero.
 */
VkPipelineStageFlags2
vk_collect_dependency_info_dst_stages(const VkDependencyInfo* pDependencyInfo)
{
   VkPipelineStageFlags2 dst_mask = 0;
   uint32_t idx;

   /* Global memory barriers. */
   for (idx = 0; idx != pDependencyInfo->memoryBarrierCount; ++idx) {
      dst_mask |= pDependencyInfo->pMemoryBarriers[idx].dstStageMask;
   }

   /* Buffer memory barriers. */
   for (idx = 0; idx != pDependencyInfo->bufferMemoryBarrierCount; ++idx) {
      dst_mask |= pDependencyInfo->pBufferMemoryBarriers[idx].dstStageMask;
   }

   /* Image memory barriers. */
   for (idx = 0; idx != pDependencyInfo->imageMemoryBarrierCount; ++idx) {
      dst_mask |= pDependencyInfo->pImageMemoryBarriers[idx].dstStageMask;
   }

   return dst_mask;
}
VKAPI_ATTR void VKAPI_CALL
vk_common_CmdWriteTimestamp(
VkCommandBuffer commandBuffer,

View file

@ -89,6 +89,10 @@ vk_filter_dst_access_flags2(VkPipelineStageFlags2 stages,
VkPipelineStageFlags2
vk_collect_dependency_info_src_stages(const VkDependencyInfo* pDependencyInfo);
/** Union all the dstStageMasks on a VkDependencyInfo
 *
 * ORs together the dstStageMask of every memory barrier, buffer memory
 * barrier and image memory barrier in pDependencyInfo. The result is 0 when
 * the dependency info contains no barriers.
 */
VkPipelineStageFlags2
vk_collect_dependency_info_dst_stages(const VkDependencyInfo* pDependencyInfo);
#ifdef __cplusplus
}
#endif