mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 02:48:06 +02:00
tu: Optimize sync2 event handling in the non-asymmetric case
Previously we always fell back to emitting a pipeline barrier, which effectively defeats the purpose of having the event. But with sync2 and the guarantee that the src/dst dependency infos match, we can instead emit the flushes before writing the event and actually use the event as intended. As a bonus, this also allows the BV to run ahead of the BR. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40552>
This commit is contained in:
parent
012d72f2b0
commit
7171c3dd71
5 changed files with 93 additions and 8 deletions
|
|
@ -457,6 +457,25 @@ tu7_set_thread_br_patchpoint(struct tu_cmd_buffer *cmd,
|
|||
CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
|
||||
}
|
||||
|
||||
void
|
||||
tu7_set_thread_both_patchpoint(struct tu_cmd_buffer *cmd,
|
||||
struct tu_cs *cs)
|
||||
{
|
||||
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
|
||||
|
||||
struct tu_cb_control_point info = {
|
||||
.type = TU_CB_CONTROL_TYPE_PATCHPOINT,
|
||||
.patchpoint = cs->cur,
|
||||
.patch_value = CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH),
|
||||
.original_value = CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
|
||||
CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE,
|
||||
};
|
||||
util_dynarray_append(&cmd->cb_control_points, info);
|
||||
|
||||
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
|
||||
CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
|
||||
}
|
||||
|
||||
/* "Normal" cache flushes outside the renderpass, that don't require any special handling */
|
||||
template <chip CHIP>
|
||||
void
|
||||
|
|
@ -6042,7 +6061,7 @@ vk2tu_src_stage(struct tu_device *dev,
|
|||
return stage;
|
||||
}
|
||||
|
||||
static enum tu_stage
|
||||
enum tu_stage
|
||||
vk2tu_dst_stage(struct tu_device *dev,
|
||||
VkPipelineStageFlags2 vk_stages)
|
||||
{
|
||||
|
|
@ -9652,7 +9671,8 @@ tu_CmdEndRendering2EXT(VkCommandBuffer commandBuffer,
|
|||
void
|
||||
tu_barrier(struct tu_cmd_buffer *cmd,
|
||||
uint32_t dep_count,
|
||||
const VkDependencyInfo *dep_infos)
|
||||
const VkDependencyInfo *dep_infos,
|
||||
bool no_sync)
|
||||
{
|
||||
VkPipelineStageFlags2 srcStage = 0;
|
||||
VkPipelineStageFlags2 dstStage = 0;
|
||||
|
|
@ -9838,9 +9858,11 @@ tu_barrier(struct tu_cmd_buffer *cmd,
|
|||
|
||||
tu_flush_for_access(cache, src_flags, dst_flags);
|
||||
|
||||
enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
|
||||
enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
|
||||
TU_CALLX(cmd->device, tu_flush_for_stage)(cache, src_stage, dst_stage);
|
||||
if (!no_sync) {
|
||||
enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
|
||||
enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
|
||||
TU_CALLX(cmd->device, tu_flush_for_stage)(cache, src_stage, dst_stage);
|
||||
}
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
|
|
@ -9849,7 +9871,7 @@ tu_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
|
|||
{
|
||||
VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
|
||||
tu_barrier(cmd_buffer, 1, pDependencyInfo);
|
||||
tu_barrier(cmd_buffer, 1, pDependencyInfo, false);
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
|
|
|
|||
|
|
@ -215,6 +215,10 @@ enum tu_stage {
|
|||
TU_STAGE_BOTTOM,
|
||||
};
|
||||
|
||||
enum tu_stage
|
||||
vk2tu_dst_stage(struct tu_device *dev,
|
||||
VkPipelineStageFlags2 vk_stages);
|
||||
|
||||
enum tu_cmd_flush_bits {
|
||||
TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
|
||||
TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
|
||||
|
|
@ -903,7 +907,8 @@ struct tu_vis_stream_patchpoint_cs {
|
|||
void
|
||||
tu_barrier(struct tu_cmd_buffer *cmd,
|
||||
uint32_t dep_count,
|
||||
const VkDependencyInfo *dep_info);
|
||||
const VkDependencyInfo *dep_info,
|
||||
bool no_sync);
|
||||
|
||||
template <chip CHIP>
|
||||
void
|
||||
|
|
@ -965,6 +970,10 @@ tu7_set_thread_br_patchpoint(struct tu_cmd_buffer *cmd,
|
|||
struct tu_cs *cs,
|
||||
bool force_disable_cb);
|
||||
|
||||
void
|
||||
tu7_set_thread_both_patchpoint(struct tu_cmd_buffer *cmd,
|
||||
struct tu_cs *cs);
|
||||
|
||||
/* For bin offsetting we want to do "Euclidean division," where the remainder
|
||||
* (i.e. the offset of the bin) is always positive. Unfortunately C/C++
|
||||
* remainder and division don't do this, so we have to implement it ourselves.
|
||||
|
|
|
|||
|
|
@ -114,6 +114,13 @@ tu_CmdSetEvent2(VkCommandBuffer commandBuffer,
|
|||
VkPipelineStageFlags2 src_stage_mask =
|
||||
vk_collect_dependency_info_src_stages(pDependencyInfo);
|
||||
|
||||
if (!(pDependencyInfo->dependencyFlags &
|
||||
VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR)) {
|
||||
tu_barrier(cmd, 1, pDependencyInfo, true);
|
||||
/* Force emit any flushes before the RB_DONE_TS is emitted below. */
|
||||
tu_emit_cache_flush<CHIP>(cmd);
|
||||
}
|
||||
|
||||
tu_write_event<CHIP>(cmd, event, src_stage_mask, 1);
|
||||
}
|
||||
TU_GENX(tu_CmdSetEvent2);
|
||||
|
|
@ -131,6 +138,7 @@ tu_CmdResetEvent2(VkCommandBuffer commandBuffer,
|
|||
}
|
||||
TU_GENX(tu_CmdResetEvent2);
|
||||
|
||||
template <chip CHIP>
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
|
||||
uint32_t eventCount,
|
||||
|
|
@ -140,9 +148,33 @@ tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
|
|||
VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
|
||||
struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
|
||||
|
||||
bool skip_barrier = true;
|
||||
|
||||
for (uint32_t i = 0; i < eventCount; i++) {
|
||||
VK_FROM_HANDLE(tu_event, event, pEvents[i]);
|
||||
|
||||
/* If the dependency info in CmdSetEvent is the same, we can rely on all
|
||||
* flushes/invalidates landing by the time the event is signalled.
|
||||
* Otherwise, we have to do a full pipeline barrier.
|
||||
*/
|
||||
if (pDependencyInfos->dependencyFlags &
|
||||
VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR)
|
||||
skip_barrier = false;
|
||||
|
||||
/* If concurrent binning is enabled, and the dstStage includes vertex
|
||||
* stages, make BV also wait for the event.
|
||||
*/
|
||||
bool wait_bv = false;
|
||||
if (CHIP >= A7XX) {
|
||||
VkPipelineStageFlags2 dst_stage_mask =
|
||||
vk_collect_dependency_info_dst_stages(&pDependencyInfos[i]);
|
||||
enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dst_stage_mask);
|
||||
if (dst_stage <= TU_STAGE_BV) {
|
||||
wait_bv = true;
|
||||
tu7_set_thread_both_patchpoint(cmd, cs);
|
||||
}
|
||||
}
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
|
||||
tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
|
||||
CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
|
||||
|
|
@ -150,7 +182,12 @@ tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
|
|||
tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
|
||||
tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
|
||||
tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
|
||||
|
||||
if (wait_bv)
|
||||
tu7_set_thread_br_patchpoint(cmd, cs, false);
|
||||
}
|
||||
|
||||
tu_barrier(cmd, eventCount, pDependencyInfos);
|
||||
if (!skip_barrier)
|
||||
tu_barrier(cmd, eventCount, pDependencyInfos, false);
|
||||
}
|
||||
TU_GENX(tu_CmdWaitEvents2);
|
||||
|
|
|
|||
|
|
@ -110,6 +110,19 @@ vk_collect_dependency_info_src_stages(const VkDependencyInfo* pDependencyInfo)
|
|||
return stages;
|
||||
}
|
||||
|
||||
/* Return the bitwise OR of the dstStageMask of every memory, buffer and
 * image barrier referenced by pDependencyInfo. The result is 0 when all
 * three barrier counts are zero.
 */
VkPipelineStageFlags2
vk_collect_dependency_info_dst_stages(const VkDependencyInfo* pDependencyInfo)
{
   VkPipelineStageFlags2 dst_mask = 0;
   uint32_t remaining;

   /* OR is order-independent, so each array is simply walked back to front
    * with a countdown.
    */
   remaining = pDependencyInfo->memoryBarrierCount;
   while (remaining > 0) {
      remaining--;
      dst_mask |= pDependencyInfo->pMemoryBarriers[remaining].dstStageMask;
   }

   remaining = pDependencyInfo->bufferMemoryBarrierCount;
   while (remaining > 0) {
      remaining--;
      dst_mask |= pDependencyInfo->pBufferMemoryBarriers[remaining].dstStageMask;
   }

   remaining = pDependencyInfo->imageMemoryBarrierCount;
   while (remaining > 0) {
      remaining--;
      dst_mask |= pDependencyInfo->pImageMemoryBarriers[remaining].dstStageMask;
   }

   return dst_mask;
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
vk_common_CmdWriteTimestamp(
|
||||
VkCommandBuffer commandBuffer,
|
||||
|
|
|
|||
|
|
@ -89,6 +89,10 @@ vk_filter_dst_access_flags2(VkPipelineStageFlags2 stages,
|
|||
VkPipelineStageFlags2
|
||||
vk_collect_dependency_info_src_stages(const VkDependencyInfo* pDependencyInfo);
|
||||
|
||||
/** Union all the dstStageMasks on a VkDependencyInfo */
|
||||
VkPipelineStageFlags2
|
||||
vk_collect_dependency_info_dst_stages(const VkDependencyInfo* pDependencyInfo);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue