From f2c571fabff3f1ba2f0b2069f3f2df0bc9158ace Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Mon, 21 Oct 2024 12:02:51 +0300 Subject: [PATCH] anv: add tracking of involved stages in pipe flushes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lionel Landwerlin Reviewed-by: Tapani Pälli Reviewed-by: Caio Oliveira Part-of: --- src/intel/vulkan/anv_blorp.c | 26 ++++- src/intel/vulkan/anv_cmd_buffer.c | 2 + src/intel/vulkan/anv_private.h | 15 ++- src/intel/vulkan/anv_util.c | 21 +++- .../vulkan/genX_acceleration_structure.c | 12 ++- src/intel/vulkan/genX_blorp_exec.c | 4 + src/intel/vulkan/genX_cmd_buffer.c | 97 +++++++++++++++---- src/intel/vulkan/genX_cmd_compute.c | 6 +- src/intel/vulkan/genX_cmd_draw.c | 12 +++ .../vulkan/genX_cmd_draw_generated_indirect.h | 12 +++ src/intel/vulkan/genX_query.c | 21 +++- 11 files changed, 198 insertions(+), 30 deletions(-) diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c index 816b4bb4b42..1921e91013d 100644 --- a/src/intel/vulkan/anv_blorp.c +++ b/src/intel/vulkan/anv_blorp.c @@ -667,7 +667,12 @@ void anv_CmdCopyImage2( anv_cmd_buffer_is_compute_queue(cmd_buffer) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; - anv_add_pending_pipe_bits(cmd_buffer, pipe_bits, + anv_add_pending_pipe_bits(cmd_buffer, + (batch.flags & BLORP_BATCH_USE_COMPUTE) ? + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT : + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + pipe_bits, "Copy flush before astc emu"); for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { @@ -819,7 +824,12 @@ void anv_CmdCopyBufferToImage2( anv_cmd_buffer_is_compute_queue(cmd_buffer) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; - anv_add_pending_pipe_bits(cmd_buffer, pipe_bits, + anv_add_pending_pipe_bits(cmd_buffer, + (batch.flags & BLORP_BATCH_USE_COMPUTE) ? 
+ VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT : + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + pipe_bits, "Copy flush before astc emu"); for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { @@ -1177,6 +1187,8 @@ anv_cmd_buffer_update_addr( * texture cache so we don't get anything stale. */ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_HOST_BIT, + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, "before UpdateBuffer"); @@ -1886,6 +1898,8 @@ anv_fast_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, * hangs when doing a clear with WM_HZ_OP. */ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_DEPTH_STALL_BIT, "before clear hiz"); @@ -1913,6 +1927,8 @@ anv_fast_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, unsigned wa_flush = cmd_buffer->device->info->verx10 >= 125 ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0; anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT | @@ -1955,6 +1971,8 @@ anv_fast_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, */ if (cmd_buffer->device->info->verx10 < 120) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_DEPTH_STALL_BIT, "after clear hiz"); @@ -2565,6 +2583,8 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, * cache before rendering to it. 
*/ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT, "before clear DS"); @@ -2584,6 +2604,8 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, * cache before someone starts trying to do stencil on it. */ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT, "after clear DS"); diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index edfd90fbb39..60789abcbcc 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -1796,6 +1796,8 @@ anv_begin_companion_cmd_buffer_helper(struct anv_cmd_buffer **cmd_buffer, */ if (prev_cmd_buffer->device->info->has_aux_map) { anv_add_pending_pipe_bits(prev_cmd_buffer->companion_rcs_cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, "new cmd buffer with aux-tt"); } diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index ea4b2a3ed2b..a42916cb8df 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -4633,6 +4633,8 @@ struct anv_cmd_state { struct anv_cmd_compute_state compute; struct anv_cmd_ray_tracing_state rt; + VkPipelineStageFlags2 pending_src_stages; + VkPipelineStageFlags2 pending_dst_stages; enum anv_pipe_bits pending_pipe_bits; /** @@ -6766,21 +6768,30 @@ anv_dump_pipe_bits(enum anv_pipe_bits bits, struct log_stream *stream); void anv_cmd_buffer_pending_pipe_debug(struct anv_cmd_buffer *cmd_buffer, + VkPipelineStageFlags2 src_stages, + VkPipelineStageFlags2 dst_stages, enum anv_pipe_bits bits, const char* reason); static inline void anv_add_pending_pipe_bits(struct anv_cmd_buffer* cmd_buffer, + 
VkPipelineStageFlags2 src_stages, + VkPipelineStageFlags2 dst_stages, enum anv_pipe_bits bits, const char* reason) { + cmd_buffer->state.pending_src_stages |= src_stages; + cmd_buffer->state.pending_dst_stages |= dst_stages; cmd_buffer->state.pending_pipe_bits |= bits; if (unlikely(u_trace_enabled(&cmd_buffer->device->ds.trace_context))) { if (cmd_buffer->batch.pc_reasons_count < ARRAY_SIZE(cmd_buffer->batch.pc_reasons)) cmd_buffer->batch.pc_reasons[cmd_buffer->batch.pc_reasons_count++] = reason; } - if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) - anv_cmd_buffer_pending_pipe_debug(cmd_buffer, bits, reason); + if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { + anv_cmd_buffer_pending_pipe_debug(cmd_buffer, + src_stages, dst_stages, bits, + reason); + } } struct anv_performance_configuration_intel { diff --git a/src/intel/vulkan/anv_util.c b/src/intel/vulkan/anv_util.c index b236db2da5a..bfbc340fc6e 100644 --- a/src/intel/vulkan/anv_util.c +++ b/src/intel/vulkan/anv_util.c @@ -75,19 +75,34 @@ __anv_perf_warn(struct anv_device *device, void anv_cmd_buffer_pending_pipe_debug(struct anv_cmd_buffer *cmd_buffer, + VkPipelineStageFlags2 src_stages, + VkPipelineStageFlags2 dst_stages, enum anv_pipe_bits bits, const char* reason) { - if (bits == 0) + if (bits == 0 && src_stages == 0 && dst_stages == 0) return; struct log_stream *stream = mesa_log_streami(); mesa_log_stream_printf(stream, "acc: "); - mesa_log_stream_printf(stream, "bits: "); + mesa_log_stream_printf(stream, "src: "); + u_foreach_bit64(b, src_stages) { + mesa_log_stream_printf(stream, "%s,", + vk_PipelineStageFlagBits2_to_str(BITFIELD64_BIT(b)) + + strlen("VK_PIPELINE_STAGE_2_")); + } + mesa_log_stream_printf(stream, " dst: "); + u_foreach_bit64(b, dst_stages) { + mesa_log_stream_printf(stream, "%s,", + vk_PipelineStageFlagBits2_to_str(BITFIELD64_BIT(b)) + + strlen("VK_PIPELINE_STAGE_2_")); + } + + mesa_log_stream_printf(stream, " bits: "); anv_dump_pipe_bits(bits, stream); - mesa_log_stream_printf(stream, "reason: %s", reason); + 
mesa_log_stream_printf(stream, " reason: %s", reason); mesa_log_stream_printf(stream, "\n"); diff --git a/src/intel/vulkan/genX_acceleration_structure.c b/src/intel/vulkan/genX_acceleration_structure.c index 145b6cad323..2f1b9cc740e 100644 --- a/src/intel/vulkan/genX_acceleration_structure.c +++ b/src/intel/vulkan/genX_acceleration_structure.c @@ -437,7 +437,10 @@ anv_init_header(VkCommandBuffer commandBuffer, const struct vk_acceleration_stru * dispatch size paramters) is not L3 coherent. */ if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) { - anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_DATA_CACHE_FLUSH_BIT, + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR, + ANV_PIPE_DATA_CACHE_FLUSH_BIT, "copy dispatch size for dispatch"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } @@ -670,7 +673,10 @@ genX(CmdCopyAccelerationStructureKHR)( * dispatch paramters) is not L3 coherent. */ if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) { - anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_DATA_CACHE_FLUSH_BIT, + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + ANV_PIPE_DATA_CACHE_FLUSH_BIT, "bvh size read for dispatch"); } @@ -720,6 +726,8 @@ genX(CmdCopyAccelerationStructureToMemoryKHR)( */ if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_DATA_CACHE_FLUSH_BIT, "bvh size read for dispatch"); } diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 68147936e56..9855f7efb55 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -313,6 +313,8 @@ blorp_exec_on_render(struct blorp_batch *batch, */ if (blorp_uses_bti_rt_writes(batch, params)) { anv_add_pending_pipe_bits(cmd_buffer, + 
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT, "before blorp BTI change"); @@ -380,6 +382,8 @@ blorp_exec_on_render(struct blorp_batch *batch, */ if (blorp_uses_bti_rt_writes(batch, params)) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT, "after blorp BTI change"); diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 0cebb6a4e1e..f262bb91e92 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -587,6 +587,8 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS && final_needs_depth && !initial_depth_valid) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_TILE_CACHE_FLUSH_BIT, "HIZ-CCS flush"); } @@ -658,6 +660,8 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer, */ if (intel_device_info_is_mtl(cmd_buffer->device->info)) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_TILE_CACHE_FLUSH_BIT, "HIZ-CCS flush"); } @@ -936,6 +940,8 @@ genX(cmd_buffer_load_clear_color)(struct anv_cmd_buffer *cmd_buffer, * In testing, SKL doesn't actually seem to need this, but HSW does. 
*/ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, "after load_clear_color surface state update"); #endif @@ -1872,6 +1878,8 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.pending_rhwo_optimization_enabled; if (rhwo_opt_change) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_STALL_AT_SCOREBOARD_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT, "change RHWO optimization"); @@ -1880,8 +1888,12 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; + /* Consume the stages here */ + cmd_buffer->state.pending_src_stages = 0; + cmd_buffer->state.pending_dst_stages = 0; + if (unlikely(cmd_buffer->device->physical->always_flush_cache)) - bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS; + bits |= ANV_PIPE_BARRIER_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS; else if (bits == 0) return; @@ -1924,8 +1936,7 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) genX(emit_apply_pipe_flushes)(&cmd_buffer->batch, cmd_buffer->device, cmd_buffer->state.current_pipeline, - bits, - &emitted_bits); + bits, &emitted_bits); anv_cmd_buffer_update_pending_query_bits(cmd_buffer, emitted_bits); #if INTEL_NEEDS_WA_1508744258 @@ -2892,6 +2903,8 @@ genX(cmd_buffer_begin_companion)(struct anv_cmd_buffer *cmd_buffer, if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->device->info->has_aux_map) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, "new cmd buffer with aux-tt"); } @@ -2935,7 +2948,12 @@ add_pending_pipe_bits_for_color_aux_op(struct anv_cmd_buffer *cmd_buffer, assert(ret < sizeof(flush_reason)); } - anv_add_pending_pipe_bits(cmd_buffer, pipe_bits, 
flush_reason); + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + aux_op_clears(next_aux_op) ? + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT : + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + pipe_bits, flush_reason); } void @@ -3146,6 +3164,8 @@ genX(cmd_buffer_update_color_aux_op)(struct anv_cmd_buffer *cmd_buffer, * cache invalidation with the texture cache invalidation done on gfx12. */ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, "Invalidate for new clear color"); } @@ -3267,6 +3287,8 @@ genX(BeginCommandBuffer)( if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->device->info->has_aux_map) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, "new cmd buffer with aux-tt"); } @@ -3294,6 +3316,8 @@ genX(BeginCommandBuffer)( if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->device->info->has_aux_map) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, "new cmd buffer with aux-tt"); } @@ -3454,6 +3478,8 @@ end_command_buffer(struct anv_cmd_buffer *cmd_buffer) */ if (cmd_buffer->state.queries.clear_bits) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits), "query clear flush prior command buffer end"); } @@ -3563,6 +3589,8 @@ genX(CmdExecuteCommands)( */ if (container->state.queries.clear_bits) { anv_add_pending_pipe_bits(container, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_QUERY_BITS(container->state.queries.clear_bits), "query clear flush prior to secondary buffer"); } @@ -3710,6 
+3738,8 @@ genX(CmdExecuteCommands)( */ if (GFX_VER == 9) { anv_add_pending_pipe_bits(container, + VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT, "Secondary cmd buffer not tracked in VF cache"); } @@ -4707,6 +4737,9 @@ cmd_buffer_accumulate_barrier_bits(struct anv_cmd_buffer *cmd_buffer, * dataport. */ if (flush_query_copies) { + src_stages |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT; + dst_stages |= VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT; bits |= (GFX_VER >= 12 ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT); } @@ -4741,7 +4774,7 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer, cmd_buffer_accumulate_barrier_bits(cmd_buffer, n_dep_infos, dep_infos, &src_stages, &dst_stages, &bits); - anv_add_pending_pipe_bits(cmd_buffer, bits, reason); + anv_add_pending_pipe_bits(cmd_buffer, src_stages, dst_stages, bits, reason); break; } @@ -4872,6 +4905,8 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, if (cmd_buffer->state.current_pipeline == _3D && cmd_buffer->state.queries.clear_bits) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits), "query clear flush prior to GPGPU"); } @@ -4938,7 +4973,10 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, intel_needs_workaround(cmd_buffer->device->info, 16013063087)) bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT; - anv_add_pending_pipe_bits(cmd_buffer, bits, + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + bits, pipeline == _3D ? "flush/invalidate PIPELINE_SELECT 3D" : "flush/invalidate PIPELINE_SELECT GPGPU"); @@ -5048,6 +5086,8 @@ genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer, * settings while we change the registers. 
*/ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_DEPTH_STALL_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT, @@ -5123,6 +5163,8 @@ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer vb_address, vb_size)) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT, "vb > 32b range"); @@ -5232,6 +5274,8 @@ genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, if (cmd_buffer->state.current_hash_scale != scale && (width > min_size[idx][0] || height > min_size[idx][1])) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT, "change pixel hash mode"); @@ -5940,9 +5984,11 @@ void genX(CmdBeginRendering)( * in the case that there are no RTs (depth-only rendering), though. */ anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | - ANV_PIPE_STALL_AT_SCOREBOARD_BIT, - "change RT"); + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "change RT"); } #endif @@ -6031,6 +6077,8 @@ void genX(CmdEndRendering2KHR)( * sampler when we blit to the single-sampled resolve target. */ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, "MSAA resolve"); @@ -6047,9 +6095,11 @@ void genX(CmdEndRendering2KHR)( * sampler when we blit to the single-sampled resolve target. 
*/ anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | - ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, - "MSAA resolve"); + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, + "MSAA resolve"); } #if GFX_VER < 20 @@ -6078,7 +6128,10 @@ void genX(CmdEndRendering2KHR)( * sure unbound regions read 0, as residencyNonResidentStrict * mandates. */ - anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_TILE_CACHE_FLUSH_BIT, + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + ANV_PIPE_TILE_CACHE_FLUSH_BIT, "sparse MSAA resolve"); } #endif @@ -6360,6 +6413,8 @@ VkResult genX(CmdSetPerformanceOverrideINTEL)( if (pOverrideInfo->enable) { /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_BARRIER_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS, "perf counter isolation"); @@ -6599,9 +6654,12 @@ genX(cmd_buffer_begin_companion_rcs_syncpoint)( */ if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) { - anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_BARRIER_FLUSH_BITS | - ANV_PIPE_INVALIDATE_BITS | - ANV_PIPE_STALL_BITS, + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + ANV_PIPE_BARRIER_FLUSH_BITS | + ANV_PIPE_INVALIDATE_BITS | + ANV_PIPE_STALL_BITS, "post main cmd buffer invalidate"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } else if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) { @@ -6671,6 +6729,8 @@ genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer, * - unblock the CCS */ anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_BARRIER_FLUSH_BITS 
| ANV_PIPE_INVALIDATE_BITS | ANV_PIPE_STALL_BITS, @@ -6817,7 +6877,10 @@ genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer, trace_intel_begin_write_buffer_marker(&cmd_buffer->trace); - anv_add_pending_pipe_bits(cmd_buffer, bits, "write buffer marker"); + anv_add_pending_pipe_bits(cmd_buffer, + stage, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + bits, "write buffer marker"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); struct mi_builder b; diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c index 03491fdb322..fb6edea57b1 100644 --- a/src/intel/vulkan/genX_cmd_compute.c +++ b/src/intel/vulkan/genX_cmd_compute.c @@ -136,8 +136,10 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer) * sufficient." */ anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_CS_STALL_BIT, - "flush compute state"); + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + ANV_PIPE_CS_STALL_BIT, + "flush compute state"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); #endif diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c index 6824b00d0c7..28fe455dd2f 100644 --- a/src/intel/vulkan/genX_cmd_draw.c +++ b/src/intel/vulkan/genX_cmd_draw.c @@ -712,6 +712,8 @@ cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer, * in the shader always send the color. 
*/ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT, "change RT due to shader outputs"); @@ -854,6 +856,8 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer) */ if (intel_needs_workaround(device->info, 16011411144)) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT, + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT, ANV_PIPE_CS_STALL_BIT, "before SO_BUFFER change WA"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); @@ -889,12 +893,16 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer) if (intel_needs_workaround(device->info, 16011411144)) { /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT, + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT, ANV_PIPE_CS_STALL_BIT, "after SO_BUFFER change WA"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } else if (GFX_VER >= 10) { /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT, + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT, ANV_PIPE_CS_STALL_BIT, "after 3DSTATE_SO_BUFFER call"); } @@ -2365,6 +2373,8 @@ void genX(CmdBeginTransformFeedbackEXT)( * commands are processed. This will likely require a pipeline flush." */ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_CS_STALL_BIT, "begin transform feedback"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); @@ -2417,6 +2427,8 @@ void genX(CmdEndTransformFeedbackEXT)( * commands are processed. This will likely require a pipeline flush." 
*/ anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_CS_STALL_BIT, "end transform feedback"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); diff --git a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h index de0b75e0cb1..cff60d15f88 100644 --- a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h +++ b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h @@ -548,6 +548,10 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd struct anv_gen_indirect_params *params = params_state.map; anv_add_pending_pipe_bits(cmd_buffer, + gen_kernel->stage == MESA_SHADER_FRAGMENT ? + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT : + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, #if GFX_VER == 9 ANV_PIPE_VF_CACHE_INVALIDATE_BIT | #endif @@ -597,6 +601,10 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd anv_batch_current_address(&cmd_buffer->batch); anv_add_pending_pipe_bits(cmd_buffer, + gen_kernel->stage == MESA_SHADER_FRAGMENT ? 
+ VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT : + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_STALL_AT_SCOREBOARD_BIT | ANV_PIPE_CS_STALL_BIT, "after generated draws batch"); @@ -623,6 +631,8 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd mi_ensure_write_fence(&b); anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT, "after generated draws batch increment"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); @@ -645,6 +655,8 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd mi_ensure_write_fence(&b); anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT, "after generated draws end"); diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 067bd887836..0ffe547878b 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -917,7 +917,10 @@ void genX(CmdResetQueryPool)( * completed. Otherwise some timestamps written later with MI_STORE_* * commands might race with the PIPE_CONTROL in the loop above. 
*/ - anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + ANV_PIPE_CS_STALL_BIT, "vkCmdResetQueryPool of timestamps"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); break; @@ -1091,6 +1094,9 @@ append_query_clear_flush(struct anv_cmd_buffer *cmd_buffer, return false; anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_QUERY_BITS( cmd_buffer->state.queries.clear_bits), reason); @@ -1735,6 +1741,9 @@ copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer, if (needed_flushes) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, needed_flushes, "CopyQueryPoolResults"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); @@ -1847,6 +1856,7 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer, uint32_t query_count, VkQueryResultFlags flags) { + VkPipelineStageFlags2 wait_stages = 0; enum anv_pipe_bits needed_flushes = 0; trace_intel_begin_query_copy_shader(&cmd_buffer->trace); @@ -1867,11 +1877,14 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer, } if ((cmd_buffer->state.queries.buffer_write_bits | - cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_RT_FLUSH) + cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_RT_FLUSH) { + wait_stages |= VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT; needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + } if ((cmd_buffer->state.queries.buffer_write_bits | cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_DATA_FLUSH) { + wait_stages |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; needed_flushes |= (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT); } @@ -1901,6 +1914,8 @@ 
copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer, if (needed_flushes) { anv_add_pending_pipe_bits(cmd_buffer, + wait_stages, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, needed_flushes | ANV_PIPE_END_OF_PIPE_SYNC_BIT, "CopyQueryPoolResults"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); @@ -2071,6 +2086,8 @@ genX(CmdWriteAccelerationStructuresPropertiesKHR)( */ if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) { anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ANV_PIPE_END_OF_PIPE_SYNC_BIT | ANV_PIPE_DATA_CACHE_FLUSH_BIT, "read BVH data using CS");