From 6d48542bbc534df43be401909713b0057a03f7e4 Mon Sep 17 00:00:00 2001 From: Rohan Garg Date: Tue, 6 Jun 2023 23:11:14 +0200 Subject: [PATCH] anv: split ANV_PIPE_RENDER_TARGET_BUFFER_WRITES for finer grained flushing split ANV_PIPE_RENDER_TARGET_BUFFER_WRITES into separate CS_STALL, RT_FLUSH & TILE_FLUSH flags in order to have finer control over cache coherency. Tigerlake CS has it's own cache fetching directly from the memory controller, so we need to do a tile flush to ensure the query data is visible. This fixes test_resolve_non_issued_query_data in vkd3d on TGL. Signed-off-by: Rohan Garg Fixes: 3c4c18341aaf ("anv: narrow flushing of the render target to buffer writes") Reviewed-by: Lionel Landwerlin Part-of: (cherry picked from commit d0e0ba897fd5a15bc5283b150a122dee078006c2) --- .pick_status.json | 2 +- src/intel/vulkan/anv_blorp.c | 27 ++++--- src/intel/vulkan/anv_genX.h | 3 +- src/intel/vulkan/anv_private.h | 74 +++++++++++-------- src/intel/vulkan/genX_cmd_buffer.c | 45 +++++------ .../vulkan/genX_cmd_draw_generated_indirect.h | 3 +- src/intel/vulkan/genX_gpu_memcpy.c | 6 +- src/intel/vulkan/genX_query.c | 74 +++++++++---------- 8 files changed, 129 insertions(+), 105 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index 58b2cc7d031..40933245592 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -2272,7 +2272,7 @@ "description": "anv: split ANV_PIPE_RENDER_TARGET_BUFFER_WRITES for finer grained flushing", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": "3c4c18341aafbdd0c24665a56d0af32b1e4dc981" }, diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c index 341a36e1e80..a8a90d5e105 100644 --- a/src/intel/vulkan/anv_blorp.c +++ b/src/intel/vulkan/anv_blorp.c @@ -534,6 +534,16 @@ void anv_CmdCopyBufferToImage2( anv_blorp_batch_finish(&batch); } +static void +anv_add_buffer_write_pending_bits(struct anv_cmd_buffer *cmd_buffer, + const char *reason) +{ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + + cmd_buffer->state.pending_query_bits |= + ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(devinfo); +} + void anv_CmdCopyImageToBuffer2( VkCommandBuffer commandBuffer, const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo) @@ -553,9 +563,7 @@ void anv_CmdCopyImageToBuffer2( anv_blorp_batch_finish(&batch); - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_RENDER_TARGET_BUFFER_WRITES, - "after copy image to buffer"); + anv_add_buffer_write_pending_bits(cmd_buffer, "after copy image to buffer"); } static bool @@ -780,9 +788,7 @@ void anv_CmdCopyBuffer2( anv_blorp_batch_finish(&batch); - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_RENDER_TARGET_BUFFER_WRITES, - "after copy buffer"); + anv_add_buffer_write_pending_bits(cmd_buffer, "after copy buffer"); } @@ -844,9 +850,7 @@ void anv_CmdUpdateBuffer( anv_blorp_batch_finish(&batch); - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_RENDER_TARGET_BUFFER_WRITES, - "update buffer"); + anv_add_buffer_write_pending_bits(cmd_buffer, "update buffer"); } void @@ -954,9 +958,8 @@ void anv_CmdFillBuffer( anv_address_add(dst_buffer->address, dstOffset), fillSize, data); - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_RENDER_TARGET_BUFFER_WRITES, - "after fill buffer"); + anv_add_buffer_write_pending_bits(cmd_buffer, "after fill buffer"); + } void anv_CmdClearColorImage( diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index e0b0369a842..6c1ceef0f23 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -92,7 +92,8 @@ enum anv_pipe_bits genX(emit_apply_pipe_flushes)(struct anv_batch *batch, struct anv_device *device, uint32_t current_pipeline, - enum anv_pipe_bits bits); + enum anv_pipe_bits bits, + enum anv_query_bits *query_bits); void genX(emit_so_memcpy_init)(struct anv_memcpy_state *state, struct anv_device *device, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 632655aac22..a9650bf4129 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2080,32 +2080,54 @@ enum anv_pipe_bits { */ ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT = (1 << 22), - /* This bit does not exist directly in PIPE_CONTROL. It means that render - * target operations related to transfer commands with VkBuffer as - * destination are ongoing. Some operations like copies on the command - * streamer might need to be aware of this to trigger the appropriate stall - * before they can proceed with the copy. - */ - ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 23), - /* This bit does not exist directly in PIPE_CONTROL. It means that Gfx12 * AUX-TT data has changed and we need to invalidate AUX-TT data. This is * done by writing the AUX-TT register. */ - ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 24), + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 23), /* This bit does not exist directly in PIPE_CONTROL. It means that a * PIPE_CONTROL with a post-sync operation will follow. This is used to * implement a workaround for Gfx9. */ - ANV_PIPE_POST_SYNC_BIT = (1 << 25), - - /* This bit does not exist directly in PIPE_CONTROL. It means that render - * target operations related to clearing of queries are ongoing. - */ - ANV_PIPE_QUERY_CLEARS_BIT = (1 << 26), + ANV_PIPE_POST_SYNC_BIT = (1 << 24), }; +/* These bits track the state of buffer writes for queries. They get cleared + * based on PIPE_CONTROL emissions. + */ +enum anv_query_bits { + ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH = (1 << 0), + + ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH = (1 << 1), + + ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL = (1 << 2), +}; + +/* Things we need to flush before accessing query data using the command + * streamer. + * + * Prior to DG2 experiments show that the command streamer is not coherent + * with the tile cache so we need to flush it to make any data visible to CS. + * + * Otherwise we want to flush the RT cache which is where blorp writes, either + * for clearing the query buffer or for clearing the destination buffer in + * vkCopyQueryPoolResults(). + */ +#define ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(devinfo) \ + (((devinfo->verx10 >= 120 && \ + devinfo->verx10 < 125) ? ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH : 0) | \ + ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH | \ + ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL) + +#define ANV_PIPE_QUERY_BITS(pending_query_bits) ( \ + ((pending_query_bits & ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH) ? \ + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0) | \ + ((pending_query_bits & ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH) ? \ + ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0) | \ + ((pending_query_bits & ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL) ? \ + ANV_PIPE_CS_STALL_BIT : 0)) + #define ANV_PIPE_FLUSH_BITS ( \ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \ ANV_PIPE_DATA_CACHE_FLUSH_BIT | \ @@ -2146,20 +2168,6 @@ enum anv_pipe_bits { #define ANV_PIPE_GPGPU_BITS ( \ (GFX_VERx10 >= 125 ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0)) -/* Things we need to flush before accessing query data using the command - * streamer. - * - * Prior to DG2 experiments show that the command streamer is not coherent - * with the tile cache so we need to flush it to make any data visible to CS. - * - * Otherwise we want to flush the RT cache which is where blorp writes, either - * for clearing the query buffer or for clearing the destination buffer in - * vkCopyQueryPoolResults(). - */ -#define ANV_PIPE_QUERY_FLUSH_BITS ( \ - (GFX_VERx10 < 125 ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0) | \ - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) - enum intel_ds_stall_flag anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits); @@ -2636,6 +2644,14 @@ struct anv_cmd_state { struct anv_cmd_ray_tracing_state rt; enum anv_pipe_bits pending_pipe_bits; + + /** + * Tracks operations susceptible to interfere with queries, either blorp + * clears of the query buffer or the destination buffer of + * vkCmdCopyQueryResults, we need those operations to have completed before + * we do the work of vkCmdCopyQueryResults. + */ + enum anv_query_bits pending_query_bits; VkShaderStageFlags descriptors_dirty; VkShaderStageFlags push_descriptors_dirty; VkShaderStageFlags push_constants_dirty; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index a6b1392109f..10b38e10796 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1507,7 +1507,8 @@ ALWAYS_INLINE enum anv_pipe_bits genX(emit_apply_pipe_flushes)(struct anv_batch *batch, struct anv_device *device, uint32_t current_pipeline, - enum anv_pipe_bits bits) + enum anv_pipe_bits bits, + enum anv_query_bits *query_bits) { #if GFX_VER >= 12 /* From the TGL PRM, Volume 2a, "PIPE_CONTROL": @@ -1748,19 +1749,21 @@ genX(emit_apply_pipe_flushes)(struct anv_batch *batch, anv_debug_dump_pc(pipe); } - /* If a render target flush was emitted, then we can toggle off the bit - * saying that render target writes are ongoing. + /* Based on emitted flushes, clear the associated buffer write tracking + * bits of buffer writes. */ - if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) - bits &= ~ANV_PIPE_RENDER_TARGET_BUFFER_WRITES; + if (query_bits != NULL) { + if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) + *query_bits &= ~ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH; - /* If the conditions for flushing the query clears are met, we can - * toggle the bit off. - */ - if ((bits & ANV_PIPE_QUERY_FLUSH_BITS) == ANV_PIPE_QUERY_FLUSH_BITS && - (bits & (ANV_PIPE_END_OF_PIPE_SYNC_BIT | - ANV_PIPE_CS_STALL_BIT))) { - bits &= ~ANV_PIPE_QUERY_CLEARS_BIT; + if (bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT) + *query_bits &= ~ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH; + + /* Once RT/TILE have been flushed, we can consider the CS_STALL flush */ + if ((*query_bits & (ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH | + ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH)) == 0 && + (bits & (ANV_PIPE_END_OF_PIPE_SYNC_BIT | ANV_PIPE_CS_STALL_BIT))) + *query_bits &= ~ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL; } bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | @@ -1907,7 +1910,8 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) genX(emit_apply_pipe_flushes)(&cmd_buffer->batch, cmd_buffer->device, cmd_buffer->state.current_pipeline, - bits); + bits, + &cmd_buffer->state.pending_query_bits); #if GFX_VERx10 == 120 /* Wa_1508744258 handling */ @@ -3851,10 +3855,9 @@ genX(EndCommandBuffer)( /* Flush query clears using blorp so that secondary query writes do not * race with the clear. */ - if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) { + if (cmd_buffer->state.pending_query_bits) { anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_QUERY_FLUSH_BITS | - ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT, + ANV_PIPE_QUERY_BITS(cmd_buffer->state.pending_query_bits), "query clear flush prior command buffer end"); } @@ -3936,10 +3939,9 @@ genX(CmdExecuteCommands)( /* Flush query clears using blorp so that secondary query writes do not * race with the clear. */ - if (primary->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) { + if (primary->state.pending_query_bits) { anv_add_pending_pipe_bits(primary, - ANV_PIPE_QUERY_FLUSH_BITS | - ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT, + ANV_PIPE_QUERY_BITS(primary->state.pending_query_bits), "query clear flush prior to secondary buffer"); } @@ -6626,10 +6628,9 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, * copy/write. So we need to flush it before going to GPGPU mode. */ if (cmd_buffer->state.current_pipeline == _3D && - (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT)) { + cmd_buffer->state.pending_query_bits) { anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_QUERY_FLUSH_BITS | - ANV_PIPE_END_OF_PIPE_SYNC_BIT, + ANV_PIPE_QUERY_BITS(cmd_buffer->state.pending_query_bits), "query clear flush prior to GPGPU"); } #endif diff --git a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h index b4f9c5e7ee6..fa60ab49ef1 100644 --- a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h +++ b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h @@ -716,7 +716,8 @@ genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer) ANV_PIPE_VF_CACHE_INVALIDATE_BIT | #endif ANV_PIPE_DATA_CACHE_FLUSH_BIT | - ANV_PIPE_CS_STALL_BIT); + ANV_PIPE_CS_STALL_BIT, + NULL /* query_bits */); #if GFX_VER >= 12 anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) { diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c index b903ac92c17..1064f7d1a89 100644 --- a/src/intel/vulkan/genX_gpu_memcpy.c +++ b/src/intel/vulkan/genX_gpu_memcpy.c @@ -269,7 +269,8 @@ void genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state) { genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D, - ANV_PIPE_END_OF_PIPE_SYNC_BIT); + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + NULL); } void @@ -295,7 +296,8 @@ genX(emit_so_memcpy)(struct anv_memcpy_state *state, src, size)) { genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D, ANV_PIPE_CS_STALL_BIT | - ANV_PIPE_VF_CACHE_INVALIDATE_BIT); + ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + NULL); memset(&state->vb_dirty, 0, sizeof(state->vb_dirty)); } diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 1429536c856..8c3f94ecbf0 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -794,9 +794,9 @@ void genX(CmdResetQueryPool)( queryCount * pool->stride, 0); - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_QUERY_CLEARS_BIT, - "vkCmdResetQueryPool of timestamps"); + cmd_buffer->state.pending_query_bits = + ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(cmd_buffer->device->info); + return; } @@ -984,12 +984,12 @@ emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer, struct anv_query_pool *pool, const char *reason) { - if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) == 0) + if (cmd_buffer->state.pending_query_bits == 0) return; anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_QUERY_FLUSH_BITS | - ANV_PIPE_END_OF_PIPE_SYNC_BIT, + ANV_PIPE_QUERY_BITS( + cmd_buffer->state.pending_query_bits), reason); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } @@ -1507,45 +1507,45 @@ void genX(CmdCopyQueryPoolResults)( mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); struct mi_value result; + enum anv_pipe_bits needed_flushes = 0; + /* If render target writes are ongoing, request a render target cache flush * to ensure proper ordering of the commands from the 3d pipe and the * command streamer. */ - const bool need_flushes = - (cmd_buffer->state.pending_pipe_bits & - (ANV_PIPE_RENDER_TARGET_BUFFER_WRITES | - ANV_PIPE_QUERY_CLEARS_BIT)); + if (cmd_buffer->state.pending_query_bits & + ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH) + needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; - if (need_flushes) { + if (cmd_buffer->state.pending_query_bits & + ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH) + needed_flushes |= ANV_PIPE_TILE_CACHE_FLUSH_BIT; + + if (cmd_buffer->state.pending_query_bits & + ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL) + needed_flushes |= ANV_PIPE_CS_STALL_BIT; + + /* Occlusion & timestamp queries are written using a PIPE_CONTROL and + * because we're about to copy values from MI commands, we need to stall + * the command streamer to make sure the PIPE_CONTROL values have + * landed, otherwise we could see inconsistent values & availability. + * + * From the vulkan spec: + * + * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of + * previous uses of vkCmdResetQueryPool in the same queue, without any + * additional synchronization." + */ + if (pool->type == VK_QUERY_TYPE_OCCLUSION || + pool->type == VK_QUERY_TYPE_TIMESTAMP) + needed_flushes |= ANV_PIPE_CS_STALL_BIT; + + if (needed_flushes) { anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_QUERY_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT, + needed_flushes, "CopyQueryPoolResults"); - } - - bool need_cs_stall = - (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) || - /* Occlusion & timestamp queries are written using a PIPE_CONTROL and - * because we're about to copy values from MI commands, we need to stall - * the command streamer to make sure the PIPE_CONTROL values have - * landed, otherwise we could see inconsistent values & availability. - * - * From the vulkan spec: - * - * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of - * previous uses of vkCmdResetQueryPool in the same queue, without - * any additional synchronization." - */ - pool->type == VK_QUERY_TYPE_OCCLUSION || - pool->type == VK_QUERY_TYPE_TIMESTAMP; - - if (need_cs_stall) { - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_CS_STALL_BIT, - "CopyQueryPoolResults stall"); - } - - if (need_cs_stall || need_flushes) genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } struct anv_address dest_addr = anv_address_add(buffer->address, destOffset); for (uint32_t i = 0; i < queryCount; i++) {