anv: split ANV_PIPE_RENDER_TARGET_BUFFER_WRITES for finer grained flushing

split ANV_PIPE_RENDER_TARGET_BUFFER_WRITES into separate CS_STALL,
RT_FLUSH & TILE_FLUSH flags in order to have finer control over cache
coherency.

Tigerlake CS has its own cache fetching directly from the memory controller,
so we need to do a tile flush to ensure the query data is visible.

This fixes test_resolve_non_issued_query_data in vkd3d on TGL.

Signed-off-by: Rohan Garg <rohan.garg@intel.com>
Fixes: 3c4c18341a ("anv: narrow flushing of the render target to buffer writes")
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23500>
(cherry picked from commit d0e0ba897f)
This commit is contained in:
Rohan Garg 2023-06-06 23:11:14 +02:00 committed by Eric Engestrom
parent 26a920b181
commit 6d48542bbc
8 changed files with 129 additions and 105 deletions

View file

@ -2272,7 +2272,7 @@
"description": "anv: split ANV_PIPE_RENDER_TARGET_BUFFER_WRITES for finer grained flushing",
"nominated": true,
"nomination_type": 1,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": "3c4c18341aafbdd0c24665a56d0af32b1e4dc981"
},

View file

@ -534,6 +534,16 @@ void anv_CmdCopyBufferToImage2(
anv_blorp_batch_finish(&batch);
}
/* Record that a transfer-style buffer write (a blorp write that goes through
 * the render target cache) is pending on this command buffer, by setting the
 * per-generation query tracking bits. Later query operations use these bits
 * to decide which cache flushes / stalls are required before the command
 * streamer can safely read the written data.
 *
 * NOTE(review): `reason` is not used in this body — presumably intended for
 * debug annotation, mirroring anv_add_pending_pipe_bits(); confirm against
 * the full file.
 */
static void
anv_add_buffer_write_pending_bits(struct anv_cmd_buffer *cmd_buffer,
const char *reason)
{
const struct intel_device_info *devinfo = cmd_buffer->device->info;
cmd_buffer->state.pending_query_bits |=
ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(devinfo);
}
void anv_CmdCopyImageToBuffer2(
VkCommandBuffer commandBuffer,
const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)
@ -553,9 +563,7 @@ void anv_CmdCopyImageToBuffer2(
anv_blorp_batch_finish(&batch);
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_BUFFER_WRITES,
"after copy image to buffer");
anv_add_buffer_write_pending_bits(cmd_buffer, "after copy image to buffer");
}
static bool
@ -780,9 +788,7 @@ void anv_CmdCopyBuffer2(
anv_blorp_batch_finish(&batch);
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_BUFFER_WRITES,
"after copy buffer");
anv_add_buffer_write_pending_bits(cmd_buffer, "after copy buffer");
}
@ -844,9 +850,7 @@ void anv_CmdUpdateBuffer(
anv_blorp_batch_finish(&batch);
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_BUFFER_WRITES,
"update buffer");
anv_add_buffer_write_pending_bits(cmd_buffer, "update buffer");
}
void
@ -954,9 +958,8 @@ void anv_CmdFillBuffer(
anv_address_add(dst_buffer->address, dstOffset),
fillSize, data);
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_BUFFER_WRITES,
"after fill buffer");
anv_add_buffer_write_pending_bits(cmd_buffer, "after fill buffer");
}
void anv_CmdClearColorImage(

View file

@ -92,7 +92,8 @@ enum anv_pipe_bits
genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
struct anv_device *device,
uint32_t current_pipeline,
enum anv_pipe_bits bits);
enum anv_pipe_bits bits,
enum anv_query_bits *query_bits);
void genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
struct anv_device *device,

View file

@ -2080,32 +2080,54 @@ enum anv_pipe_bits {
*/
ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT = (1 << 22),
/* This bit does not exist directly in PIPE_CONTROL. It means that render
* target operations related to transfer commands with VkBuffer as
* destination are ongoing. Some operations like copies on the command
* streamer might need to be aware of this to trigger the appropriate stall
* before they can proceed with the copy.
*/
ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 23),
/* This bit does not exist directly in PIPE_CONTROL. It means that Gfx12
* AUX-TT data has changed and we need to invalidate AUX-TT data. This is
* done by writing the AUX-TT register.
*/
ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 24),
ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 23),
/* This bit does not exist directly in PIPE_CONTROL. It means that a
* PIPE_CONTROL with a post-sync operation will follow. This is used to
* implement a workaround for Gfx9.
*/
ANV_PIPE_POST_SYNC_BIT = (1 << 25),
/* This bit does not exist directly in PIPE_CONTROL. It means that render
* target operations related to clearing of queries are ongoing.
*/
ANV_PIPE_QUERY_CLEARS_BIT = (1 << 26),
ANV_PIPE_POST_SYNC_BIT = (1 << 24),
};
/* These bits track the state of buffer writes for queries. They get cleared
 * based on PIPE_CONTROL emissions.
 */
enum anv_query_bits {
ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH = (1 << 0), /* cleared by a render target cache flush */
ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH = (1 << 1), /* cleared by a tile cache flush */
ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL = (1 << 2), /* cleared by a CS stall / end-of-pipe sync once the flushes above are done */
};
/* Things we need to flush before accessing query data using the command
 * streamer.
 *
 * Prior to DG2 experiments show that the command streamer is not coherent
 * with the tile cache so we need to flush it to make any data visible to CS.
 *
 * Otherwise we want to flush the RT cache which is where blorp writes, either
 * for clearing the query buffer or for clearing the destination buffer in
 * vkCopyQueryPoolResults().
 */
/* The verx10 range [120, 125) selects Gfx12.0 parts (TGL and friends, i.e.
 * pre-DG2) — only those additionally need the tile cache flush; the RT flush
 * and CS stall are required on every generation.
 */
#define ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(devinfo) \
(((devinfo->verx10 >= 120 && \
devinfo->verx10 < 125) ? ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH : 0) | \
ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH | \
ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL)
/* Translate pending anv_query_bits into the PIPE_CONTROL bits
 * (anv_pipe_bits) that must be emitted to satisfy them: RT flush, tile
 * flush and CS stall respectively.
 */
#define ANV_PIPE_QUERY_BITS(pending_query_bits) ( \
((pending_query_bits & ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH) ? \
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0) | \
((pending_query_bits & ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH) ? \
ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0) | \
((pending_query_bits & ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL) ? \
ANV_PIPE_CS_STALL_BIT : 0))
#define ANV_PIPE_FLUSH_BITS ( \
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \
ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
@ -2146,20 +2168,6 @@ enum anv_pipe_bits {
#define ANV_PIPE_GPGPU_BITS ( \
(GFX_VERx10 >= 125 ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0))
/* Things we need to flush before accessing query data using the command
* streamer.
*
* Prior to DG2 experiments show that the command streamer is not coherent
* with the tile cache so we need to flush it to make any data visible to CS.
*
* Otherwise we want to flush the RT cache which is where blorp writes, either
* for clearing the query buffer or for clearing the destination buffer in
* vkCopyQueryPoolResults().
*/
#define ANV_PIPE_QUERY_FLUSH_BITS ( \
(GFX_VERx10 < 125 ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0) | \
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
enum intel_ds_stall_flag
anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits);
@ -2636,6 +2644,14 @@ struct anv_cmd_state {
struct anv_cmd_ray_tracing_state rt;
enum anv_pipe_bits pending_pipe_bits;
/**
* Tracks operations susceptible to interfere with queries, either blorp
* clears of the query buffer or the destination buffer of
* vkCmdCopyQueryPoolResults; we need those operations to have completed
* before we do the work of vkCmdCopyQueryPoolResults.
*/
enum anv_query_bits pending_query_bits;
VkShaderStageFlags descriptors_dirty;
VkShaderStageFlags push_descriptors_dirty;
VkShaderStageFlags push_constants_dirty;

View file

@ -1507,7 +1507,8 @@ ALWAYS_INLINE enum anv_pipe_bits
genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
struct anv_device *device,
uint32_t current_pipeline,
enum anv_pipe_bits bits)
enum anv_pipe_bits bits,
enum anv_query_bits *query_bits)
{
#if GFX_VER >= 12
/* From the TGL PRM, Volume 2a, "PIPE_CONTROL":
@ -1748,19 +1749,21 @@ genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
anv_debug_dump_pc(pipe);
}
/* If a render target flush was emitted, then we can toggle off the bit
* saying that render target writes are ongoing.
/* Based on emitted flushes, clear the associated buffer write tracking
* bits of buffer writes.
*/
if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
bits &= ~ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
if (query_bits != NULL) {
if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
*query_bits &= ~ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH;
/* If the conditions for flushing the query clears are met, we can
* toggle the bit off.
*/
if ((bits & ANV_PIPE_QUERY_FLUSH_BITS) == ANV_PIPE_QUERY_FLUSH_BITS &&
(bits & (ANV_PIPE_END_OF_PIPE_SYNC_BIT |
ANV_PIPE_CS_STALL_BIT))) {
bits &= ~ANV_PIPE_QUERY_CLEARS_BIT;
if (bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT)
*query_bits &= ~ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH;
/* Once RT/TILE have been flushed, we can consider the CS_STALL flush */
if ((*query_bits & (ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH |
ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH)) == 0 &&
(bits & (ANV_PIPE_END_OF_PIPE_SYNC_BIT | ANV_PIPE_CS_STALL_BIT)))
*query_bits &= ~ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL;
}
bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
@ -1907,7 +1910,8 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
cmd_buffer->device,
cmd_buffer->state.current_pipeline,
bits);
bits,
&cmd_buffer->state.pending_query_bits);
#if GFX_VERx10 == 120
/* Wa_1508744258 handling */
@ -3851,10 +3855,9 @@ genX(EndCommandBuffer)(
/* Flush query clears using blorp so that secondary query writes do not
* race with the clear.
*/
if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) {
if (cmd_buffer->state.pending_query_bits) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_QUERY_FLUSH_BITS |
ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT,
ANV_PIPE_QUERY_BITS(cmd_buffer->state.pending_query_bits),
"query clear flush prior command buffer end");
}
@ -3936,10 +3939,9 @@ genX(CmdExecuteCommands)(
/* Flush query clears using blorp so that secondary query writes do not
* race with the clear.
*/
if (primary->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) {
if (primary->state.pending_query_bits) {
anv_add_pending_pipe_bits(primary,
ANV_PIPE_QUERY_FLUSH_BITS |
ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT,
ANV_PIPE_QUERY_BITS(primary->state.pending_query_bits),
"query clear flush prior to secondary buffer");
}
@ -6626,10 +6628,9 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
* copy/write. So we need to flush it before going to GPGPU mode.
*/
if (cmd_buffer->state.current_pipeline == _3D &&
(cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT)) {
cmd_buffer->state.pending_query_bits) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_QUERY_FLUSH_BITS |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
ANV_PIPE_QUERY_BITS(cmd_buffer->state.pending_query_bits),
"query clear flush prior to GPGPU");
}
#endif

View file

@ -716,7 +716,8 @@ genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer)
ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
#endif
ANV_PIPE_DATA_CACHE_FLUSH_BIT |
ANV_PIPE_CS_STALL_BIT);
ANV_PIPE_CS_STALL_BIT,
NULL /* query_bits */);
#if GFX_VER >= 12
anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {

View file

@ -269,7 +269,8 @@ void
genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state)
{
genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
ANV_PIPE_END_OF_PIPE_SYNC_BIT);
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
NULL);
}
void
@ -295,7 +296,8 @@ genX(emit_so_memcpy)(struct anv_memcpy_state *state,
src, size)) {
genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_VF_CACHE_INVALIDATE_BIT);
ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
NULL);
memset(&state->vb_dirty, 0, sizeof(state->vb_dirty));
}

View file

@ -794,9 +794,9 @@ void genX(CmdResetQueryPool)(
queryCount * pool->stride,
0);
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_QUERY_CLEARS_BIT,
"vkCmdResetQueryPool of timestamps");
cmd_buffer->state.pending_query_bits =
ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(cmd_buffer->device->info);
return;
}
@ -984,12 +984,12 @@ emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer,
struct anv_query_pool *pool,
const char *reason)
{
if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) == 0)
if (cmd_buffer->state.pending_query_bits == 0)
return;
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_QUERY_FLUSH_BITS |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
ANV_PIPE_QUERY_BITS(
cmd_buffer->state.pending_query_bits),
reason);
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
@ -1507,45 +1507,45 @@ void genX(CmdCopyQueryPoolResults)(
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
struct mi_value result;
enum anv_pipe_bits needed_flushes = 0;
/* If render target writes are ongoing, request a render target cache flush
* to ensure proper ordering of the commands from the 3d pipe and the
* command streamer.
*/
const bool need_flushes =
(cmd_buffer->state.pending_pipe_bits &
(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES |
ANV_PIPE_QUERY_CLEARS_BIT));
if (cmd_buffer->state.pending_query_bits &
ANV_QUERY_RENDER_TARGET_WRITES_RT_FLUSH)
needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
if (need_flushes) {
if (cmd_buffer->state.pending_query_bits &
ANV_QUERY_RENDER_TARGET_WRITES_TILE_FLUSH)
needed_flushes |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
if (cmd_buffer->state.pending_query_bits &
ANV_QUERY_RENDER_TARGET_WRITES_CS_STALL)
needed_flushes |= ANV_PIPE_CS_STALL_BIT;
/* Occlusion & timestamp queries are written using a PIPE_CONTROL and
* because we're about to copy values from MI commands, we need to stall
* the command streamer to make sure the PIPE_CONTROL values have
* landed, otherwise we could see inconsistent values & availability.
*
* From the vulkan spec:
*
* "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
* previous uses of vkCmdResetQueryPool in the same queue, without any
* additional synchronization."
*/
if (pool->type == VK_QUERY_TYPE_OCCLUSION ||
pool->type == VK_QUERY_TYPE_TIMESTAMP)
needed_flushes |= ANV_PIPE_CS_STALL_BIT;
if (needed_flushes) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_QUERY_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT,
needed_flushes,
"CopyQueryPoolResults");
}
bool need_cs_stall =
(cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
/* Occlusion & timestamp queries are written using a PIPE_CONTROL and
* because we're about to copy values from MI commands, we need to stall
* the command streamer to make sure the PIPE_CONTROL values have
* landed, otherwise we could see inconsistent values & availability.
*
* From the vulkan spec:
*
* "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
* previous uses of vkCmdResetQueryPool in the same queue, without
* any additional synchronization."
*/
pool->type == VK_QUERY_TYPE_OCCLUSION ||
pool->type == VK_QUERY_TYPE_TIMESTAMP;
if (need_cs_stall) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_CS_STALL_BIT,
"CopyQueryPoolResults stall");
}
if (need_cs_stall || need_flushes)
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
for (uint32_t i = 0; i < queryCount; i++) {