tu: Implement CCHE invalidation

We need to invalidate CCHE when we optimize out an invalidation of UCHE,
for example a storage image write followed by a texture read. We missed this
earlier because of the blob's tendency to always over-flush, but the
blob does use this when building acceleration structures.

Fixes: 95104707f1 ("tu: Basic a7xx support")
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28445>
(cherry picked from commit fb1c3f7f5d)
This commit is contained in:
Connor Abbott 2024-03-25 13:00:34 -04:00 committed by Eric Engestrom
parent 95e350d2bf
commit 24fd8685b7
3 changed files with 30 additions and 8 deletions

View file

@ -3704,7 +3704,7 @@
"description": "tu: Implement CCHE invalidation",
"nominated": true,
"nomination_type": 1,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": "95104707f189b2e1b06c855b563c1203b33da354",
"notes": null

View file

@ -187,6 +187,10 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
.gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,
));
}
if (CHIP >= A7XX && (flushes & TU_CMD_FLAG_CCHE_INVALIDATE) &&
/* Invalidating UCHE seems to also invalidate CCHE */
!(flushes & TU_CMD_FLAG_CACHE_INVALIDATE))
tu_cs_emit_pkt7(cs, CP_CCHE_INVALIDATE, 0);
if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE)
@ -3246,6 +3250,13 @@ tu_flush_for_access(struct tu_cache_state *cache,
flush_bits |= TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE;
}
/* There are multiple incoherent copies of CCHE, so any read through it may
* require invalidating it and we cannot optimize away invalidates.
*/
if (dst_mask & TU_ACCESS_CCHE_READ) {
flush_bits |= TU_CMD_FLAG_CCHE_INVALIDATE;
}
#undef DST_INCOHERENT_FLUSH
cache->flush_bits |= flush_bits;
@ -3347,12 +3358,13 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
SHADER_STAGES))
mask |= TU_ACCESS_UCHE_READ;
mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ;
if (gfx_read_access(flags, stages,
VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT,
SHADER_STAGES)) {
mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_BINDLESS_DESCRIPTOR_READ;
mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
TU_ACCESS_CCHE_READ;
}
if (gfx_write_access(flags, stages,

View file

@ -132,6 +132,13 @@ enum tu_cmd_access_mask {
*/
TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,
/* The CCHE is a write-through cache which sits behind UCHE, with multiple
* incoherent copies. Because it's write-through we only have to worry
* about invalidating it for reads. It's invalidated by "ccinv" in the
* shader and CP_CCHE_INVALIDATE in the command stream.
*/
TU_ACCESS_CCHE_READ = 1 << 16,
TU_ACCESS_READ =
TU_ACCESS_UCHE_READ |
TU_ACCESS_CCU_COLOR_READ |
@ -139,7 +146,8 @@ enum tu_cmd_access_mask {
TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
TU_ACCESS_SYSMEM_READ |
TU_ACCESS_BINDLESS_DESCRIPTOR_READ,
TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
TU_ACCESS_CCHE_READ,
TU_ACCESS_WRITE =
TU_ACCESS_UCHE_WRITE |
@ -186,10 +194,11 @@ enum tu_cmd_flush_bits {
TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 9,
TU_CMD_FLAG_CCHE_INVALIDATE = 1 << 6,
TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 7,
TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 8,
TU_CMD_FLAG_WAIT_FOR_ME = 1 << 9,
TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 10,
TU_CMD_FLAG_ALL_FLUSH =
TU_CMD_FLAG_CCU_FLUSH_DEPTH |
@ -205,6 +214,7 @@ enum tu_cmd_flush_bits {
TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
TU_CMD_FLAG_CACHE_INVALIDATE |
TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
TU_CMD_FLAG_CCHE_INVALIDATE |
/* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
* a command that needs CP_WAIT_FOR_ME is executed. This means we may
* insert an extra WAIT_FOR_ME before an indirect command requiring it