From 9666756f603f0285d8a93ef93db1c7ec702b671f Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Thu, 28 Mar 2024 17:50:35 +0100 Subject: [PATCH] tu: Handle non-overlapping WaW hazard with buffer copy/fill/update Copies/fills/updates for buffers are happening through CCU but need additional synchronization when write range is not aligned to 64 bytes. Because dst buffer access uses either R8_UNORM or R32_UINT and they are not coherent between each other in CCU since format seem to be a part of a cache key. See: https://gitlab.khronos.org/vulkan/vulkan/-/issues/3306 The synchronization with writes from UCHE (e.g. with SSBO stores) are solved by the fact that UCHE has byte level dirtiness tracking and that CCU flush would happen always before UCHE flush for such case (e.g. both renderpass and dispatch would flush pending CCU write). Additionally see: https://gitlab.khronos.org/vulkan/vulkan/-/issues/3398#note_400111 Fixes geometry corruption and potentially hangs in Resident Evil 3. Signed-off-by: Danylo Piliaiev Part-of: --- src/freedreno/vulkan/tu_clear_blit.cc | 81 +++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 5 deletions(-) diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc index b4422bb8644..229a1dc98fa 100644 --- a/src/freedreno/vulkan/tu_clear_blit.cc +++ b/src/freedreno/vulkan/tu_clear_blit.cc @@ -1735,6 +1735,54 @@ copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask) } } +/* Copies/fills/updates for buffers are happening through CCU but need + * additional synchronization when write range is not aligned to 64 bytes. + * Because dst buffer access uses either R8_UNORM or R32_UINT and they are not + * coherent between each other in CCU since format seem to be a part of a + * cache key. + * + * See: https://gitlab.khronos.org/vulkan/vulkan/-/issues/3306 + * + * The synchronization with writes from UCHE (e.g. with SSBO stores) are + * solved by the fact that UCHE has byte level dirtiness tracking and that CCU + * flush would happen always before UCHE flush for such case (e.g. both + * renderpass and dispatch would flush pending CCU write). + * + * Additionally see: + * https://gitlab.khronos.org/vulkan/vulkan/-/issues/3398#note_400111 + */ +template +static void +handle_buffer_unaligned_store(struct tu_cmd_buffer *cmd, + uint64_t dst_va, + uint64_t size, + bool *unaligned_store) +{ + if (*unaligned_store) + return; + + if ((dst_va & 63) || (size & 63)) { + tu_flush_for_access(&cmd->state.cache, TU_ACCESS_NONE, + TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE); + /* Wait for invalidations to land. */ + cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE; + tu_emit_cache_flush(cmd); + *unaligned_store = true; + } +} + +template +static void +after_buffer_unaligned_buffer_store(struct tu_cmd_buffer *cmd, + bool unaligned_store) +{ + if (unaligned_store) { + tu_flush_for_access(&cmd->state.cache, + TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE, + TU_ACCESS_NONE); + } +} + template void tu6_clear_lrz(struct tu_cmd_buffer *cmd, @@ -2147,7 +2195,8 @@ static void tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, struct tu_image *src_image, struct tu_buffer *dst_buffer, - const VkBufferImageCopy2 *info) + const VkBufferImageCopy2 *info, + bool *unaligned_store) { struct tu_cs *cs = &cmd->cs; uint32_t layers = MAX2(info->imageExtent.depth, @@ -2182,6 +2231,10 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, uint32_t pitch = dst_width * util_format_get_blocksize(dst_format); uint32_t layer_size = pitch * dst_height; + handle_buffer_unaligned_store(cmd, + dst_buffer->iova + info->bufferOffset, + layer_size * layers, unaligned_store); + ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false, VK_SAMPLE_COUNT_1_BIT); @@ -2221,9 +2274,13 @@ tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage); VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer); + bool unaligned_store = false; for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i) tu_copy_image_to_buffer(cmd, src_image, dst_buffer, - pCopyImageToBufferInfo->pRegions + i); + pCopyImageToBufferInfo->pRegions + i, + &unaligned_store); + + after_buffer_unaligned_buffer_store(cmd, unaligned_store); } TU_GENX(tu_CmdCopyImageToBuffer2); @@ -2492,13 +2549,16 @@ copy_buffer(struct tu_cmd_buffer *cmd, uint64_t dst_va, uint64_t src_va, uint64_t size, - uint32_t block_size) + uint32_t block_size, + bool *unaligned_store) { const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM; uint64_t blocks = size / block_size; + handle_buffer_unaligned_store(cmd, dst_va, size, unaligned_store); + ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false, VK_SAMPLE_COUNT_1_BIT); @@ -2529,13 +2589,16 @@ tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer); VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer); + bool unaligned_store = false; for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) { const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i]; copy_buffer(cmd, dst_buffer->iova + region->dstOffset, src_buffer->iova + region->srcOffset, - region->size, 1); + region->size, 1, &unaligned_store); } + + after_buffer_unaligned_buffer_store(cmd, unaligned_store); } TU_GENX(tu_CmdCopyBuffer2); @@ -2557,8 +2620,11 @@ tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, return; } + bool unaligned_store = false; memcpy(tmp.map, pData, dataSize); - copy_buffer(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4); + copy_buffer(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4, &unaligned_store); + + after_buffer_unaligned_buffer_store(cmd, unaligned_store); } TU_GENX(tu_CmdUpdateBuffer); @@ -2580,6 +2646,9 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer, uint64_t dst_va = buffer->iova + dstOffset; uint32_t blocks = fillSize / 4; + bool unaligned_store = false; + handle_buffer_unaligned_store(cmd, dst_va, fillSize, &unaligned_store); + ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false, VK_SAMPLE_COUNT_1_BIT); @@ -2601,6 +2670,8 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer, } ops->teardown(cmd, cs); + + after_buffer_unaligned_buffer_store(cmd, unaligned_store); } TU_GENX(tu_CmdFillBuffer);