mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 20:28:04 +02:00
tu: allow bigger block sizes when copying between buffers
When copying between buffers, find the biggest possible block size usable for all copy regions. A common block size is used because using different block sizes can require additional flushing between different blocks. Besides the single-byte and 4-byte block sizes, also allow a 16-byte block size with its corresponding format. Using a bigger block size when possible can reduce the number of required CP_BLIT operations. Tested with the Crucible benchmarks; especially for larger copy regions, this can improve throughput by up to 3x. Signed-off-by: Zan Dobersek <zdobersek@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34587>
This commit is contained in:
parent
ae51c59663
commit
c0dfdc907b
1 changed files with 44 additions and 3 deletions
|
|
@ -3286,8 +3286,21 @@ copy_buffer(struct tu_cmd_buffer *cmd,
|
|||
{
|
||||
const struct blit_ops *ops = &r2d_ops<CHIP>;
|
||||
struct tu_cs *cs = &cmd->cs;
|
||||
enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
|
||||
uint64_t blocks = size / block_size;
|
||||
enum pipe_format format;
|
||||
|
||||
switch (block_size) {
|
||||
case 16:
|
||||
format = PIPE_FORMAT_R32G32B32A32_UINT;
|
||||
break;
|
||||
case 4:
|
||||
format = PIPE_FORMAT_R32_UINT;
|
||||
break;
|
||||
default:
|
||||
assert(block_size == 1);
|
||||
format = PIPE_FORMAT_R8_UNORM;
|
||||
break;
|
||||
}
|
||||
|
||||
handle_buffer_unaligned_store<CHIP>(cmd, dst_va, size, unaligned_store);
|
||||
|
||||
|
|
@ -3321,13 +3334,33 @@ tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
|
|||
VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
|
||||
VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
|
||||
|
||||
/* Choose the largest common block size for all copy regions
|
||||
* to prevent WaW hazards when potentially performing non-overlapping
|
||||
* unaligned stores through CCU. See handle_buffer_unaligned_store.
|
||||
*/
|
||||
uint32_t block_size = 16;
|
||||
for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
|
||||
const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
|
||||
uint64_t alignment_target = region->size |
|
||||
vk_buffer_address(&src_buffer->vk, region->srcOffset) |
|
||||
vk_buffer_address(&dst_buffer->vk, region->dstOffset);
|
||||
|
||||
uint32_t region_block_size = 1;
|
||||
if (!(alignment_target & 15))
|
||||
region_block_size = 16;
|
||||
else if (!(alignment_target & 3))
|
||||
region_block_size = 4;
|
||||
|
||||
block_size = MIN2(block_size, region_block_size);
|
||||
}
|
||||
|
||||
bool unaligned_store = false;
|
||||
for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
|
||||
const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
|
||||
copy_buffer<CHIP>(cmd,
|
||||
vk_buffer_address(&dst_buffer->vk, region->dstOffset),
|
||||
vk_buffer_address(&src_buffer->vk, region->srcOffset),
|
||||
region->size, 1, &unaligned_store);
|
||||
region->size, block_size, &unaligned_store);
|
||||
}
|
||||
|
||||
after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
|
||||
|
|
@ -3352,10 +3385,18 @@ tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
|
|||
return;
|
||||
}
|
||||
|
||||
/* As in tu_CmdCopyBuffer2(), the largest viable block size is used. */
|
||||
uint64_t alignment_target = dataSize | vk_buffer_address(&buffer->vk, dstOffset);
|
||||
uint32_t block_size = 1;
|
||||
if (!(alignment_target & 15))
|
||||
block_size = 16;
|
||||
else if (!(alignment_target & 3))
|
||||
block_size = 4;
|
||||
|
||||
bool unaligned_store = false;
|
||||
memcpy(tmp.map, pData, dataSize);
|
||||
copy_buffer<CHIP>(cmd, vk_buffer_address(&buffer->vk, dstOffset),
|
||||
tmp.iova, dataSize, 4, &unaligned_store);
|
||||
tmp.iova, dataSize, block_size, &unaligned_store);
|
||||
|
||||
after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue