radv: re-introduce the compute vs CP DMA heuristic for copy/fill operations
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

This caused a -5% performance regression in Control because using
compute always eats resources.

This new approach introduces a flag called RADV_COPY_FLAGS_DEVICE_LOCAL
which can be used to indicate if the underlying memory is device local.
This should also help with future work.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12639
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34556>
This commit is contained in:
Samuel Pitoiset 2025-04-16 16:03:29 +02:00 committed by Marge Bot
parent 5e2508e7c4
commit e616761fb2
6 changed files with 99 additions and 24 deletions

View file

@ -65,6 +65,10 @@ struct radv_meta_saved_state {
unsigned active_occlusion_queries;
};
/* Properties of the memory behind a copy/fill operand, used to pick between
 * compute and CP DMA paths. Declared as a bitmask so more hints can be added
 * later without changing function signatures. */
enum radv_copy_flags {
   RADV_COPY_FLAGS_DEVICE_LOCAL = 1 << 0, /* memory lives in VRAM (device-local heap) */
};
enum radv_blit_ds_layout {
RADV_BLIT_DS_LAYOUT_TILE_ENABLE,
RADV_BLIT_DS_LAYOUT_TILE_DISABLE,
@ -182,6 +186,7 @@ struct radv_meta_blit2d_buffer {
uint32_t pitch;
uint8_t bs;
VkFormat format;
enum radv_copy_flags copy_flags;
};
struct radv_meta_blit2d_rect {
@ -263,12 +268,14 @@ void radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image
uint32_t radv_fill_buffer(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va, uint64_t size,
uint32_t value);
uint32_t radv_fill_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t size, uint32_t value);
uint32_t radv_fill_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t size, uint32_t value,
enum radv_copy_flags copy_flags);
uint32_t radv_fill_image(struct radv_cmd_buffer *cmd_buffer, const struct radv_image *image, uint64_t offset,
uint64_t size, uint32_t value);
void radv_copy_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t src_va, uint64_t dst_va, uint64_t size);
void radv_copy_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t src_va, uint64_t dst_va, uint64_t size,
enum radv_copy_flags src_copy_flags, enum radv_copy_flags dst_copy_flags);
void radv_cmd_buffer_clear_attachment(struct radv_cmd_buffer *cmd_buffer, const VkClearAttachment *attachment);

View file

@ -190,11 +190,29 @@ radv_compute_copy_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t src_va, ui
radv_meta_restore(&saved_state, cmd_buffer);
}
/* Decide whether a buffer copy/fill of 'size' bytes should go through the
 * compute path (true) or CP DMA (false).
 *
 * Large transfers default to compute (size >= RADV_BUFFER_OPS_CS_THRESHOLD),
 * but on GFX10+ dedicated-VRAM GPUs any operand that is not device-local is
 * routed to CP DMA: the transfer is bottlenecked by the PCIe bus anyway, and
 * using compute would needlessly occupy shader resources (this caused a -5%
 * regression in Control — see the commit message above).
 *
 * src_copy_flags/dst_copy_flags describe the source and destination memory;
 * callers pass the same value twice for single-operand ops such as fills.
 */
static bool
radv_prefer_compute_or_cp_dma(const struct radv_device *device, uint64_t size, enum radv_copy_flags src_copy_flags,
enum radv_copy_flags dst_copy_flags)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
bool use_compute = size >= RADV_BUFFER_OPS_CS_THRESHOLD;
if (pdev->info.gfx_level >= GFX10 && pdev->info.has_dedicated_vram) {
if (!(src_copy_flags & RADV_COPY_FLAGS_DEVICE_LOCAL) || !(dst_copy_flags & RADV_COPY_FLAGS_DEVICE_LOCAL)) {
/* Prefer CP DMA for GTT on dGPUs due to slow PCIe. */
use_compute = false;
}
}
return use_compute;
}
static uint32_t
radv_fill_memory_internal(struct radv_cmd_buffer *cmd_buffer, const struct radv_image *image, uint64_t va,
uint64_t size, uint32_t value)
uint64_t size, uint32_t value, enum radv_copy_flags copy_flags)
{
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
bool use_compute = radv_prefer_compute_or_cp_dma(device, size, copy_flags, copy_flags);
uint32_t flush_bits = 0;
assert(!(va & 3));
@ -202,7 +220,7 @@ radv_fill_memory_internal(struct radv_cmd_buffer *cmd_buffer, const struct radv_
if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
radv_sdma_fill_memory(device, cmd_buffer->cs, va, size, value);
} else if (size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
} else if (use_compute) {
radv_compute_fill_memory(cmd_buffer, va, size, value);
flush_bits = RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE |
@ -215,9 +233,10 @@ radv_fill_memory_internal(struct radv_cmd_buffer *cmd_buffer, const struct radv_
}
uint32_t
radv_fill_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t size, uint32_t value)
radv_fill_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t size, uint32_t value,
enum radv_copy_flags copy_flags)
{
return radv_fill_memory_internal(cmd_buffer, NULL, va, size, value);
return radv_fill_memory_internal(cmd_buffer, NULL, va, size, value, copy_flags);
}
uint32_t
@ -227,10 +246,14 @@ radv_fill_image(struct radv_cmd_buffer *cmd_buffer, const struct radv_image *ima
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const uint64_t va = image->bindings[0].addr + offset;
struct radeon_winsys_bo *bo = image->bindings[0].bo;
enum radv_copy_flags copy_flags = 0;
if (bo->initial_domain & RADEON_DOMAIN_VRAM)
copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
return radv_fill_memory_internal(cmd_buffer, image, va, size, value);
return radv_fill_memory_internal(cmd_buffer, image, va, size, value, copy_flags);
}
uint32_t
@ -238,10 +261,14 @@ radv_fill_buffer(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo
uint32_t value)
{
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
enum radv_copy_flags copy_flags = 0;
if (bo->initial_domain & RADEON_DOMAIN_VRAM)
copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
return radv_fill_memory(cmd_buffer, va, size, value);
return radv_fill_memory(cmd_buffer, va, size, value, copy_flags);
}
VKAPI_ATTR void VKAPI_CALL
@ -261,10 +288,12 @@ radv_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSi
}
void
radv_copy_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t src_va, uint64_t dst_va, uint64_t size)
radv_copy_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t src_va, uint64_t dst_va, uint64_t size,
enum radv_copy_flags src_copy_flags, enum radv_copy_flags dst_copy_flags)
{
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const bool use_compute = !(size & 3) && !(src_va & 3) && !(dst_va & 3) && size >= RADV_BUFFER_OPS_CS_THRESHOLD;
const bool use_compute = !(size & 3) && !(src_va & 3) && !(dst_va & 3) &&
radv_prefer_compute_or_cp_dma(device, size, src_copy_flags, dst_copy_flags);
if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
radv_sdma_copy_memory(device, cmd_buffer->cs, src_va, dst_va, size);
@ -282,6 +311,12 @@ radv_CmdCopyBuffer2(VkCommandBuffer commandBuffer, const VkCopyBufferInfo2 *pCop
VK_FROM_HANDLE(radv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
VK_FROM_HANDLE(radv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
enum radv_copy_flags src_copy_flags = 0, dst_copy_flags = 0;
if (src_buffer->bo->initial_domain & RADEON_DOMAIN_VRAM)
src_copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
if (dst_buffer->bo->initial_domain & RADEON_DOMAIN_VRAM)
dst_copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
radv_suspend_conditional_rendering(cmd_buffer);
@ -293,7 +328,7 @@ radv_CmdCopyBuffer2(VkCommandBuffer commandBuffer, const VkCopyBufferInfo2 *pCop
const uint64_t src_va = vk_buffer_address(&src_buffer->vk, region->srcOffset);
const uint64_t dst_va = vk_buffer_address(&dst_buffer->vk, region->dstOffset);
radv_copy_memory(cmd_buffer, src_va, dst_va, region->size);
radv_copy_memory(cmd_buffer, src_va, dst_va, region->size, src_copy_flags, dst_copy_flags);
}
radv_resume_conditional_rendering(cmd_buffer);
@ -324,7 +359,8 @@ radv_update_buffer_cp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, const voi
}
static void
radv_update_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t size, const void *data)
radv_update_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t size, const void *data,
enum radv_copy_flags dst_copy_flags)
{
assert(!(size & 3));
assert(!(va & 3));
@ -335,13 +371,17 @@ radv_update_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t siz
if (size < RADV_BUFFER_UPDATE_THRESHOLD && cmd_buffer->qf != RADV_QUEUE_TRANSFER) {
radv_update_buffer_cp(cmd_buffer, va, data, size);
} else {
enum radv_copy_flags src_copy_flags = 0;
uint32_t buf_offset;
radv_cmd_buffer_upload_data(cmd_buffer, size, data, &buf_offset);
if (cmd_buffer->upload.upload_bo->initial_domain & RADEON_DOMAIN_VRAM)
src_copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
const uint64_t src_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + buf_offset;
radv_copy_memory(cmd_buffer, src_va, va, size);
radv_copy_memory(cmd_buffer, src_va, va, size, src_copy_flags, dst_copy_flags);
}
}
@ -353,12 +393,16 @@ radv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDevice
VK_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const uint64_t dst_va = vk_buffer_address(&dst_buffer->vk, dstOffset);
enum radv_copy_flags dst_copy_flags = 0;
if (dst_buffer->bo->initial_domain & RADEON_DOMAIN_VRAM)
dst_copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
radv_suspend_conditional_rendering(cmd_buffer);
radv_cs_add_buffer(device->ws, cmd_buffer->cs, dst_buffer->bo);
radv_update_memory(cmd_buffer, dst_va, dataSize, pData);
radv_update_memory(cmd_buffer, dst_va, dataSize, pData, dst_copy_flags);
radv_resume_conditional_rendering(cmd_buffer);
}

View file

@ -622,6 +622,7 @@ fixup_gfx9_cs_copy(struct radv_cmd_buffer *cmd_buffer, const struct radv_meta_bl
const struct radeon_surf *surf = &image->planes[0].surface;
const struct radeon_info *gpu_info = &pdev->info;
struct ac_surf_info surf_info = radv_get_ac_surf_info(device, image);
enum radv_copy_flags img_copy_flags = 0, mem_copy_flags = 0;
/* GFX10 will use a different workaround unless this is not a 2D image */
if (gpu_info->gfx_level < GFX9 || (gpu_info->gfx_level >= GFX10 && image->vk.image_type == VK_IMAGE_TYPE_2D) ||
@ -654,6 +655,10 @@ fixup_gfx9_cs_copy(struct radv_cmd_buffer *cmd_buffer, const struct radv_meta_bl
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
}
if (image->bindings[0].bo && (image->bindings[0].bo->initial_domain & RADEON_DOMAIN_VRAM))
img_copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
mem_copy_flags |= buf_bsurf->copy_flags;
for (uint32_t y = 0; y < mip_extent.height; y++) {
uint32_t coordY = y + mip_offset.y;
/* If the default copy algorithm (done previously) has already seen this
@ -670,9 +675,9 @@ fixup_gfx9_cs_copy(struct radv_cmd_buffer *cmd_buffer, const struct radv_meta_bl
/* buf_bsurf->offset already includes the layer offset */
const uint64_t mem_va = buf_bsurf->addr + buf_bsurf->offset + y * buf_bsurf->pitch * surf->bpe + x * surf->bpe;
if (to_image) {
radv_copy_memory(cmd_buffer, mem_va, img_va, surf->bpe);
radv_copy_memory(cmd_buffer, mem_va, img_va, surf->bpe, mem_copy_flags, img_copy_flags);
} else {
radv_copy_memory(cmd_buffer, img_va, mem_va, surf->bpe);
radv_copy_memory(cmd_buffer, img_va, mem_va, surf->bpe, img_copy_flags, mem_copy_flags);
}
}
}

View file

@ -98,7 +98,8 @@ transfer_copy_memory_image(struct radv_cmd_buffer *cmd_buffer, uint64_t buffer_v
static void
copy_memory_to_image(struct radv_cmd_buffer *cmd_buffer, uint64_t buffer_addr, uint64_t buffer_size,
struct radv_image *image, VkImageLayout layout, const VkBufferImageCopy2 *region)
enum radv_copy_flags src_copy_flags, struct radv_image *image, VkImageLayout layout,
const VkBufferImageCopy2 *region)
{
if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
transfer_copy_memory_image(cmd_buffer, buffer_addr, image, region, true);
@ -173,6 +174,7 @@ copy_memory_to_image(struct radv_cmd_buffer *cmd_buffer, uint64_t buffer_addr, u
.format = img_bsurf.format,
.offset = region->bufferOffset,
.pitch = buf_layout.row_stride_B / buf_layout.element_size_B,
.copy_flags = src_copy_flags,
};
if (image->vk.image_type == VK_IMAGE_TYPE_3D)
@ -218,6 +220,10 @@ radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, const VkCopyBufferToIm
VK_FROM_HANDLE(radv_image, dst_image, pCopyBufferToImageInfo->dstImage);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
enum radv_copy_flags src_copy_flags = 0;
if (src_buffer->bo->initial_domain & RADEON_DOMAIN_VRAM)
src_copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
radv_suspend_conditional_rendering(cmd_buffer);
@ -230,7 +236,7 @@ radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, const VkCopyBufferToIm
radv_cs_add_buffer(device->ws, cmd_buffer->cs, dst_image->bindings[bind_idx].bo);
copy_memory_to_image(cmd_buffer, src_buffer->vk.device_address, src_buffer->vk.size, dst_image,
copy_memory_to_image(cmd_buffer, src_buffer->vk.device_address, src_buffer->vk.size, src_copy_flags, dst_image,
pCopyBufferToImageInfo->dstImageLayout, region);
}
@ -262,7 +268,8 @@ radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, const VkCopyBufferToIm
static void
copy_image_to_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t buffer_addr, uint64_t buffer_size,
struct radv_image *image, VkImageLayout layout, const VkBufferImageCopy2 *region)
enum radv_copy_flags dst_copy_flags, struct radv_image *image, VkImageLayout layout,
const VkBufferImageCopy2 *region)
{
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
@ -332,6 +339,7 @@ copy_image_to_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t buffer_addr, u
.format = img_info.format,
.offset = region->bufferOffset,
.pitch = buf_extent_el.width,
.copy_flags = dst_copy_flags,
};
if (image->vk.image_type == VK_IMAGE_TYPE_3D)
@ -367,6 +375,10 @@ radv_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, const VkCopyImageToBuf
VK_FROM_HANDLE(radv_image, src_image, pCopyImageToBufferInfo->srcImage);
VK_FROM_HANDLE(radv_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
enum radv_copy_flags dst_copy_flags = 0;
if (dst_buffer->bo->initial_domain & RADEON_DOMAIN_VRAM)
dst_copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
radv_suspend_conditional_rendering(cmd_buffer);
@ -379,7 +391,7 @@ radv_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, const VkCopyImageToBuf
radv_cs_add_buffer(device->ws, cmd_buffer->cs, src_image->bindings[bind_idx].bo);
copy_image_to_memory(cmd_buffer, dst_buffer->vk.device_address, dst_buffer->vk.size, src_image,
copy_image_to_memory(cmd_buffer, dst_buffer->vk.device_address, dst_buffer->vk.size, dst_copy_flags, src_image,
pCopyImageToBufferInfo->srcImageLayout, region);
}

View file

@ -93,6 +93,7 @@ static void
radv_fixup_copy_dst_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image *src_image,
const struct radv_image *dst_image)
{
enum radv_copy_flags src_copy_flags = 0, dst_copy_flags = 0;
uint64_t src_va, dst_va, size;
assert(src_image->planes[0].surface.cmask_size == dst_image->planes[0].surface.cmask_size &&
@ -102,12 +103,17 @@ radv_fixup_copy_dst_metadata(struct radv_cmd_buffer *cmd_buffer, const struct ra
dst_image->planes[0].surface.fmask_offset + dst_image->planes[0].surface.fmask_size ==
dst_image->planes[0].surface.cmask_offset);
if (src_image->bindings[0].bo && (src_image->bindings[0].bo->initial_domain & RADEON_DOMAIN_VRAM))
src_copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
if (dst_image->bindings[0].bo && (dst_image->bindings[0].bo->initial_domain & RADEON_DOMAIN_VRAM))
dst_copy_flags |= RADV_COPY_FLAGS_DEVICE_LOCAL;
/* Copy CMASK+FMASK. */
size = src_image->planes[0].surface.cmask_size + src_image->planes[0].surface.fmask_size;
src_va = src_image->bindings[0].addr + src_image->planes[0].surface.fmask_offset;
dst_va = dst_image->bindings[0].addr + dst_image->planes[0].surface.fmask_offset;
radv_copy_memory(cmd_buffer, src_va, dst_va, size);
radv_copy_memory(cmd_buffer, src_va, dst_va, size, src_copy_flags, dst_copy_flags);
}
bool

View file

@ -581,7 +581,7 @@ radv_init_update_scratch(VkCommandBuffer commandBuffer, VkDeviceAddress scratch,
/* Prepare ready counts for internal nodes */
radv_fill_memory(cmd_buffer, scratch + layout.internal_ready_count_offset,
layout.update_size - layout.internal_ready_count_offset, 0x0);
layout.update_size - layout.internal_ready_count_offset, 0x0, RADV_COPY_FLAGS_DEVICE_LOCAL);
}
static void
@ -627,7 +627,8 @@ radv_update_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuild
const uint64_t src_va = vk_acceleration_structure_get_va(src);
const uint64_t dst_va = vk_acceleration_structure_get_va(dst);
radv_copy_memory(cmd_buffer, src_va, dst_va, layout.bvh_offset);
radv_copy_memory(cmd_buffer, src_va, dst_va, layout.bvh_offset, RADV_COPY_FLAGS_DEVICE_LOCAL,
RADV_COPY_FLAGS_DEVICE_LOCAL);
}
struct scratch_layout layout;
@ -721,7 +722,7 @@ static void
radv_cmd_fill_buffer_addr(VkCommandBuffer commandBuffer, VkDeviceAddress addr, VkDeviceSize size, uint32_t data)
{
VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
radv_fill_memory(cmd_buffer, addr, size, data);
radv_fill_memory(cmd_buffer, addr, size, data, RADV_COPY_FLAGS_DEVICE_LOCAL);
}
VkResult