radv/sqtt: use a staging buffer for faster reads on dGPUs

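Reading the trace back from a host-cached staging buffer in GTT is way
faster than reading it directly from the SQTT buffer in VRAM.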

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39195>
This commit is contained in:
Samuel Pitoiset 2026-01-07 14:25:25 +01:00 committed by Marge Bot
parent 5d430940d2
commit c7d0aa6671
2 changed files with 106 additions and 17 deletions

View file

@ -227,9 +227,13 @@ struct radv_device {
VkCommandBuffer sqtt_start_cmdbuf[2];
VkCommandBuffer sqtt_stop_cmdbuf[2];
uint64_t sqtt_size;
VkBuffer sqtt_buffer;
VkDeviceMemory sqtt_memory;
VkBuffer sqtt_staging_buffer;
VkDeviceMemory sqtt_staging_memory;
/* SQTT timestamps for queue events. */
simple_mtx_t sqtt_timestamp_mtx;
struct radv_sqtt_timestamp sqtt_timestamp;
@ -238,6 +242,9 @@ struct radv_device {
simple_mtx_t sqtt_command_pool_mtx;
struct vk_command_pool *sqtt_command_pool[2];
/* Whether to use a staging buffer for SQTT/SPM buffers. */
bool rgp_use_staging_buffer;
/* Memory trace. */
struct radv_memory_trace_data memory_trace;

View file

@ -334,8 +334,8 @@ radv_sqtt_init_bo(struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
unsigned max_se = pdev->info.max_se;
VkDeviceMemory memory;
VkBuffer buffer;
VkDeviceMemory memory, staging_memory;
VkBuffer buffer, staging_buffer;
VkResult result;
uint64_t size;
uint64_t va;
@ -351,24 +351,16 @@ radv_sqtt_init_bo(struct radv_device *device)
size += device->sqtt.buffer_size * (uint64_t)max_se;
/* Allocate the SQTT buffer (it must be in VRAM). */
const uint32_t memory_type_index =
radv_find_memory_index(pdev, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
const uint32_t memory_type_index = radv_find_memory_index(
pdev,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
(device->rgp_use_staging_buffer ? 0
: VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT));
result = radv_sqtt_allocate_buffer(radv_device_to_handle(device), size, memory_type_index, &buffer, &memory);
if (result != VK_SUCCESS)
return false;
VkMemoryMapInfo mem_map_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO,
.memory = memory,
.size = VK_WHOLE_SIZE,
};
result = radv_MapMemory2(radv_device_to_handle(device), &mem_map_info, &ptr);
if (result != VK_SUCCESS)
return false;
VkBufferDeviceAddressInfo addr_info = {
.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO,
.buffer = buffer,
@ -376,8 +368,33 @@ radv_sqtt_init_bo(struct radv_device *device)
va = vk_common_GetBufferDeviceAddress(radv_device_to_handle(device), &addr_info);
/* Allocate a staging buffer in GTT. */
if (device->rgp_use_staging_buffer) {
const uint32_t staging_memory_type_index =
radv_find_memory_index(pdev, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
result = radv_sqtt_allocate_buffer(radv_device_to_handle(device), size, staging_memory_type_index,
&staging_buffer, &staging_memory);
if (result != VK_SUCCESS)
return false;
}
VkMemoryMapInfo mem_map_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO,
.memory = device->rgp_use_staging_buffer ? staging_memory : memory,
.size = VK_WHOLE_SIZE,
};
result = radv_MapMemory2(radv_device_to_handle(device), &mem_map_info, &ptr);
if (result != VK_SUCCESS)
return false;
device->sqtt_size = size;
device->sqtt_buffer = buffer;
device->sqtt_memory = memory;
device->sqtt_staging_buffer = staging_buffer;
device->sqtt_staging_memory = staging_memory;
device->sqtt.buffer_va = va;
device->sqtt.bo = &device->sqtt_buffer;
device->sqtt.ptr = ptr;
@ -388,16 +405,20 @@ radv_sqtt_init_bo(struct radv_device *device)
static void
radv_sqtt_finish_bo(struct radv_device *device)
{
if (device->sqtt_memory) {
VkDeviceMemory memory = device->rgp_use_staging_buffer ? device->sqtt_staging_memory : device->sqtt_memory;
if (memory) {
VkMemoryUnmapInfo unmap_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_UNMAP_INFO,
.memory = device->sqtt_memory,
.memory = memory,
};
radv_UnmapMemory2(radv_device_to_handle(device), &unmap_info);
}
radv_sqtt_destroy_buffer(radv_device_to_handle(device), device->sqtt_buffer, device->sqtt_memory);
if (device->rgp_use_staging_buffer)
radv_sqtt_destroy_buffer(radv_device_to_handle(device), device->sqtt_staging_buffer, device->sqtt_staging_memory);
}
static VkResult
@ -473,12 +494,16 @@ radv_unregister_queues(struct radv_device *device, struct ac_sqtt *sqtt)
bool
radv_sqtt_init(struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct ac_sqtt *sqtt = &device->sqtt;
/* Default buffer size set to 32MB per SE. */
device->sqtt.buffer_size = (uint32_t)debug_get_num_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);
device->sqtt.instruction_timing_enabled = radv_is_instruction_timing_enabled();
/* Whether to use a staging buffer for faster reads on dGPUs. */
device->rgp_use_staging_buffer = pdev->info.has_dedicated_vram;
if (device->ws->reserve_vmid(device->ws) < 0) {
fprintf(stderr, "radv: Failed to reserve VMID for SQTT tracing.\n");
return false;
@ -542,6 +567,58 @@ radv_sqtt_resize_bo(struct radv_device *device)
return radv_sqtt_init_bo(device);
}
/**
 * Record a GPU copy of the first \p size bytes of \p src_buffer into
 * \p dst_buffer, bracketed by the memory barriers needed on both sides.
 *
 * Before the copy, a global barrier makes all prior memory writes from any
 * command visible to transfer reads. After the copy, a second global barrier
 * makes the transfer write available to host reads.
 *
 * \param cmdbuf      Command buffer the barriers and the copy are recorded into.
 * \param src_buffer  Buffer to read from, starting at offset 0.
 * \param dst_buffer  Buffer to write to, starting at offset 0.
 * \param size        Number of bytes to copy.
 */
static void
radv_sqtt_copy_buffer(VkCommandBuffer cmdbuf, VkBuffer src_buffer, VkBuffer dst_buffer, uint64_t size)
{
   /* Wait for every previous write to land before the transfer reads the source. */
   const VkMemoryBarrier2 pre_barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
      .srcAccessMask = VK_ACCESS_2_MEMORY_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT,
      .dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT,
   };
   const VkDependencyInfo pre_dep_info = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &pre_barrier,
   };

   radv_CmdPipelineBarrier2(cmdbuf, &pre_dep_info);

   /* Single region covering [0, size) in both buffers. */
   const VkBufferCopy2 copy = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2,
      .srcOffset = 0,
      .dstOffset = 0,
      .size = size,
   };
   const VkCopyBufferInfo2 copy_info = {
      .sType = VK_STRUCTURE_TYPE_COPY_BUFFER_INFO_2,
      .srcBuffer = src_buffer,
      .dstBuffer = dst_buffer,
      .regionCount = 1,
      .pRegions = &copy,
   };

   radv_CmdCopyBuffer2(cmdbuf, &copy_info);

   /* Make the copied data available to the host. HOST_READ access is only
    * valid together with the HOST pipeline stage, so the destination stage
    * must be HOST rather than TOP_OF_PIPE.
    */
   const VkMemoryBarrier2 post_barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT,
      .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
      .dstAccessMask = VK_ACCESS_2_HOST_READ_BIT,
   };
   const VkDependencyInfo post_dep_info = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &post_barrier,
   };

   radv_CmdPipelineBarrier2(cmdbuf, &post_dep_info);
}
static bool
radv_begin_sqtt(struct radv_queue *queue)
{
@ -676,6 +753,11 @@ radv_end_sqtt(struct radv_queue *queue)
/* Restore previous state by re-enabling clock gating. */
ac_emit_cp_inhibit_clockgating(cs->b, pdev->info.gfx_level, false);
/* Copy to the staging buffer for faster reads on dGPUs. */
if (device->rgp_use_staging_buffer) {
radv_sqtt_copy_buffer(cmdbuf, device->sqtt_buffer, device->sqtt_staging_buffer, device->sqtt_size);
}
result = radv_EndCommandBuffer(cmdbuf);
if (result != VK_SUCCESS)
return false;