mesa/src/amd/vulkan/radv_sqtt.c
Samuel Pitoiset f0b3a6f9d4 radv: rework command buffer emission with begin/end sequences
A begin/end sequence looks something like this (it's all macro-based):

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
   radeon_emit(vertex_count);
   radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
   radeon_end();

This is loosely based on RadeonSI (see !8653 (a0978fff)) and it seems
indeed faster overall.

The main goal of this rework is to re-use the same logic as RadeonSI
for paired packets on GFX12 (also GFX11 dGPUs) because it's supposed
to be way faster, especially on GFX12 where the CP is slow. The other
goal is to share more cmdbuf emission between both drivers in the near
future.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34229>
2025-04-01 06:18:28 +00:00

857 lines
26 KiB
C

/*
* Copyright © 2020 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include <inttypes.h>
#include "radv_buffer.h"
#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_entrypoints.h"
#include "radv_perfcounter.h"
#include "radv_spm.h"
#include "radv_sqtt.h"
#include "sid.h"
#include "ac_pm4.h"
#include "vk_command_pool.h"
#include "vk_common_entrypoints.h"
/* Return the value of the RADV_THREAD_TRACE_INSTRUCTION_TIMING boolean debug
 * option; the option defaults to true when it is not set. */
bool
radv_is_instruction_timing_enabled(void)
{
   const bool timing_enabled = debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true);

   return timing_enabled;
}
/* Return the value of the RADV_THREAD_TRACE_QUEUE_EVENTS boolean debug option;
 * the option defaults to true when it is not set. */
bool
radv_sqtt_queue_events_enabled(void)
{
   const bool events_enabled = debug_get_bool_option("RADV_THREAD_TRACE_QUEUE_EVENTS", true);

   return events_enabled;
}
/* Map a hardware IP type to the matching RADV queue family. Only GFX, COMPUTE
 * and SDMA are handled; any other value hits unreachable(). */
static enum radv_queue_family
radv_ip_to_queue_family(enum amd_ip_type t)
{
   if (t == AMD_IP_GFX)
      return RADV_QUEUE_GENERAL;

   if (t == AMD_IP_COMPUTE)
      return RADV_QUEUE_COMPUTE;

   if (t == AMD_IP_SDMA)
      return RADV_QUEUE_TRANSFER;

   unreachable("Unknown IP type");
}
/* Emit a cache flush into cs that waits for outstanding shader work
 * (CS partial flush, plus PS partial flush on non-compute queues) and
 * invalidates the instruction, scalar, vector and L2 caches.
 *
 * NOTE(review): `family` is converted with radv_ip_to_queue_family() (an
 * amd_ip_type mapping) but is also compared against RADV_QUEUE_COMPUTE, and
 * the callers in this file pass a radv_queue_family. This presumably relies
 * on both enums sharing numeric values -- confirm.
 */
static void
radv_emit_wait_for_idle(const struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum radv_queue_family qf = radv_ip_to_queue_family(family);
   enum rgp_flush_bits sqtt_flush_bits = 0;
   unsigned flush_flags;

   /* Compute queues only need to drain compute shaders. */
   if (family == RADV_QUEUE_COMPUTE)
      flush_flags = RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
   else
      flush_flags = RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH;

   flush_flags |=
      RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;

   radv_cs_emit_cache_flush(device->ws, cs, pdev->info.gfx_level, NULL, 0, qf, flush_flags, &sqtt_flush_bits, 0);
}
/* Build the SQTT start sequence in a temporary PM4 state and copy it into cs.
 * If the PM4 state cannot be created, nothing is emitted. */
static void
radv_emit_sqtt_start(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const bool on_compute = (qf == RADV_QUEUE_COMPUTE);
   struct ac_pm4_state *state = ac_pm4_create_sized(&pdev->info, false, 512, on_compute);

   if (!state)
      return;

   ac_sqtt_emit_start(&pdev->info, state, &device->sqtt, on_compute);
   ac_pm4_finalize(state);

   /* Reserve room for exactly the number of dwords that were generated. */
   radeon_check_space(device->ws, cs, state->ndw);
   radv_emit_pm4_commands(cs, state);

   ac_pm4_free_state(state);
}
/* Emit the SQTT stop sequence followed by the SQTT wait sequence into cs.
 * Both are built in the same temporary PM4 state, which is cleared between
 * the two. If the PM4 state cannot be created, nothing is emitted. */
static void
radv_emit_sqtt_stop(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const bool on_compute = (qf == RADV_QUEUE_COMPUTE);
   struct ac_pm4_state *state = ac_pm4_create_sized(&pdev->info, false, 512, on_compute);

   if (!state)
      return;

   /* First pass: stop packets. */
   ac_sqtt_emit_stop(&pdev->info, state, on_compute);
   ac_pm4_finalize(state);

   radeon_check_space(device->ws, cs, state->ndw);
   radv_emit_pm4_commands(cs, state);

   /* Reuse the same state object for the wait packets. */
   ac_pm4_clear_state(state, &pdev->info, false, on_compute);

   /* Some chips with disabled RBs should wait for idle because FINISH_DONE doesn't work. */
   if (pdev->info.has_sqtt_rb_harvest_bug)
      radv_emit_wait_for_idle(device, cs, qf);

   /* Second pass: wait packets. */
   ac_sqtt_emit_wait(&pdev->info, state, &device->sqtt, on_compute);
   ac_pm4_finalize(state);

   radeon_check_space(device->ws, cs, state->ndw);
   radv_emit_pm4_commands(cs, state);

   ac_pm4_free_state(state);
}
/* Write num_dwords dwords from data to SQ_THREAD_TRACE_USERDATA_2 in the
 * command buffer's CS, at most two dwords per register write. Nothing is
 * emitted for transfer-queue command buffers. */
void
radv_emit_sqtt_userdata(const struct radv_cmd_buffer *cmd_buffer, const void *data, uint32_t num_dwords)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
   const enum amd_ip_type ring = radv_queue_family_to_ring(pdev, cmd_buffer->qf);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const uint32_t *src = data;

   /* SQTT user data packets aren't supported on SDMA queues. */
   if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
      return;

   for (uint32_t remaining = num_dwords; remaining > 0;) {
      const uint32_t chunk = MIN2(remaining, 2);

      /* Two dwords of packet header plus the payload. */
      radeon_check_space(device->ws, cs, 2 + chunk);

      radeon_begin(cs);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      if (gfx_level >= GFX10)
         radeon_set_uconfig_perfctr_reg_seq(gfx_level, ring, R_030D08_SQ_THREAD_TRACE_USERDATA_2, chunk);
      else
         radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, chunk);

      radeon_emit_array(src, chunk);
      radeon_end();

      src += chunk;
      remaining -= chunk;
   }
}
void
radv_emit_spi_config_cntl(const struct radv_device *device, struct radeon_cmdbuf *cs, bool enable)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
radeon_set_uconfig_reg(R_031120_SPI_SQG_EVENT_CTL,
S_031120_ENABLE_SQG_TOP_EVENTS(enable) | S_031120_ENABLE_SQG_BOP_EVENTS(enable));
} else if (pdev->info.gfx_level >= GFX9) {
uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) |
S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable);
if (pdev->info.gfx_level >= GFX10)
spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);
radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
} else {
/* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable));
}
radeon_end();
}
void
radv_emit_inhibit_clockgating(const struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
if (pdev->info.gfx_level >= GFX11)
return; /* not needed */
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX10) {
radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL, S_037390_PERFMON_CLOCK_STATE(inhibit));
} else if (pdev->info.gfx_level >= GFX8) {
radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL, S_0372FC_PERFMON_CLOCK_STATE(inhibit));
}
radeon_end();
}
/* Hand out an 8-byte slot for a GPU timestamp from the device's current
 * timestamp BO, under sqtt_timestamp_mtx.
 *
 * When the current BO has no room left (or none exists yet) a new GTT BO is
 * created, at least 4096 bytes and otherwise twice the previous size, and
 * CPU-mapped. The previous BO, if any, is copied into a heap-allocated record
 * and linked into device->sqtt_timestamp.list.
 *
 * On success the BO, the byte offset of the slot and its CPU pointer are
 * returned through the out parameters. On failure a Vulkan error is returned
 * and the out parameters are left untouched.
 */
VkResult
radv_sqtt_acquire_gpu_timestamp(struct radv_device *device, struct radeon_winsys_bo **gpu_timestamp_bo,
                                uint32_t *gpu_timestamp_offset, void **gpu_timestamp_ptr)
{
   VkResult result = VK_SUCCESS;

   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   if (device->sqtt_timestamp.offset + 8 > device->sqtt_timestamp.size) {
      const uint64_t grown_size = MAX2(4096, 2 * device->sqtt_timestamp.size);
      struct radeon_winsys_bo *new_bo;
      uint8_t *cpu_ptr;

      result = radv_bo_create(device, NULL, grown_size, 8, RADEON_DOMAIN_GTT,
                              RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_SCRATCH,
                              0, true, &new_bo);
      if (result != VK_SUCCESS)
         goto out_unlock;

      cpu_ptr = radv_buffer_map(device->ws, new_bo);
      if (!cpu_ptr) {
         radv_bo_destroy(device, NULL, new_bo);
         result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
         goto out_unlock;
      }

      /* Keep the exhausted BO around by moving its bookkeeping to the list. */
      if (device->sqtt_timestamp.bo) {
         struct radv_sqtt_timestamp *retired = malloc(sizeof(*retired));

         if (!retired) {
            radv_bo_destroy(device, NULL, new_bo);
            result = VK_ERROR_OUT_OF_HOST_MEMORY;
            goto out_unlock;
         }

         memcpy(retired, &device->sqtt_timestamp, sizeof(*retired));
         list_add(&retired->list, &device->sqtt_timestamp.list);
      }

      device->sqtt_timestamp.bo = new_bo;
      device->sqtt_timestamp.size = grown_size;
      device->sqtt_timestamp.offset = 0;
      device->sqtt_timestamp.map = cpu_ptr;
   }

   *gpu_timestamp_bo = device->sqtt_timestamp.bo;
   *gpu_timestamp_offset = device->sqtt_timestamp.offset;
   *gpu_timestamp_ptr = device->sqtt_timestamp.map + device->sqtt_timestamp.offset;

   /* Consume the slot that was just handed out. */
   device->sqtt_timestamp.offset += 8;

out_unlock:
   simple_mtx_unlock(&device->sqtt_timestamp_mtx);
   return result;
}
/* Destroy every timestamp BO record linked in device->sqtt_timestamp.list and
 * rewind the current timestamp BO's offset to 0, all under
 * sqtt_timestamp_mtx. The current BO itself is kept. */
static void
radv_sqtt_reset_timestamp(struct radv_device *device)
{
   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   list_for_each_entry_safe (struct radv_sqtt_timestamp, retired, &device->sqtt_timestamp.list, list) {
      list_del(&retired->list);
      radv_bo_destroy(device, NULL, retired->bo);
      free(retired);
   }

   device->sqtt_timestamp.offset = 0;

   simple_mtx_unlock(&device->sqtt_timestamp_mtx);
}
/* Set up the state needed to record queue events: the command-pool and
 * timestamp mutexes, the list of retired timestamp BOs, and one internal
 * command pool for the graphics queue family (plus one for the compute queue
 * family when compute queues are enabled).
 *
 * Returns false if a command pool cannot be created. Pools created before the
 * failure stay in device->sqtt_command_pool[] so that
 * radv_sqtt_finish_queue_event() can destroy them.
 */
static bool
radv_sqtt_init_queue_event(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   VkCommandPool cmd_pool;
   VkResult result;

   /* Initialize the synchronization objects and the list head before anything
    * that can fail: radv_sqtt_finish_queue_event() destroys both mutexes
    * unconditionally, so they must be valid even when pool creation fails. */
   simple_mtx_init(&device->sqtt_command_pool_mtx, mtx_plain);
   simple_mtx_init(&device->sqtt_timestamp_mtx, mtx_plain);
   list_inithead(&device->sqtt_timestamp.list);

   const VkCommandPoolCreateInfo create_gfx_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
      .queueFamilyIndex = RADV_QUEUE_GENERAL, /* Graphics queue is always the first queue. */
   };

   result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_gfx_info, NULL, &cmd_pool);
   if (result != VK_SUCCESS)
      return false;

   device->sqtt_command_pool[0] = vk_command_pool_from_handle(cmd_pool);

   if (radv_compute_queue_enabled(pdev)) {
      const VkCommandPoolCreateInfo create_comp_info = {
         .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
         .queueFamilyIndex = RADV_QUEUE_COMPUTE,
      };

      result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_comp_info, NULL, &cmd_pool);
      if (result != VK_SUCCESS)
         return false;

      device->sqtt_command_pool[1] = vk_command_pool_from_handle(cmd_pool);
   }

   return true;
}
static void
radv_sqtt_finish_queue_event(struct radv_device *device)
{
if (device->sqtt_timestamp.bo)
radv_bo_destroy(device, NULL, device->sqtt_timestamp.bo);
simple_mtx_destroy(&device->sqtt_timestamp_mtx);
for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++)
vk_common_DestroyCommandPool(radv_device_to_handle(device),
vk_command_pool_to_handle(device->sqtt_command_pool[i]), NULL);
simple_mtx_destroy(&device->sqtt_command_pool_mtx);
}
/* Allocate, pin and CPU-map the thread trace BO.
 *
 * The BO holds one ac_sqtt_data_info per shader engine (rounded up to the
 * SQTT buffer alignment) followed by buffer_size bytes per shader engine.
 * device->sqtt.bo is assigned even when creation fails, so it then holds the
 * NULL the local was initialized with (assuming radv_bo_create leaves it
 * untouched on failure -- not visible here).
 *
 * Returns false if creation, make-resident or mapping fails.
 */
static bool
radv_sqtt_init_bo(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radeon_winsys *ws = device->ws;
   const unsigned max_se = pdev->info.max_se;
   const uint64_t alignment = 1ull << SQTT_BUFFER_ALIGN_SHIFT;
   struct radeon_winsys_bo *bo = NULL;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   device->sqtt.buffer_size = align64(device->sqtt.buffer_size, alignment);

   /* Compute total size of the thread trace BO for all SEs. */
   const uint64_t info_size = align64(sizeof(struct ac_sqtt_data_info) * max_se, alignment);
   const uint64_t total_size = info_size + device->sqtt.buffer_size * (uint64_t)max_se;

   const VkResult create_result =
      radv_bo_create(device, NULL, total_size, 4096, RADEON_DOMAIN_VRAM,
                     RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
                     RADV_BO_PRIORITY_SCRATCH, 0, true, &bo);
   device->sqtt.bo = bo;
   if (create_result != VK_SUCCESS)
      return false;

   if (ws->buffer_make_resident(ws, device->sqtt.bo, true) != VK_SUCCESS)
      return false;

   device->sqtt.ptr = radv_buffer_map(ws, device->sqtt.bo);
   if (!device->sqtt.ptr)
      return false;

   device->sqtt.buffer_va = radv_buffer_get_va(device->sqtt.bo);

   return true;
}
/* Release the thread trace BO, if there is one: drop its residency request
 * and destroy it. device->sqtt.bo is not cleared here. */
static void
radv_sqtt_finish_bo(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;
   struct radeon_winsys_bo *trace_bo = device->sqtt.bo;

   if (!trace_bo)
      return;

   ws->buffer_make_resident(ws, trace_bo, false);
   radv_bo_destroy(device, NULL, trace_bo);
}
/* Allocate an RGP queue info record for `queue` and append it to the device's
 * record list under the list lock. Queues of the general family are recorded
 * as UNIVERSAL queue/engine type, everything else as COMPUTE.
 *
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the record cannot be allocated.
 */
static VkResult
radv_register_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct rgp_queue_info *queue_info = &device->sqtt.rgp_queue_info;
   const bool is_general = (queue->vk.queue_family_index == RADV_QUEUE_GENERAL);
   struct rgp_queue_info_record *entry = malloc(sizeof(struct rgp_queue_info_record));

   if (!entry)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   /* The queue pointer itself serves as the identifier. */
   entry->queue_id = (uintptr_t)queue;
   entry->queue_context = (uintptr_t)queue->hw_ctx;

   if (is_general) {
      entry->hardware_info.queue_type = SQTT_QUEUE_TYPE_UNIVERSAL;
      entry->hardware_info.engine_type = SQTT_ENGINE_TYPE_UNIVERSAL;
   } else {
      entry->hardware_info.queue_type = SQTT_QUEUE_TYPE_COMPUTE;
      entry->hardware_info.engine_type = SQTT_ENGINE_TYPE_COMPUTE;
   }

   simple_mtx_lock(&queue_info->lock);
   list_addtail(&entry->list, &queue_info->record);
   queue_info->record_count++;
   simple_mtx_unlock(&queue_info->lock);

   return VK_SUCCESS;
}
/* Remove and free the first RGP queue info record whose queue_id matches
 * `queue`, under the list lock. Does nothing if no record matches or the
 * record count is already zero. */
static void
radv_unregister_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct rgp_queue_info *queue_info = &device->sqtt.rgp_queue_info;
   const uintptr_t wanted_id = (uintptr_t)queue;

   /* Destroy queue info record. */
   simple_mtx_lock(&queue_info->lock);

   if (queue_info->record_count > 0) {
      list_for_each_entry_safe (struct rgp_queue_info_record, entry, &queue_info->record, list) {
         if (entry->queue_id != wanted_id)
            continue;

         queue_info->record_count--;
         list_del(&entry->list);
         free(entry);
         break;
      }
   }

   simple_mtx_unlock(&queue_info->lock);
}
/* Register the graphics queue (only when the device has exactly one) and all
 * compute queues. The result of each radv_register_queue() call is ignored,
 * and the `sqtt` parameter is not used. */
static void
radv_register_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   const uint32_t num_compute = device->queue_count[RADV_QUEUE_COMPUTE];

   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_register_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t q = 0; q < num_compute; q++)
      radv_register_queue(device, &device->queues[RADV_QUEUE_COMPUTE][q]);
}
/* Unregister the graphics queue (only when the device has exactly one) and
 * all compute queues. The `sqtt` parameter is not used. */
static void
radv_unregister_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   const uint32_t num_compute = device->queue_count[RADV_QUEUE_COMPUTE];

   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t q = 0; q < num_compute; q++)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_COMPUTE][q]);
}
bool
radv_sqtt_init(struct radv_device *device)
{
struct ac_sqtt *sqtt = &device->sqtt;
/* Default buffer size set to 32MB per SE. */
device->sqtt.buffer_size = (uint32_t)debug_get_num_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);
device->sqtt.instruction_timing_enabled = radv_is_instruction_timing_enabled();
if (!radv_sqtt_init_bo(device))
return false;
if (!radv_sqtt_init_queue_event(device))
return false;
if (!radv_device_acquire_performance_counters(device))
return false;
ac_sqtt_init(sqtt);
radv_register_queues(device, sqtt);
return true;
}
void
radv_sqtt_finish(struct radv_device *device)
{
struct ac_sqtt *sqtt = &device->sqtt;
struct radeon_winsys *ws = device->ws;
radv_sqtt_finish_bo(device);
radv_sqtt_finish_queue_event(device);
for (unsigned i = 0; i < 2; i++) {
if (device->sqtt.start_cs[i])
ws->cs_destroy(device->sqtt.start_cs[i]);
if (device->sqtt.stop_cs[i])
ws->cs_destroy(device->sqtt.stop_cs[i]);
}
radv_unregister_queues(device, sqtt);
ac_sqtt_finish(sqtt);
}
/* Replace the thread trace BO with one whose per-SE buffer is twice as large,
 * logging the new size to stderr. Returns the result of re-creating the BO. */
static bool
radv_sqtt_resize_bo(struct radv_device *device)
{
   /* Destroy the previous thread trace BO. */
   radv_sqtt_finish_bo(device);

   /* Double the size of the thread trace buffer per SE. */
   device->sqtt.buffer_size = device->sqtt.buffer_size * 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %d KB\n",
           device->sqtt.buffer_size / 1024);

   /* Re-create the thread trace BO. */
   const bool recreated = radv_sqtt_init_bo(device);

   return recreated;
}
/* Build and submit the command stream that starts an SQTT capture on `queue`.
 *
 * Any previously stored start CS for this queue family is destroyed first.
 * The new CS contains, in order: a per-family preamble, a wait-for-idle,
 * clock-gating inhibit, SQG event enable, a perf counter reset, optional SPM
 * setup, the SQTT start packets and an optional SPM start.
 *
 * Returns false if the CS cannot be created or finalized; otherwise the CS is
 * stored in device->sqtt.start_cs[family] and the result of
 * radv_queue_internal_submit() is returned.
 */
static bool
radv_begin_sqtt(struct radv_queue *queue)
{
struct radv_device *device = radv_queue_device(queue);
const struct radv_physical_device *pdev = radv_device_physical(device);
enum radv_queue_family family = queue->state.qf;
struct radeon_winsys *ws = device->ws;
struct radeon_cmdbuf *cs;
VkResult result;
/* Destroy the previous start CS and create a new one. */
if (device->sqtt.start_cs[family]) {
ws->cs_destroy(device->sqtt.start_cs[family]);
device->sqtt.start_cs[family] = NULL;
}
cs = ws->cs_create(ws, radv_queue_ring(queue), false);
if (!cs)
return false;
radeon_check_space(ws, cs, 512);
/* Per-family preamble: CONTEXT_CONTROL on the general queue, a NOP on the
 * compute queue. Other families are not expected here. */
radeon_begin(cs);
switch (family) {
case RADV_QUEUE_GENERAL:
radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
break;
case RADV_QUEUE_COMPUTE:
radeon_emit(PKT3(PKT3_NOP, 0, 0));
radeon_emit(0);
break;
default:
unreachable("Incorrect queue family");
break;
}
radeon_end();
/* Make sure to wait-for-idle before starting SQTT. */
radv_emit_wait_for_idle(device, cs, family);
/* Disable clock gating before starting SQTT. */
radv_emit_inhibit_clockgating(device, cs, true);
/* Enable SQG events that collects thread trace data. */
radv_emit_spi_config_cntl(device, cs, true);
radv_perfcounter_emit_reset(cs, true);
/* SPM is only programmed when an SPM BO exists. */
if (device->spm.bo) {
/* Enable all shader stages by default. */
radv_perfcounter_emit_shaders(device, cs, ac_sqtt_get_shader_mask(&pdev->info));
radv_emit_spm_setup(device, cs, family);
}
/* Start SQTT. */
radv_emit_sqtt_start(device, cs, family);
if (device->spm.bo) {
radeon_check_space(ws, cs, 8);
radv_perfcounter_emit_spm_start(device, cs, family);
}
result = ws->cs_finalize(cs);
if (result != VK_SUCCESS) {
ws->cs_destroy(cs);
return false;
}
/* The CS is stored before submission, so it stays owned by the device even
 * if the submit below fails. */
device->sqtt.start_cs[family] = cs;
return radv_queue_internal_submit(queue, cs);
}
/* Build and submit the command stream that stops an SQTT capture on `queue`.
 *
 * Any previously stored stop CS for this queue family is destroyed first.
 * The new CS contains, in order: a per-family preamble, a wait-for-idle, an
 * optional SPM stop, the SQTT stop packets, a perf counter reset, SQG event
 * disable and clock-gating re-enable.
 *
 * Returns false if the CS cannot be created or finalized; otherwise the CS is
 * stored in device->sqtt.stop_cs[family] and the result of
 * radv_queue_internal_submit() is returned.
 */
static bool
radv_end_sqtt(struct radv_queue *queue)
{
struct radv_device *device = radv_queue_device(queue);
enum radv_queue_family family = queue->state.qf;
struct radeon_winsys *ws = device->ws;
struct radeon_cmdbuf *cs;
VkResult result;
/* Destroy the previous stop CS and create a new one. */
if (device->sqtt.stop_cs[family]) {
ws->cs_destroy(device->sqtt.stop_cs[family]);
device->sqtt.stop_cs[family] = NULL;
}
cs = ws->cs_create(ws, radv_queue_ring(queue), false);
if (!cs)
return false;
radeon_check_space(ws, cs, 512);
/* Per-family preamble: CONTEXT_CONTROL on the general queue, a NOP on the
 * compute queue. Other families are not expected here. */
radeon_begin(cs);
switch (family) {
case RADV_QUEUE_GENERAL:
radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
break;
case RADV_QUEUE_COMPUTE:
radeon_emit(PKT3(PKT3_NOP, 0, 0));
radeon_emit(0);
break;
default:
unreachable("Incorrect queue family");
break;
}
radeon_end();
/* Make sure to wait-for-idle before stopping SQTT. */
radv_emit_wait_for_idle(device, cs, family);
/* SPM is only stopped when an SPM BO exists. */
if (device->spm.bo) {
radeon_check_space(ws, cs, 8);
radv_perfcounter_emit_spm_stop(device, cs, family);
}
/* Stop SQTT. */
radv_emit_sqtt_stop(device, cs, family);
radv_perfcounter_emit_reset(cs, true);
/* Restore previous state by disabling SQG events. */
radv_emit_spi_config_cntl(device, cs, false);
/* Restore previous state by re-enabling clock gating. */
radv_emit_inhibit_clockgating(device, cs, false);
result = ws->cs_finalize(cs);
if (result != VK_SUCCESS) {
ws->cs_destroy(cs);
return false;
}
/* The CS is stored before submission, so it stays owned by the device even
 * if the submit below fails. */
device->sqtt.stop_cs[family] = cs;
return radv_queue_internal_submit(queue, cs);
}
/* Start an SQTT (RGP) capture on `queue`.
 *
 * The request is dropped, with a message on stderr, when
 * ac_check_profile_state() reports a problem or when the start command stream
 * cannot be built or submitted. A failure to sample the CPU/GPU clocks is
 * only logged. device->sqtt_enabled is set to true only once the start
 * command stream has been submitted successfully, so a failed start does not
 * leave the device marked as capturing.
 */
void
radv_sqtt_start_capturing(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (ac_check_profile_state(&pdev->info)) {
      fprintf(stderr, "radv: Canceling RGP trace request as a hang condition has been "
                      "detected. Force the GPU into a profiling mode with e.g. "
                      "\"echo profile_peak > "
                      "/sys/class/drm/card0/device/power_dpm_force_performance_level\"\n");
      return;
   }

   /* Sample CPU/GPU clocks before starting the trace. */
   if (!radv_sqtt_sample_clocks(device)) {
      fprintf(stderr, "radv: Failed to sample clocks\n");
   }

   /* Do not mark the capture as running when it was never started. */
   if (!radv_begin_sqtt(queue)) {
      fprintf(stderr, "radv: Failed to start the SQTT capture\n");
      return;
   }

   assert(!device->sqtt_enabled);
   device->sqtt_enabled = true;
}
/* Stop the running SQTT capture on `queue`, wait for the queue to go idle and
 * dump the RGP capture if the SQTT trace (and the SPM trace, when an SPM BO
 * exists) could be retrieved. Per-capture resources are always reset.
 *
 * Returns true if a capture was dumped, false otherwise.
 */
bool
radv_sqtt_stop_capturing(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct ac_sqtt_trace sqtt_trace = {0};
   struct ac_spm_trace spm_trace;

   radv_end_sqtt(queue);
   device->sqtt_enabled = false;

   /* TODO: Do something better than this whole sync. */
   device->vk.dispatch_table.QueueWaitIdle(radv_queue_to_handle(queue));

   /* The SPM trace is only fetched after the SQTT trace was obtained, and
    * only when SPM is in use. */
   bool captured = radv_get_sqtt_trace(queue, &sqtt_trace);
   if (captured && device->spm.bo)
      captured = radv_get_spm_trace(queue, &spm_trace);

   /* When not captured: failed to capture because the buffer was too small. */
   if (captured)
      ac_dump_rgp_capture(&pdev->info, &sqtt_trace, device->spm.bo ? &spm_trace : NULL);

   /* Clear resources used for this capture. */
   radv_reset_sqtt_trace(device);

   return captured;
}
/* Fetch the SQTT trace for the device into sqtt_trace. When
 * ac_sqtt_get_trace() fails, the trace buffer is resized (a failed resize is
 * logged to stderr) and false is returned. */
bool
radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_trace)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radeon_info *gpu_info = &pdev->info;

   if (ac_sqtt_get_trace(&device->sqtt, gpu_info, sqtt_trace))
      return true;

   if (!radv_sqtt_resize_bo(device))
      fprintf(stderr, "radv: Failed to resize the SQTT buffer.\n");

   return false;
}
/* Drop everything accumulated for the last capture: clock calibration
 * records, queue event records, retired timestamp BOs, and cached command
 * buffers in the internal command pools (via TrimCommandPool). */
void
radv_reset_sqtt_trace(struct radv_device *device)
{
   struct rgp_clock_calibration *calibration = &device->sqtt.rgp_clock_calibration;
   struct rgp_queue_event *events = &device->sqtt.rgp_queue_event;

   /* Clear clock calibration records. */
   simple_mtx_lock(&calibration->lock);
   list_for_each_entry_safe (struct rgp_clock_calibration_record, entry, &calibration->record, list) {
      calibration->record_count--;
      list_del(&entry->list);
      free(entry);
   }
   simple_mtx_unlock(&calibration->lock);

   /* Clear queue event records. */
   simple_mtx_lock(&events->lock);
   list_for_each_entry_safe (struct rgp_queue_event_record, entry, &events->record, list) {
      list_del(&entry->list);
      free(entry);
   }
   events->record_count = 0;
   simple_mtx_unlock(&events->lock);

   /* Clear timestamps. */
   radv_sqtt_reset_timestamp(device);

   /* Clear timed cmdbufs. */
   simple_mtx_lock(&device->sqtt_command_pool_mtx);
   for (unsigned pool_idx = 0; pool_idx < ARRAY_SIZE(device->sqtt_command_pool); pool_idx++) {
      if (!device->sqtt_command_pool[pool_idx])
         continue;

      vk_common_TrimCommandPool(radv_device_to_handle(device),
                                vk_command_pool_to_handle(device->sqtt_command_pool[pool_idx]), 0);
   }
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
}
/* Query one CLOCK_MONOTONIC and one device timestamp in a single
 * GetCalibratedTimestampsKHR call. The out parameters are written only on
 * success; the reported maximum deviation is discarded. */
static VkResult
radv_get_calibrated_timestamps(struct radv_device *device, uint64_t *cpu_timestamp, uint64_t *gpu_timestamp)
{
   const VkCalibratedTimestampInfoKHR domains[2] = {
      [0] =
         {
            .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
            .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR,
         },
      [1] =
         {
            .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
            .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR,
         },
   };
   uint64_t values[2];
   uint64_t max_deviation;

   const VkResult result = device->vk.dispatch_table.GetCalibratedTimestampsKHR(radv_device_to_handle(device), 2,
                                                                                domains, values, &max_deviation);

   if (result == VK_SUCCESS) {
      /* Same order as the requested time domains. */
      *cpu_timestamp = values[0];
      *gpu_timestamp = values[1];
   }

   return result;
}
/* Sample a CPU/GPU timestamp pair and add it to the SQTT state as a clock
 * calibration. Returns false if the timestamps cannot be queried, otherwise
 * the result of ac_sqtt_add_clock_calibration(). */
bool
radv_sqtt_sample_clocks(struct radv_device *device)
{
   uint64_t cpu_timestamp = 0;
   uint64_t gpu_timestamp = 0;

   if (radv_get_calibrated_timestamps(device, &cpu_timestamp, &gpu_timestamp) != VK_SUCCESS)
      return false;

   return ac_sqtt_add_clock_calibration(&device->sqtt, cpu_timestamp, gpu_timestamp);
}
/* Allocate and record a one-time-submit primary command buffer that writes a
 * timestamp at `timestamp_stage` to timestamp_bo + timestamp_offset.
 *
 * The command buffer comes from the internal SQTT command pool of the queue's
 * family; the pool is protected by sqtt_command_pool_mtx, which is held for
 * the whole function. Only the general and compute queue families are
 * supported.
 *
 * On success the recorded command buffer is returned through pcmdbuf. On
 * failure a Vulkan error is returned, *pcmdbuf is left untouched, and a
 * command buffer that was already allocated is returned to the pool instead
 * of being leaked.
 */
VkResult
radv_sqtt_get_timed_cmdbuf(struct radv_queue *queue, struct radeon_winsys_bo *timestamp_bo, uint32_t timestamp_offset,
                           VkPipelineStageFlags2 timestamp_stage, VkCommandBuffer *pcmdbuf)
{
   struct radv_device *device = radv_queue_device(queue);
   enum radv_queue_family queue_family = queue->state.qf;
   VkCommandBuffer cmdbuf;
   uint64_t timestamp_va;
   VkResult result;

   assert(queue_family == RADV_QUEUE_GENERAL || queue_family == RADV_QUEUE_COMPUTE);

   simple_mtx_lock(&device->sqtt_command_pool_mtx);

   const VkCommandBufferAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
      .commandPool = vk_command_pool_to_handle(device->sqtt_command_pool[queue_family]),
      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
      .commandBufferCount = 1,
   };
   const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
   };

   result = vk_common_AllocateCommandBuffers(radv_device_to_handle(device), &alloc_info, &cmdbuf);
   if (result != VK_SUCCESS)
      goto out_unlock;

   result = radv_BeginCommandBuffer(cmdbuf, &begin_info);
   if (result != VK_SUCCESS)
      goto fail_free;

   radeon_check_space(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, 28);

   timestamp_va = radv_buffer_get_va(timestamp_bo) + timestamp_offset;

   radv_cs_add_buffer(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, timestamp_bo);

   radv_write_timestamp(radv_cmd_buffer_from_handle(cmdbuf), timestamp_va, timestamp_stage);

   result = radv_EndCommandBuffer(cmdbuf);
   if (result != VK_SUCCESS)
      goto fail_free;

   *pcmdbuf = cmdbuf;
   goto out_unlock;

fail_free:
   /* The command buffer was allocated but could not be recorded: give it back
    * to the pool while the pool lock is still held. */
   vk_common_FreeCommandBuffers(radv_device_to_handle(device), alloc_info.commandPool, 1, &cmdbuf);

out_unlock:
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
   return result;
}