ac/sqtt: fix registers programming for GFX12

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34049>
This commit is contained in:
Samuel Pitoiset 2025-03-13 11:35:36 +01:00 committed by Marge Bot
parent 13836575e3
commit 3bf2f95a91
4 changed files with 31 additions and 26 deletions

View file

@ -15,13 +15,6 @@
#include "sid.h"
uint32_t
ac_sqtt_get_buffer_align_shift(const struct radeon_info *info)
{
/* SQTT buffer VA is 36-bits on GFX8-11.5. */
return info->gfx_level >= GFX12 ? 0 : 12;
}
uint64_t
ac_sqtt_get_info_offset(unsigned se)
{
@ -31,11 +24,10 @@ ac_sqtt_get_info_offset(unsigned se)
uint64_t
ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se)
{
const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(rad_info);
unsigned max_se = rad_info->max_se;
uint64_t data_offset;
data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << align_shift);
data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << SQTT_BUFFER_ALIGN_SHIFT);
data_offset += data->buffer_size * se;
return data_offset;
@ -239,7 +231,9 @@ ac_sqtt_get_active_cu(const struct radeon_info *info, unsigned se)
{
uint32_t cu_index;
if (info->gfx_level >= GFX11) {
if (info->gfx_level >= GFX12) {
cu_index = 0;
}else if (info->gfx_level >= GFX11) {
/* GFX11 seems to operate on the last active CU. */
cu_index = util_last_bit(info->cu_mask[se][0]) - 1;
} else {
@ -303,7 +297,7 @@ ac_sqtt_get_ctrl(const struct radeon_info *info, bool enable)
if (info->gfx_level >= GFX11) {
if (info->gfx_level >= GFX12) {
ctrl = S_0367B0_UTIL_TIMER_GFX12(1);
ctrl = S_0367B0_UTIL_TIMER_GFX12(1) | S_0367B0_LOWATER_OFFSET(4);
} else {
ctrl = S_0367B0_UTIL_TIMER_GFX11(1) | S_0367B0_RT_FREQ(2); /* 4096 clk */
}
@ -346,14 +340,13 @@ void
ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
const struct ac_sqtt *sqtt, bool is_compute_queue)
{
const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(info);
const uint32_t shifted_size = sqtt->buffer_size >> align_shift;
const uint32_t shifted_size = sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
const unsigned shader_mask = ac_sqtt_get_shader_mask(info);
const unsigned max_se = info->max_se;
for (unsigned se = 0; se < max_se; se++) {
uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se);
uint64_t shifted_va = data_va >> align_shift;
uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
int active_cu = ac_sqtt_get_active_cu(info, se);
if (ac_sqtt_se_is_disabled(info, se))
@ -371,6 +364,8 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
ac_pm4_set_reg(pm4, R_03679C_SQ_THREAD_TRACE_BUF0_BASE_LO, shifted_va);
ac_pm4_set_reg(pm4, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE_HI, S_0367A0_BASE_HI(shifted_va >> 32));
ac_pm4_set_reg(pm4, R_0367BC_SQ_THREAD_TRACE_WPTR, 0);
} else {
ac_pm4_set_reg(pm4, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE,
S_0367A4_SIZE(shifted_size) | S_0367A4_BASE_HI(shifted_va >> 32));
@ -387,7 +382,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG);
/* Performance counters with SQTT are considered deprecated. */
uint32_t token_exclude = V_0367B8_TOKEN_EXCLUDE_PERF;
uint32_t token_exclude = 0;
if (!sqtt->instruction_timing_enabled) {
/* Reduce SQTT traffic when instruction timing isn't enabled. */
@ -397,8 +392,14 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
}
if (info->gfx_level >= GFX12) {
sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX12(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX12(1);
sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX12(token_exclude) |
S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX12(1) |
S_0367B8_EXCLUDE_BARRIER_WAIT(1) |
S_0367B8_REG_EXCLUDE(2); /* CP_ME_MC_RADDR */
} else {
/* Performance counters with SQTT are considered deprecated. */
token_exclude |= V_0367B8_TOKEN_EXCLUDE_PERF;
sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX11(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX11(1);
}

View file

@ -16,6 +16,8 @@
#include "ac_rgp.h"
#include "amd_family.h"
#define SQTT_BUFFER_ALIGN_SHIFT 12
struct radeon_cmdbuf;
struct radeon_info;
@ -543,8 +545,6 @@ bool ac_check_profile_state(const struct radeon_info *info);
union rgp_sqtt_marker_cb_id ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *sqtt,
enum amd_ip_type ip_type);
uint32_t ac_sqtt_get_buffer_align_shift(const struct radeon_info *info);
bool ac_sqtt_get_trace(struct ac_sqtt *sqtt, const struct radeon_info *info,
struct ac_sqtt_trace *sqtt_trace);

View file

@ -151,7 +151,10 @@ radv_emit_spi_config_cntl(const struct radv_device *device, struct radeon_cmdbuf
{
const struct radv_physical_device *pdev = radv_device_physical(device);
if (pdev->info.gfx_level >= GFX9) {
if (pdev->info.gfx_level >= GFX12) {
radeon_set_uconfig_reg(cs, R_031120_SPI_SQG_EVENT_CTL,
S_031120_ENABLE_SQG_TOP_EVENTS(enable) | S_031120_ENABLE_SQG_BOP_EVENTS(enable));
} else if (pdev->info.gfx_level >= GFX9) {
uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) |
S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable);
@ -316,7 +319,6 @@ static bool
radv_sqtt_init_bo(struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(&pdev->info);
unsigned max_se = pdev->info.max_se;
struct radeon_winsys *ws = device->ws;
VkResult result;
@ -325,10 +327,10 @@ radv_sqtt_init_bo(struct radv_device *device)
/* The buffer size and address need to be aligned in HW regs. Align the
* size as early as possible so that we do all the allocation & addressing
* correctly. */
device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1ull << align_shift);
device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1ull << SQTT_BUFFER_ALIGN_SHIFT);
/* Compute total size of the thread trace BO for all SEs. */
size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << align_shift);
size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << SQTT_BUFFER_ALIGN_SHIFT);
size += device->sqtt.buffer_size * (uint64_t)max_se;
struct radeon_winsys_bo *bo = NULL;

View file

@ -21,7 +21,6 @@ si_emit_spi_config_cntl(struct si_context *sctx,
static bool si_sqtt_init_bo(struct si_context *sctx)
{
const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(&sctx->screen->info);
unsigned max_se = sctx->screen->info.max_se;
struct radeon_winsys *ws = sctx->ws;
uint64_t size;
@ -30,11 +29,11 @@ static bool si_sqtt_init_bo(struct si_context *sctx)
* size as early as possible so that we do all the allocation & addressing
* correctly. */
sctx->sqtt->buffer_size =
align64(sctx->sqtt->buffer_size, 1ull << align_shift);
align64(sctx->sqtt->buffer_size, 1ull << SQTT_BUFFER_ALIGN_SHIFT);
/* Compute total size of the thread trace BO for all SEs. */
size = align64(sizeof(struct ac_sqtt_data_info) * max_se,
1ull << align_shift);
1ull << SQTT_BUFFER_ALIGN_SHIFT);
size += sctx->sqtt->buffer_size * (uint64_t)max_se;
sctx->sqtt->bo =
@ -531,7 +530,10 @@ si_emit_spi_config_cntl(struct si_context *sctx,
{
radeon_begin(cs);
if (sctx->gfx_level >= GFX9) {
if (sctx->gfx_level >= GFX12) {
radeon_set_uconfig_reg(R_031120_SPI_SQG_EVENT_CTL,
S_031120_ENABLE_SQG_TOP_EVENTS(enable) | S_031120_ENABLE_SQG_BOP_EVENTS(enable));
} else if (sctx->gfx_level >= GFX9) {
uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
S_031100_EXP_PRIORITY_ORDER(3) |
S_031100_ENABLE_SQG_TOP_EVENTS(enable) |