From 3bf2f95a917d9e71b17fa76bc9fd479f0c3e5ccb Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Thu, 13 Mar 2025 11:35:36 +0100 Subject: [PATCH] ac/sqtt: fix registers programming for GFX12 Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/common/ac_sqtt.c | 33 +++++++++++++------------- src/amd/common/ac_sqtt.h | 4 ++-- src/amd/vulkan/radv_sqtt.c | 10 ++++---- src/gallium/drivers/radeonsi/si_sqtt.c | 10 ++++---- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/src/amd/common/ac_sqtt.c b/src/amd/common/ac_sqtt.c index 1054c203c54..239cf7c3eb3 100644 --- a/src/amd/common/ac_sqtt.c +++ b/src/amd/common/ac_sqtt.c @@ -15,13 +15,6 @@ #include "sid.h" -uint32_t -ac_sqtt_get_buffer_align_shift(const struct radeon_info *info) -{ - /* SQTT buffer VA is 36-bits on GFX8-11.5. */ - return info->gfx_level >= GFX12 ? 0 : 12; -} - uint64_t ac_sqtt_get_info_offset(unsigned se) { @@ -31,11 +24,10 @@ ac_sqtt_get_info_offset(unsigned se) uint64_t ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se) { - const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(rad_info); unsigned max_se = rad_info->max_se; uint64_t data_offset; - data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << align_shift); + data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << SQTT_BUFFER_ALIGN_SHIFT); data_offset += data->buffer_size * se; return data_offset; @@ -239,7 +231,9 @@ ac_sqtt_get_active_cu(const struct radeon_info *info, unsigned se) { uint32_t cu_index; - if (info->gfx_level >= GFX11) { + if (info->gfx_level >= GFX12) { + cu_index = 0; + }else if (info->gfx_level >= GFX11) { /* GFX11 seems to operate on the last active CU. */ cu_index = util_last_bit(info->cu_mask[se][0]) - 1; } else { @@ -303,7 +297,7 @@ ac_sqtt_get_ctrl(const struct radeon_info *info, bool enable) if (info->gfx_level >= GFX11) { if (info->gfx_level >= GFX12) { - ctrl = S_0367B0_UTIL_TIMER_GFX12(1); + ctrl = S_0367B0_UTIL_TIMER_GFX12(1) | S_0367B0_LOWATER_OFFSET(4); } else { ctrl = S_0367B0_UTIL_TIMER_GFX11(1) | S_0367B0_RT_FREQ(2); /* 4096 clk */ } @@ -346,14 +340,13 @@ void ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, const struct ac_sqtt *sqtt, bool is_compute_queue) { - const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(info); - const uint32_t shifted_size = sqtt->buffer_size >> align_shift; + const uint32_t shifted_size = sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT; const unsigned shader_mask = ac_sqtt_get_shader_mask(info); const unsigned max_se = info->max_se; for (unsigned se = 0; se < max_se; se++) { uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se); - uint64_t shifted_va = data_va >> align_shift; + uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT; int active_cu = ac_sqtt_get_active_cu(info, se); if (ac_sqtt_se_is_disabled(info, se)) @@ -371,6 +364,8 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, ac_pm4_set_reg(pm4, R_03679C_SQ_THREAD_TRACE_BUF0_BASE_LO, shifted_va); ac_pm4_set_reg(pm4, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE_HI, S_0367A0_BASE_HI(shifted_va >> 32)); + + ac_pm4_set_reg(pm4, R_0367BC_SQ_THREAD_TRACE_WPTR, 0); } else { ac_pm4_set_reg(pm4, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE, S_0367A4_SIZE(shifted_size) | S_0367A4_BASE_HI(shifted_va >> 32)); @@ -387,7 +382,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG); /* Performance counters with SQTT are considered deprecated. */ - uint32_t token_exclude = V_0367B8_TOKEN_EXCLUDE_PERF; + uint32_t token_exclude = 0; if (!sqtt->instruction_timing_enabled) { /* Reduce SQTT traffic when instruction timing isn't enabled. */ @@ -397,8 +392,14 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, } if (info->gfx_level >= GFX12) { - sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX12(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX12(1); + sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX12(token_exclude) | + S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX12(1) | + S_0367B8_EXCLUDE_BARRIER_WAIT(1) | + S_0367B8_REG_EXCLUDE(2); /* CP_ME_MC_RADDR */ } else { + /* Performance counters with SQTT are considered deprecated. */ + token_exclude |= V_0367B8_TOKEN_EXCLUDE_PERF; + sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX11(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX11(1); } diff --git a/src/amd/common/ac_sqtt.h b/src/amd/common/ac_sqtt.h index 869913bab16..d3596aa2743 100644 --- a/src/amd/common/ac_sqtt.h +++ b/src/amd/common/ac_sqtt.h @@ -16,6 +16,8 @@ #include "ac_rgp.h" #include "amd_family.h" +#define SQTT_BUFFER_ALIGN_SHIFT 12 + struct radeon_cmdbuf; struct radeon_info; @@ -543,8 +545,6 @@ bool ac_check_profile_state(const struct radeon_info *info); union rgp_sqtt_marker_cb_id ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *sqtt, enum amd_ip_type ip_type); -uint32_t ac_sqtt_get_buffer_align_shift(const struct radeon_info *info); - bool ac_sqtt_get_trace(struct ac_sqtt *sqtt, const struct radeon_info *info, struct ac_sqtt_trace *sqtt_trace); diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c index 05bf62cd357..9a77daa2e6f 100644 --- a/src/amd/vulkan/radv_sqtt.c +++ b/src/amd/vulkan/radv_sqtt.c @@ -151,7 +151,10 @@ radv_emit_spi_config_cntl(const struct radv_device *device, struct radeon_cmdbuf { const struct radv_physical_device *pdev = radv_device_physical(device); - if (pdev->info.gfx_level >= GFX9) { + if (pdev->info.gfx_level >= GFX12) { + radeon_set_uconfig_reg(cs, R_031120_SPI_SQG_EVENT_CTL, + S_031120_ENABLE_SQG_TOP_EVENTS(enable) | S_031120_ENABLE_SQG_BOP_EVENTS(enable)); + } else if (pdev->info.gfx_level >= GFX9) { uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) | S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable); @@ -316,7 +319,6 @@ static bool radv_sqtt_init_bo(struct radv_device *device) { const struct radv_physical_device *pdev = radv_device_physical(device); - const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(&pdev->info); unsigned max_se = pdev->info.max_se; struct radeon_winsys *ws = device->ws; VkResult result; @@ -325,10 +327,10 @@ radv_sqtt_init_bo(struct radv_device *device) /* The buffer size and address need to be aligned in HW regs. Align the * size as early as possible so that we do all the allocation & addressing * correctly. */ - device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1ull << align_shift); + device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1ull << SQTT_BUFFER_ALIGN_SHIFT); /* Compute total size of the thread trace BO for all SEs. */ - size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << align_shift); + size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << SQTT_BUFFER_ALIGN_SHIFT); size += device->sqtt.buffer_size * (uint64_t)max_se; struct radeon_winsys_bo *bo = NULL; diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c index 25e132c530b..3099391043a 100644 --- a/src/gallium/drivers/radeonsi/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/si_sqtt.c @@ -21,7 +21,6 @@ si_emit_spi_config_cntl(struct si_context *sctx, static bool si_sqtt_init_bo(struct si_context *sctx) { - const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(&sctx->screen->info); unsigned max_se = sctx->screen->info.max_se; struct radeon_winsys *ws = sctx->ws; uint64_t size; @@ -30,11 +29,11 @@ static bool si_sqtt_init_bo(struct si_context *sctx) * size as early as possible so that we do all the allocation & addressing * correctly. */ sctx->sqtt->buffer_size = - align64(sctx->sqtt->buffer_size, 1ull << align_shift); + align64(sctx->sqtt->buffer_size, 1ull << SQTT_BUFFER_ALIGN_SHIFT); /* Compute total size of the thread trace BO for all SEs. */ size = align64(sizeof(struct ac_sqtt_data_info) * max_se, - 1ull << align_shift); + 1ull << SQTT_BUFFER_ALIGN_SHIFT); size += sctx->sqtt->buffer_size * (uint64_t)max_se; sctx->sqtt->bo = @@ -531,7 +530,10 @@ si_emit_spi_config_cntl(struct si_context *sctx, { radeon_begin(cs); - if (sctx->gfx_level >= GFX9) { + if (sctx->gfx_level >= GFX12) { + radeon_set_uconfig_reg(R_031120_SPI_SQG_EVENT_CTL, + S_031120_ENABLE_SQG_TOP_EVENTS(enable) | S_031120_ENABLE_SQG_BOP_EVENTS(enable)); + } else if (sctx->gfx_level >= GFX9) { uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) | S_031100_ENABLE_SQG_TOP_EVENTS(enable) |