winsys/amdgpu: enable userq reg shadowing for gfx11.5

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36700>
Yogesh Mohan Marimuthu 2025-10-21 19:42:06 +05:30 committed by Marge Bot
parent 700850f29d
commit 3ba6c9d0ac
8 changed files with 136 additions and 13 deletions


@@ -2929,10 +2929,8 @@ void ac_print_nonshadowed_regs(enum amd_gfx_level gfx_level, enum radeon_family
}
}
static void ac_build_load_reg(const struct radeon_info *info,
struct ac_pm4_state *pm4,
enum ac_reg_range_type type,
uint64_t gpu_address)
void ac_build_load_reg(const struct radeon_info *info, struct ac_pm4_state *pm4,
enum ac_reg_range_type type, uint64_t gpu_address)
{
unsigned packet, num_ranges, offset;
const struct ac_reg_range *ranges;


@@ -32,6 +32,8 @@ void ac_get_reg_ranges(enum amd_gfx_level gfx_level, enum radeon_family family,
const struct ac_reg_range **ranges);
struct ac_pm4_state *ac_emulate_clear_state(const struct radeon_info *info);
void ac_print_nonshadowed_regs(enum amd_gfx_level gfx_level, enum radeon_family family);
void ac_build_load_reg(const struct radeon_info *info, struct ac_pm4_state *pm4,
enum ac_reg_range_type type, uint64_t gpu_address);
struct ac_pm4_state *ac_create_shadowing_ib_preamble(const struct radeon_info *info,
uint64_t gpu_address,


@@ -15,6 +15,36 @@ bool si_init_cp_reg_shadowing(struct si_context *sctx)
return false;
if (sctx->uses_userq_reg_shadowing) {
/* On GFX11_5, the shadow_va passed to ac_drm_create_userqueue() is not used by the
* firmware. Instead, the register shadowing addresses have to be initialized using LOAD_*
* packets. The LOAD_* packets, and the CONTEXT_CONTROL packet that enables register shadowing,
* also have to be submitted with every job.
*/
if (sctx->gfx_level == GFX11_5) {
struct ac_pm4_state *shadowing_pm4 = ac_pm4_create_sized(&sctx->screen->info, false, 1024,
sctx->is_gfx_queue);
if (!shadowing_pm4) {
mesa_loge("failed to allocate memory for shadowing_pm4");
return false;
}
ac_pm4_cmd_add(shadowing_pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
ac_pm4_cmd_add(shadowing_pm4, CC0_UPDATE_LOAD_ENABLES(1) |
CC0_LOAD_PER_CONTEXT_STATE(1) | CC0_LOAD_CS_SH_REGS(1) |
CC0_LOAD_GFX_SH_REGS(1) | CC0_LOAD_GLOBAL_UCONFIG(1));
ac_pm4_cmd_add(shadowing_pm4, CC1_UPDATE_SHADOW_ENABLES(1) |
CC1_SHADOW_PER_CONTEXT_STATE(1) | CC1_SHADOW_CS_SH_REGS(1) |
CC1_SHADOW_GFX_SH_REGS(1) | CC1_SHADOW_GLOBAL_UCONFIG(1) |
CC1_SHADOW_GLOBAL_CONFIG(1));
for (unsigned i = 0; i < SI_NUM_REG_RANGES; i++)
ac_build_load_reg(&sctx->screen->info, shadowing_pm4, i,
sctx->ws->userq_f32_get_shadow_regs_va(&sctx->gfx_cs));
sctx->ws->userq_f32_init_reg_shadowing(&sctx->gfx_cs, shadowing_pm4);
ac_pm4_free_state(shadowing_pm4);
}
sctx->ws->userq_submit_cs_preamble_ib_once(&sctx->gfx_cs, &sctx->cs_preamble_state->base);
si_pm4_free_state(sctx, sctx->cs_preamble_state, ~0);
sctx->cs_preamble_state = NULL;

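To make the GFX11_5 flow above easier to follow, here is a minimal standalone sketch of the packet sequence it builds: CONTEXT_CONTROL first, then one LOAD_* packet per register range pointing into the shadow buffer. The opcode values, the CONTEXT_CONTROL payload masks, and the build_load_reg_packet() helper are illustrative placeholders, not the real definitions from Mesa's sid.h and ac_build_load_reg().

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins; the real opcodes and the CC0_ and CC1_ field encodings live in sid.h. */
#define TYPE3_HEADER(op, count) ((3u << 30) | ((uint32_t)(count) << 16) | ((uint32_t)(op) << 8))
#define OP_CONTEXT_CONTROL 0x28 /* assumed opcode */
#define OP_LOAD_SH_REG     0x5f /* assumed opcode */

enum { NUM_REG_RANGES = 4 };   /* stand-in for SI_NUM_REG_RANGES */

/* Hypothetical stand-in for ac_build_load_reg(): one LOAD_* packet that points the CP
 * at the shadow-buffer region for register range "type". Payload is count + 1 dwords. */
static unsigned build_load_reg_packet(uint32_t *out, unsigned type, uint64_t shadow_va)
{
   uint64_t range_va = shadow_va + type * 0x1000; /* per-range offset: made up for the sketch */
   out[0] = TYPE3_HEADER(OP_LOAD_SH_REG, 2);
   out[1] = (uint32_t)range_va;         /* base VA, low 32 bits */
   out[2] = (uint32_t)(range_va >> 32); /* base VA, high 32 bits */
   out[3] = 0;                          /* register offset and count: simplified away */
   return 4;
}

int main(void)
{
   uint32_t ib[64];
   unsigned ndw = 0;
   uint64_t shadow_va = 0x800000001000ull; /* would come from userq_f32_get_shadow_regs_va() */

   /* 1) CONTEXT_CONTROL: enable register load + shadow for the relevant state groups.
    *    The two payload dwords correspond to the CC0_ and CC1_ masks used above. */
   ib[ndw++] = TYPE3_HEADER(OP_CONTEXT_CONTROL, 1);
   ib[ndw++] = 0x80000000u; /* placeholder for the CC0_ load-enable mask */
   ib[ndw++] = 0x80000000u; /* placeholder for the CC1_ shadow-enable mask */

   /* 2) One LOAD_* packet per register range, all pointing into the shadow BO. */
   for (unsigned i = 0; i < NUM_REG_RANGES; i++)
      ndw += build_load_reg_packet(&ib[ndw], i, shadow_va);

   printf("gfx11.5 shadowing preamble: %u dwords\n", ndw);
   return 0;
}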

@@ -5119,13 +5119,18 @@ static bool gfx10_init_gfx_preamble_state(struct si_context *sctx)
}
if (sctx->uses_userq_reg_shadowing) {
ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
ac_pm4_cmd_add(&pm4->base, CC0_UPDATE_LOAD_ENABLES(1) | CC0_LOAD_PER_CONTEXT_STATE(1) |
CC0_LOAD_CS_SH_REGS(1) | CC0_LOAD_GFX_SH_REGS(1) |
CC0_LOAD_GLOBAL_UCONFIG(1));
ac_pm4_cmd_add(&pm4->base, CC1_UPDATE_SHADOW_ENABLES(1) | CC1_SHADOW_PER_CONTEXT_STATE(1) |
CC1_SHADOW_CS_SH_REGS(1) | CC1_SHADOW_GFX_SH_REGS(1) |
CC1_SHADOW_GLOBAL_UCONFIG(1) | CC1_SHADOW_GLOBAL_CONFIG(1));
/* On GFX11_5, the CONTEXT_CONTROL packet is added in si_init_cp_reg_shadowing()
* instead.
*/
if (sctx->gfx_level != GFX11_5) {
ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
ac_pm4_cmd_add(&pm4->base, CC0_UPDATE_LOAD_ENABLES(1) | CC0_LOAD_PER_CONTEXT_STATE(1) |
CC0_LOAD_CS_SH_REGS(1) | CC0_LOAD_GFX_SH_REGS(1) |
CC0_LOAD_GLOBAL_UCONFIG(1));
ac_pm4_cmd_add(&pm4->base, CC1_UPDATE_SHADOW_ENABLES(1) | CC1_SHADOW_PER_CONTEXT_STATE(1) |
CC1_SHADOW_CS_SH_REGS(1) | CC1_SHADOW_GFX_SH_REGS(1) |
CC1_SHADOW_GLOBAL_UCONFIG(1) | CC1_SHADOW_GLOBAL_CONFIG(1));
}
} else if (sctx->is_gfx_queue && !sctx->uses_kernelq_reg_shadowing) {
ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
ac_pm4_cmd_add(&pm4->base, CC0_UPDATE_LOAD_ENABLES(1));


@@ -806,6 +806,21 @@ struct radeon_winsys {
* be combined as a gang submission to GPU.
*/
bool (*cs_create_compute_gang)(struct radeon_cmdbuf *rcs);
/**
* On gfx11.5, enabling register shadowing and programming the shadow regs addresses has to be
* done using CONTEXT_CONTROL and LOAD_* packets, and these packets have to be submitted with
* every job.
*/
bool (*userq_f32_init_reg_shadowing)(struct radeon_cmdbuf *rcs, struct ac_pm4_state *pm4);
/**
* Gets the shadow regs VA from the given radeon_cmdbuf. The radeon_cmdbuf will be the gfx_cs,
* which is per context. With userqueues, the shadow regs VA is per userqueue; the gfx_cs is
* tied to a userqueue, and the VA returned is the one for that userqueue.
*/
uint64_t (*userq_f32_get_shadow_regs_va)(struct radeon_cmdbuf *rcs);
};
static inline bool radeon_emitted(struct radeon_cmdbuf *rcs, unsigned num_dw)

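For clarity, the intended calling pattern for the two hooks, condensed from the si_init_cp_reg_shadowing() hunk above (same names, error handling trimmed): fetch the per-userqueue shadow VA, build the LOAD_* packets against it, then hand the PM4 state to the winsys, which copies it into a per-queue IB and chains that IB in front of every submission. This is a sketch, not a drop-in replacement for the driver code.

/* Condensed from si_init_cp_reg_shadowing() above; shadowing_pm4 already contains the
 * CONTEXT_CONTROL packet built there. */
uint64_t shadow_va = sctx->ws->userq_f32_get_shadow_regs_va(&sctx->gfx_cs);

for (unsigned i = 0; i < SI_NUM_REG_RANGES; i++)
   ac_build_load_reg(&sctx->screen->info, shadowing_pm4, i, shadow_va);

/* The winsys copies the PM4 into a per-queue IB and chains it before every
 * userqueue submission (see amdgpu_cs_add_userq_packets() below). */
if (!sctx->ws->userq_f32_init_reg_shadowing(&sctx->gfx_cs, shadowing_pm4))
   mesa_loge("userq_f32_init_reg_shadowing failed");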

@@ -1476,6 +1476,13 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0));
amdgpu_pkt_add_dw(0x0);
if (userq->f32_shadowing_ib_bo) {
amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
amdgpu_pkt_add_dw(amdgpu_bo_get_va(userq->f32_shadowing_ib_bo));
amdgpu_pkt_add_dw(amdgpu_bo_get_va(userq->f32_shadowing_ib_bo) >> 32);
amdgpu_pkt_add_dw(userq->f32_shadowing_ib_pm4_dw | S_3F3_INHERIT_VMID_MQD_GFX(1));
}
amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
amdgpu_pkt_add_dw(csc->chunk_ib[IB_MAIN].va_start);
amdgpu_pkt_add_dw(csc->chunk_ib[IB_MAIN].va_start >> 32);


@@ -87,6 +87,7 @@ amdgpu_userq_deinit(struct amdgpu_winsys *aws, struct amdgpu_userq *userq)
radeon_bo_reference(&aws->dummy_sws.base, &userq->gfx_data.csa_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->gfx_data.shadow_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->cs_preamble_ib_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->f32_shadowing_ib_bo, NULL);
break;
case AMD_IP_COMPUTE:
radeon_bo_reference(&aws->dummy_sws.base, &userq->compute_data.eop_bo, NULL);
@@ -237,7 +238,7 @@ amdgpu_userq_submit_cs_preamble_ib_once(struct radeon_cmdbuf *rcs, struct ac_pm4
struct amdgpu_cs *acs = amdgpu_cs(rcs);
struct amdgpu_winsys *aws = acs->aws;
struct amdgpu_userq *userq = &aws->queues[acs->queue_index].userq;
uint64_t *cs_preamble_ib_bo_map;
uint8_t *cs_preamble_ib_bo_map;
simple_mtx_lock(&userq->lock);
@@ -248,7 +249,6 @@ amdgpu_userq_submit_cs_preamble_ib_once(struct radeon_cmdbuf *rcs, struct ac_pm4
userq->is_cs_preamble_ib_sent = true;
assert(userq->ip_type == AMD_IP_GFX);
assert(!userq->next_wptr);
userq->cs_preamble_ib_bo = amdgpu_bo_create(aws, pm4->ndw * 4, 256, RADEON_DOMAIN_GTT,
RADEON_FLAG_GL2_BYPASS |
@@ -279,7 +279,66 @@ amdgpu_userq_submit_cs_preamble_ib_once(struct radeon_cmdbuf *rcs, struct ac_pm4
return true;
}
static bool
amdgpu_userq_f32_init_reg_shadowing(struct radeon_cmdbuf *rcs, struct ac_pm4_state *pm4)
{
struct amdgpu_cs *acs = amdgpu_cs(rcs);
struct amdgpu_winsys *aws = acs->aws;
struct amdgpu_userq *userq = &aws->queues[acs->queue_index].userq;
uint8_t *shadowing_ib_bo_map;
simple_mtx_lock(&userq->lock);
if (userq->f32_is_shadowing_ib_initialized) {
simple_mtx_unlock(&userq->lock);
return true;
}
userq->f32_is_shadowing_ib_initialized = true;
assert(userq->ip_type == AMD_IP_GFX);
assert(!userq->next_wptr);
userq->f32_shadowing_ib_bo = amdgpu_bo_create(aws, pm4->ndw * 4, 256, RADEON_DOMAIN_VRAM,
RADEON_FLAG_NO_INTERPROCESS_SHARING);
if (!userq->f32_shadowing_ib_bo) {
simple_mtx_unlock(&userq->lock);
return false;
}
shadowing_ib_bo_map = amdgpu_bo_map(&aws->dummy_sws.base, userq->f32_shadowing_ib_bo, NULL,
PIPE_MAP_READ | PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
if (!shadowing_ib_bo_map) {
simple_mtx_unlock(&userq->lock);
return false;
}
memcpy(shadowing_ib_bo_map, &pm4->pm4, pm4->ndw * 4);
userq->f32_shadowing_ib_pm4_dw = pm4->ndw;
amdgpu_pkt_begin();
amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
amdgpu_pkt_add_dw(amdgpu_bo_get_va(userq->f32_shadowing_ib_bo));
amdgpu_pkt_add_dw(amdgpu_bo_get_va(userq->f32_shadowing_ib_bo) >> 32);
amdgpu_pkt_add_dw(pm4->ndw | S_3F3_INHERIT_VMID_MQD_GFX(1));
amdgpu_pkt_end();
simple_mtx_unlock(&userq->lock);
return true;
}
static uint64_t
amdgpu_userq_f32_get_shadow_regs_va(struct radeon_cmdbuf *rcs)
{
struct amdgpu_cs *acs = amdgpu_cs(rcs);
struct amdgpu_winsys *aws = acs->aws;
struct amdgpu_userq *userq = &aws->queues[acs->queue_index].userq;
assert(userq->ip_type == AMD_IP_GFX);
return amdgpu_bo_get_va(userq->gfx_data.shadow_bo);
}
void amdgpu_userq_init_functions(struct amdgpu_screen_winsys *sws)
{
sws->base.userq_submit_cs_preamble_ib_once = amdgpu_userq_submit_cs_preamble_ib_once;
sws->base.userq_f32_init_reg_shadowing = amdgpu_userq_f32_init_reg_shadowing;
sws->base.userq_f32_get_shadow_regs_va = amdgpu_userq_f32_get_shadow_regs_va;
}


@@ -79,6 +79,13 @@ struct amdgpu_userq {
struct pb_buffer_lean *doorbell_bo;
uint64_t *doorbell_bo_map;
/* On gfx11.5, the shadow register addresses have to be initialized using LOAD_* packets.
* Also, for every new IB/job submission, the shadowed registers have to be loaded using LOAD_*
* packets.
*/
struct pb_buffer_lean *f32_shadowing_ib_bo;
uint32_t f32_shadowing_ib_pm4_dw;
bool f32_is_shadowing_ib_initialized;
struct pb_buffer_lean *cs_preamble_ib_bo;
bool is_cs_preamble_ib_sent;
uint32_t userq_handle;
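At submission time these fields are consumed as shown in amdgpu_cs_add_userq_packets() above: the shadowing IB is chained with an INDIRECT_BUFFER packet that splits the BO's 64-bit VA into two dwords and carries the size in dwords together with the VMID-inherit flag. Below is a minimal standalone sketch of that encoding; the opcode and the INHERIT_VMID_MQD_GFX bit position are illustrative placeholders, not the real values from the packet headers.

#include <stdint.h>
#include <stdio.h>

#define TYPE3_HEADER(op, count)  ((3u << 30) | ((uint32_t)(count) << 16) | ((uint32_t)(op) << 8))
#define OP_INDIRECT_BUFFER       0x3f        /* assumed opcode */
#define INHERIT_VMID_MQD_GFX_BIT (1u << 30)  /* placeholder for S_3F3_INHERIT_VMID_MQD_GFX(1) */

/* Mirrors the dword sequence from amdgpu_cs_add_userq_packets(): chain an IB by VA + size. */
static unsigned emit_chained_ib(uint32_t *out, uint64_t ib_va, uint32_t ib_num_dw)
{
   out[0] = TYPE3_HEADER(OP_INDIRECT_BUFFER, 2);   /* 3 payload dwords -> count 2 */
   out[1] = (uint32_t)ib_va;                       /* IB VA, low 32 bits */
   out[2] = (uint32_t)(ib_va >> 32);               /* IB VA, high 32 bits */
   out[3] = ib_num_dw | INHERIT_VMID_MQD_GFX_BIT;  /* size in dwords + VMID inherit flag */
   return 4;
}

int main(void)
{
   uint32_t pkt[4];
   /* The VA would be amdgpu_bo_get_va(f32_shadowing_ib_bo) and the size
    * f32_shadowing_ib_pm4_dw; these numbers are just examples. */
   unsigned n = emit_chained_ib(pkt, 0x80000000a000ull, 96);
   for (unsigned i = 0; i < n; i++)
      printf("dw%u = 0x%08x\n", i, pkt[i]);
   return 0;
}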