diff --git a/src/amd/common/ac_shadowed_regs.c b/src/amd/common/ac_shadowed_regs.c index edf930b391e..435b497e965 100644 --- a/src/amd/common/ac_shadowed_regs.c +++ b/src/amd/common/ac_shadowed_regs.c @@ -2929,10 +2929,8 @@ void ac_print_nonshadowed_regs(enum amd_gfx_level gfx_level, enum radeon_family } } -static void ac_build_load_reg(const struct radeon_info *info, - struct ac_pm4_state *pm4, - enum ac_reg_range_type type, - uint64_t gpu_address) +void ac_build_load_reg(const struct radeon_info *info, struct ac_pm4_state *pm4, + enum ac_reg_range_type type, uint64_t gpu_address) { unsigned packet, num_ranges, offset; const struct ac_reg_range *ranges; diff --git a/src/amd/common/ac_shadowed_regs.h b/src/amd/common/ac_shadowed_regs.h index ecfc79c722b..66fa88219db 100644 --- a/src/amd/common/ac_shadowed_regs.h +++ b/src/amd/common/ac_shadowed_regs.h @@ -32,6 +32,8 @@ void ac_get_reg_ranges(enum amd_gfx_level gfx_level, enum radeon_family family, const struct ac_reg_range **ranges); struct ac_pm4_state *ac_emulate_clear_state(const struct radeon_info *info); void ac_print_nonshadowed_regs(enum amd_gfx_level gfx_level, enum radeon_family family); +void ac_build_load_reg(const struct radeon_info *info, struct ac_pm4_state *pm4, + enum ac_reg_range_type type, uint64_t gpu_address); struct ac_pm4_state *ac_create_shadowing_ib_preamble(const struct radeon_info *info, uint64_t gpu_address, diff --git a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c index a9e0d16ef37..488e2c4ba47 100644 --- a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c +++ b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c @@ -15,6 +15,36 @@ bool si_init_cp_reg_shadowing(struct si_context *sctx) return false; if (sctx->uses_userq_reg_shadowing) { + /* In case of GFX11_5, shadow_va passed in ac_drm_create_userqueue() is not used by the + * firmware. Instead need to initialize the register shadowing addresses using LOAD_* packets. 
+ * Also the LOAD_* packets and enabling register shadowing in CONTEXT_CONTROL packet has to + * be submitted for every job. + */ + if (sctx->gfx_level == GFX11_5) { + struct ac_pm4_state *shadowing_pm4 = ac_pm4_create_sized(&sctx->screen->info, false, 1024, + sctx->is_gfx_queue); + if (!shadowing_pm4) { + mesa_loge("failed to allocate memory for shadowing_pm4"); + return false; + } + + ac_pm4_cmd_add(shadowing_pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + ac_pm4_cmd_add(shadowing_pm4, CC0_UPDATE_LOAD_ENABLES(1) | + CC0_LOAD_PER_CONTEXT_STATE(1) | CC0_LOAD_CS_SH_REGS(1) | + CC0_LOAD_GFX_SH_REGS(1) | CC0_LOAD_GLOBAL_UCONFIG(1)); + ac_pm4_cmd_add(shadowing_pm4, CC1_UPDATE_SHADOW_ENABLES(1) | + CC1_SHADOW_PER_CONTEXT_STATE(1) | CC1_SHADOW_CS_SH_REGS(1) | + CC1_SHADOW_GFX_SH_REGS(1) | CC1_SHADOW_GLOBAL_UCONFIG(1) | + CC1_SHADOW_GLOBAL_CONFIG(1)); + + for (unsigned i = 0; i < SI_NUM_REG_RANGES; i++) + ac_build_load_reg(&sctx->screen->info, shadowing_pm4, i, + sctx->ws->userq_f32_get_shadow_regs_va(&sctx->gfx_cs)); + + sctx->ws->userq_f32_init_reg_shadowing(&sctx->gfx_cs, shadowing_pm4); + ac_pm4_free_state(shadowing_pm4); + } + sctx->ws->userq_submit_cs_preamble_ib_once(&sctx->gfx_cs, &sctx->cs_preamble_state->base); si_pm4_free_state(sctx, sctx->cs_preamble_state, ~0); sctx->cs_preamble_state = NULL; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 5f1bdbe46c5..949fea24d06 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5119,13 +5119,18 @@ static bool gfx10_init_gfx_preamble_state(struct si_context *sctx) } if (sctx->uses_userq_reg_shadowing) { - ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); - ac_pm4_cmd_add(&pm4->base, CC0_UPDATE_LOAD_ENABLES(1) | CC0_LOAD_PER_CONTEXT_STATE(1) | - CC0_LOAD_CS_SH_REGS(1) | CC0_LOAD_GFX_SH_REGS(1) | - CC0_LOAD_GLOBAL_UCONFIG(1)); - ac_pm4_cmd_add(&pm4->base, CC1_UPDATE_SHADOW_ENABLES(1) | CC1_SHADOW_PER_CONTEXT_STATE(1) 
| - CC1_SHADOW_CS_SH_REGS(1) | CC1_SHADOW_GFX_SH_REGS(1) | - CC1_SHADOW_GLOBAL_UCONFIG(1) | CC1_SHADOW_GLOBAL_CONFIG(1)); + /* In case of GFX11_5, CONTEXT_CONTROL packet is added in si_init_cp_reg_shadowing() + * function. + */ + if (sctx->gfx_level != GFX11_5) { + ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + ac_pm4_cmd_add(&pm4->base, CC0_UPDATE_LOAD_ENABLES(1) | CC0_LOAD_PER_CONTEXT_STATE(1) | + CC0_LOAD_CS_SH_REGS(1) | CC0_LOAD_GFX_SH_REGS(1) | + CC0_LOAD_GLOBAL_UCONFIG(1)); + ac_pm4_cmd_add(&pm4->base, CC1_UPDATE_SHADOW_ENABLES(1) | CC1_SHADOW_PER_CONTEXT_STATE(1) | + CC1_SHADOW_CS_SH_REGS(1) | CC1_SHADOW_GFX_SH_REGS(1) | + CC1_SHADOW_GLOBAL_UCONFIG(1) | CC1_SHADOW_GLOBAL_CONFIG(1)); + } } else if (sctx->is_gfx_queue && !sctx->uses_kernelq_reg_shadowing) { ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); ac_pm4_cmd_add(&pm4->base, CC0_UPDATE_LOAD_ENABLES(1)); diff --git a/src/gallium/include/winsys/radeon_winsys.h b/src/gallium/include/winsys/radeon_winsys.h index e118baf0f69..ebb439b5a00 100644 --- a/src/gallium/include/winsys/radeon_winsys.h +++ b/src/gallium/include/winsys/radeon_winsys.h @@ -806,6 +806,21 @@ struct radeon_winsys { * be combined as a gang submission to GPU. */ bool (*cs_create_compute_gang)(struct radeon_cmdbuf *rcs); + + /** + * In case of gfx11.5, register shadowing enabling and shadow regs addresses have to be done + * using CONTEXT_CONTROL and LOAD_* packets. Also these packets have to be submitted for every + * job. + */ + bool (*userq_f32_init_reg_shadowing)(struct radeon_cmdbuf *rcs, struct ac_pm4_state *pm4); + + /** + * Gets the shadow regs va address from the given radeon_cmdbuf. The radeon_cmdbuf will be gfx_cs + * and it is per context. In case of userqueue, the shadow regs va address is per userqueue. The + * gfx_cs will be tied to a userqueue and the shadow regs va address returned will be for + * that userqueue. 
+ */ + uint64_t (*userq_f32_get_shadow_regs_va)(struct radeon_cmdbuf *rcs); }; static inline bool radeon_emitted(struct radeon_cmdbuf *rcs, unsigned num_dw) diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp index c240dd249aa..a1b960e4792 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp @@ -1476,6 +1476,13 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws, amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0)); amdgpu_pkt_add_dw(0x0); + if (userq->f32_shadowing_ib_bo) { + amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0)); + amdgpu_pkt_add_dw(amdgpu_bo_get_va(userq->f32_shadowing_ib_bo)); + amdgpu_pkt_add_dw(amdgpu_bo_get_va(userq->f32_shadowing_ib_bo) >> 32); + amdgpu_pkt_add_dw(userq->f32_shadowing_ib_pm4_dw | S_3F3_INHERIT_VMID_MQD_GFX(1)); + } + amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0)); amdgpu_pkt_add_dw(csc->chunk_ib[IB_MAIN].va_start); amdgpu_pkt_add_dw(csc->chunk_ib[IB_MAIN].va_start >> 32); diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c index 0595e5ac0a1..14f937ae27c 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c @@ -87,6 +87,7 @@ amdgpu_userq_deinit(struct amdgpu_winsys *aws, struct amdgpu_userq *userq) radeon_bo_reference(&aws->dummy_sws.base, &userq->gfx_data.csa_bo, NULL); radeon_bo_reference(&aws->dummy_sws.base, &userq->gfx_data.shadow_bo, NULL); radeon_bo_reference(&aws->dummy_sws.base, &userq->cs_preamble_ib_bo, NULL); + radeon_bo_reference(&aws->dummy_sws.base, &userq->f32_shadowing_ib_bo, NULL); break; case AMD_IP_COMPUTE: radeon_bo_reference(&aws->dummy_sws.base, &userq->compute_data.eop_bo, NULL); @@ -237,7 +238,7 @@ amdgpu_userq_submit_cs_preamble_ib_once(struct radeon_cmdbuf *rcs, struct ac_pm4 struct amdgpu_cs *acs = amdgpu_cs(rcs); struct amdgpu_winsys *aws = acs->aws; struct amdgpu_userq 
*userq = &aws->queues[acs->queue_index].userq; - uint64_t *cs_preamble_ib_bo_map; + uint8_t *cs_preamble_ib_bo_map; simple_mtx_lock(&userq->lock); @@ -248,7 +249,6 @@ amdgpu_userq_submit_cs_preamble_ib_once(struct radeon_cmdbuf *rcs, struct ac_pm4 userq->is_cs_preamble_ib_sent = true; assert(userq->ip_type == AMD_IP_GFX); - assert(!userq->next_wptr); userq->cs_preamble_ib_bo = amdgpu_bo_create(aws, pm4->ndw * 4, 256, RADEON_DOMAIN_GTT, RADEON_FLAG_GL2_BYPASS | @@ -279,7 +279,66 @@ amdgpu_userq_submit_cs_preamble_ib_once(struct radeon_cmdbuf *rcs, struct ac_pm4 return true; } +static bool +amdgpu_userq_f32_init_reg_shadowing(struct radeon_cmdbuf *rcs, struct ac_pm4_state *pm4) +{ + struct amdgpu_cs *acs = amdgpu_cs(rcs); + struct amdgpu_winsys *aws = acs->aws; + struct amdgpu_userq *userq = &aws->queues[acs->queue_index].userq; + uint8_t *shadowing_ib_bo_map; + + simple_mtx_lock(&userq->lock); + + if (userq->f32_is_shadowing_ib_initialized) { + simple_mtx_unlock(&userq->lock); + return true; + } + + userq->f32_is_shadowing_ib_initialized = true; + assert(userq->ip_type == AMD_IP_GFX); + assert(!userq->next_wptr); + + userq->f32_shadowing_ib_bo = amdgpu_bo_create(aws, pm4->ndw * 4, 256, RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_INTERPROCESS_SHARING); + if (!userq->f32_shadowing_ib_bo) { + simple_mtx_unlock(&userq->lock); + return false; + } + + shadowing_ib_bo_map = amdgpu_bo_map(&aws->dummy_sws.base, userq->f32_shadowing_ib_bo, NULL, + PIPE_MAP_READ | PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED); + if (!shadowing_ib_bo_map) { + simple_mtx_unlock(&userq->lock); + return false; + } + + memcpy(shadowing_ib_bo_map, &pm4->pm4, pm4->ndw * 4); + userq->f32_shadowing_ib_pm4_dw = pm4->ndw; + + amdgpu_pkt_begin(); + amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0)); + amdgpu_pkt_add_dw(amdgpu_bo_get_va(userq->f32_shadowing_ib_bo)); + amdgpu_pkt_add_dw(amdgpu_bo_get_va(userq->f32_shadowing_ib_bo) >> 32); + amdgpu_pkt_add_dw(pm4->ndw | S_3F3_INHERIT_VMID_MQD_GFX(1)); + 
amdgpu_pkt_end(); + + simple_mtx_unlock(&userq->lock); + return true; +} + +static uint64_t +amdgpu_userq_f32_get_shadow_regs_va(struct radeon_cmdbuf *rcs) { + struct amdgpu_cs *acs = amdgpu_cs(rcs); + struct amdgpu_winsys *aws = acs->aws; + struct amdgpu_userq *userq = &aws->queues[acs->queue_index].userq; + + assert(userq->ip_type == AMDGPU_HW_IP_GFX); + return amdgpu_bo_get_va(userq->gfx_data.shadow_bo); +} + void amdgpu_userq_init_functions(struct amdgpu_screen_winsys *sws) { sws->base.userq_submit_cs_preamble_ib_once = amdgpu_userq_submit_cs_preamble_ib_once; + sws->base.userq_f32_init_reg_shadowing = amdgpu_userq_f32_init_reg_shadowing; + sws->base.userq_f32_get_shadow_regs_va = amdgpu_userq_f32_get_shadow_regs_va; } diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h index 018121eb76b..e2d19e2abf2 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h @@ -79,6 +79,13 @@ struct amdgpu_userq { struct pb_buffer_lean *doorbell_bo; uint64_t *doorbell_bo_map; + /* In case of gfx11.5, the shadow register address has to be initialized using LOAD_* packet. + * Also for every new ib/job submission, the shadowed registers have to be loaded using LOAD_* + * packets. + */ + struct pb_buffer_lean *f32_shadowing_ib_bo; + uint32_t f32_shadowing_ib_pm4_dw; + bool f32_is_shadowing_ib_initialized; struct pb_buffer_lean *cs_preamble_ib_bo; bool is_cs_preamble_ib_sent; uint32_t userq_handle;