From 32c7805ccca331f726da684a4e74f7d1138daa3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 14 May 2022 10:23:05 -0400 Subject: [PATCH] radeonsi: merge all preamble states into one Tess registers are appended. GS registers are appended or overwritten if they are already set. There are separate TMZ and non-TMZ preambles. The preamble will be passed to the kernel as an IB to execute on a context switch only. Reviewed-by: Mihai Preda Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_debug.c | 4 - src/gallium/drivers/radeonsi/si_gfx_cs.c | 7 +- src/gallium/drivers/radeonsi/si_pipe.c | 9 +- src/gallium/drivers/radeonsi/si_pipe.h | 8 +- src/gallium/drivers/radeonsi/si_state.c | 4 + .../drivers/radeonsi/si_state_shaders.cpp | 130 +++++++++--------- 6 files changed, 80 insertions(+), 82 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index c53d2f589b9..85f5667f2a7 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -416,10 +416,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) if (ctx->cs_preamble_state) ac_parse_ib(f, ctx->cs_preamble_state->pm4, ctx->cs_preamble_state->ndw, NULL, 0, "IB2: Init config", ctx->gfx_level, NULL, NULL); - - if (ctx->cs_preamble_gs_rings) - ac_parse_ib(f, ctx->cs_preamble_gs_rings->pm4, ctx->cs_preamble_gs_rings->ndw, NULL, 0, - "IB2: Init GS rings", ctx->gfx_level, NULL, NULL); } if (scs->flushed) { diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 7cc718fd452..6bebfffccb9 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -437,12 +437,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) /* The CS initialization should be emitted before everything else. 
*/ if (ctx->cs_preamble_state) - si_pm4_emit(ctx, ctx->cs_preamble_state); - if (ctx->cs_preamble_tess_rings) - si_pm4_emit(ctx, unlikely(is_secure) ? ctx->cs_preamble_tess_rings_tmz : - ctx->cs_preamble_tess_rings); - if (ctx->cs_preamble_gs_rings) - si_pm4_emit(ctx, ctx->cs_preamble_gs_rings); + si_pm4_emit(ctx, unlikely(is_secure) ? ctx->cs_preamble_state_tmz : ctx->cs_preamble_state); if (ctx->queued.named.ls) ctx->prefetch_L2_mask |= SI_PREFETCH_LS; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 383056f03fe..589af37dd85 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -222,12 +222,9 @@ static void si_destroy_context(struct pipe_context *context) if (sctx->cs_preamble_state) si_pm4_free_state(sctx, sctx->cs_preamble_state, ~0); - if (sctx->cs_preamble_tess_rings) - si_pm4_free_state(sctx, sctx->cs_preamble_tess_rings, ~0); - if (sctx->cs_preamble_tess_rings_tmz) - si_pm4_free_state(sctx, sctx->cs_preamble_tess_rings_tmz, ~0); - if (sctx->cs_preamble_gs_rings) - si_pm4_free_state(sctx, sctx->cs_preamble_gs_rings, ~0); + if (sctx->cs_preamble_state_tmz) + si_pm4_free_state(sctx, sctx->cs_preamble_state_tmz, ~0); + for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++) si_pm4_free_state(sctx, sctx->vgt_shader_config[i], SI_STATE_IDX(vgt_shader_config)); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index a492faec8f6..8dcdb6a0d99 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1051,10 +1051,12 @@ struct si_context { /* Precomputed states. 
*/ struct si_pm4_state *cs_preamble_state; - struct si_pm4_state *cs_preamble_tess_rings; - struct si_pm4_state *cs_preamble_tess_rings_tmz; - struct si_pm4_state *cs_preamble_gs_rings; + struct si_pm4_state *cs_preamble_state_tmz; + uint16_t gs_ring_state_dw_offset; + uint16_t gs_ring_state_dw_offset_tmz; bool cs_preamble_has_vgt_flush; + bool cs_preamble_has_vgt_flush_tmz; + struct si_pm4_state *vgt_shader_config[SI_NUM_VGT_STAGES_STATES]; /* shaders */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index aea74b7f1e9..ee19553f370 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5905,4 +5905,8 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) } sctx->cs_preamble_state = pm4; + + /* Make a copy of the preamble for TMZ. */ + sctx->cs_preamble_state_tmz = (struct si_pm4_state *)CALLOC_STRUCT(si_cs_preamble); + memcpy(sctx->cs_preamble_state_tmz, sctx->cs_preamble_state, sizeof(struct si_cs_preamble)); } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 8d70cfcba90..4e824f39d28 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -3666,22 +3666,27 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state) /** * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that. */ -static void si_cs_preamble_add_vgt_flush(struct si_context *sctx) +static void si_cs_preamble_add_vgt_flush(struct si_context *sctx, bool tmz) { + struct si_pm4_state *pm4 = tmz ? sctx->cs_preamble_state_tmz : sctx->cs_preamble_state; + bool *has_vgt_flush = tmz ? &sctx->cs_preamble_has_vgt_flush_tmz : + &sctx->cs_preamble_has_vgt_flush; + /* We shouldn't get here if registers are shadowed. 
*/ assert(!sctx->shadowed_regs); - if (sctx->cs_preamble_has_vgt_flush) + if (*has_vgt_flush) return; /* Done by Vulkan before VGT_FLUSH. */ - si_pm4_cmd_add(sctx->cs_preamble_state, PKT3(PKT3_EVENT_WRITE, 0, 0)); - si_pm4_cmd_add(sctx->cs_preamble_state, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */ - si_pm4_cmd_add(sctx->cs_preamble_state, PKT3(PKT3_EVENT_WRITE, 0, 0)); - si_pm4_cmd_add(sctx->cs_preamble_state, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); - sctx->cs_preamble_has_vgt_flush = true; + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + + *has_vgt_flush = true; } /** @@ -3709,7 +3714,6 @@ bool si_update_gs_ring_buffers(struct si_context *sctx) struct si_shader_selector *es = sctx->shader.tes.cso ? sctx->shader.tes.cso : sctx->shader.vs.cso; struct si_shader_selector *gs = sctx->shader.gs.cso; - struct si_pm4_state *pm4; /* Chip constants. */ unsigned num_se = sctx->screen->info.max_se; @@ -3811,32 +3815,43 @@ bool si_update_gs_ring_buffers(struct si_context *sctx) } /* The codepath without register shadowing. */ - /* Create the "cs_preamble_gs_rings" state. */ - pm4 = CALLOC_STRUCT(si_pm4_state); - if (!pm4) - return false; + for (unsigned tmz = 0; tmz <= 1; tmz++) { + struct si_pm4_state *pm4 = tmz ? sctx->cs_preamble_state_tmz : sctx->cs_preamble_state; + uint16_t *gs_ring_state_dw_offset = tmz ? 
&sctx->gs_ring_state_dw_offset_tmz : + &sctx->gs_ring_state_dw_offset; + unsigned old_ndw = 0; - if (sctx->gfx_level >= GFX7) { - if (sctx->esgs_ring) { - assert(sctx->gfx_level <= GFX8); - si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); + si_cs_preamble_add_vgt_flush(sctx, tmz); + + if (!*gs_ring_state_dw_offset) { + /* We are here for the first time. The packets will be added. */ + *gs_ring_state_dw_offset = pm4->ndw; + } else { + /* We have been here before. Overwrite the previous packets. */ + old_ndw = pm4->ndw; + pm4->ndw = *gs_ring_state_dw_offset; + } + + if (sctx->gfx_level >= GFX7) { + if (sctx->esgs_ring) { + assert(sctx->gfx_level <= GFX8); + si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); + } + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); + } else { + if (sctx->esgs_ring) + si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); + } + + if (old_ndw) { + pm4->ndw = old_ndw; + pm4->last_opcode = 255; /* invalid opcode (we don't save the last opcode) */ } - if (sctx->gsvs_ring) - si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); - } else { - if (sctx->esgs_ring) - si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); - if (sctx->gsvs_ring) - si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); } - /* Set the state. */ - if (sctx->cs_preamble_gs_rings) - si_pm4_free_state(sctx, sctx->cs_preamble_gs_rings, ~0); - sctx->cs_preamble_gs_rings = pm4; - - si_cs_preamble_add_vgt_flush(sctx); - /* Flush the context to re-emit both cs_preamble states. 
*/ sctx->initial_gfx_cs_size = 0; /* force flush */ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); @@ -4081,42 +4096,31 @@ void si_init_tess_factor_ring(struct si_context *sctx) return; } - /* The codepath without register shadowing. */ - si_cs_preamble_add_vgt_flush(sctx); + /* The codepath without register shadowing is below. */ + /* Add these registers to cs_preamble_state. */ + for (unsigned tmz = 0; tmz <= 1; tmz++) { + struct si_pm4_state *pm4 = tmz ? sctx->cs_preamble_state_tmz : sctx->cs_preamble_state; + struct pipe_resource *tf_ring = tmz ? sctx->tess_rings_tmz : sctx->tess_rings; - /* Append these registers to the init config state. */ - if (sctx->gfx_level >= GFX7) { - si_pm4_set_reg(sctx->cs_preamble_state, R_030938_VGT_TF_RING_SIZE, - S_030938_SIZE(tf_ring_size_field)); - si_pm4_set_reg(sctx->cs_preamble_state, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); - if (sctx->gfx_level >= GFX10) - si_pm4_set_reg(sctx->cs_preamble_state, R_030984_VGT_TF_MEMORY_BASE_HI, - S_030984_BASE_HI(factor_va >> 40)); - else if (sctx->gfx_level == GFX9) - si_pm4_set_reg(sctx->cs_preamble_state, R_030944_VGT_TF_MEMORY_BASE_HI, - S_030944_BASE_HI(factor_va >> 40)); - si_pm4_set_reg(sctx->cs_preamble_state, R_03093C_VGT_HS_OFFCHIP_PARAM, - sctx->screen->hs.hs_offchip_param); - } else { - struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + if (!tf_ring) + continue; /* TMZ not supported */ - si_pm4_set_reg(pm4, R_008988_VGT_TF_RING_SIZE, - S_008988_SIZE(tf_ring_size_field)); - si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8); - si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM, - sctx->screen->hs.hs_offchip_param); - sctx->cs_preamble_tess_rings = pm4; + uint64_t va = si_resource(tf_ring)->gpu_address + sctx->screen->hs.tess_offchip_ring_size; - if (sctx->screen->info.has_tmz_support) { - pm4 = CALLOC_STRUCT(si_pm4_state); - uint64_t factor_va_tmz = - si_resource(sctx->tess_rings_tmz)->gpu_address + 
sctx->screen->hs.tess_offchip_ring_size; - si_pm4_set_reg(pm4, R_008988_VGT_TF_RING_SIZE, - S_008988_SIZE(tf_ring_size_field)); - si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, factor_va_tmz >> 8); - si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM, - sctx->screen->hs.hs_offchip_param); - sctx->cs_preamble_tess_rings_tmz = pm4; + si_cs_preamble_add_vgt_flush(sctx, tmz); + + if (sctx->gfx_level >= GFX7) { + si_pm4_set_reg(pm4, R_030938_VGT_TF_RING_SIZE, S_030938_SIZE(tf_ring_size_field)); + si_pm4_set_reg(pm4, R_03093C_VGT_HS_OFFCHIP_PARAM, sctx->screen->hs.hs_offchip_param); + si_pm4_set_reg(pm4, R_030940_VGT_TF_MEMORY_BASE, va >> 8); + if (sctx->gfx_level >= GFX10) + si_pm4_set_reg(pm4, R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(va >> 40)); + else if (sctx->gfx_level == GFX9) + si_pm4_set_reg(pm4, R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(va >> 40)); + } else { + si_pm4_set_reg(pm4, R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size_field)); + si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, va >> 8); /* this preamble's ring VA */ + si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM, sctx->screen->hs.hs_offchip_param); } }