radeonsi/gfx11: program the attribute ring right before draws

This way, we only wait for idle right before draw packets,
so that all preceding SET packets can be processed in parallel
with draws from the previous IB.

Add a new state atom that is emitted last. For now it only contains code for gfx11;
code for older chips will be added by the next commit.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27943>
Author: Marek Olšák, 2024-02-27 16:18:13 -05:00 (committed by Marge Bot)
commit b9b7d34d05, parent 9e08569d6a
4 changed files with 49 additions and 40 deletions
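
The ordering argument in the commit message can be pictured with a small standalone model (my own sketch, not the real radeonsi types or emit path): dirty atoms are flushed in declaration order immediately before the draw packet, so an atom placed last has its wait-for-idle land right in front of the draw instead of ahead of all the other SET packets.

/* Toy model of the atom-ordering idea described above; the real driver walks
 * si_atom callbacks from its draw code, this only illustrates why the atom
 * that waits for idle should be emitted last. */
#include <stdint.h>
#include <stdio.h>

struct atom {
   void (*emit)(void);
};

static void emit_cache_flush(void)
{
   puts("  cache_flush: SET/flush packets (can overlap the previous IB's draws)");
}

static void emit_spi_ge_ring_state(void)
{
   puts("  spi_ge_ring_state: PWS wait for idle + SPI_ATTRIBUTE_RING_* writes");
}

int main(void)
{
   /* Declaration order is emission order; the idle wait comes last. */
   struct atom atoms[] = {
      {emit_cache_flush},
      {emit_spi_ge_ring_state},
   };
   uint64_t dirty = ~0ull; /* a new IB marks everything dirty */

   puts("before the first draw of a new IB:");
   for (unsigned i = 0; i < sizeof(atoms) / sizeof(atoms[0]); i++) {
      if (dirty & (1ull << i))
         atoms[i].emit();
   }
   puts("  draw packet (immediately after the wait)");
   return 0;
}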

@@ -411,6 +411,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    ctx->flags |= SI_CONTEXT_VGT_FLUSH;
    si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_ge_ring_state);
 
    if (ctx->screen->attribute_ring) {
       radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring,

@@ -6467,46 +6467,6 @@ static void gfx10_init_gfx_preamble_state(struct si_context *sctx)
                           PIXEL_PIPE_STATE_CNTL_STRIDE(2) |
                           PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask));
       si_pm4_cmd_add(pm4, PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask));
-
-      /* We must wait for idle using an EOP event before changing the attribute ring registers.
-       * Use the bottom-of-pipe EOP event, but increment the PWS counter instead of writing memory.
-       */
-      si_pm4_cmd_add(pm4, PKT3(PKT3_RELEASE_MEM, 6, 0));
-      si_pm4_cmd_add(pm4, S_490_EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) |
-                          S_490_EVENT_INDEX(5) |
-                          S_490_PWS_ENABLE(1));
-      si_pm4_cmd_add(pm4, 0); /* DST_SEL, INT_SEL, DATA_SEL */
-      si_pm4_cmd_add(pm4, 0); /* ADDRESS_LO */
-      si_pm4_cmd_add(pm4, 0); /* ADDRESS_HI */
-      si_pm4_cmd_add(pm4, 0); /* DATA_LO */
-      si_pm4_cmd_add(pm4, 0); /* DATA_HI */
-      si_pm4_cmd_add(pm4, 0); /* INT_CTXID */
-
-      /* Wait for the PWS counter. */
-      si_pm4_cmd_add(pm4, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-      si_pm4_cmd_add(pm4, S_580_PWS_STAGE_SEL(V_580_CP_ME) |
-                          S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) |
-                          S_580_PWS_ENA2(1) |
-                          S_580_PWS_COUNT(0));
-      si_pm4_cmd_add(pm4, 0xffffffff); /* GCR_SIZE */
-      si_pm4_cmd_add(pm4, 0x01ffffff); /* GCR_SIZE_HI */
-      si_pm4_cmd_add(pm4, 0); /* GCR_BASE_LO */
-      si_pm4_cmd_add(pm4, 0); /* GCR_BASE_HI */
-      si_pm4_cmd_add(pm4, S_585_PWS_ENA(1));
-      si_pm4_cmd_add(pm4, 0); /* GCR_CNTL */
-
-      si_pm4_set_reg(pm4, R_031110_SPI_GS_THROTTLE_CNTL1, 0x12355123);
-      si_pm4_set_reg(pm4, R_031114_SPI_GS_THROTTLE_CNTL2, 0x1544D);
-
-      assert((sscreen->attribute_ring->gpu_address >> 32) == sscreen->info.address32_hi);
-
-      /* The PS will read inputs from this address. */
-      si_pm4_set_reg(pm4, R_031118_SPI_ATTRIBUTE_RING_BASE,
-                     sscreen->attribute_ring->gpu_address >> 16);
-      si_pm4_set_reg(pm4, R_03111C_SPI_ATTRIBUTE_RING_SIZE,
-                     S_03111C_MEM_SIZE((sscreen->info.attribute_ring_size_per_se >> 16) - 1) |
-                     S_03111C_BIG_PAGE(sscreen->info.discardable_allows_big_page) |
-                     S_03111C_L1_POLICY(1));
    }
 
 done:

@@ -237,6 +237,7 @@ union si_state_atoms {
       struct si_atom cache_flush;
       struct si_atom streamout_begin;   /* this must be done after cache_flush */
       struct si_atom render_cond;       /* this must be after cache_flush */
+      struct si_atom spi_ge_ring_state; /* this must be last because it waits for idle. */
    } s;
    struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
 };
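
The "must be last" comment relies on the layout trick in si_state_atoms: the named members and the array alias the same storage, so generic code can walk the atoms in declaration order while other code addresses them by name. A simplified, self-contained sketch of that pattern (names shortened; not the actual Mesa header):

/* Simplified model of the union-of-struct-and-array pattern used by
 * si_state_atoms: declaration order of the named members is the order the
 * array view (and therefore emission) sees them, which is why the atom that
 * waits for idle is declared last. */
#include <assert.h>
#include <stdio.h>

struct atom {
   void (*emit)(void *ctx, unsigned index);
};

union atoms {
   struct atoms_s {
      struct atom cache_flush;
      struct atom streamout_begin;
      struct atom render_cond;
      struct atom spi_ge_ring_state; /* must stay last: it waits for idle */
   } s;
   struct atom array[sizeof(struct atoms_s) / sizeof(struct atom)];
};

/* The aliasing is only exact if the struct has no trailing padding. */
static_assert(sizeof(struct atoms_s) == sizeof(((union atoms *)0)->array),
              "named view and array view must cover the same storage");

int main(void)
{
   printf("atom count: %zu\n",
          sizeof(((union atoms *)0)->array) / sizeof(struct atom));
   return 0;
}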

@@ -4872,11 +4872,58 @@ static void si_emit_spi_map(struct si_context *sctx, unsigned index)
    radeon_end_update_context_roll(sctx);
 }
 
+static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index)
+{
+   struct si_screen *sscreen = sctx->screen;
+
+   if (sctx->gfx_level >= GFX11) {
+      radeon_begin(&sctx->gfx_cs);
+
+      /* We must wait for idle using an EOP event before changing the attribute ring registers.
+       * Use the bottom-of-pipe EOP event, but increment the PWS counter instead of writing memory.
+       */
+      radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
+      radeon_emit(S_490_EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) |
+                  S_490_EVENT_INDEX(5) |
+                  S_490_PWS_ENABLE(1));
+      radeon_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */
+      radeon_emit(0); /* ADDRESS_LO */
+      radeon_emit(0); /* ADDRESS_HI */
+      radeon_emit(0); /* DATA_LO */
+      radeon_emit(0); /* DATA_HI */
+      radeon_emit(0); /* INT_CTXID */
+
+      /* Wait for the PWS counter. */
+      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+      radeon_emit(S_580_PWS_STAGE_SEL(V_580_CP_ME) |
+                  S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) |
+                  S_580_PWS_ENA2(1) |
+                  S_580_PWS_COUNT(0));
+      radeon_emit(0xffffffff); /* GCR_SIZE */
+      radeon_emit(0x01ffffff); /* GCR_SIZE_HI */
+      radeon_emit(0); /* GCR_BASE_LO */
+      radeon_emit(0); /* GCR_BASE_HI */
+      radeon_emit(S_585_PWS_ENA(1));
+      radeon_emit(0); /* GCR_CNTL */
+
+      assert((sscreen->attribute_ring->gpu_address >> 32) == sscreen->info.address32_hi);
+      radeon_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4);
+      radeon_emit(0x12355123); /* SPI_GS_THROTTLE_CNTL1 */
+      radeon_emit(0x1544D);    /* SPI_GS_THROTTLE_CNTL2 */
+      radeon_emit(sscreen->attribute_ring->gpu_address >> 16); /* SPI_ATTRIBUTE_RING_BASE */
+      radeon_emit(S_03111C_MEM_SIZE((sscreen->info.attribute_ring_size_per_se >> 16) - 1) |
+                  S_03111C_BIG_PAGE(sscreen->info.discardable_allows_big_page) |
+                  S_03111C_L1_POLICY(1)); /* SPI_ATTRIBUTE_RING_SIZE */
+
+      radeon_end();
+   }
+}
+
 void si_init_shader_functions(struct si_context *sctx)
 {
    sctx->atoms.s.vgt_pipeline_state.emit = si_emit_vgt_pipeline_state;
    sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
    sctx->atoms.s.tess_io_layout.emit = si_emit_tess_io_layout_state;
+   sctx->atoms.s.spi_ge_ring_state.emit = si_emit_spi_ge_ring_state;
 
    sctx->b.create_vs_state = si_create_shader;
    sctx->b.create_tcs_state = si_create_shader;