From b9b7d34d05fee61b05ddac61a73ec38489d73ffc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 27 Feb 2024 16:18:13 -0500 Subject: [PATCH] radeonsi/gfx11: program the attribute ring right before draws This way, we only wait for idle right before draw packets, so that all preceding SET packets can be processed in parallel with draws from the previous IB. Add a new state atom that is emitted last. It only contains code for gfx11, but some code for older chips will be added by the next commit. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_gfx_cs.c | 1 + src/gallium/drivers/radeonsi/si_state.c | 40 ---------------- src/gallium/drivers/radeonsi/si_state.h | 1 + .../drivers/radeonsi/si_state_shaders.cpp | 47 +++++++++++++++++++ 4 files changed, 49 insertions(+), 40 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index ea493836ce8..f4e93d8b4ec 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -411,6 +411,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) ctx->flags |= SI_CONTEXT_VGT_FLUSH; si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush); + si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_ge_ring_state); if (ctx->screen->attribute_ring) { radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring, diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 464fcf6b716..7f0a4c500d7 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -6467,46 +6467,6 @@ static void gfx10_init_gfx_preamble_state(struct si_context *sctx) PIXEL_PIPE_STATE_CNTL_STRIDE(2) | PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask)); si_pm4_cmd_add(pm4, PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask)); - - /* We must wait for idle using an EOP event before changing the attribute ring registers. - * Use the bottom-of-pipe EOP event, but increment the PWS counter instead of writing memory. - */ - si_pm4_cmd_add(pm4, PKT3(PKT3_RELEASE_MEM, 6, 0)); - si_pm4_cmd_add(pm4, S_490_EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | - S_490_EVENT_INDEX(5) | - S_490_PWS_ENABLE(1)); - si_pm4_cmd_add(pm4, 0); /* DST_SEL, INT_SEL, DATA_SEL */ - si_pm4_cmd_add(pm4, 0); /* ADDRESS_LO */ - si_pm4_cmd_add(pm4, 0); /* ADDRESS_HI */ - si_pm4_cmd_add(pm4, 0); /* DATA_LO */ - si_pm4_cmd_add(pm4, 0); /* DATA_HI */ - si_pm4_cmd_add(pm4, 0); /* INT_CTXID */ - - /* Wait for the PWS counter. */ - si_pm4_cmd_add(pm4, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - si_pm4_cmd_add(pm4, S_580_PWS_STAGE_SEL(V_580_CP_ME) | - S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) | - S_580_PWS_ENA2(1) | - S_580_PWS_COUNT(0)); - si_pm4_cmd_add(pm4, 0xffffffff); /* GCR_SIZE */ - si_pm4_cmd_add(pm4, 0x01ffffff); /* GCR_SIZE_HI */ - si_pm4_cmd_add(pm4, 0); /* GCR_BASE_LO */ - si_pm4_cmd_add(pm4, 0); /* GCR_BASE_HI */ - si_pm4_cmd_add(pm4, S_585_PWS_ENA(1)); - si_pm4_cmd_add(pm4, 0); /* GCR_CNTL */ - - si_pm4_set_reg(pm4, R_031110_SPI_GS_THROTTLE_CNTL1, 0x12355123); - si_pm4_set_reg(pm4, R_031114_SPI_GS_THROTTLE_CNTL2, 0x1544D); - - assert((sscreen->attribute_ring->gpu_address >> 32) == sscreen->info.address32_hi); - - /* The PS will read inputs from this address. */ - si_pm4_set_reg(pm4, R_031118_SPI_ATTRIBUTE_RING_BASE, - sscreen->attribute_ring->gpu_address >> 16); - si_pm4_set_reg(pm4, R_03111C_SPI_ATTRIBUTE_RING_SIZE, - S_03111C_MEM_SIZE((sscreen->info.attribute_ring_size_per_se >> 16) - 1) | - S_03111C_BIG_PAGE(sscreen->info.discardable_allows_big_page) | - S_03111C_L1_POLICY(1)); } done: diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index ae4e9b584aa..c9e923d665d 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -237,6 +237,7 @@ union si_state_atoms { struct si_atom cache_flush; struct si_atom streamout_begin; /* this must be done after cache_flush */ struct si_atom render_cond; /* this must be after cache_flush */ + struct si_atom spi_ge_ring_state; /* this must be last because it waits for idle. */ } s; struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)]; }; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 266e8b2b105..d3a7e4c2d96 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4872,11 +4872,58 @@ static void si_emit_spi_map(struct si_context *sctx, unsigned index) radeon_end_update_context_roll(sctx); } +static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index) +{ + struct si_screen *sscreen = sctx->screen; + + if (sctx->gfx_level >= GFX11) { + radeon_begin(&sctx->gfx_cs); + /* We must wait for idle using an EOP event before changing the attribute ring registers. + * Use the bottom-of-pipe EOP event, but increment the PWS counter instead of writing memory. + */ + radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0)); + radeon_emit(S_490_EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | + S_490_EVENT_INDEX(5) | + S_490_PWS_ENABLE(1)); + radeon_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */ + radeon_emit(0); /* ADDRESS_LO */ + radeon_emit(0); /* ADDRESS_HI */ + radeon_emit(0); /* DATA_LO */ + radeon_emit(0); /* DATA_HI */ + radeon_emit(0); /* INT_CTXID */ + + /* Wait for the PWS counter. */ + radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + radeon_emit(S_580_PWS_STAGE_SEL(V_580_CP_ME) | + S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) | + S_580_PWS_ENA2(1) | + S_580_PWS_COUNT(0)); + radeon_emit(0xffffffff); /* GCR_SIZE */ + radeon_emit(0x01ffffff); /* GCR_SIZE_HI */ + radeon_emit(0); /* GCR_BASE_LO */ + radeon_emit(0); /* GCR_BASE_HI */ + radeon_emit(S_585_PWS_ENA(1)); + radeon_emit(0); /* GCR_CNTL */ + + assert((sscreen->attribute_ring->gpu_address >> 32) == sscreen->info.address32_hi); + + radeon_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4); + radeon_emit(0x12355123); /* SPI_GS_THROTTLE_CNTL1 */ + radeon_emit(0x1544D); /* SPI_GS_THROTTLE_CNTL2 */ + radeon_emit(sscreen->attribute_ring->gpu_address >> 16); /* SPI_ATTRIBUTE_RING_BASE */ + radeon_emit(S_03111C_MEM_SIZE((sscreen->info.attribute_ring_size_per_se >> 16) - 1) | + S_03111C_BIG_PAGE(sscreen->info.discardable_allows_big_page) | + S_03111C_L1_POLICY(1)); /* SPI_ATTRIBUTE_RING_SIZE */ + radeon_end(); + } +} + void si_init_shader_functions(struct si_context *sctx) { sctx->atoms.s.vgt_pipeline_state.emit = si_emit_vgt_pipeline_state; sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; sctx->atoms.s.tess_io_layout.emit = si_emit_tess_io_layout_state; + sctx->atoms.s.spi_ge_ring_state.emit = si_emit_spi_ge_ring_state; sctx->b.create_vs_state = si_create_shader; sctx->b.create_tcs_state = si_create_shader;