From edb5fa4d59de8ef68deaedb27ca2798b57bc28a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 18 Aug 2021 13:05:16 -0400 Subject: [PATCH] radeonsi: eliminate redundant SPI_SHADER_PGM_RSRC3/4_GS register writes They don't change much. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_build_pm4.h | 10 ++++ src/gallium/drivers/radeonsi/si_gfx_cs.c | 2 +- src/gallium/drivers/radeonsi/si_shader.h | 4 ++ src/gallium/drivers/radeonsi/si_state.h | 3 ++ .../drivers/radeonsi/si_state_shaders.c | 49 +++++++++++++------ 5 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index b96c9201fb7..40e430bd407 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -259,6 +259,16 @@ } \ } while (0) +#define radeon_opt_set_sh_reg(sctx, offset, reg, val) do { \ + unsigned __value = val; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + radeon_set_sh_reg(cs, offset, __value); \ + sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ +} while (0) + #define radeon_set_privileged_config_reg(cs, reg, value) do { \ assert((reg) < CIK_UCONFIG_REG_OFFSET); \ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); \ diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 1081b238bcd..80e9f760e09 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -294,7 +294,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */ /* Set all cleared context registers to saved. */ - ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */ + ctx->tracked_regs.reg_saved = BITFIELD64_MASK(SI_TRACKED_GE_PC_ALLOC); ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 0bbd20157e0..fa32c8ed705 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -839,6 +839,8 @@ struct si_shader { unsigned vgt_gs_onchip_cntl; unsigned vgt_gs_max_prims_per_subgroup; unsigned vgt_esgs_ring_itemsize; + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; } gs; struct { @@ -855,6 +857,8 @@ struct si_shader { unsigned pa_cl_ngg_cntl; unsigned vgt_gs_max_vert_out; /* for API GS */ unsigned ge_pc_alloc; /* uconfig register */ + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; union si_vgt_stages_key vgt_stages; } ngg; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 34fbf43f90a..cd1bd632851 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -342,7 +342,10 @@ enum si_tracked_reg SI_TRACKED_VGT_TF_PARAM, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + /* Non-context registers: */ SI_TRACKED_GE_PC_ALLOC, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, SI_NUM_TRACKED_REGS, }; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 636c3184236..66a68711cba 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -819,6 +819,20 @@ static void si_emit_shader_gs(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); } radeon_end_update_context_roll(sctx); + + /* These don't cause any context rolls. */ + radeon_begin_again(&sctx->gfx_cs); + if (sctx->chip_class >= GFX7) { + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs); + } + if (sctx->chip_class >= GFX10) { + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs); + } + radeon_end(); } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -923,13 +937,11 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); - } + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0); shader->ctx_reg.gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | @@ -944,10 +956,9 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, shader); } else { - if (sscreen->info.chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); - } + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8)); @@ -1029,6 +1040,15 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); + + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs); + radeon_end(); } static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) @@ -1218,12 +1238,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) | S_00B22C_LDS_SIZE(shader->config.lds_size)); - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg( - pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); nparams = MAX2(shader->info.nr_param_exports, 1); shader->ctx_reg.ngg.spi_vs_out_config =