radeonsi: use SET_SH_REG_INDEX with index=3 for registers containing CU_EN

This matches PAL and RADV behavior. It's for preemption.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15098>
This commit is contained in:
Marek Olšák 2022-02-22 03:05:35 -05:00 committed by Marge Bot
parent 79a7ab642a
commit c8e2c6faf6
5 changed files with 99 additions and 39 deletions

View file

@ -117,11 +117,23 @@
radeon_emit(((reg) - SI_SH_REG_OFFSET) >> 2); \
} while (0)
/* Begin a PKT3_SET_SH_REG_INDEX packet for the SH register at byte address
 * "reg", passing "num" as the packet count.
 *
 * Two dwords are emitted: the packet header, then the dword offset of "reg"
 * relative to SI_SH_REG_OFFSET with the value 3 placed in bits 28 and up.
 * The register values themselves are NOT emitted here; the caller emits them
 * afterwards (radeon_set_sh_reg_idx3 emits exactly one).
 *
 * "reg" must lie in [SI_SH_REG_OFFSET, SI_SH_REG_END), which is asserted.
 * Index 3 is the variant used for registers containing CU_EN.
 *
 * NOTE(review): radeon_emit presumably requires an enclosing
 * radeon_begin()/radeon_end() pair - confirm against its definition.
 */
#define radeon_set_sh_reg_idx3_seq(reg, num) do { \
SI_CHECK_SHADOWED_REGS(reg, num); \
assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \
radeon_emit(PKT3(PKT3_SET_SH_REG_INDEX, num, 0)); \
radeon_emit((((reg) - SI_SH_REG_OFFSET) >> 2) | (3 << 28)); \
} while (0)
/* Write a single SH register: start a one-register sequence with
 * radeon_set_sh_reg_seq and then emit "value" as its only payload dword.
 */
#define radeon_set_sh_reg(reg, value) do { \
radeon_set_sh_reg_seq(reg, 1); \
radeon_emit(value); \
} while (0)
/* Write a single SH register through the index-3 packet variant: start a
 * one-register sequence with radeon_set_sh_reg_idx3_seq and then emit
 * "value" as its only payload dword.
 */
#define radeon_set_sh_reg_idx3(reg, value) do { \
radeon_set_sh_reg_idx3_seq(reg, 1); \
radeon_emit(value); \
} while (0)
#define radeon_set_uconfig_reg_seq(reg, num, perfctr) do { \
SI_CHECK_SHADOWED_REGS(reg, num); \
assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \
@ -247,6 +259,19 @@
} \
} while (0)
/* Set an SH register only when it differs from the tracked value.
 *
 * "offset" is the register address that gets emitted, "reg" is the slot in
 * sctx->tracked_regs used for bookkeeping, and "val" is the new value.
 * Nothing is emitted when the slot is already marked as saved and holds the
 * same value. Otherwise the register is written - with the index-3 packet on
 * GFX10 and newer, with the plain SH packet on older chips - and the tracked
 * value and saved bit are updated.
 */
#define radeon_opt_set_sh_reg_idx3(sctx, offset, reg, val) do { \
   unsigned __value = val; \
   if (!(sctx->tracked_regs.reg_saved & BITFIELD64_BIT(reg)) || \
       sctx->tracked_regs.reg_value[reg] != __value) { \
      if (sctx->chip_class < GFX10) \
         radeon_set_sh_reg(offset, __value); \
      else \
         radeon_set_sh_reg_idx3(offset, __value); \
      sctx->tracked_regs.reg_value[reg] = __value; \
      sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); \
   } \
} while (0)
#define radeon_opt_set_uconfig_reg(sctx, offset, reg, val) do { \
unsigned __value = val; \
if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \
@ -288,6 +313,14 @@ static inline void radeon_set_sh_reg_func(struct radeon_cmdbuf *cs, unsigned reg
radeon_end();
}
/* Function form of the radeon_set_sh_reg_idx3 macro.
 *
 * Wraps the macro in its own radeon_begin(cs)/radeon_end() pair so that the
 * register write can be handed around as a function pointer; callers cast it
 * to void (*)(void*, unsigned, uint32_t) and pass it to ac_set_reg_cu_en.
 *
 * cs:         command buffer to emit into
 * reg_offset: SH register address, forwarded unchanged to the macro
 * value:      value to write
 */
static inline void radeon_set_sh_reg_idx3_func(struct radeon_cmdbuf *cs, unsigned reg_offset,
uint32_t value)
{
radeon_begin(cs);
radeon_set_sh_reg_idx3(reg_offset, value);
radeon_end();
}
/* This should be evaluated at compile time if all parameters are constants. */
static ALWAYS_INLINE unsigned
si_get_user_data_base(enum chip_class chip_class, enum si_has_tess has_tess,

View file

@ -53,6 +53,27 @@ static void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate);
}
/* Append one register write to a PM4 state using an explicit packet opcode.
 *
 * state:  PM4 state being built
 * reg:    register byte offset relative to the start of the register range
 *         selected by "opcode"; it is converted to a dword offset here
 * val:    value to write
 * opcode: PKT3 opcode of the SET_*_REG packet to use
 * idx:    index field, stored in bits 28 and up of the register dword
 *
 * When the previous write used the same opcode and targeted the register
 * immediately before this one, the value is appended to that packet.
 * Otherwise a new packet is begun and its register dword (offset | idx) is
 * written first. In both cases the packet is re-finalized afterwards.
 */
static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
                                  unsigned opcode, unsigned idx)
{
   const unsigned reg_dw = reg >> 2;

   /* Lazily fall back to the size of the embedded array. */
   if (state->max_dw == 0)
      state->max_dw = ARRAY_SIZE(state->pm4);

   assert(state->ndw + 2 <= state->max_dw);

   const bool extends_previous_packet =
      opcode == state->last_opcode && reg_dw == (state->last_reg + 1);

   if (!extends_previous_packet) {
      si_pm4_cmd_begin(state, opcode);
      state->pm4[state->ndw++] = reg_dw | (idx << 28);
   }

   assert(reg_dw <= UINT16_MAX);
   state->last_reg = reg_dw;
   state->pm4[state->ndw++] = val;
   si_pm4_cmd_end(state, false);
}
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
unsigned opcode;
@ -80,22 +101,14 @@ void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
return;
}
reg >>= 2;
if (!state->max_dw)
state->max_dw = ARRAY_SIZE(state->pm4);
assert(state->ndw + 2 <= state->max_dw);
if (opcode != state->last_opcode || reg != (state->last_reg + 1)) {
si_pm4_cmd_begin(state, opcode);
state->pm4[state->ndw++] = reg;
si_pm4_set_reg_custom(state, reg, val, opcode, 0);
}
assert(reg <= UINT16_MAX);
state->last_reg = reg;
state->pm4[state->ndw++] = val;
si_pm4_cmd_end(state, false);
/* Append an SH register write that uses PKT3_SET_SH_REG_INDEX with index 3.
 *
 * state: PM4 state being built
 * reg:   full SH register address; SI_SH_REG_OFFSET is subtracted before the
 *        write is recorded
 * val:   value to write
 */
void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
   const unsigned sh_index = 3;
   unsigned sh_relative_reg;

   SI_CHECK_SHADOWED_REGS(reg, 1);

   sh_relative_reg = reg - SI_SH_REG_OFFSET;
   si_pm4_set_reg_custom(state, sh_relative_reg, val, PKT3_SET_SH_REG_INDEX, sh_index);
}
void si_pm4_clear_state(struct si_pm4_state *state)

View file

@ -61,6 +61,7 @@ struct si_pm4_state {
void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
/* Append an SH register write using PKT3_SET_SH_REG_INDEX with index 3.
 * "reg" is the full register address; the implementation subtracts
 * SI_SH_REG_OFFSET.
 */
void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val);
void si_pm4_clear_state(struct si_pm4_state *state);
void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx);

View file

@ -5476,8 +5476,10 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
if (sctx->chip_class >= GFX7) {
ac_set_reg_cu_en(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F),
C_00B01C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
S_00B01C_CU_EN(cu_mask_ps) |
S_00B01C_WAVE_LIMIT(0x3F),
C_00B01C_CU_EN, 0, &sscreen->info,
(void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
}
if (sctx->chip_class <= GFX8) {
@ -5514,11 +5516,13 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) {
ac_set_reg_cu_en(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F),
C_00B51C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
C_00B51C_CU_EN, 0, &sscreen->info,
(void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
ac_set_reg_cu_en(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F),
C_00B31C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
C_00B31C_CU_EN, 0, &sscreen->info,
(void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
/* If this is 0, Bonaire can hang even if GS isn't being used.
* Other chips are unaffected. These are suboptimal values,
@ -5560,7 +5564,8 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
if (sctx->chip_class >= GFX9) {
ac_set_reg_cu_en(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), C_00B41C_CU_EN,
0, &sscreen->info, (void*)si_pm4_set_reg);
0, &sscreen->info,
(void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
S_028B50_ACCUM_ISOLINE(12) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) |
@ -5579,11 +5584,14 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
if (sctx->chip_class >= GFX10) {
/* Logical CUs 16 - 31 */
ac_set_reg_cu_en(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16),
C_00B004_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg);
C_00B004_CU_EN, 16, &sscreen->info,
(void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
ac_set_reg_cu_en(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff),
C_00B104_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg);
C_00B104_CU_EN, 16, &sscreen->info,
(void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg);
C_00B404_CU_EN, 16, &sscreen->info,
(void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0);
si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0);

View file

@ -965,25 +965,27 @@ static void si_emit_shader_gs(struct si_context *sctx)
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs,
C_00B21C_CU_EN, 0, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func);
(void (*)(void*, unsigned, uint32_t))
(sctx->chip_class >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS);
}
if (sctx->chip_class >= GFX10) {
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs,
C_00B204_CU_EN, 16, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func);
(void (*)(void*, unsigned, uint32_t))
(sctx->chip_class >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS);
}
} else {
radeon_begin_again(&sctx->gfx_cs);
if (sctx->chip_class >= GFX7) {
radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs);
}
if (sctx->chip_class >= GFX10) {
radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs);
}
@ -1192,18 +1194,20 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs,
C_00B21C_CU_EN, 0, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func);
(void (*)(void*, unsigned, uint32_t))
(sctx->chip_class >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs,
C_00B204_CU_EN, 16, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func);
(void (*)(void*, unsigned, uint32_t))
(sctx->chip_class >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS) &
~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS);
} else {
radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs);
radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs);
radeon_end();
@ -1674,7 +1678,8 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
ac_set_reg_cu_en(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F),
C_00B118_CU_EN, 0, &sscreen->info,
(void (*)(void*, unsigned, uint32_t))si_pm4_set_reg);
(void (*)(void*, unsigned, uint32_t))
(sscreen->info.chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
}