radeonsi: apply spi_cu_en to CU_EN

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14122>
This commit is contained in:
Marek Olšák 2021-12-08 02:15:50 -05:00
parent b06b481dfe
commit 384014bebe
6 changed files with 104 additions and 32 deletions

View file

@ -581,3 +581,21 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
return CLAMP(workgroup_size, 1, 256);
}
void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask,
unsigned value_shift, const struct radeon_info *info,
void set_sh_reg(void*, unsigned, uint32_t))
{
/* Register field position and mask. */
uint32_t cu_en_mask = ~clear_mask;
unsigned cu_en_shift = ffs(cu_en_mask) - 1;
/* The value being set. */
uint32_t cu_en = (value & cu_en_mask) >> cu_en_shift;
/* AND the field by spi_cu_en. */
uint32_t spi_cu_en = info->spi_cu_en >> value_shift;
uint32_t new_value = (value & ~cu_en_mask) |
(((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
set_sh_reg(cs, reg_offset, new_value);
}

View file

@ -118,6 +118,10 @@ unsigned ac_compute_esgs_workgroup_size(enum chip_class chip_class, unsigned wav
unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
unsigned max_vtx_out, unsigned prim_amp_factor);
void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask,
unsigned value_shift, const struct radeon_info *info,
void set_sh_reg(void*, unsigned, uint32_t));
#ifdef __cplusplus
}
#endif

View file

@ -279,6 +279,15 @@
radeon_emit_32bit_pointer(sctx->screen, (desc)->gpu_address); \
} while (0)
/* Wrappers that are only used when they are passed as function pointers. */
static inline void radeon_set_sh_reg_func(struct radeon_cmdbuf *cs, unsigned reg_offset,
uint32_t value)
{
radeon_begin(cs);
radeon_set_sh_reg(reg_offset, value);
radeon_end();
}
/* This should be evaluated at compile time if all parameters are constants. */
static ALWAYS_INLINE unsigned
si_get_user_data_base(enum chip_class chip_class, enum si_has_tess has_tess,

View file

@ -31,7 +31,8 @@
extern "C" {
#endif
#define SI_PM4_MAX_DW 176
/* TODO: This is high because of cs_preamble with ac_set_reg_cu_en. */
#define SI_PM4_MAX_DW 480
// forward defines
struct si_context;

View file

@ -5467,8 +5467,9 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
cu_mask_ps = u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa);
if (sctx->chip_class >= GFX7) {
si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F));
ac_set_reg_cu_en(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F),
C_00B01C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
}
if (sctx->chip_class <= GFX8) {
@ -5503,11 +5504,13 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
}
if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) {
si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
ac_set_reg_cu_en(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F),
C_00B51C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
ac_set_reg_cu_en(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F),
C_00B31C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
/* If this is 0, Bonaire can hang even if GS isn't being used.
* Other chips are unaffected. These are suboptimal values,
@ -5547,8 +5550,9 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
}
if (sctx->chip_class >= GFX9) {
si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
ac_set_reg_cu_en(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), C_00B41C_CU_EN,
0, &sscreen->info, (void*)si_pm4_set_reg);
si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) |
@ -5566,9 +5570,12 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
if (sctx->chip_class >= GFX10) {
/* Logical CUs 16 - 31 */
si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16));
si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff));
si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff));
ac_set_reg_cu_en(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16),
C_00B004_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg);
ac_set_reg_cu_en(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff),
C_00B104_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg);
ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg);
si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0);
si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0);

View file

@ -919,18 +919,35 @@ static void si_emit_shader_gs(struct si_context *sctx)
radeon_end_update_context_roll(sctx);
/* These don't cause any context rolls. */
radeon_begin_again(&sctx->gfx_cs);
if (sctx->chip_class >= GFX7) {
radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs);
if (sctx->screen->info.spi_cu_en_has_effect) {
if (sctx->chip_class >= GFX7) {
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs,
C_00B21C_CU_EN, 0, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func);
sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS);
}
if (sctx->chip_class >= GFX10) {
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs,
C_00B204_CU_EN, 16, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func);
sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS);
}
} else {
radeon_begin_again(&sctx->gfx_cs);
if (sctx->chip_class >= GFX7) {
radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs);
}
if (sctx->chip_class >= GFX10) {
radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs);
}
radeon_end();
}
if (sctx->chip_class >= GFX10) {
radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs);
}
radeon_end();
}
static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
@ -1129,13 +1146,27 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
radeon_begin_again(&sctx->gfx_cs);
radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
shader->ctx_reg.ngg.ge_pc_alloc);
radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs);
radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs);
radeon_end();
if (sctx->screen->info.spi_cu_en_has_effect) {
radeon_end();
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs,
C_00B21C_CU_EN, 0, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func);
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs,
C_00B204_CU_EN, 16, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func);
sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS) &
~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS);
} else {
radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs);
radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs);
radeon_end();
}
}
static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
@ -1599,8 +1630,10 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
if (sscreen->info.chip_class >= GFX7) {
si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
ac_set_reg_cu_en(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F),
C_00B118_CU_EN, 0, &sscreen->info,
(void (*)(void*, unsigned, uint32_t))si_pm4_set_reg);
si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
}