amd: massively simplify how info->spi_cu_en is applied

Instead of having ac_set_reg_cu_en that sets the register, replace it with
ac_apply_cu_en that only returns the modified register value,
which allows a large simplification in both drivers because a lot of code
becomes duplicated after it's switched to ac_apply_cu_en.

RADV also didn't apply it to a few registers. Fixed.

This removes 82 lines of code in total.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21641>
This commit is contained in:
Marek Olšák 2023-02-21 12:22:38 -05:00 committed by Marge Bot
parent 2b3f551ed8
commit ccaaf8fe04
10 changed files with 145 additions and 227 deletions

View file

@ -958,9 +958,8 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
return CLAMP(workgroup_size, 1, 256);
}
void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask,
unsigned value_shift, const struct radeon_info *info,
void set_sh_reg(void*, unsigned, uint32_t))
uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
const struct radeon_info *info)
{
/* Register field position and mask. */
uint32_t cu_en_mask = ~clear_mask;
@ -970,10 +969,8 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl
/* AND the field by spi_cu_en. */
uint32_t spi_cu_en = info->spi_cu_en >> value_shift;
uint32_t new_value = (value & ~cu_en_mask) |
(((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
set_sh_reg(cs, reg_offset, new_value);
return (value & ~cu_en_mask) |
(((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
}
/* Return the register value and tune bytes_per_wave to increase scratch performance. */

View file

@ -166,9 +166,8 @@ unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned w
unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
unsigned max_vtx_out, unsigned prim_amp_factor);
void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask,
unsigned value_shift, const struct radeon_info *info,
void set_sh_reg(void*, unsigned, uint32_t));
uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
const struct radeon_info *info);
void ac_get_scratch_tmpring_size(const struct radeon_info *info,
unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,

View file

@ -117,17 +117,6 @@ radeon_set_sh_reg_idx(const struct radv_physical_device *pdevice, struct radeon_
radeon_emit(cs, value);
}
static inline void
gfx10_set_sh_reg_idx3(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
assert(cs->cdw + 3 <= cs->max_dw);
radeon_emit(cs, PKT3(PKT3_SET_SH_REG_INDEX, 1, 0));
radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2 | (3 << 28));
radeon_emit(cs, value);
}
static inline void
radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{

View file

@ -3853,15 +3853,10 @@ radv_pipeline_emit_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
&late_alloc_wave64, &cu_mask);
if (pdevice->rad_info.gfx_level >= GFX7) {
if (pdevice->rad_info.gfx_level >= GFX10) {
ac_set_reg_cu_en(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F),
C_00B118_CU_EN, 0, &pdevice->rad_info,
(void*)gfx10_set_sh_reg_idx3);
} else {
radeon_set_sh_reg_idx(pdevice, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
}
radeon_set_sh_reg_idx(pdevice, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
ac_apply_cu_en(S_00B118_CU_EN(cu_mask) |
S_00B118_WAVE_LIMIT(0x3F),
C_00B118_CU_EN, 0, &pdevice->rad_info));
radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
}
if (pdevice->rad_info.gfx_level >= GFX10) {
@ -4032,28 +4027,21 @@ radv_pipeline_emit_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs
ac_compute_late_alloc(&pdevice->rad_info, true, shader->info.has_ngg_culling,
shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
radeon_set_sh_reg_idx(pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) |
S_00B21C_WAVE_LIMIT(0x3F),
C_00B21C_CU_EN, 0, &pdevice->rad_info));
if (pdevice->rad_info.gfx_level >= GFX11) {
/* TODO: figure out how S_00B204_CU_EN_GFX11 interacts with ac_set_reg_cu_en */
gfx10_set_sh_reg_idx3(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
gfx10_set_sh_reg_idx3(
cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
S_00B204_CU_EN_GFX11(0x1) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
} else if (pdevice->rad_info.gfx_level >= GFX10) {
ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F),
C_00B21C_CU_EN, 0, &pdevice->rad_info, (void*)gfx10_set_sh_reg_idx3);
ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info,
(void*)gfx10_set_sh_reg_idx3);
radeon_set_sh_reg_idx(pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) |
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
C_00B204_CU_EN_GFX11, 16, &pdevice->rad_info));
} else {
radeon_set_sh_reg_idx(
pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
radeon_set_sh_reg_idx(
pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
radeon_set_sh_reg_idx(pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) |
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info));
}
uint32_t oversub_pc_lines = late_alloc_wave64 ? pdevice->rad_info.pc_lines / 4 : 0;
@ -4213,25 +4201,16 @@ radv_pipeline_emit_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
radeon_emit(cs, gs->config.rsrc2);
}
if (pdevice->rad_info.gfx_level >= GFX10) {
ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F),
C_00B21C_CU_EN, 0, &pdevice->rad_info,
(void*)gfx10_set_sh_reg_idx3);
ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info,
(void*)gfx10_set_sh_reg_idx3);
} else if (pdevice->rad_info.gfx_level >= GFX7) {
radeon_set_sh_reg_idx(
pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
radeon_set_sh_reg_idx(pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
S_00B21C_WAVE_LIMIT(0x3F),
C_00B21C_CU_EN, 0, &pdevice->rad_info));
if (pdevice->rad_info.gfx_level >= GFX10) {
radeon_set_sh_reg_idx(
pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
}
if (pdevice->rad_info.gfx_level >= GFX10) {
radeon_set_sh_reg_idx(pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) |
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info));
}
radv_pipeline_emit_hw_vs(ctx_cs, cs, pipeline, pipeline->base.gs_copy_shader);

View file

@ -353,34 +353,35 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs)
if (physical_device->rad_info.gfx_level >= GFX10 &&
physical_device->rad_info.gfx_level < GFX11) {
/* Logical CUs 16 - 31 */
ac_set_reg_cu_en(cs, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff),
C_00B104_CU_EN, 16, &physical_device->rad_info,
(void*)gfx10_set_sh_reg_idx3);
radeon_set_sh_reg_idx(physical_device, cs, R_00B104_SPI_SHADER_PGM_RSRC4_VS, 3,
ac_apply_cu_en(S_00B104_CU_EN(0xffff),
C_00B104_CU_EN, 16, &physical_device->rad_info));
}
if (physical_device->rad_info.gfx_level >= GFX10) {
ac_set_reg_cu_en(cs, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &physical_device->rad_info,
(void*)gfx10_set_sh_reg_idx3);
ac_set_reg_cu_en(cs, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16),
C_00B004_CU_EN, 16, &physical_device->rad_info,
(void*)gfx10_set_sh_reg_idx3);
radeon_set_sh_reg_idx(physical_device, cs, R_00B404_SPI_SHADER_PGM_RSRC4_HS, 3,
ac_apply_cu_en(S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &physical_device->rad_info));
radeon_set_sh_reg_idx(physical_device, cs, R_00B004_SPI_SHADER_PGM_RSRC4_PS, 3,
ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16),
C_00B004_CU_EN, 16, &physical_device->rad_info));
}
if (physical_device->rad_info.gfx_level >= GFX10) {
ac_set_reg_cu_en(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F),
C_00B41C_CU_EN, 0, &physical_device->rad_info,
(void*)gfx10_set_sh_reg_idx3);
} else if (physical_device->rad_info.gfx_level == GFX9) {
if (physical_device->rad_info.gfx_level >= GFX9) {
radeon_set_sh_reg_idx(physical_device, cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 3,
S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
ac_apply_cu_en(S_00B41C_CU_EN(0xffff) |
S_00B41C_WAVE_LIMIT(0x3F),
C_00B41C_CU_EN, 0, &physical_device->rad_info));
} else {
radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
ac_apply_cu_en(S_00B51C_CU_EN(0xffff) |
S_00B51C_WAVE_LIMIT(0x3F),
C_00B51C_CU_EN, 0, &physical_device->rad_info));
radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
ac_apply_cu_en(S_00B31C_CU_EN(0xffff) |
S_00B31C_WAVE_LIMIT(0x3F),
C_00B31C_CU_EN, 0, &physical_device->rad_info));
/* If this is 0, Bonaire can hang even if GS isn't being used.
* Other chips are unaffected. These are suboptimal values,
* but we don't use on-chip GS.
@ -389,16 +390,11 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs)
S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
}
if (physical_device->rad_info.gfx_level >= GFX10) {
ac_set_reg_cu_en(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F) |
S_00B01C_LDS_GROUP_SIZE(physical_device->rad_info.gfx_level >= GFX11),
C_00B01C_CU_EN, 0, &physical_device->rad_info,
(void*)gfx10_set_sh_reg_idx3);
} else {
radeon_set_sh_reg_idx(physical_device, cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, 3,
S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F));
}
radeon_set_sh_reg_idx(physical_device, cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, 3,
ac_apply_cu_en(S_00B01C_CU_EN(cu_mask_ps) |
S_00B01C_WAVE_LIMIT(0x3F) |
S_00B01C_LDS_GROUP_SIZE(physical_device->rad_info.gfx_level >= GFX11),
C_00B01C_CU_EN, 0, &physical_device->rad_info));
}
if (physical_device->rad_info.gfx_level >= GFX10) {

View file

@ -125,11 +125,11 @@
radeon_emit(((reg) - SI_SH_REG_OFFSET) >> 2); \
} while (0)
#define radeon_set_sh_reg_idx3_seq(reg, num) do { \
#define radeon_set_sh_reg_idx3_seq(sctx, reg, num) do { \
SI_CHECK_SHADOWED_REGS(reg, num); \
assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \
radeon_emit(PKT3(PKT3_SET_SH_REG_INDEX, num, 0)); \
radeon_emit((((reg) - SI_SH_REG_OFFSET) >> 2) | (3 << 28)); \
radeon_emit((((reg) - SI_SH_REG_OFFSET) >> 2) | ((sctx)->gfx_level >= GFX10 ? 3 << 28 : 0)); \
} while (0)
#define radeon_set_sh_reg(reg, value) do { \
@ -137,8 +137,8 @@
radeon_emit(value); \
} while (0)
#define radeon_set_sh_reg_idx3(reg, value) do { \
radeon_set_sh_reg_idx3_seq(reg, 1); \
#define radeon_set_sh_reg_idx3(sctx, reg, value) do { \
radeon_set_sh_reg_idx3_seq(sctx, reg, 1); \
radeon_emit(value); \
} while (0)
@ -297,10 +297,7 @@
unsigned __value = val; \
if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \
sctx->tracked_regs.reg_value[reg] != __value) { \
if (sctx->gfx_level >= GFX10) \
radeon_set_sh_reg_idx3(offset, __value); \
else \
radeon_set_sh_reg(offset, __value); \
radeon_set_sh_reg_idx3(sctx, offset, __value); \
sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); \
sctx->tracked_regs.reg_value[reg] = __value; \
} \
@ -338,23 +335,6 @@
radeon_emit_32bit_pointer(sctx->screen, (desc)->gpu_address); \
} while (0)
/* Wrappers that are only used when they are passed as function pointers. */
static inline void radeon_set_sh_reg_func(struct radeon_cmdbuf *cs, unsigned reg_offset,
uint32_t value)
{
radeon_begin(cs);
radeon_set_sh_reg(reg_offset, value);
radeon_end();
}
static inline void radeon_set_sh_reg_idx3_func(struct radeon_cmdbuf *cs, unsigned reg_offset,
uint32_t value)
{
radeon_begin(cs);
radeon_set_sh_reg_idx3(reg_offset, value);
radeon_end();
}
/* This should be evaluated at compile time if all parameters are constants. */
static ALWAYS_INLINE unsigned
si_get_user_data_base(enum amd_gfx_level gfx_level, enum si_has_tess has_tess,

View file

@ -104,11 +104,13 @@ void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
si_pm4_set_reg_custom(state, reg, val, opcode, 0);
}
void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val)
void si_pm4_set_reg_idx3(struct si_screen *sscreen, struct si_pm4_state *state,
unsigned reg, uint32_t val)
{
SI_CHECK_SHADOWED_REGS(reg, 1);
si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX,
sscreen->info.gfx_level >= GFX10 ? 3 : 0);
}
void si_pm4_clear_state(struct si_pm4_state *state)

View file

@ -31,7 +31,8 @@
extern "C" {
#endif
// forward defines
/* forward definitions */
struct si_screen;
struct si_context;
/* State atoms are callbacks which write a sequence of packets into a GPU
@ -64,7 +65,8 @@ struct si_pm4_state {
void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val);
void si_pm4_set_reg_idx3(struct si_screen *sscreen, struct si_pm4_state *state,
unsigned reg, uint32_t val);
void si_pm4_clear_state(struct si_pm4_state *state);
void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx);

View file

@ -5671,12 +5671,11 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
cu_mask_ps = gfx103_get_cu_mask_ps(sscreen);
if (sctx->gfx_level >= GFX7) {
ac_set_reg_cu_en(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
S_00B01C_CU_EN(cu_mask_ps) |
S_00B01C_WAVE_LIMIT(0x3F) |
S_00B01C_LDS_GROUP_SIZE(sctx->gfx_level >= GFX11),
C_00B01C_CU_EN, 0, &sscreen->info,
(void*)(sctx->gfx_level >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
si_pm4_set_reg_idx3(sscreen, pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
ac_apply_cu_en(S_00B01C_CU_EN(cu_mask_ps) |
S_00B01C_WAVE_LIMIT(0x3F) |
S_00B01C_LDS_GROUP_SIZE(sctx->gfx_level >= GFX11),
C_00B01C_CU_EN, 0, &sscreen->info));
}
if (sctx->gfx_level <= GFX8) {
@ -5711,13 +5710,13 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
}
if (sctx->gfx_level >= GFX7 && sctx->gfx_level <= GFX8) {
ac_set_reg_cu_en(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F),
C_00B51C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
ac_apply_cu_en(S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F),
C_00B51C_CU_EN, 0, &sscreen->info));
si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
ac_set_reg_cu_en(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F),
C_00B31C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
ac_apply_cu_en(S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F),
C_00B31C_CU_EN, 0, &sscreen->info));
/* If this is 0, Bonaire can hang even if GS isn't being used.
* Other chips are unaffected. These are suboptimal values,
@ -5774,10 +5773,9 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
}
if (sctx->gfx_level >= GFX9) {
ac_set_reg_cu_en(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), C_00B41C_CU_EN,
0, &sscreen->info,
(void*)(sctx->gfx_level >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
si_pm4_set_reg_idx3(sscreen, pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
ac_apply_cu_en(S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F),
C_00B41C_CU_EN, 0, &sscreen->info));
si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
@ -5885,12 +5883,15 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
if (sctx->gfx_level >= GFX10 && sctx->gfx_level <= GFX10_3) {
/* Logical CUs 16 - 31 */
ac_set_reg_cu_en(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16),
C_00B004_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg_idx3);
ac_set_reg_cu_en(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff),
C_00B104_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg_idx3);
ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg_idx3);
si_pm4_set_reg_idx3(sscreen, pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16),
C_00B004_CU_EN, 16, &sscreen->info));
si_pm4_set_reg_idx3(sscreen, pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS,
ac_apply_cu_en(S_00B104_CU_EN(0xffff),
C_00B104_CU_EN, 16, &sscreen->info));
si_pm4_set_reg_idx3(sscreen, pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
ac_apply_cu_en(S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &sscreen->info));
si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0);

View file

@ -713,11 +713,10 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
if (sscreen->info.gfx_level >= GFX9) {
if (sscreen->info.gfx_level >= GFX11) {
ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &sscreen->info,
(void (*)(void*, unsigned, uint32_t))si_pm4_set_reg_idx3);
si_pm4_set_reg_idx3(sscreen, pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
ac_apply_cu_en(S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &sscreen->info));
}
if (sscreen->info.gfx_level >= GFX10) {
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
@ -983,37 +982,18 @@ static void si_emit_shader_gs(struct si_context *sctx)
radeon_end_update_context_roll(sctx);
/* These don't cause any context rolls. */
if (sctx->screen->info.spi_cu_en_has_effect) {
if (sctx->gfx_level >= GFX7) {
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
shader->gs.spi_shader_pgm_rsrc3_gs,
C_00B21C_CU_EN, 0, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))
(sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS);
}
if (sctx->gfx_level >= GFX10) {
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
shader->gs.spi_shader_pgm_rsrc4_gs,
C_00B204_CU_EN_GFX10, 16, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))
(sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS);
}
} else {
radeon_begin_again(&sctx->gfx_cs);
if (sctx->gfx_level >= GFX7) {
radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->gs.spi_shader_pgm_rsrc3_gs);
}
if (sctx->gfx_level >= GFX10) {
radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->gs.spi_shader_pgm_rsrc4_gs);
}
radeon_end();
radeon_begin_again(&sctx->gfx_cs);
if (sctx->gfx_level >= GFX7) {
radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->gs.spi_shader_pgm_rsrc3_gs);
}
if (sctx->gfx_level >= GFX10) {
radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->gs.spi_shader_pgm_rsrc4_gs);
}
radeon_end();
}
static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
@ -1125,10 +1105,14 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
shader->gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) |
S_00B21C_WAVE_LIMIT(0x3F);
shader->gs.spi_shader_pgm_rsrc4_gs = S_00B204_CU_EN_GFX10(0xffff) |
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0);
shader->gs.spi_shader_pgm_rsrc3_gs =
ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
S_00B21C_WAVE_LIMIT(0x3F),
C_00B21C_CU_EN, 0, &sscreen->info);
shader->gs.spi_shader_pgm_rsrc4_gs =
ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) |
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
C_00B204_CU_EN_GFX10, 16, &sscreen->info);
shader->gs.vgt_gs_onchip_cntl =
S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
@ -1143,8 +1127,10 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
polaris_set_vgt_vertex_reuse(sscreen, shader->key.ge.part.gs.es, shader);
} else {
shader->gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) |
S_00B21C_WAVE_LIMIT(0x3F);
shader->gs.spi_shader_pgm_rsrc3_gs =
ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
S_00B21C_WAVE_LIMIT(0x3F),
C_00B21C_CU_EN, 0, &sscreen->info);
si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
@ -1218,30 +1204,13 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
radeon_begin_again(&sctx->gfx_cs);
radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
shader->ngg.ge_pc_alloc);
if (sctx->screen->info.spi_cu_en_has_effect) {
radeon_end();
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
shader->ngg.spi_shader_pgm_rsrc3_gs,
C_00B21C_CU_EN, 0, &sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))
(sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
shader->ngg.spi_shader_pgm_rsrc4_gs,
sctx->gfx_level >= GFX11 ? C_00B204_CU_EN_GFX11 : C_00B204_CU_EN_GFX10, 16,
&sctx->screen->info,
(void (*)(void*, unsigned, uint32_t))
(sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS) &
~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS);
} else {
radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ngg.spi_shader_pgm_rsrc3_gs);
radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ngg.spi_shader_pgm_rsrc4_gs);
radeon_end();
}
radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ngg.spi_shader_pgm_rsrc3_gs);
radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ngg.spi_shader_pgm_rsrc4_gs);
radeon_end();
}
static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
@ -1439,15 +1408,21 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
S_00B22C_LDS_SIZE(shader->config.lds_size));
shader->ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) |
S_00B21C_WAVE_LIMIT(0x3F);
shader->ngg.spi_shader_pgm_rsrc3_gs =
ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) |
S_00B21C_WAVE_LIMIT(0x3F),
C_00B21C_CU_EN, 0, &sscreen->info);
if (sscreen->info.gfx_level >= GFX11) {
shader->ngg.spi_shader_pgm_rsrc4_gs =
S_00B204_CU_EN_GFX11(0x1) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64) |
S_00B204_INST_PREF_SIZE(si_get_shader_prefetch_size(shader));
ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) |
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64) |
S_00B204_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)),
C_00B204_CU_EN_GFX11, 16, &sscreen->info);
} else {
shader->ngg.spi_shader_pgm_rsrc4_gs =
S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64);
ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) |
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
C_00B204_CU_EN_GFX10, 16, &sscreen->info);
}
nparams = MAX2(shader->info.nr_param_exports, 1);
@ -1727,11 +1702,10 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
if (sscreen->info.gfx_level >= GFX7) {
ac_set_reg_cu_en(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F),
C_00B118_CU_EN, 0, &sscreen->info,
(void (*)(void*, unsigned, uint32_t))
(sscreen->info.gfx_level >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
si_pm4_set_reg_idx3(sscreen, pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
ac_apply_cu_en(S_00B118_CU_EN(cu_mask) |
S_00B118_WAVE_LIMIT(0x3F),
C_00B118_CU_EN, 0, &sscreen->info));
si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
}
@ -2049,11 +2023,10 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
if (sscreen->info.gfx_level >= GFX11) {
unsigned cu_mask_ps = gfx103_get_cu_mask_ps(sscreen);
ac_set_reg_cu_en(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
S_00B004_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
S_00B004_CU_EN(cu_mask_ps >> 16),
C_00B004_CU_EN, 16, &sscreen->info,
(void (*)(void*, unsigned, uint32_t))si_pm4_set_reg_idx3);
si_pm4_set_reg_idx3(sscreen, pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16) |
S_00B004_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)),
C_00B004_CU_EN, 16, &sscreen->info));
}
}