radeonsi: increase NGG workgroup size to 256 for VS/TES with streamout and GS

NGG streamout performance is limited by the workgroup size, so make it as
large as possible.

Since this uses si_get_max_workgroup_size() to set the NGG workgroup size,
the side effect is that all GS is also getting an increase to 256, which
is OK.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21403>
This commit is contained in:
Marek Olšák 2023-02-25 17:52:24 -05:00 committed by Marge Bot
parent 43fd552872
commit 461ccb00e1
5 changed files with 10 additions and 9 deletions

View file

@ -117,8 +117,9 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
gs_sel->screen->info.gfx_level >= GFX11 ? 3 : /* gfx11 requires at least 1 primitive per TG */
gs_sel->screen->info.gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim);
bool max_vert_out_per_gs_instance = false;
unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */
unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size;
unsigned max_gsprims_base, max_esverts_base;
max_gsprims_base = max_esverts_base = si_get_max_workgroup_size(shader);
if (gs_stage == MESA_SHADER_GEOMETRY) {
bool force_multi_cycling = false;

View file

@ -1413,8 +1413,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
}
}
sscreen->ngg_subgroup_size = 128;
if (sscreen->info.gfx_level >= GFX11) {
unsigned attr_ring_size = sscreen->info.attribute_ring_size_per_se * sscreen->info.max_se;
sscreen->attribute_ring = si_aligned_buffer_create(&sscreen->b,

View file

@ -694,8 +694,6 @@ struct si_screen {
* We want to minimize the impact on multithreaded Mesa. */
struct ac_llvm_compiler compiler_lowp[10];
unsigned ngg_subgroup_size;
struct util_idalloc_mt buffer_ids;
struct util_vertex_state_cache vertex_state_cache;

View file

@ -206,7 +206,11 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader)
switch (shader->selector->stage) {
case MESA_SHADER_VERTEX:
case MESA_SHADER_TESS_EVAL:
return shader->key.ge.as_ngg ? shader->selector->screen->ngg_subgroup_size : 0;
/* Use the largest workgroup size for streamout */
if (shader->key.ge.as_ngg)
return si_shader_uses_streamout(shader) ? 256 : 128;
else
return 0;
case MESA_SHADER_TESS_CTRL:
/* Return this so that LLVM doesn't remove s_barrier
@ -214,7 +218,7 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader)
return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0;
case MESA_SHADER_GEOMETRY:
/* ngg_subgroup_size is only the input size. GS can always generate up to 256 vertices. */
/* GS can always generate up to 256 vertices. */
return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : 0;
case MESA_SHADER_COMPUTE:

View file

@ -1058,7 +1058,7 @@ static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
shader->selector->info.writes_edgeflag;
}
static inline bool si_shader_uses_streamout(struct si_shader *shader)
static inline bool si_shader_uses_streamout(const struct si_shader *shader)
{
return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
shader->selector->info.enabled_streamout_buffer_mask &&