ac/nir: allow smaller workgroups for GS
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

It's not good for performance, but it's possible to use for debugging.
Running single-wave GS workgroups could work around any LDS race conditions.

Setting the workgroup size to 64 reliably works around
GLCTS *primitive_counter*line failures, indicating streamout data
corruption with multi-wave GS workgroups.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38328>
This commit is contained in:
Marek Olšák 2025-11-08 12:25:50 -05:00 committed by Marge Bot
parent ff8df8712e
commit 9bd2c6dcb2
4 changed files with 22 additions and 14 deletions

View file

@ -1349,9 +1349,11 @@ ac_legacy_gs_compute_subgroup_info(enum mesa_prim input_prim, unsigned gs_vertic
*/
bool
ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, mesa_shader_stage es_stage, bool is_gs,
enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations,
unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride,
unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
enum mesa_prim input_prim, unsigned gs_vertices_out,
unsigned gs_invocations, unsigned target_workgroup_size,
unsigned max_workgroup_size, unsigned wave_size,
unsigned esgs_vertex_stride, unsigned ngg_lds_vertex_size,
unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
unsigned max_esgs_lds_padding, ac_ngg_subgroup_info *out)
{
const unsigned gs_num_invocations = MAX2(gs_invocations, 1);
@ -1373,16 +1375,19 @@ ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, mesa_shader_stage es_
bool max_vert_out_per_gs_instance = false;
unsigned max_gsprims_base, max_esverts_base;
max_gsprims_base = max_esverts_base = max_workgroup_size;
/* In the worst case, we can run 1 GS invocation per workgroup. */
assert(!is_gs || gs_vertices_out <= max_workgroup_size);
max_gsprims_base = max_esverts_base = target_workgroup_size;
if (is_gs) {
bool force_multi_cycling = false;
unsigned max_out_verts_per_gsprim = gs_vertices_out * gs_num_invocations;
retry_select_mode:
if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
if (max_out_verts_per_gsprim <= max_workgroup_size && !force_multi_cycling) {
if (max_out_verts_per_gsprim) {
max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
max_gsprims_base = MIN2(max_gsprims_base, max_workgroup_size / max_out_verts_per_gsprim);
}
} else {
/* Use special multi-cycling mode in which each GS
@ -1483,7 +1488,7 @@ retry_select_mode:
: is_gs
? max_gsprims * gs_num_invocations * gs_vertices_out
: max_esverts;
assert(max_out_vertices <= 256);
assert(max_out_vertices <= max_workgroup_size);
out->hw_max_esverts = max_esverts;
out->max_gsprims = max_gsprims;
@ -1504,6 +1509,6 @@ retry_select_mode:
/* If asserts are disabled, we use the same conditions to return false */
return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
max_out_vertices <= 256 &&
max_out_vertices <= max_workgroup_size &&
out->hw_max_esverts >= min_esverts;
}

View file

@ -336,9 +336,11 @@ typedef struct {
bool
ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, mesa_shader_stage es_stage, bool is_gs,
enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations,
unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride,
unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
enum mesa_prim input_prim, unsigned gs_vertices_out,
unsigned gs_invocations, unsigned target_workgroup_size,
unsigned max_workgroup_size, unsigned wave_size,
unsigned esgs_vertex_stride, unsigned ngg_lds_vertex_size,
unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
unsigned max_esgs_lds_padding, ac_ngg_subgroup_info *out);
static unsigned inline

View file

@ -1277,8 +1277,8 @@ gfx10_get_ngg_info(const struct radv_device *device, struct radv_shader_info *es
ac_ngg_subgroup_info info;
ac_ngg_compute_subgroup_info(gfx_level, es_info->stage, !!gs_info, input_prim, gs_vertices_out, gs_num_invocations,
128, stage_info->wave_size, es_info->esgs_itemsize, stage_info->ngg_lds_vertex_size,
stage_info->ngg_lds_scratch_size, false, 0, &info);
128, 256, stage_info->wave_size, es_info->esgs_itemsize,
stage_info->ngg_lds_vertex_size, stage_info->ngg_lds_scratch_size, false, 0, &info);
out->hw_max_esverts = info.hw_max_esverts;
out->max_gsprims = info.max_gsprims;

View file

@ -1887,11 +1887,12 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key, false);
unsigned gs_vertices_out = gs_sel->stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.vertices_out : 0;
unsigned gs_invocations = gs_sel->stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.invocations : 0;
unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
if (!ac_ngg_compute_subgroup_info(gs_sel->screen->info.gfx_level, es_sel->stage,
gs_sel->stage == MESA_SHADER_GEOMETRY,
input_prim, gs_vertices_out, gs_invocations,
si_get_max_workgroup_size(shader), shader->wave_size,
max_workgroup_size, max_workgroup_size, shader->wave_size,
es_sel->info.esgs_vertex_stride, shader->info.ngg_lds_vertex_size,
shader->info.ngg_lds_scratch_size, gs_sel->tess_turns_off_ngg,
gs_sel->stage == MESA_SHADER_GEOMETRY ? 255 : 0, &shader->ngg.info)) {