radeonsi/gfx12: fix VS output corruption with streamout

We increased VS_EXPORT_COUNT to 8 for streamout in gfx10_shader_ngg,
but we forgot to increase the attribute ring stride, causing all waves
except the first one to get corrupted VS outputs.

Fixes: f703dfd1bb - radeonsi: add gfx12

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30503>
This commit is contained in:
Marek Olšák 2024-08-05 00:14:20 -04:00 committed by Marge Bot
parent 8b3e02587e
commit 0e27df4521
3 changed files with 23 additions and 11 deletions

View file

@ -43,7 +43,7 @@ static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
sel->info.base.vs.blit_sgprs_amd - 1) :
ac_nir_load_arg(b, &args->ac, args->gs_attr_address);
unsigned stride = 16 * shader->info.nr_param_exports;
unsigned stride = 16 * si_shader_num_alloc_param_exports(shader);
uint32_t desc[4];
ac_build_attr_ring_descriptor(sel->screen->info.gfx_level,

View file

@ -1028,6 +1028,7 @@ void si_nir_late_opts(struct nir_shader *nir);
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
/* si_state_shaders.cpp */
unsigned si_shader_num_alloc_param_exports(struct si_shader *shader);
unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
struct gfx9_gs_info *out);

View file

@ -1391,6 +1391,26 @@ static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel,
shader->info.nr_pos_exports > 1));
}
/* Compute how many param exports get allocated for \p shader. The result is never lower
 * than the shader's own param export count, but it can be higher.
 */
unsigned si_shader_num_alloc_param_exports(struct si_shader *shader)
{
   const unsigned shader_params = shader->info.nr_param_exports;

   /* Only GFX12+ shaders that use streamout need a larger allocation. */
   if (shader->selector->screen->info.gfx_level < GFX12 || !si_shader_uses_streamout(shader))
      return shader_params;

   /* GFX12 has no alloc/dealloc mechanism for the 12-bit ordered IDs, so with more than
    * 2^12 workgroups the IDs can wrap around and 2 workgroups can end up with the same
    * ordered ID, which can deadlock the "ordered add" loop.
    *
    * The recommended solution is to rely on the alloc/dealloc mechanism of the attribute
    * ring instead: allocating at least 8 params limits the number of workgroups in flight
    * and therefore the number of ordered IDs in flight.
    */
   return MAX2(shader_params, 8);
}
/**
* Prepare the PM4 image for \p shader, which will run as a merged ESGS shader
* in NGG mode.
@ -1541,16 +1561,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
gs_sel->info.writes_primid);
if (sscreen->info.gfx_level >= GFX12) {
unsigned num_params = shader->info.nr_param_exports;
/* Since there is no alloc/dealloc mechanism for the 12-bit ordered IDs, they can wrap
* around if there are more than 2^12 workgroups, causing 2 workgroups to get the same
* ordered ID, which would break the streamout algorithm.
* The recommended solution is to use the alloc/dealloc mechanism of the attribute ring,
* which is enough to limit the range of ordered IDs that can be in flight.
*/
if (si_shader_uses_streamout(shader))
num_params = MAX2(num_params, 8);
unsigned num_params = si_shader_num_alloc_param_exports(shader);
shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B220_SPI_SHADER_LATE_ALLOC_GS(127) |
S_00B220_GLG_FORCE_DISABLE(1) |