ir3: Implement round-robin workaround
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

On later a6xx and a7xx, round-robin does not work properly when there
are more than 8 active waves from the same dispatch in the same uSP. We
have to clamp the register usage to a minimum to guarantee there aren't
more waves. There is a problem for very large workgroups, which will
have to be solved the same way as the problem with deep control flow,
through implementing ReuseGPRMode.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41562>
This commit is contained in:
Connor Abbott 2026-05-13 12:32:04 -04:00 committed by Marge Bot
parent 25f7765d21
commit 31db17f653
2 changed files with 45 additions and 0 deletions

View file

@ -298,6 +298,20 @@ ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
v->name);
exit(1);
}
/* Due to round_robin_errata we may be unable to support forward progress
* guarantees between waves if there are more than 8 waves active.
*/
if (v->cs.round_robin_mode && compiler->info->props.round_robin_errata) {
if (waves_per_wg > 8 && v->has_barrier) {
mesa_loge(
"Compute shader (%s) requires forward progress but uses more "
"than 8 waves.",
v->name);
exit(1);
}
max_waves = MIN2(max_waves, 8);
}
}
return max_waves;
@ -315,6 +329,29 @@ ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
: compiler->info->max_waves;
}
/* Get the minimum number of registers a shader must declare, even if it doesn't
* actually use as many.
*/
unsigned
ir3_get_min_reg_count(const struct ir3_shader_variant *v, bool double_threadsize)
{
if (!ir3_shader_compute(v) || !v->cs.round_robin_mode ||
!v->compiler->info->props.round_robin_errata)
return 0;
/* Limit occupancy to work around the round-robin errata. */
unsigned max_waves = 8;
/* We want to find the smallest register size where no more than
* (max_waves / wave_granularity) waves fit in reg_size_vec4. Calculate the
* maximum register size where (max_waves / wave_granularity + 1) waves fit,
* then add 1.
*/
return (v->compiler->info->props.reg_size_vec4 /
((max_waves / v->compiler->info->wave_granularity) *
(double_threadsize ? 2 : 1) + 1)) + 1;
}
void
ir3_collect_info(struct ir3_shader_variant *v)
{
@ -555,6 +592,11 @@ ir3_collect_info(struct ir3_shader_variant *v)
info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
/* Limit occupancy if necessary by increasing max_reg. */
unsigned min_reg_count = ir3_get_min_reg_count(v, info->double_threadsize);
if (min_reg_count > 0)
info->max_reg = MAX2(info->max_reg, min_reg_count - 1);
/* TODO this is different for earlier gens, but earlier gens don't use this */
info->subgroup_size = v->info.double_threadsize ? 128 : 64;

View file

@ -885,6 +885,9 @@ unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
bool double_threadsize);
unsigned ir3_get_min_reg_count(const struct ir3_shader_variant *v,
bool double_threadsize);
bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
unsigned regs_count);