From 31db17f653cbba464e432b3c4e2a144e2785425d Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 13 May 2026 12:32:04 -0400 Subject: [PATCH] ir3: Implement round-robin workaround On later a6xx and a7xx, round-robin does not work properly when there are more than 8 active waves from the same dispatch in the same uSP. We have to clamp the register usage to a minimum to guarantee there aren't more waves. There is a problem for very large workgroups, which will have to be solved the same way as the problem with deep control flow, through implementing ReuseGPRMode. Part-of: --- src/freedreno/ir3/ir3.c | 42 +++++++++++++++++++++++++++++++++++++++++ src/freedreno/ir3/ir3.h | 3 +++ 2 files changed, 45 insertions(+) diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c index 521ce04f09f..f61c203637b 100644 --- a/src/freedreno/ir3/ir3.c +++ b/src/freedreno/ir3/ir3.c @@ -298,6 +298,20 @@ ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v, v->name); exit(1); } + + /* Due to round_robin_errata we may be unable to support forward progress + * guarantees between waves if there are more than 8 waves active. + */ + if (v->cs.round_robin_mode && compiler->info->props.round_robin_errata) { + if (waves_per_wg > 8 && v->has_barrier) { + mesa_loge( + "Compute shader (%s) requires forward progress but uses more " + "than 8 waves.", + v->name); + exit(1); + } + max_waves = MIN2(max_waves, 8); + } } return max_waves; @@ -315,6 +329,29 @@ ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler, : compiler->info->max_waves; } +/* Get the minimum number of registers a shader must declare, even if it doesn't + * actually use as many. + */ +unsigned +ir3_get_min_reg_count(const struct ir3_shader_variant *v, bool double_threadsize) +{ + if (!ir3_shader_compute(v) || !v->cs.round_robin_mode || + !v->compiler->info->props.round_robin_errata) + return 0; + + /* Limit occupancy to work around the round-robin errata. */ + unsigned max_waves = 8; + + /* We want to find the smallest register size where no more than + * (max_waves / wave_granularity) waves fit in reg_size_vec4. Calculate the + * maximum register size where (max_waves / wave_granularity + 1) waves fit, + * then add 1. + */ + return (v->compiler->info->props.reg_size_vec4 / + ((max_waves / v->compiler->info->wave_granularity) * + (double_threadsize ? 2 : 1) + 1)) + 1; +} + void ir3_collect_info(struct ir3_shader_variant *v) { @@ -555,6 +592,11 @@ ir3_collect_info(struct ir3_shader_variant *v) info->double_threadsize = ir3_should_double_threadsize(v, regs_count); + /* Limit occupancy if necessary by increasing max_reg. */ + unsigned min_reg_count = ir3_get_min_reg_count(v, info->double_threadsize); + if (min_reg_count > 0) + info->max_reg = MAX2(info->max_reg, min_reg_count - 1); + /* TODO this is different for earlier gens, but earlier gens don't use this */ info->subgroup_size = v->info.double_threadsize ? 128 : 64; diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 08424fe5bda..7d2cdb8e5fb 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -885,6 +885,9 @@ unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler, unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v, bool double_threadsize); +unsigned ir3_get_min_reg_count(const struct ir3_shader_variant *v, + bool double_threadsize); + bool ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count);