tu: Disable 128-wide subgroups on No Man's Sky.
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

It has a lighting compute shader that assumes a subgroup size of either 32
or 64, and that produces vertical banding if the compiler opts for 128.

Cc: mesa-stable
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/work_items/15423
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41665>
This commit is contained in:
Emma Anholt 2026-05-18 16:18:59 -07:00 committed by Marge Bot
parent 5acc764c82
commit 4c03db9ec8
5 changed files with 31 additions and 8 deletions

View file

@ -885,8 +885,8 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
p->deviceNodeMask = 0;
p->deviceLUIDValid = false;
p->subgroupSize = pdevice->info->props.supports_double_threadsize ?
pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
p->subgroupSize =
pdevice->expose_double_threadsize ? pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
VK_SUBGROUP_FEATURE_VOTE_BIT |
@ -1034,8 +1034,8 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
struct vk_properties *p)
{
p->minSubgroupSize = pdevice->info->threadsize_base;
p->maxSubgroupSize = pdevice->info->props.supports_double_threadsize ?
pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
p->maxSubgroupSize =
pdevice->expose_double_threadsize ? pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
p->maxComputeWorkgroupSubgroups = pdevice->info->max_waves;
p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
@ -1164,9 +1164,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
props->maxComputeWorkGroupCount[0] =
props->maxComputeWorkGroupCount[1] =
props->maxComputeWorkGroupCount[2] = 65535;
props->maxComputeWorkGroupInvocations = pdevice->info->props.supports_double_threadsize ?
pdevice->info->threadsize_base * 2 * pdevice->info->max_waves :
pdevice->info->threadsize_base * pdevice->info->max_waves;
props->maxComputeWorkGroupInvocations = pdevice->expose_double_threadsize
? pdevice->info->threadsize_base * 2 * pdevice->info->max_waves
: pdevice->info->threadsize_base * pdevice->info->max_waves;
if (pdevice->info->props.is_a702) {
props->maxComputeWorkGroupSize[0] =
props->maxComputeWorkGroupSize[1] = 512;
@ -1687,6 +1687,8 @@ tu_physical_device_init(struct tu_physical_device *device,
goto fail_free_name;
}
device->expose_double_threadsize = info.props.supports_double_threadsize && !instance->restrict_subgroup_size_64;
device->level1_dcache_size = util_cache_granularity();
device->has_cached_non_coherent_memory =
device->level1_dcache_size > 0 && !DETECT_ARCH_ARM;
@ -1863,6 +1865,7 @@ static const driOptionDescription tu_dri_options[] = {
DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false)
DRI_CONF_TU_AUTOTUNE_ALGORITHM()
DRI_CONF_TU_OVERRIDE_UNCACHED_AS_CACHE_COHERENT(false)
DRI_CONF_TU_RESTRICT_SUBGROUP_SIZE_64(false)
DRI_CONF_SECTION_END
};
@ -1911,6 +1914,7 @@ tu_init_dri_options(struct tu_instance *instance)
instance->allow_concurrent_binning =
(driQueryOptionb(&instance->dri_options, "tu_allow_concurrent_binning") && !TU_DEBUG(NO_CONCURRENT_BINNING)) ||
TU_DEBUG(FORCE_CONCURRENT_BINNING);
instance->restrict_subgroup_size_64 = driQueryOptionb(&instance->dri_options, "tu_restrict_subgroup_size_64");
}
static uint32_t instance_count = 0;

View file

@ -145,6 +145,8 @@ struct tu_physical_device
bool has_preemption;
bool expose_double_threadsize;
/* Whether performance counter selector registers can be written by userspace CSes. */
bool is_perf_cntr_selectable;
@ -231,6 +233,11 @@ struct tu_instance
*/
bool enable_d24s8_border_color_workaround;
/* Various games assume that gl_SubgroupSize is either 32 or 64, and we hide
* our 128-invocation subgroup support for them.
*/
bool restrict_subgroup_size_64;
/* When D24S8 is used without enable_d24s8_border_color_workaround, the
* fast border color HW feature results in an incorrect color being used.
* However, we want to enable fast border colors for apps that are known

View file

@ -3471,7 +3471,7 @@ tu_shader_key_subgroup_size(struct tu_shader_key *key,
struct tu_device *dev)
{
enum ir3_wavesize_option api_wavesize, real_wavesize;
if (!dev->physical_device->info->props.supports_double_threadsize) {
if (!dev->physical_device->expose_double_threadsize) {
api_wavesize = IR3_SINGLE_ONLY;
real_wavesize = IR3_SINGLE_ONLY;
} else {

View file

@ -1596,6 +1596,14 @@ TODO: document the other workarounds.
<application name="Half-Life: Alyx" application_name_match="hlvr">
<option name="tu_emulate_alpha_to_coverage" value="true" />
</application>
<application name='No Man&apos;s Sky' application_name_match='No Man&apos;s Sky'>
<!-- A lighting CS that reduces 8x8 regions with fmin/fmax writes its result
once per subgroup (assuming a subgroup size of 64), or collects the results
from two subgroups when the subgroup size is 32. With our 128-wide subgroups,
half of the 8x8 regions never get written, which shows up as vertical banding.
-->
<option name="tu_restrict_subgroup_size_64" value="true" />
</application>
</device>
<device driver="asahi">

View file

@ -668,6 +668,10 @@
DRI_CONF_OPT_B(tu_allow_concurrent_binning, def, \
"Allow concurrent binning on A7XX+, the CB is disabled by default because it regresses performance on desktop games")
/* Boolean driconf option "tu_restrict_subgroup_size_64" for turnip: when
 * enabled, the driver stops exposing its doubled (128-invocation) subgroup
 * size. It is meant for applications whose shaders assume a subgroup size of
 * 32 or 64. `def` is the option's default value.
 */
#define DRI_CONF_TU_RESTRICT_SUBGROUP_SIZE_64(def) \
DRI_CONF_OPT_B(tu_restrict_subgroup_size_64, def, \
"Restrict subgroup size to 64 (instead of a max of 128) to work around games assuming desktop GPU 32/64 sizes")
/**
* \brief Honeykrisp specific configuration options
*/