tu: Disable 128-wide subgroups on No Man's Sky.

It has a lighting compute shader that assumes a subgroup size of either 32 or 64, and that produces vertical banding if the compiler opts for 128.

Cc: mesa-stable
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/work_items/15423
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41665>
2026-06-03 19:48:17 +02:00 · 2026-05-18 16:18:59 -07:00 · 2026-05-18 16:18:59 -07:00 · 4c03db9ec8
commit 4c03db9ec8
parent 5acc764c82
5 changed files with 31 additions and 8 deletions
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@ -885,8 +885,8 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
   p->deviceNodeMask = 0;
   p->deviceLUIDValid = false;

-   p->subgroupSize = pdevice->info->props.supports_double_threadsize ?
-      pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
+   p->subgroupSize =
+      pdevice->expose_double_threadsize ? pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
   p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
   p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
                                    VK_SUBGROUP_FEATURE_VOTE_BIT |
@ -1034,8 +1034,8 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
                                      struct vk_properties *p)
 {
   p->minSubgroupSize = pdevice->info->threadsize_base;
-   p->maxSubgroupSize = pdevice->info->props.supports_double_threadsize ?
-      pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
+   p->maxSubgroupSize =
+      pdevice->expose_double_threadsize ? pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
   p->maxComputeWorkgroupSubgroups = pdevice->info->max_waves;
   p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;

@ -1164,9 +1164,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
   props->maxComputeWorkGroupCount[0] =
      props->maxComputeWorkGroupCount[1] =
      props->maxComputeWorkGroupCount[2] = 65535;
-   props->maxComputeWorkGroupInvocations = pdevice->info->props.supports_double_threadsize ?
-      pdevice->info->threadsize_base * 2 * pdevice->info->max_waves :
-      pdevice->info->threadsize_base * pdevice->info->max_waves;
+   props->maxComputeWorkGroupInvocations = pdevice->expose_double_threadsize
+                                              ? pdevice->info->threadsize_base * 2 * pdevice->info->max_waves
+                                              : pdevice->info->threadsize_base * pdevice->info->max_waves;
   if (pdevice->info->props.is_a702) {
      props->maxComputeWorkGroupSize[0] =
         props->maxComputeWorkGroupSize[1] = 512;
@ -1687,6 +1687,8 @@ tu_physical_device_init(struct tu_physical_device *device,
      goto fail_free_name;
   }

+   device->expose_double_threadsize = info.props.supports_double_threadsize && !instance->restrict_subgroup_size_64;
+
   device->level1_dcache_size = util_cache_granularity();
   device->has_cached_non_coherent_memory =
      device->level1_dcache_size > 0 && !DETECT_ARCH_ARM;
@ -1863,6 +1865,7 @@ static const driOptionDescription tu_dri_options[] = {
      DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false)
      DRI_CONF_TU_AUTOTUNE_ALGORITHM()
      DRI_CONF_TU_OVERRIDE_UNCACHED_AS_CACHE_COHERENT(false)
+      DRI_CONF_TU_RESTRICT_SUBGROUP_SIZE_64(false)
   DRI_CONF_SECTION_END
 };

@ -1911,6 +1914,7 @@ tu_init_dri_options(struct tu_instance *instance)
   instance->allow_concurrent_binning =
      (driQueryOptionb(&instance->dri_options, "tu_allow_concurrent_binning") && !TU_DEBUG(NO_CONCURRENT_BINNING)) ||
      TU_DEBUG(FORCE_CONCURRENT_BINNING);
+   instance->restrict_subgroup_size_64 = driQueryOptionb(&instance->dri_options, "tu_restrict_subgroup_size_64");
 }

 static uint32_t instance_count = 0;
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@ -145,6 +145,8 @@ struct tu_physical_device

   bool has_preemption;

+   bool expose_double_threadsize;
+
   /* Whether performance counter selector registers can be written by userspace CSes. */
   bool is_perf_cntr_selectable;

@ -231,6 +233,11 @@ struct tu_instance
    */
   bool enable_d24s8_border_color_workaround;

+   /* Some games assume that gl_SubgroupSize is either 32 or 64, so for them
+    * we hide our 128-invocation subgroup support.
+    */
+   bool restrict_subgroup_size_64;
+
   /* When D24S8 is used without enable_d24s8_border_color_workaround, the
    * fast border color HW feature results in an incorrect color being used.
    * However, we want to enable fast border colors for apps that are known
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@ -3471,7 +3471,7 @@ tu_shader_key_subgroup_size(struct tu_shader_key *key,
                            struct tu_device *dev)
 {
   enum ir3_wavesize_option api_wavesize, real_wavesize;
-   if (!dev->physical_device->info->props.supports_double_threadsize) {
+   if (!dev->physical_device->expose_double_threadsize) {
      api_wavesize = IR3_SINGLE_ONLY;
      real_wavesize = IR3_SINGLE_ONLY;
   } else {
--- a/src/util/00-mesa-defaults.conf
+++ b/src/util/00-mesa-defaults.conf
@ -1596,6 +1596,14 @@ TODO: document the other workarounds.
        <application name="Half-Life: Alyx" application_name_match="hlvr">
            <option name="tu_emulate_alpha_to_coverage" value="true" />
        </application>
+        <application name='No Man&apos;s Sky' application_name_match='No Man&apos;s Sky'>
+            <!-- A lighting CS reducing 8x8 regions with fmin/fmax does a write once per
+            subgroup (assuming a subgroup size of 64), or collects the results from two
+            subgroups when the subgroup size is 32.  Thus, our subgroup size of 128 results
+            in vertical banding when half the 8x8 regions don't get written.
+            -->
+            <option name="tu_restrict_subgroup_size_64" value="true" />
+        </application>
    </device>

    <device driver="asahi">
--- a/src/util/driconf.h
+++ b/src/util/driconf.h
@ -668,6 +668,10 @@
   DRI_CONF_OPT_B(tu_allow_concurrent_binning, def, \
                  "Allow concurrent binning on A7XX+, the CB is disabled by default because it regresses performance on desktop games")

+#define DRI_CONF_TU_RESTRICT_SUBGROUP_SIZE_64(def) \
+   DRI_CONF_OPT_B(tu_restrict_subgroup_size_64, def, \
+                  "Restrict subgroup size to 64 (instead of a max of 128) to work around games assuming desktop GPU 32/64 sizes")
+
 /**
 * \brief Honeykrisp specific configuration options
 */