anv: Add full subgroups workaround for the shaders that use shared memory

This workaround is similar to anv_assume_full_subgroups, but it applies to the shaders that use shared memory. If they rely on the implicit synchronization, and we choose a smaller group size than the (broken) shader expects, it will produce incorrect results.

Cc: mesa-stable
Signed-off-by: Sviatoslav Peleshko <sviatoslav.peleshko@globallogic.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23408>
(cherry picked from commit 369aec5704)
2026-01-04 20:00:11 +01:00 · 2025-03-11 05:14:57 +02:00 · 2025-03-11 05:14:57 +02:00 · 090dbbc995
commit 090dbbc995
parent 3be28b42e2
5 changed files with 25 additions and 1 deletion
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -2014,7 +2014,7 @@
        "description": "anv: Add full subgroups workaround for the shaders that use shared memory",
        "nominated": true,
        "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
        "main_sha": null,
        "because_sha": null,
        "notes": null
--- a/src/intel/vulkan/anv_instance.c
+++ b/src/intel/vulkan/anv_instance.c
@@ -16,6 +16,7 @@ static const driOptionDescription anv_dri_options[] = {
      DRI_CONF_VK_XWAYLAND_WAIT_READY(false)
      DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(0)
      DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_BARRIER(false)
+      DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_SHARED_MEMORY(false)
      DRI_CONF_ANV_DISABLE_FCV(false)
      DRI_CONF_ANV_ENABLE_BUFFER_COMP(false)
      DRI_CONF_ANV_EXTERNAL_MEMORY_IMPLICIT_SYNC(true)
@@ -141,6 +142,8 @@ anv_init_dri_options(struct anv_instance *instance)
       driQueryOptioni(&instance->dri_options, "anv_assume_full_subgroups");
    instance->assume_full_subgroups_with_barrier =
       driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups_with_barrier");
+    instance->assume_full_subgroups_with_shared_memory =
+       driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups_with_shared_memory");
    instance->limit_trig_input_range =
       driQueryOptionb(&instance->dri_options, "limit_trig_input_range");
    instance->sample_mask_out_opengl_behaviour =
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -648,6 +648,9 @@ anv_pipeline_hash_graphics(struct anv_graphics_base_pipeline *pipeline,
   if (stages[MESA_SHADER_MESH].info || stages[MESA_SHADER_TASK].info) {
      const uint8_t afs = device->physical->instance->assume_full_subgroups;
      _mesa_sha1_update(&ctx, &afs, sizeof(afs));
+
+      const bool afs_shm = device->physical->instance->assume_full_subgroups_with_shared_memory;
+      _mesa_sha1_update(&ctx, &afs_shm, sizeof(afs_shm));
   }

   _mesa_sha1_final(&ctx, sha1_out);
@@ -670,6 +673,9 @@ anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline,
   const bool afswb = device->physical->instance->assume_full_subgroups_with_barrier;
   _mesa_sha1_update(&ctx, &afswb, sizeof(afswb));

+   const bool afs_shm = device->physical->instance->assume_full_subgroups_with_shared_memory;
+   _mesa_sha1_update(&ctx, &afs_shm, sizeof(afs_shm));
+
   _mesa_sha1_update(&ctx, stage->shader_sha1,
                     sizeof(stage->shader_sha1));
   _mesa_sha1_update(&ctx, &stage->key.cs, sizeof(stage->key.cs));
@@ -859,6 +865,16 @@ anv_fixup_subgroup_size(struct anv_device *device, struct shader_info *info)
       local_size % BRW_SUBGROUP_SIZE == 0)
      info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;

+   /* Similarly, sometimes games rely on the implicit synchronization of
+    * the shared memory accesses, and choosing smaller subgroups than the game
+    * expects will cause bugs. */
+   if (device->physical->instance->assume_full_subgroups_with_shared_memory &&
+       info->shared_size > 0 &&
+       info->subgroup_size == SUBGROUP_SIZE_VARYING &&
+       local_size &&
+       local_size % BRW_SUBGROUP_SIZE == 0)
+      info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
+
   /* If the client requests that we dispatch full subgroups but doesn't
    * allow us to pick a subgroup size, we have to smash it to the API
    * value of 32.  Performance will likely be terrible in this case but
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1295,6 +1295,7 @@ struct anv_instance {
     */
    uint8_t                                     assume_full_subgroups;
    bool                                        assume_full_subgroups_with_barrier;
+    bool                                        assume_full_subgroups_with_shared_memory;
    bool                                        limit_trig_input_range;
    bool                                        sample_mask_out_opengl_behaviour;
    bool                                        force_filter_addr_rounding;
--- a/src/util/driconf.h
+++ b/src/util/driconf.h
@@ -794,6 +794,10 @@
   DRI_CONF_OPT_B(anv_assume_full_subgroups_with_barrier, def, \
                  "Assume full subgroups requirement for compute shaders that use control barriers")

+#define DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_SHARED_MEMORY(def) \
+   DRI_CONF_OPT_B(anv_assume_full_subgroups_with_shared_memory, def, \
+                  "Allow assuming full subgroups requirement for shaders using shared memory even when it's not specified explicitly")
+
 #define DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(def) \
   DRI_CONF_OPT_B(anv_sample_mask_out_opengl_behaviour, def, \
                  "Ignore sample mask out when having single sampled target")