From 090dbbc9959fc7daf2a98d2be368afc40edaf4d3 Mon Sep 17 00:00:00 2001
From: Sviatoslav Peleshko <sviatoslav.peleshko@globallogic.com>
Date: Tue, 11 Mar 2025 05:14:57 +0200
Subject: [PATCH] anv: Add full subgroups workaround for the shaders that use
 shared memory

This workaround is similar to anv_assume_full_subgroups, but it applies
to shaders that use shared memory. If such a shader relies on implicit
synchronization of its shared memory accesses, and we choose a smaller
subgroup size than the (broken) shader expects, the shader will produce
incorrect results.

Cc: mesa-stable
Signed-off-by: Sviatoslav Peleshko <sviatoslav.peleshko@globallogic.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23408>
(cherry picked from commit 369aec57046143d78295338e4fe96878de1a8668)
---
 .pick_status.json               |  2 +-
 src/intel/vulkan/anv_instance.c |  3 +++
 src/intel/vulkan/anv_pipeline.c | 16 ++++++++++++++++
 src/intel/vulkan/anv_private.h  |  1 +
 src/util/driconf.h              |  4 ++++
 5 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/.pick_status.json b/.pick_status.json
index a04829e23c5..321e667d3ff 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -2014,7 +2014,7 @@
         "description": "anv: Add full subgroups workaround for the shaders that use shared memory",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/intel/vulkan/anv_instance.c b/src/intel/vulkan/anv_instance.c
index 527a82a35f0..0bf5ed9972c 100644
--- a/src/intel/vulkan/anv_instance.c
+++ b/src/intel/vulkan/anv_instance.c
@@ -16,6 +16,7 @@ static const driOptionDescription anv_dri_options[] = {
       DRI_CONF_VK_XWAYLAND_WAIT_READY(false)
       DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(0)
       DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_BARRIER(false)
+      DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_SHARED_MEMORY(false)
       DRI_CONF_ANV_DISABLE_FCV(false)
       DRI_CONF_ANV_ENABLE_BUFFER_COMP(false)
       DRI_CONF_ANV_EXTERNAL_MEMORY_IMPLICIT_SYNC(true)
@@ -141,6 +142,8 @@ anv_init_dri_options(struct anv_instance *instance)
        driQueryOptioni(&instance->dri_options, "anv_assume_full_subgroups");
     instance->assume_full_subgroups_with_barrier =
        driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups_with_barrier");
+    instance->assume_full_subgroups_with_shared_memory =
+       driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups_with_shared_memory");
     instance->limit_trig_input_range =
        driQueryOptionb(&instance->dri_options, "limit_trig_input_range");
     instance->sample_mask_out_opengl_behaviour =
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index ce35d55edf1..e6d77b32806 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -648,6 +648,9 @@ anv_pipeline_hash_graphics(struct anv_graphics_base_pipeline *pipeline,
    if (stages[MESA_SHADER_MESH].info || stages[MESA_SHADER_TASK].info) {
       const uint8_t afs = device->physical->instance->assume_full_subgroups;
       _mesa_sha1_update(&ctx, &afs, sizeof(afs));
+
+      const bool afs_shm = device->physical->instance->assume_full_subgroups_with_shared_memory;
+      _mesa_sha1_update(&ctx, &afs_shm, sizeof(afs_shm));
    }
 
    _mesa_sha1_final(&ctx, sha1_out);
@@ -670,6 +673,9 @@ anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline,
    const bool afswb = device->physical->instance->assume_full_subgroups_with_barrier;
    _mesa_sha1_update(&ctx, &afswb, sizeof(afswb));
 
+   const bool afs_shm = device->physical->instance->assume_full_subgroups_with_shared_memory;
+   _mesa_sha1_update(&ctx, &afs_shm, sizeof(afs_shm));
+
    _mesa_sha1_update(&ctx, stage->shader_sha1,
                      sizeof(stage->shader_sha1));
    _mesa_sha1_update(&ctx, &stage->key.cs, sizeof(stage->key.cs));
@@ -859,6 +865,16 @@ anv_fixup_subgroup_size(struct anv_device *device, struct shader_info *info)
        local_size % BRW_SUBGROUP_SIZE == 0)
       info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
 
+   /* Similarly, sometimes games rely on the implicit synchronization of
+    * the shared memory accesses, and choosing smaller subgroups than the game
+    * expects will cause bugs. */
+   if (device->physical->instance->assume_full_subgroups_with_shared_memory &&
+       info->shared_size > 0 &&
+       info->subgroup_size == SUBGROUP_SIZE_VARYING &&
+       local_size &&
+       local_size % BRW_SUBGROUP_SIZE == 0)
+      info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
+
    /* If the client requests that we dispatch full subgroups but doesn't
     * allow us to pick a subgroup size, we have to smash it to the API
     * value of 32.  Performance will likely be terrible in this case but
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 73505b719eb..561dd12e8e1 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1295,6 +1295,7 @@ struct anv_instance {
      */
     uint8_t                                     assume_full_subgroups;
     bool                                        assume_full_subgroups_with_barrier;
+    bool                                        assume_full_subgroups_with_shared_memory;
     bool                                        limit_trig_input_range;
     bool                                        sample_mask_out_opengl_behaviour;
     bool                                        force_filter_addr_rounding;
diff --git a/src/util/driconf.h b/src/util/driconf.h
index d186b090f82..0f75aa36a4e 100644
--- a/src/util/driconf.h
+++ b/src/util/driconf.h
@@ -794,6 +794,10 @@
    DRI_CONF_OPT_B(anv_assume_full_subgroups_with_barrier, def, \
                   "Assume full subgroups requirement for compute shaders that use control barriers")
 
+#define DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_SHARED_MEMORY(def) \
+   DRI_CONF_OPT_B(anv_assume_full_subgroups_with_shared_memory, def, \
+                  "Allow assuming full subgroups requirement for shaders using shared memory even when it's not specified explicitly")
+
 #define DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(def) \
    DRI_CONF_OPT_B(anv_sample_mask_out_opengl_behaviour, def, \
                   "Ignore sample mask out when having single sampled target")