From 090dbbc9959fc7daf2a98d2be368afc40edaf4d3 Mon Sep 17 00:00:00 2001 From: Sviatoslav Peleshko Date: Tue, 11 Mar 2025 05:14:57 +0200 Subject: [PATCH] anv: Add full subgroups workaround for the shaders that use shared memory This workaround is similar to anv_assume_full_subgroups, but it applies to the shaders that use shared memory. If they rely on the implicit synchronization, and we choose a smaller group size than the (broken) shader expects, it will produce incorrect results. Cc: mesa-stable Signed-off-by: Sviatoslav Peleshko Reviewed-by: Lionel Landwerlin Part-of: (cherry picked from commit 369aec57046143d78295338e4fe96878de1a8668) --- .pick_status.json | 2 +- src/intel/vulkan/anv_instance.c | 3 +++ src/intel/vulkan/anv_pipeline.c | 16 ++++++++++++++++ src/intel/vulkan/anv_private.h | 1 + src/util/driconf.h | 4 ++++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/.pick_status.json b/.pick_status.json index a04829e23c5..321e667d3ff 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -2014,7 +2014,7 @@ "description": "anv: Add full subgroups workaround for the shaders that use shared memory", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/intel/vulkan/anv_instance.c b/src/intel/vulkan/anv_instance.c index 527a82a35f0..0bf5ed9972c 100644 --- a/src/intel/vulkan/anv_instance.c +++ b/src/intel/vulkan/anv_instance.c @@ -16,6 +16,7 @@ static const driOptionDescription anv_dri_options[] = { DRI_CONF_VK_XWAYLAND_WAIT_READY(false) DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(0) DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_BARRIER(false) + DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_SHARED_MEMORY(false) DRI_CONF_ANV_DISABLE_FCV(false) DRI_CONF_ANV_ENABLE_BUFFER_COMP(false) DRI_CONF_ANV_EXTERNAL_MEMORY_IMPLICIT_SYNC(true) @@ -141,6 +142,8 @@ anv_init_dri_options(struct anv_instance *instance) driQueryOptioni(&instance->dri_options, "anv_assume_full_subgroups"); instance->assume_full_subgroups_with_barrier = driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups_with_barrier"); + instance->assume_full_subgroups_with_shared_memory = + driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups_with_shared_memory"); instance->limit_trig_input_range = driQueryOptionb(&instance->dri_options, "limit_trig_input_range"); instance->sample_mask_out_opengl_behaviour = diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index ce35d55edf1..e6d77b32806 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -648,6 +648,9 @@ anv_pipeline_hash_graphics(struct anv_graphics_base_pipeline *pipeline, if (stages[MESA_SHADER_MESH].info || stages[MESA_SHADER_TASK].info) { const uint8_t afs = device->physical->instance->assume_full_subgroups; _mesa_sha1_update(&ctx, &afs, sizeof(afs)); + + const bool afs_shm = device->physical->instance->assume_full_subgroups_with_shared_memory; + _mesa_sha1_update(&ctx, &afs_shm, sizeof(afs_shm)); } _mesa_sha1_final(&ctx, sha1_out); @@ -670,6 +673,9 @@ anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline, const bool afswb = device->physical->instance->assume_full_subgroups_with_barrier; _mesa_sha1_update(&ctx, &afswb, sizeof(afswb)); + const bool afs_shm = device->physical->instance->assume_full_subgroups_with_shared_memory; + _mesa_sha1_update(&ctx, &afs_shm, sizeof(afs_shm)); + _mesa_sha1_update(&ctx, stage->shader_sha1, sizeof(stage->shader_sha1)); _mesa_sha1_update(&ctx, &stage->key.cs, sizeof(stage->key.cs)); @@ -859,6 +865,16 @@ anv_fixup_subgroup_size(struct anv_device *device, struct shader_info *info) local_size % BRW_SUBGROUP_SIZE == 0) info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS; + /* Similarly, sometimes games rely on the implicit synchronization of + * the shared memory accesses, and choosing smaller subgroups than the game + * expects will cause bugs. */ + if (device->physical->instance->assume_full_subgroups_with_shared_memory && + info->shared_size > 0 && + info->subgroup_size == SUBGROUP_SIZE_VARYING && + local_size && + local_size % BRW_SUBGROUP_SIZE == 0) + info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS; + /* If the client requests that we dispatch full subgroups but doesn't * allow us to pick a subgroup size, we have to smash it to the API * value of 32. Performance will likely be terrible in this case but diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 73505b719eb..561dd12e8e1 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1295,6 +1295,7 @@ struct anv_instance { */ uint8_t assume_full_subgroups; bool assume_full_subgroups_with_barrier; + bool assume_full_subgroups_with_shared_memory; bool limit_trig_input_range; bool sample_mask_out_opengl_behaviour; bool force_filter_addr_rounding; diff --git a/src/util/driconf.h b/src/util/driconf.h index d186b090f82..0f75aa36a4e 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -794,6 +794,10 @@ DRI_CONF_OPT_B(anv_assume_full_subgroups_with_barrier, def, \ "Assume full subgroups requirement for compute shaders that use control barriers") +#define DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_SHARED_MEMORY(def) \ + DRI_CONF_OPT_B(anv_assume_full_subgroups_with_shared_memory, def, \ + "Allow assuming full subgroups requirement for shaders using shared memory even when it's not specified explicitly") + #define DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(def) \ DRI_CONF_OPT_B(anv_sample_mask_out_opengl_behaviour, def, \ "Ignore sample mask out when having single sampled target")