anv: Add full subgroups workaround for the shaders that use shared memory

This workaround is similar to anv_assume_full_subgroups, but it applies
to shaders that use shared memory. If such a shader relies on implicit
synchronization of its shared memory accesses, and we choose a smaller
subgroup size than the (broken) shader expects, it will produce incorrect results.

Cc: mesa-stable
Signed-off-by: Sviatoslav Peleshko <sviatoslav.peleshko@globallogic.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23408>
(cherry picked from commit 369aec5704)
This commit is contained in:
Sviatoslav Peleshko 2025-03-11 05:14:57 +02:00 committed by Eric Engestrom
parent 3be28b42e2
commit 090dbbc995
5 changed files with 25 additions and 1 deletions

View file

@ -2014,7 +2014,7 @@
"description": "anv: Add full subgroups workaround for the shaders that use shared memory",
"nominated": true,
"nomination_type": 1,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View file

@ -16,6 +16,7 @@ static const driOptionDescription anv_dri_options[] = {
DRI_CONF_VK_XWAYLAND_WAIT_READY(false)
DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(0)
DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_BARRIER(false)
DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_SHARED_MEMORY(false)
DRI_CONF_ANV_DISABLE_FCV(false)
DRI_CONF_ANV_ENABLE_BUFFER_COMP(false)
DRI_CONF_ANV_EXTERNAL_MEMORY_IMPLICIT_SYNC(true)
@ -141,6 +142,8 @@ anv_init_dri_options(struct anv_instance *instance)
driQueryOptioni(&instance->dri_options, "anv_assume_full_subgroups");
instance->assume_full_subgroups_with_barrier =
driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups_with_barrier");
instance->assume_full_subgroups_with_shared_memory =
driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups_with_shared_memory");
instance->limit_trig_input_range =
driQueryOptionb(&instance->dri_options, "limit_trig_input_range");
instance->sample_mask_out_opengl_behaviour =

View file

@ -648,6 +648,9 @@ anv_pipeline_hash_graphics(struct anv_graphics_base_pipeline *pipeline,
if (stages[MESA_SHADER_MESH].info || stages[MESA_SHADER_TASK].info) {
const uint8_t afs = device->physical->instance->assume_full_subgroups;
_mesa_sha1_update(&ctx, &afs, sizeof(afs));
const bool afs_shm = device->physical->instance->assume_full_subgroups_with_shared_memory;
_mesa_sha1_update(&ctx, &afs_shm, sizeof(afs_shm));
}
_mesa_sha1_final(&ctx, sha1_out);
@ -670,6 +673,9 @@ anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline,
const bool afswb = device->physical->instance->assume_full_subgroups_with_barrier;
_mesa_sha1_update(&ctx, &afswb, sizeof(afswb));
const bool afs_shm = device->physical->instance->assume_full_subgroups_with_shared_memory;
_mesa_sha1_update(&ctx, &afs_shm, sizeof(afs_shm));
_mesa_sha1_update(&ctx, stage->shader_sha1,
sizeof(stage->shader_sha1));
_mesa_sha1_update(&ctx, &stage->key.cs, sizeof(stage->key.cs));
@ -859,6 +865,16 @@ anv_fixup_subgroup_size(struct anv_device *device, struct shader_info *info)
local_size % BRW_SUBGROUP_SIZE == 0)
info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
/* Similarly, sometimes games rely on the implicit synchronization of
* the shared memory accesses, and choosing smaller subgroups than the game
* expects will cause bugs. */
if (device->physical->instance->assume_full_subgroups_with_shared_memory &&
info->shared_size > 0 &&
info->subgroup_size == SUBGROUP_SIZE_VARYING &&
local_size &&
local_size % BRW_SUBGROUP_SIZE == 0)
info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
/* If the client requests that we dispatch full subgroups but doesn't
* allow us to pick a subgroup size, we have to smash it to the API
* value of 32. Performance will likely be terrible in this case but

View file

@ -1295,6 +1295,7 @@ struct anv_instance {
*/
uint8_t assume_full_subgroups;
bool assume_full_subgroups_with_barrier;
bool assume_full_subgroups_with_shared_memory;
bool limit_trig_input_range;
bool sample_mask_out_opengl_behaviour;
bool force_filter_addr_rounding;

View file

@ -794,6 +794,10 @@
DRI_CONF_OPT_B(anv_assume_full_subgroups_with_barrier, def, \
"Assume full subgroups requirement for compute shaders that use control barriers")
/* Declares the driconf option "anv_assume_full_subgroups_with_shared_memory"
 * with default value `def`.  The anv driver reads it with driQueryOptionb()
 * and, when it is set, switches shaders that use shared memory from a
 * varying subgroup size to full subgroups, provided their local size is a
 * non-zero multiple of BRW_SUBGROUP_SIZE. */
#define DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS_WITH_SHARED_MEMORY(def) \
DRI_CONF_OPT_B(anv_assume_full_subgroups_with_shared_memory, def, \
"Allow assuming full subgroups requirement for shaders using shared memory even when it's not specified explicitly")
/* Declares the driconf option "anv_sample_mask_out_opengl_behaviour" with
 * default value `def`; the description string below states its effect. */
#define DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(def) \
DRI_CONF_OPT_B(anv_sample_mask_out_opengl_behaviour, def, \
"Ignore sample mask out when having single sampled target")