From fa523aedd0ab96cb4262f3d63e2d8fd75a112efb Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Sun, 15 Mar 2026 21:26:33 +0200
Subject: [PATCH] brw: fence SLM writes between workgroups

On LSC platforms the SLM writes are unfenced between workgroups. This
means a workgroup W1 finishing might have uncompleted SLM writes.
Another workgroup W2 dispatched after W1 which gets allocated an
overlapping SLM location might have writes that race with the previous
W1 operations.

The solution to this is to fence all write operations (store & atomics)
of a workgroup before ending the threads. We do this by emitting a
single SLM fence either at the end of the shader or, if there is only a
single unfenced write, at the end of that block.

Signed-off-by: Lionel Landwerlin
Cc: mesa-stable
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13924
Reviewed-by: Alyssa Rosenzweig
Part-of:
---
 src/intel/compiler/brw/brw_nir.c              |  7 ++
 src/intel/compiler/brw/brw_nir.h              |  3 +
 .../brw/brw_nir_fence_shared_stores.c         | 84 +++++++++++++++++++
 src/intel/compiler/brw/meson.build            |  1 +
 4 files changed, 95 insertions(+)
 create mode 100644 src/intel/compiler/brw/brw_nir_fence_shared_stores.c

diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index bcb1a00281f..b6b52e3ffd2 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -2764,6 +2764,13 @@ brw_postprocess_nir_opts(brw_pass_tracker *pt,
 
    brw_vectorize_lower_mem_access(pt, robust_flags);
 
+   /* Fence LSC SLM writes to avoid workgroups WaW hazards to the same SLM
+    * location.
+    */
+   if (devinfo->has_lsc &&
+       mesa_shader_stage_uses_workgroup(nir->info.stage))
+      OPT(brw_nir_fence_shared_stores);
+
    /* Do this after lowering memory access bit-sizes */
    if (nir->info.stage == MESA_SHADER_MESH ||
        nir->info.stage == MESA_SHADER_TASK) {
diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h
index 9e468414932..588a88a2cf8 100644
--- a/src/intel/compiler/brw/brw_nir.h
+++ b/src/intel/compiler/brw/brw_nir.h
@@ -138,6 +138,9 @@ void brw_preprocess_nir(const struct brw_compiler *compiler,
                         nir_shader *nir,
                         const struct brw_nir_compiler_opts *opts);
 
+bool
+brw_nir_fence_shared_stores(nir_shader *shader);
+
 void brw_nir_link_shaders(const struct brw_compiler *compiler,
                           nir_shader *producer, nir_shader *consumer);
 
diff --git a/src/intel/compiler/brw/brw_nir_fence_shared_stores.c b/src/intel/compiler/brw/brw_nir_fence_shared_stores.c
new file mode 100644
index 00000000000..8749579ab8d
--- /dev/null
+++ b/src/intel/compiler/brw/brw_nir_fence_shared_stores.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+static bool
+block_is_in_loop(nir_block *block) /* true iff block is nested inside a NIR loop */
+{
+   nir_cf_node *cf_node = block->cf_node.parent;
+
+   while (cf_node != NULL) {
+      if (cf_node->type == nir_cf_node_loop)
+         return true;
+
+      cf_node = cf_node->parent;
+   }
+
+   return false;
+}
+
+bool
+brw_nir_fence_shared_stores(nir_shader *shader) /* returns true if a fence was emitted */
+{
+   bool progress = false;
+
+   nir_foreach_function_with_impl(function, impl, shader) {
+      bool multiple_unfenced_write_blocks = false; /* >1 block (or a looped block) has unfenced writes */
+      nir_block *unfenced_write_block = NULL; /* last block seen with unfenced SLM writes */
+      nir_foreach_block(block, impl) {
+         bool unfenced_writes = false;
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            switch (intrin->intrinsic) {
+            case nir_intrinsic_store_shared:
+            case nir_intrinsic_shared_atomic:
+            case nir_intrinsic_shared_atomic_swap:
+            case nir_intrinsic_store_shared_block_intel:
+               unfenced_writes = true;
+               break;
+
+            case nir_intrinsic_barrier:
+               if (nir_intrinsic_memory_modes(intrin) & nir_var_mem_shared)
+                  unfenced_writes = false; /* an SLM barrier fences the earlier writes */
+               break;
+
+            default:
+               break;
+            }
+         }
+
+         if (unfenced_writes) {
+            /* Consider we have multiple blocks if the unfenced write is
+             * within a loop.
+             */
+            multiple_unfenced_write_blocks =
+               unfenced_write_block != NULL ||
+               block_is_in_loop(block);
+            unfenced_write_block = block;
+         }
+      }
+
+      if (multiple_unfenced_write_blocks || unfenced_write_block) { /* fence the shader end, or the single write block */
+         nir_builder b = nir_builder_at(
+            nir_after_block_before_jump(
+               multiple_unfenced_write_blocks ?
+               nir_impl_last_block(impl) :
+               unfenced_write_block));
+         nir_barrier(&b,
+                     .execution_scope=SCOPE_NONE,
+                     .memory_scope=SCOPE_WORKGROUP,
+                     .memory_semantics = NIR_MEMORY_RELEASE,
+                     .memory_modes = nir_var_mem_shared);
+         progress |= nir_progress(true, impl, nir_metadata_control_flow);
+      }
+   }
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw/meson.build b/src/intel/compiler/brw/meson.build
index cfdbe0281da..c58ff39ad3d 100644
--- a/src/intel/compiler/brw/meson.build
+++ b/src/intel/compiler/brw/meson.build
@@ -52,6 +52,7 @@ libintel_compiler_brw_files = files(
   'brw_lower_subgroup_ops.cpp',
   'brw_nir.h',
   'brw_nir.c',
+  'brw_nir_fence_shared_stores.c',
   'brw_nir_lower_cooperative_matrix.c',
   'brw_nir_lower_cs_intrinsics.c',
   'brw_nir_lower_alpha_to_coverage.c',