brw: fence SLM writes between workgroups
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

On LSC platforms the SLM writes are unfenced between workgroups. This
means a workgroup W1 finishing might have uncompleted SLM writes.
Another workgroup W2 dispatched after W1 which gets allocated an
overlapping SLM location might have writes that race with the previous
W1 operations.

The solution to this is to fence all write operations (store & atomics)
of a workgroup before ending its threads. We do this by emitting a
single SLM fence either at the end of the shader or, if there is only a
single unfenced write, at the end of that block.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13924
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40430>
This commit is contained in:
Lionel Landwerlin 2026-03-15 21:26:33 +02:00 committed by Marge Bot
parent 32ca98a26e
commit fa523aedd0
4 changed files with 95 additions and 0 deletions

View file

@ -2764,6 +2764,13 @@ brw_postprocess_nir_opts(brw_pass_tracker *pt,
brw_vectorize_lower_mem_access(pt, robust_flags);
/* Fence LSC SLM writes to avoid workgroups WaW hazards to the same SLM
* location.
*/
if (devinfo->has_lsc &&
mesa_shader_stage_uses_workgroup(nir->info.stage))
OPT(brw_nir_fence_shared_stores);
/* Do this after lowering memory access bit-sizes */
if (nir->info.stage == MESA_SHADER_MESH ||
nir->info.stage == MESA_SHADER_TASK) {

View file

@ -138,6 +138,9 @@ void brw_preprocess_nir(const struct brw_compiler *compiler,
nir_shader *nir,
const struct brw_nir_compiler_opts *opts);
bool
brw_nir_fence_shared_stores(nir_shader *shader);
void
brw_nir_link_shaders(const struct brw_compiler *compiler,
nir_shader *producer, nir_shader *consumer);

View file

@ -0,0 +1,84 @@
/*
* Copyright © 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
/* Returns true when @block is nested, at any depth, inside a NIR loop
 * control-flow node. Used to decide whether a single unfenced-write block
 * can actually execute multiple times.
 */
static bool
block_is_in_loop(nir_block *block)
{
   for (nir_cf_node *node = block->cf_node.parent;
        node != NULL;
        node = node->parent) {
      if (node->type == nir_cf_node_loop)
         return true;
   }

   return false;
}
/* Emit a release fence for SLM (shared local memory) writes before the
 * shader ends.
 *
 * On LSC platforms, SLM writes are not ordered across workgroups: a
 * workgroup can finish with stores still in flight, racing with a later
 * workgroup that gets an overlapping SLM allocation (WaW hazard). This
 * pass scans each function impl for shared-memory writes that are not
 * followed by a shared-memory barrier, and emits a single workgroup-scope
 * release fence — either at the end of the one block containing the
 * unfenced writes, or at the end of the shader when several blocks (or a
 * block inside a loop) are involved.
 *
 * Returns true when any fence was inserted (NIR progress).
 */
bool
brw_nir_fence_shared_stores(nir_shader *shader)
{
   bool progress = false;
   nir_foreach_function_with_impl(function, impl, shader) {
      /* Set when more than one block ends with unfenced writes, or when a
       * single such block may execute repeatedly (it is inside a loop).
       */
      bool multiple_unfenced_write_blocks = false;
      /* Last block seen that ends with unfenced shared-memory writes. */
      nir_block *unfenced_write_block = NULL;
      nir_foreach_block(block, impl) {
         /* Tracks, in program order within this block, whether a
          * shared-memory write has happened with no later shared-memory
          * barrier covering it.
          */
         bool unfenced_writes = false;
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            switch (intrin->intrinsic) {
            case nir_intrinsic_store_shared:
            case nir_intrinsic_shared_atomic:
            case nir_intrinsic_shared_atomic_swap:
            case nir_intrinsic_store_shared_block_intel:
               /* All SLM write forms: plain stores, atomics and Intel
                * block stores.
                */
               unfenced_writes = true;
               break;
            case nir_intrinsic_barrier:
               /* A barrier covering shared memory fences every write
                * above it in this block.
                */
               if (nir_intrinsic_memory_modes(intrin) & nir_var_mem_shared)
                  unfenced_writes = false;
               break;
            default:
               break;
            }
         }
         if (unfenced_writes) {
            /* Consider we have multiple blocks if the unfenced write is
             * within a loop.
             */
            multiple_unfenced_write_blocks =
               unfenced_write_block != NULL ||
               block_is_in_loop(block);
            unfenced_write_block = block;
         }
      }
      if (multiple_unfenced_write_blocks || unfenced_write_block) {
         /* Single unfenced block: fence right where the writes are, so
          * the fence is not on paths that did no SLM write. Multiple
          * blocks (or a loop): fall back to one fence at the very end of
          * the shader.
          */
         nir_builder b = nir_builder_at(
            nir_after_block_before_jump(
               multiple_unfenced_write_blocks ?
               nir_impl_last_block(impl) :
               unfenced_write_block));
         /* Release-only, workgroup-scope fence: no execution barrier is
          * needed, only ordering of the SLM stores before thread end.
          */
         nir_barrier(&b,
                     .execution_scope=SCOPE_NONE,
                     .memory_scope=SCOPE_WORKGROUP,
                     .memory_semantics = NIR_MEMORY_RELEASE,
                     .memory_modes = nir_var_mem_shared);
         progress |= nir_progress(true, impl, nir_metadata_control_flow);
      }
   }
   return progress;
}

View file

@ -52,6 +52,7 @@ libintel_compiler_brw_files = files(
'brw_lower_subgroup_ops.cpp',
'brw_nir.h',
'brw_nir.c',
'brw_nir_fence_shared_stores.c',
'brw_nir_lower_cooperative_matrix.c',
'brw_nir_lower_cs_intrinsics.c',
'brw_nir_lower_alpha_to_coverage.c',