mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-04 22:49:13 +02:00
intel/brw: Move workarounds to a separate file
All the workarounds are relatively small, so keep them in a single file. Promote (or add) them to a separate file if they get large -- like it is done for opt and lower. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Acked-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26887>
This commit is contained in:
parent
c25803880e
commit
e3dc608db9
3 changed files with 274 additions and 263 deletions
|
|
@ -2576,269 +2576,6 @@ fs_visitor::debug_optimizer(const nir_shader *nir,
|
|||
free(filename);
|
||||
}
|
||||
|
||||
static bool
|
||||
needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
|
||||
{
|
||||
/* This workaround is about making sure that any instruction writing
|
||||
* through UGM has completed before we hit EOT.
|
||||
*/
|
||||
if (inst->sfid != GFX12_SFID_UGM)
|
||||
return false;
|
||||
|
||||
/* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
|
||||
* where the L1-cache override is NOT among {WB, WS, WT}
|
||||
*/
|
||||
enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
|
||||
if (lsc_opcode_is_store(opcode)) {
|
||||
switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
|
||||
case LSC_CACHE_STORE_L1STATE_L3MOCS:
|
||||
case LSC_CACHE_STORE_L1WB_L3WB:
|
||||
case LSC_CACHE_STORE_L1S_L3UC:
|
||||
case LSC_CACHE_STORE_L1S_L3WB:
|
||||
case LSC_CACHE_STORE_L1WT_L3UC:
|
||||
case LSC_CACHE_STORE_L1WT_L3WB:
|
||||
return false;
|
||||
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Any UGM Atomic message WITHOUT return value */
|
||||
if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Wa_14015360517
|
||||
*
|
||||
* The first instruction of any kernel should have non-zero emask.
|
||||
* Make sure this happens by introducing a dummy mov instruction.
|
||||
*/
|
||||
bool
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(fs_visitor &s)
|
||||
{
|
||||
if (!intel_needs_workaround(s.devinfo, 14015360517))
|
||||
return false;
|
||||
|
||||
struct backend_instruction *first_inst =
|
||||
s.cfg->first_block()->start();
|
||||
|
||||
/* We can skip the WA if first instruction is marked with
|
||||
* force_writemask_all or exec_size equals dispatch_width.
|
||||
*/
|
||||
if (first_inst->force_writemask_all ||
|
||||
first_inst->exec_size == s.dispatch_width)
|
||||
return false;
|
||||
|
||||
/* Insert dummy mov as first instruction. */
|
||||
const fs_builder ubld =
|
||||
fs_builder(&s, s.cfg->first_block(), (fs_inst *)first_inst).exec_all().group(8, 0);
|
||||
ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u));
|
||||
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Wa_22013689345
|
||||
*
|
||||
* We need to emit UGM fence message before EOT, if shader has any UGM write
|
||||
* or atomic message.
|
||||
*
|
||||
* TODO/FINISHME: According to Curro we could avoid the fence in some cases.
|
||||
* We probably need a better criteria in needs_dummy_fence().
|
||||
*/
|
||||
bool
|
||||
brw_fs_workaround_memory_fence_before_eot(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
bool has_ugm_write_or_atomic = false;
|
||||
|
||||
if (!intel_needs_workaround(s.devinfo, 22013689345))
|
||||
return false;
|
||||
|
||||
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
||||
if (!inst->eot) {
|
||||
if (needs_dummy_fence(s.devinfo, inst))
|
||||
has_ugm_write_or_atomic = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!has_ugm_write_or_atomic)
|
||||
break;
|
||||
|
||||
const fs_builder ibld(&s, block, inst);
|
||||
const fs_builder ubld = ibld.exec_all().group(1, 0);
|
||||
|
||||
fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
|
||||
dst, brw_vec8_grf(0, 0),
|
||||
/* commit enable */ brw_imm_ud(1),
|
||||
/* bti */ brw_imm_ud(0));
|
||||
dummy_fence->sfid = GFX12_SFID_UGM;
|
||||
dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE,
|
||||
LSC_FLUSH_TYPE_NONE_6, false);
|
||||
ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
|
||||
progress = true;
|
||||
/* TODO: remove this break if we ever have shader with multiple EOT. */
|
||||
break;
|
||||
}
|
||||
|
||||
if (progress) {
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
|
||||
DEPENDENCY_VARIABLES);
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the first instruction in the program that might start a region of
|
||||
* divergent control flow due to a HALT jump. There is no
|
||||
* find_halt_control_flow_region_end(), the region of divergence extends until
|
||||
* the only SHADER_OPCODE_HALT_TARGET in the program.
|
||||
*/
|
||||
static const fs_inst *
|
||||
find_halt_control_flow_region_start(const fs_visitor *v)
|
||||
{
|
||||
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
|
||||
if (inst->opcode == BRW_OPCODE_HALT ||
|
||||
inst->opcode == SHADER_OPCODE_HALT_TARGET)
|
||||
return inst;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
 * can cause a BB to be executed with all channels disabled, which will lead
 * to the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down.  This may break
 * assumptions of some NoMask SEND messages whose descriptor depends on data
 * generated by live invocations of the shader.
 *
 * This avoids the problem by predicating certain instructions on an ANY
 * horizontal predicate that makes sure that their execution is omitted when
 * all channels of the program are disabled.
 */
bool
brw_fs_workaround_nomask_control_flow(fs_visitor &s)
{
   if (s.devinfo->ver != 12)
      return false;

   /* ANY predicate wide enough to cover every channel of the dispatch. */
   const brw_predicate pred = s.dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              s.dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
                              BRW_PREDICATE_ALIGN1_ANY8H;
   const fs_inst *halt_start = find_halt_control_flow_region_start(&s);
   /* Number of divergent control-flow regions currently open while walking
    * the program backwards; non-zero means "inside control flow".
    */
   unsigned depth = 0;
   bool progress = false;

   const fs_live_variables &live_vars = s.live_analysis.require();

   /* Scan the program backwards in order to be able to easily determine
    * whether the flag register is live at any point.
    */
   foreach_block_reverse_safe(block, s.cfg) {
      BITSET_WORD flag_liveout = live_vars.block_data[block->num]
                                    .flag_liveout[0];
      STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         /* An unpredicated full-width write kills the liveness of the flag
          * bits it writes.
          */
         if (!inst->predicate && inst->exec_size >= 8)
            flag_liveout &= ~inst->flags_written(s.devinfo);

         switch (inst->opcode) {
         case BRW_OPCODE_DO:
         case BRW_OPCODE_IF:
            /* Note that this doesn't handle BRW_OPCODE_HALT since only
             * the first one in the program closes the region of divergent
             * control flow due to any HALT instructions -- Instead this is
             * handled with the halt_start check below.
             */
            depth--;
            break;

         case BRW_OPCODE_WHILE:
         case BRW_OPCODE_ENDIF:
         case SHADER_OPCODE_HALT_TARGET:
            depth++;
            break;

         default:
            /* Note that the vast majority of NoMask SEND instructions in the
             * program are harmless while executed in a block with all
             * channels disabled, since any instructions with side effects we
             * could hit here should be execution-masked.
             *
             * The main concern is NoMask SEND instructions where the message
             * descriptor or header depends on data generated by live
             * invocations of the shader (RESINFO and
             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
             * computed surface index seem to be the only examples right now
             * where this could easily lead to GPU hangs).  Unfortunately we
             * have no straightforward way to detect that currently, so just
             * predicate any NoMask SEND instructions we find under control
             * flow.
             *
             * If this proves to have a measurable performance impact it can
             * be easily extended with a whitelist of messages we know we can
             * safely omit the predication for.
             */
            if (depth && inst->force_writemask_all &&
                is_send(inst) && !inst->predicate) {
               /* We need to load the execution mask into the flag register by
                * using a builder with channel group matching the whole shader
                * (rather than the default which is derived from the original
                * instruction), in order to avoid getting a right-shifted
                * value.
                */
               const fs_builder ubld = fs_builder(&s, block, inst)
                                       .exec_all().group(s.dispatch_width, 0);
               const fs_reg flag = retype(brw_flag_reg(0, 0),
                                          BRW_REGISTER_TYPE_UD);

               /* Due to the lack of flag register allocation we need to save
                * and restore the flag register if it's live.
                */
               const bool save_flag = flag_liveout &
                                      brw_fs_flag_mask(flag,
                                                       s.dispatch_width / 8);
               const fs_reg tmp = ubld.group(8, 0).vgrf(flag.type);

               if (save_flag) {
                  ubld.group(8, 0).UNDEF(tmp);
                  ubld.group(1, 0).MOV(tmp, flag);
               }

               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);

               set_predicate(pred, inst);
               inst->flag_subreg = 0;
               inst->predicate_trivial = true;

               /* Restore the saved flag value after the predicated SEND. */
               if (save_flag)
                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);

               progress = true;
            }
            break;
         }

         /* The first HALT-related instruction closes the HALT divergence
          * region when walking backwards.
          */
         if (inst == halt_start)
            depth--;

         flag_liveout |= inst->flags_read(s.devinfo);
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||||
|
||||
uint32_t
|
||||
fs_visitor::compute_max_register_pressure()
|
||||
{
|
||||
|
|
|
|||
273
src/intel/compiler/brw_fs_workaround.cpp
Normal file
273
src/intel/compiler/brw_fs_workaround.cpp
Normal file
|
|
@ -0,0 +1,273 @@
|
|||
/*
|
||||
* Copyright © 2010 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_builder.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/* Wa_14015360517
|
||||
*
|
||||
* The first instruction of any kernel should have non-zero emask.
|
||||
* Make sure this happens by introducing a dummy mov instruction.
|
||||
*/
|
||||
bool
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(fs_visitor &s)
|
||||
{
|
||||
if (!intel_needs_workaround(s.devinfo, 14015360517))
|
||||
return false;
|
||||
|
||||
struct backend_instruction *first_inst =
|
||||
s.cfg->first_block()->start();
|
||||
|
||||
/* We can skip the WA if first instruction is marked with
|
||||
* force_writemask_all or exec_size equals dispatch_width.
|
||||
*/
|
||||
if (first_inst->force_writemask_all ||
|
||||
first_inst->exec_size == s.dispatch_width)
|
||||
return false;
|
||||
|
||||
/* Insert dummy mov as first instruction. */
|
||||
const fs_builder ubld =
|
||||
fs_builder(&s, s.cfg->first_block(), (fs_inst *)first_inst).exec_all().group(8, 0);
|
||||
ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u));
|
||||
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
|
||||
{
|
||||
/* This workaround is about making sure that any instruction writing
|
||||
* through UGM has completed before we hit EOT.
|
||||
*/
|
||||
if (inst->sfid != GFX12_SFID_UGM)
|
||||
return false;
|
||||
|
||||
/* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
|
||||
* where the L1-cache override is NOT among {WB, WS, WT}
|
||||
*/
|
||||
enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
|
||||
if (lsc_opcode_is_store(opcode)) {
|
||||
switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
|
||||
case LSC_CACHE_STORE_L1STATE_L3MOCS:
|
||||
case LSC_CACHE_STORE_L1WB_L3WB:
|
||||
case LSC_CACHE_STORE_L1S_L3UC:
|
||||
case LSC_CACHE_STORE_L1S_L3WB:
|
||||
case LSC_CACHE_STORE_L1WT_L3UC:
|
||||
case LSC_CACHE_STORE_L1WT_L3WB:
|
||||
return false;
|
||||
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Any UGM Atomic message WITHOUT return value */
|
||||
if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Wa_22013689345
|
||||
*
|
||||
* We need to emit UGM fence message before EOT, if shader has any UGM write
|
||||
* or atomic message.
|
||||
*
|
||||
* TODO/FINISHME: According to Curro we could avoid the fence in some cases.
|
||||
* We probably need a better criteria in needs_dummy_fence().
|
||||
*/
|
||||
bool
|
||||
brw_fs_workaround_memory_fence_before_eot(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
bool has_ugm_write_or_atomic = false;
|
||||
|
||||
if (!intel_needs_workaround(s.devinfo, 22013689345))
|
||||
return false;
|
||||
|
||||
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
||||
if (!inst->eot) {
|
||||
if (needs_dummy_fence(s.devinfo, inst))
|
||||
has_ugm_write_or_atomic = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!has_ugm_write_or_atomic)
|
||||
break;
|
||||
|
||||
const fs_builder ibld(&s, block, inst);
|
||||
const fs_builder ubld = ibld.exec_all().group(1, 0);
|
||||
|
||||
fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
|
||||
dst, brw_vec8_grf(0, 0),
|
||||
/* commit enable */ brw_imm_ud(1),
|
||||
/* bti */ brw_imm_ud(0));
|
||||
dummy_fence->sfid = GFX12_SFID_UGM;
|
||||
dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE,
|
||||
LSC_FLUSH_TYPE_NONE_6, false);
|
||||
ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
|
||||
progress = true;
|
||||
/* TODO: remove this break if we ever have shader with multiple EOT. */
|
||||
break;
|
||||
}
|
||||
|
||||
if (progress) {
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
|
||||
DEPENDENCY_VARIABLES);
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the first instruction in the program that might start a region of
|
||||
* divergent control flow due to a HALT jump. There is no
|
||||
* find_halt_control_flow_region_end(), the region of divergence extends until
|
||||
* the only SHADER_OPCODE_HALT_TARGET in the program.
|
||||
*/
|
||||
static const fs_inst *
|
||||
find_halt_control_flow_region_start(const fs_visitor *v)
|
||||
{
|
||||
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
|
||||
if (inst->opcode == BRW_OPCODE_HALT ||
|
||||
inst->opcode == SHADER_OPCODE_HALT_TARGET)
|
||||
return inst;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
 * can cause a BB to be executed with all channels disabled, which will lead
 * to the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down.  This may break
 * assumptions of some NoMask SEND messages whose descriptor depends on data
 * generated by live invocations of the shader.
 *
 * This avoids the problem by predicating certain instructions on an ANY
 * horizontal predicate that makes sure that their execution is omitted when
 * all channels of the program are disabled.
 */
bool
brw_fs_workaround_nomask_control_flow(fs_visitor &s)
{
   if (s.devinfo->ver != 12)
      return false;

   /* ANY predicate wide enough to cover every channel of the dispatch. */
   const brw_predicate pred = s.dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              s.dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
                              BRW_PREDICATE_ALIGN1_ANY8H;
   const fs_inst *halt_start = find_halt_control_flow_region_start(&s);
   /* Number of divergent control-flow regions currently open while walking
    * the program backwards; non-zero means "inside control flow".
    */
   unsigned depth = 0;
   bool progress = false;

   const fs_live_variables &live_vars = s.live_analysis.require();

   /* Scan the program backwards in order to be able to easily determine
    * whether the flag register is live at any point.
    */
   foreach_block_reverse_safe(block, s.cfg) {
      BITSET_WORD flag_liveout = live_vars.block_data[block->num]
                                    .flag_liveout[0];
      STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         /* An unpredicated full-width write kills the liveness of the flag
          * bits it writes.
          */
         if (!inst->predicate && inst->exec_size >= 8)
            flag_liveout &= ~inst->flags_written(s.devinfo);

         switch (inst->opcode) {
         case BRW_OPCODE_DO:
         case BRW_OPCODE_IF:
            /* Note that this doesn't handle BRW_OPCODE_HALT since only
             * the first one in the program closes the region of divergent
             * control flow due to any HALT instructions -- Instead this is
             * handled with the halt_start check below.
             */
            depth--;
            break;

         case BRW_OPCODE_WHILE:
         case BRW_OPCODE_ENDIF:
         case SHADER_OPCODE_HALT_TARGET:
            depth++;
            break;

         default:
            /* Note that the vast majority of NoMask SEND instructions in the
             * program are harmless while executed in a block with all
             * channels disabled, since any instructions with side effects we
             * could hit here should be execution-masked.
             *
             * The main concern is NoMask SEND instructions where the message
             * descriptor or header depends on data generated by live
             * invocations of the shader (RESINFO and
             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
             * computed surface index seem to be the only examples right now
             * where this could easily lead to GPU hangs).  Unfortunately we
             * have no straightforward way to detect that currently, so just
             * predicate any NoMask SEND instructions we find under control
             * flow.
             *
             * If this proves to have a measurable performance impact it can
             * be easily extended with a whitelist of messages we know we can
             * safely omit the predication for.
             */
            if (depth && inst->force_writemask_all &&
                is_send(inst) && !inst->predicate) {
               /* We need to load the execution mask into the flag register by
                * using a builder with channel group matching the whole shader
                * (rather than the default which is derived from the original
                * instruction), in order to avoid getting a right-shifted
                * value.
                */
               const fs_builder ubld = fs_builder(&s, block, inst)
                                       .exec_all().group(s.dispatch_width, 0);
               const fs_reg flag = retype(brw_flag_reg(0, 0),
                                          BRW_REGISTER_TYPE_UD);

               /* Due to the lack of flag register allocation we need to save
                * and restore the flag register if it's live.
                */
               const bool save_flag = flag_liveout &
                                      brw_fs_flag_mask(flag,
                                                       s.dispatch_width / 8);
               const fs_reg tmp = ubld.group(8, 0).vgrf(flag.type);

               if (save_flag) {
                  ubld.group(8, 0).UNDEF(tmp);
                  ubld.group(1, 0).MOV(tmp, flag);
               }

               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);

               set_predicate(pred, inst);
               inst->flag_subreg = 0;
               inst->predicate_trivial = true;

               /* Restore the saved flag value after the predicated SEND. */
               if (save_flag)
                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);

               progress = true;
            }
            break;
         }

         /* The first HALT-related instruction closes the HALT divergence
          * region when walking backwards.
          */
         if (inst == halt_start)
            depth--;

         flag_liveout |= inst->flags_read(s.devinfo);
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||||
|
||||
|
|
@ -93,6 +93,7 @@ libintel_compiler_brw_files = files(
|
|||
'brw_fs_thread_payload.cpp',
|
||||
'brw_fs_validate.cpp',
|
||||
'brw_fs_visitor.cpp',
|
||||
'brw_fs_workaround.cpp',
|
||||
'brw_inst.h',
|
||||
'brw_interpolation_map.c',
|
||||
'brw_ir.h',
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue