intel/brw: Reduce scope of has_source_and_destination_hazard

This predicate is currently only relevant during register allocation, so move it there; that way the code can ignore virtual instructions that were already lowered previously. Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30496>
2025-12-24 17:30:12 +01:00 · 2024-08-27 13:40:05 -07:00 · 2024-08-27 13:40:05 -07:00 · 4361a08254
commit 4361a08254
parent bf9456753d
2 changed files with 115 additions and 115 deletions
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@ -274,120 +274,6 @@ fs_inst::is_payload(unsigned arg) const
   }
 }

-/**
- * Returns true if this instruction's sources and destinations cannot
- * safely be the same register.
- *
- * In most cases, a register can be written over safely by the same
- * instruction that is its last use.  For a single instruction, the
- * sources are dereferenced before writing of the destination starts
- * (naturally).
- *
- * However, there are a few cases where this can be problematic:
- *
- * - Virtual opcodes that translate to multiple instructions in the
- *   code generator: if src == dst and one instruction writes the
- *   destination before a later instruction reads the source, then
- *   src will have been clobbered.
- *
- * - SIMD16 compressed instructions with certain regioning (see below).
- *
- * The register allocator uses this information to set up conflicts between
- * GRF sources and the destination.
- */
-bool
-fs_inst::has_source_and_destination_hazard() const
-{
-   switch (opcode) {
-   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
-      /* Multiple partial writes to the destination */
-      return true;
-   case SHADER_OPCODE_SHUFFLE:
-      /* This instruction returns an arbitrary channel from the source and
-       * gets split into smaller instructions in the generator.  It's possible
-       * that one of the instructions will read from a channel corresponding
-       * to an earlier instruction.
-       */
-   case SHADER_OPCODE_SEL_EXEC:
-      /* This is implemented as
-       *
-       * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
-       * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
-       *
-       * Because the source is only read in the second instruction, the first
-       * may stomp all over it.
-       */
-      return true;
-   case SHADER_OPCODE_QUAD_SWIZZLE:
-      switch (src[1].ud) {
-      case BRW_SWIZZLE_XXXX:
-      case BRW_SWIZZLE_YYYY:
-      case BRW_SWIZZLE_ZZZZ:
-      case BRW_SWIZZLE_WWWW:
-      case BRW_SWIZZLE_XXZZ:
-      case BRW_SWIZZLE_YYWW:
-      case BRW_SWIZZLE_XYXY:
-      case BRW_SWIZZLE_ZWZW:
-         /* These can be implemented as a single Align1 region on all
-          * platforms, so there's never a hazard between source and
-          * destination.  C.f. fs_generator::generate_quad_swizzle().
-          */
-         return false;
-      default:
-         return !is_uniform(src[0]);
-      }
-   case BRW_OPCODE_DPAS:
-      /* This is overly conservative. The actual hazard is more complicated to
-       * describe. When the repeat count is N, the single instruction behaves
-       * like N instructions with a repeat count of one, but the destination
-       * and source registers are incremented (in somewhat complex ways) for
-       * each instruction.
-       *
-       * This means the source and destination register is actually a range of
-       * registers. The hazard exists of an earlier iteration would write a
-       * register that should be read by a later iteration.
-       *
-       * There may be some advantage to properly modeling this, but for now,
-       * be overly conservative.
-       */
-      return rcount > 1;
-   default:
-      /* The SIMD16 compressed instruction
-       *
-       * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
-       *
-       * is actually decoded in hardware as:
-       *
-       * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
-       * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
-       *
-       * Which is safe.  However, if we have uniform accesses
-       * happening, we get into trouble:
-       *
-       * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
-       * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
-       *
-       * Now our destination for the first instruction overwrote the
-       * second instruction's src0, and we get garbage for those 8
-       * pixels.  There's a similar issue for the pre-gfx6
-       * pixel_x/pixel_y, which are registers of 16-bit values and thus
-       * would get stomped by the first decode as well.
-       */
-      if (exec_size == 16) {
-         for (int i = 0; i < sources; i++) {
-            if (src[i].file == VGRF && (src[i].stride == 0 ||
-                                        src[i].type == BRW_TYPE_UW ||
-                                        src[i].type == BRW_TYPE_W ||
-                                        src[i].type == BRW_TYPE_UB ||
-                                        src[i].type == BRW_TYPE_B)) {
-               return true;
-            }
-         }
-      }
-      return false;
-   }
-}
-
 bool
 fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
 {
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@ -405,13 +405,127 @@ fs_reg_alloc::setup_live_interference(unsigned node,
   }
 }

+/**
+ * Returns true if the sources and destination of the given instruction
+ * cannot safely be the same register.
+ *
+ * In most cases, a register can be written over safely by the same
+ * instruction that is its last use.  For a single instruction, the
+ * sources are dereferenced before writing of the destination starts
+ * (naturally).
+ *
+ * However, there are a few cases where this can be problematic:
+ *
+ * - Virtual opcodes that translate to multiple instructions in the
+ *   code generator: if src == dst and one instruction writes the
+ *   destination before a later instruction reads the source, then
+ *   src will have been clobbered.
+ *
+ * - SIMD16 compressed instructions with certain regioning (see below).
+ *
+ * The register allocator uses this information to set up conflicts between
+ * GRF sources and the destination.
+ */
+static bool
+brw_inst_has_source_and_destination_hazard(const fs_inst *inst)
+{
+   switch (inst->opcode) {
+   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+      /* Multiple partial writes to the destination */
+      return true;
+   case SHADER_OPCODE_SHUFFLE:
+      /* This instruction returns an arbitrary channel from the source and
+       * gets split into smaller instructions in the generator.  It's possible
+       * that one of the instructions will read from a channel corresponding
+       * to an earlier instruction.  Intentionally falls through to SEL_EXEC.
+       */
+   case SHADER_OPCODE_SEL_EXEC:
+      /* This is implemented as
+       *
+       * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
+       * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
+       *
+       * Because the source is only read in the second instruction, the first
+       * may stomp all over it.
+       */
+      return true;
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+      switch (inst->src[1].ud) {
+      case BRW_SWIZZLE_XXXX:
+      case BRW_SWIZZLE_YYYY:
+      case BRW_SWIZZLE_ZZZZ:
+      case BRW_SWIZZLE_WWWW:
+      case BRW_SWIZZLE_XXZZ:
+      case BRW_SWIZZLE_YYWW:
+      case BRW_SWIZZLE_XYXY:
+      case BRW_SWIZZLE_ZWZW:
+         /* These can be implemented as a single Align1 region on all
+          * platforms, so there's never a hazard between source and
+          * destination.  C.f. fs_generator::generate_quad_swizzle().
+          */
+         return false;
+      default:
+         return !is_uniform(inst->src[0]);
+      }
+   case BRW_OPCODE_DPAS:
+      /* This is overly conservative. The actual hazard is more complicated to
+       * describe. When the repeat count is N, the single instruction behaves
+       * like N instructions with a repeat count of one, but the destination
+       * and source registers are incremented (in somewhat complex ways) for
+       * each instruction.
+       *
+       * This means the source and destination register is actually a range of
+       * registers. The hazard exists if an earlier iteration would write a
+       * register that should be read by a later iteration.
+       *
+       * There may be some advantage to properly modeling this, but for now,
+       * be overly conservative.
+       */
+      return inst->rcount > 1;
+   default:
+      /* The SIMD16 compressed instruction
+       *
+       * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
+       *
+       * is actually decoded in hardware as:
+       *
+       * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
+       * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
+       *
+       * Which is safe.  However, if we have uniform accesses
+       * happening, we get into trouble:
+       *
+       * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
+       * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
+       *
+       * Now our destination for the first instruction overwrote the
+       * second instruction's src0, and we get garbage for those 8
+       * pixels.  There's a similar issue for the pre-gfx6
+       * pixel_x/pixel_y, which are registers of 16-bit values and thus
+       * would get stomped by the first decode as well.
+       */
+      if (inst->exec_size == 16) {
+         for (int i = 0; i < inst->sources; i++) {
+            if (inst->src[i].file == VGRF && (inst->src[i].stride == 0 ||
+                                              inst->src[i].type == BRW_TYPE_UW ||
+                                              inst->src[i].type == BRW_TYPE_W ||
+                                              inst->src[i].type == BRW_TYPE_UB ||
+                                              inst->src[i].type == BRW_TYPE_B)) {
+               return true;
+            }
+         }
+      }
+      return false;
+   }
+}
+
 void
 fs_reg_alloc::setup_inst_interference(const fs_inst *inst)
 {
   /* Certain instructions can't safely use the same register for their
    * sources and destination.  Add interference.
    */
-   if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
+   if (inst->dst.file == VGRF && brw_inst_has_source_and_destination_hazard(inst)) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,