intel/brw: Lower VGRFs to FIXED_GRFs earlier

Moves the lowering of VGRFs into FIXED_GRFs from code generation to
(almost) right after register allocation.

This will (1) let later passes not worry about VGRFs (and what they
mean in a post-reg-alloc phase) and (2) make it easier to add certain
types of post-reg-alloc validation using the backend IR.

Note that a couple of passes still take advantage of seeing "allocated
VGRFs", so the lowering is performed after they run.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28604>
Authored by Caio Oliveira on 2024-04-04 16:03:34 -07:00; committed by Marge Bot
parent 5b3d4c757d
commit ff89e83178
4 changed files with 109 additions and 88 deletions
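
For orientation, here is a minimal, self-contained sketch (not part of the patch) of the pass ordering this change produces around allocate_registers. The pass names mirror the hunks below; fs_visitor_model and the stub bodies are made up for illustration.

/* Hypothetical stand-in for fs_visitor; it only models what the sketch needs. */
#include <cstdio>

struct fs_visitor_model { unsigned grf_used = 0; };

static void assign_regs(fs_visitor_model &s)              { s.grf_used = 128; std::puts("assign_regs: VGRFs get GRF numbers"); }
static void opt_bank_conflicts(fs_visitor_model &)        { std::puts("opt_bank_conflicts: still tells allocated VGRFs apart"); }
static void schedule_post_ra(fs_visitor_model &)          { std::puts("post-RA scheduling: still tells allocated VGRFs apart"); }
static void lower_vgrfs_to_fixed_grfs(fs_visitor_model &) { std::puts("lower VGRFs -> FIXED_GRFs (new location)"); }
static void generate_code(const fs_visitor_model &s)      { std::printf("generate_code: %u GRFs, only FIXED_GRF/ARF/IMM seen\n", s.grf_used); }

int main()
{
   fs_visitor_model s;
   assign_regs(s);                 /* register allocation assigns physical GRF numbers */
   opt_bank_conflicts(s);          /* wants the VGRF vs. already-fixed distinction */
   schedule_post_ra(s);            /* same */
   lower_vgrfs_to_fixed_grfs(s);   /* previously this happened during code generation */
   generate_code(s);               /* VGRF is now unreachable here */
}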


@@ -2940,6 +2940,18 @@ fs_visitor::allocate_registers(bool allow_spilling)
debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
/* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
* of part of assign_regs since both bank conflicts optimization and post
* RA scheduling take advantage of distinguishing references to registers
* that were allocated from references that were already fixed.
*
* TODO: Change the passes above, then move this lowering to be part of
* assign_regs.
*/
brw_fs_lower_vgrfs_to_fixed_grfs(*this);
debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
if (last_scratch > 0) {
ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;


@@ -608,6 +608,7 @@ bool brw_fs_lower_sends_overlapping_payload(fs_visitor &s);
bool brw_fs_lower_simd_width(fs_visitor &s);
bool brw_fs_lower_sub_sat(fs_visitor &s);
bool brw_fs_lower_uniform_pull_constant_loads(fs_visitor &s);
void brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s);
bool brw_fs_opt_algebraic(fs_visitor &s);
bool brw_fs_opt_bank_conflicts(fs_visitor &s);


@@ -64,80 +64,13 @@ brw_math_function(enum opcode op)
}
}
static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
switch (reg->file) {
case ARF:
return BRW_ARCHITECTURE_REGISTER_FILE;
case FIXED_GRF:
case VGRF:
return BRW_GENERAL_REGISTER_FILE;
case IMM:
return BRW_IMMEDIATE_VALUE;
case BAD_FILE:
case ATTR:
case UNIFORM:
unreachable("not reached");
}
return BRW_ARCHITECTURE_REGISTER_FILE;
}
static struct brw_reg
brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
fs_reg *reg, bool compressed)
fs_reg *reg)
{
struct brw_reg brw_reg;
switch (reg->file) {
case VGRF:
if (reg->stride == 0) {
brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
} else {
/* From the Haswell PRM:
*
* "VertStride must be used to cross GRF register boundaries. This
* rule implies that elements within a 'Width' cannot cross GRF
* boundaries."
*
* The maximum width value that could satisfy this restriction is:
*/
const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
/* Because the hardware can only split source regions at a whole
* multiple of width during decompression (i.e. vertically), clamp
* the value obtained above to the physical execution size of a
* single decompressed chunk of the instruction:
*/
const unsigned phys_width = compressed ? inst->exec_size / 2 :
inst->exec_size;
const unsigned max_hw_width = 16;
/* XXX - The equation above is strictly speaking not correct on
* hardware that supports unbalanced GRF writes -- On Gfx9+
* each decompressed chunk of the instruction may have a
* different execution size when the number of components
* written to each destination GRF is not the same.
*/
if (reg->stride > 4) {
assert(reg != &inst->dst);
assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
brw_reg = stride(brw_reg, reg->stride, 1, 0);
} else {
const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
}
}
brw_reg = retype(brw_reg, reg->type);
brw_reg = byte_offset(brw_reg, reg->offset);
brw_reg.abs = reg->abs;
brw_reg.negate = reg->negate;
break;
case ARF:
case FIXED_GRF:
case IMM:
@@ -148,6 +81,7 @@ brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
/* Probably unused. */
brw_reg = brw_null_reg();
break;
case VGRF:
case ATTR:
case UNIFORM:
unreachable("not reached");
@@ -913,22 +847,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
if (unlikely(debug_flag))
disasm_annotate(disasm_info, inst, p->next_insn_offset);
/* If the instruction writes to more than one register, it needs to be
* explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
* hardware figures out by itself what the right compression mode is,
* but we still need to know whether the instruction is compressed to
* set up the source register regions appropriately.
*
* XXX - This is wrong for instructions that write a single register but
* read more than one which should strictly speaking be treated as
* compressed. For instructions that don't write any registers it
* relies on the destination being a null register of the correct
* type and regioning so the instruction is considered compressed
* or not accordingly.
*/
const bool compressed =
inst->dst.component_size(inst->exec_size) > REG_SIZE;
if (devinfo->ver >= 20 && inst->group % 8 != 0) {
assert(inst->force_writemask_all);
assert(!inst->predicate && !inst->conditional_mod);
@@ -941,8 +859,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
}
for (unsigned int i = 0; i < inst->sources; i++) {
src[i] = brw_reg_from_fs_reg(devinfo, inst,
&inst->src[i], compressed);
src[i] = brw_reg_from_fs_reg(devinfo, inst, &inst->src[i]);
/* The accumulator result appears to get used for the
* conditional modifier generation. When negating a UD
* value, there is a 33rd bit generated for the sign in the
@@ -953,8 +870,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
inst->src[i].type != BRW_REGISTER_TYPE_UD ||
!inst->src[i].negate);
}
dst = brw_reg_from_fs_reg(devinfo, inst,
&inst->dst, compressed);
dst = brw_reg_from_fs_reg(devinfo, inst, &inst->dst);
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_set_default_predicate_control(p, inst->predicate);


@@ -612,3 +612,95 @@ brw_fs_lower_alu_restrictions(fs_visitor &s)
return progress;
}
static void
brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst *inst,
fs_reg *reg, bool compressed)
{
if (reg->file != VGRF)
return;
struct brw_reg new_reg;
if (reg->stride == 0) {
new_reg = brw_vec1_grf(reg->nr, 0);
} else if (reg->stride > 4) {
assert(reg != &inst->dst);
assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
new_reg = brw_vecn_grf(1, reg->nr, 0);
new_reg = stride(new_reg, reg->stride, 1, 0);
} else {
/* From the Haswell PRM:
*
* "VertStride must be used to cross GRF register boundaries. This
* rule implies that elements within a 'Width' cannot cross GRF
* boundaries."
*
* The maximum width value that could satisfy this restriction is:
*/
const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
/* Because the hardware can only split source regions at a whole
* multiple of width during decompression (i.e. vertically), clamp
* the value obtained above to the physical execution size of a
* single decompressed chunk of the instruction:
*/
const bool compressed = inst->dst.component_size(inst->exec_size) > REG_SIZE;
const unsigned phys_width = compressed ? inst->exec_size / 2 :
inst->exec_size;
/* XXX - The equation above is strictly speaking not correct on
* hardware that supports unbalanced GRF writes -- On Gfx9+
* each decompressed chunk of the instruction may have a
* different execution size when the number of components
* written to each destination GRF is not the same.
*/
const unsigned max_hw_width = 16;
const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
new_reg = brw_vecn_grf(width, reg->nr, 0);
new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
}
new_reg = retype(new_reg, reg->type);
new_reg = byte_offset(new_reg, reg->offset);
new_reg.abs = reg->abs;
new_reg.negate = reg->negate;
*reg = new_reg;
}
void
brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
{
assert(s.grf_used || !"Must be called after register allocation");
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
/* If the instruction writes to more than one register, it needs to be
* explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
* hardware figures out by itself what the right compression mode is,
* but we still need to know whether the instruction is compressed to
* set up the source register regions appropriately.
*
* XXX - This is wrong for instructions that write a single register but
* read more than one which should strictly speaking be treated as
* compressed. For instructions that don't write any registers it
* relies on the destination being a null register of the correct
* type and regioning so the instruction is considered compressed
* or not accordingly.
*/
const bool compressed =
inst->dst.component_size(inst->exec_size) > REG_SIZE;
brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
for (int i = 0; i < inst->sources; i++) {
brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i], compressed);
}
}
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_VARIABLES);
}
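
As a worked example of the regioning math in brw_fs_lower_vgrf_to_fixed_grf above, here is a standalone, hypothetical helper (pick_region and struct region are illustrative names, not Mesa API) that re-derives the <vstride;width,hstride> region for the common stride <= 4 case, assuming the usual 32-byte GRF.

#include <algorithm>
#include <cassert>
#include <cstdio>

constexpr unsigned REG_SIZE = 32;  /* bytes per GRF assumed here */

struct region { unsigned vstride, width, hstride; };

/* stride and type_size describe the VGRF access; exec_size and `compressed`
 * come from the instruction, exactly as in the pass above.
 */
static region pick_region(unsigned stride, unsigned type_size,
                          unsigned exec_size, bool compressed)
{
   assert(stride >= 1 && stride <= 4);
   /* Width may not cross a GRF boundary (Haswell PRM), so at most: */
   const unsigned reg_width = REG_SIZE / (stride * type_size);
   /* A compressed instruction is split vertically into two halves: */
   const unsigned phys_width = compressed ? exec_size / 2 : exec_size;
   const unsigned max_hw_width = 16;
   const unsigned width = std::min({reg_width, phys_width, max_hw_width});
   return { width * stride, width, stride };
}

int main()
{
   /* SIMD16 float with stride 1: the dst spans 64 bytes > REG_SIZE, so the
    * instruction counts as compressed and each half covers 8 channels.
    */
   region r = pick_region(1, 4, 16, true);
   std::printf("<%u;%u,%u>\n", r.vstride, r.width, r.hstride);

   /* SIMD8 16-bit source with stride 2: only 8 elements fit in one GRF. */
   r = pick_region(2, 2, 8, false);
   std::printf("<%u;%u,%u>\n", r.vstride, r.width, r.hstride);
}

This should print <8;8,1> and <16;8,2>, matching the regions the lowering pass would emit for those operands.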