brw: Basic infrastructure to store convergent values as scalars

In SIMD16 and SIMD32, storing convergent values in full 16- or 32-channel registers is wasteful. It wastes register space, and in most cases on SIMD32, it wastes instructions. Our register allocator is not clever enough to handle scalar allocations. Its fundamental unit of allocation is SIMD8. Start treating convergent values as SIMD8. Add a tracking bit in brw_reg to specify that a register represents a convergent, scalar value. This has two implications: 1. All channels of the SIMD8 register must contain the same value. In general, this means that writes to the register must be force_writemask_all and exec_size = 8; 2. Reads of this register can (and should) use <0,1,0> stride. SIMD8 instructions that have restrictions on source stride can use <8,8,1>. Values that are vectors (e.g., results of load_uniform or texture operations) will be stored as multiple SIMD8 hardware registers. v2: brw_fs_opt_copy_propagation_defs fix from Ken. Fix for Xe2. v3: Eliminate offset_to_scalar(). Remove mention of vec4 backend in brw_reg.h. Both suggested by Caio. The offset_to_scalar() change necessitates some trickery in the fs_builder offset() function, but I think this is an improvement overall. There is also some rework in find_value_for_offset to account for the possibility that is_scalar sources in LOAD_PAYLOAD might be <8;8,1> or <0;1,0>. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2025-12-20 13:50:11 +01:00 · 2024-02-09 17:12:11 -08:00 · 2024-02-09 17:12:11 -08:00 · 1bff4f93ca
commit 1bff4f93ca
parent ef3dc401da
7 changed files with 72 additions and 12 deletions
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@ -729,7 +729,9 @@ fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
   case FIXED_GRF:
   case VGRF:
   case ATTR:
-      return components_read(arg) * src[arg].component_size(exec_size);
+      /* Regardless of exec_size, values marked as scalar are SIMD8. */
      return components_read(arg) *
             src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
   }
   return 0;
 }
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@ -28,6 +28,9 @@
 #include "brw_eu.h"
 #include "brw_fs.h"
 static inline brw_reg offset(const brw_reg &, const brw::fs_builder &,
                             unsigned);
 namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
@ -402,8 +405,9 @@ namespace brw {
      move_to_vgrf(const brw_reg &src, unsigned num_components) const
      {
         brw_reg *const src_comps = new brw_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
-            src_comps[i] = offset(src, dispatch_width(), i);
+            src_comps[i] = offset(src, *this, i);
         const brw_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);
@ -891,8 +895,36 @@ namespace brw {
   };
 }
 /**
 * Advance a register by \p delta vector components.
 *
 * The register is expected to hold a vector (for example the result of
 * load_uniform or of a texture operation). How far one component spans
 * depends on whether the value is convergent (is_scalar) or divergent, so
 * the two layouts are handled separately.
 */
 static inline brw_reg
 offset(const brw_reg &reg, const brw::fs_builder &bld, unsigned delta)
 {
   /* Divergent values are laid out in dispatch_width units. */
   if (!reg.is_scalar)
      return offset(reg, bld.dispatch_width(), delta);

   /* Convergent values are laid out in units of 8 * reg_unit() channels,
    * independent of the builder's dispatch width.
    */
   const unsigned scalar_width = 8 * reg_unit(bld.shader->devinfo);
   const brw_reg stepped = offset(reg, scalar_width, delta);

   /* A builder no wider than the allocation can use the region as is. */
   if (bld.dispatch_width() <= scalar_width)
      return stepped;

   /* A wider builder would write past the allocation if this were a
    * destination, so the register can only be a source here; select
    * component 0.
    */
   return component(stepped, 0);
 }
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@ -1778,11 +1778,19 @@ find_value_for_offset(fs_inst *def, const brw_reg &src, unsigned src_size)
   case SHADER_OPCODE_LOAD_PAYLOAD: {
      unsigned offset = 0;
      for (int i = def->header_size; i < def->sources; i++) {
-         const unsigned splat = def->src[i].stride == 0 ? def->exec_size : 1;
+         /* Ignore the source splat if the source is a scalar. In that case
          * always use just the first component.
          */
         const unsigned splat =
            (def->src[i].stride == 0 && !src.is_scalar) || def->src[i].file == IMM ? def->exec_size : 1;
         const unsigned component_size =
            def->src[i].component_size(def->exec_size);
         if (offset == src.offset) {
            if (def->dst.type == def->src[i].type &&
                def->src[i].stride <= 1 &&
-                def->src[i].component_size(def->exec_size) * splat == src_size)
+                (component_size * splat == src_size ||
                 (def->src[i].file == IMM && component_size == src_size)))
               val = def->src[i];
            break;
--- a/src/intel/compiler/brw_fs_lower.cpp
+++ b/src/intel/compiler/brw_fs_lower.cpp
@ -749,6 +749,7 @@ brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst
   new_reg = byte_offset(new_reg, reg->offset);
   new_reg.abs = reg->abs;
   new_reg.negate = reg->negate;
   new_reg.is_scalar = reg->is_scalar;
   *reg = new_reg;
 }
--- a/src/intel/compiler/brw_fs_lower_simd_width.cpp
+++ b/src/intel/compiler/brw_fs_lower_simd_width.cpp
@ -478,11 +478,12 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
   if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
      return false;
-   return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
+   return !inst->src[i].is_scalar &&
-            (inst->components_read(i) == 1 &&
+          (!(is_periodic(inst->src[i], lbld.dispatch_width()) ||
-             lbld.dispatch_width() <= inst->exec_size)) ||
+             (inst->components_read(i) == 1 &&
-          (inst->flags_written(lbld.shader->devinfo) &
+              lbld.dispatch_width() <= inst->exec_size)) ||
-           brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type)));
+           (inst->flags_written(lbld.shader->devinfo) &
            brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type))));
 }
 /**
@ -509,7 +510,8 @@ emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
      return tmp;
   } else if (is_periodic(inst->src[i], lbld.dispatch_width()) ||
-              (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)) {
+              (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) ||
              inst->src[i].is_scalar) {
      /* The source is invariant for all dispatch_width-wide groups of the
       * original region.
       *
--- a/src/intel/compiler/brw_fs_validate.cpp
+++ b/src/intel/compiler/brw_fs_validate.cpp
@ -377,6 +377,9 @@ brw_fs_validate(const fs_visitor &s)
      if (inst->dst.file == VGRF) {
         fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst),
                        s.alloc.sizes[inst->dst.nr]);
         if (inst->exec_size > 1)
            fsv_assert_ne(inst->dst.stride, 0);
      }
      for (unsigned i = 0; i < inst->sources; i++) {
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@ -182,7 +182,19 @@ typedef struct brw_reg {
         unsigned vstride:4;      /* source only */
         unsigned width:3;        /* src only, align1 only */
         unsigned hstride:2;      /* align1 only */
-         unsigned pad1:1;
+
         /**
          * Does this register represent a scalar value?
          *
          * Registers are allocated in SIMD8 parcels, but may be used to
          * represent convergent (i.e., scalar) values. As a destination, it
          * is written as SIMD8. As a source, it may be read as <8,8,1> in
          * SIMD8 instructions or <0,1,0> on other execution sizes.
          *
          * If the value represents a vector (e.g., a convergent load_uniform
          * of a vec4), it will be stored as multiple SIMD8 registers.
          */
         unsigned is_scalar:1;
      };
      double df;
@ -405,7 +417,7 @@ brw_make_reg(enum brw_reg_file file,
   reg.vstride = vstride;
   reg.width = width;
   reg.hstride = hstride;
-   reg.pad1 = 0;
+   reg.is_scalar = 0;
   reg.offset = 0;
   reg.stride = 1;