From 1bff4f93ca98847a637bbf836e290b70fe4db590 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Fri, 9 Feb 2024 17:12:11 -0800 Subject: [PATCH] brw: Basic infrastructure to store convergent values as scalars In SIMD16 and SIMD32, storing convergent values in full 16- or 32-channel registers is wasteful. It wastes register space, and in most cases on SIMD32, it wastes instructions. Our register allocator is not clever enough to handle scalar allocations. Its fundamental unit of allocation is SIMD8. Start treating convergent values as SIMD8. Add a tracking bit in brw_reg to specify that a register represents a convergent, scalar value. This has two implications: 1. All channels of the SIMD8 register must contain the same value. In general, this means that writes to the register must be force_writemask_all and exec_size = 8; 2. Reads of this register can (and should) use <0,1,0> stride. SIMD8 instructions that have restrictions on source stride can use <8,8,1>. Values that are vectors (e.g., results of load_uniform or texture operations) will be stored as multiple SIMD8 hardware registers. v2: brw_fs_opt_copy_propagation_defs fix from Ken. Fix for Xe2. v3: Eliminate offset_to_scalar(). Remove mention of vec4 backend in brw_reg.h. Both suggested by Caio. The offset_to_scalar() change necessitates some trickery in the fs_builder offset() function, but I think this is an improvement overall. There is also some rework in find_value_for_offset to account for the possibility that is_scalar sources in LOAD_PAYLOAD might be <8;8,1> or <0;1,0>. 
Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_fs.cpp | 4 ++- src/intel/compiler/brw_fs_builder.h | 34 ++++++++++++++++++- .../compiler/brw_fs_copy_propagation.cpp | 12 +++++-- src/intel/compiler/brw_fs_lower.cpp | 1 + .../compiler/brw_fs_lower_simd_width.cpp | 14 ++++---- src/intel/compiler/brw_fs_validate.cpp | 3 ++ src/intel/compiler/brw_reg.h | 16 +++++++-- 7 files changed, 72 insertions(+), 12 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 0991b733921..62a6921b9cc 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -729,7 +729,9 @@ fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const case FIXED_GRF: case VGRF: case ATTR: - return components_read(arg) * src[arg].component_size(exec_size); + /* Regardless of exec_size, values marked as scalar are SIMD8. */ + return components_read(arg) * + src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size); } return 0; } diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h index e6b4c735071..9d5f6acd4ef 100644 --- a/src/intel/compiler/brw_fs_builder.h +++ b/src/intel/compiler/brw_fs_builder.h @@ -28,6 +28,9 @@ #include "brw_eu.h" #include "brw_fs.h" +static inline brw_reg offset(const brw_reg &, const brw::fs_builder &, + unsigned); + namespace brw { /** * Toolbox to assemble an FS IR program out of individual instructions. 
@@ -402,8 +405,9 @@ namespace brw { move_to_vgrf(const brw_reg &src, unsigned num_components) const { brw_reg *const src_comps = new brw_reg[num_components]; + for (unsigned i = 0; i < num_components; i++) - src_comps[i] = offset(src, dispatch_width(), i); + src_comps[i] = offset(src, *this, i); const brw_reg dst = vgrf(src.type, num_components); LOAD_PAYLOAD(dst, src_comps, num_components, 0); @@ -891,8 +895,36 @@ namespace brw { }; } +/** + * Offset by a number of components into a VGRF + * + * It is assumed that the VGRF represents a vector (e.g., returned by + * load_uniform or a texture operation). Convergent and divergent values are + * stored differently, so care must be taken to offset properly. + */ static inline brw_reg offset(const brw_reg ®, const brw::fs_builder &bld, unsigned delta) { + /* If the value is convergent (stored as one or more SIMD8), offset using + * SIMD8 and select component 0. + */ + if (reg.is_scalar) { + const unsigned allocation_width = 8 * reg_unit(bld.shader->devinfo); + + brw_reg offset_reg = offset(reg, allocation_width, delta); + + /* If the dispatch width is larger than the allocation width, that + * implies that the register can only be used as a source. Otherwise the + * instruction would write past the allocation size of the register. + */ + if (bld.dispatch_width() > allocation_width) + return component(offset_reg, 0); + else + return offset_reg; + } + + /* Offset to the component assuming the value was allocated in + * dispatch_width units. 
+ */ return offset(reg, bld.dispatch_width(), delta); } diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index d23cffcd6bd..ed29dba41ff 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -1778,11 +1778,19 @@ find_value_for_offset(fs_inst *def, const brw_reg &src, unsigned src_size) case SHADER_OPCODE_LOAD_PAYLOAD: { unsigned offset = 0; for (int i = def->header_size; i < def->sources; i++) { - const unsigned splat = def->src[i].stride == 0 ? def->exec_size : 1; + /* Ignore the source splat if the source is a scalar. In that case + * always use just the first component. + */ + const unsigned splat = + (def->src[i].stride == 0 && !src.is_scalar) || def->src[i].file == IMM ? def->exec_size : 1; + const unsigned component_size = + def->src[i].component_size(def->exec_size); + if (offset == src.offset) { if (def->dst.type == def->src[i].type && def->src[i].stride <= 1 && - def->src[i].component_size(def->exec_size) * splat == src_size) + (component_size * splat == src_size || + (def->src[i].file == IMM && component_size == src_size))) val = def->src[i]; break; diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp index 9bc8adef646..11648ce1a63 100644 --- a/src/intel/compiler/brw_fs_lower.cpp +++ b/src/intel/compiler/brw_fs_lower.cpp @@ -749,6 +749,7 @@ brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst new_reg = byte_offset(new_reg, reg->offset); new_reg.abs = reg->abs; new_reg.negate = reg->negate; + new_reg.is_scalar = reg->is_scalar; *reg = new_reg; } diff --git a/src/intel/compiler/brw_fs_lower_simd_width.cpp b/src/intel/compiler/brw_fs_lower_simd_width.cpp index ac7ad3ed797..c385438b9bb 100644 --- a/src/intel/compiler/brw_fs_lower_simd_width.cpp +++ b/src/intel/compiler/brw_fs_lower_simd_width.cpp @@ -478,11 +478,12 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, 
unsigned i) if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) return false; - return !(is_periodic(inst->src[i], lbld.dispatch_width()) || - (inst->components_read(i) == 1 && - lbld.dispatch_width() <= inst->exec_size)) || - (inst->flags_written(lbld.shader->devinfo) & - brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type))); + return !inst->src[i].is_scalar && + (!(is_periodic(inst->src[i], lbld.dispatch_width()) || + (inst->components_read(i) == 1 && + lbld.dispatch_width() <= inst->exec_size)) || + (inst->flags_written(lbld.shader->devinfo) & + brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type)))); } /** @@ -509,7 +510,8 @@ emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i) return tmp; } else if (is_periodic(inst->src[i], lbld.dispatch_width()) || - (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)) { + (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) || + inst->src[i].is_scalar) { /* The source is invariant for all dispatch_width-wide groups of the * original region. * diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp index 113484580b7..698c9330c4f 100644 --- a/src/intel/compiler/brw_fs_validate.cpp +++ b/src/intel/compiler/brw_fs_validate.cpp @@ -377,6 +377,9 @@ brw_fs_validate(const fs_visitor &s) if (inst->dst.file == VGRF) { fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst), s.alloc.sizes[inst->dst.nr]); + + if (inst->exec_size > 1) + fsv_assert_ne(inst->dst.stride, 0); } for (unsigned i = 0; i < inst->sources; i++) { diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h index 448ad2556d2..9b14067a87d 100644 --- a/src/intel/compiler/brw_reg.h +++ b/src/intel/compiler/brw_reg.h @@ -182,7 +182,19 @@ typedef struct brw_reg { unsigned vstride:4; /* source only */ unsigned width:3; /* src only, align1 only */ unsigned hstride:2; /* align1 only */ - unsigned pad1:1; + + /** + * Does this register represent a scalar value? 
+ * + * Registers are allocated in SIMD8 parcels, but may be used to + * represent convergent (i.e., scalar) values. As a destination, it + * is written as SIMD8. As a source, it may be read as <8,8,1> in + * SIMD8 instructions or <0,1,0> on other execution sizes. + * + * If the value represents a vector (e.g., a convergent load_uniform + * of a vec4), it will be stored as multiple SIMD8 registers. + */ + unsigned is_scalar:1; }; double df; @@ -405,7 +417,7 @@ brw_make_reg(enum brw_reg_file file, reg.vstride = vstride; reg.width = width; reg.hstride = hstride; - reg.pad1 = 0; + reg.is_scalar = 0; reg.offset = 0; reg.stride = 1;