From 4c106136259e3a6bb7d59a8cde2c1c601a21e1b7 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 1 Mar 2024 15:45:06 -0800
Subject: [PATCH] intel/brw: Remove SIMD lowering to a larger SIMD size

On Gfx4, we had to emulate SIMD8 texturing with SIMD16 for some message
types.  This stopped being necessary with Gfx5 and hasn't come up again.

So, we can simply assert that we are truly "SIMD splitting", and assume
that the lowered size is smaller than the original instruction size.
This avoids some mental complexity, as we can always think of the split
instructions as taking apart, operating on, and recombining subsets of
the original values.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27959>
---
 .../compiler/brw_fs_lower_simd_width.cpp      | 57 ++++++-------------
 1 file changed, 17 insertions(+), 40 deletions(-)

diff --git a/src/intel/compiler/brw_fs_lower_simd_width.cpp b/src/intel/compiler/brw_fs_lower_simd_width.cpp
index 97b26dea55f..36b439e6611 100644
--- a/src/intel/compiler/brw_fs_lower_simd_width.cpp
+++ b/src/intel/compiler/brw_fs_lower_simd_width.cpp
@@ -461,16 +461,10 @@ emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
    const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
 
    if (needs_src_copy(lbld, inst, i)) {
-      /* Builder of the right width to perform the copy avoiding uninitialized
-       * data if the lowered execution size is greater than the original
-       * execution size of the instruction.
-       */
-      const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
-                                              inst->exec_size), 0);
       const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
 
       for (unsigned k = 0; k < inst->components_read(i); ++k)
-         cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
+         lbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
 
       return tmp;
 
@@ -507,13 +501,6 @@ needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
    if (inst->size_written > inst->dst.component_size(inst->exec_size))
       return true;
 
-   /* If the lowered execution size is larger than the original the result of
-    * the instruction won't fit in the original destination, so we'll have to
-    * allocate a temporary in any case.
-    */
-   if (lbld.dispatch_width() > inst->exec_size)
-      return true;
-
    for (unsigned i = 0; i < inst->sources; i++) {
       /* If we already made a copy of the source for other reasons there won't
        * be any overlap with the destination.
@@ -578,24 +565,15 @@ emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
        * destination into the temporary before emitting the lowered
        * instruction.
        */
-      const fs_builder gbld_before =
-         lbld_before.group(MIN2(lbld_before.dispatch_width(),
-                                inst->exec_size), 0);
       for (unsigned k = 0; k < dst_size; ++k) {
-         gbld_before.MOV(offset(tmp, lbld_before, k),
+         lbld_before.MOV(offset(tmp, lbld_before, k),
                          offset(dst, inst->exec_size, k));
       }
    }
 
-   const fs_builder gbld_after =
-      lbld_after.group(MIN2(lbld_after.dispatch_width(),
-                            inst->exec_size), 0);
    for (unsigned k = 0; k < dst_size; ++k) {
-      /* Use a builder of the right width to perform the copy avoiding
-       * uninitialized data if the lowered execution size is greater than the
-       * original execution size of the instruction.
-       */
-      gbld_after.MOV(offset(dst, inst->exec_size, k),
+      /* Copy the (split) temp into the original (larger) destination */
+      lbld_after.MOV(offset(dst, inst->exec_size, k),
                      offset(tmp, lbld_after, k));
    }
 
@@ -606,14 +584,14 @@ emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
        * have to build a single 32bit value for the SIMD32 message out of 2
        * SIMD16 16 bit values.
        */
-      const fs_builder rbld = gbld_after.exec_all().group(1, 0);
+      const fs_builder rbld = lbld_after.exec_all().group(1, 0);
       fs_reg local_res_reg = component(
          retype(offset(tmp, lbld_before, dst_size),
                 BRW_REGISTER_TYPE_UW), 0);
       fs_reg final_res_reg =
          retype(byte_offset(inst->dst,
                             inst->size_written - residency_size +
-                            gbld_after.group() / 8),
+                            lbld_after.group() / 8),
                 BRW_REGISTER_TYPE_UW);
       rbld.MOV(final_res_reg, local_res_reg);
    }
@@ -629,20 +607,19 @@ brw_fs_lower_simd_width(fs_visitor &s)
    foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
       const unsigned lower_width = brw_fs_get_lowered_simd_width(&s, inst);
 
-      if (lower_width != inst->exec_size) {
-         /* Builder matching the original instruction.  We may also need to
-          * emit an instruction of width larger than the original, set the
-          * execution size of the builder to the highest of both for now so
-          * we're sure that both cases can be handled.
-          */
-         const unsigned max_width = MAX2(inst->exec_size, lower_width);
+      /* No splitting required */
+      if (lower_width == inst->exec_size)
+         continue;
 
-         const fs_builder bld =
-            fs_builder(&s, MAX2(max_width, s.dispatch_width)).at_end();
-         const fs_builder ibld = bld.at(block, inst)
-                                    .exec_all(inst->force_writemask_all)
-                                    .group(max_width, inst->group / max_width);
+      assert(lower_width < inst->exec_size);
 
+      /* Builder matching the original instruction. */
+      const fs_builder bld = fs_builder(&s).at_end();
+      const fs_builder ibld =
+         bld.at(block, inst).exec_all(inst->force_writemask_all)
+            .group(inst->exec_size, inst->group / inst->exec_size);
+
+      {
          /* Split the copies in chunks of the execution width of either the
           * original or the lowered instruction, whichever is lower.
           */