From 662339a2ff4db1b8180778a1449f317da3898f8c Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Tue, 15 Oct 2024 15:51:22 -0700
Subject: [PATCH] brw/build: Use SIMD8 temporaries in emit_uniformize

The fossil-db results are very different from v1. This is now mostly
helpful on older platforms.

v2: When optimizing BROADCAST or FIND_LIVE_CHANNEL to a simple MOV,
adjust the exec_size to match the size allocated for the destination
register. Fixes EU validation failures in some piglit OpenCL tests
(e.g., atomic_add-global-return.cl).

v3: Use component_size() in emit_uniformize and BROADCAST to properly
account for UQ vs UD destination. This doesn't matter for
emit_uniformize because the type is always UD, but it is technically
more correct.

v4: Update trace checksums. amly now expects the same checksums as
several other platforms (e.g., glk and kbl).

v5: Use xbld.dispatch_width() in the builder, in preparation for
scalar_group() eventually becoming SIMD1. Suggested by Lionel.

shader-db:

Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18091701 -> 18091586 (<.01%)
instructions in affected programs: 29616 -> 29501 (-0.39%)
helped: 28 / HURT: 18

total cycles in shared programs: 919250494 -> 919123828 (-0.01%)
cycles in affected programs: 12201102 -> 12074436 (-1.04%)
helped: 124 / HURT: 108

LOST:   0
GAINED: 1

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20480808 -> 20480624 (<.01%)
instructions in affected programs: 58465 -> 58281 (-0.31%)
helped: 61 / HURT: 20

total cycles in shared programs: 874860168 -> 874960312 (0.01%)
cycles in affected programs: 18240986 -> 18341130 (0.55%)
helped: 113 / HURT: 158

total spills in shared programs: 4557 -> 4555 (-0.04%)
spills in affected programs: 93 -> 91 (-2.15%)
helped: 1 / HURT: 0

total fills in shared programs: 5247 -> 5243 (-0.08%)
fills in affected programs: 224 -> 220 (-1.79%)
helped: 1 / HURT: 0

fossil-db:

Lunar Lake
Totals:
Instrs: 220486064 -> 220486959 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 14102592 -> 14102624 (+0.00%)
Cycle count: 31602733838 -> 31604733270 (+0.01%); split: -0.01%, +0.02%
Max live registers: 65371025 -> 65355084 (-0.02%)

Totals from 12130 (1.73% of 702392) affected shaders:
Instrs: 5162700 -> 5163595 (+0.02%); split: -0.06%, +0.08%
Subgroup size: 388128 -> 388160 (+0.01%)
Cycle count: 751721956 -> 753721388 (+0.27%); split: -0.54%, +0.81%
Max live registers: 1538550 -> 1522609 (-1.04%)

Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 241601142 -> 241599114 (-0.00%); split: -0.00%, +0.00%
Subgroup size: 9631168 -> 9631216 (+0.00%)
Cycle count: 25101781573 -> 25097909570 (-0.02%); split: -0.03%, +0.01%
Max live registers: 41540611 -> 41514296 (-0.06%)
Max dispatch width: 6993456 -> 7000928 (+0.11%); split: +0.15%, -0.05%

Totals from 16852 (2.11% of 796880) affected shaders:
Instrs: 6303937 -> 6301909 (-0.03%); split: -0.11%, +0.07%
Subgroup size: 323592 -> 323640 (+0.01%)
Cycle count: 625455880 -> 621583877 (-0.62%); split: -1.20%, +0.58%
Max live registers: 1072491 -> 1046176 (-2.45%)
Max dispatch width: 76672 -> 84144 (+9.75%); split: +14.04%, -4.30%

Tiger Lake
Totals:
Instrs: 235190395 -> 235193286 (+0.00%); split: -0.00%, +0.00%
Cycle count: 23130855720 -> 23128936334 (-0.01%); split: -0.02%, +0.01%
Max live registers: 41644106 -> 41620052 (-0.06%)
Max dispatch width: 6959160 -> 6981512 (+0.32%); split: +0.34%, -0.02%

Totals from 15102 (1.90% of 793371) affected shaders:
Instrs: 5771042 -> 5773933 (+0.05%); split: -0.06%, +0.11%
Cycle count: 371062226 -> 369142840 (-0.52%); split: -1.04%, +0.52%
Max live registers: 989858 -> 965804 (-2.43%)
Max dispatch width: 61344 -> 83696 (+36.44%); split: +38.42%, -1.98%

Ice Lake and Skylake had similar results. (Ice Lake shown)
Totals:
Instrs: 236063150 -> 236063242 (+0.00%); split: -0.00%, +0.00%
Cycle count: 24516187174 -> 24516027518 (-0.00%); split: -0.00%, +0.00%
Spill count: 567071 -> 567049 (-0.00%)
Fill count: 701323 -> 701273 (-0.01%)
Max live registers: 41914047 -> 41913281 (-0.00%)
Max dispatch width: 7042608 -> 7042736 (+0.00%); split: +0.00%, -0.00%

Totals from 3904 (0.49% of 798473) affected shaders:
Instrs: 2809690 -> 2809782 (+0.00%); split: -0.02%, +0.03%
Cycle count: 182114259 -> 181954603 (-0.09%); split: -0.34%, +0.25%
Spill count: 1696 -> 1674 (-1.30%)
Fill count: 2523 -> 2473 (-1.98%)
Max live registers: 341695 -> 340929 (-0.22%)
Max dispatch width: 32752 -> 32880 (+0.39%); split: +0.44%, -0.05%

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32097>
---
 src/intel/ci/traces-iris.yml                |  4 ++--
 src/intel/compiler/brw_fs_builder.h         | 26 ++++++++++++++++-----
 src/intel/compiler/brw_fs_opt.cpp           | 11 +++++++++
 src/intel/compiler/brw_fs_opt_algebraic.cpp |  4 ++++
 4 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/intel/ci/traces-iris.yml b/src/intel/ci/traces-iris.yml
index 43556e2fe8d..a0f814bdb7b 100644
--- a/src/intel/ci/traces-iris.yml
+++ b/src/intel/ci/traces-iris.yml
@@ -88,7 +88,7 @@ traces:
     gl-intel-glk:
       checksum: 06f4222f7f5737f93ed1d191cbdc0798
     gl-intel-amly:
-      checksum: 06d587a2b934295da6ad874b750b9c9d
+      checksum: 06f4222f7f5737f93ed1d191cbdc0798
     gl-intel-kbl:
       checksum: 06f4222f7f5737f93ed1d191cbdc0798
     gl-intel-whl:
@@ -191,7 +191,7 @@ traces:
     gl-intel-glk:
       checksum: f9309a25e696938c0372c1afc781d01b
     gl-intel-amly:
-      checksum: 10e49cd5a5e12d4a01f504c14b4335cc
+      checksum: f9309a25e696938c0372c1afc781d01b
     gl-intel-kbl:
       checksum: f9309a25e696938c0372c1afc781d01b
     gl-intel-whl:
diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
index 807b566531b..c4896f427af 100644
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -383,13 +383,19 @@ namespace brw {
          /* FIXME: We use a vector chan_index and dst to allow constant and
           * copy propagration to move result all the way into the consuming
           * instruction (typically a surface index or sampler index for a
-          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
-          * dispatch. Once we teach const/copy propagation about scalars we
+          * send). Once we teach const/copy propagation about scalars we
           * should go back to scalar destinations here.
           */
-         const brw_reg chan_index = vgrf(BRW_TYPE_UD);
+         const fs_builder xbld = scalar_group();
+         const brw_reg chan_index = xbld.vgrf(BRW_TYPE_UD);
+
+         /* FIND_LIVE_CHANNEL will only write a single component after
+          * lowering. Munge size_written here to match the allocated size of
+          * chan_index.
+          */
+         exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)
+            ->size_written = chan_index.component_size(xbld.dispatch_width());
 
-         exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
          return BROADCAST(src, component(chan_index, 0));
       }
 
@@ -792,7 +798,10 @@ namespace brw {
       brw_reg
       BROADCAST(brw_reg value, brw_reg index) const
       {
-         const brw_reg dst = vgrf(value.type);
+         const fs_builder xbld = scalar_group();
+         const brw_reg dst = xbld.vgrf(value.type);
+
+         assert(is_uniform(index));
 
          /* Ensure that the source of a broadcast is always register aligned.
           * See brw_broadcast() non-scalar case for more details.
@@ -800,7 +809,12 @@ namespace brw {
          if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0)
             value = MOV(value);
 
-         exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index);
+         /* BROADCAST will only write a single component after lowering. Munge
+          * size_written here to match the allocated size of dst.
+          */
+         exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index)
+            ->size_written = dst.component_size(xbld.dispatch_width());
+
          return component(dst, 0);
       }
 
diff --git a/src/intel/compiler/brw_fs_opt.cpp b/src/intel/compiler/brw_fs_opt.cpp
index 3ec97916e26..3d04f018d89 100644
--- a/src/intel/compiler/brw_fs_opt.cpp
+++ b/src/intel/compiler/brw_fs_opt.cpp
@@ -456,6 +456,14 @@ brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
             inst->opcode = BRW_OPCODE_MOV;
             inst->src[0] = brw_imm_ud(0u);
             inst->force_writemask_all = true;
+
+            /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
+             * size_written set by hand to a smaller value. In this case,
+             * munge the exec_size to match.
+             */
+            if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
+               inst->exec_size = 8 * reg_unit(s.devinfo);
+
             inst->resize_sources(1);
             progress = true;
 
@@ -475,7 +483,10 @@ brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
                bcast->opcode = BRW_OPCODE_MOV;
                if (!is_uniform(bcast->src[0]))
                   bcast->src[0] = component(bcast->src[0], 0);
+
                bcast->force_writemask_all = true;
+               bcast->exec_size = 8 * reg_unit(s.devinfo);
+               assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
                bcast->resize_sources(1);
             }
          }
diff --git a/src/intel/compiler/brw_fs_opt_algebraic.cpp b/src/intel/compiler/brw_fs_opt_algebraic.cpp
index 94f52d574f4..b34ea251261 100644
--- a/src/intel/compiler/brw_fs_opt_algebraic.cpp
+++ b/src/intel/compiler/brw_fs_opt_algebraic.cpp
@@ -464,6 +464,8 @@ brw_fs_opt_algebraic(fs_visitor &s)
          if (is_uniform(inst->src[0])) {
             inst->opcode = BRW_OPCODE_MOV;
             inst->force_writemask_all = true;
+            inst->exec_size = 8 * reg_unit(devinfo);
+            assert(inst->size_written == inst->dst.component_size(inst->exec_size));
             inst->resize_sources(1);
             progress = true;
          } else if (inst->src[1].file == IMM) {
@@ -480,6 +482,8 @@ brw_fs_opt_algebraic(fs_visitor &s)
             const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
             inst->src[0] = component(inst->src[0], comp);
             inst->force_writemask_all = true;
+            inst->exec_size = 8 * reg_unit(devinfo);
+            assert(inst->size_written == inst->dst.component_size(inst->exec_size));
             inst->resize_sources(1);
             progress = true;
          }