From 2e2b83f72d508dd633021bd06e8b3f5a784dc5d3 Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caio.oliveira@intel.com>
Date: Wed, 31 Jul 2024 22:46:20 -0700
Subject: [PATCH] intel/brw: Use CSE for LOAD_SUBGROUP_INVOCATION

Instead of emitting a single one at the top, and making reference to it,
emit the virtual instruction as needed and let CSE do its job.

Since load_subgroup_invocation now can appear not at the start of the
shader, use UNDEF in all cases to ensure that the liveness of the
destination doesn't extend to the first partial write done here (it was
being used only for SIMD > 8 before).

Note this option was considered in the past
6132992cdb858268af0e985727d80e4140be389c but at the time dismissed.  The
difference now is that the lowering of the virtual instruction happens
earlier than the scheduling.

The motivation for this change is to allow passes other than the NIR
conversion to use this value.  The alternative of storing a `brw_reg` in
the shader (instead of NIR state) gets complicated by passes like
compact_vgrfs, that move VGRFs around (and update the instructions).
This and maybe other passes would have to care about the brw_reg.

Fossil-db numbers, TGL

```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/c683ea5067ee157d/fs.32/0, steam-native/shadow_of_the_tomb_raider/f4df450c3cef40b4/fs.32/0, steam-native/shadow_of_the_tomb_raider/94b708fb8e3d9597/fs.32/0, steam-native/shadow_of_the_tomb_raider/19d44c328edabd30/fs.32/0, steam-native/shadow_of_the_tomb_raider/8a7dcbd5a74a19bf/fs.32/0, and 366 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider

*** Shaders only in 'before' results are ignored:
steam-dxvk/octopath_traveler/aaa3d10acb726906/fs.32/0, steam-dxvk/batman_arkham_origins/e6872ae23569c35f/fs.32/0, steam-dxvk/octopath_traveler/fd33a99fa5c271a8/fs.32/0, steam-dxvk/octopath_traveler/9a077cdc16f24520/fs.32/0, steam-dxvk/batman_arkham_city_goty/fac7b438ad52f622/fs.32/0, and 12 more
from 4 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-dxvk/octopath_traveler, steam-native/shadow_of_the_tomb_raider

Totals:
Instrs: 149752381 -> 149751337 (-0.00%); split: -0.00%, +0.00%
Cycle count: 11553609349 -> 11549970294 (-0.03%); split: -0.06%, +0.03%
Spill count: 42763 -> 42764 (+0.00%); split: -0.01%, +0.01%
Fill count: 75650 -> 75651 (+0.00%); split: -0.00%, +0.01%
Max live registers: 31725096 -> 31671792 (-0.17%)
Max dispatch width: 5546008 -> 5551672 (+0.10%); split: +0.11%, -0.00%

Totals from 52574 (8.34% of 630441) affected shaders:
Instrs: 9535159 -> 9534115 (-0.01%); split: -0.03%, +0.02%
Cycle count: 1006627109 -> 1002988054 (-0.36%); split: -0.65%, +0.29%
Spill count: 11588 -> 11589 (+0.01%); split: -0.03%, +0.03%
Fill count: 21057 -> 21058 (+0.00%); split: -0.01%, +0.02%
Max live registers: 1992493 -> 1939189 (-2.68%)
Max dispatch width: 559696 -> 565360 (+1.01%); split: +1.06%, -0.05%
```

and DG2

```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/1f95a9d3db21df85/fs.32/0, steam-native/shadow_of_the_tomb_raider/56b87c4a46613a2a/fs.32/0, steam-native/shadow_of_the_tomb_raider/a74b4137f85dbbd3/fs.32/0, steam-native/shadow_of_the_tomb_raider/e07e38d3f48e8402/fs.32/0, steam-native/shadow_of_the_tomb_raider/206336789c48996c/fs.32/0, and 268 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider

*** Shaders only in 'before' results are ignored:
steam-native/shadow_of_the_tomb_raider/0420d7c3a2ea99ec/fs.32/0, steam-native/shadow_of_the_tomb_raider/2ff39f8bf7d24abb/fs.32/0, steam-native/shadow_of_the_tomb_raider/92d7be2824bd9659/fs.32/0, steam-native/shadow_of_the_tomb_raider/f09ca6d2ecf18015/fs.32/0, steam-native/shadow_of_the_tomb_raider/490f8ffd59e52949/fs.32/0, and 205 more
from 3 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider

Totals:
Instrs: 151597619 -> 151599914 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7699776 -> 7699784 (+0.00%)
Cycle count: 12738501989 -> 12739841170 (+0.01%); split: -0.01%, +0.02%
Spill count: 61283 -> 61274 (-0.01%)
Fill count: 119886 -> 119849 (-0.03%)
Max live registers: 31810432 -> 31758920 (-0.16%)
Max dispatch width: 5540128 -> 5541136 (+0.02%); split: +0.08%, -0.06%

Totals from 49286 (7.81% of 631231) affected shaders:
Instrs: 8607753 -> 8610048 (+0.03%); split: -0.01%, +0.04%
Subgroup size: 857752 -> 857760 (+0.00%)
Cycle count: 305939495 -> 307278676 (+0.44%); split: -0.28%, +0.72%
Spill count: 6339 -> 6330 (-0.14%)
Fill count: 12571 -> 12534 (-0.29%)
Max live registers: 1788346 -> 1736834 (-2.88%)
Max dispatch width: 510920 -> 511928 (+0.20%); split: +0.85%, -0.66%
```

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30489>
---
 src/intel/compiler/brw_fs_builder.h |  8 +++++++
 src/intel/compiler/brw_fs_cse.cpp   |  1 +
 src/intel/compiler/brw_fs_lower.cpp |  7 +-----
 src/intel/compiler/brw_fs_nir.cpp   | 37 ++++++++---------------------
 4 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
index da71b352e26..f736fc73028 100644
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -892,6 +892,14 @@ namespace brw {
          shuffle_from_32bit_read(*this, dst, vec4_result, 0, components);
       }
 
+      brw_reg
+      LOAD_SUBGROUP_INVOCATION() const
+      {
+         brw_reg reg = vgrf(shader->dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
+         exec_all().emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
+         return reg;
+      }
+
       fs_visitor *shader;
 
       fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
diff --git a/src/intel/compiler/brw_fs_cse.cpp b/src/intel/compiler/brw_fs_cse.cpp
index 45d8e3b8a6b..c5838e38919 100644
--- a/src/intel/compiler/brw_fs_cse.cpp
+++ b/src/intel/compiler/brw_fs_cse.cpp
@@ -119,6 +119,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
    case SHADER_OPCODE_INT_REMAINDER:
    case SHADER_OPCODE_SIN:
    case SHADER_OPCODE_COS:
+   case SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION:
       return true;
    case SHADER_OPCODE_LOAD_PAYLOAD:
       return !is_coalescing_payload(v->alloc, inst);
diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp
index d19938cc0d8..46236a32562 100644
--- a/src/intel/compiler/brw_fs_lower.cpp
+++ b/src/intel/compiler/brw_fs_lower.cpp
@@ -798,6 +798,7 @@ brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
       const fs_builder abld =
          fs_builder(&s, block, inst).annotate("SubgroupInvocation", NULL);
       const fs_builder ubld8 = abld.group(8, 0).exec_all();
+      ubld8.UNDEF(inst->dst);
 
       if (inst->exec_size == 8) {
          assert(inst->dst.type == BRW_TYPE_UD);
@@ -806,7 +807,6 @@ brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
          ubld8.MOV(inst->dst, uw);
       } else {
          assert(inst->dst.type == BRW_TYPE_UW);
-         abld.UNDEF(inst->dst);
          ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
          ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
          if (inst->exec_size > 16) {
@@ -817,11 +817,6 @@ brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
 
       inst->remove(block);
       progress = true;
-
-      /* Currently this is only ever emitted once, so there's no point in
-       * continuing to look for more cases.  Drop if we ever re-emit it.
-       */
-      break;
    }
 
    if (progress)
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 74c062fbc98..d27d7b25f6a 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -367,7 +367,6 @@ emit_system_values_block(nir_to_brw_state &ntb, nir_block *block)
 static void
 fs_nir_emit_system_values(nir_to_brw_state &ntb)
 {
-   const fs_builder &bld = ntb.bld;
    fs_visitor &s = ntb.s;
 
    ntb.system_values = ralloc_array(ntb.mem_ctx, brw_reg, SYSTEM_VALUE_MAX);
@@ -375,15 +374,6 @@ fs_nir_emit_system_values(nir_to_brw_state &ntb)
       ntb.system_values[i] = brw_reg();
    }
 
-   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
-    * never end up using it.
-    */
-   {
-      brw_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
-      reg = bld.vgrf(s.dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
-      bld.emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
-   }
-
    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
    nir_foreach_block(block, impl)
       emit_system_values_block(ntb, block);
@@ -2650,8 +2640,7 @@ emit_gs_input_load(nir_to_brw_state &ntb, const brw_reg &dst,
           * by 32 (shifting by 5), and add the two together.  This is
           * the final indirect byte offset.
           */
-         brw_reg sequence =
-            ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+         brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
 
          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
          brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
@@ -2899,7 +2888,7 @@ get_tcs_multi_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld,
     * by the GRF size (by shifting), and add the two together.  This is
     * the final indirect byte offset.
     */
-   brw_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+   brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
 
    /* Offsets will be 0, 4, 8, ... */
    brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
@@ -5254,8 +5243,7 @@ swizzle_nir_scratch_addr(nir_to_brw_state &ntb,
 {
    fs_visitor &s = ntb.s;
 
-   const brw_reg &chan_index =
-      ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+   const brw_reg chan_index = bld.LOAD_SUBGROUP_INVOCATION();
    const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
 
    if (nir_src_is_const(nir_addr_src)) {
@@ -7357,8 +7345,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
       break;
 
    case nir_intrinsic_load_subgroup_invocation:
-      bld.MOV(retype(dest, BRW_TYPE_UD),
-              ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
+      bld.MOV(retype(dest, BRW_TYPE_UD), bld.LOAD_SUBGROUP_INVOCATION());
       break;
 
    case nir_intrinsic_load_subgroup_eq_mask:
@@ -7415,7 +7402,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
        * 0b...1111, invocations 4-7 will have 0b...11110000 and so on.
        */
       brw_reg invoc_ud = bld.vgrf(BRW_TYPE_UD);
-      bld.MOV(invoc_ud, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
+      bld.MOV(invoc_ud, bld.LOAD_SUBGROUP_INVOCATION());
       brw_reg quad_mask =
          bld.SHL(brw_imm_ud(0xF), bld.AND(invoc_ud, brw_imm_ud(0xFFFFFFFC)));
 
@@ -7679,8 +7666,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
           * MOVs or else fall back to doing indirects.
           */
          brw_reg idx = bld.vgrf(BRW_TYPE_W);
-         bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
-                      brw_imm_w(0x2));
+         bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(0x2));
          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
       }
       break;
@@ -7700,8 +7686,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
           * MOVs or else fall back to doing indirects.
           */
          brw_reg idx = bld.vgrf(BRW_TYPE_W);
-         bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
-                      brw_imm_w(0x3));
+         bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(0x3));
          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
       }
       break;
@@ -7783,8 +7768,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
           */
          brw_reg shifted = bld.vgrf(src.type);
          brw_reg idx = bld.vgrf(BRW_TYPE_W);
-         allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
-                         brw_imm_w(-1));
+         allbld.ADD(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(-1));
          allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
          allbld.group(1, 0).MOV(horiz_offset(shifted, 0), identity);
          scan = shifted;
@@ -8079,10 +8063,9 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
             bld.SHL(bld.AND(raw_id, brw_imm_ud(INTEL_MASK(2, 0))),
                     brw_imm_ud(4));
 
-         /* LaneID[0:3] << 0 (Use nir SYSTEM_VALUE_SUBGROUP_INVOCATION) */
+         /* LaneID[0:3] << 0 (Use subgroup invocation) */
          assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
-         bld.ADD(dst, bld.OR(eu, tid),
-                 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
+         bld.ADD(dst, bld.OR(eu, tid), bld.LOAD_SUBGROUP_INVOCATION());
          break;
       }
       default: