intel/brw: Use CSE for LOAD_SUBGROUP_INVOCATION

Instead of emitting a single one at the top, and making reference to it, emit the virtual instruction as needed and let CSE do its job. Since load_subgroup_invocation now can appear not at the start of the shader, use UNDEF in all cases to ensure that the liveness of the destination doesn't extend to the first partial write done here (it was being used only for SIMD > 8 before). Note this option was considered in the past 6132992cdb but at the time dismissed. The difference now is that the lowering of the virtual instruction happens earlier than the scheduling. The motivation for this change is to allow passes other than the NIR conversion to use this value. The alternative of storing a `brw_reg` in the shader (instead of NIR state) gets complicated by passes like compact_vgrfs, that move VGRFs around (and update the instructions). This and maybe other passes would have to care about the brw_reg. Fossil-db numbers, TGL ``` *** Shaders only in 'after' results are ignored: steam-native/shadow_of_the_tomb_raider/c683ea5067ee157d/fs.32/0, steam-native/shadow_of_the_tomb_raider/f4df450c3cef40b4/fs.32/0, steam-native/shadow_of_the_tomb_raider/94b708fb8e3d9597/fs.32/0, steam-native/shadow_of_the_tomb_raider/19d44c328edabd30/fs.32/0, steam-native/shadow_of_the_tomb_raider/8a7dcbd5a74a19bf/fs.32/0, and 366 more from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider *** Shaders only in 'before' results are ignored: steam-dxvk/octopath_traveler/aaa3d10acb726906/fs.32/0, steam-dxvk/batman_arkham_origins/e6872ae23569c35f/fs.32/0, steam-dxvk/octopath_traveler/fd33a99fa5c271a8/fs.32/0, steam-dxvk/octopath_traveler/9a077cdc16f24520/fs.32/0, steam-dxvk/batman_arkham_city_goty/fac7b438ad52f622/fs.32/0, and 12 more from 4 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-dxvk/octopath_traveler, steam-native/shadow_of_the_tomb_raider Totals: Instrs: 149752381 -> 149751337 (-0.00%); split: -0.00%, +0.00% Cycle count: 11553609349 -> 11549970294 (-0.03%); split: -0.06%, +0.03% Spill count: 42763 -> 42764 (+0.00%); split: -0.01%, +0.01% Fill count: 75650 -> 75651 (+0.00%); split: -0.00%, +0.01% Max live registers: 31725096 -> 31671792 (-0.17%) Max dispatch width: 5546008 -> 5551672 (+0.10%); split: +0.11%, -0.00% Totals from 52574 (8.34% of 630441) affected shaders: Instrs: 9535159 -> 9534115 (-0.01%); split: -0.03%, +0.02% Cycle count: 1006627109 -> 1002988054 (-0.36%); split: -0.65%, +0.29% Spill count: 11588 -> 11589 (+0.01%); split: -0.03%, +0.03% Fill count: 21057 -> 21058 (+0.00%); split: -0.01%, +0.02% Max live registers: 1992493 -> 1939189 (-2.68%) Max dispatch width: 559696 -> 565360 (+1.01%); split: +1.06%, -0.05% ``` and DG2 ``` *** Shaders only in 'after' results are ignored: steam-native/shadow_of_the_tomb_raider/1f95a9d3db21df85/fs.32/0, steam-native/shadow_of_the_tomb_raider/56b87c4a46613a2a/fs.32/0, steam-native/shadow_of_the_tomb_raider/a74b4137f85dbbd3/fs.32/0, steam-native/shadow_of_the_tomb_raider/e07e38d3f48e8402/fs.32/0, steam-native/shadow_of_the_tomb_raider/206336789c48996c/fs.32/0, and 268 more from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider *** Shaders only in 'before' results are ignored: steam-native/shadow_of_the_tomb_raider/0420d7c3a2ea99ec/fs.32/0, steam-native/shadow_of_the_tomb_raider/2ff39f8bf7d24abb/fs.32/0, steam-native/shadow_of_the_tomb_raider/92d7be2824bd9659/fs.32/0, steam-native/shadow_of_the_tomb_raider/f09ca6d2ecf18015/fs.32/0, steam-native/shadow_of_the_tomb_raider/490f8ffd59e52949/fs.32/0, and 205 more from 3 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider Totals: Instrs: 151597619 -> 151599914 (+0.00%); split: -0.00%, +0.00% Subgroup size: 7699776 -> 7699784 (+0.00%) Cycle count: 12738501989 -> 12739841170 (+0.01%); split: -0.01%, +0.02% Spill count: 61283 -> 61274 (-0.01%) Fill count: 119886 -> 119849 (-0.03%) Max live registers: 31810432 -> 31758920 (-0.16%) Max dispatch width: 5540128 -> 5541136 (+0.02%); split: +0.08%, -0.06% Totals from 49286 (7.81% of 631231) affected shaders: Instrs: 8607753 -> 8610048 (+0.03%); split: -0.01%, +0.04% Subgroup size: 857752 -> 857760 (+0.00%) Cycle count: 305939495 -> 307278676 (+0.44%); split: -0.28%, +0.72% Spill count: 6339 -> 6330 (-0.14%) Fill count: 12571 -> 12534 (-0.29%) Max live registers: 1788346 -> 1736834 (-2.88%) Max dispatch width: 510920 -> 511928 (+0.20%); split: +0.85%, -0.66% ``` Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30489>
2026-01-02 07:20:16 +01:00 · 2024-07-31 22:46:20 -07:00 · 2024-07-31 22:46:20 -07:00 · 2e2b83f72d
commit 2e2b83f72d
parent df16f47036
4 changed files with 20 additions and 33 deletions
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@ -892,6 +892,14 @@ namespace brw {
         shuffle_from_32bit_read(*this, dst, vec4_result, 0, components);
      }

+      brw_reg
+      LOAD_SUBGROUP_INVOCATION() const
+      {
+         brw_reg reg = vgrf(shader->dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
+         exec_all().emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
+         return reg;
+      }
+
      fs_visitor *shader;

      fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
--- a/src/intel/compiler/brw_fs_cse.cpp
+++ b/src/intel/compiler/brw_fs_cse.cpp
@ -119,6 +119,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
+   case SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION:
      return true;
   case SHADER_OPCODE_LOAD_PAYLOAD:
      return !is_coalescing_payload(v->alloc, inst);
--- a/src/intel/compiler/brw_fs_lower.cpp
+++ b/src/intel/compiler/brw_fs_lower.cpp
@ -798,6 +798,7 @@ brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
      const fs_builder abld =
         fs_builder(&s, block, inst).annotate("SubgroupInvocation", NULL);
      const fs_builder ubld8 = abld.group(8, 0).exec_all();
+      ubld8.UNDEF(inst->dst);

      if (inst->exec_size == 8) {
         assert(inst->dst.type == BRW_TYPE_UD);
@ -806,7 +807,6 @@ brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
         ubld8.MOV(inst->dst, uw);
      } else {
         assert(inst->dst.type == BRW_TYPE_UW);
-         abld.UNDEF(inst->dst);
         ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
         ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
         if (inst->exec_size > 16) {
@ -817,11 +817,6 @@ brw_fs_lower_load_subgroup_invocation(fs_visitor &s)

      inst->remove(block);
      progress = true;
-
-      /* Currently this is only ever emitted once, so there's no point in
-       * continuing to look for more cases.  Drop if we ever re-emit it.
-       */
-      break;
   }

   if (progress)
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@ -367,7 +367,6 @@ emit_system_values_block(nir_to_brw_state &ntb, nir_block *block)
 static void
 fs_nir_emit_system_values(nir_to_brw_state &ntb)
 {
-   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   ntb.system_values = ralloc_array(ntb.mem_ctx, brw_reg, SYSTEM_VALUE_MAX);
@ -375,15 +374,6 @@ fs_nir_emit_system_values(nir_to_brw_state &ntb)
      ntb.system_values[i] = brw_reg();
   }

-   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
-    * never end up using it.
-    */
-   {
-      brw_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
-      reg = bld.vgrf(s.dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
-      bld.emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
-   }
-
   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
   nir_foreach_block(block, impl)
      emit_system_values_block(ntb, block);
@ -2650,8 +2640,7 @@ emit_gs_input_load(nir_to_brw_state &ntb, const brw_reg &dst,
          * by 32 (shifting by 5), and add the two together.  This is
          * the final indirect byte offset.
          */
-         brw_reg sequence =
-            ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+         brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();

         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
         brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
@ -2899,7 +2888,7 @@ get_tcs_multi_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld,
    * by the GRF size (by shifting), and add the two together.  This is
    * the final indirect byte offset.
    */
-   brw_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+   brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();

   /* Offsets will be 0, 4, 8, ... */
   brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
@ -5254,8 +5243,7 @@ swizzle_nir_scratch_addr(nir_to_brw_state &ntb,
 {
   fs_visitor &s = ntb.s;

-   const brw_reg &chan_index =
-      ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+   const brw_reg chan_index = bld.LOAD_SUBGROUP_INVOCATION();
   const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;

   if (nir_src_is_const(nir_addr_src)) {
@ -7357,8 +7345,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
      break;

   case nir_intrinsic_load_subgroup_invocation:
-      bld.MOV(retype(dest, BRW_TYPE_UD),
-              ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
+      bld.MOV(retype(dest, BRW_TYPE_UD), bld.LOAD_SUBGROUP_INVOCATION());
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
@ -7415,7 +7402,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
       * 0b...1111, invocations 4-7 will have 0b...11110000 and so on.
       */
      brw_reg invoc_ud = bld.vgrf(BRW_TYPE_UD);
-      bld.MOV(invoc_ud, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
+      bld.MOV(invoc_ud, bld.LOAD_SUBGROUP_INVOCATION());
      brw_reg quad_mask =
         bld.SHL(brw_imm_ud(0xF), bld.AND(invoc_ud, brw_imm_ud(0xFFFFFFFC)));

@ -7679,8 +7666,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
          * MOVs or else fall back to doing indirects.
          */
         brw_reg idx = bld.vgrf(BRW_TYPE_W);
-         bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
-                      brw_imm_w(0x2));
+         bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(0x2));
         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
      }
      break;
@ -7700,8 +7686,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
          * MOVs or else fall back to doing indirects.
          */
         brw_reg idx = bld.vgrf(BRW_TYPE_W);
-         bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
-                      brw_imm_w(0x3));
+         bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(0x3));
         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
      }
      break;
@ -7783,8 +7768,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
          */
         brw_reg shifted = bld.vgrf(src.type);
         brw_reg idx = bld.vgrf(BRW_TYPE_W);
-         allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
-                         brw_imm_w(-1));
+         allbld.ADD(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(-1));
         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
         allbld.group(1, 0).MOV(horiz_offset(shifted, 0), identity);
         scan = shifted;
@ -8079,10 +8063,9 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
            bld.SHL(bld.AND(raw_id, brw_imm_ud(INTEL_MASK(2, 0))),
                    brw_imm_ud(4));

-         /* LaneID[0:3] << 0 (Use nir SYSTEM_VALUE_SUBGROUP_INVOCATION) */
+         /* LaneID[0:3] << 0 (Use subgroup invocation) */
         assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
-         bld.ADD(dst, bld.OR(eu, tid),
-                 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
+         bld.ADD(dst, bld.OR(eu, tid), bld.LOAD_SUBGROUP_INVOCATION());
         break;
      }
      default: