intel/brw: Make gl_SubgroupInvocation lane index loading SSA

Our code to initialize gl_SubgroupInvocation uses multiple instructions, some of which are partial writes. This makes it difficult to analyze expressions involving gl_SubgroupInvocation, which appear very frequently in compute shaders. To make this easier, we add a new virtual opcode which initializes a full VGRF to the value of gl_SubgroupInvocation. (We also expand it to UD for SIMD8 so there are no partial write issues.) We then lower it to the original code later on in compilation, after we've done the bulk of our optimizations. Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28666>
2026-01-04 13:30:11 +01:00 · 2024-02-24 01:24:03 -08:00 · 2024-02-24 01:24:03 -08:00 · 84219892ad
commit 84219892ad
parent 344d4ee9f0
6 changed files with 53 additions and 12 deletions
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@ -536,6 +536,8 @@ enum opcode {

   SHADER_OPCODE_READ_ARCH_REG,

+   SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION,
+
   RT_OPCODE_TRACE_RAY_LOGICAL,
 };

--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@ -2464,6 +2464,8 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
      return "btd_retire_logical";
   case SHADER_OPCODE_READ_ARCH_REG:
      return "read_arch_reg";
+   case SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION:
+      return "load_subgroup_invocation";
   }

   unreachable("not reached");
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@ -599,6 +599,7 @@ bool brw_fs_lower_derivatives(fs_visitor &s);
 bool brw_fs_lower_dpas(fs_visitor &s);
 bool brw_fs_lower_find_live_channel(fs_visitor &s);
 bool brw_fs_lower_integer_multiplication(fs_visitor &s);
+bool brw_fs_lower_load_subgroup_invocation(fs_visitor &s);
 bool brw_fs_lower_logical_sends(fs_visitor &s);
 bool brw_fs_lower_pack(fs_visitor &s);
 bool brw_fs_lower_load_payload(fs_visitor &s);
--- a/src/intel/compiler/brw_fs_lower.cpp
+++ b/src/intel/compiler/brw_fs_lower.cpp
@ -703,3 +703,47 @@ brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                         DEPENDENCY_VARIABLES);
 }
+
+bool
+brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
+{ /* Replaces LOAD_SUBGROUP_INVOCATION with explicit MOV/ADD code. */
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
+      if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
+         continue;
+
+      const fs_builder abld =
+         fs_builder(&s, block, inst).annotate("SubgroupInvocation", NULL);
+      const fs_builder ubld8 = abld.group(8, 0).exec_all();
+
+      if (inst->exec_size == 8) { /* UD dst: load V imm as UW, MOV to UD */
+         assert(inst->dst.type == BRW_TYPE_UD);
+         fs_reg uw = retype(inst->dst, BRW_TYPE_UW);
+         ubld8.MOV(uw, brw_imm_v(0x76543210));
+         ubld8.MOV(inst->dst, uw);
+      } else { /* UW dst: first 8 lanes from V imm, rest derived by ADD */
+         assert(inst->dst.type == BRW_TYPE_UW);
+         abld.UNDEF(inst->dst);
+         ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
+         ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
+         if (inst->exec_size > 16) { /* upper 16 lanes = lower 16 + 16 */
+            const fs_builder ubld16 = abld.group(16, 0).exec_all();
+            ubld16.ADD(byte_offset(inst->dst, 32), inst->dst, brw_imm_uw(16u));
+         }
+      }
+
+      inst->remove(block);
+      progress = true;
+
+      /* This opcode is currently only ever emitted once, so stop scanning
+       * after the first match.  Remove this break if we ever re-emit it.
+       */
+      break;
+   }
+
+   if (progress)
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
+
+   return progress; /* true if an instruction was lowered */
+}
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@ -379,19 +379,9 @@ fs_nir_emit_system_values(nir_to_brw_state &ntb)
    * never end up using it.
    */
   {
-      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
      fs_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
-      reg = abld.vgrf(BRW_TYPE_UW);
-      abld.UNDEF(reg);
-
-      const fs_builder allbld8 = abld.group(8, 0).exec_all();
-      allbld8.MOV(reg, brw_imm_v(0x76543210));
-      if (s.dispatch_width > 8)
-         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
-      if (s.dispatch_width > 16) {
-         const fs_builder allbld16 = abld.group(16, 0).exec_all();
-         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
-      }
+      reg = bld.vgrf(s.dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
+      bld.emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
   }

   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
--- a/src/intel/compiler/brw_fs_opt.cpp
+++ b/src/intel/compiler/brw_fs_opt.cpp
@ -155,6 +155,8 @@ brw_fs_optimize(fs_visitor &s)
   OPT(brw_fs_lower_uniform_pull_constant_loads);

   OPT(brw_fs_lower_find_live_channel);
+
+   OPT(brw_fs_lower_load_subgroup_invocation);
 }

 static unsigned