From 84219892adb423c248e7770925e8095ca84ff9f1 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sat, 24 Feb 2024 01:24:03 -0800
Subject: [PATCH] intel/brw: Make gl_SubgroupInvocation lane index loading SSA

Our code to initialize gl_SubgroupInvocation uses multiple instructions,
some of which are partial writes.  This makes it difficult to analyze
expressions involving gl_SubgroupInvocation, which appear very
frequently in compute shaders.

To make this easier, we add a new virtual opcode which initializes
a full VGRF to the value of gl_SubgroupInvocation.  (We also expand
it to UD for SIMD8 so there are no partial write issues.)  We then
lower it to the original code later on in compilation, after we've
done the bulk of our optimizations.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28666>
---
 src/intel/compiler/brw_eu_defines.h |  2 ++
 src/intel/compiler/brw_fs.cpp       |  2 ++
 src/intel/compiler/brw_fs.h         |  1 +
 src/intel/compiler/brw_fs_lower.cpp | 44 +++++++++++++++++++++++++++++
 src/intel/compiler/brw_fs_nir.cpp   | 14 ++-------
 src/intel/compiler/brw_fs_opt.cpp   |  2 ++
 6 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 05e9fd46c7f..d00b4990755 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -536,6 +536,8 @@ enum opcode {
 
    SHADER_OPCODE_READ_ARCH_REG,
 
+   SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION,
+
    RT_OPCODE_TRACE_RAY_LOGICAL,
 };
 
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index c4126d694cf..6ca33a4e313 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2464,6 +2464,8 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
       return "btd_retire_logical";
    case SHADER_OPCODE_READ_ARCH_REG:
       return "read_arch_reg";
+   case SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION:
+      return "load_subgroup_invocation";
    }
 
    unreachable("not reached");
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index eb92c84da70..dde29f8a284 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -599,6 +599,7 @@ bool brw_fs_lower_derivatives(fs_visitor &s);
 bool brw_fs_lower_dpas(fs_visitor &s);
 bool brw_fs_lower_find_live_channel(fs_visitor &s);
 bool brw_fs_lower_integer_multiplication(fs_visitor &s);
+bool brw_fs_lower_load_subgroup_invocation(fs_visitor &s);
 bool brw_fs_lower_logical_sends(fs_visitor &s);
 bool brw_fs_lower_pack(fs_visitor &s);
 bool brw_fs_lower_load_payload(fs_visitor &s);
diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp
index cdab24904ea..d6d4da8e3fc 100644
--- a/src/intel/compiler/brw_fs_lower.cpp
+++ b/src/intel/compiler/brw_fs_lower.cpp
@@ -703,3 +703,47 @@ brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
    s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                          DEPENDENCY_VARIABLES);
 }
+
+bool
+brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
+{
+   bool progress = false;
+   /* Replace the virtual opcode with the real MOV/ADD lane-ID sequence. */
+   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
+      if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
+         continue;
+
+      const fs_builder abld =
+         fs_builder(&s, block, inst).annotate("SubgroupInvocation", NULL);
+      const fs_builder ubld8 = abld.group(8, 0).exec_all();
+      /* SIMD8 dst is UD: write the lane IDs as UW, then widen in place. */
+      if (inst->exec_size == 8) {
+         assert(inst->dst.type == BRW_TYPE_UD);
+         fs_reg uw = retype(inst->dst, BRW_TYPE_UW);
+         ubld8.MOV(uw, brw_imm_v(0x76543210));
+         ubld8.MOV(inst->dst, uw);
+      } else { /* wider than 8: UW dst, built up from lanes 0..7 */
+         assert(inst->dst.type == BRW_TYPE_UW);
+         abld.UNDEF(inst->dst);
+         ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
+         ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
+         if (inst->exec_size > 16) {
+            const fs_builder ubld16 = abld.group(16, 0).exec_all();
+            ubld16.ADD(byte_offset(inst->dst, 32), inst->dst, brw_imm_uw(16u));
+         }
+      }
+
+      inst->remove(block);
+      progress = true;
+
+      /* This is currently emitted only once, so there's no point looking for
+       * more cases.  Remove this break if we ever emit it more than once.
+       */
+      break;
+   }
+
+   if (progress)
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 07db93600a0..371ae4d2d8b 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -379,19 +379,9 @@ fs_nir_emit_system_values(nir_to_brw_state &ntb)
     * never end up using it.
     */
    {
-      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
       fs_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
-      reg = abld.vgrf(BRW_TYPE_UW);
-      abld.UNDEF(reg);
-
-      const fs_builder allbld8 = abld.group(8, 0).exec_all();
-      allbld8.MOV(reg, brw_imm_v(0x76543210));
-      if (s.dispatch_width > 8)
-         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
-      if (s.dispatch_width > 16) {
-         const fs_builder allbld16 = abld.group(16, 0).exec_all();
-         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
-      }
+      reg = bld.vgrf(s.dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
+      bld.emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
    }
 
    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
diff --git a/src/intel/compiler/brw_fs_opt.cpp b/src/intel/compiler/brw_fs_opt.cpp
index 66e6a3b7057..fb0e0b99c8d 100644
--- a/src/intel/compiler/brw_fs_opt.cpp
+++ b/src/intel/compiler/brw_fs_opt.cpp
@@ -155,6 +155,8 @@ brw_fs_optimize(fs_visitor &s)
    OPT(brw_fs_lower_uniform_pull_constant_loads);
 
    OPT(brw_fs_lower_find_live_channel);
+
+   OPT(brw_fs_lower_load_subgroup_invocation);
 }
 
 static unsigned