From ed934ae17b28a353aaba8a826f777acd8eac0e51 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 12:57:28 -0400
Subject: [PATCH] i915/corm: add vec construction optimizations

Optimize vec2/3/4 construction with several strategies:

- same_reg: when all components come from the same register, collapse
  to a single swizzle+negate alias (zero instructions)
- const-swizzle piggybacking: ZERO/ONE sources ride along on the MOV
  emitted for a real register (which also covers its other components)
- per-channel negate: preserve per-channel negate bits through the
  swizzle path instead of emitting separate negation

shader-db (I915_FS=nir): 130/403 compiled, 1614 alu
shader-db (I915_FS=both): nir won 130 (26 identical, 16 tied, 86 better, 2 only),
  156 TGSI, 117 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 88 ++++++++++++++++++++++++-
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 4df750d4734..e4834c94c4b 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -361,14 +361,96 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
    case nir_op_vec3:
    case nir_op_vec4: {
       unsigned n = nir_op_infos[alu->op].num_inputs;
+      uint32_t srcs[4] = { 0 };
+      for (unsigned i = 0; i < n; i++)
+         srcs[i] = alu_src_ureg(c, &alu->src[i]);
+
+      bool same_reg = true;
+      for (unsigned i = 1; i < n; i++) {
+         if ((srcs[i] & UREG_TYPE_NR_MASK) != (srcs[0] & UREG_TYPE_NR_MASK)) {
+            same_reg = false;
+            break;
+         }
+      }
+
+      if (same_reg) {
+         uint32_t base = UREG(GET_UREG_TYPE(srcs[0]), GET_UREG_NR(srcs[0]));
+         uint32_t ch[4] = { X, Y, Z, W };
+         int ng[4] = { 0, 0, 0, 0 };
+         for (unsigned i = 0; i < n; i++) {
+            ch[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7;
+            ng[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1;
+         }
+         i915_release_temp(p, GET_UREG_NR(dest));
+         set_ureg(c, def, negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]),
+                                 ng[0], ng[1], ng[2], ng[3]));
+         return;
+      }
+
       static const uint32_t chan_mask[] = {
          A0_DEST_CHANNEL_X, A0_DEST_CHANNEL_Y,
          A0_DEST_CHANNEL_Z, A0_DEST_CHANNEL_W,
       };
+      bool emitted[4] = { false };
+      uint32_t ch_sel[4];
+      int neg_sel[4] = { 0, 0, 0, 0 };
       for (unsigned i = 0; i < n; i++) {
-         uint32_t s = alu_src_ureg(c, &alu->src[i]);
-         i915_emit_arith(p, A0_MOV, dest, chan_mask[i] & mask, 0,
-                         swizzle(s, X, X, X, X), 0, 0);
+         ch_sel[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7;
+         neg_sel[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1;
+      }
+
+      /* Process real-register sources first, folding in any later ZERO/ONE
+       * const-swizzle sources (and same-register sources) into the same MOV.
+       * Use the unswizzled base register since swizzle() composes.
+       */
+      for (unsigned i = 0; i < n; i++) {
+         if (emitted[i] || ch_sel[i] >= SRC_ZERO)
+            continue;
+         uint32_t base = UREG(GET_UREG_TYPE(srcs[i]), GET_UREG_NR(srcs[i]));
+         uint32_t group_mask = chan_mask[i];
+         uint32_t ch[4] = { X, Y, Z, W };
+         int ng[4] = { 0, 0, 0, 0 };
+         ch[i] = ch_sel[i];
+         ng[i] = neg_sel[i];
+         for (unsigned j = i + 1; j < n; j++) {
+            if (!emitted[j] &&
+                (ch_sel[j] >= SRC_ZERO ||
+                 (srcs[j] & UREG_TYPE_NR_MASK) ==
+                 (srcs[i] & UREG_TYPE_NR_MASK))) {
+               group_mask |= chan_mask[j];
+               ch[j] = ch_sel[j];
+               ng[j] = neg_sel[j];
+               emitted[j] = true;
+            }
+         }
+         i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0,
+                         negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]),
+                                ng[0], ng[1], ng[2], ng[3]),
+                         0, 0);
+         emitted[i] = true;
+      }
+      /* One MOV covers all remaining const-swizzle-only sources */
+      for (unsigned i = 0; i < n; i++) {
+         if (emitted[i])
+            continue;
+         uint32_t group_mask = chan_mask[i];
+         uint32_t ch[4] = { X, Y, Z, W };
+         int ng[4] = { 0, 0, 0, 0 };
+         ch[i] = ch_sel[i];
+         ng[i] = neg_sel[i];
+         for (unsigned j = i + 1; j < n; j++) {
+            if (!emitted[j]) {
+               group_mask |= chan_mask[j];
+               ch[j] = ch_sel[j];
+               ng[j] = neg_sel[j];
+               emitted[j] = true;
+            }
+         }
+         i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0,
+                         negate(swizzle(srcs[i], ch[0], ch[1], ch[2], ch[3]),
+                                ng[0], ng[1], ng[2], ng[3]),
+                         0, 0);
+         emitted[i] = true;
       }
       break;
    }