From 2f5686e212442f5612353031eb87dd2d2a89de14 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Mon, 4 May 2026 15:08:38 -0400
Subject: [PATCH 01/20] i915: pass NIR to draw instead of pre-converted TGSI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

draw_vs.c already handles the non-native-integer NIR→TGSI conversion
internally, so i915 doesn't need to do it. Keep nir_lower_point_size
(i915-specific lowering) and pass the result to draw as NIR.

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_state.c | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 8d786c02e41..af45247355d 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -667,28 +667,11 @@ i915_create_vs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
 {
    struct i915_context *i915 = i915_context(pipe);
-   void *vertex_shader;
 
-   struct pipe_shader_state from_nir = {PIPE_SHADER_IR_TGSI};
-   if (templ->type == PIPE_SHADER_IR_NIR) {
-      nir_shader *s = templ->ir.nir;
+   if (templ->type == PIPE_SHADER_IR_NIR)
+      NIR_PASS(_, templ->ir.nir, nir_lower_point_size, 1.0, 255.0);
 
-      NIR_PASS(_, s, nir_lower_point_size, 1.0, 255.0);
-
-      /* The gallivm draw path doesn't support non-native-integers NIR shaders,
-       * st/mesa does native-integers for the screen as a whole rather than
-       * per-stage, and i915 FS can't do native integers.  So, convert to TGSI,
-       * where the draw path *does* support non-native-integers.
-       */
-      from_nir.tokens = nir_to_tgsi(s, pipe->screen);
-      templ = &from_nir;
-   }
-
-   vertex_shader = draw_create_vertex_shader(i915->draw, templ);
-
-   FREE((void *)from_nir.tokens);
-
-   return vertex_shader;
+   return draw_create_vertex_shader(i915->draw, templ);
 }
 
 static void

From 5e1ada315c30838b57a78cb7ea4e238a51e7d653 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Tue, 5 May 2026 16:15:29 -0400
Subject: [PATCH 02/20] i915: improve shader-db stats reporting

Report actual instruction counts (alu+tex) instead of program dwords/3,
and add a separate "alu" field for the 64-instruction bottleneck metric.
The "temps" value is still file_max[TGSI_FILE_TEMPORARY] + 1 here; it
switches to the allocated temp register count in a later patch.

Before: "69 inst, 2 tex, 3 tex_indirect, 4 temps, 5 const"
After:  "21 instructions, 19 alu, 2 tex, 2 tex_indirect, 16 temps, 3 const"

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_translate.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index b6cfb2a3dfb..f07095ee110 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -1032,9 +1032,11 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
       if (i915) {
          util_debug_message(
             &i915->debug, SHADER_INFO,
-            "%s shader: %d inst, %d tex, %d tex_indirect, %d temps, %d const",
+            "%s shader: %d instructions, %d alu, %d tex, %d tex_indirect, "
+            "%d temps, %d const",
             _mesa_shader_stage_to_abbrev(MESA_SHADER_FRAGMENT),
-            (int)program_size, p->nr_tex_insn, p->nr_tex_indirect,
+            p->nr_alu_insn + p->nr_tex_insn,
+            p->nr_alu_insn, p->nr_tex_insn, p->nr_tex_indirect,
             p->shader->info.file_max[TGSI_FILE_TEMPORARY] + 1,
             ifs->num_constants);
       }

From b1e709384bf43f48a704841ce3f7f783c6f9339e Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 7 May 2026 15:45:38 -0400
Subject: [PATCH 03/20] i915: emit passthrough for empty TGSI fragment shaders

The TGSI compiler rejected empty fragment shaders (num_instructions
== 1, just TGSI_END) as errors. Instead, emit a passthrough program.

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_translate.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index f07095ee110..ba6f6172e85 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -1006,12 +1006,11 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
                          p->nr_decl_insn, I915_MAX_DECL_INSN);
    }
 
-   /* hw doesn't seem to like empty frag programs (num_instructions == 1 is just
-    * TGSI_END), even when the depth write fixup gets emitted below - maybe that
-    * one is fishy, too?
-    */
-   if (ifs->info.num_instructions == 1)
-      i915_program_error(p, "Empty fragment shader");
+   if (ifs->info.num_instructions == 1) {
+      i915_use_passthrough_shader(ifs);
+      ifs->nr_alu_insn = 1;
+      goto done;
+   }
 
    if (strlen(p->error) != 0) {
       i915_use_passthrough_shader(ifs);
@@ -1042,6 +1041,7 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
       }
    }
 
+done:
    if (strlen(p->error) != 0)
       ifs->error = p->error;
    else

From badd52c7d5afbc191f4f51e15f56ae590c630451 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 7 May 2026 14:46:42 -0400
Subject: [PATCH 04/20] i915: fix incorrect texcoord optimization in TGSI
 compiler

i915_fpc_optimize_mov_before_tex replaces a MOV+TEX pair with a
direct TEX from the input register when the MOV copies from the
input with identity swizzle. But it only checked the source swizzle,
not the MOV's writemask. When the MOV wrote a subset of the channels
the TEX reads (e.g., MOV TEMP.y, IN.y before a 2D TEX that reads
XY), the optimization replaced the TEX source with IN, losing the X
channel that was set by a different MOV.

This caused incorrect texture sampling coordinates in shaders with
multi-MOV texcoord construction (blur filters, shadow maps, etc.).

Fix: verify the MOV's dest writemask covers all channels the TEX
instruction reads before applying the optimization.

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_optimize.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/i915/i915_fpc_optimize.c b/src/gallium/drivers/i915/i915_fpc_optimize.c
index b4ae362dfef..731f2444fec 100644
--- a/src/gallium/drivers/i915/i915_fpc_optimize.c
+++ b/src/gallium/drivers/i915/i915_fpc_optimize.c
@@ -405,6 +405,8 @@ i915_fpc_optimize_mov_before_tex(struct i915_optimize_context *ctx,
        target_is_texture2d(next->FullInstruction.Texture.Texture) &&
        same_src_dst_reg(&next->FullInstruction.Src[0],
                         &current->FullInstruction.Dst[0]) &&
+       (current->FullInstruction.Dst[0].Register.WriteMask &
+        i915_tex_mask(next)) == i915_tex_mask(next) &&
        is_unswizzled(&current->FullInstruction.Src[0], i915_tex_mask(next)) &&
        unused_from(ctx, &current->FullInstruction.Dst[0], index)) {
       memcpy(&next->FullInstruction.Src[0], &current->FullInstruction.Src[0],

From 4087e3b7ef1e0b9307b84e8a7b94ba6f0bf7aae9 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 12:43:25 -0400
Subject: [PATCH 05/20] i915: refactor constant and compiler infrastructure for
 NIR backend

Rework the constant register encoding to track per-channel ownership
(I915_CONSTFLAG_IMM / I915_CONSTFLAG_USER_CH) instead of whole-register
flags, allowing compiler immediates and user UBO values to share a
constant register on different channels.  Update emit_constants() to
handle per-channel source selection at upload time.

Add i915_emit_const1f_prefer() for packing scalar constants into a
preferred register, reducing dual-constant conflicts.

Move i915_program_error(), i915_use_passthrough_shader(), and negate()
from i915_fpc_translate.c to shared locations (i915_fpc_emit.c /
i915_fpc.h) so the NIR backend can use them.

Fix i915_emit_texld() to use a utemp instead of a temp register for
texcoord swizzle copies, avoiding unnecessary tex indirect phase
boundaries.  Add a fallback path that copies to a utemp when bumping
the phase count would exceed the hardware limit.

Add nr_alu_insn, nr_tex_insn, nr_tex_indirect, nr_temps, writes_z,
and input semantic tracking to i915_fragment_shader for use by the
NIR backend's multi-variant comparison framework.

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_context.h       |  24 ++-
 src/gallium/drivers/i915/i915_fpc.h           |  23 +++
 src/gallium/drivers/i915/i915_fpc_emit.c      | 159 +++++++++++++-----
 src/gallium/drivers/i915/i915_fpc_translate.c |  57 +------
 src/gallium/drivers/i915/i915_state_emit.c    |  43 ++---
 5 files changed, 190 insertions(+), 116 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 0bbbd66662b..ef81f69740c 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -88,8 +88,15 @@ struct i915_winsys_batchbuffer;
 
 #define I915_MAX_CONSTANT 32
 
-/** See constant_flags[] below */
-#define I915_CONSTFLAG_USER 0x1f
+/**
+ * Per-channel flags for constant_flags[].
+ * Bits 0-3: channel has a compiler immediate.
+ * Bits 4-7: channel has a user (UBO) value uploaded at draw time.
+ * A channel is available when neither bit is set.
+ */
+#define I915_CONSTFLAG_IMM(ch)     (1 << (ch))
+#define I915_CONSTFLAG_USER_CH(ch) (1 << ((ch) + 4))
+#define I915_CONSTFLAG_USER        0xf0
 
 /**
  * Subclass of pipe_shader_state
@@ -103,6 +110,10 @@ struct i915_fragment_shader {
 
    uint32_t *program;
    uint32_t program_len;
+   uint32_t nr_alu_insn;
+   uint32_t nr_tex_insn;
+   uint32_t nr_tex_indirect;
+   uint32_t nr_temps;
 
    /**
     * constants introduced during translation.
@@ -134,12 +145,15 @@ struct i915_fragment_shader {
    } texcoords[I915_TEX_UNITS];
 
    bool reads_pntc;
+   bool writes_z;
+
+   unsigned num_inputs;
+   uint8_t input_semantic_name[PIPE_MAX_SHADER_INPUTS];
+   uint8_t input_semantic_index[PIPE_MAX_SHADER_INPUTS];
 
-   /* Set if the shader is an internal (blit, etc.) shader that shouldn't debug
-    * log by default. */
    bool internal;
 
-   char *error; /* Any error message from compiling this shader (or NULL) */
+   char *error;
 };
 
 struct i915_cache_context;
diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h
index d234042dea2..fe0d0f1e544 100644
--- a/src/gallium/drivers/i915/i915_fpc.h
+++ b/src/gallium/drivers/i915/i915_fpc.h
@@ -136,6 +136,15 @@ swizzle(int reg, uint32_t x, uint32_t y, uint32_t z, uint32_t w)
            CHANNEL_SRC(GET_CHANNEL_SRC(reg, w), 3));
 }
 
+/* Flip the negate bit of each channel whose flag has bit 0 set. */
+static inline int negate(int reg, int x, int y, int z, int w)
+{
+   return reg ^ (((x & 1) << UREG_CHANNEL_X_NEGATE_SHIFT) |
+                 ((y & 1) << UREG_CHANNEL_Y_NEGATE_SHIFT) |
+                 ((z & 1) << UREG_CHANNEL_Z_NEGATE_SHIFT) |
+                 ((w & 1) << UREG_CHANNEL_W_NEGATE_SHIFT));
+}
+
 #define A0_DEST(reg) (((reg)&UREG_TYPE_NR_MASK) >> UREG_A0_DEST_SHIFT_LEFT)
 #define D0_DEST(reg) (((reg)&UREG_TYPE_NR_MASK) >> UREG_A0_DEST_SHIFT_LEFT)
 #define T0_DEST(reg) (((reg)&UREG_TYPE_NR_MASK) >> UREG_A0_DEST_SHIFT_LEFT)
@@ -173,8 +182,20 @@ swizzle(int reg, uint32_t x, uint32_t y, uint32_t z, uint32_t w)
  */
 extern void i915_translate_fragment_program(struct i915_context *i915,
                                             struct i915_fragment_shader *fs);
+struct corm_compile_opts { /* options passed to the NIR FS translator */
+   bool deferred_const; /* NOTE(review): semantics not shown here - document */
+   bool seq_sne_opt;    /* NOTE(review): semantics not shown here - document */
+};
+
+extern void i915_translate_fragment_program_nir(struct i915_context *i915,
+                                                struct i915_fragment_shader *ifs,
+                                                struct nir_shader *s,
+                                                const struct corm_compile_opts *opts);
+extern void i915_use_passthrough_shader(struct i915_fragment_shader *fs);
+extern void i915_program_error(struct i915_fp_compile *p, const char *msg, ...);
 
 extern uint32_t i915_get_temp(struct i915_fp_compile *p);
+extern void i915_release_temp(struct i915_fp_compile *p, int reg);
 extern uint32_t i915_get_utemp(struct i915_fp_compile *p);
 extern void i915_release_utemps(struct i915_fp_compile *p);
 
@@ -191,6 +212,8 @@ extern uint32_t i915_emit_decl(struct i915_fp_compile *p, uint32_t type,
                                uint32_t nr, uint32_t d0_flags);
 
 extern uint32_t i915_emit_const1f(struct i915_fp_compile *p, float c0);
+extern uint32_t i915_emit_const1f_prefer(struct i915_fp_compile *p, float c0,
+                                         int preferred_reg);
 
 extern uint32_t i915_emit_const2f(struct i915_fp_compile *p, float c0,
                                   float c1);
diff --git a/src/gallium/drivers/i915/i915_fpc_emit.c b/src/gallium/drivers/i915/i915_fpc_emit.c
index 603c79e089f..aeace4396ca 100644
--- a/src/gallium/drivers/i915/i915_fpc_emit.c
+++ b/src/gallium/drivers/i915/i915_fpc_emit.c
@@ -25,11 +25,45 @@
  *
  **************************************************************************/
 
+#include <stdarg.h>
+
+#include "util/ralloc.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 #include "i915_context.h"
 #include "i915_fpc.h"
 #include "i915_reg.h"
 
+void
+i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
+{
+   va_list args; /* printf-style arguments that follow msg */
+   va_start(args, msg);
+   ralloc_vasprintf_append(&p->error, msg, args); /* appends to p->error */
+   va_end(args);
+}
+
+static const unsigned passthrough_program[] = {
+   _3DSTATE_PIXEL_SHADER_PROGRAM | ((1 * 3) - 1),
+   (A0_MOV | (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) | A0_DEST_CHANNEL_ALL |
+    (REG_TYPE_R << A0_SRC0_TYPE_SHIFT) | (0 << A0_SRC0_NR_SHIFT)), /* move to output color */
+   ((SRC_ONE << A1_SRC0_CHANNEL_X_SHIFT) |   /* x: SRC_ONE  */
+    (SRC_ZERO << A1_SRC0_CHANNEL_Y_SHIFT) |  /* y: SRC_ZERO */
+    (SRC_ZERO << A1_SRC0_CHANNEL_Z_SHIFT) |  /* z: SRC_ZERO */
+    (SRC_ONE << A1_SRC0_CHANNEL_W_SHIFT)),   /* w: SRC_ONE  */
+   0};
+
+void
+i915_use_passthrough_shader(struct i915_fragment_shader *fs)
+{
+   fs->num_constants = 0; /* always cleared, even if the copy below fails */
+   fs->program = (uint32_t *)MALLOC(sizeof(passthrough_program));
+   if (!fs->program)
+      return; /* allocation failed: program_len is left untouched */
+   memcpy(fs->program, passthrough_program, sizeof(passthrough_program));
+   fs->program_len = ARRAY_SIZE(passthrough_program);
+}
+
 uint32_t
 i915_get_temp(struct i915_fp_compile *p)
 {
@@ -43,7 +77,7 @@ i915_get_temp(struct i915_fp_compile *p)
    return bit - 1;
 }
 
-static void
+void
 i915_release_temp(struct i915_fp_compile *p, int reg)
 {
    p->temp_flag &= ~(1 << reg);
@@ -179,8 +213,6 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask,
 {
    const uint32_t k = UREG(GET_UREG_TYPE(coord), GET_UREG_NR(coord));
 
-   int temp = -1;
-
    uint32_t coord_used = 0xf << UREG_CHANNEL_X_SHIFT;
    if (coord_mask & TGSI_WRITEMASK_Y)
       coord_used |= 0xf << UREG_CHANNEL_Y_SHIFT;
@@ -191,13 +223,10 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask,
 
    if ((coord & coord_used) != (k & coord_used) ||
        GET_UREG_TYPE(coord) == REG_TYPE_CONST) {
-      /* texcoord is swizzled or negated.  Need to allocate a new temporary
-       * register (a utemp / unpreserved temp) won't do.
+      /* texcoord is swizzled or negated.  Need a temporary to hold it.
+       * Use a utemp so it doesn't create a tex indirect phase boundary.
        */
-      uint32_t tempReg;
-
-      temp = i915_get_temp(p);          /* get temp reg index */
-      tempReg = UREG(REG_TYPE_R, temp); /* make i915 register */
+      uint32_t tempReg = i915_get_utemp(p);
 
       i915_emit_arith(p, A0_MOV, tempReg,
                       A0_DEST_CHANNEL_ALL, /* dest reg, writemask */
@@ -227,11 +256,21 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask,
          p->nr_tex_indirect++;
 
       /* Reading from an r# register whose contents depend on output of the
-       * current phase defines a phase boundary.
+       * current phase defines a phase boundary.  Prefer just bumping the
+       * phase count (free), but if we'd exceed the HW limit, copy to a
+       * utemp instead (costs 1 ALU instruction).
        */
       if (GET_UREG_TYPE(coord) == REG_TYPE_R &&
-          p->register_phases[GET_UREG_NR(coord)] == p->nr_tex_indirect)
-         p->nr_tex_indirect++;
+          p->register_phases[GET_UREG_NR(coord)] == p->nr_tex_indirect) {
+         if (p->nr_tex_indirect + 1 < I915_MAX_TEX_INDIRECT) {
+            p->nr_tex_indirect++;
+         } else {
+            uint32_t tmp = i915_get_utemp(p);
+            i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0,
+                            coord, 0, 0);
+            coord = tmp;
+         }
+      }
 
       if (p->csr < p->program + I915_PROGRAM_SIZE) {
          *(p->csr++) = (opcode | T0_DEST(dest) | T0_SAMPLER(sampler));
@@ -246,40 +285,75 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask,
       p->nr_tex_insn++;
    }
 
-   if (temp >= 0)
-      i915_release_temp(p, temp);
-
    return dest;
 }
 
+/* Place scalar c0 in one channel of constant register reg, or UREG_BAD. */
+static uint32_t
+i915_try_const1f_in_reg(struct i915_fp_compile *p, float c0, unsigned reg)
+{
+   struct i915_fragment_shader *ifs = p->shader;
+   for (unsigned ch = 0; ch < 4; ch++) {
+      /* Usable: not user-owned, and either free or already holding c0. */
+      bool is_user = ifs->constant_flags[reg] & I915_CONSTFLAG_USER_CH(ch);
+      bool is_imm = ifs->constant_flags[reg] & I915_CONSTFLAG_IMM(ch);
+      if (is_user || (is_imm && ifs->constants[reg][ch] != c0))
+         continue;
+      ifs->constants[reg][ch] = c0;
+      ifs->constant_flags[reg] |= I915_CONSTFLAG_IMM(ch);
+      if (ifs->num_constants < reg + 1)
+         ifs->num_constants = reg + 1;
+      return swizzle(UREG(REG_TYPE_CONST, reg), ch, ZERO, ZERO, ONE);
+   }
+   return UREG_BAD;
+}
+
+/* Find a slot for c0, trying preferred_reg first; error + 0 if none. */
+static uint32_t
+i915_try_emit_const1f(struct i915_fp_compile *p, float c0, int preferred_reg)
+{
+   /* Ignore a hint outside the register range scanned by the loop below. */
+   if (preferred_reg >= 0 && preferred_reg < I915_MAX_CONSTANT) {
+      uint32_t r = i915_try_const1f_in_reg(p, c0, preferred_reg);
+      if (r != UREG_BAD)
+         return r;
+   }
+   for (unsigned reg = 0; reg < I915_MAX_CONSTANT; reg++) {
+      uint32_t r = i915_try_const1f_in_reg(p, c0, reg);
+      if (r != UREG_BAD)
+         return r;
+   }
+   i915_program_error(p, "i915_emit_const1f: out of constants");
+   return 0;
+}
+
 uint32_t
 i915_emit_const1f(struct i915_fp_compile *p, float c0)
 {
-   struct i915_fragment_shader *ifs = p->shader;
-   unsigned reg, idx;
-
    if (c0 == 0.0)
       return swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
    if (c0 == 1.0)
       return swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE);
+   if (c0 == -1.0)
+      return negate(swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE),
+                    1, 1, 1, 1);
 
-   for (reg = 0; reg < I915_MAX_CONSTANT; reg++) {
-      if (ifs->constant_flags[reg] == I915_CONSTFLAG_USER)
-         continue;
-      for (idx = 0; idx < 4; idx++) {
-         if (!(ifs->constant_flags[reg] & (1 << idx)) ||
-             ifs->constants[reg][idx] == c0) {
-            ifs->constants[reg][idx] = c0;
-            ifs->constant_flags[reg] |= 1 << idx;
-            if (reg + 1 > ifs->num_constants)
-               ifs->num_constants = reg + 1;
-            return swizzle(UREG(REG_TYPE_CONST, reg), idx, ZERO, ZERO, ONE);
-         }
-      }
-   }
+   return i915_try_emit_const1f(p, c0, -1);
+}
 
-   i915_program_error(p, "i915_emit_const1f: out of constants");
-   return 0;
+/**
+ * Like i915_emit_const1f(), but try to place the scalar in preferred_reg
+ * (pass a negative value for no preference) before scanning all registers.
+ */
+uint32_t
+i915_emit_const1f_prefer(struct i915_fp_compile *p, float c0,
+                         int preferred_reg)
+{
+   /* 0, 1 and -1 are swizzle-only encodings that need no constant slot;
+    * i915_emit_const1f() already produces them. */
+   if (c0 == 0.0 || c0 == 1.0 || c0 == -1.0)
+      return i915_emit_const1f(p, c0);
+   return i915_try_emit_const1f(p, c0, preferred_reg);
+}
 
 uint32_t
@@ -301,14 +375,15 @@ i915_emit_const2f(struct i915_fp_compile *p, float c0, float c1)
    // XXX emit swizzle here for 0, 1, -1 and any combination thereof
    // we can use swizzle + neg for that
    for (reg = 0; reg < I915_MAX_CONSTANT; reg++) {
-      if (ifs->constant_flags[reg] == 0xf ||
-          ifs->constant_flags[reg] == I915_CONSTFLAG_USER)
+      uint8_t occupied = (ifs->constant_flags[reg] & 0xf) |
+                         (ifs->constant_flags[reg] >> 4);
+      if (occupied == 0xf)
          continue;
       for (idx = 0; idx < 3; idx++) {
-         if (!(ifs->constant_flags[reg] & (3 << idx))) {
+         if (!(occupied & (3 << idx))) {
             ifs->constants[reg][idx + 0] = c0;
             ifs->constants[reg][idx + 1] = c1;
-            ifs->constant_flags[reg] |= 3 << idx;
+            ifs->constant_flags[reg] |= (3 << idx); /* immediate bits */
             if (reg + 1 > ifs->num_constants)
                ifs->num_constants = reg + 1;
             return swizzle(UREG(REG_TYPE_CONST, reg), idx, idx + 1, ZERO, ONE);
@@ -330,9 +405,9 @@ i915_emit_const4f(struct i915_fp_compile *p, float c0, float c1, float c2,
    // XXX emit swizzle here for 0, 1, -1 and any combination thereof
    // we can use swizzle + neg for that
    for (reg = 0; reg < I915_MAX_CONSTANT; reg++) {
-      if (ifs->constant_flags[reg] == 0xf && ifs->constants[reg][0] == c0 &&
-          ifs->constants[reg][1] == c1 && ifs->constants[reg][2] == c2 &&
-          ifs->constants[reg][3] == c3) {
+      if ((ifs->constant_flags[reg] & 0x0f) == 0x0f &&
+          ifs->constants[reg][0] == c0 && ifs->constants[reg][1] == c1 &&
+          ifs->constants[reg][2] == c2 && ifs->constants[reg][3] == c3) {
          return UREG(REG_TYPE_CONST, reg);
       } else if (ifs->constant_flags[reg] == 0) {
 
@@ -340,7 +415,7 @@ i915_emit_const4f(struct i915_fp_compile *p, float c0, float c1, float c2,
          ifs->constants[reg][1] = c1;
          ifs->constants[reg][2] = c2;
          ifs->constants[reg][3] = c3;
-         ifs->constant_flags[reg] = 0xf;
+         ifs->constant_flags[reg] = 0x0f;
          if (reg + 1 > ifs->num_constants)
             ifs->num_constants = reg + 1;
          return UREG(REG_TYPE_CONST, reg);
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index ba6f6172e85..9277e55e9e3 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -54,55 +54,9 @@
  * Simple pass-through fragment shader to use when we don't have
  * a real shader (or it fails to compile for some reason).
  */
-static unsigned passthrough_program[] = {
-   _3DSTATE_PIXEL_SHADER_PROGRAM | ((1 * 3) - 1),
-   /* move to output color:
-    */
-   (A0_MOV | (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) | A0_DEST_CHANNEL_ALL |
-    (REG_TYPE_R << A0_SRC0_TYPE_SHIFT) | (0 << A0_SRC0_NR_SHIFT)),
-   ((SRC_ONE << A1_SRC0_CHANNEL_X_SHIFT) |
-    (SRC_ZERO << A1_SRC0_CHANNEL_Y_SHIFT) |
-    (SRC_ZERO << A1_SRC0_CHANNEL_Z_SHIFT) |
-    (SRC_ONE << A1_SRC0_CHANNEL_W_SHIFT)),
-   0};
-
 /**
  * component-wise negation of ureg
  */
-static inline int
-negate(int reg, int x, int y, int z, int w)
-{
-   /* Another neat thing about the UREG representation */
-   return reg ^ (((x & 1) << UREG_CHANNEL_X_NEGATE_SHIFT) |
-                 ((y & 1) << UREG_CHANNEL_Y_NEGATE_SHIFT) |
-                 ((z & 1) << UREG_CHANNEL_Z_NEGATE_SHIFT) |
-                 ((w & 1) << UREG_CHANNEL_W_NEGATE_SHIFT));
-}
-
-/**
- * In the event of a translation failure, we'll generate a simple color
- * pass-through program.
- */
-static void
-i915_use_passthrough_shader(struct i915_fragment_shader *fs)
-{
-   fs->program = (uint32_t *)MALLOC(sizeof(passthrough_program));
-   if (fs->program) {
-      memcpy(fs->program, passthrough_program, sizeof(passthrough_program));
-      fs->program_len = ARRAY_SIZE(passthrough_program);
-   }
-   fs->num_constants = 0;
-}
-
-void
-i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
-{
-   va_list args;
-   va_start(args, msg);
-   ralloc_vasprintf_append(&p->error, msg, args);
-   va_end(args);
-}
-
 static uint32_t
 get_mapping(struct i915_fragment_shader *fs, enum tgsi_semantic semantic,
             int index)
@@ -1023,6 +977,10 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
       assert(!ifs->program);
 
       ifs->program_len = decl_size + program_size;
+      ifs->nr_alu_insn = p->nr_alu_insn;
+      ifs->nr_tex_insn = p->nr_tex_insn;
+      ifs->nr_tex_indirect = p->nr_tex_indirect;
+      ifs->nr_temps = util_bitcount(p->temp_flag);
       ifs->program = (uint32_t *)MALLOC(ifs->program_len * sizeof(uint32_t));
       memcpy(ifs->program, p->declarations, decl_size * sizeof(uint32_t));
       memcpy(&ifs->program[decl_size], p->program,
@@ -1034,10 +992,9 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
             "%s shader: %d instructions, %d alu, %d tex, %d tex_indirect, "
             "%d temps, %d const",
             _mesa_shader_stage_to_abbrev(MESA_SHADER_FRAGMENT),
-            p->nr_alu_insn + p->nr_tex_insn,
-            p->nr_alu_insn, p->nr_tex_insn, p->nr_tex_indirect,
-            p->shader->info.file_max[TGSI_FILE_TEMPORARY] + 1,
-            ifs->num_constants);
+            ifs->nr_alu_insn + ifs->nr_tex_insn,
+            ifs->nr_alu_insn, ifs->nr_tex_insn, ifs->nr_tex_indirect,
+            ifs->nr_temps, ifs->num_constants);
       }
    }
 
diff --git a/src/gallium/drivers/i915/i915_state_emit.c b/src/gallium/drivers/i915/i915_state_emit.c
index f3561b143e8..8a92d6d0a7b 100644
--- a/src/gallium/drivers/i915/i915_state_emit.c
+++ b/src/gallium/drivers/i915/i915_state_emit.c
@@ -332,28 +332,33 @@ emit_constants(struct i915_context *i915)
       OUT_BATCH((1 << nr) - 1);
 
       for (i = 0; i < nr; i++) {
-         const uint32_t *c;
-         if (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER) {
-            /* grab user-defined constant */
-            c = (uint32_t *)i915_buffer(i915->constants[MESA_SHADER_FRAGMENT])
-                   ->data;
+         uint8_t flags = i915->fs->constant_flags[i];
+         uint8_t user_mask = flags >> 4;
+
+         if (!user_mask) {
+            const uint32_t *c = (uint32_t *)i915->fs->constants[i];
+            OUT_BATCH(c[0]);
+            OUT_BATCH(c[1]);
+            OUT_BATCH(c[2]);
+            OUT_BATCH(c[3]);
+         } else if (user_mask == 0xf) {
+            const uint32_t *c =
+               (uint32_t *)i915_buffer(i915->constants[MESA_SHADER_FRAGMENT])
+                  ->data;
             c += 4 * i;
+            OUT_BATCH(c[0]);
+            OUT_BATCH(c[1]);
+            OUT_BATCH(c[2]);
+            OUT_BATCH(c[3]);
          } else {
-            /* emit program constant */
-            c = (uint32_t *)i915->fs->constants[i];
+            const uint32_t *user =
+               (uint32_t *)i915_buffer(i915->constants[MESA_SHADER_FRAGMENT])
+                  ->data;
+            user += 4 * i;
+            const uint32_t *imm = (uint32_t *)i915->fs->constants[i];
+            for (unsigned ch = 0; ch < 4; ch++)
+               OUT_BATCH((user_mask & (1 << ch)) ? user[ch] : imm[ch]);
          }
-#if 0 /* debug */
-         {
-            float *f = (float *) c;
-            printf("Const %2d: %f %f %f %f %s\n", i, f[0], f[1], f[2], f[3],
-                   (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER
-                    ? "user" : "immediate"));
-         }
-#endif
-         OUT_BATCH(*c++);
-         OUT_BATCH(*c++);
-         OUT_BATCH(*c++);
-         OUT_BATCH(*c++);
       }
    }
 }

From 3d3b5577804919ed1cb4db2a7bba89edf4486a1b Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 12:45:51 -0400
Subject: [PATCH 06/20] i915/corm: add NIR fragment shader backend

Bare-minimum NIR-to-i915 fragment shader compiler with multi-variant
framework, lexicographic cost metric (ALU > tex_indirect > temps > consts),
and winner-tagged stats output.

Stats are emitted once per shader with [NIR] or [TGSI] tag indicating
which backend won.  The corm_compile_opts struct is available for
multi-variant compilation (currently empty).

Assisted-by: Claude

shader-db (I915_FS=nir): 48/403 compiled, 65 alu
shader-db (I915_FS=both): nir won 48 (26 identical, 16 tied, 6 better),
  236 TGSI, 119 neither
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 821 ++++++++++++++++++++++++
 src/gallium/drivers/i915/i915_screen.c  |   7 +
 src/gallium/drivers/i915/i915_state.c   | 328 +++++++++-
 src/gallium/drivers/i915/meson.build    |   1 +
 4 files changed, 1130 insertions(+), 27 deletions(-)
 create mode 100644 src/gallium/drivers/i915/i915_fpc_nir.c

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
new file mode 100644
index 00000000000..346e06d0a34
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -0,0 +1,821 @@
+/*
+ * Copyright 2025 Red Hat, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/nir/nir.h"
+#include "tgsi/tgsi_from_mesa.h"
+#include "util/log.h"
+#include "util/ralloc.h"
+#include "util/u_memory.h"
+
+#include "i915_context.h"
+#include "i915_debug.h"
+#include "i915_debug_private.h"
+#include "i915_fpc.h"
+#include "i915_reg.h"
+
+struct nir_to_i915 {
+   struct corm_compile_opts opts;
+   struct i915_fp_compile *p;
+   struct i915_fragment_shader *ifs;
+
+   uint32_t *ureg_map;
+   unsigned ureg_map_size;
+};
+
+static void
+set_ureg(struct nir_to_i915 *c, nir_def *def, uint32_t ureg)
+{
+   assert(def->index < c->ureg_map_size);
+   c->ureg_map[def->index] = ureg;
+}
+
+static uint32_t
+src_ureg(struct nir_to_i915 *c, nir_src *src)
+{
+   assert(src->ssa->index < c->ureg_map_size);
+   return c->ureg_map[src->ssa->index];
+}
+
+static uint32_t
+alu_src_ureg(struct nir_to_i915 *c, nir_alu_src *src)
+{
+   uint32_t ureg = src_ureg(c, &src->src);
+   return swizzle(ureg,
+                  src->swizzle[0], src->swizzle[1],
+                  src->swizzle[2], src->swizzle[3]);
+}
+
+static uint32_t
+def_mask(nir_def *def)
+{
+   uint32_t mask = 0;
+   if (def->num_components >= 1) mask |= A0_DEST_CHANNEL_X;
+   if (def->num_components >= 2) mask |= A0_DEST_CHANNEL_Y;
+   if (def->num_components >= 3) mask |= A0_DEST_CHANNEL_Z;
+   if (def->num_components >= 4) mask |= A0_DEST_CHANNEL_W;
+   return mask;
+}
+
+static uint32_t
+writemask_to_mask(unsigned wm)
+{
+   uint32_t mask = 0;
+   if (wm & 1) mask |= A0_DEST_CHANNEL_X;
+   if (wm & 2) mask |= A0_DEST_CHANNEL_Y;
+   if (wm & 4) mask |= A0_DEST_CHANNEL_Z;
+   if (wm & 8) mask |= A0_DEST_CHANNEL_W;
+   return mask;
+}
+
+/* Find or claim a texcoord slot; returns I915_TEX_UNITS if none is free. */
+static uint32_t
+get_texcoord_mapping(struct i915_fragment_shader *fs,
+                     unsigned semantic, int index)
+{
+   for (int i = 0; i < I915_TEX_UNITS; i++) {
+      if (fs->texcoords[i].semantic == -1) {
+         fs->texcoords[i].semantic = semantic;
+         fs->texcoords[i].index = index;
+         return i;
+      }
+      if (fs->texcoords[i].semantic == (int)semantic &&
+          fs->texcoords[i].index == index)
+         return i;
+   }
+   return I915_TEX_UNITS;
+}
+
+/* Declare the T register for FS input `location`; program error if none. */
+static uint32_t
+emit_input(struct nir_to_i915 *c, unsigned location)
+{
+   struct i915_fp_compile *p = c->p;
+   unsigned sem_name, sem_index;
+
+   tgsi_get_gl_varying_semantic((gl_varying_slot)location, true,
+                                &sem_name, &sem_index);
+   switch (sem_name) {
+   case TGSI_SEMANTIC_GENERIC:
+   case TGSI_SEMANTIC_TEXCOORD:
+   case TGSI_SEMANTIC_PCOORD:
+   case TGSI_SEMANTIC_POSITION:
+   case TGSI_SEMANTIC_FACE: {
+      if (sem_name == TGSI_SEMANTIC_PCOORD)
+         c->ifs->reads_pntc = true;
+      /* FACE declares only the X channel; the others declare all four. */
+      uint32_t chans =
+         sem_name == TGSI_SEMANTIC_FACE ? D0_CHANNEL_X : D0_CHANNEL_ALL;
+      uint32_t tc = get_texcoord_mapping(c->ifs, sem_name, sem_index);
+      if (tc >= I915_TEX_UNITS) {
+         i915_program_error(p, "Out of texcoord slots for input %u", location);
+         return 0;
+      }
+      return i915_emit_decl(p, REG_TYPE_T, T_TEX0 + tc, chans);
+   }
+   case TGSI_SEMANTIC_COLOR:
+      if (sem_index == 0)
+         return i915_emit_decl(p, REG_TYPE_T, T_DIFFUSE, D0_CHANNEL_ALL);
+      return swizzle(i915_emit_decl(p, REG_TYPE_T, T_SPECULAR, D0_CHANNEL_XYZ),
+                     X, Y, Z, ONE);
+   case TGSI_SEMANTIC_FOG:
+      return swizzle(i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W),
+                     W, W, W, W);
+   default:
+      i915_program_error(p, "Bad input location %d (semantic %d)",
+                         location, sem_name);
+      return 0;
+   }
+}
+
+static void
+emit_load_const(struct nir_to_i915 *c, nir_load_const_instr *load)
+{
+   struct i915_fp_compile *p = c->p;
+
+   switch (load->def.num_components) {
+   case 1:
+      set_ureg(c, &load->def, i915_emit_const1f(p, load->value[0].f32));
+      break;
+   case 2:
+      set_ureg(c, &load->def,
+               i915_emit_const2f(p, load->value[0].f32,
+                                 load->value[1].f32));
+      break;
+   case 3:
+   case 4: {
+      float v[4] = {
+         load->value[0].f32,
+         load->def.num_components > 1 ? load->value[1].f32 : 0.0f,
+         load->def.num_components > 2 ? load->value[2].f32 : 0.0f,
+         load->def.num_components > 3 ? load->value[3].f32 : 0.0f,
+      };
+      set_ureg(c, &load->def, i915_emit_const4fv(p, v));
+      break;
+   }
+   default:
+      i915_program_error(p, "load_const with %d components",
+                         load->def.num_components);
+      break;
+   }
+}
+
+/* Translate one NIR ALU instruction into i915 arithmetic instructions. */
+static void
+emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
+{
+   struct i915_fp_compile *p = c->p;
+   nir_def *def = &alu->def;
+   uint32_t mask = def_mask(def);
+
+   const unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
+   uint32_t src0 = num_srcs >= 1 ? alu_src_ureg(c, &alu->src[0]) : 0;
+   uint32_t src1 = num_srcs >= 2 ? alu_src_ureg(c, &alu->src[1]) : 0;
+   uint32_t src2 = num_srcs >= 3 ? alu_src_ureg(c, &alu->src[2]) : 0;
+
+   /* Copies and negations just alias the (modified) source register, so
+    * resolve them before a temporary is allocated for the result. */
+   if (alu->op == nir_op_mov || alu->op == nir_op_fcanonicalize ||
+       alu->op == nir_op_fneg) {
+      set_ureg(c, def, alu->op == nir_op_fneg ? negate(src0, 1, 1, 1, 1)
+                                              : src0);
+      return;
+   }
+
+   uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p));
+   set_ureg(c, def, dest);
+
+   switch (alu->op) {
+   case nir_op_fabs:
+      i915_emit_arith(p, A0_MAX, dest, mask, 0,
+                      src0, negate(src0, 1, 1, 1, 1), 0);
+      break;
+   case nir_op_fsat:
+      i915_emit_arith(p, A0_MOV, dest, mask, A0_DEST_SATURATE, src0, 0, 0);
+      break;
+   case nir_op_fadd:
+      i915_emit_arith(p, A0_ADD, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_fmul:
+      i915_emit_arith(p, A0_MUL, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_ffma:
+      i915_emit_arith(p, A0_MAD, dest, mask, 0, src0, src1, src2);
+      break;
+   case nir_op_fmin:
+   case nir_op_imin:
+   case nir_op_umin:
+      i915_emit_arith(p, A0_MIN, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_fmax:
+   case nir_op_imax:
+   case nir_op_umax:
+      i915_emit_arith(p, A0_MAX, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_ffloor:
+      i915_emit_arith(p, A0_FLR, dest, mask, 0, src0, 0, 0);
+      break;
+   case nir_op_ffract:
+      i915_emit_arith(p, A0_FRC, dest, mask, 0, src0, 0, 0);
+      break;
+   case nir_op_ftrunc:
+      i915_emit_arith(p, A0_TRC, dest, mask, 0, src0, 0, 0);
+      break;
+   case nir_op_fceil: {
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_arith(p, A0_FLR, tmp, mask, 0,
+                      negate(src0, 1, 1, 1, 1), 0, 0);
+      i915_emit_arith(p, A0_MOV, dest, mask, 0,
+                      negate(tmp, 1, 1, 1, 1), 0, 0);
+      break;
+   }
+   case nir_op_frcp:
+      i915_emit_arith(p, A0_RCP, dest, mask, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+   case nir_op_frsq:
+      i915_emit_arith(p, A0_RSQ, dest, mask, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+   case nir_op_fsqrt: {
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_arith(p, A0_RSQ, tmp, A0_DEST_CHANNEL_X, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      i915_emit_arith(p, A0_MUL, dest, mask, 0,
+                      src0, swizzle(tmp, X, X, X, X), 0);
+      break;
+   }
+   case nir_op_fexp2:
+      i915_emit_arith(p, A0_EXP, dest, mask, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+   case nir_op_flog2:
+      i915_emit_arith(p, A0_LOG, dest, mask, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+   case nir_op_fdot2:
+   case nir_op_fdot2_replicated:
+      i915_emit_arith(p, A0_DP3, dest, mask, 0,
+                      swizzle(src0, X, Y, ZERO, ZERO), src1, 0);
+      break;
+   case nir_op_fdot3:
+   case nir_op_fdot3_replicated:
+      i915_emit_arith(p, A0_DP3, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_fdot4:
+   case nir_op_fdot4_replicated:
+      i915_emit_arith(p, A0_DP4, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_slt:
+      i915_emit_arith(p, A0_SLT, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_sge:
+      i915_emit_arith(p, A0_SGE, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_seq: {
+      /* seq(a,b) = sge(a,b) * sge(b,a) */
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0,
+                      src0, src1, 0);
+      i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0);
+      i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0);
+      break;
+   }
+   case nir_op_sne: {
+      /* sne(a,b) = slt(a,b) + slt(b,a) */
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0,
+                      src0, src1, 0);
+      i915_emit_arith(p, A0_SLT, dest, mask, 0, src1, src0, 0);
+      i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0);
+      break;
+   }
+   case nir_op_fpow: {
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_X, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, tmp, src1, 0);
+      i915_emit_arith(p, A0_EXP, dest, mask, 0,
+                      swizzle(tmp, X, X, X, X), 0, 0);
+      break;
+   }
+   case nir_op_bcsel:
+      i915_emit_arith(p, A0_CMP, dest, mask, 0,
+                      negate(src0, 1, 1, 1, 1), src2, src1);
+      break;
+   case nir_op_fcsel_ge:
+      i915_emit_arith(p, A0_CMP, dest, mask, 0, src0, src1, src2);
+      break;
+   case nir_op_fcsel_gt:
+      i915_emit_arith(p, A0_CMP, dest, mask, 0,
+                      negate(src0, 1, 1, 1, 1), src2, src1);
+      break;
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4: {
+      unsigned n = nir_op_infos[alu->op].num_inputs;
+      static const uint32_t chan_mask[] = {
+         A0_DEST_CHANNEL_X, A0_DEST_CHANNEL_Y,
+         A0_DEST_CHANNEL_Z, A0_DEST_CHANNEL_W,
+      };
+      for (unsigned i = 0; i < n; i++) {
+         uint32_t s = alu_src_ureg(c, &alu->src[i]);
+         i915_emit_arith(p, A0_MOV, dest, chan_mask[i] & mask, 0,
+                         swizzle(s, X, X, X, X), 0, 0);
+      }
+      break;
+   }
+   case nir_op_fsign: {
+      uint32_t tmp = i915_get_utemp(p);
+      const uint32_t zero = swizzle(UREG(REG_TYPE_R, 0),
+                                    ZERO, ZERO, ZERO, ZERO);
+      i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0,
+                      src0, zero, 0);
+      i915_emit_arith(p, A0_SLT, dest, mask, 0, zero, src0, 0);
+      i915_emit_arith(p, A0_ADD, dest, mask, 0,
+                      dest, negate(tmp, 1, 1, 1, 1), 0);
+      break;
+   }
+   default:
+      i915_program_error(p, "unsupported NIR ALU op: %s",
+                         nir_op_infos[alu->op].name);
+      break;
+   }
+
+   i915_release_utemps(p);
+}
+
+static uint32_t
+translate_tex_type(struct i915_fp_compile *p, enum glsl_sampler_dim dim)
+{
+   switch (dim) {
+   case GLSL_SAMPLER_DIM_1D:
+   case GLSL_SAMPLER_DIM_2D:
+   case GLSL_SAMPLER_DIM_RECT:
+   case GLSL_SAMPLER_DIM_EXTERNAL:
+      return D0_SAMPLE_TYPE_2D;
+   case GLSL_SAMPLER_DIM_3D:
+      return D0_SAMPLE_TYPE_VOLUME;
+   case GLSL_SAMPLER_DIM_CUBE:
+      return D0_SAMPLE_TYPE_CUBE;
+   default:
+      i915_program_error(p, "unsupported sampler dim %d", dim);
+      return D0_SAMPLE_TYPE_2D;
+   }
+}
+
+static uint32_t
+tex_coord_mask(nir_tex_instr *tex)
+{
+   uint32_t mask = TGSI_WRITEMASK_X;
+
+   switch (tex->sampler_dim) {
+   case GLSL_SAMPLER_DIM_1D:
+   case GLSL_SAMPLER_DIM_2D:
+   case GLSL_SAMPLER_DIM_RECT:
+   case GLSL_SAMPLER_DIM_EXTERNAL:
+      mask = TGSI_WRITEMASK_XY;
+      break;
+   case GLSL_SAMPLER_DIM_3D:
+   case GLSL_SAMPLER_DIM_CUBE:
+      mask = TGSI_WRITEMASK_XYZ;
+      break;
+   default:
+      break;
+   }
+
+   if (tex->is_shadow)
+      mask |= TGSI_WRITEMASK_Z;
+
+   if (tex->op == nir_texop_txb)
+      mask |= TGSI_WRITEMASK_W;
+
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      if (tex->src[i].src_type == nir_tex_src_projector) {
+         mask |= TGSI_WRITEMASK_W;
+         break;
+      }
+   }
+
+   return mask;
+}
+
+/* Emit a tex/txb sample (optionally projective or shadow) into a new temp. */
+static void
+emit_tex(struct nir_to_i915 *c, nir_tex_instr *tex)
+{
+   struct i915_fp_compile *p = c->p;
+   uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p));
+   set_ureg(c, &tex->def, dest);
+
+   uint32_t hw_tex = translate_tex_type(p, tex->sampler_dim);
+   uint32_t sampler = i915_emit_decl(p, REG_TYPE_S, tex->sampler_index, hw_tex);
+
+   uint32_t coord = 0, bias_or_proj = 0, shadow = 0;
+   bool has_bias = false, has_proj = false, has_shadow = false;
+
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      switch (tex->src[i].src_type) {
+      case nir_tex_src_coord:
+         coord = src_ureg(c, &tex->src[i].src);
+         break;
+      case nir_tex_src_bias:
+         bias_or_proj = src_ureg(c, &tex->src[i].src);
+         has_bias = true;
+         break;
+      case nir_tex_src_projector:
+         bias_or_proj = src_ureg(c, &tex->src[i].src);
+         has_proj = true;
+         break;
+      case nir_tex_src_comparator:
+         shadow = src_ureg(c, &tex->src[i].src);
+         has_shadow = true;
+         break;
+      default:
+         break;
+      }
+   }
+
+   /* 1D textures: set Y = X so LOD works correctly when sampled as 2D */
+   if (tex->sampler_dim == GLSL_SAMPLER_DIM_1D)
+      coord = swizzle(coord, X, X, Z, W);
+
+   /* pack bias/projector/shadow into a single coord register if needed */
+   const bool packed = has_bias || has_proj || has_shadow;
+   if (packed) {
+      uint32_t tmp = UREG(REG_TYPE_R, i915_get_temp(p));
+      i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0, coord, 0, 0);
+
+      if (has_shadow)
+         i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_Z, 0,
+                         swizzle(shadow, X, X, X, X), 0, 0);
+
+      if (has_bias || has_proj)
+         i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_W, 0,
+                         swizzle(bias_or_proj, X, X, X, X), 0, 0);
+
+      coord = tmp;
+   }
+
+   /* Only tex and txb are supported; a projector turns plain tex into TEXLDP. */
+   uint32_t opcode;
+   if (tex->op == nir_texop_txb) {
+      opcode = T0_TEXLDB;
+   } else if (tex->op == nir_texop_tex) {
+      opcode = has_proj ? T0_TEXLDP : T0_TEXLD;
+   } else {
+      i915_program_error(p, "unsupported tex op %d", tex->op);
+      return;
+   }
+
+   i915_emit_texld(p, dest, A0_DEST_CHANNEL_ALL, sampler, coord, opcode,
+                   tex_coord_mask(tex));
+
+   /* The packed coordinate temp is dead once the sample has been emitted. */
+   if (packed)
+      i915_release_temp(p, GET_UREG_NR(coord));
+   i915_release_utemps(p);
+}
+
+static void
+emit_intrinsic(struct nir_to_i915 *c, nir_intrinsic_instr *intr)
+{
+   struct i915_fp_compile *p = c->p;
+   struct i915_fragment_shader *ifs = c->ifs;
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_input: {
+      nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+      unsigned comp = nir_intrinsic_component(intr);
+      uint32_t reg = emit_input(c, sem.location);
+
+      if (comp > 0) {
+         reg = swizzle(reg, comp, MIN2(comp + 1, 3),
+                       MIN2(comp + 2, 3), MIN2(comp + 3, 3));
+      }
+
+      set_ureg(c, &intr->def, reg);
+      break;
+   }
+
+   case nir_intrinsic_store_output: {
+      nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+      unsigned comp = nir_intrinsic_component(intr);
+      uint32_t val = src_ureg(c, &intr->src[0]);
+      uint32_t wm = nir_intrinsic_write_mask(intr);
+      uint32_t dest;
+
+      if (sem.location == FRAG_RESULT_DEPTH) {
+         dest = UREG(REG_TYPE_OD, 0);
+      } else {
+         dest = UREG(REG_TYPE_OC, 0);
+      }
+
+      if (comp > 0) {
+         uint32_t s[4] = { X, Y, Z, W };
+         for (int i = 3; i >= (int)comp; i--)
+            s[i] = s[i - comp];
+         for (unsigned i = 0; i < comp; i++)
+            s[i] = ZERO;
+         val = swizzle(val, s[0], s[1], s[2], s[3]);
+         wm <<= comp;
+      }
+
+      i915_emit_arith(p, A0_MOV, dest, writemask_to_mask(wm), 0,
+                      val, 0, 0);
+      break;
+   }
+
+   case nir_intrinsic_load_ubo: {
+      nir_src *offset_src = &intr->src[1];
+      if (!nir_src_is_const(*offset_src)) {
+         i915_program_error(p, "non-constant UBO offset");
+         set_ureg(c, &intr->def,
+                  swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+         break;
+      }
+      unsigned byte_offset = (unsigned)nir_src_as_float(*offset_src);
+      unsigned slot = byte_offset / 16;
+      unsigned comp = (byte_offset % 16) / 4;
+
+      if (slot >= I915_MAX_CONSTANT) {
+         i915_program_error(p, "UBO offset %d exceeds max constants", slot);
+         set_ureg(c, &intr->def,
+                  swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+         break;
+      }
+
+      for (unsigned i = 0; i < intr->def.num_components; i++)
+         ifs->constant_flags[slot] |= I915_CONSTFLAG_USER;
+      ifs->num_constants = MAX2(ifs->num_constants, slot + 1);
+
+      uint32_t reg = UREG(REG_TYPE_CONST, slot);
+      if (comp > 0) {
+         uint32_t s[4];
+         for (unsigned i = 0; i < 4; i++)
+            s[i] = MIN2(comp + i, 3);
+         reg = swizzle(reg, s[0], s[1], s[2], s[3]);
+      }
+
+      set_ureg(c, &intr->def, reg);
+      break;
+   }
+
+   case nir_intrinsic_load_ubo_vec4: {
+      nir_src *offset_src = &intr->src[1];
+      if (!nir_src_is_const(*offset_src)) {
+         i915_program_error(p, "non-constant UBO offset");
+         set_ureg(c, &intr->def,
+                  swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+         break;
+      }
+      unsigned slot = nir_intrinsic_base(intr) +
+                      (unsigned)nir_src_as_float(*offset_src);
+      unsigned comp = nir_intrinsic_component(intr);
+
+      if (slot >= I915_MAX_CONSTANT) {
+         i915_program_error(p, "UBO slot %d exceeds max constants", slot);
+         set_ureg(c, &intr->def,
+                  swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+         break;
+      }
+
+      for (unsigned i = 0; i < intr->def.num_components; i++)
+         ifs->constant_flags[slot] |= I915_CONSTFLAG_USER;
+      ifs->num_constants = MAX2(ifs->num_constants, slot + 1);
+
+      uint32_t reg = UREG(REG_TYPE_CONST, slot);
+      if (comp > 0) {
+         uint32_t s[4];
+         for (unsigned i = 0; i < 4; i++)
+            s[i] = MIN2(comp + i, 3);
+         reg = swizzle(reg, s[0], s[1], s[2], s[3]);
+      }
+
+      set_ureg(c, &intr->def, reg);
+      break;
+   }
+
+   case nir_intrinsic_terminate:
+   case nir_intrinsic_demote: {
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_texld(p, tmp, A0_DEST_CHANNEL_ALL, 0,
+                      negate(swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE),
+                             1, 1, 1, 1),
+                      T0_TEXKILL, TGSI_WRITEMASK_X);
+      i915_release_utemps(p);
+      break;
+   }
+
+   case nir_intrinsic_terminate_if:
+   case nir_intrinsic_demote_if: {
+      uint32_t cond = src_ureg(c, &intr->src[0]);
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_texld(p, tmp, A0_DEST_CHANNEL_ALL, 0,
+                      negate(swizzle(cond, X, X, X, X), 1, 1, 1, 1),
+                      T0_TEXKILL, TGSI_WRITEMASK_XYZW);
+      i915_release_utemps(p);
+      break;
+   }
+
+   case nir_intrinsic_ddx:
+   case nir_intrinsic_ddy:
+   case nir_intrinsic_ddx_coarse:
+   case nir_intrinsic_ddy_coarse:
+   case nir_intrinsic_ddx_fine:
+   case nir_intrinsic_ddy_fine:
+      set_ureg(c, &intr->def,
+               swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+      break;
+
+   default:
+      i915_program_error(p, "unsupported intrinsic: %s",
+                         nir_intrinsic_infos[intr->intrinsic].name);
+      break;
+   }
+}
+
+static void
+emit_instr(struct nir_to_i915 *c, nir_instr *instr)
+{
+   switch (instr->type) {
+   case nir_instr_type_load_const:
+      emit_load_const(c, nir_instr_as_load_const(instr));
+      break;
+   case nir_instr_type_alu:
+      emit_alu(c, nir_instr_as_alu(instr));
+      break;
+   case nir_instr_type_tex:
+      emit_tex(c, nir_instr_as_tex(instr));
+      break;
+   case nir_instr_type_intrinsic:
+      emit_intrinsic(c, nir_instr_as_intrinsic(instr));
+      break;
+   case nir_instr_type_undef: {
+      nir_undef_instr *undef = nir_instr_as_undef(instr);
+      set_ureg(c, &undef->def,
+               swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+      break;
+   }
+   case nir_instr_type_jump:
+   case nir_instr_type_deref:
+      break;
+   default:
+      i915_program_error(c->p, "unsupported NIR instruction type %d",
+                         instr->type);
+      break;
+   }
+}
+
+static void
+fixup_depth_write(struct nir_to_i915 *c, nir_shader *s)
+{
+   if (!(s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)))
+      return;
+
+   /* NIR writes depth to OD.X (component 0); hardware reads from OD.W */
+   i915_emit_arith(c->p, A0_MOV,
+                   UREG(REG_TYPE_OD, 0), A0_DEST_CHANNEL_W, 0,
+                   swizzle(UREG(REG_TYPE_OD, 0), X, Y, Z, X),
+                   0, 0);
+}
+
+void
+i915_translate_fragment_program_nir(struct i915_context *i915,
+                                    struct i915_fragment_shader *ifs,
+                                    nir_shader *s,
+                                    const struct corm_compile_opts *opts)
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(s);
+   bool debug = I915_DBG_ON(DBG_FS) &&
+                (!ifs->internal || NIR_DEBUG(PRINT_INTERNAL));
+
+   if (debug) {
+      mesa_logi("NIR fragment shader:");
+      nir_log_shaderi(s);
+   }
+
+   struct i915_fp_compile *p = CALLOC_STRUCT(i915_fp_compile);
+   p->shader = ifs;
+   p->error = ralloc_strdup(NULL, "");
+   p->log_program_errors = !ifs->internal;
+
+   ifs->num_constants = 0;
+   memset(ifs->constant_flags, 0, sizeof(ifs->constant_flags));
+   memset(p->register_phases, 0, sizeof(p->register_phases));
+
+   for (int i = 0; i < I915_TEX_UNITS; i++)
+      ifs->texcoords[i].semantic = -1;
+
+   p->nr_tex_indirect = 1;
+   p->nr_tex_insn = 0;
+   p->nr_alu_insn = 0;
+   p->nr_decl_insn = 0;
+   p->csr = p->program;
+   p->decl = p->declarations;
+   p->decl_s = 0;
+   p->decl_t = 0;
+   p->temp_flag = ~0x0U << I915_MAX_TEMPORARY;
+   p->utemp_flag = ~0x7;
+
+   *(p->decl++) = _3DSTATE_PIXEL_SHADER_PROGRAM;
+
+   struct nir_to_i915 c = {
+      .p = p,
+      .ifs = ifs,
+      .opts = *opts,
+      .ureg_map_size = impl->ssa_alloc,
+      .ureg_map = CALLOC(impl->ssa_alloc, sizeof(uint32_t)),
+   };
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr(instr, block) {
+         emit_instr(&c, instr);
+         if (p->error[0])
+            break;
+      }
+      if (p->error[0])
+         break;
+   }
+
+   if (!p->error[0])
+      fixup_depth_write(&c, s);
+
+   /* finalize */
+   if (p->nr_tex_indirect > I915_MAX_TEX_INDIRECT)
+      i915_program_error(p, "exceeded max tex indirect (%d/%d)",
+                         p->nr_tex_indirect, I915_MAX_TEX_INDIRECT);
+   if (p->nr_tex_insn > I915_MAX_TEX_INSN)
+      i915_program_error(p, "exceeded max tex insn (%d/%d)",
+                         p->nr_tex_insn, I915_MAX_TEX_INSN);
+   if (p->nr_alu_insn > I915_MAX_ALU_INSN)
+      i915_program_error(p, "exceeded max ALU insn (%d/%d)",
+                         p->nr_alu_insn, I915_MAX_ALU_INSN);
+   if (p->nr_decl_insn > I915_MAX_DECL_INSN)
+      i915_program_error(p, "exceeded max decl insn (%d/%d)",
+                         p->nr_decl_insn, I915_MAX_DECL_INSN);
+
+   if (p->nr_alu_insn == 0 && p->nr_tex_insn == 0) {
+      i915_use_passthrough_shader(ifs);
+      ifs->nr_alu_insn = 1;
+      goto cleanup;
+   }
+
+   ifs->nr_alu_insn = p->nr_alu_insn;
+   ifs->nr_tex_insn = p->nr_tex_insn;
+   ifs->nr_tex_indirect = p->nr_tex_indirect;
+   ifs->nr_temps = util_bitcount(p->temp_flag);
+
+   {
+      unsigned long program_size = (unsigned long)(p->csr - p->program);
+      unsigned long decl_size = (unsigned long)(p->decl - p->declarations);
+
+      p->declarations[0] |= program_size + decl_size - 2;
+
+      assert(!ifs->program);
+      ifs->program_len = decl_size + program_size;
+      ifs->program = (uint32_t *)MALLOC(ifs->program_len * sizeof(uint32_t));
+      memcpy(ifs->program, p->declarations, decl_size * sizeof(uint32_t));
+      memcpy(&ifs->program[decl_size], p->program,
+             program_size * sizeof(uint32_t));
+
+      if (p->error[0]) {
+         /* dump the program for debugging, then replace with passthrough */
+         if (debug && ifs->program_len > 2) {
+            mesa_logi("FAILED program (%d ALU):", p->nr_alu_insn);
+            i915_disassemble_program(ifs->program, ifs->program_len);
+         }
+         FREE(ifs->program);
+         ifs->program = NULL;
+         ifs->program_len = 0;
+         i915_use_passthrough_shader(ifs);
+      }
+   }
+
+cleanup:
+   if (p->error[0])
+      ifs->error = p->error;
+   else
+      ralloc_free(p->error);
+
+   FREE(c.ureg_map);
+   FREE(p);
+
+   if (debug) {
+      if (ifs->error)
+         mesa_loge("%s", ifs->error);
+
+      mesa_logi("i915 fragment shader with %d constants%s",
+                ifs->num_constants, ifs->num_constants ? ":" : "");
+
+      for (int i = 0; i < I915_MAX_CONSTANT; i++) {
+         if (ifs->constant_flags[i] & 0x0f) {
+            mesa_logi("\t\tC[%d] = { %f, %f, %f, %f }", i,
+                      ifs->constants[i][0], ifs->constants[i][1],
+                      ifs->constants[i][2], ifs->constants[i][3]);
+         }
+      }
+      i915_disassemble_program(ifs->program, ifs->program_len);
+   }
+}
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 17db0d34034..df43fb05149 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -176,6 +176,8 @@ i915_optimize_nir(struct nir_shader *s)
 {
    bool progress;
 
+   NIR_PASS(_, s, nir_lower_int_to_float);
+
    do {
       progress = false;
 
@@ -212,6 +214,11 @@ i915_optimize_nir(struct nir_shader *s)
 
    } while (progress);
 
+   NIR_PASS(_, s, nir_lower_alu_to_scalar, NULL, NULL);
+   NIR_PASS(_, s, nir_lower_bool_to_float, false);
+   NIR_PASS(_, s, nir_opt_algebraic);
+   NIR_PASS(_, s, nir_opt_dce);
+
    NIR_PASS(progress, s, nir_remove_dead_variables, nir_var_function_temp,
             NULL);
 
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index af45247355d..12da6b72266 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -31,7 +31,9 @@
 #include "compiler/nir/nir_builder.h"
 #include "draw/draw_context.h"
 #include "nir/nir_to_tgsi.h"
+#include "tgsi/tgsi_from_mesa.h"
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
 #include "util/u_helpers.h"
 #include "util/u_inlines.h"
 #include "util/u_math.h"
@@ -542,6 +544,23 @@ static const struct nir_to_tgsi_options ntt_options = {
    .lower_fabs = true,
 };
 
+static int
+type_size(const struct glsl_type *type, bool bindless)
+{
+   return glsl_count_attribute_slots(type, false);
+}
+
+static bool
+scalarize_vector_bools(const nir_instr *instr, const void *data)
+{
+   if (instr->type != nir_instr_type_alu)
+      return false;
+   nir_alu_instr *alu = nir_instr_as_alu(instr);
+   return alu->op == nir_op_bcsel ||
+          alu->op == nir_op_fcsel_ge ||
+          alu->op == nir_op_fcsel_gt;
+}
+
 static char *
 i915_check_control_flow(nir_shader *s)
 {
@@ -565,6 +584,94 @@ i915_check_control_flow(nir_shader *s)
    return NULL;
 }
 
+enum i915_fs_mode {
+   I915_FS_TGSI,
+   I915_FS_NIR,
+   I915_FS_BOTH,
+};
+
+static enum i915_fs_mode
+i915_get_fs_mode(void)
+{
+   const char *env = debug_get_option("I915_FS", "both");
+   if (!strcmp(env, "tgsi"))
+      return I915_FS_TGSI;
+   if (!strcmp(env, "nir"))
+      return I915_FS_NIR;
+   return I915_FS_BOTH;
+}
+
+static void
+i915_populate_fs_metadata(struct i915_fragment_shader *ifs, nir_shader *s)
+{
+   ifs->num_inputs = 0;
+   ifs->writes_z = s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH);
+
+   nir_foreach_shader_in_variable(var, s) {
+      unsigned sem_name, sem_index;
+      tgsi_get_gl_varying_semantic((gl_varying_slot)var->data.location, true,
+                                   &sem_name, &sem_index);
+      unsigned idx = ifs->num_inputs++;
+      ifs->input_semantic_name[idx] = sem_name;
+      ifs->input_semantic_index[idx] = sem_index;
+   }
+}
+
+static void
+i915_compile_tgsi(struct i915_context *i915,
+                  struct i915_fragment_shader *ifs,
+                  struct pipe_screen *screen,
+                  nir_shader *nir_clone)
+{
+   ifs->state.tokens = nir_to_tgsi_options(nir_clone, screen, &ntt_options);
+   ifs->state.type = PIPE_SHADER_IR_TGSI;
+   tgsi_scan_shader(ifs->state.tokens, &ifs->info);
+   i915_translate_fragment_program(i915, ifs);
+}
+
+static bool
+corm_fs_better(const struct i915_fragment_shader *a,
+               const struct i915_fragment_shader *b)
+{
+   if (a->nr_tex_indirect != b->nr_tex_indirect)
+      return a->nr_tex_indirect < b->nr_tex_indirect;
+   if (a->nr_alu_insn != b->nr_alu_insn)
+      return a->nr_alu_insn < b->nr_alu_insn;
+   if (a->nr_temps != b->nr_temps)
+      return a->nr_temps < b->nr_temps;
+   return a->num_constants < b->num_constants;
+}
+
+static const char *
+corm_win_reason(const struct i915_fragment_shader *winner,
+                const struct i915_fragment_shader *loser,
+                char *buf, size_t len)
+{
+   if (!loser) {
+      snprintf(buf, len, "only");
+      return buf;
+   }
+   int da = (int)winner->nr_alu_insn - (int)loser->nr_alu_insn;
+   int dp = (int)winner->nr_tex_indirect - (int)loser->nr_tex_indirect;
+   int dt = (int)winner->nr_temps - (int)loser->nr_temps;
+   if (dp != 0)
+      snprintf(buf, len, "%+d phase", dp);
+   else if (da != 0)
+      snprintf(buf, len, "%+d alu", da);
+   else if (dt != 0)
+      snprintf(buf, len, "%+d temps", dt);
+   else if ((int)winner->num_constants != (int)loser->num_constants)
+      snprintf(buf, len, "%+d const",
+               (int)winner->num_constants - (int)loser->num_constants);
+   else if (winner->program_len == loser->program_len &&
+            !memcmp(winner->program, loser->program,
+                    winner->program_len * sizeof(uint32_t)))
+      snprintf(buf, len, "identical");
+   else
+      snprintf(buf, len, "tied");
+   return buf;
+}
+
 static void *
 i915_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
@@ -576,39 +683,206 @@ i915_create_fs_state(struct pipe_context *pipe,
 
    ifs->draw_data = draw_create_fragment_shader(i915->draw, templ);
 
-   if (templ->type == PIPE_SHADER_IR_NIR) {
-      nir_shader *s = templ->ir.nir;
-      ifs->internal = s->info.internal;
-
-      char *msg = i915_check_control_flow(s);
-      if (msg) {
-         if (I915_DBG_ON(DBG_FS) &&
-             (!s->info.internal || NIR_DEBUG(PRINT_INTERNAL))) {
-            mesa_logi("failing shader:");
-            nir_log_shaderi(s);
-         }
-         if (templ->report_compile_error) {
-            ((struct pipe_shader_state *)templ)->error_message = strdup(msg);
-            ralloc_free(s);
-            i915_delete_fs_state(NULL, ifs);
-            return NULL;
-         }
-      }
-
-      ifs->state.tokens = nir_to_tgsi_options(s, pipe->screen, &ntt_options);
-   } else {
-      assert(templ->type == PIPE_SHADER_IR_TGSI);
-      /* we need to keep a local copy of the tokens */
+   if (templ->type == PIPE_SHADER_IR_TGSI) {
       ifs->state.tokens = tgsi_dup_tokens(templ->tokens);
+      ifs->state.type = PIPE_SHADER_IR_TGSI;
       ifs->internal = i915->no_log_program_errors;
+      tgsi_scan_shader(ifs->state.tokens, &ifs->info);
+      i915_translate_fragment_program(i915, ifs);
+      return ifs;
    }
 
-   ifs->state.type = PIPE_SHADER_IR_TGSI;
+   assert(templ->type == PIPE_SHADER_IR_NIR);
+   nir_shader *s = templ->ir.nir;
+   ifs->internal = s->info.internal;
 
-   tgsi_scan_shader(ifs->state.tokens, &ifs->info);
+   bool debug = I915_DBG_ON(DBG_FS) &&
+                (!s->info.internal || NIR_DEBUG(PRINT_INTERNAL));
+
+   char *msg = i915_check_control_flow(s);
+   if (msg) {
+      if (debug) {
+         mesa_logi("failing shader:");
+         nir_log_shaderi(s);
+      }
+      if (templ->report_compile_error) {
+         ((struct pipe_shader_state *)templ)->error_message = strdup(msg);
+         ralloc_free(s);
+         i915_delete_fs_state(NULL, ifs);
+         return NULL;
+      }
+   }
+
+   static enum i915_fs_mode fs_mode = -1;
+   if (fs_mode == (enum i915_fs_mode)-1)
+      fs_mode = i915_get_fs_mode();
+
+   bool try_nir = (fs_mode == I915_FS_NIR || fs_mode == I915_FS_BOTH);
+   bool try_tgsi = (fs_mode == I915_FS_TGSI || fs_mode == I915_FS_BOTH);
+
+   struct i915_fragment_shader tgsi_fs = {0};
+
+   static const struct corm_compile_opts corm_variants[] = {
+      { .deferred_const = false, .seq_sne_opt = false },
+      { .deferred_const = false, .seq_sne_opt = true },
+      { .deferred_const = true,  .seq_sne_opt = false },
+      { .deferred_const = true,  .seq_sne_opt = true },
+   };
+
+   struct i915_fragment_shader nir_results[ARRAY_SIZE(corm_variants)];
+   int best_nir = -1;
+
+   if (try_nir) {
+      nir_shader *nir_s = try_tgsi ? nir_shader_clone(NULL, s) : s;
+      NIR_PASS(_, nir_s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+               type_size, (nir_lower_io_options)0);
+      NIR_PASS(_, nir_s, nir_lower_alu_to_scalar, scalarize_vector_bools, NULL);
+      NIR_PASS(_, nir_s, nir_opt_vectorize, NULL, NULL);
+      NIR_PASS(_, nir_s, nir_lower_bool_to_float, false);
+      NIR_PASS(_, nir_s, nir_opt_algebraic);
+      NIR_PASS(_, nir_s, nir_opt_algebraic_late);
+      NIR_PASS(_, nir_s, nir_opt_dce);
+      nir_index_ssa_defs(nir_shader_get_entrypoint(nir_s));
+
+      for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) {
+         nir_shader *variant_nir = (v == ARRAY_SIZE(corm_variants) - 1)
+            ? nir_s : nir_shader_clone(NULL, nir_s);
+         memset(&nir_results[v], 0, sizeof(nir_results[v]));
+         i915_populate_fs_metadata(&nir_results[v], variant_nir);
+         i915_translate_fragment_program_nir(i915, &nir_results[v],
+                                            variant_nir, &corm_variants[v]);
+         if (v < ARRAY_SIZE(corm_variants) - 1)
+            ralloc_free(variant_nir);
+
+         bool ok = !nir_results[v].error || !nir_results[v].error[0];
+         if (ok && (best_nir < 0 ||
+                    corm_fs_better(&nir_results[v], &nir_results[best_nir])))
+            best_nir = v;
+      }
+
+      if (try_tgsi)
+         ralloc_free(nir_s);
+   }
+
+   if (try_tgsi) {
+      i915_compile_tgsi(i915, &tgsi_fs, pipe->screen, s);
+   } else {
+      ralloc_free(s);
+   }
+
+   bool nir_ok = best_nir >= 0;
+   bool tgsi_ok = try_tgsi && (!tgsi_fs.error || !tgsi_fs.error[0]);
+   struct i915_fragment_shader *best_nir_fs = nir_ok ? &nir_results[best_nir] : NULL;
+
+   bool use_nir;
+   if (nir_ok && tgsi_ok)
+      use_nir = !corm_fs_better(&tgsi_fs, best_nir_fs);
+   else
+      use_nir = nir_ok;
+
+   if (debug && try_nir && try_tgsi) {
+      for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) {
+         bool ok = !nir_results[v].error || !nir_results[v].error[0];
+         mesa_logi("  NIR[dc=%d,ss=%d]: %s (%d ALU, %d phase, %d temps)%s",
+                   corm_variants[v].deferred_const,
+                   corm_variants[v].seq_sne_opt,
+                   ok ? "ok" : "FAIL",
+                   ok ? nir_results[v].nr_alu_insn : 0,
+                   ok ? nir_results[v].nr_tex_indirect : 0,
+                   ok ? nir_results[v].nr_temps : 0,
+                   (int)v == best_nir ? " *" : "");
+      }
+      mesa_logi("  TGSI: %s (%d ALU, %d phase, %d temps)",
+                tgsi_ok ? "ok" : "FAIL",
+                tgsi_ok ? tgsi_fs.nr_alu_insn : 0,
+                tgsi_ok ? tgsi_fs.nr_tex_indirect : 0,
+                tgsi_ok ? tgsi_fs.nr_temps : 0);
+      mesa_logi("  -> %s%s", use_nir ? "NIR" : "TGSI",
+                use_nir ? (corm_fs_better(best_nir_fs, &tgsi_fs)
+                           ? " (better)" : " (tied)") : "");
+   }
+
+   /* Free non-winning NIR variants */
+   if (try_nir) {
+      for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) {
+         if ((int)v != best_nir) {
+            FREE(nir_results[v].program);
+            ralloc_free(nir_results[v].error);
+         }
+      }
+   }
+
+   struct i915_fragment_shader *winner, *loser = NULL;
+   struct i915_fragment_shader nir_loser_copy = {0};
+   if (use_nir) {
+      winner = best_nir_fs;
+      loser = tgsi_ok ? &tgsi_fs : NULL;
+   } else {
+      winner = &tgsi_fs;
+      if (best_nir_fs) {
+         nir_loser_copy = *best_nir_fs;
+         nir_loser_copy.program = NULL;
+         loser = &nir_loser_copy;
+         FREE(best_nir_fs->program);
+         ralloc_free(best_nir_fs->error);
+      }
+   }
+
+   if (i915 && !ifs->internal) {
+      bool neither = (winner->nr_alu_insn + winner->nr_tex_insn) == 0;
+      char reason[32];
+      if (neither)
+         snprintf(reason, sizeof(reason), "neither");
+      else
+         corm_win_reason(winner, loser, reason, sizeof(reason));
+      util_debug_message(
+         &i915->debug, SHADER_INFO,
+         "%s shader [%s, %s]: %d instructions, %d alu, %d tex, "
+         "%d tex_indirect, %d temps, %d const",
+         _mesa_shader_stage_to_abbrev(MESA_SHADER_FRAGMENT),
+         neither ? "FAIL" : use_nir ? "NIR" : "TGSI", reason,
+         winner->nr_alu_insn + winner->nr_tex_insn,
+         winner->nr_alu_insn, winner->nr_tex_insn, winner->nr_tex_indirect,
+         winner->nr_temps, winner->num_constants);
+   }
+
+   ifs->program = winner->program;
+   ifs->program_len = winner->program_len;
+   ifs->nr_alu_insn = winner->nr_alu_insn;
+   ifs->nr_tex_insn = winner->nr_tex_insn;
+   ifs->nr_tex_indirect = winner->nr_tex_indirect;
+   ifs->nr_temps = winner->nr_temps;
+   ifs->num_constants = winner->num_constants;
+   memcpy(ifs->constants, winner->constants, sizeof(ifs->constants));
+   memcpy(ifs->constant_flags, winner->constant_flags,
+          sizeof(ifs->constant_flags));
+   memcpy(ifs->texcoords, winner->texcoords, sizeof(ifs->texcoords));
+   ifs->reads_pntc = winner->reads_pntc;
+   ifs->writes_z = winner->writes_z;
+   ifs->num_inputs = winner->num_inputs;
+   memcpy(ifs->input_semantic_name, winner->input_semantic_name,
+          sizeof(ifs->input_semantic_name));
+   memcpy(ifs->input_semantic_index, winner->input_semantic_index,
+          sizeof(ifs->input_semantic_index));
+   if (winner->error)
+      ifs->error = winner->error;
+
+   /* The loser's info may be in use (TGSI path populates ifs->info) */
+   if (try_tgsi)
+      ifs->info = tgsi_fs.info;
+
+   if (loser) {
+      FREE(loser->program);
+      ralloc_free(loser->error);
+   }
+   if (!use_nir && try_tgsi) {
+      /* TGSI won — tokens are in tgsi_fs via i915_compile_tgsi.
+       * We need them for ifs->state for draw's FS pipeline. */
+      ifs->state = tgsi_fs.state;
+   } else if (try_tgsi) {
+      FREE((void *)tgsi_fs.state.tokens);
+   }
 
-   /* The shader's compiled to i915 instructions here */
-   i915_translate_fragment_program(i915, ifs);
    if (ifs->error && templ->report_compile_error) {
       ((struct pipe_shader_state *)templ)->error_message = strdup(ifs->error);
       i915_delete_fs_state(NULL, ifs);
diff --git a/src/gallium/drivers/i915/meson.build b/src/gallium/drivers/i915/meson.build
index 80dc825fbc5..ef1d5f7ad34 100644
--- a/src/gallium/drivers/i915/meson.build
+++ b/src/gallium/drivers/i915/meson.build
@@ -16,6 +16,7 @@ files_i915 = files(
   'i915_flush.c',
   'i915_fpc_emit.c',
   'i915_fpc.h',
+  'i915_fpc_nir.c',
   'i915_fpc_optimize.c',
   'i915_fpc_translate.c',
   'i915_prim_emit.c',

From c6be264c2da75c655fb2536245e2ec5075868172 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 22:59:20 -0400
Subject: [PATCH 07/20] i915/corm: add copy propagation before algebraic late
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

nir_lower_io introduces identity vec instructions that block
nir_opt_algebraic_late's fadd+fmul→ffma fusion pattern. Running
nir_opt_copy_prop, nir_opt_cse and nir_opt_dce before nir_opt_algebraic
cleans these up, enabling ffma fusion and eliminating redundant vec
construction.

Also lower fsqrt(x) to x * frsq(x) ahead of those passes.

shader-db (I915_FS=nir): 48/403 compiled, 62 alu
shader-db (I915_FS=both): nir won 48 (26 identical, 16 tied, 6 better),
  236 TGSI, 119 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_state.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 12da6b72266..10a185db957 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -561,6 +561,20 @@ scalarize_vector_bools(const nir_instr *instr, const void *data)
           alu->op == nir_op_fcsel_gt;
 }
 
+static bool
+lower_fsqrt_filter(const nir_instr *instr, UNUSED const void *data)
+{
+   return instr->type == nir_instr_type_alu &&
+          nir_instr_as_alu(instr)->op == nir_op_fsqrt;
+}
+
+static nir_def *
+lower_fsqrt_impl(nir_builder *b, nir_instr *instr, UNUSED void *data)
+{
+   nir_def *src = nir_instr_as_alu(instr)->src[0].src.ssa;
+   return nir_fmul(b, src, nir_frsq(b, src));
+}
+
 static char *
 i915_check_control_flow(nir_shader *s)
 {
@@ -739,6 +753,11 @@ i915_create_fs_state(struct pipe_context *pipe,
       NIR_PASS(_, nir_s, nir_lower_alu_to_scalar, scalarize_vector_bools, NULL);
       NIR_PASS(_, nir_s, nir_opt_vectorize, NULL, NULL);
       NIR_PASS(_, nir_s, nir_lower_bool_to_float, false);
+      NIR_PASS(_, nir_s, nir_shader_lower_instructions, lower_fsqrt_filter,
+               lower_fsqrt_impl, NULL);
+      NIR_PASS(_, nir_s, nir_opt_copy_prop);
+      NIR_PASS(_, nir_s, nir_opt_cse);
+      NIR_PASS(_, nir_s, nir_opt_dce);
       NIR_PASS(_, nir_s, nir_opt_algebraic);
       NIR_PASS(_, nir_s, nir_opt_algebraic_late);
       NIR_PASS(_, nir_s, nir_opt_dce);

From 9a88dff9f4db038645e0200f9d5c1859835d971d Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 12:55:06 -0400
Subject: [PATCH 08/20] i915/corm: add temporary register tracking and dead
 temp release

Track the last use of each SSA def and release temporary registers
as soon as they're dead, allowing more aggressive temp reuse.

Includes the register aliasing fix for mov/fneg: these ops alias
the def to the source register, so the source's lifetime must be
extended to match the def's to prevent premature release.

shader-db (I915_FS=nir): 52/403 compiled, 231 alu
shader-db (I915_FS=both): nir won 52 (26 identical, 16 tied, 9 better, 1 only),
  233 TGSI, 118 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 54 +++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 346e06d0a34..4df750d4734 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -22,8 +22,51 @@ struct nir_to_i915 {
 
    uint32_t *ureg_map;
    unsigned ureg_map_size;
+
+   int *last_use;
+   int ip;
 };
 
+static bool
+mark_last_use_cb(nir_src *src, void *state)
+{
+   struct nir_to_i915 *c = state;
+   if (src->ssa->index < c->ureg_map_size)
+      c->last_use[src->ssa->index] = c->ip;
+   return true;
+}
+
+static void
+compute_last_use(struct nir_to_i915 *c, nir_function_impl *impl)
+{
+   c->ip = 0;
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr(instr, block) {
+         nir_foreach_src(instr, mark_last_use_cb, c);
+         c->ip++;
+      }
+   }
+}
+
+static bool
+release_if_last_use_cb(nir_src *src, void *state)
+{
+   struct nir_to_i915 *c = state;
+   unsigned idx = src->ssa->index;
+   if (idx < c->ureg_map_size && c->last_use[idx] == c->ip) {
+      uint32_t ureg = c->ureg_map[idx];
+      if (GET_UREG_TYPE(ureg) == REG_TYPE_R)
+         i915_release_temp(c->p, GET_UREG_NR(ureg));
+   }
+   return true;
+}
+
+static void
+release_dead_temps(struct nir_to_i915 *c, nir_instr *instr)
+{
+   nir_foreach_src(instr, release_if_last_use_cb, c);
+}
+
 static void
 set_ureg(struct nir_to_i915 *c, nir_def *def, uint32_t ureg)
 {
@@ -185,6 +228,9 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
       i915_release_temp(p, GET_UREG_NR(dest));
       set_ureg(c, def, alu->op == nir_op_fneg ? negate(src0, 1, 1, 1, 1)
                                          : src0);
+      unsigned src_idx = alu->src[0].src.ssa->index;
+      if (c->last_use[src_idx] == c->ip)
+         c->last_use[src_idx] = c->last_use[def->index];
       return;
    }
    case nir_op_fabs:
@@ -727,13 +773,20 @@ i915_translate_fragment_program_nir(struct i915_context *i915,
       .opts = *opts,
       .ureg_map_size = impl->ssa_alloc,
       .ureg_map = CALLOC(impl->ssa_alloc, sizeof(uint32_t)),
+      .last_use = CALLOC(impl->ssa_alloc, sizeof(int)),
    };
 
+   memset(c.last_use, -1, impl->ssa_alloc * sizeof(int));
+   compute_last_use(&c, impl);
+
+   c.ip = 0;
    nir_foreach_block(block, impl) {
       nir_foreach_instr(instr, block) {
          emit_instr(&c, instr);
          if (p->error[0])
             break;
+         release_dead_temps(&c, instr);
+         c.ip++;
       }
       if (p->error[0])
          break;
@@ -799,6 +852,7 @@ cleanup:
    else
       ralloc_free(p->error);
 
+   FREE(c.last_use);
    FREE(c.ureg_map);
    FREE(p);
 

From ed934ae17b28a353aaba8a826f777acd8eac0e51 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 12:57:28 -0400
Subject: [PATCH 09/20] i915/corm: add vec construction optimizations

Optimize vec2/3/4 construction with several strategies:

- same_reg: when all components come from the same register, collapse
  to a single swizzle+negate alias (zero instructions)
- const-swizzle piggybacking: ZERO/ONE sources are folded into the
  MOV emitted for a real-register source, together with any other
  components read from that same register
- per-channel negate: preserve per-channel negate bits through the
  swizzle path instead of emitting separate negation

shader-db (I915_FS=nir): 130/403 compiled, 1614 alu
shader-db (I915_FS=both): nir won 130 (26 identical, 16 tied, 86 better, 2 only),
  156 TGSI, 117 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 88 ++++++++++++++++++++++++-
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 4df750d4734..e4834c94c4b 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -361,14 +361,96 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
    case nir_op_vec3:
    case nir_op_vec4: {
       unsigned n = nir_op_infos[alu->op].num_inputs;
+      uint32_t srcs[4] = { 0 };
+      for (unsigned i = 0; i < n; i++)
+         srcs[i] = alu_src_ureg(c, &alu->src[i]);
+
+      bool same_reg = true;
+      for (unsigned i = 1; i < n; i++) {
+         if ((srcs[i] & UREG_TYPE_NR_MASK) != (srcs[0] & UREG_TYPE_NR_MASK)) {
+            same_reg = false;
+            break;
+         }
+      }
+
+      if (same_reg) {
+         uint32_t base = UREG(GET_UREG_TYPE(srcs[0]), GET_UREG_NR(srcs[0]));
+         uint32_t ch[4] = { X, Y, Z, W };
+         int ng[4] = { 0, 0, 0, 0 };
+         for (unsigned i = 0; i < n; i++) {
+            ch[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7;
+            ng[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1;
+         }
+         i915_release_temp(p, GET_UREG_NR(dest));
+         set_ureg(c, def, negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]),
+                                 ng[0], ng[1], ng[2], ng[3]));
+         return;
+      }
+
       static const uint32_t chan_mask[] = {
          A0_DEST_CHANNEL_X, A0_DEST_CHANNEL_Y,
          A0_DEST_CHANNEL_Z, A0_DEST_CHANNEL_W,
       };
+      bool emitted[4] = { false };
+      uint32_t ch_sel[4];
+      int neg_sel[4] = { 0, 0, 0, 0 };
       for (unsigned i = 0; i < n; i++) {
-         uint32_t s = alu_src_ureg(c, &alu->src[i]);
-         i915_emit_arith(p, A0_MOV, dest, chan_mask[i] & mask, 0,
-                         swizzle(s, X, X, X, X), 0, 0);
+         ch_sel[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7;
+         neg_sel[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1;
+      }
+
+      /* Process real-register sources first, folding in any ZERO/ONE
+       * const-swizzle sources that can piggyback on the same MOV.
+       * Use the unswizzled base register since swizzle() composes.
+       */
+      for (unsigned i = 0; i < n; i++) {
+         if (emitted[i] || ch_sel[i] >= SRC_ZERO)
+            continue;
+         uint32_t base = UREG(GET_UREG_TYPE(srcs[i]), GET_UREG_NR(srcs[i]));
+         uint32_t group_mask = chan_mask[i];
+         uint32_t ch[4] = { X, Y, Z, W };
+         int ng[4] = { 0, 0, 0, 0 };
+         ch[i] = ch_sel[i];
+         ng[i] = neg_sel[i];
+         for (unsigned j = i + 1; j < n; j++) {
+            if (!emitted[j] &&
+                (ch_sel[j] >= SRC_ZERO ||
+                 (srcs[j] & UREG_TYPE_NR_MASK) ==
+                 (srcs[i] & UREG_TYPE_NR_MASK))) {
+               group_mask |= chan_mask[j];
+               ch[j] = ch_sel[j];
+               ng[j] = neg_sel[j];
+               emitted[j] = true;
+            }
+         }
+         i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0,
+                         negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]),
+                                ng[0], ng[1], ng[2], ng[3]),
+                         0, 0);
+         emitted[i] = true;
+      }
+      /* Any remaining const-swizzle-only sources */
+      for (unsigned i = 0; i < n; i++) {
+         if (emitted[i])
+            continue;
+         uint32_t group_mask = chan_mask[i];
+         uint32_t ch[4] = { X, Y, Z, W };
+         int ng[4] = { 0, 0, 0, 0 };
+         ch[i] = ch_sel[i];
+         ng[i] = neg_sel[i];
+         for (unsigned j = i + 1; j < n; j++) {
+            if (!emitted[j]) {
+               group_mask |= chan_mask[j];
+               ch[j] = ch_sel[j];
+               ng[j] = neg_sel[j];
+               emitted[j] = true;
+            }
+         }
+         i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0,
+                         negate(swizzle(srcs[i], ch[0], ch[1], ch[2], ch[3]),
+                                ng[0], ng[1], ng[2], ng[3]),
+                         0, 0);
+         emitted[i] = true;
       }
       break;
    }

From 28400d7c6c8da10192ae422cfd418ab759af23d8 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 12:56:40 -0400
Subject: [PATCH 10/20] i915/corm: add fsat folding, output dest folding, and
 vec dest folding

Add the def_csr mechanism: track the instruction cursor position for
each single-instruction SSA def so we can retroactively patch it.

fsat folding: when a single-use SSA def feeds into fsat, set
A0_DEST_SATURATE on the instruction that produced that def instead
of emitting a separate MOV.

Output dest folding: when store_output consumes a single-use temp,
patch the instruction that produced it to write directly to the
output register (OC/OD). Includes vec look-through for the
identity-swizzle case where a vec was collapsed to a register alias.

Vec dest folding: single-use scalar ALU results feeding a vec
component get patched to write directly into the vec dest register.

shader-db (I915_FS=nir): 209/403 compiled, 3157 alu
shader-db (I915_FS=both): nir won 209 (26 identical, 16 tied, 164 better, 3 only),
  78 TGSI, 116 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 85 ++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index e4834c94c4b..41482cbd9d1 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -21,6 +21,7 @@ struct nir_to_i915 {
    struct i915_fragment_shader *ifs;
 
    uint32_t *ureg_map;
+   uint32_t **def_csr;
    unsigned ureg_map_size;
 
    int *last_use;
@@ -221,6 +222,8 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
    if (nir_op_infos[alu->op].num_inputs >= 3)
       src2 = alu_src_ureg(c, &alu->src[2]);
 
+   uint32_t *pre_csr = p->csr;
+
    switch (alu->op) {
    case nir_op_mov:
    case nir_op_fcanonicalize:
@@ -237,9 +240,22 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
       i915_emit_arith(p, A0_MAX, dest, mask, 0,
                       src0, negate(src0, 1, 1, 1, 1), 0);
       break;
-   case nir_op_fsat:
+   case nir_op_fsat: {
+      nir_def *src_def = alu->src[0].src.ssa;
+      uint32_t *prev = c->def_csr[src_def->index];
+      if (prev && list_is_singular(&src_def->uses)) {
+         prev[0] |= A0_DEST_SATURATE;
+         i915_release_temp(p, GET_UREG_NR(dest));
+         set_ureg(c, def, src_ureg(c, &alu->src[0].src));
+         c->def_csr[def->index] = prev;
+         unsigned src_idx = alu->src[0].src.ssa->index;
+         if (c->last_use[src_idx] == c->ip)
+            c->last_use[src_idx] = c->last_use[def->index];
+         return;
+      }
       i915_emit_arith(p, A0_MOV, dest, mask, A0_DEST_SATURATE, src0, 0, 0);
       break;
+   }
    case nir_op_fadd:
       i915_emit_arith(p, A0_ADD, dest, mask, 0, src0, src1, 0);
       break;
@@ -399,6 +415,29 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
          neg_sel[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1;
       }
 
+      /* Single-component ALU dest folding: if a vec source is a single-use
+       * scalar ALU result in a temp, patch that instruction to write directly
+       * into our dest with the right channel mask.
+       */
+      for (unsigned i = 0; i < n; i++) {
+         nir_def *src_def = alu->src[i].src.ssa;
+         uint32_t *prev_csr = c->def_csr[src_def->index];
+         if (!prev_csr || !list_is_singular(&src_def->uses))
+            continue;
+         if (GET_UREG_TYPE(srcs[i]) != REG_TYPE_R)
+            continue;
+         if (src_def->num_components != 1)
+            continue;
+
+         prev_csr[0] = (prev_csr[0] & ~(A0_DEST_CHANNEL_ALL |
+                               (0x1ff << A0_DEST_NR_SHIFT))) |
+                   A0_DEST(dest) | chan_mask[i];
+
+         i915_release_temp(p, GET_UREG_NR(srcs[i]));
+         c->ureg_map[src_def->index] = dest;
+         emitted[i] = true;
+      }
+
       /* Process real-register sources first, folding in any ZERO/ONE
        * const-swizzle sources that can piggyback on the same MOV.
        * Use the unswizzled base register since swizzle() composes.
@@ -471,6 +510,9 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
       break;
    }
 
+   if (p->csr == pre_csr + 3)
+      c->def_csr[def->index] = pre_csr;
+
    i915_release_utemps(p);
 }
 
@@ -640,6 +682,45 @@ emit_intrinsic(struct nir_to_i915 *c, nir_intrinsic_instr *intr)
          dest = UREG(REG_TYPE_OC, 0);
       }
 
+      nir_def *src_def = intr->src[0].ssa;
+      uint32_t *prev = c->def_csr[src_def->index];
+
+      /* Look through identity vec (same_reg case emits no instructions).
+       * Check that all uses of the underlying def come from this vec.
+       */
+      bool looked_through_vec = false;
+      if (!prev) {
+         nir_instr *def_instr = nir_def_instr_nonconst(src_def);
+         if (def_instr->type == nir_instr_type_alu) {
+            nir_alu_instr *vec = nir_instr_as_alu(def_instr);
+            if ((vec->op == nir_op_vec4 || vec->op == nir_op_vec3 ||
+                 vec->op == nir_op_vec2) &&
+                list_is_singular(&src_def->uses)) {
+               nir_def *inner = vec->src[0].src.ssa;
+               bool all_from_vec = true;
+               nir_foreach_use(use, inner) {
+                  if (nir_src_use_instr(use) != def_instr) {
+                     all_from_vec = false;
+                     break;
+                  }
+               }
+               if (all_from_vec) {
+                  src_def = inner;
+                  prev = c->def_csr[src_def->index];
+                  looked_through_vec = true;
+               }
+            }
+         }
+      }
+
+      if (prev && comp == 0 &&
+          (looked_through_vec || list_is_singular(&src_def->uses))) {
+         prev[0] = (prev[0] & ~(A0_DEST_CHANNEL_ALL |
+                                (0x1ff << A0_DEST_NR_SHIFT))) |
+                   A0_DEST(dest) | writemask_to_mask(wm);
+         break;
+      }
+
       if (comp > 0) {
          uint32_t s[4] = { X, Y, Z, W };
          for (int i = 3; i >= (int)comp; i--)
@@ -855,6 +936,7 @@ i915_translate_fragment_program_nir(struct i915_context *i915,
       .opts = *opts,
       .ureg_map_size = impl->ssa_alloc,
       .ureg_map = CALLOC(impl->ssa_alloc, sizeof(uint32_t)),
+      .def_csr = CALLOC(impl->ssa_alloc, sizeof(uint32_t *)),
       .last_use = CALLOC(impl->ssa_alloc, sizeof(int)),
    };
 
@@ -935,6 +1017,7 @@ cleanup:
       ralloc_free(p->error);
 
    FREE(c.last_use);
+   FREE(c.def_csr);
    FREE(c.ureg_map);
    FREE(p);
 

From 75ef9f6d658c9019bf9af869f95a4456e3296dc3 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 12:59:01 -0400
Subject: [PATCH 11/20] i915/corm: deferred constant allocation with
 per-channel UBO mixing

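When opts.deferred_const is set, defer allocation of scalar load_const
values (other than 0.0, 1.0 and -1.0, which are still emitted
immediately) until the consuming ALU instruction. coalesce_constants
resolves deferred constants with a preferred register hint so
co-occurring constants pack into the same CONST register, avoiding
dual-constant MOV penalties. Any other consumer resolves a deferred
constant through src_ureg without a register preference.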

Also fix per-channel UBO constant flags: mark only the actually
loaded channels with I915_CONSTFLAG_USER_CH(comp+i) instead of
setting all user bits, leaving free channels for immediates.

shader-db (I915_FS=nir): 210/403 compiled, 3202 alu
shader-db (I915_FS=both): nir won 210 (26 identical, 16 tied, 165 better, 3 only),
  77 TGSI, 116 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 68 +++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 41482cbd9d1..96e28a23fe4 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -22,6 +22,7 @@ struct nir_to_i915 {
 
    uint32_t *ureg_map;
    uint32_t **def_csr;
+   float *deferred_const;
    unsigned ureg_map_size;
 
    int *last_use;
@@ -75,10 +76,28 @@ set_ureg(struct nir_to_i915 *c, nir_def *def, uint32_t ureg)
    c->ureg_map[def->index] = ureg;
 }
 
+static bool
+is_deferred(struct nir_to_i915 *c, unsigned ssa_index)
+{
+   return c->ureg_map[ssa_index] == UREG_BAD;
+}
+
+static uint32_t
+resolve_const(struct nir_to_i915 *c, unsigned ssa_index, int preferred_reg)
+{
+   uint32_t ureg = i915_emit_const1f_prefer(c->p,
+                                            c->deferred_const[ssa_index],
+                                            preferred_reg);
+   c->ureg_map[ssa_index] = ureg;
+   return ureg;
+}
+
 static uint32_t
 src_ureg(struct nir_to_i915 *c, nir_src *src)
 {
    assert(src->ssa->index < c->ureg_map_size);
+   if (c->ureg_map[src->ssa->index] == UREG_BAD)
+      resolve_const(c, src->ssa->index, -1);
    return c->ureg_map[src->ssa->index];
 }
 
@@ -179,9 +198,17 @@ emit_load_const(struct nir_to_i915 *c, nir_load_const_instr *load)
    struct i915_fp_compile *p = c->p;
 
    switch (load->def.num_components) {
-   case 1:
-      set_ureg(c, &load->def, i915_emit_const1f(p, load->value[0].f32));
+   case 1: {
+      float val = load->value[0].f32;
+      if (c->opts.deferred_const &&
+          val != 0.0f && val != 1.0f && val != -1.0f) {
+         c->deferred_const[load->def.index] = val;
+         set_ureg(c, &load->def, UREG_BAD);
+      } else {
+         set_ureg(c, &load->def, i915_emit_const1f(p, val));
+      }
       break;
+   }
    case 2:
       set_ureg(c, &load->def,
                i915_emit_const2f(p, load->value[0].f32,
@@ -205,6 +232,35 @@ emit_load_const(struct nir_to_i915 *c, nir_load_const_instr *load)
    }
 }
 
+static void
+coalesce_constants(struct nir_to_i915 *c, nir_alu_instr *alu)
+{
+   unsigned n = nir_op_infos[alu->op].num_inputs;
+   unsigned deferred[3];
+   unsigned nr_deferred = 0;
+   int preferred = -1;
+
+   for (unsigned i = 0; i < n; i++) {
+      unsigned idx = alu->src[i].src.ssa->index;
+      if (is_deferred(c, idx)) {
+         deferred[nr_deferred++] = idx;
+      } else {
+         uint32_t ureg = c->ureg_map[idx];
+         if (GET_UREG_TYPE(ureg) == REG_TYPE_CONST && preferred < 0)
+            preferred = GET_UREG_NR(ureg);
+      }
+   }
+
+   if (nr_deferred == 0)
+      return;
+
+   for (unsigned i = 0; i < nr_deferred; i++) {
+      uint32_t ureg = resolve_const(c, deferred[i], preferred);
+      if (preferred < 0 && GET_UREG_TYPE(ureg) == REG_TYPE_CONST)
+         preferred = GET_UREG_NR(ureg);
+   }
+}
+
 static void
 emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
 {
@@ -214,6 +270,8 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
    uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p));
    set_ureg(c, def, dest);
 
+   coalesce_constants(c, alu);
+
    uint32_t src0 = 0, src1 = 0, src2 = 0;
    if (nir_op_infos[alu->op].num_inputs >= 1)
       src0 = alu_src_ureg(c, &alu->src[0]);
@@ -756,7 +814,7 @@ emit_intrinsic(struct nir_to_i915 *c, nir_intrinsic_instr *intr)
       }
 
       for (unsigned i = 0; i < intr->def.num_components; i++)
-         ifs->constant_flags[slot] |= I915_CONSTFLAG_USER;
+         ifs->constant_flags[slot] |= I915_CONSTFLAG_USER_CH(comp + i);
       ifs->num_constants = MAX2(ifs->num_constants, slot + 1);
 
       uint32_t reg = UREG(REG_TYPE_CONST, slot);
@@ -791,7 +849,7 @@ emit_intrinsic(struct nir_to_i915 *c, nir_intrinsic_instr *intr)
       }
 
       for (unsigned i = 0; i < intr->def.num_components; i++)
-         ifs->constant_flags[slot] |= I915_CONSTFLAG_USER;
+         ifs->constant_flags[slot] |= I915_CONSTFLAG_USER_CH(comp + i);
       ifs->num_constants = MAX2(ifs->num_constants, slot + 1);
 
       uint32_t reg = UREG(REG_TYPE_CONST, slot);
@@ -937,6 +995,7 @@ i915_translate_fragment_program_nir(struct i915_context *i915,
       .ureg_map_size = impl->ssa_alloc,
       .ureg_map = CALLOC(impl->ssa_alloc, sizeof(uint32_t)),
       .def_csr = CALLOC(impl->ssa_alloc, sizeof(uint32_t *)),
+      .deferred_const = CALLOC(impl->ssa_alloc, sizeof(float)),
       .last_use = CALLOC(impl->ssa_alloc, sizeof(int)),
    };
 
@@ -1017,6 +1076,7 @@ cleanup:
       ralloc_free(p->error);
 
    FREE(c.last_use);
+   FREE(c.deferred_const);
    FREE(c.def_csr);
    FREE(c.ureg_map);
    FREE(p);

From 4885eb02ab9e581b144942fd4a03b5ae24cac9d7 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 12:59:42 -0400
Subject: [PATCH 12/20] i915/corm: optimize seq/sne against zero to 2
 instructions

When opts.seq_sne_opt is set and one operand is zero, use the
abs+compare pattern: x == 0 becomes -abs(x) >= 0, and x != 0
becomes -abs(x) < 0. This reduces from 3 ALU instructions to 2.

This is a variant dimension because it can increase register
pressure in some shaders; the multi-variant framework picks the
winner per-shader.

shader-db (I915_FS=nir): 212/403 compiled, 3228 alu
shader-db (I915_FS=both): nir won 212 (26 identical, 16 tied, 167 better, 3 only),
  75 TGSI, 116 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 54 +++++++++++++++++++------
 1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 96e28a23fe4..99a61d385c3 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -394,21 +394,51 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
       i915_emit_arith(p, A0_SGE, dest, mask, 0, src0, src1, 0);
       break;
    case nir_op_seq: {
-      /* seq(a,b) = sge(a,b) * sge(b,a) */
-      uint32_t tmp = i915_get_utemp(p);
-      i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0,
-                      src0, src1, 0);
-      i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0);
-      i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0);
+      const uint32_t zero =
+         swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
+      if (c->opts.seq_sne_opt &&
+          ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) ||
+           (src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) {
+         if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))
+            src0 = src1;
+         /* x == 0  <->  -abs(x) >= 0: 2 insns instead of 3 */
+         uint32_t tmp = i915_get_utemp(p);
+         i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0,
+                         src0, negate(src0, 1, 1, 1, 1), 0);
+         i915_emit_arith(p, A0_SGE, dest, mask, 0,
+                         negate(tmp, 1, 1, 1, 1), zero, 0);
+      } else {
+         /* seq(a,b) = sge(a,b) * sge(b,a) */
+         uint32_t tmp = i915_get_utemp(p);
+         i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0,
+                         src0, src1, 0);
+         i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0);
+         i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0);
+      }
       break;
    }
    case nir_op_sne: {
-      /* sne(a,b) = slt(a,b) + slt(b,a) */
-      uint32_t tmp = i915_get_utemp(p);
-      i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0,
-                      src0, src1, 0);
-      i915_emit_arith(p, A0_SLT, dest, mask, 0, src1, src0, 0);
-      i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0);
+      const uint32_t zero =
+         swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
+      if (c->opts.seq_sne_opt &&
+          ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) ||
+           (src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) {
+         if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))
+            src0 = src1;
+         /* x != 0  <->  -abs(x) < 0: 2 insns instead of 3 */
+         uint32_t tmp = i915_get_utemp(p);
+         i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0,
+                         src0, negate(src0, 1, 1, 1, 1), 0);
+         i915_emit_arith(p, A0_SLT, dest, mask, 0,
+                         negate(tmp, 1, 1, 1, 1), zero, 0);
+      } else {
+         /* sne(a,b) = slt(a,b) + slt(b,a) */
+         uint32_t tmp = i915_get_utemp(p);
+         i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0,
+                         src0, src1, 0);
+         i915_emit_arith(p, A0_SLT, dest, mask, 0, src1, src0, 0);
+         i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0);
+      }
       break;
    }
    case nir_op_fpow: {

From 17b699ae24f63436f0a16456bb798dfcb8eba708 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Wed, 6 May 2026 20:07:44 -0400
Subject: [PATCH 13/20] i915/corm: use hardware swizzle constants for vec3/4
 load_const

When a vec3 or vec4 load_const has all components from {0, 1, -1},
emit a swizzle+negate alias using the hardware ZERO/ONE source
constants instead of allocating a constant register via
i915_emit_const4fv. This matches what the TGSI path does through
its immediate recognition.

Saves a constant register slot per qualifying load_const. Per the
shader-db numbers below, 15 of the 16 previous ties now count as
"better" (tied 16 -> 1, better 167 -> 182); identical stays at 26.

shader-db (I915_FS=nir): 212/403 compiled, 3227 alu
shader-db (I915_FS=both): nir won 212 (26 identical, 1 tied, 182 better, 3 only),
  75 TGSI, 116 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 34 ++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 99a61d385c3..27f57ece08b 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -216,13 +216,39 @@ emit_load_const(struct nir_to_i915 *c, nir_load_const_instr *load)
       break;
    case 3:
    case 4: {
+      unsigned n = load->def.num_components;
       float v[4] = {
          load->value[0].f32,
-         load->def.num_components > 1 ? load->value[1].f32 : 0.0f,
-         load->def.num_components > 2 ? load->value[2].f32 : 0.0f,
-         load->def.num_components > 3 ? load->value[3].f32 : 0.0f,
+         n > 1 ? load->value[1].f32 : 0.0f,
+         n > 2 ? load->value[2].f32 : 0.0f,
+         n > 3 ? load->value[3].f32 : 0.0f,
       };
-      set_ureg(c, &load->def, i915_emit_const4fv(p, v));
+
+      uint32_t ch[4] = { X, Y, Z, W };
+      int ng[4] = { 0, 0, 0, 0 };
+      bool all_swizzle = true;
+      for (unsigned i = 0; i < n; i++) {
+         if (v[i] == 0.0f)
+            ch[i] = ZERO;
+         else if (v[i] == 1.0f)
+            ch[i] = ONE;
+         else if (v[i] == -1.0f) {
+            ch[i] = ONE;
+            ng[i] = 1;
+         } else {
+            all_swizzle = false;
+            break;
+         }
+      }
+
+      if (all_swizzle) {
+         set_ureg(c, &load->def,
+                  negate(swizzle(UREG(REG_TYPE_R, 0),
+                                 ch[0], ch[1], ch[2], ch[3]),
+                         ng[0], ng[1], ng[2], ng[3]));
+      } else {
+         set_ureg(c, &load->def, i915_emit_const4fv(p, v));
+      }
       break;
    }
    default:

From 879cf1bd746a417d9d150067ff920d64083de98f Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 7 May 2026 09:39:31 -0400
Subject: [PATCH 14/20] i915/corm: direct output writes for vec construction

When a vec2/3/4 construction has a single consumer that is a
store_output, emit the partial-writemask MOVs directly to the
output register (oC/oD) instead of building in a temp and copying.
Skip this for the same_reg case which already collapses to a
zero-instruction swizzle alias.

Also fix TGSI-win reporting: preserve loser stats before freeing
so corm_win_reason shows the actual delta instead of "only".

shader-db (I915_FS=nir): 214/403 compiled, 3231 alu
shader-db (I915_FS=both): nir won 214 (26 identical, 1 tied, 184 better, 3 only),
  73 TGSI, 116 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 26 +++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 27f57ece08b..8daa84e4a9e 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -517,6 +517,27 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
          return;
       }
 
+      /* If this vec's only consumer is a store_output, write directly
+       * to the output register instead of going through a temp.
+       */
+      if (list_is_singular(&def->uses)) {
+         nir_src *use = list_first_entry(&def->uses, nir_src, use_link);
+         nir_instr *use_instr = nir_src_use_instr(use);
+         if (use_instr->type == nir_instr_type_intrinsic) {
+            nir_intrinsic_instr *store =
+               nir_instr_as_intrinsic(use_instr);
+            if (store->intrinsic == nir_intrinsic_store_output &&
+                nir_intrinsic_component(store) == 0) {
+               nir_io_semantics sem = nir_intrinsic_io_semantics(store);
+               uint32_t out = sem.location == FRAG_RESULT_DEPTH
+                  ? UREG(REG_TYPE_OD, 0) : UREG(REG_TYPE_OC, 0);
+               i915_release_temp(p, GET_UREG_NR(dest));
+               dest = out;
+               set_ureg(c, def, dest);
+            }
+         }
+      }
+
       static const uint32_t chan_mask[] = {
          A0_DEST_CHANNEL_X, A0_DEST_CHANNEL_Y,
          A0_DEST_CHANNEL_Z, A0_DEST_CHANNEL_W,
@@ -796,6 +817,11 @@ emit_intrinsic(struct nir_to_i915 *c, nir_intrinsic_instr *intr)
          dest = UREG(REG_TYPE_OC, 0);
       }
 
+      /* Vec direct-output already wrote to oC/oD */
+      uint32_t val_type = GET_UREG_TYPE(val);
+      if (val_type == REG_TYPE_OC || val_type == REG_TYPE_OD)
+         break;
+
       nir_def *src_def = intr->src[0].ssa;
       uint32_t *prev = c->def_csr[src_def->index];
 

From 595b9850e012ef121343d37c9078626327ae93d2 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 7 May 2026 09:46:52 -0400
Subject: [PATCH 15/20] i915/corm: multi-component ALU dest folding in vec
 construction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generalize the scalar ALU dest fold to handle multi-component results.
When a vec source covers contiguous channels with identity swizzle and
all uses of the source come from this vec, patch the ALU instruction
to write directly into the vec's dest register with the appropriate
channel mask.

This eliminates redundant MOVs for patterns like
  vec4(%result.x, %result.y, %result.z, %other)
where %result is a vec3 ALU output — the ALU instruction now writes
directly to the output register's .xyz channels.

shader-db (I915_FS=nir): 233/403 compiled, 3328 alu
shader-db (I915_FS=both): nir won 233 (26 identical, 1 tied, 203 better, 3 only),
  54 TGSI, 116 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 35 ++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 8daa84e4a9e..2e395c16777 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -550,27 +550,48 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
          neg_sel[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1;
       }
 
-      /* Single-component ALU dest folding: if a vec source is a single-use
-       * scalar ALU result in a temp, patch that instruction to write directly
-       * into our dest with the right channel mask.
+      /* ALU dest folding: if a vec source is an ALU result in a temp, used
+       * only by this vec with identity swizzle, patch that instruction to
+       * write directly into our dest with the right channel mask.
        */
       for (unsigned i = 0; i < n; i++) {
          nir_def *src_def = alu->src[i].src.ssa;
          uint32_t *prev_csr = c->def_csr[src_def->index];
-         if (!prev_csr || !list_is_singular(&src_def->uses))
+         if (!prev_csr)
             continue;
          if (GET_UREG_TYPE(srcs[i]) != REG_TYPE_R)
             continue;
-         if (src_def->num_components != 1)
+         unsigned nc = src_def->num_components;
+         if (i + nc > n)
             continue;
+         bool identity = true;
+         for (unsigned j = 0; j < nc && identity; j++)
+            identity = (j == 0 || alu->src[i + j].src.ssa == src_def) &&
+                       (alu->src[i + j].swizzle[0] == j);
+         if (!identity)
+            continue;
+         bool all_from_this_vec = true;
+         nir_foreach_use(use, src_def) {
+            if (nir_src_use_instr(use) != &alu->instr) {
+               all_from_this_vec = false;
+               break;
+            }
+         }
+         if (!all_from_this_vec)
+            continue;
+
+         uint32_t fold_mask = 0;
+         for (unsigned j = 0; j < nc; j++)
+            fold_mask |= chan_mask[i + j];
 
          prev_csr[0] = (prev_csr[0] & ~(A0_DEST_CHANNEL_ALL |
                                (0x1ff << A0_DEST_NR_SHIFT))) |
-                   A0_DEST(dest) | chan_mask[i];
+                   A0_DEST(dest) | fold_mask;
 
          i915_release_temp(p, GET_UREG_NR(srcs[i]));
          c->ureg_map[src_def->index] = dest;
-         emitted[i] = true;
+         for (unsigned j = 0; j < nc; j++)
+            emitted[i + j] = true;
       }
 
       /* Process real-register sources first, folding in any ZERO/ONE

From 800375c3c4bc09bf0719f5b39927d4f34eb8f01a Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 7 May 2026 09:51:19 -0400
Subject: [PATCH 16/20] i915/corm: fuse binary ALU ops through vec construction

When a vec's only consumer is a binary ALU op (MUL, ADD, MIN, MAX)
and the other source is a single register, emit the ALU op directly
per register group with partial writemasks instead of building the
vec with MOVs and then applying the ALU op.

For example, fmul(vec4(a.zw, b.xy), tex) becomes:
  MUL oC.xy, a.zw, tex
  MUL oC.zw, b.xy, tex
instead of:
  MOV R.xy, a.zw
  MOV R.zw, b.xy
  MUL oC, R, tex

shader-db (I915_FS=nir): 248/403 compiled, 3544 alu
shader-db (I915_FS=both): nir won 248 (26 identical, 1 tied, 218 better, 3 only),
  39 TGSI, 116 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 88 +++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 2e395c16777..15e935cd67b 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -292,6 +292,10 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
 {
    struct i915_fp_compile *p = c->p;
    nir_def *def = &alu->def;
+
+   if (def->index < c->ureg_map_size && c->ureg_map[def->index] != 0)
+      return;
+
    uint32_t mask = def_mask(def);
    uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p));
    set_ureg(c, def, dest);
@@ -594,6 +598,90 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
             emitted[i + j] = true;
       }
 
+      /* ALU consumer fusion: if this vec feeds a single binary ALU op
+       * and the other ALU source is a single register, emit the ALU op
+       * per-group with partial writemasks instead of MOV+ALU.
+       */
+      if (list_is_singular(&def->uses)) {
+         nir_src *use = list_first_entry(&def->uses, nir_src, use_link);
+         nir_instr *use_instr = nir_src_use_instr(use);
+         if (use_instr->type == nir_instr_type_alu) {
+            nir_alu_instr *consumer = nir_instr_as_alu(use_instr);
+            unsigned nargs = nir_op_infos[consumer->op].num_inputs;
+            int vec_arg = -1;
+            for (unsigned a = 0; a < nargs; a++) {
+               if (consumer->src[a].src.ssa == def) {
+                  vec_arg = a;
+                  break;
+               }
+            }
+            uint32_t hw_op = 0;
+            bool can_fuse = (vec_arg >= 0 && nargs == 2);
+            if (can_fuse) {
+               switch (consumer->op) {
+               case nir_op_fmul: hw_op = A0_MUL; break;
+               case nir_op_fadd: hw_op = A0_ADD; break;
+               case nir_op_fmin: case nir_op_imin: case nir_op_umin:
+                  hw_op = A0_MIN; break;
+               case nir_op_fmax: case nir_op_imax: case nir_op_umax:
+                  hw_op = A0_MAX; break;
+               default: can_fuse = false; break;
+               }
+            }
+            /* check the non-vec source is a single register */
+            if (can_fuse) {
+               int other_arg = 1 - vec_arg;
+               nir_def *other_def = consumer->src[other_arg].src.ssa;
+               if (other_def->index < c->ureg_map_size &&
+                   c->ureg_map[other_def->index] != UREG_BAD) {
+                  uint32_t other = alu_src_ureg(c, &consumer->src[other_arg]);
+                  nir_def *cdef = &consumer->def;
+                  uint32_t cdest = dest;
+                  uint32_t cmask = def_mask(cdef);
+
+                  for (unsigned i = 0; i < n; i++) {
+                     if (emitted[i])
+                        continue;
+                     uint32_t base = UREG(GET_UREG_TYPE(srcs[i]),
+                                          GET_UREG_NR(srcs[i]));
+                     uint32_t group_mask = chan_mask[i];
+                     uint32_t ch[4] = { X, Y, Z, W };
+                     int ng[4] = { 0, 0, 0, 0 };
+                     ch[i] = ch_sel[i];
+                     ng[i] = neg_sel[i];
+                     for (unsigned j = i + 1; j < n; j++) {
+                        if (!emitted[j] &&
+                            (ch_sel[j] >= SRC_ZERO ||
+                             (srcs[j] & UREG_TYPE_NR_MASK) ==
+                             (srcs[i] & UREG_TYPE_NR_MASK))) {
+                           group_mask |= chan_mask[j];
+                           ch[j] = ch_sel[j];
+                           ng[j] = neg_sel[j];
+                           emitted[j] = true;
+                        }
+                     }
+                     uint32_t fused_src = negate(
+                        swizzle(base, ch[0], ch[1], ch[2], ch[3]),
+                        ng[0], ng[1], ng[2], ng[3]);
+                     if (vec_arg == 0)
+                        i915_emit_arith(p, hw_op, cdest,
+                                        group_mask & cmask, 0,
+                                        fused_src, other, 0);
+                     else
+                        i915_emit_arith(p, hw_op, cdest,
+                                        group_mask & cmask, 0,
+                                        other, fused_src, 0);
+                     emitted[i] = true;
+                  }
+
+                  set_ureg(c, cdef, cdest);
+                  c->def_csr[cdef->index] = p->csr - 3;
+                  break;
+               }
+            }
+         }
+      }
+
       /* Process real-register sources first, folding in any ZERO/ONE
        * const-swizzle sources that can piggyback on the same MOV.
        * Use the unswizzled base register since swizzle() composes.

From 2a2ef36852c6f8b68cfaac31c3ffd2cb48b3387f Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 7 May 2026 11:25:26 -0400
Subject: [PATCH 17/20] i915/corm: use utemp for vec texcoord to avoid phase
 boundaries

When a vec construction feeds a single-use tex instruction, use a
utemp (unpreserved temp) instead of an R-file temp for the vec dest.
R-file temps written by ALU trigger tex indirect phase boundaries
when read by subsequent texld instructions; utemps do not.

Preserve the utemp allocation across i915_release_utemps so the
value survives until the texld consumer reads it.

shader-db (I915_FS=nir): 249/403 compiled, 3495 alu
shader-db (I915_FS=both): nir won 249 (26 identical, 1 tied, 217 better, 5 only),
  40 TGSI, 114 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 15e935cd67b..99f1b664142 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -523,6 +523,7 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
 
       /* If this vec's only consumer is a store_output, write directly
        * to the output register instead of going through a temp.
+       * If it's a tex instruction, use a utemp to avoid phase boundaries.
        */
       if (list_is_singular(&def->uses)) {
          nir_src *use = list_first_entry(&def->uses, nir_src, use_link);
@@ -539,6 +540,11 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
                dest = out;
                set_ureg(c, def, dest);
             }
+         } else if (use_instr->type == nir_instr_type_tex) {
+            i915_release_temp(p, GET_UREG_NR(dest));
+            uint32_t utemp = i915_get_utemp(p);
+            dest = utemp;
+            set_ureg(c, def, dest);
          }
       }
 
@@ -757,7 +763,11 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
    if (p->csr == pre_csr + 3)
       c->def_csr[def->index] = pre_csr;
 
+   uint32_t save = 0;
+   if (GET_UREG_TYPE(dest) == REG_TYPE_U)
+      save = p->utemp_flag & (1 << GET_UREG_NR(dest));
    i915_release_utemps(p);
+   p->utemp_flag |= save;
 }
 
 static uint32_t

From bfbba3f3b4e049af5d3fbbccf8fda9342b028279 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 7 May 2026 12:24:44 -0400
Subject: [PATCH 18/20] i915/corm: extend ALU consumer fusion to ffma and
 3-input ops

Generalize the binary ALU consumer fusion to handle ffma (MAD) and,
more generally, ops with up to three inputs. When a vec's only
consumer is an ALU op where the vec occupies one source slot and all
other sources are single registers, emit the ALU op per register
group with partial writemasks.

shader-db (I915_FS=nir): 252/403 compiled, 3618 alu
shader-db (I915_FS=both): nir won 252 (26 identical, 1 tied, 219 better, 6 only),
  38 TGSI, 113 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc_nir.c | 41 +++++++++++++++----------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
index 99f1b664142..d1835800413 100644
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -622,11 +622,12 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
                }
             }
             uint32_t hw_op = 0;
-            bool can_fuse = (vec_arg >= 0 && nargs == 2);
+            bool can_fuse = (vec_arg >= 0);
             if (can_fuse) {
                switch (consumer->op) {
                case nir_op_fmul: hw_op = A0_MUL; break;
                case nir_op_fadd: hw_op = A0_ADD; break;
+               case nir_op_ffma: hw_op = A0_MAD; break;
                case nir_op_fmin: case nir_op_imin: case nir_op_umin:
                   hw_op = A0_MIN; break;
                case nir_op_fmax: case nir_op_imax: case nir_op_umax:
@@ -634,13 +635,22 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
                default: can_fuse = false; break;
                }
             }
-            /* check the non-vec source is a single register */
+            /* check the non-vec sources are single registers */
+            uint32_t other_srcs[3] = { 0, 0, 0 };
+            if (can_fuse) {
+               for (unsigned a = 0; a < nargs; a++) {
+                  if ((int)a == vec_arg)
+                     continue;
+                  nir_def *od = consumer->src[a].src.ssa;
+                  if (od->index >= c->ureg_map_size ||
+                      c->ureg_map[od->index] == UREG_BAD) {
+                     can_fuse = false;
+                     break;
+                  }
+                  other_srcs[a] = alu_src_ureg(c, &consumer->src[a]);
+               }
+            }
             if (can_fuse) {
-               int other_arg = 1 - vec_arg;
-               nir_def *other_def = consumer->src[other_arg].src.ssa;
-               if (other_def->index < c->ureg_map_size &&
-                   c->ureg_map[other_def->index] != UREG_BAD) {
-                  uint32_t other = alu_src_ureg(c, &consumer->src[other_arg]);
                   nir_def *cdef = &consumer->def;
                   uint32_t cdest = dest;
                   uint32_t cmask = def_mask(cdef);
@@ -669,21 +679,20 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
                      uint32_t fused_src = negate(
                         swizzle(base, ch[0], ch[1], ch[2], ch[3]),
                         ng[0], ng[1], ng[2], ng[3]);
-                     if (vec_arg == 0)
-                        i915_emit_arith(p, hw_op, cdest,
-                                        group_mask & cmask, 0,
-                                        fused_src, other, 0);
-                     else
-                        i915_emit_arith(p, hw_op, cdest,
-                                        group_mask & cmask, 0,
-                                        other, fused_src, 0);
+                     uint32_t s[3];
+                     for (unsigned a = 0; a < nargs; a++)
+                        s[a] = ((int)a == vec_arg) ? fused_src
+                                                   : other_srcs[a];
+                     i915_emit_arith(p, hw_op, cdest,
+                                     group_mask & cmask, 0,
+                                     s[0], nargs > 1 ? s[1] : 0,
+                                     nargs > 2 ? s[2] : 0);
                      emitted[i] = true;
                   }
 
                   set_ureg(c, cdef, cdest);
                   c->def_csr[cdef->index] = p->csr - 3;
                   break;
-               }
             }
          }
       }

From 6e38f519e0a68ad60e55235f314dab0f84c305f5 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 7 May 2026 12:40:35 -0400
Subject: [PATCH 19/20] i915/corm: add late scalarization as variant dimension

Some shaders produce better code when fully scalarized after
optimization: vec3(a, b, 1.0) feeding a dot product creates a
cross-register vec construction, but scalarizing the fmul exposes
1.0*1.0 to constant folding, eliminating the vec entirely.

Other shaders are worse fully scalar because corm's vec construction
handles same_reg vecs at zero cost. Add late_scalar as a variant
dimension so the multi-variant framework picks whichever is better
per shader.

shader-db (I915_FS=nir): 254/403 compiled, 4063 alu
shader-db (I915_FS=both): nir won 254 (26 identical, 1 tied, 221 better, 6 only),
  36 TGSI, 113 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_fpc.h   |  1 +
 src/gallium/drivers/i915/i915_state.c | 20 ++++++++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h
index fe0d0f1e544..9e3e4b8ee63 100644
--- a/src/gallium/drivers/i915/i915_fpc.h
+++ b/src/gallium/drivers/i915/i915_fpc.h
@@ -185,6 +185,7 @@ extern void i915_translate_fragment_program(struct i915_context *i915,
 struct corm_compile_opts {
    bool deferred_const;
    bool seq_sne_opt;
+   bool late_scalar; /* run nir_lower_alu_to_scalar before translating */
 };
 
 extern void i915_translate_fragment_program_nir(struct i915_context *i915,
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 10a185db957..88c65eee1c3 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -741,6 +741,10 @@ i915_create_fs_state(struct pipe_context *pipe,
       { .deferred_const = false, .seq_sne_opt = true },
       { .deferred_const = true,  .seq_sne_opt = false },
       { .deferred_const = true,  .seq_sne_opt = true },
+      { .deferred_const = false, .seq_sne_opt = false, .late_scalar = true },
+      { .deferred_const = false, .seq_sne_opt = true,  .late_scalar = true },
+      { .deferred_const = true,  .seq_sne_opt = false, .late_scalar = true },
+      { .deferred_const = true,  .seq_sne_opt = true,  .late_scalar = true },
    };
 
    struct i915_fragment_shader nir_results[ARRAY_SIZE(corm_variants)];
@@ -764,14 +768,19 @@ i915_create_fs_state(struct pipe_context *pipe,
       nir_index_ssa_defs(nir_shader_get_entrypoint(nir_s));
 
       for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) {
-         nir_shader *variant_nir = (v == ARRAY_SIZE(corm_variants) - 1)
-            ? nir_s : nir_shader_clone(NULL, nir_s);
+         nir_shader *variant_nir = nir_shader_clone(NULL, nir_s);
+         if (corm_variants[v].late_scalar) {
+            NIR_PASS(_, variant_nir, nir_lower_alu_to_scalar, NULL, NULL);
+            NIR_PASS(_, variant_nir, nir_opt_copy_prop);
+            NIR_PASS(_, variant_nir, nir_opt_algebraic);
+            NIR_PASS(_, variant_nir, nir_opt_dce);
+            nir_index_ssa_defs(nir_shader_get_entrypoint(variant_nir));
+         }
          memset(&nir_results[v], 0, sizeof(nir_results[v]));
          i915_populate_fs_metadata(&nir_results[v], variant_nir);
          i915_translate_fragment_program_nir(i915, &nir_results[v],
                                             variant_nir, &corm_variants[v]);
-         if (v < ARRAY_SIZE(corm_variants) - 1)
-            ralloc_free(variant_nir);
+         ralloc_free(variant_nir);
 
          bool ok = !nir_results[v].error || !nir_results[v].error[0];
          if (ok && (best_nir < 0 ||
@@ -779,8 +788,7 @@ i915_create_fs_state(struct pipe_context *pipe,
             best_nir = v;
       }
 
-      if (try_tgsi)
-         ralloc_free(nir_s);
+      ralloc_free(nir_s);
    }
 
    if (try_tgsi) {

From ca9f95f78330f6e56c9b167ec4bd39daaafc0bdd Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 7 May 2026 14:51:48 -0400
Subject: [PATCH 20/20] i915/corm: add shrink_vectors to reduce cross-register
 vec construction

nir_opt_shrink_vectors narrows vector widths when only a subset of
components are consumed, which eliminates unnecessary cross-register
vec constructions. Follow with copy_prop + dce to clean up.

shader-db (I915_FS=nir): 272/403 compiled, 4388 alu
shader-db (I915_FS=both): nir won 272 (26 identical, 1 tied, 239 better, 6 only),
  18 TGSI, 113 neither

Assisted-by: Claude
---
 src/gallium/drivers/i915/i915_state.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 88c65eee1c3..24adc396241 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -765,6 +765,9 @@ i915_create_fs_state(struct pipe_context *pipe,
       NIR_PASS(_, nir_s, nir_opt_algebraic);
       NIR_PASS(_, nir_s, nir_opt_algebraic_late);
       NIR_PASS(_, nir_s, nir_opt_dce);
+      NIR_PASS(_, nir_s, nir_opt_shrink_vectors, false);
+      NIR_PASS(_, nir_s, nir_opt_copy_prop);
+      NIR_PASS(_, nir_s, nir_opt_dce);
       nir_index_ssa_defs(nir_shader_get_entrypoint(nir_s));
 
       for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) {