diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 0bbbd66662b..ef81f69740c 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -88,8 +88,15 @@ struct i915_winsys_batchbuffer;
 
 #define I915_MAX_CONSTANT 32
 
-/** See constant_flags[] below */
-#define I915_CONSTFLAG_USER 0x1f
+/**
+ * Per-channel flags for constant_flags[].
+ * Bits 0-3: channel has a compiler immediate.
+ * Bits 4-7: channel has a user (UBO) value uploaded at draw time.
+ * A channel is available when neither bit is set.
+ */
+#define I915_CONSTFLAG_IMM(ch)     (1 << (ch))
+#define I915_CONSTFLAG_USER_CH(ch) (1 << ((ch) + 4))
+#define I915_CONSTFLAG_USER        0xf0
 
 /**
  * Subclass of pipe_shader_state
@@ -103,6 +110,10 @@ struct i915_fragment_shader {
 
    uint32_t *program;
    uint32_t program_len;
+   uint32_t nr_alu_insn;
+   uint32_t nr_tex_insn;
+   uint32_t nr_tex_indirect;
+   uint32_t nr_temps;
 
    /**
     * constants introduced during translation.
@@ -134,12 +145,15 @@ struct i915_fragment_shader {
    } texcoords[I915_TEX_UNITS];
 
    bool reads_pntc;
+   bool writes_z;
+
+   unsigned num_inputs;
+   uint8_t input_semantic_name[PIPE_MAX_SHADER_INPUTS];
+   uint8_t input_semantic_index[PIPE_MAX_SHADER_INPUTS];
 
-   /* Set if the shader is an internal (blit, etc.) shader that shouldn't debug
-    * log by default. */
    bool internal;
 
-   char *error; /* Any error message from compiling this shader (or NULL) */
+   char *error;
 };
 
 struct i915_cache_context;
diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h
index d234042dea2..9e3e4b8ee63 100644
--- a/src/gallium/drivers/i915/i915_fpc.h
+++ b/src/gallium/drivers/i915/i915_fpc.h
@@ -136,6 +136,15 @@ swizzle(int reg, uint32_t x, uint32_t y, uint32_t z, uint32_t w)
            CHANNEL_SRC(GET_CHANNEL_SRC(reg, w), 3));
 }
 
+static inline int
+negate(int reg, int x, int y, int z, int w)
+{ /* XOR: a flag of 1 flips that channel's negate bit, 0 leaves it as is. */
+   return reg ^ (x << UREG_CHANNEL_X_NEGATE_SHIFT |
+                 y << UREG_CHANNEL_Y_NEGATE_SHIFT |
+                 z << UREG_CHANNEL_Z_NEGATE_SHIFT |
+                 w << UREG_CHANNEL_W_NEGATE_SHIFT);
+}
+
 #define A0_DEST(reg) (((reg)&UREG_TYPE_NR_MASK) >> UREG_A0_DEST_SHIFT_LEFT)
 #define D0_DEST(reg) (((reg)&UREG_TYPE_NR_MASK) >> UREG_A0_DEST_SHIFT_LEFT)
 #define T0_DEST(reg) (((reg)&UREG_TYPE_NR_MASK) >> UREG_A0_DEST_SHIFT_LEFT)
@@ -173,8 +182,21 @@ swizzle(int reg, uint32_t x, uint32_t y, uint32_t z, uint32_t w)
  */
 extern void i915_translate_fragment_program(struct i915_context *i915,
                                             struct i915_fragment_shader *fs);
+struct corm_compile_opts {
+   bool deferred_const; /* delay placing scalar constants until first use */
+   bool seq_sne_opt;    /* 2-insn seq/sne when one operand is constant zero */
+   bool late_scalar;    /* NOTE(review): no reader in view -- document use */
+};
+
+extern void i915_translate_fragment_program_nir(struct i915_context *i915,
+                                                struct i915_fragment_shader *ifs,
+                                                struct nir_shader *s,
+                                                const struct corm_compile_opts *opts);
+extern void i915_use_passthrough_shader(struct i915_fragment_shader *fs);
+extern void i915_program_error(struct i915_fp_compile *p, const char *msg, ...);
 
 extern uint32_t i915_get_temp(struct i915_fp_compile *p);
+extern void i915_release_temp(struct i915_fp_compile *p, int reg);
 extern uint32_t i915_get_utemp(struct i915_fp_compile *p);
 extern void i915_release_utemps(struct i915_fp_compile *p);
 
@@ -191,6 +213,8 @@ extern uint32_t i915_emit_decl(struct i915_fp_compile *p, uint32_t type,
                                uint32_t nr, uint32_t d0_flags);
 
 extern uint32_t i915_emit_const1f(struct i915_fp_compile *p, float c0);
+extern uint32_t i915_emit_const1f_prefer(struct i915_fp_compile *p, float c0,
+                                         int preferred_reg);
 
 extern uint32_t i915_emit_const2f(struct i915_fp_compile *p, float c0,
                                   float c1);
diff --git a/src/gallium/drivers/i915/i915_fpc_emit.c b/src/gallium/drivers/i915/i915_fpc_emit.c
index 603c79e089f..aeace4396ca 100644
--- a/src/gallium/drivers/i915/i915_fpc_emit.c
+++ b/src/gallium/drivers/i915/i915_fpc_emit.c
@@ -25,11 +25,45 @@
  *
  **************************************************************************/
 
+#include <stdarg.h>
+
+#include "util/ralloc.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 #include "i915_context.h"
 #include "i915_fpc.h"
 #include "i915_reg.h"
 
+void
+i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
+{ /* printf-style: msg is a format string consumed together with the varargs */
+   va_list args;
+   va_start(args, msg);
+   ralloc_vasprintf_append(&p->error, msg, args); /* appends to p->error */
+   va_end(args);
+}
+
+static const unsigned passthrough_program[] = {
+   _3DSTATE_PIXEL_SHADER_PROGRAM | ((1 * 3) - 1), /* packet header dword */
+   (A0_MOV | (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) | A0_DEST_CHANNEL_ALL |
+    (REG_TYPE_R << A0_SRC0_TYPE_SHIFT) | (0 << A0_SRC0_NR_SHIFT)), /* MOV oC, R0 */
+   ((SRC_ONE << A1_SRC0_CHANNEL_X_SHIFT) | /* source selects: ONE, ZERO, ZERO, ONE */
+    (SRC_ZERO << A1_SRC0_CHANNEL_Y_SHIFT) |
+    (SRC_ZERO << A1_SRC0_CHANNEL_Z_SHIFT) |
+    (SRC_ONE << A1_SRC0_CHANNEL_W_SHIFT)),
+   0}; /* last dword of the single instruction is left zero */
+
+/* Install a copy of passthrough_program; program_len is 0 if allocation fails. */
+void
+i915_use_passthrough_shader(struct i915_fragment_shader *fs)
+{
+   fs->program = (uint32_t *)MALLOC(sizeof(passthrough_program));
+   fs->program_len = fs->program ? ARRAY_SIZE(passthrough_program) : 0;
+   if (fs->program)
+      memcpy(fs->program, passthrough_program, sizeof(passthrough_program));
+   fs->num_constants = 0;
+}
+
 uint32_t
 i915_get_temp(struct i915_fp_compile *p)
 {
@@ -43,7 +77,7 @@ i915_get_temp(struct i915_fp_compile *p)
    return bit - 1;
 }
 
-static void
+void
 i915_release_temp(struct i915_fp_compile *p, int reg)
 {
    p->temp_flag &= ~(1 << reg);
@@ -179,8 +213,6 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask,
 {
    const uint32_t k = UREG(GET_UREG_TYPE(coord), GET_UREG_NR(coord));
 
-   int temp = -1;
-
    uint32_t coord_used = 0xf << UREG_CHANNEL_X_SHIFT;
    if (coord_mask & TGSI_WRITEMASK_Y)
       coord_used |= 0xf << UREG_CHANNEL_Y_SHIFT;
@@ -191,13 +223,10 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask,
 
    if ((coord & coord_used) != (k & coord_used) ||
        GET_UREG_TYPE(coord) == REG_TYPE_CONST) {
-      /* texcoord is swizzled or negated.  Need to allocate a new temporary
-       * register (a utemp / unpreserved temp) won't do.
+      /* texcoord is swizzled or negated.  Need a temporary to hold it.
+       * Use a utemp so it doesn't create a tex indirect phase boundary.
        */
-      uint32_t tempReg;
-
-      temp = i915_get_temp(p);          /* get temp reg index */
-      tempReg = UREG(REG_TYPE_R, temp); /* make i915 register */
+      uint32_t tempReg = i915_get_utemp(p);
 
       i915_emit_arith(p, A0_MOV, tempReg,
                       A0_DEST_CHANNEL_ALL, /* dest reg, writemask */
@@ -227,11 +256,21 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask,
          p->nr_tex_indirect++;
 
       /* Reading from an r# register whose contents depend on output of the
-       * current phase defines a phase boundary.
+       * current phase defines a phase boundary.  Prefer just bumping the
+       * phase count (free), but if we'd exceed the HW limit, copy to a
+       * utemp instead (costs 1 ALU instruction).
        */
       if (GET_UREG_TYPE(coord) == REG_TYPE_R &&
-          p->register_phases[GET_UREG_NR(coord)] == p->nr_tex_indirect)
-         p->nr_tex_indirect++;
+          p->register_phases[GET_UREG_NR(coord)] == p->nr_tex_indirect) {
+         if (p->nr_tex_indirect + 1 < I915_MAX_TEX_INDIRECT) {
+            p->nr_tex_indirect++;
+         } else {
+            uint32_t tmp = i915_get_utemp(p);
+            i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0,
+                            coord, 0, 0);
+            coord = tmp;
+         }
+      }
 
       if (p->csr < p->program + I915_PROGRAM_SIZE) {
          *(p->csr++) = (opcode | T0_DEST(dest) | T0_SAMPLER(sampler));
@@ -246,40 +285,75 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask,
       p->nr_tex_insn++;
    }
 
-   if (temp >= 0)
-      i915_release_temp(p, temp);
-
    return dest;
 }
 
+/* Try to place c0 in one channel of constant `reg`; UREG_BAD if none fits. */
+static uint32_t
+i915_try_const1f_in_reg(struct i915_fp_compile *p, float c0, unsigned reg)
+{
+   struct i915_fragment_shader *fs = p->shader;
+   for (unsigned ch = 0; ch < 4; ch++) {
+      const unsigned flags = fs->constant_flags[reg];
+      /* Skip user channels and immediates that hold a different value. */
+      if ((flags & I915_CONSTFLAG_USER_CH(ch)) ||
+          ((flags & I915_CONSTFLAG_IMM(ch)) && fs->constants[reg][ch] != c0))
+         continue;
+      fs->constants[reg][ch] = c0;
+      fs->constant_flags[reg] |= I915_CONSTFLAG_IMM(ch);
+      if (fs->num_constants < reg + 1)
+         fs->num_constants = reg + 1;
+      return swizzle(UREG(REG_TYPE_CONST, reg), ch, ZERO, ZERO, ONE);
+   }
+   return UREG_BAD;
+}
+
+/* Place c0 in a constant channel, trying preferred_reg (if valid) first. */
+static uint32_t
+i915_try_emit_const1f(struct i915_fp_compile *p, float c0, int preferred_reg)
+{
+   uint32_t r = UREG_BAD;
+
+   /* Out-of-range hints are ignored rather than indexing past the arrays. */
+   if (preferred_reg >= 0 && preferred_reg < I915_MAX_CONSTANT)
+      r = i915_try_const1f_in_reg(p, c0, preferred_reg);
+
+   for (unsigned reg = 0; r == UREG_BAD && reg < I915_MAX_CONSTANT; reg++)
+      r = i915_try_const1f_in_reg(p, c0, reg);
+   if (r != UREG_BAD)
+      return r;
+
+   i915_program_error(p, "i915_emit_const1f: out of constants");
+   return 0;
+}
+
 uint32_t
 i915_emit_const1f(struct i915_fp_compile *p, float c0)
 {
-   struct i915_fragment_shader *ifs = p->shader;
-   unsigned reg, idx;
-
    if (c0 == 0.0)
       return swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
    if (c0 == 1.0)
       return swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE);
+   if (c0 == -1.0)
+      return negate(swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE),
+                    1, 1, 1, 1);
 
-   for (reg = 0; reg < I915_MAX_CONSTANT; reg++) {
-      if (ifs->constant_flags[reg] == I915_CONSTFLAG_USER)
-         continue;
-      for (idx = 0; idx < 4; idx++) {
-         if (!(ifs->constant_flags[reg] & (1 << idx)) ||
-             ifs->constants[reg][idx] == c0) {
-            ifs->constants[reg][idx] = c0;
-            ifs->constant_flags[reg] |= 1 << idx;
-            if (reg + 1 > ifs->num_constants)
-               ifs->num_constants = reg + 1;
-            return swizzle(UREG(REG_TYPE_CONST, reg), idx, ZERO, ZERO, ONE);
-         }
-      }
-   }
+   return i915_try_emit_const1f(p, c0, -1);
+}
 
-   i915_program_error(p, "i915_emit_const1f: out of constants");
-   return 0;
+uint32_t
+i915_emit_const1f_prefer(struct i915_fp_compile *p, float c0,
+                         int preferred_reg)
+{
+   if (c0 == 0.0)
+      return swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
+   if (c0 == 1.0)
+      return swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE);
+   if (c0 == -1.0)
+      return negate(swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE),
+                    1, 1, 1, 1);
+
+   return i915_try_emit_const1f(p, c0, preferred_reg);
 }
 
 uint32_t
@@ -301,14 +375,15 @@ i915_emit_const2f(struct i915_fp_compile *p, float c0, float c1)
    // XXX emit swizzle here for 0, 1, -1 and any combination thereof
    // we can use swizzle + neg for that
    for (reg = 0; reg < I915_MAX_CONSTANT; reg++) {
-      if (ifs->constant_flags[reg] == 0xf ||
-          ifs->constant_flags[reg] == I915_CONSTFLAG_USER)
+      uint8_t occupied = (ifs->constant_flags[reg] & 0xf) |
+                         (ifs->constant_flags[reg] >> 4);
+      if (occupied == 0xf)
          continue;
       for (idx = 0; idx < 3; idx++) {
-         if (!(ifs->constant_flags[reg] & (3 << idx))) {
+         if (!(occupied & (3 << idx))) {
             ifs->constants[reg][idx + 0] = c0;
             ifs->constants[reg][idx + 1] = c1;
-            ifs->constant_flags[reg] |= 3 << idx;
+            ifs->constant_flags[reg] |= (3 << idx); /* immediate bits */
             if (reg + 1 > ifs->num_constants)
                ifs->num_constants = reg + 1;
             return swizzle(UREG(REG_TYPE_CONST, reg), idx, idx + 1, ZERO, ONE);
@@ -330,9 +405,9 @@ i915_emit_const4f(struct i915_fp_compile *p, float c0, float c1, float c2,
    // XXX emit swizzle here for 0, 1, -1 and any combination thereof
    // we can use swizzle + neg for that
    for (reg = 0; reg < I915_MAX_CONSTANT; reg++) {
-      if (ifs->constant_flags[reg] == 0xf && ifs->constants[reg][0] == c0 &&
-          ifs->constants[reg][1] == c1 && ifs->constants[reg][2] == c2 &&
-          ifs->constants[reg][3] == c3) {
+      if ((ifs->constant_flags[reg] & 0x0f) == 0x0f &&
+          ifs->constants[reg][0] == c0 && ifs->constants[reg][1] == c1 &&
+          ifs->constants[reg][2] == c2 && ifs->constants[reg][3] == c3) {
          return UREG(REG_TYPE_CONST, reg);
       } else if (ifs->constant_flags[reg] == 0) {
 
@@ -340,7 +415,7 @@ i915_emit_const4f(struct i915_fp_compile *p, float c0, float c1, float c2,
          ifs->constants[reg][1] = c1;
          ifs->constants[reg][2] = c2;
          ifs->constants[reg][3] = c3;
-         ifs->constant_flags[reg] = 0xf;
+         ifs->constant_flags[reg] = 0x0f;
          if (reg + 1 > ifs->num_constants)
             ifs->num_constants = reg + 1;
          return UREG(REG_TYPE_CONST, reg);
diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c
new file mode 100644
index 00000000000..d1835800413
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@@ -0,0 +1,1310 @@
+/*
+ * Copyright 2025 Red Hat, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/nir/nir.h"
+#include "tgsi/tgsi_from_mesa.h"
+#include "util/log.h"
+#include "util/ralloc.h"
+#include "util/u_memory.h"
+
+#include "i915_context.h"
+#include "i915_debug.h"
+#include "i915_debug_private.h"
+#include "i915_fpc.h"
+#include "i915_reg.h"
+
+struct nir_to_i915 {
+   struct corm_compile_opts opts; /* optimization toggles for this compile */
+   struct i915_fp_compile *p;     /* emit state handed to the i915_emit_* helpers */
+   struct i915_fragment_shader *ifs; /* shader receiving texcoord/input info */
+
+   uint32_t *ureg_map;    /* SSA index -> ureg; UREG_BAD = deferred constant */
+   uint32_t **def_csr;    /* SSA index -> instruction dword patched in place by emit_alu; presumably the producer's first dword */
+   float *deferred_const; /* SSA index -> value of a not-yet-placed scalar */
+   unsigned ureg_map_size; /* bound checked before indexing by SSA index */
+
+   int *last_use; /* SSA index -> ip of the last instruction reading the def */
+   int ip;        /* running instruction counter */
+};
+
+static bool
+mark_last_use_cb(nir_src *src, void *state)
+{ /* nir_foreach_src callback; state is the struct nir_to_i915 */
+   struct nir_to_i915 *c = state;
+   if (src->ssa->index < c->ureg_map_size)
+      c->last_use[src->ssa->index] = c->ip; /* later reads overwrite earlier */
+   return true; /* keep walking the remaining sources */
+}
+
+/* Walk impl in order, recording in c->last_use the last ip reading each def. */
+static void
+compute_last_use(struct nir_to_i915 *c, nir_function_impl *impl)
+{
+   c->ip = 0;
+   nir_foreach_block(block, impl)
+      nir_foreach_instr(instr, block) {
+         nir_foreach_src(instr, mark_last_use_cb, c);
+         c->ip++;
+      }
+}
+
+/* nir_foreach_src callback: release the temp of a def at its final read. */
+static bool
+release_if_last_use_cb(nir_src *src, void *state)
+{
+   struct nir_to_i915 *c = state;
+   unsigned idx = src->ssa->index;
+   /* Constants may alias R0 through a ZERO/ONE swizzle but own no temp. */
+   if (!nir_src_is_const(*src) && idx < c->ureg_map_size &&
+       c->last_use[idx] == c->ip && GET_UREG_TYPE(c->ureg_map[idx]) == REG_TYPE_R)
+      i915_release_temp(c->p, GET_UREG_NR(c->ureg_map[idx]));
+   return true;
+}
+
+static void
+release_dead_temps(struct nir_to_i915 *c, nir_instr *instr)
+{ /* Frees temps of instr's sources whose recorded last use is c->ip. */
+   nir_foreach_src(instr, release_if_last_use_cb, c);
+}
+
+static void
+set_ureg(struct nir_to_i915 *c, nir_def *def, uint32_t ureg)
+{ /* Record the ureg that holds SSA def `def`. */
+   assert(def->index < c->ureg_map_size); /* debug-only bounds check */
+   c->ureg_map[def->index] = ureg;
+}
+
+static bool
+is_deferred(struct nir_to_i915 *c, unsigned ssa_index)
+{ /* ssa_index is not bounds-checked here; callers must pass a valid index. */
+   return c->ureg_map[ssa_index] == UREG_BAD; /* UREG_BAD = not placed yet */
+}
+
+/* Place the deferred constant of ssa_index, preferring constant register
+ * preferred_reg (negative: no preference), and record the resulting ureg. */
+static uint32_t
+resolve_const(struct nir_to_i915 *c, unsigned ssa_index, int preferred_reg)
+{
+   const float val = c->deferred_const[ssa_index];
+   c->ureg_map[ssa_index] = i915_emit_const1f_prefer(c->p, val, preferred_reg);
+   return c->ureg_map[ssa_index];
+}
+
+/* Ureg for an SSA source, placing a still-deferred constant on first use. */
+static uint32_t
+src_ureg(struct nir_to_i915 *c, nir_src *src)
+{
+   const unsigned idx = src->ssa->index;
+   assert(idx < c->ureg_map_size);
+   return is_deferred(c, idx) ? resolve_const(c, idx, -1) : c->ureg_map[idx];
+}
+
+/* Ureg for an ALU source with the NIR per-component swizzle applied. */
+static uint32_t
+alu_src_ureg(struct nir_to_i915 *c, nir_alu_src *src)
+{
+   const uint8_t *swz = src->swizzle;
+
+   return swizzle(src_ureg(c, &src->src), swz[0], swz[1], swz[2], swz[3]);
+}
+
+/* A0 writemask covering the first num_components channels of def (max 4). */
+static uint32_t
+def_mask(nir_def *def)
+{
+   const unsigned nc = def->num_components;
+   return (nc >= 1 ? A0_DEST_CHANNEL_X : 0) |
+          (nc >= 2 ? A0_DEST_CHANNEL_Y : 0) |
+          (nc >= 3 ? A0_DEST_CHANNEL_Z : 0) |
+          (nc >= 4 ? A0_DEST_CHANNEL_W : 0);
+}
+
+/* Translate a 4-bit xyzw writemask into A0_DEST_CHANNEL_* bits. */
+static uint32_t
+writemask_to_mask(unsigned wm)
+{
+   const uint32_t x = (wm & 1) ? A0_DEST_CHANNEL_X : 0;
+   const uint32_t y = (wm & 2) ? A0_DEST_CHANNEL_Y : 0;
+   const uint32_t z = (wm & 4) ? A0_DEST_CHANNEL_Z : 0;
+   const uint32_t w = (wm & 8) ? A0_DEST_CHANNEL_W : 0;
+   return x | y | z | w;
+}
+
+static uint32_t
+get_texcoord_mapping(struct i915_fragment_shader *fs,
+                     unsigned semantic, int index)
+{ /* Find or allocate the texcoords[] slot for (semantic, index). */
+   for (int i = 0; i < I915_TEX_UNITS; i++) {
+      if (fs->texcoords[i].semantic == -1) { /* unused slot: claim it */
+         fs->texcoords[i].semantic = semantic;
+         fs->texcoords[i].index = index;
+         return i;
+      }
+      if (fs->texcoords[i].semantic == (int)semantic &&
+          fs->texcoords[i].index == index)
+         return i; /* already mapped */
+   }
+   return 0; /* NOTE(review): table full -- silently aliases slot 0 */
+}
+
+/* Declare the fragment input for a gl_varying_slot and return the ureg that
+ * reads it.  Records a program error and returns 0 for unsupported slots. */
+static uint32_t
+emit_input(struct nir_to_i915 *c, unsigned location)
+{
+   struct i915_fp_compile *p = c->p;
+   struct i915_fragment_shader *ifs = c->ifs;
+   unsigned sem_name, sem_index;
+
+   tgsi_get_gl_varying_semantic((gl_varying_slot)location, true,
+                                &sem_name, &sem_index);
+
+   switch (sem_name) {
+   case TGSI_SEMANTIC_GENERIC:
+   case TGSI_SEMANTIC_TEXCOORD:
+   case TGSI_SEMANTIC_PCOORD:
+   case TGSI_SEMANTIC_POSITION: {
+      if (sem_name == TGSI_SEMANTIC_PCOORD)
+         ifs->reads_pntc = true;
+      int tc = get_texcoord_mapping(ifs, sem_name, sem_index);
+      return i915_emit_decl(p, REG_TYPE_T, T_TEX0 + tc, D0_CHANNEL_ALL);
+   }
+   case TGSI_SEMANTIC_COLOR:
+      if (sem_index == 0)
+         return i915_emit_decl(p, REG_TYPE_T, T_DIFFUSE, D0_CHANNEL_ALL);
+      return swizzle(
+         i915_emit_decl(p, REG_TYPE_T, T_SPECULAR, D0_CHANNEL_XYZ),
+         X, Y, Z, ONE);
+   case TGSI_SEMANTIC_FOG:
+      return swizzle(
+         i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W),
+         W, W, W, W);
+   case TGSI_SEMANTIC_FACE: {
+      int tc = get_texcoord_mapping(ifs, sem_name, sem_index);
+      return i915_emit_decl(p, REG_TYPE_T, T_TEX0 + tc, D0_CHANNEL_X);
+   }
+   default:
+      i915_program_error(p, "Bad input location %u (semantic %u)",
+                         location, sem_name);
+      return 0;
+   }
+}
+
+static void
+emit_load_const(struct nir_to_i915 *c, nir_load_const_instr *load)
+{ /* Maps a load_const def to a ureg; components are read as 32-bit floats. */
+   struct i915_fp_compile *p = c->p;
+
+   switch (load->def.num_components) {
+   case 1: { /* scalar: placement may be postponed until first use */
+      float val = load->value[0].f32;
+      if (c->opts.deferred_const &&
+          val != 0.0f && val != 1.0f && val != -1.0f) {
+         c->deferred_const[load->def.index] = val;
+         set_ureg(c, &load->def, UREG_BAD); /* UREG_BAD marks "not placed yet" */
+      } else {
+         set_ureg(c, &load->def, i915_emit_const1f(p, val));
+      }
+      break;
+   }
+   case 2:
+      set_ureg(c, &load->def,
+               i915_emit_const2f(p, load->value[0].f32,
+                                 load->value[1].f32));
+      break;
+   case 3:
+   case 4: {
+      unsigned n = load->def.num_components;
+      float v[4] = {
+         load->value[0].f32,
+         n > 1 ? load->value[1].f32 : 0.0f,
+         n > 2 ? load->value[2].f32 : 0.0f,
+         n > 3 ? load->value[3].f32 : 0.0f, /* a vec3 leaves v[3] at 0.0 */
+      };
+
+      uint32_t ch[4] = { X, Y, Z, W }; /* channels at index >= n keep this default */
+      int ng[4] = { 0, 0, 0, 0 };
+      bool all_swizzle = true; /* true while every value is 0, 1 or -1 */
+      for (unsigned i = 0; i < n; i++) {
+         if (v[i] == 0.0f)
+            ch[i] = ZERO;
+         else if (v[i] == 1.0f)
+            ch[i] = ONE;
+         else if (v[i] == -1.0f) { /* -1 is ONE with the negate bit */
+            ch[i] = ONE;
+            ng[i] = 1;
+         } else {
+            all_swizzle = false;
+            break;
+         }
+      }
+
+      if (all_swizzle) { /* no constant slot needed: swizzle selects on R0 */
+         set_ureg(c, &load->def,
+                  negate(swizzle(UREG(REG_TYPE_R, 0),
+                                 ch[0], ch[1], ch[2], ch[3]),
+                         ng[0], ng[1], ng[2], ng[3]));
+      } else {
+         set_ureg(c, &load->def, i915_emit_const4fv(p, v));
+      }
+      break;
+   }
+   default: /* def is left unmapped; only the error is recorded */
+      i915_program_error(p, "load_const with %d components",
+                         load->def.num_components);
+      break;
+   }
+}
+
+/* Resolve all deferred scalar constants read by alu, steering them into the
+ * constant register another source already uses (or the first one picked). */
+static void
+coalesce_constants(struct nir_to_i915 *c, nir_alu_instr *alu)
+{
+   unsigned n = nir_op_infos[alu->op].num_inputs;
+   /* Sized for the widest ALU op: vec4 has four sources, not three. */
+   unsigned deferred[NIR_MAX_VEC_COMPONENTS];
+   unsigned nr_deferred = 0;
+   int preferred = -1;
+
+   for (unsigned i = 0; i < n; i++) {
+      unsigned idx = alu->src[i].src.ssa->index;
+      if (is_deferred(c, idx)) {
+         deferred[nr_deferred++] = idx;
+      } else {
+         uint32_t ureg = c->ureg_map[idx];
+         if (GET_UREG_TYPE(ureg) == REG_TYPE_CONST && preferred < 0)
+            preferred = GET_UREG_NR(ureg);
+      }
+   }
+
+   for (unsigned i = 0; i < nr_deferred; i++) {
+      uint32_t ureg = resolve_const(c, deferred[i], preferred);
+      if (preferred < 0 && GET_UREG_TYPE(ureg) == REG_TYPE_CONST)
+         preferred = GET_UREG_NR(ureg);
+   }
+}
+
+static void
+emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
+{
+   struct i915_fp_compile *p = c->p;
+   nir_def *def = &alu->def;
+
+   if (def->index < c->ureg_map_size && c->ureg_map[def->index] != 0)
+      return;
+
+   uint32_t mask = def_mask(def);
+   uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p));
+   set_ureg(c, def, dest);
+
+   coalesce_constants(c, alu);
+
+   uint32_t src0 = 0, src1 = 0, src2 = 0;
+   if (nir_op_infos[alu->op].num_inputs >= 1)
+      src0 = alu_src_ureg(c, &alu->src[0]);
+   if (nir_op_infos[alu->op].num_inputs >= 2)
+      src1 = alu_src_ureg(c, &alu->src[1]);
+   if (nir_op_infos[alu->op].num_inputs >= 3)
+      src2 = alu_src_ureg(c, &alu->src[2]);
+
+   uint32_t *pre_csr = p->csr;
+
+   switch (alu->op) {
+   case nir_op_mov:
+   case nir_op_fcanonicalize:
+   case nir_op_fneg: {
+      i915_release_temp(p, GET_UREG_NR(dest));
+      set_ureg(c, def, alu->op == nir_op_fneg ? negate(src0, 1, 1, 1, 1)
+                                         : src0);
+      unsigned src_idx = alu->src[0].src.ssa->index;
+      if (c->last_use[src_idx] == c->ip)
+         c->last_use[src_idx] = c->last_use[def->index];
+      return;
+   }
+   case nir_op_fabs:
+      i915_emit_arith(p, A0_MAX, dest, mask, 0,
+                      src0, negate(src0, 1, 1, 1, 1), 0);
+      break;
+   case nir_op_fsat: {
+      nir_def *src_def = alu->src[0].src.ssa;
+      uint32_t *prev = c->def_csr[src_def->index];
+      if (prev && list_is_singular(&src_def->uses)) {
+         prev[0] |= A0_DEST_SATURATE;
+         i915_release_temp(p, GET_UREG_NR(dest));
+         set_ureg(c, def, src_ureg(c, &alu->src[0].src));
+         c->def_csr[def->index] = prev;
+         unsigned src_idx = alu->src[0].src.ssa->index;
+         if (c->last_use[src_idx] == c->ip)
+            c->last_use[src_idx] = c->last_use[def->index];
+         return;
+      }
+      i915_emit_arith(p, A0_MOV, dest, mask, A0_DEST_SATURATE, src0, 0, 0);
+      break;
+   }
+   case nir_op_fadd:
+      i915_emit_arith(p, A0_ADD, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_fmul:
+      i915_emit_arith(p, A0_MUL, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_ffma:
+      i915_emit_arith(p, A0_MAD, dest, mask, 0, src0, src1, src2);
+      break;
+   case nir_op_fmin:
+   case nir_op_imin:
+   case nir_op_umin:
+      i915_emit_arith(p, A0_MIN, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_fmax:
+   case nir_op_imax:
+   case nir_op_umax:
+      i915_emit_arith(p, A0_MAX, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_ffloor:
+      i915_emit_arith(p, A0_FLR, dest, mask, 0, src0, 0, 0);
+      break;
+   case nir_op_ffract:
+      i915_emit_arith(p, A0_FRC, dest, mask, 0, src0, 0, 0);
+      break;
+   case nir_op_ftrunc:
+      i915_emit_arith(p, A0_TRC, dest, mask, 0, src0, 0, 0);
+      break;
+   case nir_op_fceil: {
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_arith(p, A0_FLR, tmp, mask, 0,
+                      negate(src0, 1, 1, 1, 1), 0, 0);
+      i915_emit_arith(p, A0_MOV, dest, mask, 0,
+                      negate(tmp, 1, 1, 1, 1), 0, 0);
+      break;
+   }
+   case nir_op_frcp:
+      i915_emit_arith(p, A0_RCP, dest, mask, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+   case nir_op_frsq:
+      i915_emit_arith(p, A0_RSQ, dest, mask, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+   case nir_op_fsqrt: {
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_arith(p, A0_RSQ, tmp, A0_DEST_CHANNEL_X, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      i915_emit_arith(p, A0_MUL, dest, mask, 0,
+                      src0, swizzle(tmp, X, X, X, X), 0);
+      break;
+   }
+   case nir_op_fexp2:
+      i915_emit_arith(p, A0_EXP, dest, mask, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+   case nir_op_flog2:
+      i915_emit_arith(p, A0_LOG, dest, mask, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+   case nir_op_fdot2:
+   case nir_op_fdot2_replicated:
+      i915_emit_arith(p, A0_DP3, dest, mask, 0,
+                      swizzle(src0, X, Y, ZERO, ZERO), src1, 0);
+      break;
+   case nir_op_fdot3:
+   case nir_op_fdot3_replicated:
+      i915_emit_arith(p, A0_DP3, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_fdot4:
+   case nir_op_fdot4_replicated:
+      i915_emit_arith(p, A0_DP4, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_slt:
+      i915_emit_arith(p, A0_SLT, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_sge:
+      i915_emit_arith(p, A0_SGE, dest, mask, 0, src0, src1, 0);
+      break;
+   case nir_op_seq: {
+      const uint32_t zero =
+         swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
+      if (c->opts.seq_sne_opt &&
+          ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) ||
+           (src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) {
+         if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))
+            src0 = src1;
+         /* x == 0  <->  -abs(x) >= 0: 2 insns instead of 3 */
+         uint32_t tmp = i915_get_utemp(p);
+         i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0,
+                         src0, negate(src0, 1, 1, 1, 1), 0);
+         i915_emit_arith(p, A0_SGE, dest, mask, 0,
+                         negate(tmp, 1, 1, 1, 1), zero, 0);
+      } else {
+         /* seq(a,b) = sge(a,b) * sge(b,a) */
+         uint32_t tmp = i915_get_utemp(p);
+         i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0,
+                         src0, src1, 0);
+         i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0);
+         i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0);
+      }
+      break;
+   }
+   case nir_op_sne: {
+      const uint32_t zero =
+         swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
+      if (c->opts.seq_sne_opt &&
+          ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) ||
+           (src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) {
+         if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))
+            src0 = src1;
+         /* x != 0  <->  -abs(x) < 0: 2 insns instead of 3 */
+         uint32_t tmp = i915_get_utemp(p);
+         i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0,
+                         src0, negate(src0, 1, 1, 1, 1), 0);
+         i915_emit_arith(p, A0_SLT, dest, mask, 0,
+                         negate(tmp, 1, 1, 1, 1), zero, 0);
+      } else {
+         /* sne(a,b) = slt(a,b) + slt(b,a) */
+         uint32_t tmp = i915_get_utemp(p);
+         i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0,
+                         src0, src1, 0);
+         i915_emit_arith(p, A0_SLT, dest, mask, 0, src1, src0, 0);
+         i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0);
+      }
+      break;
+   }
+   case nir_op_fpow: {
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_X, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, tmp, src1, 0);
+      i915_emit_arith(p, A0_EXP, dest, mask, 0,
+                      swizzle(tmp, X, X, X, X), 0, 0);
+      break;
+   }
+   case nir_op_bcsel:
+      i915_emit_arith(p, A0_CMP, dest, mask, 0,
+                      negate(src0, 1, 1, 1, 1), src2, src1);
+      break;
+   case nir_op_fcsel_ge:
+      i915_emit_arith(p, A0_CMP, dest, mask, 0, src0, src1, src2);
+      break;
+   case nir_op_fcsel_gt:
+      i915_emit_arith(p, A0_CMP, dest, mask, 0,
+                      negate(src0, 1, 1, 1, 1), src2, src1);
+      break;
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4: {
+      unsigned n = nir_op_infos[alu->op].num_inputs;
+      uint32_t srcs[4] = { 0 };
+      for (unsigned i = 0; i < n; i++)
+         srcs[i] = alu_src_ureg(c, &alu->src[i]);
+
+      bool same_reg = true;
+      for (unsigned i = 1; i < n; i++) {
+         if ((srcs[i] & UREG_TYPE_NR_MASK) != (srcs[0] & UREG_TYPE_NR_MASK)) {
+            same_reg = false;
+            break;
+         }
+      }
+
+      if (same_reg) {
+         uint32_t base = UREG(GET_UREG_TYPE(srcs[0]), GET_UREG_NR(srcs[0]));
+         uint32_t ch[4] = { X, Y, Z, W };
+         int ng[4] = { 0, 0, 0, 0 };
+         for (unsigned i = 0; i < n; i++) {
+            ch[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7;
+            ng[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1;
+         }
+         i915_release_temp(p, GET_UREG_NR(dest));
+         set_ureg(c, def, negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]),
+                                 ng[0], ng[1], ng[2], ng[3]));
+         return;
+      }
+
+      /* If this vec's only consumer is a store_output, write directly
+       * to the output register instead of going through a temp.
+       * If it's a tex instruction, use a utemp to avoid phase boundaries.
+       */
+      if (list_is_singular(&def->uses)) {
+         nir_src *use = list_first_entry(&def->uses, nir_src, use_link);
+         nir_instr *use_instr = nir_src_use_instr(use);
+         if (use_instr->type == nir_instr_type_intrinsic) {
+            nir_intrinsic_instr *store =
+               nir_instr_as_intrinsic(use_instr);
+            if (store->intrinsic == nir_intrinsic_store_output &&
+                nir_intrinsic_component(store) == 0) {
+               nir_io_semantics sem = nir_intrinsic_io_semantics(store);
+               uint32_t out = sem.location == FRAG_RESULT_DEPTH
+                  ? UREG(REG_TYPE_OD, 0) : UREG(REG_TYPE_OC, 0);
+               i915_release_temp(p, GET_UREG_NR(dest));
+               dest = out;
+               set_ureg(c, def, dest);
+            }
+         } else if (use_instr->type == nir_instr_type_tex) {
+            i915_release_temp(p, GET_UREG_NR(dest));
+            uint32_t utemp = i915_get_utemp(p);
+            dest = utemp;
+            set_ureg(c, def, dest);
+         }
+      }
+
+      static const uint32_t chan_mask[] = {
+         A0_DEST_CHANNEL_X, A0_DEST_CHANNEL_Y,
+         A0_DEST_CHANNEL_Z, A0_DEST_CHANNEL_W,
+      };
+      bool emitted[4] = { false };
+      uint32_t ch_sel[4];
+      int neg_sel[4] = { 0, 0, 0, 0 };
+      for (unsigned i = 0; i < n; i++) {
+         ch_sel[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7;
+         neg_sel[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1;
+      }
+
+      /* ALU dest folding: if a vec source is a single-use ALU result in a
+       * temp with identity swizzle, patch that instruction to write
+       * directly into our dest with the right channel mask.
+       */
+      for (unsigned i = 0; i < n; i++) {
+         nir_def *src_def = alu->src[i].src.ssa;
+         uint32_t *prev_csr = c->def_csr[src_def->index];
+         if (!prev_csr)
+            continue;
+         if (GET_UREG_TYPE(srcs[i]) != REG_TYPE_R)
+            continue;
+         unsigned nc = src_def->num_components;
+         if (i + nc > n)
+            continue;
+         bool identity = true;
+         for (unsigned j = 0; j < nc && identity; j++)
+            identity = (j == 0 || alu->src[i + j].src.ssa == src_def) &&
+                       (alu->src[i + j].swizzle[0] == j);
+         if (!identity)
+            continue;
+         bool all_from_this_vec = true;
+         nir_foreach_use(use, src_def) {
+            if (nir_src_use_instr(use) != &alu->instr) {
+               all_from_this_vec = false;
+               break;
+            }
+         }
+         if (!all_from_this_vec)
+            continue;
+
+         uint32_t fold_mask = 0;
+         for (unsigned j = 0; j < nc; j++)
+            fold_mask |= chan_mask[i + j];
+
+         prev_csr[0] = (prev_csr[0] & ~(A0_DEST_CHANNEL_ALL |
+                               (0x1ff << A0_DEST_NR_SHIFT))) |
+                   A0_DEST(dest) | fold_mask;
+
+         i915_release_temp(p, GET_UREG_NR(srcs[i]));
+         c->ureg_map[src_def->index] = dest;
+         for (unsigned j = 0; j < nc; j++)
+            emitted[i + j] = true;
+      }
+
+      /* ALU consumer fusion: if this vec feeds a single binary ALU op
+       * and the other ALU source is a single register, emit the ALU op
+       * per-group with partial writemasks instead of MOV+ALU.
+       */
+      if (list_is_singular(&def->uses)) {
+         nir_src *use = list_first_entry(&def->uses, nir_src, use_link);
+         nir_instr *use_instr = nir_src_use_instr(use);
+         if (use_instr->type == nir_instr_type_alu) {
+            nir_alu_instr *consumer = nir_instr_as_alu(use_instr);
+            unsigned nargs = nir_op_infos[consumer->op].num_inputs;
+            int vec_arg = -1;
+            for (unsigned a = 0; a < nargs; a++) {
+               if (consumer->src[a].src.ssa == def) {
+                  vec_arg = a;
+                  break;
+               }
+            }
+            uint32_t hw_op = 0;
+            bool can_fuse = (vec_arg >= 0);
+            if (can_fuse) {
+               switch (consumer->op) {
+               case nir_op_fmul: hw_op = A0_MUL; break;
+               case nir_op_fadd: hw_op = A0_ADD; break;
+               case nir_op_ffma: hw_op = A0_MAD; break;
+               case nir_op_fmin: case nir_op_imin: case nir_op_umin:
+                  hw_op = A0_MIN; break;
+               case nir_op_fmax: case nir_op_imax: case nir_op_umax:
+                  hw_op = A0_MAX; break;
+               default: can_fuse = false; break;
+               }
+            }
+            /* check the non-vec sources are single registers */
+            uint32_t other_srcs[3] = { 0, 0, 0 };
+            if (can_fuse) {
+               for (unsigned a = 0; a < nargs; a++) {
+                  if ((int)a == vec_arg)
+                     continue;
+                  nir_def *od = consumer->src[a].src.ssa;
+                  if (od->index >= c->ureg_map_size ||
+                      c->ureg_map[od->index] == UREG_BAD) {
+                     can_fuse = false;
+                     break;
+                  }
+                  other_srcs[a] = alu_src_ureg(c, &consumer->src[a]);
+               }
+            }
+            if (can_fuse) {
+                  nir_def *cdef = &consumer->def;
+                  uint32_t cdest = dest;
+                  uint32_t cmask = def_mask(cdef);
+
+                  for (unsigned i = 0; i < n; i++) {
+                     if (emitted[i])
+                        continue;
+                     uint32_t base = UREG(GET_UREG_TYPE(srcs[i]),
+                                          GET_UREG_NR(srcs[i]));
+                     uint32_t group_mask = chan_mask[i];
+                     uint32_t ch[4] = { X, Y, Z, W };
+                     int ng[4] = { 0, 0, 0, 0 };
+                     ch[i] = ch_sel[i];
+                     ng[i] = neg_sel[i];
+                     for (unsigned j = i + 1; j < n; j++) {
+                        if (!emitted[j] &&
+                            (ch_sel[j] >= SRC_ZERO ||
+                             (srcs[j] & UREG_TYPE_NR_MASK) ==
+                             (srcs[i] & UREG_TYPE_NR_MASK))) {
+                           group_mask |= chan_mask[j];
+                           ch[j] = ch_sel[j];
+                           ng[j] = neg_sel[j];
+                           emitted[j] = true;
+                        }
+                     }
+                     uint32_t fused_src = negate(
+                        swizzle(base, ch[0], ch[1], ch[2], ch[3]),
+                        ng[0], ng[1], ng[2], ng[3]);
+                     uint32_t s[3];
+                     for (unsigned a = 0; a < nargs; a++)
+                        s[a] = ((int)a == vec_arg) ? fused_src
+                                                   : other_srcs[a];
+                     i915_emit_arith(p, hw_op, cdest,
+                                     group_mask & cmask, 0,
+                                     s[0], nargs > 1 ? s[1] : 0,
+                                     nargs > 2 ? s[2] : 0);
+                     emitted[i] = true;
+                  }
+
+                  set_ureg(c, cdef, cdest);
+                  c->def_csr[cdef->index] = p->csr - 3;
+                  break;
+            }
+         }
+      }
+
+      /* Process real-register sources first, folding in any ZERO/ONE
+       * const-swizzle sources that can piggyback on the same MOV.
+       * Use the unswizzled base register since swizzle() composes.
+       */
+      for (unsigned i = 0; i < n; i++) {
+         if (emitted[i] || ch_sel[i] >= SRC_ZERO)
+            continue;
+         uint32_t base = UREG(GET_UREG_TYPE(srcs[i]), GET_UREG_NR(srcs[i]));
+         uint32_t group_mask = chan_mask[i];
+         uint32_t ch[4] = { X, Y, Z, W };
+         int ng[4] = { 0, 0, 0, 0 };
+         ch[i] = ch_sel[i];
+         ng[i] = neg_sel[i];
+         for (unsigned j = i + 1; j < n; j++) {
+            if (!emitted[j] &&
+                (ch_sel[j] >= SRC_ZERO ||
+                 (srcs[j] & UREG_TYPE_NR_MASK) ==
+                 (srcs[i] & UREG_TYPE_NR_MASK))) {
+               group_mask |= chan_mask[j];
+               ch[j] = ch_sel[j];
+               ng[j] = neg_sel[j];
+               emitted[j] = true;
+            }
+         }
+         i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0,
+                         negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]),
+                                ng[0], ng[1], ng[2], ng[3]),
+                         0, 0);
+         emitted[i] = true;
+      }
+      /* Any remaining const-swizzle-only sources */
+      for (unsigned i = 0; i < n; i++) {
+         if (emitted[i])
+            continue;
+         uint32_t group_mask = chan_mask[i];
+         uint32_t ch[4] = { X, Y, Z, W };
+         int ng[4] = { 0, 0, 0, 0 };
+         ch[i] = ch_sel[i];
+         ng[i] = neg_sel[i];
+         for (unsigned j = i + 1; j < n; j++) {
+            if (!emitted[j]) {
+               group_mask |= chan_mask[j];
+               ch[j] = ch_sel[j];
+               ng[j] = neg_sel[j];
+               emitted[j] = true;
+            }
+         }
+         i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0,
+                         negate(swizzle(srcs[i], ch[0], ch[1], ch[2], ch[3]),
+                                ng[0], ng[1], ng[2], ng[3]),
+                         0, 0);
+         emitted[i] = true;
+      }
+      break;
+   }
+   case nir_op_fsign: {
+      uint32_t tmp = i915_get_utemp(p);
+      const uint32_t zero = swizzle(UREG(REG_TYPE_R, 0),
+                                    ZERO, ZERO, ZERO, ZERO);
+      i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0,
+                      src0, zero, 0);
+      i915_emit_arith(p, A0_SLT, dest, mask, 0, zero, src0, 0);
+      i915_emit_arith(p, A0_ADD, dest, mask, 0,
+                      dest, negate(tmp, 1, 1, 1, 1), 0);
+      break;
+   }
+   default:
+      i915_program_error(p, "unsupported NIR ALU op: %s",
+                         nir_op_infos[alu->op].name);
+      break;
+   }
+
+   if (p->csr == pre_csr + 3)
+      c->def_csr[def->index] = pre_csr;
+
+   uint32_t save = 0;
+   if (GET_UREG_TYPE(dest) == REG_TYPE_U)
+      save = p->utemp_flag & (1 << GET_UREG_NR(dest));
+   i915_release_utemps(p);
+   p->utemp_flag |= save;
+}
+
+/* Pick the D0 sample type for a NIR sampler dimension (2D on error). */
+static uint32_t
+translate_tex_type(struct i915_fp_compile *p, enum glsl_sampler_dim dim)
+{
+   if (dim == GLSL_SAMPLER_DIM_3D)
+      return D0_SAMPLE_TYPE_VOLUME;
+   if (dim == GLSL_SAMPLER_DIM_CUBE)
+      return D0_SAMPLE_TYPE_CUBE;
+
+   /* 1D, RECT and EXTERNAL are all sampled through the 2D type. */
+   bool flat = dim == GLSL_SAMPLER_DIM_1D || dim == GLSL_SAMPLER_DIM_2D ||
+               dim == GLSL_SAMPLER_DIM_RECT ||
+               dim == GLSL_SAMPLER_DIM_EXTERNAL;
+   if (!flat)
+      i915_program_error(p, "unsupported sampler dim %d", dim);
+   /* Unsupported dimensions fall back to 2D after recording the error. */
+   return D0_SAMPLE_TYPE_2D;
+}
+
+/*
+ * Compute the TGSI_WRITEMASK_* coordinate mask that emit_tex() hands to
+ * i915_emit_texld(): the channels implied by the sampler dimension, plus
+ * Z for shadow lookups and W for txb or a projector source.
+ */
+static uint32_t
+tex_coord_mask(nir_tex_instr *tex)
+{
+   const enum glsl_sampler_dim dim = tex->sampler_dim;
+   uint32_t channels;
+
+   if (dim == GLSL_SAMPLER_DIM_3D || dim == GLSL_SAMPLER_DIM_CUBE) {
+      channels = TGSI_WRITEMASK_XYZ;
+   } else if (dim == GLSL_SAMPLER_DIM_1D || dim == GLSL_SAMPLER_DIM_2D ||
+              dim == GLSL_SAMPLER_DIM_RECT ||
+              dim == GLSL_SAMPLER_DIM_EXTERNAL) {
+      channels = TGSI_WRITEMASK_XY;
+   } else {
+      /* Any other dimension only gets the X channel. */
+      channels = TGSI_WRITEMASK_X;
+   }
+
+   /* emit_tex() places the shadow comparator in Z. */
+   if (tex->is_shadow)
+      channels |= TGSI_WRITEMASK_Z;
+
+   /* emit_tex() places the bias or projector in W. */
+   bool needs_w = tex->op == nir_texop_txb;
+   for (unsigned idx = 0; idx < tex->num_srcs && !needs_w; idx++)
+      needs_w = tex->src[idx].src_type == nir_tex_src_projector;
+   if (needs_w)
+      channels |= TGSI_WRITEMASK_W;
+
+   return channels;
+}
+
+/* Emit the sampler declaration and texld for one NIR tex instruction. */
+static void
+emit_tex(struct nir_to_i915 *c, nir_tex_instr *tex)
+{
+   struct i915_fp_compile *p = c->p;
+   uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p));
+   set_ureg(c, &tex->def, dest);
+
+   uint32_t hw_tex = translate_tex_type(p, tex->sampler_dim);
+   uint32_t sampler = i915_emit_decl(p, REG_TYPE_S, tex->sampler_index, hw_tex);
+
+   uint32_t coord = 0, bias_or_proj = 0, shadow = 0;
+   bool has_bias = false, has_proj = false, has_shadow = false;
+   int packed_temp = -1; /* R temp holding the packed coordinate, if any */
+
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      switch (tex->src[i].src_type) {
+      case nir_tex_src_coord:
+         coord = src_ureg(c, &tex->src[i].src);
+         break;
+      case nir_tex_src_bias:
+         bias_or_proj = src_ureg(c, &tex->src[i].src);
+         has_bias = true;
+         break;
+      case nir_tex_src_projector:
+         bias_or_proj = src_ureg(c, &tex->src[i].src);
+         has_proj = true;
+         break;
+      case nir_tex_src_comparator:
+         shadow = src_ureg(c, &tex->src[i].src);
+         has_shadow = true;
+         break;
+      default:
+         break;
+      }
+   }
+
+   /* 1D textures: set Y = X so LOD works correctly when sampled as 2D */
+   if (tex->sampler_dim == GLSL_SAMPLER_DIM_1D)
+      coord = swizzle(coord, X, X, Z, W);
+
+   /* pack bias/projector/shadow into a single coord register if needed */
+   if (has_bias || has_proj || has_shadow) {
+      packed_temp = i915_get_temp(p);
+      uint32_t tmp = UREG(REG_TYPE_R, packed_temp);
+      i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0, coord, 0, 0);
+
+      if (has_shadow)
+         i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_Z, 0,
+                         swizzle(shadow, X, X, X, X), 0, 0);
+      if (has_bias || has_proj)
+         i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_W, 0,
+                         swizzle(bias_or_proj, X, X, X, X), 0, 0);
+      coord = tmp;
+   }
+
+   uint32_t opcode;
+   if (tex->op == nir_texop_txb) {
+      opcode = T0_TEXLDB;
+   } else if (has_proj) {
+      opcode = T0_TEXLDP;
+   } else if (tex->op == nir_texop_tex) {
+      opcode = T0_TEXLD;
+   } else {
+      i915_program_error(p, "unsupported tex op %d", tex->op);
+      return;
+   }
+
+   i915_emit_texld(p, dest, A0_DEST_CHANNEL_ALL, sampler, coord, opcode,
+                   tex_coord_mask(tex));
+
+   /* The packed coordinate is dead after the texld: give its temp back. */
+   if (packed_temp >= 0)
+      i915_release_temp(p, packed_temp);
+   i915_release_utemps(p);
+}
+
+static void
+emit_intrinsic(struct nir_to_i915 *c, nir_intrinsic_instr *intr)
+{
+   struct i915_fp_compile *p = c->p;
+   struct i915_fragment_shader *ifs = c->ifs;
+   /* Translate one NIR intrinsic; unsupported ones record a program error. */
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_input: {
+      nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+      unsigned comp = nir_intrinsic_component(intr);
+      uint32_t reg = emit_input(c, sem.location);
+      /* Channel i reads comp + i, clamped to component 3. */
+      if (comp > 0) {
+         reg = swizzle(reg, comp, MIN2(comp + 1, 3),
+                       MIN2(comp + 2, 3), MIN2(comp + 3, 3));
+      }
+
+      set_ureg(c, &intr->def, reg);
+      break;
+   }
+
+   case nir_intrinsic_store_output: {
+      nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+      unsigned comp = nir_intrinsic_component(intr);
+      uint32_t val = src_ureg(c, &intr->src[0]);
+      uint32_t wm = nir_intrinsic_write_mask(intr);
+      uint32_t dest;
+      /* Depth is written to oD; every other location goes to oC. */
+      if (sem.location == FRAG_RESULT_DEPTH) {
+         dest = UREG(REG_TYPE_OD, 0);
+      } else {
+         dest = UREG(REG_TYPE_OC, 0);
+      }
+
+      /* Vec direct-output already wrote to oC/oD, so emit nothing */
+      uint32_t val_type = GET_UREG_TYPE(val);
+      if (val_type == REG_TYPE_OC || val_type == REG_TYPE_OD)
+         break;
+      /* def_csr, when set, points at an emitted instruction for the def. */
+      nir_def *src_def = intr->src[0].ssa;
+      uint32_t *prev = c->def_csr[src_def->index];
+
+      /* Look through identity vec (same_reg case emits no instructions).
+       * Check that all uses of the underlying def come from this vec.
+       */
+      bool looked_through_vec = false;
+      if (!prev) {
+         nir_instr *def_instr = nir_def_instr_nonconst(src_def);
+         if (def_instr->type == nir_instr_type_alu) {
+            nir_alu_instr *vec = nir_instr_as_alu(def_instr);
+            if ((vec->op == nir_op_vec4 || vec->op == nir_op_vec3 ||
+                 vec->op == nir_op_vec2) &&
+                list_is_singular(&src_def->uses)) {
+               nir_def *inner = vec->src[0].src.ssa;
+               bool all_from_vec = true;
+               nir_foreach_use(use, inner) {
+                  if (nir_src_use_instr(use) != def_instr) {
+                     all_from_vec = false;
+                     break;
+                  }
+               }
+               if (all_from_vec) {
+                  src_def = inner;
+                  prev = c->def_csr[src_def->index];
+                  looked_through_vec = true;
+               }
+            }
+         }
+      }
+      /* Redirect that instruction's dest field and writemask to the output. */
+      if (prev && comp == 0 &&
+          (looked_through_vec || list_is_singular(&src_def->uses))) {
+         prev[0] = (prev[0] & ~(A0_DEST_CHANNEL_ALL |
+                                (0x1ff << A0_DEST_NR_SHIFT))) |
+                   A0_DEST(dest) | writemask_to_mask(wm);
+         break;
+      }
+      /* Source channel i lands in channel i + comp; lower ones read ZERO. */
+      if (comp > 0) {
+         uint32_t s[4] = { X, Y, Z, W };
+         for (int i = 3; i >= (int)comp; i--)
+            s[i] = s[i - comp];
+         for (unsigned i = 0; i < comp; i++)
+            s[i] = ZERO;
+         val = swizzle(val, s[0], s[1], s[2], s[3]);
+         wm <<= comp;
+      }
+
+      i915_emit_arith(p, A0_MOV, dest, writemask_to_mask(wm), 0,
+                      val, 0, 0);
+      break;
+   }
+
+   case nir_intrinsic_load_ubo: {
+      nir_src *offset_src = &intr->src[1];
+      if (!nir_src_is_const(*offset_src)) {
+         i915_program_error(p, "non-constant UBO offset");
+         set_ureg(c, &intr->def,
+                  swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+         break;
+      }
+      unsigned byte_offset = (unsigned)nir_src_as_float(*offset_src);
+      unsigned slot = byte_offset / 16;
+      unsigned comp = (byte_offset % 16) / 4;
+
+      if (slot >= I915_MAX_CONSTANT) {
+         i915_program_error(p, "UBO offset %d exceeds max constants", slot);
+         set_ureg(c, &intr->def,
+                  swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+         break;
+      }
+      /* Flag the channels read as user constants and grow num_constants. */
+      for (unsigned i = 0; i < intr->def.num_components; i++)
+         ifs->constant_flags[slot] |= I915_CONSTFLAG_USER_CH(comp + i);
+      ifs->num_constants = MAX2(ifs->num_constants, slot + 1);
+      /* Channel i reads constant channel comp + i, clamped to 3. */
+      uint32_t reg = UREG(REG_TYPE_CONST, slot);
+      if (comp > 0) {
+         uint32_t s[4];
+         for (unsigned i = 0; i < 4; i++)
+            s[i] = MIN2(comp + i, 3);
+         reg = swizzle(reg, s[0], s[1], s[2], s[3]);
+      }
+
+      set_ureg(c, &intr->def, reg);
+      break;
+   }
+
+   case nir_intrinsic_load_ubo_vec4: {
+      nir_src *offset_src = &intr->src[1];
+      if (!nir_src_is_const(*offset_src)) {
+         i915_program_error(p, "non-constant UBO offset");
+         set_ureg(c, &intr->def,
+                  swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+         break;
+      }
+      unsigned slot = nir_intrinsic_base(intr) +
+                      (unsigned)nir_src_as_float(*offset_src);
+      unsigned comp = nir_intrinsic_component(intr);
+
+      if (slot >= I915_MAX_CONSTANT) {
+         i915_program_error(p, "UBO slot %d exceeds max constants", slot);
+         set_ureg(c, &intr->def,
+                  swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+         break;
+      }
+      /* Flag the channels read as user constants and grow num_constants. */
+      for (unsigned i = 0; i < intr->def.num_components; i++)
+         ifs->constant_flags[slot] |= I915_CONSTFLAG_USER_CH(comp + i);
+      ifs->num_constants = MAX2(ifs->num_constants, slot + 1);
+      /* Channel i reads constant channel comp + i, clamped to 3. */
+      uint32_t reg = UREG(REG_TYPE_CONST, slot);
+      if (comp > 0) {
+         uint32_t s[4];
+         for (unsigned i = 0; i < 4; i++)
+            s[i] = MIN2(comp + i, 3);
+         reg = swizzle(reg, s[0], s[1], s[2], s[3]);
+      }
+
+      set_ureg(c, &intr->def, reg);
+      break;
+   }
+   /* Unconditional kill: TEXKILL with every operand channel set to -ONE. */
+   case nir_intrinsic_terminate:
+   case nir_intrinsic_demote: {
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_texld(p, tmp, A0_DEST_CHANNEL_ALL, 0,
+                      negate(swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE),
+                             1, 1, 1, 1),
+                      T0_TEXKILL, TGSI_WRITEMASK_X);
+      i915_release_utemps(p);
+      break;
+   }
+   /* Conditional kill: TEXKILL on the negated condition, X broadcast. */
+   case nir_intrinsic_terminate_if:
+   case nir_intrinsic_demote_if: {
+      uint32_t cond = src_ureg(c, &intr->src[0]);
+      uint32_t tmp = i915_get_utemp(p);
+      i915_emit_texld(p, tmp, A0_DEST_CHANNEL_ALL, 0,
+                      negate(swizzle(cond, X, X, X, X), 1, 1, 1, 1),
+                      T0_TEXKILL, TGSI_WRITEMASK_XYZW);
+      i915_release_utemps(p);
+      break;
+   }
+   /* Derivatives are not computed; the result is set to all-ZERO. */
+   case nir_intrinsic_ddx:
+   case nir_intrinsic_ddy:
+   case nir_intrinsic_ddx_coarse:
+   case nir_intrinsic_ddy_coarse:
+   case nir_intrinsic_ddx_fine:
+   case nir_intrinsic_ddy_fine:
+      set_ureg(c, &intr->def,
+               swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+      break;
+
+   default:
+      i915_program_error(p, "unsupported intrinsic: %s",
+                         nir_intrinsic_infos[intr->intrinsic].name);
+      break;
+   }
+}
+
+/*
+ * Dispatch one NIR instruction to the emitter for its instruction type;
+ * unknown types record a program error.
+ */
+static void
+emit_instr(struct nir_to_i915 *c, nir_instr *instr)
+{
+   const nir_instr_type kind = instr->type;
+
+   /* Jumps and derefs produce no hardware code. */
+   if (kind == nir_instr_type_jump || kind == nir_instr_type_deref)
+      return;
+
+   if (kind == nir_instr_type_load_const) {
+      emit_load_const(c, nir_instr_as_load_const(instr));
+   } else if (kind == nir_instr_type_alu) {
+      emit_alu(c, nir_instr_as_alu(instr));
+   } else if (kind == nir_instr_type_tex) {
+      emit_tex(c, nir_instr_as_tex(instr));
+   } else if (kind == nir_instr_type_intrinsic) {
+      emit_intrinsic(c, nir_instr_as_intrinsic(instr));
+   } else if (kind == nir_instr_type_undef) {
+      /* Undefined values read as ZERO on every channel. */
+      nir_undef_instr *undef = nir_instr_as_undef(instr);
+      set_ureg(c, &undef->def,
+               swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO));
+   } else {
+      i915_program_error(c->p, "unsupported NIR instruction type %d",
+                         instr->type);
+   }
+}
+
+/* Append a MOV copying oD.x into oD.w when the shader writes depth. */
+static void
+fixup_depth_write(struct nir_to_i915 *c, nir_shader *s)
+{
+   const uint64_t depth_bit = BITFIELD64_BIT(FRAG_RESULT_DEPTH);
+   const uint32_t od = UREG(REG_TYPE_OD, 0);
+
+   /* NIR writes depth to OD.X (component 0); hardware reads from OD.W */
+   if (s->info.outputs_written & depth_bit)
+      i915_emit_arith(c->p, A0_MOV, od, A0_DEST_CHANNEL_W, 0,
+                      swizzle(od, X, Y, Z, X), 0, 0);
+}
+
+void
+i915_translate_fragment_program_nir(struct i915_context *i915,
+                                    struct i915_fragment_shader *ifs,
+                                    nir_shader *s,
+                                    const struct corm_compile_opts *opts)
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(s);
+   bool debug = I915_DBG_ON(DBG_FS) &&
+                (!ifs->internal || NIR_DEBUG(PRINT_INTERNAL));
+   /* Translate s into ifs->program; log the incoming NIR first if asked. */
+   if (debug) {
+      mesa_logi("NIR fragment shader:");
+      nir_log_shaderi(s);
+   }
+
+   struct i915_fp_compile *p = CALLOC_STRUCT(i915_fp_compile);
+   p->shader = ifs;
+   p->error = ralloc_strdup(NULL, "");
+   p->log_program_errors = !ifs->internal;
+   /* Start from a clean constant table and register-phase record. */
+   ifs->num_constants = 0;
+   memset(ifs->constant_flags, 0, sizeof(ifs->constant_flags));
+   memset(p->register_phases, 0, sizeof(p->register_phases));
+   /* Reset every texcoord slot's semantic to -1. */
+   for (int i = 0; i < I915_TEX_UNITS; i++)
+      ifs->texcoords[i].semantic = -1;
+
+   p->nr_tex_indirect = 1;
+   p->nr_tex_insn = 0;
+   p->nr_alu_insn = 0;
+   p->nr_decl_insn = 0;
+   p->csr = p->program;
+   p->decl = p->declarations;
+   p->decl_s = 0;
+   p->decl_t = 0;
+   p->temp_flag = ~0x0U << I915_MAX_TEMPORARY;
+   p->utemp_flag = ~0x7;
+   /* First dword is the packet header; its length field is OR'd in below. */
+   *(p->decl++) = _3DSTATE_PIXEL_SHADER_PROGRAM;
+
+   struct nir_to_i915 c = {
+      .p = p,
+      .ifs = ifs,
+      .opts = *opts,
+      .ureg_map_size = impl->ssa_alloc,
+      .ureg_map = CALLOC(impl->ssa_alloc, sizeof(uint32_t)),
+      .def_csr = CALLOC(impl->ssa_alloc, sizeof(uint32_t *)),
+      .deferred_const = CALLOC(impl->ssa_alloc, sizeof(float)),
+      .last_use = CALLOC(impl->ssa_alloc, sizeof(int)),
+   };
+   /* Every last_use entry starts at -1 before compute_last_use() runs. */
+   memset(c.last_use, -1, impl->ssa_alloc * sizeof(int));
+   compute_last_use(&c, impl);
+   /* Emit instructions in program order, stopping at the first error. */
+   c.ip = 0;
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr(instr, block) {
+         emit_instr(&c, instr);
+         if (p->error[0])
+            break;
+         release_dead_temps(&c, instr);
+         c.ip++;
+      }
+      if (p->error[0])
+         break;
+   }
+
+   if (!p->error[0])
+      fixup_depth_write(&c, s);
+
+   /* finalize: check the counters against the I915_MAX_* limits */
+   if (p->nr_tex_indirect > I915_MAX_TEX_INDIRECT)
+      i915_program_error(p, "exceeded max tex indirect (%d/%d)",
+                         p->nr_tex_indirect, I915_MAX_TEX_INDIRECT);
+   if (p->nr_tex_insn > I915_MAX_TEX_INSN)
+      i915_program_error(p, "exceeded max tex insn (%d/%d)",
+                         p->nr_tex_insn, I915_MAX_TEX_INSN);
+   if (p->nr_alu_insn > I915_MAX_ALU_INSN)
+      i915_program_error(p, "exceeded max ALU insn (%d/%d)",
+                         p->nr_alu_insn, I915_MAX_ALU_INSN);
+   if (p->nr_decl_insn > I915_MAX_DECL_INSN)
+      i915_program_error(p, "exceeded max decl insn (%d/%d)",
+                         p->nr_decl_insn, I915_MAX_DECL_INSN);
+   /* Nothing was emitted: use the passthrough program instead. */
+   if (p->nr_alu_insn == 0 && p->nr_tex_insn == 0) {
+      i915_use_passthrough_shader(ifs);
+      ifs->nr_alu_insn = 1;
+      goto cleanup;
+   }
+
+   ifs->nr_alu_insn = p->nr_alu_insn;
+   ifs->nr_tex_insn = p->nr_tex_insn;
+   ifs->nr_tex_indirect = p->nr_tex_indirect;
+   ifs->nr_temps = util_bitcount(p->temp_flag);
+
+   {
+      unsigned long program_size = (unsigned long)(p->csr - p->program);
+      unsigned long decl_size = (unsigned long)(p->decl - p->declarations);
+      /* Header length field: total dwords minus 2. */
+      p->declarations[0] |= program_size + decl_size - 2;
+
+      assert(!ifs->program);
+      ifs->program_len = decl_size + program_size;
+      ifs->program = (uint32_t *)MALLOC(ifs->program_len * sizeof(uint32_t));
+      memcpy(ifs->program, p->declarations, decl_size * sizeof(uint32_t));
+      memcpy(&ifs->program[decl_size], p->program,
+             program_size * sizeof(uint32_t));
+
+      if (p->error[0]) {
+         /* dump the program for debugging, then replace with passthrough */
+         if (debug && ifs->program_len > 2) {
+            mesa_logi("FAILED program (%d ALU):", p->nr_alu_insn);
+            i915_disassemble_program(ifs->program, ifs->program_len);
+         }
+         FREE(ifs->program);
+         ifs->program = NULL;
+         ifs->program_len = 0;
+         i915_use_passthrough_shader(ifs);
+      }
+   }
+
+cleanup:
+   if (p->error[0])
+      ifs->error = p->error; /* ifs keeps the message; p is freed below */
+   else
+      ralloc_free(p->error);
+
+   FREE(c.last_use);
+   FREE(c.deferred_const);
+   FREE(c.def_csr);
+   FREE(c.ureg_map);
+   FREE(p);
+
+   if (debug) {
+      if (ifs->error)
+         mesa_loge("%s", ifs->error);
+
+      mesa_logi("i915 fragment shader with %d constants%s",
+                ifs->num_constants, ifs->num_constants ? ":" : "");
+      /* Only constants with immediate channels (low nibble) are printed. */
+      for (int i = 0; i < I915_MAX_CONSTANT; i++) {
+         if (ifs->constant_flags[i] & 0x0f) {
+            mesa_logi("\t\tC[%d] = { %f, %f, %f, %f }", i,
+                      ifs->constants[i][0], ifs->constants[i][1],
+                      ifs->constants[i][2], ifs->constants[i][3]);
+         }
+      }
+      i915_disassemble_program(ifs->program, ifs->program_len);
+   }
+}
diff --git a/src/gallium/drivers/i915/i915_fpc_optimize.c b/src/gallium/drivers/i915/i915_fpc_optimize.c
index b4ae362dfef..731f2444fec 100644
--- a/src/gallium/drivers/i915/i915_fpc_optimize.c
+++ b/src/gallium/drivers/i915/i915_fpc_optimize.c
@@ -405,6 +405,8 @@ i915_fpc_optimize_mov_before_tex(struct i915_optimize_context *ctx,
        target_is_texture2d(next->FullInstruction.Texture.Texture) &&
        same_src_dst_reg(&next->FullInstruction.Src[0],
                         &current->FullInstruction.Dst[0]) &&
+       (current->FullInstruction.Dst[0].Register.WriteMask &
+        i915_tex_mask(next)) == i915_tex_mask(next) &&
        is_unswizzled(&current->FullInstruction.Src[0], i915_tex_mask(next)) &&
        unused_from(ctx, &current->FullInstruction.Dst[0], index)) {
       memcpy(&next->FullInstruction.Src[0], &current->FullInstruction.Src[0],
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index b6cfb2a3dfb..9277e55e9e3 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -54,55 +54,9 @@
  * Simple pass-through fragment shader to use when we don't have
  * a real shader (or it fails to compile for some reason).
  */
-static unsigned passthrough_program[] = {
-   _3DSTATE_PIXEL_SHADER_PROGRAM | ((1 * 3) - 1),
-   /* move to output color:
-    */
-   (A0_MOV | (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) | A0_DEST_CHANNEL_ALL |
-    (REG_TYPE_R << A0_SRC0_TYPE_SHIFT) | (0 << A0_SRC0_NR_SHIFT)),
-   ((SRC_ONE << A1_SRC0_CHANNEL_X_SHIFT) |
-    (SRC_ZERO << A1_SRC0_CHANNEL_Y_SHIFT) |
-    (SRC_ZERO << A1_SRC0_CHANNEL_Z_SHIFT) |
-    (SRC_ONE << A1_SRC0_CHANNEL_W_SHIFT)),
-   0};
-
 /**
  * component-wise negation of ureg
  */
-static inline int
-negate(int reg, int x, int y, int z, int w)
-{
-   /* Another neat thing about the UREG representation */
-   return reg ^ (((x & 1) << UREG_CHANNEL_X_NEGATE_SHIFT) |
-                 ((y & 1) << UREG_CHANNEL_Y_NEGATE_SHIFT) |
-                 ((z & 1) << UREG_CHANNEL_Z_NEGATE_SHIFT) |
-                 ((w & 1) << UREG_CHANNEL_W_NEGATE_SHIFT));
-}
-
-/**
- * In the event of a translation failure, we'll generate a simple color
- * pass-through program.
- */
-static void
-i915_use_passthrough_shader(struct i915_fragment_shader *fs)
-{
-   fs->program = (uint32_t *)MALLOC(sizeof(passthrough_program));
-   if (fs->program) {
-      memcpy(fs->program, passthrough_program, sizeof(passthrough_program));
-      fs->program_len = ARRAY_SIZE(passthrough_program);
-   }
-   fs->num_constants = 0;
-}
-
-void
-i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
-{
-   va_list args;
-   va_start(args, msg);
-   ralloc_vasprintf_append(&p->error, msg, args);
-   va_end(args);
-}
-
 static uint32_t
 get_mapping(struct i915_fragment_shader *fs, enum tgsi_semantic semantic,
             int index)
@@ -1006,12 +960,11 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
                          p->nr_decl_insn, I915_MAX_DECL_INSN);
    }
 
-   /* hw doesn't seem to like empty frag programs (num_instructions == 1 is just
-    * TGSI_END), even when the depth write fixup gets emitted below - maybe that
-    * one is fishy, too?
-    */
-   if (ifs->info.num_instructions == 1)
-      i915_program_error(p, "Empty fragment shader");
+   if (ifs->info.num_instructions == 1) {
+      i915_use_passthrough_shader(ifs);
+      ifs->nr_alu_insn = 1;
+      goto done;
+   }
 
    if (strlen(p->error) != 0) {
       i915_use_passthrough_shader(ifs);
@@ -1024,6 +977,10 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
       assert(!ifs->program);
 
       ifs->program_len = decl_size + program_size;
+      ifs->nr_alu_insn = p->nr_alu_insn;
+      ifs->nr_tex_insn = p->nr_tex_insn;
+      ifs->nr_tex_indirect = p->nr_tex_indirect;
+      ifs->nr_temps = util_bitcount(p->temp_flag);
       ifs->program = (uint32_t *)MALLOC(ifs->program_len * sizeof(uint32_t));
       memcpy(ifs->program, p->declarations, decl_size * sizeof(uint32_t));
       memcpy(&ifs->program[decl_size], p->program,
@@ -1032,14 +989,16 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
       if (i915) {
          util_debug_message(
             &i915->debug, SHADER_INFO,
-            "%s shader: %d inst, %d tex, %d tex_indirect, %d temps, %d const",
+            "%s shader: %d instructions, %d alu, %d tex, %d tex_indirect, "
+            "%d temps, %d const",
             _mesa_shader_stage_to_abbrev(MESA_SHADER_FRAGMENT),
-            (int)program_size, p->nr_tex_insn, p->nr_tex_indirect,
-            p->shader->info.file_max[TGSI_FILE_TEMPORARY] + 1,
-            ifs->num_constants);
+            ifs->nr_alu_insn + ifs->nr_tex_insn,
+            ifs->nr_alu_insn, ifs->nr_tex_insn, ifs->nr_tex_indirect,
+            ifs->nr_temps, ifs->num_constants);
       }
    }
 
+done:
    if (strlen(p->error) != 0)
       ifs->error = p->error;
    else
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 17db0d34034..df43fb05149 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -176,6 +176,8 @@ i915_optimize_nir(struct nir_shader *s)
 {
    bool progress;
 
+   NIR_PASS(_, s, nir_lower_int_to_float);
+
    do {
       progress = false;
 
@@ -212,6 +214,11 @@ i915_optimize_nir(struct nir_shader *s)
 
    } while (progress);
 
+   NIR_PASS(_, s, nir_lower_alu_to_scalar, NULL, NULL);
+   NIR_PASS(_, s, nir_lower_bool_to_float, false);
+   NIR_PASS(_, s, nir_opt_algebraic);
+   NIR_PASS(_, s, nir_opt_dce);
+
    NIR_PASS(progress, s, nir_remove_dead_variables, nir_var_function_temp,
             NULL);
 
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 8d786c02e41..24adc396241 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -31,7 +31,9 @@
 #include "compiler/nir/nir_builder.h"
 #include "draw/draw_context.h"
 #include "nir/nir_to_tgsi.h"
+#include "tgsi/tgsi_from_mesa.h"
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
 #include "util/u_helpers.h"
 #include "util/u_inlines.h"
 #include "util/u_math.h"
@@ -542,6 +544,37 @@ static const struct nir_to_tgsi_options ntt_options = {
    .lower_fabs = true,
 };
 
+static int /* nir_lower_io size callback: a type's attribute slot count */
+type_size(const struct glsl_type *type, UNUSED bool bindless)
+{
+   return glsl_count_attribute_slots(type, false);
+}
+
+/* nir_lower_alu_to_scalar filter: scalarize only bcsel/fcsel_ge/fcsel_gt. */
+static bool
+scalarize_vector_bools(const nir_instr *instr, const void *data)
+{
+   if (instr->type != nir_instr_type_alu)
+      return false;
+   const nir_op op = nir_instr_as_alu(instr)->op;
+   return op == nir_op_bcsel || op == nir_op_fcsel_ge ||
+          op == nir_op_fcsel_gt;
+}
+
+static bool /* nir_shader_lower_instructions filter: selects fsqrt ALU ops */
+lower_fsqrt_filter(const nir_instr *instr, UNUSED const void *data)
+{
+   const bool is_alu = instr->type == nir_instr_type_alu;
+   return is_alu && nir_instr_as_alu(instr)->op == nir_op_fsqrt;
+}
+
+static nir_def * /* sqrt(x) -> x * rsq(x), honoring the source swizzle */
+lower_fsqrt_impl(nir_builder *b, nir_instr *instr, UNUSED void *data)
+{
+   nir_def *src = nir_ssa_for_alu_src(b, nir_instr_as_alu(instr), 0);
+   return nir_fmul(b, src, nir_frsq(b, src));
+}
+
 static char *
 i915_check_control_flow(nir_shader *s)
 {
@@ -565,6 +598,94 @@ i915_check_control_flow(nir_shader *s)
    return NULL;
 }
 
+enum i915_fs_mode { /* fragment shader backend choice, from I915_FS */
+   I915_FS_TGSI, /* I915_FS=tgsi */
+   I915_FS_NIR,  /* I915_FS=nir */
+   I915_FS_BOTH, /* any other value, including the default "both" */
+};
+
+/* Read the I915_FS debug option: "tgsi" or "nir" selects that backend alone;
+ * any other value (default "both") selects both. */
+static enum i915_fs_mode
+i915_get_fs_mode(void)
+{
+   const char *choice = debug_get_option("I915_FS", "both");
+   if (strcmp(choice, "tgsi") == 0)
+      return I915_FS_TGSI;
+   return strcmp(choice, "nir") == 0 ? I915_FS_NIR : I915_FS_BOTH;
+}
+
+/* Record depth-output use and the TGSI semantic of each input variable. */
+static void
+i915_populate_fs_metadata(struct i915_fragment_shader *ifs, nir_shader *s)
+{
+   unsigned count = 0;
+   nir_foreach_shader_in_variable(var, s) {
+      unsigned name, index;
+      tgsi_get_gl_varying_semantic((gl_varying_slot)var->data.location, true,
+                                   &name, &index);
+      ifs->input_semantic_name[count] = name;
+      ifs->input_semantic_index[count++] = index;
+   }
+   ifs->num_inputs = count;
+   ifs->writes_z = s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH);
+}
+
+/* TGSI backend: convert the NIR to TGSI tokens, scan them into ifs->info and
+ * then call i915_translate_fragment_program() on ifs. */
+static void
+i915_compile_tgsi(struct i915_context *i915, struct i915_fragment_shader *ifs,
+                  struct pipe_screen *screen, nir_shader *nir_clone)
+{
+   ifs->state.type = PIPE_SHADER_IR_TGSI;
+   ifs->state.tokens = nir_to_tgsi_options(nir_clone, screen, &ntt_options);
+   tgsi_scan_shader(ifs->state.tokens, &ifs->info);
+   i915_translate_fragment_program(i915, ifs);
+}
+
+/* Lexicographic "fewer is better": tex_indirect, alu, temps, constants. */
+static bool
+corm_fs_better(const struct i915_fragment_shader *a,
+               const struct i915_fragment_shader *b)
+{
+   const uint32_t lhs[] = {a->nr_tex_indirect, a->nr_alu_insn, a->nr_temps};
+   const uint32_t rhs[] = {b->nr_tex_indirect, b->nr_alu_insn, b->nr_temps};
+   for (unsigned i = 0; i < ARRAY_SIZE(lhs); i++)
+      if (lhs[i] != rhs[i])
+         return lhs[i] < rhs[i];
+   return a->num_constants < b->num_constants;
+}
+
+/* Fill buf with a short reason why winner beat loser; returns buf. */
+static const char *
+corm_win_reason(const struct i915_fragment_shader *winner,
+                const struct i915_fragment_shader *loser, char *buf, size_t len)
+{
+   if (!loser) { /* nothing was compared against */
+      snprintf(buf, len, "only");
+      return buf;
+   }
+   int d_phase = (int)winner->nr_tex_indirect - (int)loser->nr_tex_indirect;
+   int d_alu = (int)winner->nr_alu_insn - (int)loser->nr_alu_insn;
+   int d_temps = (int)winner->nr_temps - (int)loser->nr_temps;
+   int d_const = (int)winner->num_constants - (int)loser->num_constants;
+   if (d_phase) /* first nonzero winner-minus-loser delta is reported */
+      snprintf(buf, len, "%+d phase", d_phase);
+   else if (d_alu)
+      snprintf(buf, len, "%+d alu", d_alu);
+   else if (d_temps)
+      snprintf(buf, len, "%+d temps", d_temps);
+   else if (d_const)
+      snprintf(buf, len, "%+d const", d_const);
+   else if (winner->program_len != loser->program_len ||
+            memcmp(winner->program, loser->program,
+                   winner->program_len * sizeof(uint32_t)))
+      snprintf(buf, len, "tied");
+   else
+      snprintf(buf, len, "identical");
+   return buf;
+}
+
 static void *
 i915_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
@@ -576,39 +697,222 @@ i915_create_fs_state(struct pipe_context *pipe,
 
    ifs->draw_data = draw_create_fragment_shader(i915->draw, templ);
 
-   if (templ->type == PIPE_SHADER_IR_NIR) {
-      nir_shader *s = templ->ir.nir;
-      ifs->internal = s->info.internal;
-
-      char *msg = i915_check_control_flow(s);
-      if (msg) {
-         if (I915_DBG_ON(DBG_FS) &&
-             (!s->info.internal || NIR_DEBUG(PRINT_INTERNAL))) {
-            mesa_logi("failing shader:");
-            nir_log_shaderi(s);
-         }
-         if (templ->report_compile_error) {
-            ((struct pipe_shader_state *)templ)->error_message = strdup(msg);
-            ralloc_free(s);
-            i915_delete_fs_state(NULL, ifs);
-            return NULL;
-         }
-      }
-
-      ifs->state.tokens = nir_to_tgsi_options(s, pipe->screen, &ntt_options);
-   } else {
-      assert(templ->type == PIPE_SHADER_IR_TGSI);
-      /* we need to keep a local copy of the tokens */
+   if (templ->type == PIPE_SHADER_IR_TGSI) {
       ifs->state.tokens = tgsi_dup_tokens(templ->tokens);
+      ifs->state.type = PIPE_SHADER_IR_TGSI;
       ifs->internal = i915->no_log_program_errors;
+      tgsi_scan_shader(ifs->state.tokens, &ifs->info);
+      i915_translate_fragment_program(i915, ifs);
+      return ifs;
    }
 
-   ifs->state.type = PIPE_SHADER_IR_TGSI;
+   assert(templ->type == PIPE_SHADER_IR_NIR);
+   nir_shader *s = templ->ir.nir;
+   ifs->internal = s->info.internal;
 
-   tgsi_scan_shader(ifs->state.tokens, &ifs->info);
+   bool debug = I915_DBG_ON(DBG_FS) &&
+                (!s->info.internal || NIR_DEBUG(PRINT_INTERNAL));
+
+   char *msg = i915_check_control_flow(s);
+   if (msg) {
+      if (debug) {
+         mesa_logi("failing shader:");
+         nir_log_shaderi(s);
+      }
+      if (templ->report_compile_error) {
+         ((struct pipe_shader_state *)templ)->error_message = strdup(msg);
+         ralloc_free(s);
+         i915_delete_fs_state(NULL, ifs);
+         return NULL;
+      }
+   }
+
+   static enum i915_fs_mode fs_mode = -1;
+   if (fs_mode == (enum i915_fs_mode)-1)
+      fs_mode = i915_get_fs_mode();
+
+   bool try_nir = (fs_mode == I915_FS_NIR || fs_mode == I915_FS_BOTH);
+   bool try_tgsi = (fs_mode == I915_FS_TGSI || fs_mode == I915_FS_BOTH);
+
+   struct i915_fragment_shader tgsi_fs = {0};
+
+   static const struct corm_compile_opts corm_variants[] = {
+      { .deferred_const = false, .seq_sne_opt = false },
+      { .deferred_const = false, .seq_sne_opt = true },
+      { .deferred_const = true,  .seq_sne_opt = false },
+      { .deferred_const = true,  .seq_sne_opt = true },
+      { .deferred_const = false, .seq_sne_opt = false, .late_scalar = true },
+      { .deferred_const = false, .seq_sne_opt = true,  .late_scalar = true },
+      { .deferred_const = true,  .seq_sne_opt = false, .late_scalar = true },
+      { .deferred_const = true,  .seq_sne_opt = true,  .late_scalar = true },
+   };
+
+   struct i915_fragment_shader nir_results[ARRAY_SIZE(corm_variants)];
+   int best_nir = -1;
+
+   if (try_nir) {
+      nir_shader *nir_s = try_tgsi ? nir_shader_clone(NULL, s) : s;
+      NIR_PASS(_, nir_s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+               type_size, (nir_lower_io_options)0);
+      NIR_PASS(_, nir_s, nir_lower_alu_to_scalar, scalarize_vector_bools, NULL);
+      NIR_PASS(_, nir_s, nir_opt_vectorize, NULL, NULL);
+      NIR_PASS(_, nir_s, nir_lower_bool_to_float, false);
+      NIR_PASS(_, nir_s, nir_shader_lower_instructions, lower_fsqrt_filter,
+               lower_fsqrt_impl, NULL);
+      NIR_PASS(_, nir_s, nir_opt_copy_prop);
+      NIR_PASS(_, nir_s, nir_opt_cse);
+      NIR_PASS(_, nir_s, nir_opt_dce);
+      NIR_PASS(_, nir_s, nir_opt_algebraic);
+      NIR_PASS(_, nir_s, nir_opt_algebraic_late);
+      NIR_PASS(_, nir_s, nir_opt_dce);
+      NIR_PASS(_, nir_s, nir_opt_shrink_vectors, false);
+      NIR_PASS(_, nir_s, nir_opt_copy_prop);
+      NIR_PASS(_, nir_s, nir_opt_dce);
+      nir_index_ssa_defs(nir_shader_get_entrypoint(nir_s));
+
+      for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) {
+         nir_shader *variant_nir = nir_shader_clone(NULL, nir_s);
+         if (corm_variants[v].late_scalar) {
+            NIR_PASS(_, variant_nir, nir_lower_alu_to_scalar, NULL, NULL);
+            NIR_PASS(_, variant_nir, nir_opt_copy_prop);
+            NIR_PASS(_, variant_nir, nir_opt_algebraic);
+            NIR_PASS(_, variant_nir, nir_opt_dce);
+            nir_index_ssa_defs(nir_shader_get_entrypoint(variant_nir));
+         }
+         memset(&nir_results[v], 0, sizeof(nir_results[v]));
+         i915_populate_fs_metadata(&nir_results[v], variant_nir);
+         i915_translate_fragment_program_nir(i915, &nir_results[v],
+                                            variant_nir, &corm_variants[v]);
+         ralloc_free(variant_nir);
+
+         bool ok = !nir_results[v].error || !nir_results[v].error[0];
+         if (ok && (best_nir < 0 ||
+                    corm_fs_better(&nir_results[v], &nir_results[best_nir])))
+            best_nir = v;
+      }
+
+      ralloc_free(nir_s);
+   }
+
+   /* Do not free s here: in NIR-only mode nir_s aliases s and was already
+    * released above, and when TGSI is tried nir_to_tgsi_options() consumes
+    * the shader it is given. */
+   if (try_tgsi)
+      i915_compile_tgsi(i915, &tgsi_fs, pipe->screen, s);
+
+   bool nir_ok = best_nir >= 0;
+   bool tgsi_ok = try_tgsi && (!tgsi_fs.error || !tgsi_fs.error[0]);
+   struct i915_fragment_shader *best_nir_fs = nir_ok ? &nir_results[best_nir] : NULL;
+
+   bool use_nir;
+   if (nir_ok && tgsi_ok)
+      use_nir = !corm_fs_better(&tgsi_fs, best_nir_fs);
+   else
+      use_nir = nir_ok;
+
+   if (debug && try_nir && try_tgsi) {
+      for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) {
+         bool ok = !nir_results[v].error || !nir_results[v].error[0];
+         mesa_logi("  NIR[dc=%d,ss=%d]: %s (%d ALU, %d phase, %d temps)%s",
+                   corm_variants[v].deferred_const,
+                   corm_variants[v].seq_sne_opt,
+                   ok ? "ok" : "FAIL",
+                   ok ? nir_results[v].nr_alu_insn : 0,
+                   ok ? nir_results[v].nr_tex_indirect : 0,
+                   ok ? nir_results[v].nr_temps : 0,
+                   (int)v == best_nir ? " *" : "");
+      }
+      mesa_logi("  TGSI: %s (%d ALU, %d phase, %d temps)",
+                tgsi_ok ? "ok" : "FAIL",
+                tgsi_ok ? tgsi_fs.nr_alu_insn : 0,
+                tgsi_ok ? tgsi_fs.nr_tex_indirect : 0,
+                tgsi_ok ? tgsi_fs.nr_temps : 0);
+      mesa_logi("  -> %s%s", use_nir ? "NIR" : "TGSI",
+                use_nir ? (corm_fs_better(best_nir_fs, &tgsi_fs)
+                           ? " (better)" : " (tied)") : "");
+   }
+
+   /* Free non-winning NIR variants */
+   if (try_nir) {
+      for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) {
+         if ((int)v != best_nir) {
+            FREE(nir_results[v].program);
+            ralloc_free(nir_results[v].error);
+         }
+      }
+   }
+
+   struct i915_fragment_shader *winner, *loser = NULL;
+   struct i915_fragment_shader nir_loser_copy = {0};
+   if (use_nir) {
+      winner = best_nir_fs;
+      loser = tgsi_ok ? &tgsi_fs : NULL;
+   } else {
+      winner = &tgsi_fs;
+      if (best_nir_fs) {
+         /* Move ownership to the copy; it is freed once via loser below. */
+         nir_loser_copy = *best_nir_fs;
+         loser = &nir_loser_copy;
+         best_nir_fs->program = NULL;
+         best_nir_fs->error = NULL;
+      }
+   }
+
+   if (i915 && !ifs->internal) {
+      bool neither = (winner->nr_alu_insn + winner->nr_tex_insn) == 0;
+      char reason[32];
+      if (neither)
+         snprintf(reason, sizeof(reason), "neither");
+      else
+         corm_win_reason(winner, loser, reason, sizeof(reason));
+      util_debug_message(
+         &i915->debug, SHADER_INFO,
+         "%s shader [%s, %s]: %d instructions, %d alu, %d tex, "
+         "%d tex_indirect, %d temps, %d const",
+         _mesa_shader_stage_to_abbrev(MESA_SHADER_FRAGMENT),
+         neither ? "FAIL" : use_nir ? "NIR" : "TGSI", reason,
+         winner->nr_alu_insn + winner->nr_tex_insn,
+         winner->nr_alu_insn, winner->nr_tex_insn, winner->nr_tex_indirect,
+         winner->nr_temps, winner->num_constants);
+   }
+
+   ifs->program = winner->program;
+   ifs->program_len = winner->program_len;
+   ifs->nr_alu_insn = winner->nr_alu_insn;
+   ifs->nr_tex_insn = winner->nr_tex_insn;
+   ifs->nr_tex_indirect = winner->nr_tex_indirect;
+   ifs->nr_temps = winner->nr_temps;
+   ifs->num_constants = winner->num_constants;
+   memcpy(ifs->constants, winner->constants, sizeof(ifs->constants));
+   memcpy(ifs->constant_flags, winner->constant_flags,
+          sizeof(ifs->constant_flags));
+   memcpy(ifs->texcoords, winner->texcoords, sizeof(ifs->texcoords));
+   ifs->reads_pntc = winner->reads_pntc;
+   ifs->writes_z = winner->writes_z;
+   ifs->num_inputs = winner->num_inputs;
+   memcpy(ifs->input_semantic_name, winner->input_semantic_name,
+          sizeof(ifs->input_semantic_name));
+   memcpy(ifs->input_semantic_index, winner->input_semantic_index,
+          sizeof(ifs->input_semantic_index));
+   if (winner->error)
+      ifs->error = winner->error;
+
+   /* The loser's info may be in use (TGSI path populates ifs->info) */
+   if (try_tgsi)
+      ifs->info = tgsi_fs.info;
+
+   if (loser) {
+      FREE(loser->program);
+      ralloc_free(loser->error);
+   }
+   if (!use_nir && try_tgsi) {
+      /* TGSI won — tokens are in tgsi_fs via i915_compile_tgsi.
+       * We need them for ifs->state for draw's FS pipeline. */
+      ifs->state = tgsi_fs.state;
+   } else if (try_tgsi) {
+      FREE((void *)tgsi_fs.state.tokens);
+   }
 
-   /* The shader's compiled to i915 instructions here */
-   i915_translate_fragment_program(i915, ifs);
    if (ifs->error && templ->report_compile_error) {
       ((struct pipe_shader_state *)templ)->error_message = strdup(ifs->error);
       i915_delete_fs_state(NULL, ifs);
@@ -667,28 +971,11 @@ i915_create_vs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
 {
    struct i915_context *i915 = i915_context(pipe);
-   void *vertex_shader;
 
-   struct pipe_shader_state from_nir = {PIPE_SHADER_IR_TGSI};
-   if (templ->type == PIPE_SHADER_IR_NIR) {
-      nir_shader *s = templ->ir.nir;
+   if (templ->type == PIPE_SHADER_IR_NIR)
+      NIR_PASS(_, templ->ir.nir, nir_lower_point_size, 1.0, 255.0);
 
-      NIR_PASS(_, s, nir_lower_point_size, 1.0, 255.0);
-
-      /* The gallivm draw path doesn't support non-native-integers NIR shaders,
-       * st/mesa does native-integers for the screen as a whole rather than
-       * per-stage, and i915 FS can't do native integers.  So, convert to TGSI,
-       * where the draw path *does* support non-native-integers.
-       */
-      from_nir.tokens = nir_to_tgsi(s, pipe->screen);
-      templ = &from_nir;
-   }
-
-   vertex_shader = draw_create_vertex_shader(i915->draw, templ);
-
-   FREE((void *)from_nir.tokens);
-
-   return vertex_shader;
+   return draw_create_vertex_shader(i915->draw, templ);
 }
 
 static void
diff --git a/src/gallium/drivers/i915/i915_state_emit.c b/src/gallium/drivers/i915/i915_state_emit.c
index f3561b143e8..8a92d6d0a7b 100644
--- a/src/gallium/drivers/i915/i915_state_emit.c
+++ b/src/gallium/drivers/i915/i915_state_emit.c
@@ -332,28 +332,33 @@ emit_constants(struct i915_context *i915)
       OUT_BATCH((1 << nr) - 1);
 
       for (i = 0; i < nr; i++) {
-         const uint32_t *c;
-         if (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER) {
-            /* grab user-defined constant */
-            c = (uint32_t *)i915_buffer(i915->constants[MESA_SHADER_FRAGMENT])
-                   ->data;
+         uint8_t flags = i915->fs->constant_flags[i];
+         uint8_t user_mask = flags >> 4;
+
+         if (!user_mask) {
+            const uint32_t *c = (uint32_t *)i915->fs->constants[i];
+            OUT_BATCH(c[0]);
+            OUT_BATCH(c[1]);
+            OUT_BATCH(c[2]);
+            OUT_BATCH(c[3]);
+         } else if (user_mask == 0xf) {
+            const uint32_t *c =
+               (uint32_t *)i915_buffer(i915->constants[MESA_SHADER_FRAGMENT])
+                  ->data;
             c += 4 * i;
+            OUT_BATCH(c[0]);
+            OUT_BATCH(c[1]);
+            OUT_BATCH(c[2]);
+            OUT_BATCH(c[3]);
          } else {
-            /* emit program constant */
-            c = (uint32_t *)i915->fs->constants[i];
+            const uint32_t *user =
+               (uint32_t *)i915_buffer(i915->constants[MESA_SHADER_FRAGMENT])
+                  ->data;
+            user += 4 * i;
+            const uint32_t *imm = (uint32_t *)i915->fs->constants[i];
+            for (unsigned ch = 0; ch < 4; ch++)
+               OUT_BATCH((user_mask & (1 << ch)) ? user[ch] : imm[ch]);
          }
-#if 0 /* debug */
-         {
-            float *f = (float *) c;
-            printf("Const %2d: %f %f %f %f %s\n", i, f[0], f[1], f[2], f[3],
-                   (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER
-                    ? "user" : "immediate"));
-         }
-#endif
-         OUT_BATCH(*c++);
-         OUT_BATCH(*c++);
-         OUT_BATCH(*c++);
-         OUT_BATCH(*c++);
       }
    }
 }
diff --git a/src/gallium/drivers/i915/meson.build b/src/gallium/drivers/i915/meson.build
index 80dc825fbc5..ef1d5f7ad34 100644
--- a/src/gallium/drivers/i915/meson.build
+++ b/src/gallium/drivers/i915/meson.build
@@ -16,6 +16,7 @@ files_i915 = files(
   'i915_flush.c',
   'i915_fpc_emit.c',
   'i915_fpc.h',
+  'i915_fpc_nir.c',
   'i915_fpc_optimize.c',
   'i915_fpc_translate.c',
   'i915_prim_emit.c',