diff --git a/src/.clang-format b/src/.clang-format index 7e22bed1676..d2df8c5b55d 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -300,6 +300,52 @@ ForEachMacros: - foreach_bo - foreach_bo_safe +# intel + - jay_foreach_ssa_file + - jay_foreach_function + - jay_foreach_block + - jay_foreach_block_safe + - jay_foreach_block_rev + - jay_foreach_block_from + - jay_foreach_block_from_rev + - jay_foreach_dst + - jay_foreach_dst_index + - jay_foreach_inst_in_block + - jay_foreach_inst_in_block_rev + - jay_foreach_inst_in_block_safe + - jay_foreach_inst_in_block_safe_rev + - jay_foreach_inst_in_block_from + - jay_foreach_inst_in_block_from_rev + - jay_foreach_inst_in_shader + - jay_foreach_inst_in_shader_rev + - jay_foreach_inst_in_shader_safe + - jay_foreach_inst_in_shader_safe_rev + - jay_foreach_inst_in_func + - jay_foreach_inst_in_func_rev + - jay_foreach_inst_in_func_safe + - jay_foreach_inst_in_func_safe_rev + - jay_foreach_successor + - jay_foreach_predecessor + - jay_foreach_comp + - jay_foreach_comp_rev + - jay_foreach_src + - jay_foreach_src_rev + - jay_foreach_ssa_src + - jay_foreach_ssa_src_rev + - jay_foreach_ssa_src_comp + - jay_foreach_index + - jay_foreach_index_rev + - jay_foreach_src_index + - jay_foreach_src_index_rev + - jay_repair_foreach_phi + - jay_foreach_phi_src_in_block + - jay_foreach_phi_dst_in_block + - jay_foreach_preload + - jay_foreach_killed + - jay_foreach_ra_src + - jay_foreach_ra_file + - jay_foreach_pipe + # Disable clang formatting by default. Drivers that use clang-format # inherit from this .clang-format file and re-enable formatting: # diff --git a/src/intel/compiler/jay/.clang-format b/src/intel/compiler/jay/.clang-format new file mode 100644 index 00000000000..04cf17f20bb --- /dev/null +++ b/src/intel/compiler/jay/.clang-format @@ -0,0 +1,31 @@ +BasedOnStyle: InheritParentConfig +DisableFormat: false + +AlignConsecutiveBitFields: Consecutive +BitFieldColonSpacing: None + +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: + Enabled: true + AcrossComments: true +AlignArrayOfStructures: Left + +ColumnLimit: 80 + +BreakStringLiterals: false +SpaceBeforeParens: ControlStatementsExceptControlMacros +SpaceAfterCStyleCast: true +BinPackParameters: OnePerLine +AllowAllArgumentsOnNextLine: false +PenaltyBreakBeforeFirstCallParameter: 100 +ReferenceAlignment: Middle + +BreakBeforeBinaryOperators: None +PenaltyBreakAssignment: 0 + +SpacesInContainerLiterals: true +Cpp11BracedListStyle: false + +AlignOperands: Align +BreakBinaryOperations: RespectPrecedence +BreakBeforeTernaryOperators: false diff --git a/src/intel/compiler/jay/README.md b/src/intel/compiler/jay/README.md new file mode 100644 index 00000000000..8ac3ed0897b --- /dev/null +++ b/src/intel/compiler/jay/README.md @@ -0,0 +1,3 @@ +Xe2 compiler experiments. 
+ +**Work-in-progress, not ready for users/benchmarks.** diff --git a/src/intel/compiler/jay/jay.h b/src/intel/compiler/jay/jay.h new file mode 100644 index 00000000000..914c0d8ea71 --- /dev/null +++ b/src/intel/compiler/jay/jay.h @@ -0,0 +1,25 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "compiler/brw/brw_compiler.h" +#include "util/shader_stats.h" +#include "nir.h" + +struct intel_device_info; +struct nir_shader_compiler_options; + +struct jay_shader_bin { + const uint32_t *kernel; + uint32_t size; + struct genisa_stats stats; +}; + +struct jay_shader_bin *jay_compile(const struct intel_device_info *devinfo, + void *mem_ctx, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key); diff --git a/src/intel/compiler/jay/jay_assign_flags.c b/src/intel/compiler/jay/jay_assign_flags.c new file mode 100644 index 00000000000..5442eb154a1 --- /dev/null +++ b/src/intel/compiler/jay/jay_assign_flags.c @@ -0,0 +1,365 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * Instruction selection works on SSA FLAG and UFLAG variables. This pass + * implements a flag register allocator, assigning each FLAG/UFLAG either to a + * hardware flag register and/or spilling to a GPR/UGPR. + * + * As a simplification, hardware flags are block-local. At block boundaries, + * 32-bit 0/~0 (U)GPRs are our canonical representation for (U)FLAGs. + * + * Producers: CMP produce both 0/~0 GPRs and flags, while conditional modifiers + * produce only flags. Boolean arithmetic is lowered to GPRs. + * + * Consumers: SEL/CSEL consumes both GPRs and flags, while predication consumes + * only flags. Boolean arithmetic again requires GPRs. + * + * Our strategy is to turn flags into GPR representations globally while keeping + * copies in flags where it makes sense locally. + */ + +static inline jay_def +canonicalize_flag(jay_def x) +{ + assert(jay_is_flag(x)); + x.file = x.file == UFLAG ? UGPR : GPR; + return x; +} + +struct var_info { + unsigned flag :3; + bool uniform :1; + bool read_by_predication:1; + bool free_canonical :1; + unsigned pad :2; +} PACKED; +static_assert(sizeof(struct var_info) == 1); + +struct flag_ra { + jay_builder *b; + struct var_info *vars; + uint32_t flag_to_global[JAY_MAX_FLAGS]; + uint32_t flag_to_local[JAY_MAX_FLAGS]; + unsigned roundrobin; + unsigned ballots:JAY_MAX_FLAGS; +}; + +static jay_def +assign_flag(struct flag_ra *ra, + jay_def flag, + enum jay_file file, + bool free_canonical, + bool ballot) +{ + jay_def canonical = canonicalize_flag(flag); + jay_def tmp = jay_alloc_def(ra->b, file, 1); + + /* Dedicate a flag for ballot since uniform access would clobber the zeroing. + * TODO: We could optimize this with more tracking. + */ + unsigned num_flags = jay_num_regs(ra->b->shader, FLAG); + tmp.reg = ballot ? 
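+              /* Illustrative numbering, assuming four flag registers:
+               * ballots pin flag 0, the round-robin below cycles through
+               * flags 1..2, and the last flag stays out of rotation
+               * (assign_block borrows it as a stand-in for a null flag).
+               */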
0 : (1 + (ra->roundrobin++) % (num_flags - 2)); + + ra->vars[jay_index(canonical)] = (struct var_info) { + .uniform = tmp.file == UFLAG, + .flag = tmp.reg, + .free_canonical = free_canonical, + }; + + ra->flag_to_global[tmp.reg] = jay_index(canonical); + ra->flag_to_local[tmp.reg] = jay_index(tmp); + + if (ballot) { + ra->ballots |= BITFIELD_BIT(tmp.reg); + } + + return tmp; +} + +static bool +rewrite_sel_with_zero(jay_inst *I, unsigned zero) +{ + jay_def flag = I->src[2]; + unsigned other = 1 - zero; + + if (!jay_defs_equivalent(I->src[zero], jay_imm(0)) || + I->src[other].abs || + I->src[other].negate || + jay_type_size_bits(I->type) != 32) { + return false; + } + + if (jay_defs_equivalent(I->src[other], jay_imm(0xffffffff)) && zero == 1) { + /* (c ? 0xffffffff : 0) -> canonical(c) */ + I->op = JAY_OPCODE_MOV; + I->src[0] = canonicalize_flag(flag); + jay_shrink_sources(I, 1); + } else { + /* ([!]c ? a : 0) --> (a & [~]canonical(c)) and + * ([!]c ? 0 : a) --> (a & ~[~]canonical(c)) + */ + I->op = JAY_OPCODE_AND; + I->src[0] = I->src[other]; + I->src[1] = canonicalize_flag(flag); + I->src[1].negate ^= (zero == 0); + jay_shrink_sources(I, 2); + } + + return true; +} + +static bool +rewrite_sel_to_csel(jay_inst *I) +{ + if (jay_type_size_bits(I->type) != 32) { + return false; + } + + /* SEL.f32 lowers to CSEL.f32 to preserve source modifiers & float controls. + * That works since we reinterpret 0/~0 as 0.0/NaN. + */ + jay_def flag = I->src[2]; + I->op = JAY_OPCODE_CSEL; + I->conditional_mod = flag.negate ? JAY_CONDITIONAL_EQ : JAY_CONDITIONAL_NE; + I->src[2] = canonicalize_flag(flag); + I->src[2].negate = false; + return true; +} + +static bool +rewrite_without_flag(struct flag_ra *ra, jay_inst *I, unsigned s, bool in_flag) +{ + if (I->op == JAY_OPCODE_PHI_SRC) { + I->src[s] = canonicalize_flag(I->src[s]); + return true; + } + + if (jay_debug & JAY_DBG_NOOPT) { + return false; + } + + if (I->op == JAY_OPCODE_SEL && + (!in_flag || ra->vars[jay_index(I->src[s])].free_canonical) && + !I->predication) { + + return rewrite_sel_with_zero(I, 0) || + rewrite_sel_with_zero(I, 1) || + (!in_flag && rewrite_sel_to_csel(I)); + } + + return false; +} + +static void +assign_block(jay_function *func, jay_block *block, struct var_info *var_to_flag) +{ + jay_builder b = { .shader = func->shader, .func = func }; + struct flag_ra ra_ = { .b = &b, .vars = var_to_flag }, *ra = &ra_; + + jay_foreach_inst_in_block_safe(block, I) { + if (I->op == JAY_OPCODE_CAST_CANONICAL_TO_FLAG) { + /* Assume the source is already 0/~0 canonical and use it. */ + I->op = JAY_OPCODE_MOV; + I->type = JAY_TYPE_U32; + I->dst = canonicalize_flag(I->dst); + continue; + } else if (I->type == JAY_TYPE_U1) { + /* Boolean logic turns into bitwise logic on the canonical form */ + if (!jay_is_null(I->dst)) { + I->dst = canonicalize_flag(I->dst); + } + + jay_foreach_src(I, s) { + if (!(s == 2 && I->op == JAY_OPCODE_SEL) && + jay_src_type(I, s) == JAY_TYPE_U1) { + if (jay_is_imm(I->src[s])) { + /* Convert 1-bit boolean to 0/~0 */ + assert(jay_is_imm(I->src[s]) && jay_as_uint(I->src[s]) <= 1); + I->src[s] = jay_imm(jay_as_uint(I->src[s]) ? ~0 : 0); + } else { + I->src[s] = canonicalize_flag(I->src[s]); + } + } + } + + I->type = JAY_TYPE_U32; + } + + /* Handle flag sources */ + jay_foreach_src(I, s) { + if (!jay_is_flag(I->src[s])) { + continue; + } + + unsigned index = jay_index(I->src[s]); + bool ballot = jay_src_type(I, s) != JAY_TYPE_U1; + enum jay_file file = I->dst.file == UGPR && !ballot ? 
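+                                 /* Uniform consumers can read a scalar
+                                  * UFLAG; ballots read the whole lane mask,
+                                  * so they need a per-lane FLAG even when
+                                  * the destination is uniform.
+                                  */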
UFLAG : FLAG; + bool in_flag = ra->flag_to_global[var_to_flag[index].flag] == index && + ((file == UFLAG) == var_to_flag[index].uniform); + + /* If we don't actually need the flag, we're done. */ + if (rewrite_without_flag(ra, I, s, in_flag)) { + continue; + } + + /* Otherwise, ensure we have the value in a flag. */ + if (!in_flag) { + jay_def tmp = assign_flag(ra, I->src[s], file, false, ballot); + + /* XXX: We need a more systematic approach to modifiers :/ */ + b.cursor = jay_before_inst(I); + jay_def d = I->src[s]; + d.negate = false; + jay_CMP(&b, JAY_TYPE_U32, JAY_CONDITIONAL_NE, tmp, + canonicalize_flag(d), 0); + } + + /* ...and rewrite to use the flag */ + unsigned reg = var_to_flag[index].flag; + jay_def flag = jay_scalar(file, ra->flag_to_local[reg]); + flag.reg = reg; + jay_replace_src(&I->src[s], flag); + } + + /* Handle flag writes */ + b.cursor = jay_after_inst(I); + + /* If the flag is written directly (for an inverse ballot), recover the + * canonical representation with a SEL. + */ + if (!jay_is_null(I->dst) && jay_is_flag(I->dst)) { + jay_def canonical = canonicalize_flag(I->dst); + I->dst = assign_flag(ra, I->dst, I->dst.file, false, false); + jay_SEL(&b, JAY_TYPE_U32, canonical, ~0, 0, I->dst); + } + + if (!jay_is_null(I->cond_flag)) { + I->broadcast_flag = + var_to_flag[jay_index(I->cond_flag)].read_by_predication && + I->cond_flag.file == UFLAG && + I->op == JAY_OPCODE_CMP; + + jay_def canonical = canonicalize_flag(I->cond_flag); + I->cond_flag = + assign_flag(ra, I->cond_flag, + I->broadcast_flag ? FLAG : I->cond_flag.file, + I->op == JAY_OPCODE_CMP, false); + + if (I->op == JAY_OPCODE_CMP) { + assert(jay_is_null(I->dst)); + + if (I->broadcast_flag) { + /* We need to recover the UGPR from the replicated FLAG. Thanks + * to our write-masking and broadcasting, the flag is already + * 0/~0. We simply need to sign-extend. + */ + jay_i2i32(&b, canonical, b.shader->dispatch_width, I->cond_flag); + } else if (jay_type_size_bits(I->type) != 32) { + I->dst = jay_alloc_def(&b, canonical.file, + jay_type_vector_length(I->type)); + jay_i2i32(&b, canonical, jay_type_size_bits(I->type), I->dst); + } else { + /* 32-bit CMP returns the canonical form */ + I->dst = canonical; + } + } else { + assert(jay_type_size_bits(I->type) == 32 && "limited cmod prop"); + + if (jay_is_null(I->dst)) { + I->dst = jay_alloc_def(&b, canonical.file, + jay_type_vector_length(I->type)); + } + + /* Recover the canonical representation with a CMP. Hopefully, + * either the CMP or the cmod will be eliminated by a later DCE. 
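+          *
+          * e.g. (illustrative): for an "add.lt dst, x, y", we emit
+          * "cmp.lt canonical, dst, 0" here and let DCE keep whichever of
+          * the two forms is actually consumed.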
+ */ + jay_CMP(&b, I->type, I->conditional_mod, canonical, I->dst, 0) + ->cond_flag.reg = + jay_num_regs(b.shader, FLAG) - 1; // TODO: no null flag + } + } + } + + /* Ballots require zeroing flags */ + b.cursor = jay_before_block(block); + u_foreach_bit(i, ra->ballots) { + jay_ZERO_FLAG(&b, i); + } +} + +static void +copyprop(jay_function *f) +{ + jay_inst **defs = calloc(f->ssa_alloc, sizeof(defs[0])); + + jay_foreach_inst_in_func_safe(f, block, I) { + jay_foreach_dst_index(I, _, d) { + defs[d] = I; + } + + if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND) + continue; + + jay_foreach_ssa_src(I, s) { + jay_def src = I->src[s]; + if (src.collect) + continue; + + jay_inst *def = defs[jay_base_index(src)]; + if (jay_defs_equivalent(def->dst, src) && + !def->predication && + def->op == JAY_OPCODE_MOV && + (I->src[s].file == def->src[0].file || + (I->op == JAY_OPCODE_CMP && jay_is_imm(def->src[0])))) { + + jay_replace_src(&I->src[s], def->src[0]); + } + } + } + + free(defs); +} + +void +jay_assign_flags(jay_shader *s) +{ + jay_foreach_function(s, f) { + struct var_info *map = calloc(f->ssa_alloc, sizeof(map[0])); + uint32_t *def_to_block = calloc(f->ssa_alloc, sizeof(def_to_block)); + + jay_foreach_inst_in_func(f, block, I) { + if (!jay_is_null(I->cond_flag)) { + def_to_block[jay_index(I->cond_flag)] = block->index + 1; + } + + if (I->predication) { + jay_def predicate = *jay_inst_get_predicate(I); + if (def_to_block[jay_index(predicate)] == block->index + 1) { + map[jay_index(predicate)].read_by_predication = true; + } + } + } + + jay_foreach_block(f, b) { + assign_block(f, b, map); + } + + free(map); + free(def_to_block); + + /* Flag RA leaves moves. Clean up after ourselves. */ + copyprop(f); + } +} +/* TODO: revisit + * dEQP-GLES3.functional.shaders.arrays.compare.equal_highp_vec4_highp_vec4_vertex + */ diff --git a/src/intel/compiler/jay/jay_builder.h b/src/intel/compiler/jay/jay_builder.h new file mode 100644 index 00000000000..a65b826e9f2 --- /dev/null +++ b/src/intel/compiler/jay/jay_builder.h @@ -0,0 +1,643 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "jay_ir.h" +#include "jay_opcodes.h" + +/* Like in NIR, for use with the builder */ +enum jay_cursor_option { + jay_cursor_after_block, + jay_cursor_before_inst, + jay_cursor_after_inst +}; + +typedef struct PACKED { + union { + jay_block *block; + jay_inst *inst; + }; + + enum jay_cursor_option option; +} jay_cursor; + +static inline bool +jay_cursors_equal(jay_cursor a, jay_cursor b) +{ + return !memcmp(&a, &b, sizeof(a)); +} + +static inline jay_cursor +jay_after_block(jay_block *block) +{ + return (jay_cursor) { .block = block, .option = jay_cursor_after_block }; +} + +static inline jay_cursor +jay_before_inst(jay_inst *I) +{ + return (jay_cursor) { .inst = I, .option = jay_cursor_before_inst }; +} + +static inline jay_cursor +jay_after_inst(jay_inst *I) +{ + return (jay_cursor) { .inst = I, .option = jay_cursor_after_inst }; +} + +static inline jay_cursor +jay_before_block(jay_block *block) +{ + jay_foreach_inst_in_block(block, I) { + if (I->op != JAY_OPCODE_PHI_DST && + I->op != JAY_OPCODE_PRELOAD && + I->op != JAY_OPCODE_ELSE) + return jay_before_inst(I); + } + + /* Whole block is phis, so insert at the end */ + return jay_after_block(block); +} + +static inline jay_cursor +jay_after_block_logical(jay_block *block) +{ + 
+   jay_foreach_inst_in_block_rev(block, I) {
+      if (I->op != JAY_OPCODE_PHI_SRC && !jay_op_is_control_flow(I->op))
+         return jay_after_inst(I);
+   }
+
+   /* Whole block is phis, so insert at the start */
+   return jay_before_block(block);
+}
+
+static inline jay_cursor
+jay_before_jump(jay_block *block)
+{
+   jay_inst *jump = jay_block_ending_jump(block);
+   return jump ? jay_before_inst(jump) : jay_after_block(block);
+}
+
+/* Get a cursor at the start of a function, after any preloads */
+static inline jay_cursor
+jay_before_function(jay_function *f)
+{
+   jay_block *block = jay_first_block(f);
+
+   jay_foreach_inst_in_block(block, I) {
+      if (I->op != JAY_OPCODE_PRELOAD)
+         return jay_before_inst(I);
+   }
+
+   /* The whole block is preloads, so insert at the end */
+   return jay_after_block(block);
+}
+
+/*
+ * Map a control flow edge to a block. If the predecessor has a single
+ * successor, the edge is the predecessor's only exit, so map to the
+ * predecessor. Otherwise, map to the successor, which must not have other
+ * predecessors since there are no critical edges.
+ */
+static inline jay_block *
+jay_edge_to_block(jay_block *pred, jay_block *succ)
+{
+   assert(jay_num_successors(pred) == 1 || jay_num_predecessors(succ) == 1);
+   return jay_num_successors(pred) == 1 ? pred : succ;
+}
+
+/*
+ * Get a cursor to insert along a control flow edge: either at the start of
+ * the successor or the end of the predecessor. This relies on the control
+ * flow graph having no critical edges.
+ */
+static inline jay_cursor
+jay_along_edge(jay_block *pred, jay_block *succ)
+{
+   jay_block *to = jay_edge_to_block(pred, succ);
+
+   if (to == pred)
+      return jay_after_block_logical(pred);
+   else
+      return jay_before_block(succ);
+}
+
+typedef struct {
+   jay_shader *shader;
+   jay_function *func;
+   jay_cursor cursor;
+} jay_builder;
+
+static inline jay_builder
+jay_init_builder(jay_function *f, jay_cursor cursor)
+{
+   return (jay_builder) { .shader = f->shader, .func = f, .cursor = cursor };
+}
+
+static inline void
+jay_builder_insert(jay_builder *b, jay_inst *I)
+{
+   jay_cursor *cursor = &b->cursor;
+
+   if (cursor->option == jay_cursor_after_inst) {
+      list_add(&I->link, &cursor->inst->link);
+   } else if (cursor->option == jay_cursor_after_block) {
+      list_addtail(&I->link, &cursor->block->instructions);
+   } else {
+      assert(cursor->option == jay_cursor_before_inst);
+      list_addtail(&I->link, &cursor->inst->link);
+   }
+
+   cursor->option = jay_cursor_after_inst;
+   cursor->inst = I;
+}
+
+static inline jay_def
+jay_alloc_def(jay_builder *b, enum jay_file file, unsigned size)
+{
+   unsigned idx = b->func->ssa_alloc;
+   b->func->ssa_alloc += size;
+   return jay_contiguous_def(file, idx, size);
+}
+
+/*
+ * Collect SSA indices into a source. If the indices are not contiguous, this
+ * uses a heap-allocated collect. Otherwise, a contiguous def is used.
+ */
+static inline jay_def
+jay_collect(jay_builder *b,
+            enum jay_file file,
+            const uint32_t *indices,
+            unsigned nr)
+{
+   if (nr == 0)
+      return jay_null();
+
+   for (unsigned i = 1; i < nr; ++i) {
+      if (indices[i] != (indices[0] + i)) {
+         static_assert(sizeof(uintptr_t) <= sizeof(uint64_t) &&
+                       "sorry, no Morello support");
+         void *dup =
+            linear_memdup(b->shader->lin_ctx, indices, sizeof(uint32_t) * nr);
+         uint64_t payload = (uintptr_t) dup;
+
+         /* We require pointers to fit within (32+JAY_REG_BITS) bits. Luckily
+          * this will always be the case on common architectures.
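+          *
+          * e.g. (illustrative): with 48-bit user-space addresses on LP64,
+          * the low 32 bits land in _payload and the remaining high bits fit
+          * in .reg.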
+ */ + assert(payload < (1ull << (32 + JAY_REG_BITS))); + + return (jay_def) { + ._payload = (uint32_t) payload, + .reg = (uint32_t) (payload >> 32), + .file = file, + .num_values_m1 = nr - 1, + .collect = true, + }; + } + } + + return jay_contiguous_def(file, indices[0], nr); +} + +/* + * Set the n'th channel of a def to index. This requires a copy-on-write. + * + * This implementation could likely be optimized. + */ +static inline void +jay_insert_channel(jay_builder *b, jay_def *d, unsigned c, jay_def scalar) +{ + uint32_t indices[JAY_MAX_DEF_LENGTH]; + uint32_t count = jay_num_values(*d); + + assert(scalar.file == d->file && !scalar.negate && !scalar.abs); + assert(c < count && count <= ARRAY_SIZE(indices)); + + /* First, decompress the def. */ + jay_foreach_comp(*d, i) { + indices[i] = jay_channel(*d, i); + } + + /* Next, update the indices in place */ + indices[c] = jay_index(scalar); + + /* Now collect it back. */ + jay_replace_src(d, jay_collect(b, d->file, indices, count)); +} + +/* + * Concatenate a list of vectors, collecting all the indices in order. + */ +static inline jay_def +jay_collect_vectors(jay_builder *b, jay_def *vecs, uint32_t nr) +{ + uint32_t indices[JAY_MAX_DEF_LENGTH]; + uint32_t nr_indices = 0; + + for (unsigned i = 0; i < nr; ++i) { + assert(vecs[i].file == vecs[0].file && jay_is_ssa(vecs[i])); + assert(!vecs[i].negate && !vecs[i].abs); + + jay_foreach_comp(vecs[i], c) { + assert(nr_indices < ARRAY_SIZE(indices)); + indices[nr_indices++] = jay_channel(vecs[i], c); + } + } + + return jay_collect(b, vecs[0].file, indices, nr_indices); +} + +static inline jay_def +jay_collect_two(jay_builder *b, jay_def u, jay_def v) +{ + jay_def vecs[] = { u, v }; + return jay_collect_vectors(b, vecs, 2); +} + +static inline jay_inst * +jay_alloc_inst(jay_builder *b, + enum jay_opcode op, + uint8_t num_srcs, + unsigned extra_bytes) +{ + const size_t size = + offsetof(jay_inst, src) + num_srcs * sizeof(jay_def) + extra_bytes; + + jay_inst *I = (jay_inst *) linear_zalloc_child(b->shader->lin_ctx, size); + I->op = op; + I->num_srcs = num_srcs; + I->dst = jay_null(); + I->cond_flag = jay_null(); + + return I; +} + +static inline void +jay_shrink_sources(jay_inst *I, uint8_t new_num_srcs) +{ + assert(new_num_srcs < I->num_srcs); + unsigned info_size = jay_inst_info_size(I); + + memmove(&I->src[new_num_srcs], &I->src[I->num_srcs], info_size); + I->num_srcs = new_num_srcs; +} + +static inline jay_inst * +jay_clone_inst(jay_builder *b, jay_inst *I, uint8_t new_num_srcs) +{ + assert(new_num_srcs >= I->num_srcs); + unsigned info_size = jay_inst_info_size(I); + + jay_inst *clone = jay_alloc_inst(b, I->op, new_num_srcs, info_size); + + memcpy((uint8_t *) clone + sizeof(struct list_head), + (uint8_t *) I + sizeof(struct list_head), + sizeof(jay_inst) - sizeof(struct list_head)); + + clone->num_srcs = new_num_srcs; + + memcpy(clone->src, I->src, I->num_srcs * sizeof(jay_def)); + memcpy(&clone->src[new_num_srcs], &I->src[I->num_srcs], info_size); + return clone; +} + +static inline jay_inst * +jay_grow_sources(jay_builder *b, jay_inst *I, uint8_t new_num_srcs) +{ + jay_inst *clone = jay_clone_inst(b, I, new_num_srcs); + + if ((b->cursor.option == jay_cursor_before_inst || + b->cursor.option == jay_cursor_after_inst) && + b->cursor.inst == I) { + + b->cursor.inst = clone; + } + + jay_builder b_ = jay_init_builder(b->func, jay_before_inst(I)); + jay_builder_insert(&b_, clone); + jay_remove_instruction(I); + return clone; +} + +static inline jay_inst * +jay_add_predicate_else(jay_builder *b, + 
jay_inst *I, + jay_def predicate, + jay_def default_value) +{ + assert(!I->predication && "pre-condition"); + assert(jay_is_flag(predicate) && jay_is_ssa(default_value)); + + unsigned pred_index = I->num_srcs; + I = jay_grow_sources(b, I, pred_index + 2); + I->src[pred_index] = predicate; + I->src[pred_index + 1] = default_value; + I->predication = JAY_PREDICATED_DEFAULT; + return I; +} + +static inline jay_inst * +jay_add_predicate(jay_builder *b, jay_inst *I, jay_def predicate) +{ + assert(!I->predication && "pre-condition"); + assert(jay_is_flag(predicate)); + + unsigned pred_index = I->num_srcs; + I = jay_grow_sources(b, I, pred_index + 1); + I->src[pred_index] = predicate; + I->predication = JAY_PREDICATED; + return I; +} + +static inline jay_inst * +jay_set_cond_flag(jay_builder *b, jay_inst *I, jay_def cond_flag) +{ + assert(jay_is_flag(cond_flag) && jay_is_null(I->cond_flag)); + + I->cond_flag = cond_flag; + return I; +} + +static inline jay_inst * +jay_set_conditional_mod(jay_builder *b, + jay_inst *I, + jay_def cond_flag, + enum jay_conditional_mod cmod) +{ + I->conditional_mod = cmod; + return jay_set_cond_flag(b, I, cond_flag); +} + +static inline jay_def +jay_identity_def(jay_def x) +{ + return x; +} + +#ifdef __cplusplus +static inline jay_def +JAY_BUILD_SRC(jay_def x) +{ + return x; +} +static inline jay_def +JAY_BUILD_SRC(uint32_t x) +{ + return jay_imm(x); +} +#else +#define JAY_BUILD_SRC(X) \ + _Generic((X), \ + jay_def: jay_identity_def, \ + uint32_t: jay_imm, \ + int32_t: jay_imm, \ + uint8_t: jay_imm)(X) +#endif + +/* Include generated builder helpers */ +#include "jay_builder_opcodes.h" + +static inline jay_inst * +_jay_CMP(jay_builder *b, + enum jay_type src_type, + enum jay_conditional_mod cmod, + jay_def dst, + jay_def src0, + jay_def src1) +{ + jay_inst *I = jay_alloc_inst(b, JAY_OPCODE_CMP, 2, 0); + I->type = src_type; + I->src[0] = src0; + I->src[1] = src1; + + /* Even if we want to write a 32-bit 0/~0 result, we still need to + * register-allocate a flag, since the hardware will implicitly clobber one + * regardless. + */ + if (!jay_is_flag(dst)) { + I->dst = dst; + dst = jay_alloc_def(b, dst.file == UGPR ? UFLAG : FLAG, 1); + } + + jay_set_conditional_mod(b, I, dst, cmod); + jay_builder_insert(b, I); + return I; +} + +#define jay_CMP(b, st, cmod, dst, src0, src1) \ + _jay_CMP(b, st, cmod, dst, JAY_BUILD_SRC(src0), JAY_BUILD_SRC(src1)) + +struct jayb_send_params { + enum brw_sfid sfid; + uint64_t msg_desc; + jay_def dst; + jay_def header; + jay_def *srcs; + jay_def desc, ex_desc; + enum jay_type type; + enum jay_type src_type[2]; + unsigned nr_srcs; + uint32_t ex_desc_imm; + bool eot; + bool check_tdr; + bool uniform; + bool bindless; +}; + +static inline jay_inst * +_jay_SEND(jay_builder *b, const struct jayb_send_params p) +{ + const struct intel_device_info *devinfo = b->shader->devinfo; + jay_inst *I = jay_alloc_inst(b, JAY_OPCODE_SEND, 4, sizeof(jay_send_info)); + jay_send_info *info = jay_get_send_info(I); + bool has_header = !jay_is_null(p.header); + + I->dst = p.dst; + I->type = p.type; + + assert(I->type); + info->type_0 = p.src_type[0] ? p.src_type[0] : I->type; + info->type_1 = p.src_type[1] ? p.src_type[1] : info->type_0; + + if (has_header) { + assert(p.nr_srcs == 1 || info->type_0 == info->type_1); + + /* If there is a message header, split the send into
+       * header and payload parts, since the header is UGPR but the payload
+       * is GPR.
+       */
+      I->src[2] = p.header;
+      I->src[3] = jay_collect_vectors(b, &p.srcs[0], p.nr_srcs);
+      info->type_1 = info->type_0;
+      info->type_0 = JAY_TYPE_U32 /* header type */;
+   } else if (jay_type_size_bits(info->type_0) == 16 &&
+              !p.uniform &&
+              b->shader->dispatch_width == 32) {
+      /* Pack 16-bit vectors to match the hardware's data model.
+       *
+       * XXX: This is a hack. Move to NIR for better
+       * codegen in tests like
+       * dEQP-GLES31.functional.texture.multisample.samples_4.use_texture_int_2d_array.
+       */
+      assert(info->type_0 == info->type_1);
+      jay_def srcs[8];
+      unsigned n = 0, i;
+      for (i = 0; i + 2 <= p.nr_srcs; i += 2) {
+         assert(p.srcs[i].file == p.srcs[i + 1].file);
+         assert(jay_num_values(p.srcs[i]) == jay_num_values(p.srcs[i + 1]));
+
+         for (unsigned c = 1; c < jay_num_values(p.srcs[i]); ++c) {
+            assert(jay_channel(p.srcs[i], c) == 0);
+            assert(jay_channel(p.srcs[i + 1], c) == 0);
+         }
+
+         jay_def lo = jay_extract(p.srcs[i], 0),
+                 hi = jay_extract(p.srcs[i + 1], 0);
+         jay_def bfi = jay_BFI2_u32(b, 0xffff0000, hi, lo);
+
+         if (p.srcs[i].file == UGPR) {
+            uint32_t defs[16] = { jay_index(bfi) };
+            srcs[n++] = jay_collect(b, UGPR, defs, jay_ugpr_per_grf(b->shader));
+         } else {
+            srcs[n++] = bfi;
+         }
+      }
+      if (i < p.nr_srcs) {
+         srcs[n++] = p.srcs[i++];
+      }
+      assert(i == p.nr_srcs);
+
+      I->src[2] = jay_collect_vectors(b, srcs, n);
+      I->src[3] = jay_null();
+   } else if (p.nr_srcs <= 2) {
+      /* Easy case: keep everything scalar */
+      I->src[2] = p.nr_srcs > 0 ? p.srcs[0] : jay_null();
+      I->src[3] = p.nr_srcs > 1 ? p.srcs[1] : jay_null();
+   } else {
+      /* Otherwise, we need to pick a point to split at.
+       *
+       * Heuristic: don't split render target writes because RA gets confused
+       * with the EOT requirements. Split everything else in half.
+       *
+       * TODO: Come up with a better heuristic.
+       */
+      assert(info->type_0 == info->type_1);
+      unsigned split = !p.check_tdr ? DIV_ROUND_UP(p.nr_srcs, 2) : p.nr_srcs;
+      I->src[2] = jay_collect_vectors(b, &p.srcs[0], split);
+      I->src[3] = jay_collect_vectors(b, &p.srcs[split], p.nr_srcs - split);
+   }
+
+   /* For message headers we pack a UGPR vector as a single GRF */
+   unsigned lens[3];
+   for (unsigned i = 0; i < 3; ++i) {
+      jay_def x = i == 0 ? I->dst : I->src[1 + i];
+      lens[i] = jay_num_values(x);
+
+      /* XXX: For the non-transpose uniform case, do we need to pad out
+       * with undefs for correctness so we don't fall off the side of the
+       * regfile? for sends like:
+       *
+       *    (1&W) mov.u32 u10.0, u0.8 | A@1
+       *    (1&W) mov.u32 u10.1, u0.9 | A@1
+       *    (1&W) send.u32 u12, g10, _, 0x04403580, 0x00000000
+       *          ugm MsgDesc: ( load, a64, d32, V4, L1STATE_L3MOCS dst_len =
+       *          4, src0_len = 2, src1_len = 0 flat ) base_offset 0 | A@1 $0
+       *
+       * We don't care what's in g11, but it has to *exist*. But that is
+       * probably implicitly correct as long as the reg file ends with GRFs.
+       * Which it has to.
+       */
+      if (x.file == UGPR) {
+         lens[i] = DIV_ROUND_UP(lens[i], jay_ugpr_per_grf(b->shader));
+      } else {
+         lens[i] *= jay_grf_per_gpr(b->shader);
+      }
+
+      lens[i] *= reg_unit(devinfo);
+   }
+
+   info->sfid = p.sfid;
+   info->eot = p.eot;
+   info->check_tdr = p.check_tdr;
+   info->uniform = p.uniform;
+   info->bindless = p.bindless;
+   info->ex_desc_imm = p.ex_desc_imm;
+   info->ex_mlen = lens[2];
+   I->src[0] = jay_imm(((uint32_t) p.msg_desc) |
+                       brw_message_desc(devinfo, lens[1], lens[0], has_header));
+
+   if (!jay_is_null(p.desc)) {
+      jay_def a = jay_alloc_def(b, J_ADDRESS, 1);
+      jay_OR(b, JAY_TYPE_U32, a, p.desc, I->src[0]);
+      I->src[0] = a;
+   }
+
+   if (jay_is_null(p.ex_desc)) {
+      I->src[1] =
+         jay_imm(brw_message_ex_desc(devinfo, lens[2]) | (p.msg_desc >> 32));
+   } else if (p.ex_desc.file == J_ADDRESS) {
+      I->src[1] = p.ex_desc;
+   } else {
+      I->src[1] = jay_alloc_def(b, J_ADDRESS, 1);
+      if (info->bindless) {
+         jay_MOV(b, I->src[1], p.ex_desc);
+      } else {
+         jay_OR(b, JAY_TYPE_U32, I->src[1], p.ex_desc,
+                brw_message_ex_desc(devinfo, info->ex_mlen));
+      }
+   }

+   assert(!info->uniform || jay_is_null(I->dst) || I->dst.file == UGPR);
+   jay_builder_insert(b, I);
+   return I;
+}
+
+#define jay_SEND(b, ...) _jay_SEND(b, (struct jayb_send_params) { __VA_ARGS__ })
+
+static inline void
+jay_copy_strided(jay_builder *b, jay_def dst, jay_def src, bool src_strided)
+{
+   unsigned src_stride = src_strided ? jay_ugpr_per_grf(b->shader) : 1;
+   uint32_t n = MIN2(jay_num_values(dst), jay_num_values(src) / src_stride);
+
+   for (unsigned i = 0; i < n; ++i) {
+      jay_MOV(b, jay_extract(dst, i), jay_extract(src, i * src_stride));
+   }
+}
+
+static inline void
+jay_copy(jay_builder *b, jay_def dst, jay_def src)
+{
+   jay_copy_strided(b, dst, src, false);
+}
+
+static inline jay_def
+jay_as_gpr(jay_builder *b, jay_def src)
+{
+   if (src.file == GPR || jay_is_null(src))
+      return src;
+
+   jay_def def = jay_alloc_def(b, GPR, jay_num_values(src));
+   jay_copy(b, def, src);
+   return def;
+}
+
+static inline void
+jay_i2i32(jay_builder *b, jay_def dst, unsigned src_bits, jay_def src)
+{
+   if (src_bits < 32) {
+      jay_CVT(b, JAY_TYPE_S32, dst, src, jay_type(JAY_TYPE_S, src_bits),
+              JAY_ROUND, 0);
+   } else if (src_bits == 32) {
+      jay_MOV(b, dst, src);
+   } else {
+      assert(src.reg == 0 && ".reg not preserved in this path but that's OK");
+      jay_MOV(b, dst, jay_extract(src, 0));
+   }
+}
diff --git a/src/intel/compiler/jay/jay_builder_opcodes.h.py b/src/intel/compiler/jay/jay_builder_opcodes.h.py
new file mode 100644
index 00000000000..735a653f08e
--- /dev/null
+++ b/src/intel/compiler/jay/jay_builder_opcodes.h.py
@@ -0,0 +1,153 @@
+# Copyright 2026 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+from typing import TYPE_CHECKING
+import argparse
+import sys
+
+from mako import exceptions
+from mako.template import Template
+
+from jay_opcodes import OPCODES
+
+if TYPE_CHECKING:
+    from jay_opcodes import Opcode
+
+
+def infer_type(op: 'Opcode') -> bool:
+    return op.has_dest and (set(op.types) <= set(["u1", "u32", "u64"]) or
+                            op.name == 'mov')
+
+
+def signature(op: 'Opcode', with_dest: bool = True, with_types: bool = False,
+              mode: str = 'prototype', type_: str = 't', src: str = '{}') -> str:
+    arr = [('jay_builder *', 'b')]
+
+    if with_types and len(op.types) > 1 and not infer_type(op):
+        arr += [('enum jay_type', type_)]
+
+    if with_dest and op.has_dest:
+        arr += [('jay_def', 'dst')]
+
+    arr += [('jay_def', src.format(f'src{i}')) for i in range(op.num_srcs)]
+    arr += [x for x in op.extra_struct if not x[1].startswith('pad')]
+
+    return ', '.join([(t + ' ' if mode == 'prototype' else '') + v for t,
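+                      # e.g. (illustrative) for a two-source op with a dest:
+                      #   prototype: "jay_builder *b, jay_def dst, jay_def src0, jay_def src1"
+                      #   call:      "b, dst, src0, src1"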
v in arr]) + + +TEMPLATE = """ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ +#pragma once + +#include "jay_private.h" + +#ifndef NDEBUG +#define type_assert(op, ...) if (!(__VA_ARGS__)) { fprintf(stderr, "%s does not allow type: ", #op); jay_print_type(stderr, t); fprintf(stderr, "\\n"); } assert(__VA_ARGS__) +#else +#define type_assert(...) +#endif + +% for op in opcodes.values(): +<% + OPCODE = op.name.upper() + num_srcs = op.num_srcs + has_dest = op.has_dest + multi_type = len(op.types) > 1 + info_size = f'sizeof(jay_{op.name}_info)' if op.extra_struct else '0' + operands = ["dst"] + [f"src{i}" for i in range(num_srcs)] + if num_srcs > 0: + uniform = " && " .join([f"jay_is_uniform(src{i})" for i in range(num_srcs)]) + reg_file = f"({uniform}) ? UGPR : GPR" + else: + reg_file = "GPR" + if not op.types: + continue + # Ignore the lane index when determining the type of a shuffle + infer_operands = operands[0:-1] if op.name == "shuffle" else operands +%> +static inline jay_inst * +_jay_${OPCODE}(${signature(op, with_types = True)}) +{ +% if infer_type(op): + enum jay_type t = jay_num_values(dst) == 2 ? JAY_TYPE_U64 : + ${" && ".join([f"(jay_is_flag({x}) || jay_is_imm({x}))" for x in infer_operands])} + ? JAY_TYPE_U1 : JAY_TYPE_U32; +% elif multi_type: + type_assert(${OPCODE}, 0 +% for type in op.types: + || t == JAY_TYPE_${type.upper()} +% endfor + ); + +% else: + enum jay_type t = JAY_TYPE_${op.types[0].upper()}; + +% endif + jay_inst *inst = jay_alloc_inst(b, JAY_OPCODE_${OPCODE}, ${num_srcs}, ${info_size}); +% for _, prop in op.extra_struct: +% if not prop.startswith('pad'): + jay_set_${op.name}_${prop}(inst, ${prop}); +% endif +% endfor + + inst->type = t; +% if op.has_dest: + inst->dst = dst; +% endif +% for i in range(num_srcs): + inst->src[${i}] = src${i}; +% endfor + + jay_builder_insert(b, inst); + return inst; +} + +#define jay_${OPCODE}(${signature(op, with_types = True, mode = 'call')}) _jay_${OPCODE}(${signature(op, with_types = True, src = 'JAY_BUILD_SRC({})', mode='call')}) + +% for type in op.types: +static inline ${'jay_def' if op.has_dest else 'void'} +_jay_${OPCODE}_${type}(${signature(op, with_dest = False)}) +{ +% if op.has_dest: + jay_def dst = jay_alloc_def(b, ${reg_file}, ${2 if '64' in type else 1}); +%endif + jay_${OPCODE}(${signature(op, with_types = True, type_ = 'JAY_TYPE_'+type.upper(), mode = 'call')}); +% if op.has_dest: + return dst; +% endif +} +#define jay_${OPCODE}_${type}(${signature(op, with_dest = False, mode = +'call')}) _jay_${OPCODE}_${type}(${signature(op, src='JAY_BUILD_SRC({})', mode = 'call', with_dest = False)}) +% endfor + +% endfor + +#undef type_assert +""" + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument('output', action='store') + args = parser.parse_args() + + ops = {op: v for (op, v) in OPCODES.items() if op not in {'cmp', 'send'}} + + try: + with open(args.output, 'w', encoding='utf-8') as f: + f.write(Template(TEMPLATE).render( + opcodes=ops, + signature=signature, + infer_type=infer_type)) + except Exception: + print(exceptions.text_error_template().render()) + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/intel/compiler/jay/jay_extra_info.h.py b/src/intel/compiler/jay/jay_extra_info.h.py new file mode 100644 index 00000000000..cffe74fe5eb --- /dev/null +++ b/src/intel/compiler/jay/jay_extra_info.h.py @@ -0,0 +1,153 @@ +# Copyright 2026 Intel Corporation +# SPDX-License-Identifier: MIT + +import argparse +import sys + +from 
mako import exceptions +from mako.template import Template + +from jay_opcodes import OPCODES, ENUMS + +TEMPLATE = """/* Do not include directly */ +PRAGMA_DIAGNOSTIC_PUSH +PRAGMA_DIAGNOSTIC_ERROR(-Wpadded) + +% for enum, (prefix, values) in enums.items(): +% if enum.startswith('jay'): +enum PACKED ${enum} { +% for v in values: + ${prefix}_${v.upper()}, +% endfor +}; +% endif +% endfor + +% for name, op in opcodes: +typedef struct jay_${name}_info { +% for T, prop in op.extra_struct: + ${T} ${prop}; +% endfor +} jay_${name}_info; + +% for prefix, _suffix in [('const ', '_const'), ('', '')]: +static inline ${prefix} struct jay_${name}_info * +jay_get_${name}_info${_suffix}(${prefix}jay_inst *I) +{ + assert(I->op == JAY_OPCODE_${name.upper()}); + return (${prefix}struct jay_${name}_info *) &I->src[I->num_srcs]; +} + +% endfor +% for T, prop in op.extra_struct: +% if not prop.startswith('pad'): +static inline ${T} +jay_${name}_${prop}(const jay_inst *I) +{ + return jay_get_${name}_info_const(I)->${prop}; +} + +static inline void +jay_set_${name}_${prop}(jay_inst *I, ${T} value) +{ + jay_get_${name}_info(I)->${prop} = value; +} + +% endif +% endfor +% endfor + +static inline unsigned +jay_inst_info_size(jay_inst *I) +{ + switch (I->op) { +% for name, op in opcodes: + case JAY_OPCODE_${name.upper()}: return sizeof(struct jay_${name}_info); +% endfor + default: return 0; + } +} + +#ifndef __cplusplus +static inline const char * +jay_print_inst_info(FILE *fp, const jay_inst *I, const char *sep) +{ + switch (I->op) { +% for name, op in opcodes: + case JAY_OPCODE_${name.upper()}: { +% for T, prop in op.extra_struct: +% if not (prop.startswith('pad') or name == 'bfn' or T == 'enum jay_type'): +<% + value = f"jay_{name}_{prop}(I)" + spec = '0x%"PRIx64"' if T == 'uint64_t' else "%u" +%> +% if T.startswith('enum') and T[5:] in enums: +<% + bare = T[5:] + prefix, values = enums[bare] +%> + const char *${bare}_str[] = { +% for v in values: + [${prefix}_${v.upper()}] = "${v}", +% endfor + }; + assert(${value} < ARRAY_SIZE(${bare}_str)); +<% + spec = "%s" + value = f'{T[5:]}_str[{value}]' +%> +% endif +% if T == 'enum jay_rounding_mode': + if (strcmp(${value}, "round")) { + fprintf(fp, "%s%s", sep, ${value}); + sep = ", "; + } +% elif T == 'bool': + if (${value}) { + fprintf(fp, "%s${prop}", sep); + sep = ", "; + } +% elif T.startswith('enum') or len(op.extra_struct) == 1: + fprintf(fp, "%s${spec}", sep, ${value}); + sep = ", "; +% else: + if (${value}) { + fprintf(fp, "%s${prop}=${spec}", sep, ${value}); + sep = ", "; + } +% endif +% endif +% endfor + break; + } +% endfor + default: break; + } + + return sep; +} +#endif + +PRAGMA_DIAGNOSTIC_POP +""" + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument('output', action='store') + args = parser.parse_args() + + try: + with open(args.output, 'w', encoding='utf-8') as f: + f.write(Template(TEMPLATE).render( + opcodes=[(k, v) for k, v in OPCODES.items() if v.extra_struct], + enums=ENUMS)) + except Exception: + print(exceptions.text_error_template().render()) + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c new file mode 100644 index 00000000000..de24701b7ad --- /dev/null +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -0,0 +1,3838 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/brw/brw_compiler.h" +#include "compiler/brw/brw_eu.h" +#include 
"compiler/brw/brw_eu_defines.h" +#include "compiler/brw/brw_nir.h" +#include "compiler/brw/brw_private.h" +#include "compiler/brw/brw_sampler.h" +#include "compiler/intel_nir.h" +#include "compiler/intel_shader_enums.h" +#include "compiler/list.h" +#include "intel/dev/intel_debug.h" +#include "util/bitpack_helpers.h" +#include "util/bitscan.h" +#include "util/bitset.h" +#include "util/lut.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "intel_device_info_gen.h" +#include "jay.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_builder_opcodes.h" +#include "nir_defines.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "nir_opcodes.h" +#include "shader_enums.h" +#include "shader_stats.h" + +static const struct debug_named_value jay_debug_options[] = { + { "noopt", JAY_DBG_NOOPT, "Disable backend optimizer" }, + { "printdemand", JAY_DBG_PRINTDEMAND, "Print demand per instruction" }, + { "spill", JAY_DBG_SPILL, "Shrink register file to test spilling" }, + { "sync", JAY_DBG_SYNC, "Sync after every instruction" }, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(jay_debug, "JAY_DEBUG", jay_debug_options, 0) +int jay_debug = 0; + +typedef struct jay_vs_payload { + /* "the maximum limit is 30 elements per vertex" (bspec 56124) */ + jay_def attributes[30 * 4]; +} jay_vs_payload; + +typedef struct jay_cs_payload { + jay_def local_invocation_ids; +} jay_cs_payload; + +typedef struct jay_fs_payload { + jay_def bary[INTEL_BARYCENTRIC_MODE_COUNT]; + + struct { + jay_def xy, z, w; + } coord; + + jay_def pixel_sample_mask; + jay_def deltas[64]; +} jay_fs_payload; + +struct nir_to_jay_state { + jay_shader *s; + jay_function *f; + const nir_shader *nir; + const struct intel_device_info *devinfo; + + jay_builder bld; + + jay_block *current_block; + jay_block *after_block; + jay_block *break_block; + + unsigned indent; + + /* We cache ballot(true), ctz(ballot(true)), and 4*ctz(ballot(true)) within a + * block. If we had competent backend CSE - or emitted uniformize in NIR and + * taught NIR's CSE about ballots - we could remove this kludge. + */ + jay_def active_lane_mask, active_lane, active_lane_x4; + + /* These defs contain the extracted payload. They are only valid while + * translating NIR->Jay since they aren't maintained by Jay passes. + */ + struct { + jay_def u0, u1; + jay_def sampler_state_pointer, scratch_surface; + jay_def inline_data; + jay_def push_data[512]; + jay_def lane_id; + jay_def urb_handle; + + union { + jay_vs_payload vs; + jay_cs_payload cs; + jay_fs_payload fs; + }; + } payload; +}; + +static jay_def +payload_u1(struct nir_to_jay_state *nj, unsigned idx, unsigned len) +{ + if (jay_is_null(nj->payload.u1)) + return jay_null(); + else + return jay_extract_range(nj->payload.u1, idx, len); +} + +static jay_def +emit_active_lane_mask(struct nir_to_jay_state *nj) +{ + /* TODO: We don't use jay_exec_mask yet due to hardware issues */ + if (jay_is_null(nj->active_lane_mask)) { + nj->active_lane_mask = jay_alloc_def(&nj->bld, FLAG, 1); + jay_MOV(&nj->bld, nj->active_lane_mask, 1); + } + + return nj->active_lane_mask; +} + +static jay_def +emit_active_lane(struct nir_to_jay_state *nj) +{ + /* For this instruction to execute, some lane must be active. 
Therefore there + * is a 1 in the lower [dispatch width] bits of the lane mask, so we may + * equivalently use fbl.u32 instead of fbl.u[dispatch width]. + */ + if (jay_is_null(nj->active_lane)) { + nj->active_lane = jay_alloc_def(&nj->bld, UGPR, 1); + jay_FBL(&nj->bld, nj->active_lane, emit_active_lane_mask(nj)); + } + + return nj->active_lane; +} + +static jay_def +emit_uniformize(struct nir_to_jay_state *nj, jay_def x) +{ + jay_builder *b = &nj->bld; + if (x.file != GPR && x.file != FLAG) { + return x; + } + + if (jay_is_null(nj->active_lane_x4)) { + nj->active_lane_x4 = jay_SHL_u32(b, emit_active_lane(nj), 2); + } + + jay_def u = jay_alloc_def(b, x.file == FLAG ? UFLAG : UGPR, 1); + jay_SHUFFLE(b, u, x, nj->active_lane_x4); + return u; +} + +static jay_block *jay_emit_cf_list(struct nir_to_jay_state *nj, + struct exec_list *list); + +/** Returns true if the entire compute workgroup fits in a single subgroup. */ +static bool +jay_workgroup_is_one_subgroup(jay_builder *b, const nir_shader *nir) +{ + return mesa_shader_stage_uses_workgroup(nir->info.stage) && + !nir->info.workgroup_size_variable && + nir_static_workgroup_size(nir) <= b->shader->dispatch_width; +} + +static enum jay_type +jay_base_type_for_nir(nir_alu_type nir_type) +{ + /* clang-format off */ + switch (nir_alu_type_get_base_type(nir_type)) { + case nir_type_int: return JAY_TYPE_S; + case nir_type_uint: return JAY_TYPE_U; + case nir_type_bool: return JAY_TYPE_S; + case nir_type_float: return JAY_TYPE_F; + default: UNREACHABLE("invalid NIR type"); + } + /* clang-format on */ +} + +static enum jay_file +jay_file_for_def(const nir_def *def) +{ + return def->bit_size == 1 ? (def->divergent ? FLAG : UFLAG) : + (def->divergent ? GPR : UGPR); +} + +/** + * Returns an jay_type for the ALU op's i-th source. + * (Useful for conversions and comparisons.) + */ +static enum jay_type +jay_alu_source_type(nir_alu_instr *alu, unsigned i) +{ + return jay_type(jay_base_type_for_nir(nir_op_infos[alu->op].input_types[i]), + nir_src_bit_size(alu->src[i].src)); +} + +static inline jay_def +nj_def(nir_def *def) +{ + unsigned bits = def->num_components * MAX2(def->bit_size, 32); + unsigned words = DIV_ROUND_UP(bits, 32); + + return jay_contiguous_def(jay_file_for_def(def), def->index, words); +} + +static inline jay_def +nj_src(nir_src src) +{ + return nj_def(src.ssa); +} + +static void +jay_emit_alu(struct nir_to_jay_state *nj, nir_alu_instr *alu) +{ + jay_builder *b = &nj->bld; + jay_def dst = nj_def(&alu->def); + + nir_alu_type nir_type = nir_op_infos[alu->op].output_type; + enum jay_type base_type = jay_base_type_for_nir(nir_type); + enum jay_type type = jay_type(base_type, alu->def.bit_size); + + jay_def src[NIR_ALU_MAX_INPUTS]; + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { + unsigned len = nir_src_bit_size(alu->src[i].src) == 64 ? 
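+         /* 64-bit values occupy two 32-bit words in the IR (see nj_def),
+          * so source swizzles are scaled by the word count.
+          */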
2 : 1; + src[i] = jay_extract_range(nj_src(alu->src[i].src), + len * alu->src[i].swizzle[0], len); + } + + switch (alu->op) { +#define CMP(op, jay) \ + case nir_op_##op: \ + jay_CMP(b, jay_alu_source_type(alu, 0), JAY_CONDITIONAL_##jay, dst, \ + src[0], src[1]); \ + break; + +#define UNOP(nir, jay_op) \ + case nir_op_##nir: \ + jay_##jay_op(b, type, dst, src[0]); \ + break; + +#define MATH(nir, jay_op) \ + case nir_op_##nir: \ + jay_MATH(b, type, dst, src[0], JAY_MATH_##jay_op); \ + break; + +#define UNOP_UNTYPED(nir, jay_op) \ + case nir_op_##nir: \ + jay_##jay_op(b, dst, src[0]); \ + break; + +#define BINOP(nir, jay_op) \ + case nir_op_##nir: \ + jay_##jay_op(b, type, dst, src[0], src[1]); \ + break; + +#define DP4A(nir, jay_op, sat_) \ + case nir_op_##nir: \ + jay_DP4A_##jay_op(b, dst, src[2], src[0], src[1])->saturate = sat_; \ + break; + + CMP(flt, LT) + CMP(ilt, LT) + CMP(ult, LT) + CMP(fge, GE) + CMP(ige, GE) + CMP(uge, GE) + CMP(feq, EQ) + CMP(ieq, EQ) + CMP(fneu, NE) + CMP(ine, NE) + + MATH(frcp, INV) + MATH(fexp2, EXP) + MATH(flog2, LOG) + MATH(fsin, SIN) + MATH(fcos, COS) + MATH(fsqrt, SQRT) + MATH(frsq, RSQ) + UNOP(ffract, FRC) + UNOP(ftrunc, RNDZ) + UNOP(ffloor, RNDD) + UNOP(fround_even, RNDE) + + UNOP_UNTYPED(mov, copy) + UNOP_UNTYPED(unpack_32_2x16_split_x, MOV) + UNOP_UNTYPED(b2b1, CAST_CANONICAL_TO_FLAG) + UNOP_UNTYPED(inot, NOT) + UNOP_UNTYPED(bitfield_reverse, BFREV) + UNOP_UNTYPED(bit_count, CBIT) + UNOP_UNTYPED(uclz, LZD) + UNOP_UNTYPED(find_lsb, FBL) + + BINOP(imin, MIN) + BINOP(umin, MIN) + BINOP(fmin, MIN) + BINOP(imax, MAX) + BINOP(umax, MAX) + BINOP(fmax, MAX) + BINOP(fadd, ADD) + BINOP(iadd, ADD) + BINOP(fmul, MUL) + BINOP(imul_32x16, MUL_32X16) + BINOP(umul_32x16, MUL_32X16) + BINOP(ishl, SHL) + BINOP(ishr, ASR) + BINOP(ushr, SHR) + BINOP(urol, ROL) + BINOP(uror, ROR) + BINOP(urhadd, AVG) + BINOP(irhadd, AVG) + BINOP(iand, AND) + BINOP(ior, OR) + BINOP(ixor, XOR) + + DP4A(sdot_4x8_iadd, SS, false) + DP4A(sdot_4x8_iadd_sat, SS, true) + DP4A(udot_4x8_uadd, UU, false) + DP4A(udot_4x8_uadd_sat, UU, true) + DP4A(sudot_4x8_iadd, SU, false) + DP4A(sudot_4x8_iadd_sat, SU, true) + +#undef CMP +#undef UNOP +#undef UNOP_UNTYPED +#undef BINOP +#undef DP4A + + case nir_op_imul: + if (jay_type_size_bits(type) == 32) { + jay_MUL_32(b, type, dst, src[0], src[1], false); + } else { + jay_MUL(b, type, dst, src[0], src[1]); + } + + break; + + case nir_op_imul_high: + case nir_op_umul_high: + jay_MUL_32(b, type, dst, src[0], src[1], true); + break; + + case nir_op_bfm: + jay_BFI1(b, dst, src[0], src[1]); + break; + + case nir_op_b2f64: + jay_SEL(b, JAY_TYPE_U32, jay_extract(dst, 1), 0x3ff00000, 0, src[0]); + jay_MOV(b, jay_extract(dst, 0), 0); + break; + + case nir_op_ufind_msb_rev: + case nir_op_ifind_msb_rev: + jay_FBH(b, jay_alu_source_type(alu, 0), dst, src[0]); + break; + + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: + assert(nir_src_bit_size(alu->src[0].src) > 1 && + "predicate conversions are lowered"); + + if (alu->def.bit_size <= nir_src_bit_size(alu->src[0].src)) { + /* Downconversion. Upper bits garbage convention makes this a no-op. + * The extract handles 64->32 narrowing conversions. 
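+       *
+       * e.g. (illustrative): u2u16 of a u32 value becomes a plain MOV;
+       * consumers only ever look at the low 16 bits.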
+ */ + jay_MOV(b, dst, jay_extract(src[0], 0)); + break; + } + + FALLTHROUGH; + case nir_op_i2f64: + case nir_op_i2i64: + case nir_op_u2u64: + case nir_op_u2f64: + case nir_op_f2f64: + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_f2i32: + case nir_op_f2u32: + case nir_op_f2i32_sat: + case nir_op_f2u32_sat: + case nir_op_i2f32: + case nir_op_u2f32: + case nir_op_f2f32: + case nir_op_i2f16: + case nir_op_u2f16: + case nir_op_f2f16: + case nir_op_f2i16: + case nir_op_f2u16: + case nir_op_f2i8: + case nir_op_f2u8: { + enum jay_type src_type = jay_alu_source_type(alu, 0); + + /* UGPR byte to float is not supported. Do it in 2 steps. */ + if (jay_type_size_bits(src_type) == 8 && + jay_base_type(type) == JAY_TYPE_F && + dst.file == UGPR) { + + enum jay_type integer = jay_type_rebase(type, jay_base_type(src_type)); + jay_def tmp = jay_alloc_def(b, UGPR, 1); + jay_CVT(b, integer, tmp, src[0], src_type, JAY_ROUND, 0); + jay_CVT(b, type, dst, tmp, integer, JAY_ROUND, 0); + } else { + jay_CVT(b, type, dst, src[0], src_type, JAY_ROUND, 0); + } + + break; + } + + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + jay_CVT(b, JAY_TYPE_F16, dst, src[0], jay_alu_source_type(alu, 0), + alu->op == nir_op_f2f16_rtz ? JAY_RTZ : JAY_RNE, 0); + break; + + case nir_op_fsat: + jay_MODIFIER(b, type, dst, src[0])->saturate = true; + break; + + case nir_op_fneg: + case nir_op_ineg: + jay_MODIFIER(b, type, dst, jay_negate(src[0])); + break; + + case nir_op_fabs: + case nir_op_iabs: + jay_MODIFIER(b, type, dst, jay_abs(src[0])); + break; + + case nir_op_iadd3: + jay_ADD3(b, type, dst, src[0], src[1], src[2]); + break; + + case nir_op_uadd_sat: + case nir_op_iadd_sat: + jay_ADD(b, type, dst, src[0], src[1])->saturate = true; + break; + + case nir_op_usub_sat: + case nir_op_isub_sat: + jay_ADD(b, type, dst, src[0], jay_negate(src[1]))->saturate = true; + break; + + case nir_op_ihadd: + case nir_op_uhadd: { + /* AVG(x, y) - ((x ^ y) & 1) */ + jay_def avg = jay_alloc_def(b, dst.file, 1); + jay_def bfn = jay_alloc_def(b, dst.file, 1); + jay_AVG(b, type, avg, src[0], src[1]); + jay_BFN(b, bfn, 1, src[0], src[1], UTIL_LUT3(a & (b ^ c))); + jay_ADD(b, type, dst, avg, jay_negate(bfn)); + break; + } + + case nir_op_unpack_64_2x32_split_x: + jay_MOV(b, dst, jay_extract(src[0], 0)); + break; + case nir_op_unpack_64_2x32_split_y: + jay_MOV(b, dst, jay_extract(src[0], 1)); + break; + case nir_op_unpack_32_2x16_split_y: + jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND, 1); + break; + + case nir_op_pack_32_4x8_split: { + /* TODO: Optimize */ + jay_def r = jay_BFI2_u32(b, 0x0000ff00, src[1], src[0]); + r = jay_BFI2_u32(b, 0x00ff0000, src[2], r); + jay_BFI2(b, dst, 0xff000000, src[3], r); + break; + } + + case nir_op_pack_32_2x16_split: + /* TODO: Optimize */ + jay_BFI2(b, dst, 0xffff0000, src[1], src[0]); + break; + + case nir_op_pack_64_2x32_split: + jay_MOV(b, jay_extract(dst, 0), src[0]); + jay_MOV(b, jay_extract(dst, 1), src[1]); + break; + + case nir_op_bitfield_select: + assert(jay_type_size_bits(type) <= 32); + jay_BFN(b, dst, src[0], src[1], src[2], UTIL_LUT3((a & b) | (~a & c))); + break; + + case nir_op_ubfe: + case nir_op_ibfe: + jay_BFE(b, type, dst, src[0], src[1], src[2]); + break; + case nir_op_bfi: + jay_BFI2(b, dst, src[0], src[1], src[2]); + break; + + case nir_op_ffma: + jay_MAD(b, type, dst, src[0], src[1], src[2]); + break; + + case nir_op_fcsel: + jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod = + JAY_CONDITIONAL_NE; + break; + + case nir_op_fcsel_gt: + case 
nir_op_i32csel_gt: + jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod = + JAY_CONDITIONAL_GT; + break; + + case nir_op_fcsel_ge: + case nir_op_i32csel_ge: + jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod = + JAY_CONDITIONAL_GE; + break; + + case nir_op_bcsel: + assert(alu->def.bit_size < 64); + assert(jay_is_flag(src[0])); + + /* b2i8 gets lowered into 8-bit csel. Just use the upper bits garbage + * convention to implement with SEL.u16 instead. + */ + if (type == JAY_TYPE_U8) { + type = JAY_TYPE_U16; + } + + jay_SEL(b, type, dst, src[1], src[2], src[0]); + break; + + case nir_op_extract_u8: + jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U8, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + case nir_op_extract_i8: + jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S8, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + case nir_op_extract_u16: + jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + case nir_op_extract_i16: + jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S16, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + default: + if (nir_op_is_vec(alu->op)) { + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { + unsigned len = jay_type_vector_length(type); + jay_copy(b, jay_extract_range(dst, len * i, len), src[i]); + } + + break; + } + + nir_print_instr(&alu->instr, stderr); + fprintf(stderr, "\n"); + UNREACHABLE("unhandled instruction"); + } +} + +static void +jay_emit_load_const(struct nir_to_jay_state *nj, nir_load_const_instr *lc) +{ + jay_builder *b = &nj->bld; + jay_def dst = nj_def(&lc->def); + assert(lc->def.num_components == 1 && "must be scalarized"); + + if (lc->def.bit_size == 64 && lc->value[0].u64 >> 32) { + jay_MOV_IMM64(b, dst, lc->value[0].u64); + } else { + jay_MOV(b, dst, lc->value[0].u32); + } +} + +static jay_def +jay_resource_handle(jay_builder *b, + nir_src *nsrc, + unsigned *bti_const, + bool *internal, + bool *bindless) +{ + if (!nsrc) { + return jay_null(); + } + + nir_intrinsic_instr *rin = nir_src_as_intrinsic(*nsrc); + + if (nir_src_is_const(*nsrc)) { + *bti_const = nir_src_as_uint(*nsrc); + return jay_null(); + } else if (!rin || rin->intrinsic != nir_intrinsic_resource_intel) { + return nj_src(*nsrc); + } + + uint32_t flags = nir_intrinsic_resource_access_intel(rin); + if (internal) { + *internal = !!(flags & nir_resource_intel_internal); + } + if (bindless) { + *bindless = !!(flags & nir_resource_intel_bindless); + } + + if (nir_src_is_const(rin->src[1])) { + *bti_const = nir_src_as_uint(rin->src[1]); + return jay_null(); + } else { + return nj_src(rin->src[1]); + } +} + +static inline enum lsc_flush_type +translate_flush_type(nir_intrinsic_instr *intr) +{ + switch (nir_intrinsic_memory_semantics(intr)) { + case NIR_MEMORY_ACQUIRE: + return LSC_FLUSH_TYPE_INVALIDATE; + case NIR_MEMORY_RELEASE: + return LSC_FLUSH_TYPE_CLEAN; + case NIR_MEMORY_ACQ_REL: + return LSC_FLUSH_TYPE_EVICT; + case NIR_MEMORY_MAKE_AVAILABLE: + case NIR_MEMORY_MAKE_VISIBLE: + default: + UNREACHABLE("unexpected memory semantic"); + } +} + +static void +emit_lsc_fence(struct nir_to_jay_state *nj, + nir_intrinsic_instr *intr, + enum brw_sfid sfid) +{ + bool device = nir_intrinsic_memory_scope(intr) >= SCOPE_QUEUE_FAMILY; + enum lsc_fence_scope scope = device ? LSC_FENCE_TILE : LSC_FENCE_THREADGROUP; + enum lsc_flush_type type = + sfid == BRW_SFID_SLM ? 
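+      /* SLM fences take no flush type; only image/buffer/URB traffic needs
+       * cache maintenance here.
+       */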
+      LSC_FLUSH_TYPE_NONE : translate_flush_type(intr);
+
+   jay_def notif = jay_alloc_def(&nj->bld, UGPR, jay_ugpr_per_grf(nj->s));
+   uint32_t desc = lsc_fence_msg_desc(nj->s->devinfo, scope, type, false);
+
+   jay_SEND(&nj->bld, .sfid = sfid, .msg_desc = desc, .srcs = &nj->payload.u0,
+            .nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true, .dst = notif);
+}
+
+static void
+jay_emit_memory_barrier(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
+{
+   nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
+
+   jay_SYNC(&nj->bld, TGL_SYNC_ALLWR);
+
+   if (modes & nir_var_image) {
+      emit_lsc_fence(nj, intr, BRW_SFID_TGM);
+      assert(!nj->nir->info.use_lowered_image_to_global && "fix common code");
+   }
+
+   if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
+      emit_lsc_fence(nj, intr, BRW_SFID_UGM);
+   }
+
+   if (modes & (nir_var_shader_out | nir_var_mem_task_payload)) {
+      emit_lsc_fence(nj, intr, BRW_SFID_URB);
+   }
+
+   if ((modes & nir_var_mem_shared) &&
+       !jay_workgroup_is_one_subgroup(&nj->bld, nj->nir)) {
+      emit_lsc_fence(nj, intr, BRW_SFID_SLM);
+   }
+}
+
+static void
+jay_emit_signal_barrier(struct nir_to_jay_state *nj)
+{
+   jay_builder *b = &nj->bld;
+
+   /* Signal barrier / Active threads only (BSpec 72052).
+    *
+    * Source 0 is the number of subgroups in [31:24], which comes from the u0.2
+    * payload in [31:24]. Mask out the other bits, then replicate to [23:16].
+    *
+    * TODO: This can be done faster with a SIMD2 8-bit move.
+    */
+   jay_def a = jay_AND_u32(b, jay_extract(nj->payload.u0, 2), 0xff000000);
+   jay_def m2 = jay_OR_u32(b, a, jay_SHR_u32(b, a, 8));
+
+   /* Use an active threads only barrier. TODO: I think we can optimize. */
+   if (b->shader->devinfo->ver >= 20) {
+      m2 = jay_OR_u32(b, m2, BITFIELD_BIT(8));
+   }
+
+   uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
+   indices[2] = jay_index(m2);
+   jay_def zipped = jay_collect(b, UGPR, indices, 3);
+
+   jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY,
+            .msg_desc = BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG, .srcs = &zipped,
+            .nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true);
+
+   jay_SYNC(b, TGL_SYNC_BAR);
+}
+
+static void
+jay_emit_derivative(jay_builder *b,
+                    jay_def dst,
+                    nir_intrinsic_instr *intr,
+                    enum jay_quad_swizzle swz0,
+                    enum jay_quad_swizzle swz1)
+{
+   assert(intr->def.bit_size == 32 && "todo");
+   jay_def val = nj_src(intr->src[0]);
+
+   jay_ADD(b, JAY_TYPE_F32, dst, jay_QUAD_SWIZZLE_u32(b, val, swz1),
+           jay_negate(jay_QUAD_SWIZZLE_u32(b, val, swz0)));
+}
+
+static void
+jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr)
+{
+   jay_def data = nj_src(intr->src[0]);
+   jay_def srcs[8];
+
+   /* Optimize unconditional discards. Should probably do this in NIR. */
+   bool trivial =
+      nir_src_is_const(intr->src[2]) && nir_src_as_bool(intr->src[2]);
+
+   for (unsigned i = 0; i < nir_src_num_components(intr->src[0]); ++i) {
+      srcs[i] = trivial ? jay_INDETERMINATE_u32(b) :
+                jay_as_gpr(b, jay_extract(data, i));
+   }
+
+   jay_inst *send =
+      jay_SEND(b, .sfid = BRW_SFID_RENDER_CACHE, .check_tdr = true,
+               .msg_desc = nir_scalar_as_uint(nir_scalar_chase_movs(
+                              nir_get_scalar(intr->src[1].ssa, 0))) |
+                           (nir_scalar_as_uint(nir_scalar_chase_movs(
+                               nir_get_scalar(intr->src[1].ssa, 1)))
+                            << 32),
+               .srcs = srcs, .nr_srcs = nir_src_num_components(intr->src[0]),
+               .type = JAY_TYPE_U32, .eot = nir_intrinsic_eot(intr));
+
+   /* Handle the disable predicate. It is logically inverted.
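+    * i.e. the send only executes for lanes where src[2] is false, hence the
+    * jay_negate below.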
*/ + if (!nir_src_is_const(intr->src[2]) || nir_src_as_bool(intr->src[2])) { + jay_add_predicate(b, send, jay_negate(nj_src(intr->src[2]))); + } +} + +static enum lsc_data_size +lsc_bits_to_data_size(unsigned bit_size) +{ + /* clang-format off */ + switch (bit_size / 8) { + case 1: return LSC_DATA_SIZE_D8U32; + case 2: return LSC_DATA_SIZE_D16U32; + case 4: return LSC_DATA_SIZE_D32; + case 8: return LSC_DATA_SIZE_D64; + default: UNREACHABLE("Unsupported data size."); + } + /* clang-format on */ +} + +static enum lsc_opcode +lsc_op_for_atomic(nir_atomic_op op) +{ + /* clang-format off */ + switch (op) { + case nir_atomic_op_iadd: return LSC_OP_ATOMIC_ADD; + case nir_atomic_op_imin: return LSC_OP_ATOMIC_MIN; + case nir_atomic_op_umin: return LSC_OP_ATOMIC_UMIN; + case nir_atomic_op_imax: return LSC_OP_ATOMIC_MAX; + case nir_atomic_op_umax: return LSC_OP_ATOMIC_UMAX; + case nir_atomic_op_iand: return LSC_OP_ATOMIC_AND; + case nir_atomic_op_ior: return LSC_OP_ATOMIC_OR; + case nir_atomic_op_ixor: return LSC_OP_ATOMIC_XOR; + case nir_atomic_op_xchg: return LSC_OP_ATOMIC_STORE; + case nir_atomic_op_cmpxchg: return LSC_OP_ATOMIC_CMPXCHG; + case nir_atomic_op_fmin: return LSC_OP_ATOMIC_FMIN; + case nir_atomic_op_fmax: return LSC_OP_ATOMIC_FMAX; + case nir_atomic_op_fcmpxchg: return LSC_OP_ATOMIC_FCMPXCHG; + case nir_atomic_op_fadd: return LSC_OP_ATOMIC_FADD; + default: UNREACHABLE("Unsupported NIR atomic"); + } + /* clang-format on */ +} + +static jay_def +jay_src_as_strided(jay_builder *b, + jay_def x, + unsigned element_sz, + enum jay_file dst_file) +{ + if (dst_file == UGPR) { + assert(jay_is_uniform(x) && "Uniform dests require uniform sources"); + + if (x.file != UGPR) { + jay_def tmp = jay_alloc_def(b, UGPR, jay_num_values(x)); + jay_copy(b, tmp, x); + x = tmp; + } + + uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 }; + unsigned nr = jay_num_values(x) * jay_ugpr_per_grf(b->shader); + assert(nr < ARRAY_SIZE(indices)); + + for (unsigned i = 0; i < jay_num_values(x) / element_sz; ++i) { + for (unsigned j = 0; j < element_sz; ++j) { + indices[(i * jay_ugpr_per_grf(b->shader)) + j] = + jay_channel(x, (i * element_sz) + j); + } + } + + return jay_collect(b, UGPR, indices, nr); + } else { + /* Could be a GPR or UGPR source */ + assert(dst_file == GPR); + return jay_as_gpr(b, x); + } +} + +static jay_def +jay_scratch_surface(struct nir_to_jay_state *nj) +{ + if (jay_is_null(nj->payload.scratch_surface)) { + jay_function *func = nj->f; + assert(func->is_entrypoint && "todo: this needs ABI"); + + jay_builder b = jay_init_builder(func, jay_before_function(func)); + nj->payload.scratch_surface = jay_alloc_def(&b, J_ADDRESS, 1); + + jay_def u0_5 = jay_extract(nj->payload.u0, 5); + jay_def state = jay_AND_u32(&b, u0_5, ~BITFIELD_MASK(10)); + jay_SHR(&b, JAY_TYPE_U32, nj->payload.scratch_surface, state, 4); + } + + return nj->payload.scratch_surface; +} + +static void +jay_emit_mem_access(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) +{ + jay_builder *b = &nj->bld; + bool slm = nir_is_shared_access(intr); + bool tgm = nir_intrinsic_has_image_dim(intr); + bool urb = intr->intrinsic == nir_intrinsic_store_urb_lsc_intel || + intr->intrinsic == nir_intrinsic_store_urb_vec4_intel; + enum brw_sfid sfid = slm ? BRW_SFID_SLM : + tgm ? BRW_SFID_TGM : + urb ? 
BRW_SFID_URB : + BRW_SFID_UGM; + + nir_src *data_src = nir_get_io_data_src(intr); + bool scratch = intr->intrinsic == nir_intrinsic_load_scratch_intel || + intr->intrinsic == nir_intrinsic_store_scratch_intel; + + enum lsc_opcode op; + if (nir_intrinsic_has_atomic_op(intr)) + op = lsc_op_for_atomic(nir_intrinsic_atomic_op(intr)); + else if (sfid == BRW_SFID_TGM) + op = data_src ? LSC_OP_STORE_CMASK : LSC_OP_LOAD_CMASK; + else + op = data_src ? LSC_OP_STORE : LSC_OP_LOAD; + + nir_src *bti = nir_get_io_index_src(intr), *ubo = NULL; + nir_src *offset_src = tgm ? &intr->src[1] : nir_get_io_offset_src(intr); + + if (intr->intrinsic == nir_intrinsic_load_ubo || + intr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel) { + ubo = bti; + bti = NULL; + b->shader->prog_data->base.has_ubo_pull = true; + } + + const struct intel_device_info *devinfo = b->shader->devinfo; + bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest; + jay_def data = data_src ? nj_src(*data_src) : jay_null(); + unsigned bti_const = 0; + bool internal = false; + bool bindless = false; + jay_def bti_indirect = + jay_resource_handle(b, bti ?: ubo, &bti_const, &internal, &bindless); + jay_def offset = nj_src(*offset_src); + nir_def *ndata = data_src ? data_src->ssa : &intr->def; + jay_def dst = has_dest ? nj_def(&intr->def) : jay_null(); + int32_t base_offset = + nir_intrinsic_has_base(intr) ? nir_intrinsic_base(intr) : 0; + + /* Optimize increment/decrement */ + if (op == LSC_OP_ATOMIC_ADD && nir_src_is_const(*data_src)) { + int64_t add_val = nir_src_as_int(*data_src); + if (add_val == 1 || add_val == -1) { + op = add_val == 1 ? LSC_OP_ATOMIC_INC : LSC_OP_ATOMIC_DEC; + data = jay_null(); + } + } + + /* Pack the coordinates. TODO: MSAA */ + if (tgm) { + unsigned nr = nir_image_intrinsic_coord_components(intr); + offset = jay_extract_range(offset, 0, nr); + } + + internal |= scratch; + enum lsc_addr_surface_type surf_type = internal ? LSC_ADDR_SURFTYPE_SS : + bindless ? LSC_ADDR_SURFTYPE_BSS : + (bti || ubo) ? LSC_ADDR_SURFTYPE_BTI : + LSC_ADDR_SURFTYPE_FLAT; + + bool a64 = surf_type == LSC_ADDR_SURFTYPE_FLAT && sfid == BRW_SFID_UGM; + enum lsc_addr_size addr_size = a64 ? LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32; + enum jay_type offset_type = a64 ? JAY_TYPE_U64 : JAY_TYPE_U32; + + bool cmask = op == LSC_OP_LOAD_CMASK || op == LSC_OP_STORE_CMASK; + bool uniform = !(has_dest && dst.file != UGPR); + + if (nir_intrinsic_has_align(intr)) { + assert(nir_intrinsic_align(intr) >= (ndata->bit_size / 8)); + } + + if (!has_dest) { + uniform &= jay_is_null(data) || data.file == UGPR; + uniform &= jay_is_null(offset) || offset.file == UGPR; + uniform &= !(cmask || urb); + } + + /* Per bspec 57330, 8-bit/16-bit are not supported for transpose */ + bool transpose = uniform && !cmask && ndata->bit_size >= 32; + bool scalar_uniform = uniform && !cmask && ndata->bit_size < 32; + + if (!uniform) { + offset = jay_as_gpr(b, offset); + } else if (!transpose) { + offset = jay_src_as_strided(b, offset, a64 ? 2 : 1, UGPR); + } + + if (!jay_is_null(data) && !transpose && !scalar_uniform) + data = jay_as_gpr(b, data); + + unsigned access = + nir_intrinsic_has_access(intr) ? nir_intrinsic_access(intr) : 0; + + bool volatile_access = access & ACCESS_VOLATILE; + bool coherent_access = access & ACCESS_COHERENT; + + /* Bspec: Atomic instruction -> Cache section: + * + * Atomic messages are always forced to "un-cacheable" in the L1 + * cache. 
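+    *
+    * (Hence the L1UC cache policies chosen for atomics in the cache mode
+    * selection below.)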
+ * + * Bspec: Overview of memory Access: + * + * If a read from a Null tile gets a cache-hit in a virtually-addressed + * GPU cache, then the read may not return zeroes. + * + * If a shader writes to a null tile and wants to be able to read it back + * as zero, it will use the 'volatile' decoration for the access, otherwise + * the compiler may choose to optimize things out, breaking the + * residencyNonResidentStrict guarantees. Due to the above, we need to make + * these operations uncached. + */ + unsigned cache = + urb ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) : + lsc_opcode_is_atomic(op) ? + LSC_CACHE(devinfo, STORE, L1UC_L3WB) : + volatile_access ? + (devinfo->ver >= 20 ? + /* Xe2 has a better L3 that can deal with null tiles.*/ + (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) : + LSC_CACHE(devinfo, LOAD, L1UC_L3C)) : + /* On older platforms, all caches have to be bypassed. */ + (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) : + LSC_CACHE(devinfo, LOAD, L1UC_L3UC))) : + /* Skip L1 for coherent accesses */ + coherent_access ? (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) : + LSC_CACHE(devinfo, LOAD, L1UC_L3C)) : + !has_dest ? LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) : + LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS); + + unsigned max_imm_bits = brw_max_immediate_offset_bits(surf_type); + assert(base_offset >= u_intN_min(max_imm_bits)); + assert(base_offset <= u_intN_max(max_imm_bits)); + assert(base_offset == 0 || sfid != BRW_SFID_TGM); + + const unsigned base_offs_bits = + util_bitpack_sint(base_offset, 0, max_imm_bits - 1); + + unsigned nr = ndata->num_components; + uint64_t desc = + lsc_msg_desc(devinfo, op, surf_type, addr_size, + lsc_bits_to_data_size(ndata->bit_size), + cmask ? BITFIELD_MASK(nr) : nr, transpose, cache); + + jay_def tmp = dst; + + if (dst.file == UGPR) { + if (transpose) { + /* Transpose writes whole GRFs, so round up */ + tmp = jay_alloc_def(b, UGPR, + ALIGN_POT(jay_num_values(dst), + jay_ugpr_per_grf(b->shader))); + } else { + /* Without transpose we write at GRF granularity. Pad out. */ + tmp = jay_alloc_def(b, UGPR, + jay_ugpr_per_grf(b->shader) * jay_num_values(dst)); + } + } + + jay_def srcs[] = { offset, data }; + + /* Second data source immediately follows the first */ + if (op == LSC_OP_ATOMIC_CMPXCHG || op == LSC_OP_ATOMIC_FCMPXCHG) { + jay_def data2 = nj_src(*(data_src + 1)); + + if (!transpose) { + data2 = jay_as_gpr(b, data2); + } + + srcs[1] = jay_collect_two(b, data, data2); + } + + jay_def ex_desc = jay_null(); + uint32_t ex_desc_imm = 0; + if (scratch) { + ex_desc = jay_scratch_surface(nj); + + if (has_dest) { + b->shader->fills++; + } else { + b->shader->spills++; + } + } else if (surf_type == LSC_ADDR_SURFTYPE_FLAT) { + desc |= ((uint64_t) lsc_flat_ex_desc(devinfo, base_offs_bits) << 32); + } else if (jay_is_null(bti_indirect)) { + desc |= + ((uint64_t) lsc_bti_ex_desc(devinfo, bti_const, base_offs_bits) << 32); + } else if (!jay_is_null(bti_indirect)) { + ex_desc = bti_indirect; + + if (surf_type == LSC_ADDR_SURFTYPE_SS || + surf_type == LSC_ADDR_SURFTYPE_BSS) { + ex_desc_imm = SET_BITS(GET_BITS(base_offs_bits, 16, 4), 31, 19) | + SET_BITS(GET_BITS(base_offs_bits, 3, 0), 15, 12); + } else { + /* TODO: Move the SHL to NIR for CSE? */ + assert(surf_type == LSC_ADDR_SURFTYPE_BTI); + assert(base_offs_bits == 0); + ex_desc = jay_SHL_u32(b, bti_indirect, 24); + } + } + + enum jay_type data_type = jay_type(JAY_TYPE_U, MAX2(ndata->bit_size, 32)); + jay_SEND(b, .sfid = sfid, .msg_desc = desc, .srcs = srcs, + .nr_srcs = jay_is_null(data) ? 
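+                           /* address only */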
1 : 2, .dst = tmp, .type = data_type, + .src_type = { offset_type, data_type }, .uniform = uniform, + .bindless = surf_type == LSC_ADDR_SURFTYPE_BSS, .ex_desc = ex_desc, + .ex_desc_imm = ex_desc_imm); + + if (has_dest && !jay_defs_equivalent(tmp, dst)) { + jay_copy_strided(b, dst, tmp, !transpose); + } +} + +static void +jay_emit_barycentric(struct nir_to_jay_state *nj, + nir_intrinsic_instr *intr, + enum intel_barycentric_mode mode) +{ + assert(nj->s->stage == MESA_SHADER_FRAGMENT); + enum glsl_interp_mode glsl_mode = nir_intrinsic_interp_mode(intr); + + if (glsl_mode == INTERP_MODE_NOPERSPECTIVE) { + mode += INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL; + } else { + assert(glsl_mode == INTERP_MODE_SMOOTH); + } + + jay_copy(&nj->bld, nj_def(&intr->def), nj->payload.fs.bary[mode]); +} + +static void +jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) +{ + jay_shader *s = nj->s; + jay_function *f = nj->f; + jay_builder *b = &nj->bld; + jay_cs_payload *cs = + mesa_shader_stage_is_compute(s->stage) ? &nj->payload.cs : NULL; + + const bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest; + jay_def dst = has_dest ? nj_def(&intr->def) : jay_null(); + + switch (intr->intrinsic) { + case nir_intrinsic_resource_intel: + /* No code to generate here */ + break; + + case nir_intrinsic_global_atomic: + case nir_intrinsic_global_atomic_swap: + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: + case nir_intrinsic_load_global_constant_uniform_block_intel: + case nir_intrinsic_load_scratch_intel: + case nir_intrinsic_load_shared: + case nir_intrinsic_load_shared_uniform_block_intel: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ssbo_intel: + case nir_intrinsic_load_ssbo_uniform_block_intel: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ubo_uniform_block_intel: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_store_global: + case nir_intrinsic_store_urb_lsc_intel: + case nir_intrinsic_store_scratch_intel: + case nir_intrinsic_store_shared: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_ssbo_intel: + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: + jay_emit_mem_access(nj, intr); + break; + + case nir_intrinsic_load_push_data_intel: { + unsigned sz = intr->def.bit_size / 8; + unsigned base_offset = nir_intrinsic_base(intr); + assert(util_is_aligned(base_offset, sz)); + + if (nir_src_is_const(intr->src[0])) { + unsigned load_offset = nir_src_as_uint(intr->src[0]); + unsigned offs = base_offset + load_offset; + assert(util_is_aligned(load_offset, sz)); + + if (sz >= 4) { + jay_foreach_comp(dst, c) { + jay_MOV(b, jay_extract(dst, c), + nj->payload.push_data[(offs / 4) + c]); + } + } else { + jay_foreach_comp(dst, c) { + unsigned comp_offs = offs + c * sz; + if (util_is_aligned(comp_offs, 4)) { + jay_MOV(b, jay_extract(dst, c), + nj->payload.push_data[comp_offs / 4]); + } else { + jay_CVT(b, JAY_TYPE_U32, jay_extract(dst, c), + nj->payload.push_data[comp_offs / 4], + JAY_TYPE_U | intr->def.bit_size, JAY_ROUND, + (comp_offs % 4) / sz); + } + } + } + } else { + UNREACHABLE("todo: indirect push data"); + } + break; + } + + case 
nir_intrinsic_barrier: + if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) { + jay_emit_memory_barrier(nj, intr); + } + + if (cs) { + if (nir_intrinsic_execution_scope(intr) == SCOPE_WORKGROUP) { + if (jay_workgroup_is_one_subgroup(b, nj->nir)) { + // XXX: when we have a scheduler, jay_SCHEDULE_BARRIER(b); + } else { + jay_emit_signal_barrier(nj); + s->prog_data->cs.uses_barrier = true; + } + } + } else { + // XXX: when we have a scheduler, jay_SCHEDULE_BARRIER(b); + } + break; + + case nir_intrinsic_begin_invocation_interlock: + case nir_intrinsic_end_invocation_interlock: + UNREACHABLE("TODO"); + + case nir_intrinsic_load_reloc_const_intel: + jay_RELOC(b, dst, nir_intrinsic_param_idx(intr), + nir_intrinsic_base(intr)); + break; + + case nir_intrinsic_store_render_target_intel: + assert(nj->nir->info.stage == MESA_SHADER_FRAGMENT); + jay_emit_fb_write(b, intr); + break; + + case nir_intrinsic_shader_clock: + /* We must access the timestamp register atomically, but 64-bit + * instructions cannot read ARF. Instead use a 2x32-bit vectorized move. + */ + assert(dst.file == UGPR && "required for vectorization"); + jay_MOV(b, dst, jay_contiguous_def(J_ARF, JAY_ARF_TIMESTAMP, 2))->type = + JAY_TYPE_U32; + break; + + case nir_intrinsic_load_sample_mask_in: { + jay_def mask = jay_extract(nj->payload.u0, 15); + + if (nj->s->dispatch_width == 32) { + /* TODO: Optimize */ + jay_def hi = jay_extract(nj->payload.u1, 15); + mask = jay_BFI2_u32(b, 0xffff0000, hi, mask); + } + + jay_MOV(b, dst, mask); + break; + } + + case nir_intrinsic_load_subgroup_invocation: + /* TODO: Lower this in NIR? */ + jay_CVT(b, JAY_TYPE_U32, dst, nj->payload.lane_id, JAY_TYPE_U16, + JAY_ROUND, 0); + break; + + case nir_intrinsic_demote: + case nir_intrinsic_demote_if: + /* TODO: Already lowered, but need to implement for performance. */ + break; + + case nir_intrinsic_ddx: + case nir_intrinsic_ddx_coarse: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX, + JAY_QUAD_SWIZZLE_YYYY); + break; + case nir_intrinsic_ddx_fine: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXZZ, + JAY_QUAD_SWIZZLE_YYWW); + break; + + case nir_intrinsic_ddy: + case nir_intrinsic_ddy_coarse: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX, + JAY_QUAD_SWIZZLE_ZZZZ); + break; + case nir_intrinsic_ddy_fine: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XYXY, + JAY_QUAD_SWIZZLE_ZWZW); + break; + + case nir_intrinsic_first_invocation: + jay_MOV(b, dst, emit_active_lane(nj)); + break; + + case nir_intrinsic_read_first_invocation: + jay_MOV(b, dst, emit_uniformize(nj, nj_src(intr->src[0]))); + break; + + case nir_intrinsic_ballot: + case nir_intrinsic_ballot_relaxed: { + jay_def val = nj_src(intr->src[0]); + if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) { + val = emit_active_lane_mask(nj); + } else if (val.file == UFLAG) { + /* Move to a FLAG temporary so we can ballot it. */ + val = jay_MOV(b, jay_alloc_def(b, FLAG, 1), val)->dst; + } else { + assert(val.file == FLAG); + } + + assert(intr->def.bit_size == b->shader->dispatch_width); + jay_MOV(b, dst, val); + break; + } + + /* We prefer to inverse_ballot by copying a UGPR to the flag. If we have a + * GPR input, we could uniformize (as behaviour is undefined for + * non-uniform inputs) but a lowered bit extract is cheaper than uniformize. 
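+    *
+    * The GPR path below amounts to, per lane:
+    *
+    *    flag = ((x >> lane_id) & 1) != 0
+    *
+    * with the comparison folded into a conditional modifier on the AND.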
+ */ + case nir_intrinsic_inverse_ballot: { + assert(dst.file == FLAG); + jay_def x = nj_src(intr->src[0]); + if (x.file == GPR) { + jay_def shr = jay_SHR_u32(b, x, nj->payload.lane_id); + jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), shr, 1); + jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_NE); + } else { + jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width; + } + + break; + } + + case nir_intrinsic_load_local_invocation_id: + assert(cs); + UNREACHABLE("todo: implement me from payload"); + jay_copy(b, dst, cs->local_invocation_ids); + break; + + case nir_intrinsic_load_barycentric_pixel: + jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL); + break; + + case nir_intrinsic_load_barycentric_sample: + jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE); + break; + + case nir_intrinsic_load_barycentric_centroid: + jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID); + break; + + case nir_intrinsic_load_pixel_coord_intel: + jay_MOV(b, dst, nj->payload.fs.coord.xy); + break; + + case nir_intrinsic_load_frag_coord_z: + jay_MOV(b, dst, nj->payload.fs.coord.z); + break; + + case nir_intrinsic_load_frag_coord_w_rcp: + jay_MOV(b, dst, nj->payload.fs.coord.w); + break; + + case nir_intrinsic_load_urb_output_handle_intel: + jay_MOV(b, dst, nj->payload.urb_handle); + break; + + case nir_intrinsic_load_layer_id: + jay_EXTRACT_LAYER(b, dst, jay_extract(nj->payload.u0, 9), + payload_u1(nj, 9, 1)); + break; + + case nir_intrinsic_load_front_face: { + /* Bit 11 is facingness for the first polygon. TODO: Multipolygon. */ + jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), + jay_extract(nj->payload.u0, 9), BITFIELD_BIT(11)); + + /* The bit is actually backfacingness so check for equality with 0 */ + jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_EQ); + break; + } + + /* Sample ID comes in as 4-bit numbers in g1.0: + * + * 15:12 Slot 3 SampleID + * 11:8 Slot 2 SampleID + * 7:4 Slot 1 SampleID + * 3:0 Slot 0 SampleID + * + * Each slot corresponds to four channels, so we want to replicate each + * half-byte value to 4 channels in a row: + * + * dst+0: .7 .6 .5 .4 .3 .2 .1 .0 + * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0 + * + * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 + * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8 + * + * First, we read g1.0 with a <1,8,0>UB region, causing the first 8 + * channels to read the first byte (7:0), and the second group of 8 + * channels to read the second byte (15:8). Then, we shift right by + * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3 + * values into place. Finally, we AND with 0xf to keep the low nibble. + * + * According to the "PS Thread Payload for Normal Dispatch" + * pages on the BSpec, the sample ids are stored in R0.8/R1.8 + * on gfx20+ and in R1.0/R2.0 on gfx8+. 
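+    *
+    * Worked example (16 lanes): if the packed nibbles read 0x3210 (slots
+    * 0-3 holding sample IDs 0-3), lanes 0-7 load byte 0x10 and lanes 8-15
+    * load byte 0x32; after the shift and mask, lanes 0-3 get sample ID 0,
+    * lanes 4-7 get 1, lanes 8-11 get 2 and lanes 12-15 get 3.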
+ */ + case nir_intrinsic_load_sample_id: { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_EXTRACT_BYTE_PER_8LANES(b, x, jay_extract(nj->payload.u0, 8), + payload_u1(nj, 8, 1)); + jay_AND_U32_U16(b, dst, jay_SHR_ODD_SUBSPANS_BY_4_u16(b, x), 0xF); + break; + } + + case nir_intrinsic_load_input: + if (s->stage == MESA_SHADER_VERTEX) { + unsigned offs = nir_intrinsic_base(intr) * 4; + offs += nir_intrinsic_component(intr); + assert(intr->def.bit_size == 32 && "todo"); + + jay_copy(b, dst, + jay_collect_vectors(b, nj->payload.vs.attributes + offs, + intr->def.num_components)); + break; + } + + FALLTHROUGH; + case nir_intrinsic_load_fs_input_interp_deltas: { + assert(s->stage == MESA_SHADER_FRAGMENT); + unsigned location = nir_intrinsic_io_semantics(intr).location + + nir_src_as_uint(intr->src[0]); + unsigned i = (s->prog_data->fs.urb_setup[location] * 4) + + nir_intrinsic_component(intr); + + if (intr->intrinsic == nir_intrinsic_load_input) { + assert(intr->def.num_components == 1 && "should be scalarized"); + } + + /* Zeroth delta is the flat value */ + jay_copy(b, dst, nj->payload.fs.deltas[i]); + break; + } + + case nir_intrinsic_load_subgroup_id: + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + /* Subgroup ID in Thread Group is u0.2 bits 7:0 */ + jay_AND(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 0xFF); + break; + + case nir_intrinsic_load_num_subgroups: + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + /* Number of subgroups in Thread Group is u0.2 bits 31:24 */ + jay_SHR(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 24); + break; + + case nir_intrinsic_load_workgroup_id: + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + jay_MOV(b, jay_extract(dst, 0), jay_extract(nj->payload.u0, 1)); + jay_MOV(b, jay_extract(dst, 1), jay_extract(nj->payload.u0, 6)); + jay_MOV(b, jay_extract(dst, 2), jay_extract(nj->payload.u0, 7)); + break; + + case nir_intrinsic_shuffle_intel: { + jay_def data = nj_src(intr->src[0]); + + if (nir_src_is_const(intr->src[1])) { + /* Broadcast takes a lane index, with only 32-bit registers */ + jay_BROADCAST_IMM(b, dst, data, nir_src_as_uint(intr->src[1]) / 4); + } else { + /* Shuffle takes a byte index */ + jay_SHUFFLE(b, dst, data, nj_src(intr->src[1])); + } + + break; + } + + case nir_intrinsic_quad_broadcast: + jay_QUAD_SWIZZLE(b, dst, nj_src(intr->src[0]), + JAY_QUAD_SWIZZLE_XXXX + nir_src_as_uint(intr->src[1])); + break; + + case nir_intrinsic_load_inline_data_intel: { + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + b->shader->prog_data->cs.uses_inline_data = true; + + unsigned offset = nir_intrinsic_base(intr) / 4; + unsigned nr = jay_num_values(dst); + jay_copy(b, dst, jay_extract_range(nj->payload.inline_data, offset, nr)); + break; + } + + default: +#ifndef NDEBUG + assert(intr->intrinsic < nir_num_intrinsics); + fprintf(stdout, "intrinsic: %s\n", + nir_intrinsic_infos[intr->intrinsic].name); +#endif + UNREACHABLE("unknown intrinsic"); + } +} + +static bool +sampler_needs_header(enum brw_sampler_opcode op, + nir_texop nir_op, + const struct intel_device_info *devinfo) +{ + switch (op) { + case BRW_SAMPLER_OPCODE_SAMPLEINFO: + return true; + case BRW_SAMPLER_OPCODE_LD: + case BRW_SAMPLER_OPCODE_LD_LZ: + /* Xe3 HW does not seem to work unless we force a header. 
+    */
+      return devinfo->ver >= 30;
+   default:
+      return nir_op == nir_texop_tg4;
+   }
+}
+
+static void
+jay_emit_texture(struct nir_to_jay_state *nj, nir_tex_instr *tex)
+{
+   /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
+    *
+    *    "The Pixel Null Mask field, when enabled via the Pixel Null Mask
+    *     Enable will be incorrect for sample_c when applied to a surface with
+    *     64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
+    *     Enable may incorrectly report pixels as referencing a Null surface."
+    *
+    * We'll take care of this in NIR.
+    */
+   assert(!tex->is_sparse ||
+          nir_tex_instr_src_index(tex, nir_tex_src_comparator) == -1);
+
+   jay_builder *b = &nj->bld;
+   jay_def dst = nj_def(&tex->def);
+   jay_def tmp = dst;
+
+   const enum brw_sampler_opcode op = (enum brw_sampler_opcode)(
+      tex->backend_flags & ~BRW_TEX_INSTR_FUSED_EU_DISABLE);
+   const struct brw_sampler_payload_desc *payload_desc =
+      brw_get_sampler_payload_desc(op);
+
+   /* First deal with surface & sampler */
+   unsigned payload_type_bit_size = 0;
+   bool surface_bindless = false;
+   bool sampler_bindless = false;
+   jay_def surface, sampler, packed_offsets = jay_null();
+   jay_def payload[JAY_MAX_SAMPLER_MESSAGE_SIZE];
+   int i;
+   if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle)) >= 0) {
+      unsigned x;
+      surface =
+         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless);
+      if (jay_is_null(surface))
+         surface = jay_imm(x);
+      assert(tex->texture_index == 0);
+   } else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset)) >=
+              0) {
+      unsigned x;
+      surface =
+         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless);
+      if (jay_is_null(surface))
+         surface = jay_imm(x + tex->texture_index);
+      else if (tex->texture_index)
+         surface = jay_ADD_u32(b, surface, tex->texture_index);
+   } else {
+      surface = jay_imm(tex->texture_index);
+   }
+
+   if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle)) >= 0) {
+      unsigned x;
+      sampler =
+         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless);
+      if (jay_is_null(sampler))
+         sampler = jay_imm(x);
+      assert(tex->sampler_index == 0);
+   } else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset)) >=
+              0) {
+      unsigned x;
+      sampler =
+         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless);
+      if (jay_is_null(sampler))
+         sampler = jay_imm(x + tex->sampler_index);
+      else
+         sampler = jay_ADD_u32(b, sampler, tex->sampler_index);
+   } else {
+      sampler = jay_imm(tex->sampler_index);
+   }
+
+   surface = emit_uniformize(nj, surface);
+   sampler = emit_uniformize(nj, sampler);
+
+   /* Now the sampler payload */
+   bool has_offset_in_payload = false;
+   bool payload_uniform = true;
+   uint32_t n_sources = TEX_LOGICAL_SRC_PAYLOAD0;
+   for (uint32_t i = 0;
+        payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID;
+        i++) {
+      nir_tex_src_type nir_source;
+      unsigned nir_comp;
+
+#define P(name) BRW_SAMPLER_PAYLOAD_PARAM_##name
+#define S(name, component) \
+   do { \
+      nir_source = nir_tex_src_##name; \
+      nir_comp = component; \
+   } while (0)
+
+      struct brw_sampler_payload_src sampler_src = payload_desc->sources[i];
+
+      switch (sampler_src.param) {
+      case P(U):
+         S(coord, 0);
+         break;
+      case P(V):
+         S(coord, 1);
+         break;
+      case P(R):
+         S(coord, 2);
+         break;
+      case P(AI):
+         S(coord, 3);
+         break;
+      case P(BIAS):
+         S(bias, 0);
+         break;
+      case P(LOD):
+         S(lod, 0);
+         break;
+      case P(MLOD):
+         S(min_lod, 0);
+         break;
+      case P(REF):
+         S(comparator, 0);
+         break;
+      case P(DUDX):
+         S(ddx, 0);
+         break;
+      case P(DUDY):
+         S(ddy, 0);
+         break;
+      case P(DVDX):
+         S(ddx, 1);
+         break;
+      case P(DVDY):
+         S(ddy, 1);
+         break;
+      case P(DRDX):
+         S(ddx, 2);
+         break;
+      case P(DRDY):
+         S(ddy, 2);
+         break;
+      case P(SI):
+         S(ms_index, 0);
+         break;
+      case P(MCSL):
+         S(ms_mcs_intel, 0);
+         break;
+      case P(MCSH):
+         S(ms_mcs_intel, 1);
+         break;
+      case P(MCS0):
+         S(ms_mcs_intel, 0);
+         break;
+      case P(MCS1):
+         S(ms_mcs_intel, 1);
+         break;
+      case P(MCS2):
+         S(ms_mcs_intel, 2);
+         break;
+      case P(MCS3):
+         S(ms_mcs_intel, 3);
+         break;
+
+      case P(OFFU):
+         S(offset, 0);
+         has_offset_in_payload = true;
+         break;
+      case P(OFFV):
+         S(offset, 1);
+         has_offset_in_payload = true;
+         break;
+      case P(OFFUV4):
+      case P(OFFUVR4):
+      case P(OFFUV6):
+      case P(OFFUVR6):
+      case P(BIAS_OFFUV6):
+      case P(BIAS_OFFUVR4):
+      case P(LOD_OFFUV6):
+      case P(LOD_OFFUVR4):
+      case P(OFFUV4_R):
+      case P(OFFUV6_R):
+      case P(OFFUVR4_R):
+         /* There is no payload with two packed entries, so backend1 is
+          * always the single packed payload parameter. */
+         S(backend1, 0);
+         has_offset_in_payload = true;
+         break;
+
+      case P(BIAS_AI):
+      case P(LOD_AI):
+      case P(MLOD_R):
+         /* There is no payload with two packed entries, so backend1 is
+          * always the single packed payload parameter. */
+         S(backend1, 0);
+         break;
+
+      default:
+         UNREACHABLE("unhandled sampler param");
+      }
+
+#undef P
+#undef S
+
+      jay_def param_val = jay_null();
+
+      int j = nir_tex_instr_src_index(tex, nir_source);
+      if (j >= 0 && nir_comp < tex->src[j].src.ssa->num_components) {
+         param_val = jay_extract(nj_src(tex->src[j].src), nir_comp);
+
+         unsigned bitsize = nir_src_bit_size(tex->src[j].src);
+         assert(payload_type_bit_size == 0 || payload_type_bit_size == bitsize);
+         payload_type_bit_size = bitsize;
+      }
+
+      /* The hardware requires a LOD for buffer textures */
+      if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF &&
+          sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_LOD) {
+         sampler_src.optional = false;
+      }
+
+      /* Wa_14012688258:
+       *
+       *    Don't trim zeros at the end of payload for sample operations
+       *    in cube and cube arrays.
+       *
+       * Compiler should send U,V,R parameters even if V,R are 0.
+       */
+      if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
+          intel_needs_workaround(nj->devinfo, 14012688258) &&
+          (sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_U ||
+           sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_V ||
+           sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_R)) {
+         sampler_src.optional = false;
+      }
+
+      /* The number of sources is dictated by the last source that is either
+       * present or required; trailing optional sources that are zero can be
+       * dropped.
+       */
+      if (!sampler_src.optional || !jay_is_null(param_val))
+         n_sources = i + 1;
+
+      if (jay_is_null(param_val)) {
+         param_val = jay_alloc_def(b, dst.file, 1);
+         jay_MOV(b, param_val, 0);
+      }
+
+      payload[i] = param_val;
+      payload_uniform &= jay_is_uniform(payload[i]);
+   }
+
+   i = nir_tex_instr_src_index(tex, nir_tex_src_backend2);
+   if (i >= 0) {
+      packed_offsets = nj_src(tex->src[i].src);
+   }
+
+   /* Xe2+ should never use packed offsets since it has enough opcodes to
+    * handle any programmable offset.
+    */
+   assert(jay_is_null(packed_offsets) || nj->devinfo->ver < 20);
+
+   /* If the NIR instruction has an offset param but the sampler payload
+    * doesn't, we can put the offset into the header of the message.
+    *
+    * The restriction though is that it should be a constant value.
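+    *
+    * For example, a constant offset of (1, -2) is packed into the header
+    * dword built below as u = 0x1 in bits 11:8 and v = 0xe in bits 7:4.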
+ */ + int offs_idx = nir_tex_instr_src_index(tex, nir_tex_src_offset); + bool has_const_offsets = offs_idx != -1 && !has_offset_in_payload; + + bool is_high_sampler = !jay_is_imm(sampler) || jay_as_uint(sampler) >= 16; + bool residency = tex->is_sparse; + unsigned null_mask_component = 0; + + const bool needs_header = sampler_needs_header(op, tex->op, nj->devinfo) || + has_const_offsets || + !jay_is_null(packed_offsets) || + sampler_bindless || + is_high_sampler || + residency; + + uint8_t component_mask; + if (tex->op == nir_texop_tg4) { + component_mask = WRITEMASK_XYZW; + } else if (residency) { + /* intel_nir_lower_sparse guarantees that texturing operations only + * read the data, or the sparse residency code, but not both at once. + * + * We need to use UGPRs for the residency result because the sampler + * returns the null pixel mask in lane 0, regardless of lanemasking. + * + * Unfortunately, the sampler doesn't allow us to writemask out all + * four colour channels, so we have to needlessly return red. This + * isn't uniform data, but we store it in an array of UGPRs anyway + * in order to have a consistent def file. The colour data will be + * immediately dead anyway. + */ + assert(tex->op == nir_texop_sparse_residency_intel || + tex->op == nir_texop_sparse_residency_txf_intel); + assert(nir_def_components_read(&tex->def) == WRITEMASK_Y); + component_mask = WRITEMASK_X; + unsigned red_grfs = payload_uniform ? 1 : jay_grf_per_gpr(b->shader); + unsigned grfs = red_grfs + 1; + tmp = jay_alloc_def(b, UGPR, grfs * jay_ugpr_per_grf(b->shader)); + null_mask_component = red_grfs * jay_ugpr_per_grf(b->shader); + } else { + component_mask = nir_def_components_read(&tex->def); + + /* We can reduce the return length of the message to drop unused + * trailing components, but shrinking with a discontiguous mask + * requires a message header. We only do that if we need a header + * for other reasons, as it's more expensive than writing extra data. + */ + if (!needs_header) { + component_mask = + (uint8_t) BITFIELD_MASK(util_last_bit(component_mask)); + } + + /* TODO: Shrink 16-bit textures too. Shrinking is problematic for some + * component masks due to 32-bit granularity of ISA registers. 
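+    *
+    * For the 32-bit path above: a .xy read (mask 0b0011) just shortens the
+    * response length, while a .xz read (mask 0b0101) is widened to 0b0111
+    * unless a header is already present to writemask .y out.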
+    */
+      if (tex->def.bit_size != 32 || (jay_debug & JAY_DBG_NOOPT))
+         component_mask = nir_component_mask(tex->def.num_components);
+
+      /* If we shrunk the destination, we need a temporary */
+      if (component_mask != BITFIELD_MASK(tex->def.num_components)) {
+         tmp = jay_alloc_def(b, GPR, util_bitcount(component_mask));
+      }
+   }
+
+   /* SENDs always write entire GRFs so we need to pad out for uniform dests */
+   if (dst.file == UGPR && !residency) {
+      unsigned nr = jay_ugpr_per_grf(b->shader) * jay_num_values(tmp);
+      tmp = jay_alloc_def(b, UGPR, nr);
+   }
+
+   if (tex->op == nir_texop_texture_samples) {
+      assert(needs_header);
+      payload_type_bit_size = 32;
+      n_sources = 0;
+   }
+
+   jay_def header = jay_null();
+   if (needs_header) {
+      uint32_t header2;
+      if (tex->op == nir_texop_tg4) {
+         /* Gathers have a component but no write mask */
+         header2 = (tex->component << 16);
+      } else {
+         /* If present, the header write mask is inverted compared to NIR */
+         header2 = (~component_mask & 0xf) << 12;
+      }
+
+      if (residency)
+         header2 |= 1 << 23; /* g0.2 bit 23: Pixel Null Mask Enable */
+
+      if (has_const_offsets) {
+         const unsigned num_components = nir_tex_instr_src_size(tex, offs_idx);
+         for (unsigned i = 0; i < num_components; i++) {
+            nir_scalar s = nir_get_scalar(tex->src[offs_idx].src.ssa, i);
+            s = nir_scalar_chase_movs(s);
+            assert(nir_scalar_is_const(s));
+            int offset = nir_scalar_as_int(s);
+
+            /* Offsets are 4 bits each, in reversed order */
+            header2 |= (offset & 0xf) << ((2 - i) * 4);
+         }
+      }
+
+      /* Vectorized zeroing of the header. TODO: This can be optimized more. */
+      jay_def zeroes = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader));
+      jay_MOV(b, zeroes, 0);
+
+      jay_def ugprs[JAY_MAX_DEF_LENGTH];
+      jay_foreach_comp(zeroes, i) {
+         ugprs[i] = jay_extract(zeroes, i);
+      }
+
+      /* Set the main immediate part of the header */
+      if (header2 != 0) {
+         ugprs[2] = jay_MOV_u32(b, header2);
+      }
+
+      if (sampler_bindless) {
+         /* Bindless sampler handles aren't relative to the sampler state
+          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
+          * Instead, it's an absolute pointer relative to dynamic state base
+          * address.
+          *
+          * Sampler states are 16 bytes each and the pointer we give here has
+          * to be 32-byte aligned. In order to avoid more indirect messages
+          * than required, we assume that all bindless sampler states are
+          * 32-byte aligned. This sacrifices a bit of general state base
+          * address space but means we can do something more efficient in the
+          * shader.
+          */
+         ugprs[3] = sampler;
+      } else {
+         /* Select the default dynamic state base address + offset */
+         jay_def sampler_ptr = nj->payload.sampler_state_pointer;
+
+         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
+          * with the ones included in g0.3 bits 4:0. Mask them out.
+          */
+         if (b->shader->devinfo->ver >= 11) {
+            sampler_ptr = jay_AND_u32(b, sampler_ptr, INTEL_MASK(31, 5));
+         }
+
+         /* TODO: We should probably lower this in NIR.
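+          *
+          * E.g. for sampler index 20, the descriptor gets 20 % 16 = 4 and
+          * the pointer below is advanced by 16 * 16 B = 256 bytes.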
*/ + if (is_high_sampler) { + if (jay_is_imm(sampler)) { + unsigned s = jay_as_uint(sampler); + const int sampler_state_size_B = 16; + unsigned offs_B = ROUND_DOWN_TO(s, 16) * sampler_state_size_B; + assert(offs_B > 0 && "since s > 0"); + sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B); + } else { + jay_def offs_B = + jay_SHL_u32(b, jay_AND_u32(b, sampler, 0xf0), 4); + sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B); + } + } + + ugprs[3] = sampler_ptr; + } + /* Zip it all up into a vector of UGPRs which will RA to a single GRF */ + header = jay_collect_vectors(b, ugprs, jay_num_values(zeroes)); + } + + assert(payload_type_bit_size == 16 || payload_type_bit_size == 32); + unsigned simd_mode = 0; + unsigned simd_width = payload_uniform ? 1 : nj->s->dispatch_width; + if (nj->devinfo->ver < 20) { + if (payload_type_bit_size == 16) { + assert(nj->devinfo->ver >= 11); + simd_mode = simd_width <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H : + GFX10_SAMPLER_SIMD_MODE_SIMD16H; + } else { + simd_mode = simd_width <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : + BRW_SAMPLER_SIMD_MODE_SIMD16; + } + } else { + if (payload_type_bit_size == 16) { + simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H : + XE2_SAMPLER_SIMD_MODE_SIMD32H; + } else { + simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 : + XE2_SAMPLER_SIMD_MODE_SIMD32; + } + } + + uint64_t desc = 0; + jay_def desc_src = jay_null(), desc_ex_src = jay_null(); + + unsigned sampler_imm = 0; + if (jay_is_imm(sampler) && !sampler_bindless) { + sampler_imm = jay_as_uint(sampler) % 16; + } + + const unsigned msg_type = brw_get_sampler_hw_opcode(op); + bool is_16 = false; /* TODO */ + unsigned ret_type = is_16 ? GFX8_SAMPLER_RETURN_FORMAT_16BITS : + GFX8_SAMPLER_RETURN_FORMAT_32BITS; + + if (!surface_bindless && + jay_is_imm(surface) && + (jay_is_imm(sampler) || sampler_bindless)) { + desc = brw_sampler_desc(nj->devinfo, jay_as_uint(surface), sampler_imm, + msg_type, simd_mode, ret_type); + } else if (surface_bindless) { + /* Bindless surface */ + desc = brw_sampler_desc(nj->devinfo, GFX9_BTI_BINDLESS, sampler_imm, + msg_type, simd_mode, ret_type); + + /* For bindless samplers, the entire address is included in the message + * header so we can leave the portion in the message descriptor 0. + */ + if (!sampler_bindless && !jay_is_imm(sampler)) { + desc_src = jay_SHL_u32(b, sampler, 8); + } + + /* We assume that the driver provided the handle in the top 20 bits so + * we can use the surface handle directly as the extended descriptor. + */ + desc_ex_src = jay_alloc_def(b, J_ADDRESS, 1); + jay_MOV(b, desc_ex_src, surface); + } else { + /* Immediate portion of the descriptor */ + desc = brw_sampler_desc(nj->devinfo, 0, 0, msg_type, simd_mode, ret_type); + + if (sampler_bindless) { + desc_src = surface; + } else if (!sampler_bindless && jay_is_imm(sampler)) { + desc_src = jay_OR_u32(b, surface, jay_as_uint(sampler) << 8); + } else { + desc_src = jay_OR_u32(b, jay_SHL_u32(b, sampler, 8), surface); + } + + desc_src = jay_AND_u32(b, desc_src, 0xfff); + } + + if (n_sources > 2 || !jay_is_null(header)) { + for (unsigned i = 0; i < n_sources; ++i) { + payload[i] = + jay_src_as_strided(b, payload[i], 1, payload_uniform ? 
UGPR : GPR); + } + } + + enum jay_type src_type = jay_type(JAY_TYPE_U, payload_type_bit_size); + jay_SEND(b, .sfid = BRW_SFID_SAMPLER, .msg_desc = desc, .desc = desc_src, + .ex_desc = desc_ex_src, .header = header, .srcs = payload, + .nr_srcs = n_sources, .type = JAY_TYPE_U32, + .src_type = { src_type }, .dst = tmp, .uniform = payload_uniform, + .bindless = surface_bindless); + + /* If we sampled into a temporary, copy out to the final */ + if (residency) { + jay_MOV(b, jay_extract(dst, 1), jay_extract(tmp, null_mask_component)); + } else if (!jay_defs_equivalent(dst, tmp)) { + unsigned i = 0; + unsigned tmp_stride = dst.file == UGPR ? jay_ugpr_per_grf(b->shader) : 1; + + u_foreach_bit(c, component_mask) { + jay_MOV(b, jay_extract(dst, c), jay_extract(tmp, (i++) * tmp_stride)); + } + } + + if (mesa_shader_stage_is_compute(b->shader->stage)) { + b->shader->prog_data->cs.uses_sampler |= !nir_tex_instr_is_query(tex); + } +} + +static void +jay_emit_jump(struct nir_to_jay_state *nj, nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + jay_block_add_successor(nj->current_block, nj->break_block); + jay_BREAK(&nj->bld); + break; + case nir_jump_halt: + // TODO: Do we want a predicated EOT here, or a jump to the end? + assert(!"TODO: implement HALT"); + break; + case nir_jump_return: + /* Should be lowered */ + default: + UNREACHABLE("unknown jump"); + } +} + +static void +jay_emit_instr(struct nir_to_jay_state *nj, jay_block *block, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + jay_emit_alu(nj, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_intrinsic: + jay_emit_intrinsic(nj, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_tex: + jay_emit_texture(nj, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_load_const: + jay_emit_load_const(nj, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_phi: + case nir_instr_type_undef: { + jay_def def = nj_def(nir_instr_def(instr)); + + jay_foreach_comp(def, c) { + if (instr->type == nir_instr_type_phi) { + jay_PHI_DST(&nj->bld, jay_extract(def, c)); + } else { + jay_INDETERMINATE(&nj->bld, jay_extract(def, c)); + } + } + + break; + } + + case nir_instr_type_jump: + jay_emit_jump(nj, nir_instr_as_jump(instr)); + break; + + case nir_instr_type_deref: + UNREACHABLE("All derefs should've been lowered"); + + default: + UNREACHABLE("unknown instruction type"); + } +} + +static jay_block * +jay_create_block(struct nir_to_jay_state *nj) +{ + jay_block *block = jay_new_block(nj->f); + block->indent = nj->indent; + return block; +} + +static jay_inst * +jay_block_ending_unconditional_jump(jay_block *block) +{ + jay_inst *jump = jay_block_ending_jump(block); + return jump && !jump->predication ? 
jump : NULL; +} + +static void +jay_emit_if(struct nir_to_jay_state *nj, nir_if *nif) +{ + jay_builder *b = &nj->bld; + jay_def condition = nj_src(nif->condition); + + jay_block *before_block = nj->current_block; + jay_block *after_block = jay_create_block(nj); + + /* Push */ + ++nj->indent; + + jay_block *else_first = jay_create_block(nj); + + jay_block *then_first = jay_emit_cf_list(nj, &nif->then_list); + jay_block *then_last = nj->current_block; + + nj->after_block = else_first; + + jay_block *else_first_2 = jay_emit_cf_list(nj, &nif->else_list); + jay_block *else_last = nj->current_block; + assert(else_first == else_first_2); + + /* Pop */ + --nj->indent; + + jay_block_add_successor(before_block, then_first); + jay_block_add_successor(before_block, else_first); + + if (!jay_block_ending_unconditional_jump(then_last)) + jay_block_add_successor(then_last, after_block); + + if (!jay_block_ending_unconditional_jump(else_last)) + jay_block_add_successor(else_last, after_block); + + nj->after_block = after_block; + + /* Emit the if-else-endif sequence */ + b->cursor = jay_after_block(before_block); + jay_add_predicate(b, jay_IF(b), condition); + + b->cursor = jay_before_block(else_first); + jay_ELSE(b); + + b->cursor = jay_after_block(else_last); + jay_ENDIF(b); +} + +static void +jay_emit_loop(struct nir_to_jay_state *nj, nir_loop *nloop) +{ + assert(!nir_loop_has_continue_construct(nloop)); + + jay_builder *b = &nj->bld; + jay_block *saved_break = nj->break_block; + + /* Make the block that will be after the loop exit */ + nj->break_block = jay_create_block(nj); + ++nj->indent; + + /* Make a block for the loop body, which is also the loop header */ + jay_block *loop_header = jay_create_block(nj); + loop_header->loop_header = true; + + /* The current block falls through to the start of the loop */ + jay_block_add_successor(nj->current_block, loop_header); + + /* Emit the loop body */ + nj->after_block = loop_header; + jay_emit_cf_list(nj, &nloop->body); + + /* Emit the backedge */ + jay_inst *jump = jay_block_ending_jump(nj->current_block); + if (jump && jump->op == JAY_OPCODE_BREAK) { + jump->op = JAY_OPCODE_LOOP_ONCE; + } else { + jay_block_add_successor(nj->current_block, loop_header); + jay_WHILE(b); + } + + /* Pop */ + --nj->indent; + nj->after_block = nj->break_block; + nj->break_block = saved_break; + + b->cursor = jay_after_block(nj->after_block); +} + +static jay_block * +jay_emit_block(struct nir_to_jay_state *nj, nir_block *nb) +{ + jay_builder *b = &nj->bld; + + if (nj->after_block) { + nj->current_block = nj->after_block; + nj->after_block = NULL; + } else { + nj->current_block = jay_create_block(nj); + } + + jay_block *block = nj->current_block; + block->uniform = !nb->divergent; + list_addtail(&block->link, &nj->f->blocks); + + b->cursor = jay_after_block(block); + + /* Emit the contents of the block */ + nir_foreach_instr(instr, nb) { + jay_emit_instr(nj, block, instr); + } + + /* Look in the current NIR block's successors for any phis. Each of them + * should have a source corresponding to a value coming from our current + * block. Create PHI_SRC opcodes in the current block for those values. + * The corresponding PHI_DST may not have been emitted yet, but that's ok. 
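+    *
+    * E.g. the then/else blocks of an if emit PHI_SRCs for the merge block's
+    * phis before the merge block (and its PHI_DSTs) is emitted; the two are
+    * matched later through the NIR def index (nphi->def.index + c).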
+    */
+   for (unsigned bs = 0; bs < ARRAY_SIZE(nb->successors); ++bs) {
+      nir_block *nb_successor = nb->successors[bs];
+      if (!nb_successor)
+         continue;
+
+      nir_foreach_phi(nphi, nb_successor) {
+         jay_def val = nj_src(nir_phi_get_src_from_block(nphi, nb)->src);
+
+         /* The phi def might be nonuniform but have a uniform source (like a
+          * constant). Move to the correct file in the source block and
+          * reference that in PHI_SRC.
+          */
+         if (jay_file_for_def(&nphi->def) != val.file) {
+            b->cursor = jay_after_block_logical(block);
+            jay_def tmp = val;
+            val = jay_alloc_def(b, jay_file_for_def(&nphi->def),
+                                jay_num_values(val));
+            jay_copy(b, val, tmp);
+         }
+
+         jay_foreach_comp(val, c) {
+            b->cursor = jay_before_jump(block);
+            jay_PHI_SRC(b, JAY_TYPE_U32, jay_extract(val, c),
+                        nphi->def.index + c);
+         }
+      }
+   }
+
+   b->cursor = jay_after_block(block);
+   nj->active_lane_mask = jay_null();
+   nj->active_lane = jay_null();
+   nj->active_lane_x4 = jay_null();
+
+   return block;
+}
+
+static jay_block *
+jay_emit_cf_list(struct nir_to_jay_state *nj, struct exec_list *list)
+{
+   jay_block *start_block = NULL;
+
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      switch (node->type) {
+      case nir_cf_node_block: {
+         jay_block *block = jay_emit_block(nj, nir_cf_node_as_block(node));
+
+         if (!start_block)
+            start_block = block;
+         break;
+      }
+
+      case nir_cf_node_if:
+         jay_emit_if(nj, nir_cf_node_as_if(node));
+         break;
+
+      case nir_cf_node_loop:
+         jay_emit_loop(nj, nir_cf_node_as_loop(node));
+         break;
+
+      default:
+         UNREACHABLE("Unknown NIR control flow node");
+      }
+   }
+
+   return start_block;
+}
+
+static void
+jay_emit_eot(struct nir_to_jay_state *nj)
+{
+   jay_builder *b = &nj->bld;
+
+   if (mesa_shader_stage_is_compute(nj->nir->info.stage)) {
+      /* Vectorized copy into the EOT register. Not necessary for correctness
+       * but keeps RA from inserting 16 scalar copies instead.
+       */
+      jay_def copy = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader));
+      jay_MOV(b, copy, nj->payload.u0);
+
+      jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY, .eot = true, .msg_desc = 0,
+               .srcs = &copy, .nr_srcs = 1, .type = JAY_TYPE_U32,
+               .uniform = true);
+   } else if (nj->nir->info.stage == MESA_SHADER_VERTEX) {
+      jay_block *block = jay_last_block(nj->f);
+      jay_inst *I = jay_last_inst(block);
+
+      /* TODO: What if this isn't the case? Do we need a no-op store...? */
+      assert(I && I->op == JAY_OPCODE_SEND && jay_send_sfid(I) == BRW_SFID_URB);
+      jay_set_send_eot(I, true);
+   }
+}
+
+static void
+set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired)
+{
+   /* Only touch cr0 if we are changing bits */
+   if ((*cr0) != desired) {
+      jay_builder b = jay_init_builder(f, cursor);
+      jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired);
+      *cr0 = desired;
+   }
+}
+
+static void
+jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes)
+{
+   /* First, work out the global float control mode for the shader */
+   uint32_t global = 0x0;
+
+   /* Initially fp16 denorms are flushed to zero, handle preserve. */
+   if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) {
+      global |= BRW_CR0_FP16_DENORM_PRESERVE;
+   }
+
+   /* Initially fp32 denorms are flushed to zero, handle preserve.
+    *
+    * TODO: Optimize this, we have a dispatch bit.
+    */
+   if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) {
+      global |= BRW_CR0_FP32_DENORM_PRESERVE;
+   }
+
+   /* Initially fp64 denorms are flushed to zero, handle preserve.
+    */
+   if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) {
+      global |= BRW_CR0_FP64_DENORM_PRESERVE;
+   }
+
+   /* By default, we are in round-to-even mode. Note we do not permit setting
+    * the round mode separately by bit size, but this is ok for current APIs.
+    * The Vulkan driver sets roundingModeIndependence = NONE.
+    *
+    * TODO: Optimize this, there is a command buffer bit for it.
+    */
+   if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) ||
+       ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) ||
+       ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) {
+      global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT);
+   }
+
+   uint32_t cr0 = 0;
+   jay_function *entrypoint = jay_shader_get_entrypoint(shader);
+   set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global);
+
+   /* Now handle per-instruction deltas to the global mode */
+   jay_foreach_function(shader, func) {
+      jay_foreach_block(func, block) {
+         uint32_t current = cr0;
+
+         jay_foreach_inst_in_block(block, I) {
+            uint32_t required = cr0;
+            enum jay_rounding_mode round =
+               (I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND;
+
+            if (round != JAY_ROUND) {
+               required &= ~BRW_CR0_RND_MODE_MASK;
+               required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT);
+            }
+
+            if (jay_type_is_any_float(I->type)) {
+               set_cr0(func, jay_before_inst(I), &current, required);
+            }
+         }
+
+         /* Restore to global state on block boundaries */
+         if (jay_num_successors(block) > 0) {
+            set_cr0(func, jay_after_block(block), &current, cr0);
+         }
+      }
+   }
+}
+
+struct payload_builder {
+   jay_builder *b;
+   unsigned offsets[JAY_NUM_SSA_FILES];
+   jay_def vecs[JAY_NUM_SSA_FILES];
+};
+
+static jay_def
+read_payload(struct payload_builder *b, enum jay_file file)
+{
+   unsigned granularity = file == UGPR ? 16 : 1;
+   unsigned channel = b->offsets[file] % granularity;
+
+   if (channel == 0) {
+      b->vecs[file] = jay_alloc_def(b->b, file, granularity);
+      jay_PRELOAD(b->b, b->vecs[file], b->offsets[file]);
+   }
+
+   b->offsets[file]++;
+   return jay_extract(b->vecs[file], channel);
+}
+
+static jay_def
+read_vector_payload(struct payload_builder *b, enum jay_file file, unsigned len)
+{
+   jay_def defs[JAY_MAX_DEF_LENGTH];
+   assert(len < ARRAY_SIZE(defs));
+
+   for (unsigned i = 0; i < len; ++i) {
+      defs[i] = read_payload(b, file);
+   }
+
+   return jay_collect_vectors(b->b, defs, len);
+}
+
+static void
+setup_payload_push(struct nir_to_jay_state *nj, struct payload_builder *p)
+{
+   unsigned push_size_B = 0;
+   for (int i = 0; i < ARRAY_SIZE(nj->s->prog_data->base.push_sizes); i++) {
+      push_size_B += nj->s->prog_data->base.push_sizes[i];
+   }
+
+   assert(util_is_aligned(push_size_B, 32));
+   for (unsigned i = 0; i < (push_size_B / 4); ++i) {
+      nj->payload.push_data[i] = read_payload(p, UGPR);
+   }
+
+   nj->s->push_grfs = push_size_B / (4 * jay_ugpr_per_grf(nj->s));
+}
+
+static void
+setup_vertex_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
+{
+   nj->payload.urb_handle = read_payload(p, GPR);
+
+   /* XXX: This is a hack to line up with the partition chosen in RA. This
+    * whole thing needs an overhaul. Need to think harder about partitioning.
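+    * (The URB handle read above plus the 7 GPR slots skipped below put the
+    * vertex attributes at payload offset 8, i.e. one 8-GRF chunk in.)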
+ */ + p->offsets[GPR] += 7; + + for (unsigned i = 0; i < (8 * nj->s->prog_data->vue.urb_read_length); ++i) { + assert(i < ARRAY_SIZE(nj->payload.vs.attributes)); + nj->payload.vs.attributes[i] = read_payload(p, GPR); + } + + setup_payload_push(nj, p); +} + +static void +setup_compute_payload(struct nir_to_jay_state *nj, struct payload_builder *p) +{ + assert(!nj->s->prog_data->cs.generate_local_id); + assert(!nj->s->prog_data->cs.uses_btd_stack_ids); + + nj->payload.inline_data = + read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); +} + +static inline enum intel_barycentric_mode +brw_barycentric_mode(const struct brw_fs_prog_key *key, + nir_intrinsic_instr *intr) +{ + const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr); + + /* Barycentric modes don't make sense for flat inputs. */ + assert(mode != INTERP_MODE_FLAT); + + unsigned bary; + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_at_offset: + /* When per sample interpolation is dynamic, assume sample interpolation. + * We'll dynamically remap things so that the FS payload is not affected. + */ + bary = key->persample_interp == INTEL_SOMETIMES ? + INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE : + INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL; + break; + case nir_intrinsic_load_barycentric_centroid: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID; + break; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE; + break; + default: + UNREACHABLE("invalid intrinsic"); + } + + if (mode == INTERP_MODE_NOPERSPECTIVE) + bary += 3; + + return (enum intel_barycentric_mode) bary; +} + +struct fs_info_ctx { + const struct brw_fs_prog_key *key; + struct brw_fs_prog_data *prog_data; + const struct intel_device_info *devinfo; +}; + +static bool +gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + struct fs_info_ctx *ctx = data; + struct brw_fs_prog_data *prog_data = ctx->prog_data; + + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + prog_data->barycentric_interp_modes |= + 1 << brw_barycentric_mode(ctx->key, intr); + break; + + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { + unsigned mode = brw_barycentric_mode(ctx->key, intr); + prog_data->barycentric_interp_modes |= 1 << mode; + prog_data->uses_sample_offsets |= + mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE || + mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE; + + if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) + prog_data->uses_npc_bary_coefficients = true; + else + prog_data->uses_pc_bary_coefficients = true; + break; + } + + case nir_intrinsic_load_frag_coord_z: + prog_data->uses_src_depth = true; + break; + + case nir_intrinsic_load_frag_coord_w_rcp: + prog_data->uses_src_w = true; + break; + + case nir_intrinsic_load_sample_mask_in: + /* TODO: Sample masks are broken and discards are broken and simd32 + * layouts are broken too. XXX. 
+ */ + // prog_data->uses_sample_mask = true; + break; + + case nir_intrinsic_load_pixel_coord_intel: + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + break; + + default: + break; + } + + return false; +} + +static void +brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data, + const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_shader_in_variable(var, shader) { + if (var->data.interpolation != INTERP_MODE_FLAT || + var->data.per_primitive) + continue; + + unsigned slots = glsl_count_attribute_slots(var->type, false); + for (unsigned s = 0; s < slots; s++) { + int input_index = prog_data->urb_setup[var->data.location + s]; + + if (input_index >= 0) + prog_data->flat_inputs |= 1 << input_index; + } + } +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + /* We initially set this to OFF, but having the shader write the + * depth means we allocate register space in the SEND message. The + * difference between the SEND register count and the OFF state + * programming makes the HW hang. + * + * Removing the depth writes also leads to test failures. So use + * LesserThanOrEqual, which fits writing the same value + * (unchanged/equal). + * + */ + return BRW_PSCDEPTH_ON_LE; + } + } + return BRW_PSCDEPTH_OFF; +} + +/* + * Build up an array of indices into the urb_setup array that + * references the active entries of the urb_setup array. + * Used to accelerate walking the active entries of the urb_setup array + * on each upload. + */ +static void +brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data) +{ + /* TODO(mesh): Review usage of this in the context of Mesh, we may want to + * skip per-primitive attributes here. + */ + + /* Make sure uint8_t is sufficient */ + static_assert(VARYING_SLOT_MAX <= 0xff); + uint8_t index = 0; + for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (fs_prog_data->urb_setup[attr] >= 0) { + fs_prog_data->urb_setup_attribs[index++] = attr; + } + } + fs_prog_data->urb_setup_attribs_count = index; +} + +static void +calculate_urb_setup(const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + nir_shader *nir, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); + int urb_next = 0; /* in vec4s */ + + /* Figure out where the PrimitiveID lives, either in the per-vertex block + * or in the per-primitive block or both. + */ + const uint64_t per_vert_primitive_id = + key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t per_prim_primitive_id = + key->mesh_input == INTEL_NEVER ? 
0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t inputs_read = + nir->info.inputs_read & + (~nir->info.per_primitive_inputs | per_vert_primitive_id); + const uint64_t per_primitive_header_bits = + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; + const uint64_t per_primitive_inputs = + nir->info.inputs_read & + (nir->info.per_primitive_inputs | per_prim_primitive_id) & + ~per_primitive_header_bits; + struct intel_vue_map vue_map; + uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX; + + if (mue_map != NULL) { + memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map)); + memcpy(per_primitive_offsets, mue_map->per_primitive_offsets, + sizeof(mue_map->per_primitive_offsets)); + + if (!mue_map->wa_18019110168_active) { + u_foreach_bit64(location, per_primitive_inputs) { + assert(per_primitive_offsets[location] != -1); + + first_read_offset = + MIN2(first_read_offset, + (uint32_t) per_primitive_offsets[location]); + per_primitive_stride = + MAX2((uint32_t) per_primitive_offsets[location] + 16, + per_primitive_stride); + } + } else { + first_read_offset = per_primitive_stride = 0; + } + } else { + brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout, + 1 /* pos_slots, TODO */); + brw_compute_per_primitive_map(per_primitive_offsets, + &per_primitive_stride, &first_read_offset, + 0, nir, nir_var_shader_in, + per_primitive_inputs, + true /* separate_shader */); + } + + if (per_primitive_stride > first_read_offset) { + first_read_offset = ROUND_DOWN_TO(first_read_offset, 32); + + /* Remove the first few unused registers */ + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_primitive_offsets[i] == -1) + continue; + per_primitive_offsets[i] -= first_read_offset; + } + + prog_data->num_per_primitive_inputs = + 2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32); + } else { + prog_data->num_per_primitive_inputs = 0; + } + + /* Now do the per-vertex stuff (what used to be legacy pipeline) */ + + /* If Mesh is involved, we cannot do any packing. Documentation doesn't say + * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when + * using Mesh. + */ + if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) { + /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do + * arbitrary rearrangement of the first 16 varying inputs, so we can put + * them wherever we want. Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with a + * different vertex (or geometry) shader. + */ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them in + * an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). 
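+       *
+       * A worked example with made-up numbers: if the first slot holding an
+       * FS-read varying is VUE slot 7, first_slot below rounds down to 6,
+       * and a varying living in slot 9 ends up with
+       * urb_setup[varying] = 9 - 6 = 3.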
+ */ + int first_slot = 0; + for (int i = 0; i < vue_map.num_slots; i++) { + int varying = vue_map.slot_to_varying[i]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) { + first_slot = ROUND_DOWN_TO(i, 2); + break; + } + } + + for (int slot = first_slot; slot < vue_map.num_slots; slot++) { + int varying = vue_map.slot_to_varying[slot]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = vue_map.num_slots - first_slot; + } + + prog_data->num_varying_inputs = urb_next; + prog_data->inputs = inputs_read; + prog_data->per_primitive_inputs = per_primitive_inputs; + + brw_compute_urb_setup_index(prog_data); +} + +static void +populate_fs_prog_data(nir_shader *shader, + const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + struct fs_info_ctx ctx = { + .key = key, + .prog_data = prog_data, + .devinfo = devinfo, + }; + nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx); + + prog_data->uses_kill = shader->info.fs.uses_discard; + prog_data->uses_omask = + !key->ignore_sample_mask_out && + (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->max_polygons = 1; + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->sample_shading = shader->info.fs.uses_sample_shading; + prog_data->api_sample_shading = key->api_sample_shading; + prog_data->min_sample_shading = key->min_sample_shading; + + assert(key->multisample_fbo != INTEL_NEVER || + key->persample_interp == INTEL_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = INTEL_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = + MIN2(prog_data->persample_dispatch, key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. + */ + prog_data->alpha_to_coverage = key->alpha_to_coverage; + + assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER); + prog_data->mesh_input = key->mesh_input; + + assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER); + prog_data->provoking_vertex_last = key->provoking_vertex_last; + + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. 
+ */
+   prog_data->uses_pos_offset =
+      prog_data->persample_dispatch != INTEL_NEVER &&
+      (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
+       BITSET_TEST(shader->info.system_values_read,
+                   SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
+
+   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
+   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
+   prog_data->inner_coverage = shader->info.fs.inner_coverage;
+
+   /* From the BDW PRM documentation for 3DSTATE_WM:
+    *
+    *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
+    *     Sample or Non- perspective Sample barycentric coordinates."
+    *
+    * So clean up any potentially set sample barycentric mode when not in
+    * per-sample dispatch.
+    */
+   if (prog_data->persample_dispatch == INTEL_NEVER) {
+      prog_data->barycentric_interp_modes &=
+         ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
+   }
+
+   if (devinfo->ver >= 20) {
+      prog_data->vertex_attributes_bypass =
+         brw_needs_vertex_attributes_bypass(shader);
+   }
+
+   prog_data->uses_nonperspective_interp_modes =
+      (prog_data->barycentric_interp_modes &
+       INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
+      prog_data->uses_npc_bary_coefficients;
+
+   /* The current VK_EXT_graphics_pipeline_library specification requires
+    * coarse to be specified at compile time. But per-sample interpolation
+    * can be dynamic. So we should never be in a situation where coarse &
+    * persample_interp are both respectively true & INTEL_ALWAYS.
+    *
+    * Coarse will be dynamically turned off when persample_interp is active.
+    */
+   assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS);
+
+   prog_data->coarse_pixel_dispatch =
+      intel_sometimes_invert(prog_data->persample_dispatch);
+   if (!key->coarse_pixel ||
+       /* DG2 should support this, but Wa_22012766191 says there are issues
+        * with CPS 1x1 + MSAA + FS writing to oMask.
+        */
+       (devinfo->verx10 < 200 &&
+        (prog_data->uses_omask || prog_data->uses_sample_mask)) ||
+       prog_data->sample_shading ||
+       (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
+       prog_data->computed_stencil ||
+       devinfo->ver < 11) {
+      prog_data->coarse_pixel_dispatch = INTEL_NEVER;
+   }
+
+   /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
+    * Message Descriptor :
+    *
+    *    "Message Type. Specifies the type of message being sent when
+    *     pixel-rate evaluation is requested :
+    *
+    *     Format = U2
+    *       0: Per Message Offset (eval_snapped with immediate offset)
+    *       1: Sample Position Offset (eval_sindex)
+    *       2: Centroid Position Offset (eval_centroid)
+    *       3: Per Slot Offset (eval_snapped with register offset)
+    *
+    *     Message Type. Specifies the type of message being sent when
+    *     coarse-rate evaluation is requested :
+    *
+    *     Format = U2
+    *       0: Coarse to Pixel Mapping Message (internal message)
+    *       1: Reserved
+    *       2: Coarse Centroid Position (eval_centroid)
+    *       3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
+    *
+    * The Sample Position Offset is marked as reserved for coarse rate
+    * evaluation and leads to hangs if we try to use it. So disable coarse
+    * pixel shading if we have any intrinsic that will result in a pixel
+    * interpolater message at sample.
+    */
+   if (intel_nir_pulls_at_sample(shader))
+      prog_data->coarse_pixel_dispatch = INTEL_NEVER;
+
+   /* We choose to always enable VMask prior to XeHP, as it would cause
+    * us to lose out on the eliminate_find_live_channel() optimization.
+ */
+   prog_data->uses_vmask =
+      devinfo->verx10 < 125 ||
+      shader->info.fs.needs_coarse_quad_helper_invocations ||
+      shader->info.uses_wide_subgroup_intrinsics ||
+      prog_data->coarse_pixel_dispatch != INTEL_NEVER;
+
+   prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients;
+
+   if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) {
+      prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth;
+      prog_data->uses_src_depth = false;
+   }
+
+   calculate_urb_setup(devinfo, key, prog_data, shader, mue_map,
+                       per_primitive_offsets);
+   brw_compute_flat_inputs(prog_data, shader);
+}
+
+static void
+populate_vs_prog_data(nir_shader *nir,
+                      const struct intel_device_info *devinfo,
+                      const struct brw_vs_prog_key *key,
+                      struct brw_vs_prog_data *prog_data,
+                      unsigned nr_packed_regs,
+                      bool debug)
+{
+   unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
+   BITSET_WORD *sysvals = nir->info.system_values_read;
+
+   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
+    * incoming vertex attribute. So, add an extra slot.
+    */
+   if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) ||
+       BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) ||
+       BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
+       BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) {
+      nr_attribute_slots++;
+   }
+
+   /* gl_DrawID and IsIndexedDraw share their very own vec4 */
+   if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) ||
+       BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) {
+      nr_attribute_slots++;
+   }
+
+   const struct {
+      bool *data;
+      gl_system_value val;
+   } bool_sysvals[] = {
+      { &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW },
+      { &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX },
+      { &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE },
+      { &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE },
+      { &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID },
+      { &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID },
+   };
+
+   for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) {
+      *bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val);
+   }
+
+   unsigned nr_attribute_regs;
+   if (key->vf_component_packing) {
+      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8);
+      nr_attribute_regs = nr_packed_regs;
+   } else {
+      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2);
+      nr_attribute_regs = 4 * nr_attribute_slots;
+   }
+
+   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
+    * (overwriting the original contents), we need to make sure the size is
+    * the larger of the two.
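+    *
+    * A worked example with made-up numbers: 10 attribute registers round up
+    * to DIV_ROUND_UP(10, 4) = 3 slots; with a 5-slot VUE map, vue_entries =
+    * MAX2(3, 5) = 5 and urb_entry_size = DIV_ROUND_UP(5, 4) = 2.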
+ */ + const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4), + prog_data->base.vue_map.num_slots); + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; + + if (unlikely(debug)) { + fprintf(stderr, "VS Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX); + } +} + +static void +setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p) +{ + jay_fs_payload *fs = &nj->payload.fs; + + if (nj->s->dispatch_width == 32) { + nj->payload.u1 = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); + } + + setup_payload_push(nj, p); + + u_foreach_bit(i, nj->s->prog_data->fs.barycentric_interp_modes) { + fs->bary[i] = read_vector_payload(p, GPR, 2); + } + + if (nj->s->prog_data->fs.uses_src_depth) { + fs->coord.z = read_payload(p, GPR); + } + + if (nj->s->prog_data->fs.uses_src_w) { + fs->coord.w = read_payload(p, GPR); + } + + unsigned nr_attribs = 16 * 4; /* TODO */ + for (unsigned i = 0; i < nr_attribs; ++i) { + jay_def comps[] = { read_payload(p, UGPR), read_payload(p, UGPR), + read_payload(p, UGPR) }; + + /* The .yz components are swizzled in the hardware compared to NIR. */ + SWAP(comps[1], comps[2]); + fs->deltas[i] = jay_collect_vectors(&nj->bld, comps, ARRAY_SIZE(comps)); + + /* Padding */ + if ((i % 5) == 4) { + read_payload(p, UGPR); + } + } + + /* XXX: I do not love this */ + if (BITSET_TEST(nj->nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) { + jay_def t = jay_alloc_def(&nj->bld, GPR, 1); + jay_def lo = jay_extract_range(nj->payload.u0, 10, 4); + jay_EXPAND_QUAD(&nj->bld, t, lo, payload_u1(nj, 10, 4)); + fs->coord.xy = jay_OFFSET_PACKED_PIXEL_COORDS_u32(&nj->bld, t); + } + + /* Due to complexities of the physical payload, the logical payload is split + * into even/odd halves. Fix up the offsets and insert copies. + */ + if (nj->s->dispatch_width == 32) { + jay_builder *b = &nj->bld; + jay_foreach_inst_in_block(nj->after_block, I) { + if (I->op == JAY_OPCODE_PRELOAD && I->dst.file == GPR) { + unsigned base = (jay_preload_reg(I) % 2) ? p->offsets[GPR] : 0; + jay_set_preload_reg(I, base + (jay_preload_reg(I) / 2)); + } + } + + b->cursor = jay_before_block(nj->after_block); + unsigned size = p->offsets[GPR]; + + /* Odd: copy both halves to contiguous pair after payload */ + for (unsigned i = 1; i < size; i += 2) { + jay_DESWIZZLE_16(b, size + size + i + 1, 2 + i); + jay_DESWIZZLE_16(b, size + size + i + 2, 2 + i + size); + } + + /* Even: leave the bottom half in place, copy top half. If size=1 (rare + * but possible), this would be a no-op move so skip it. 
+ */
+   if (size > 1) {
+      for (unsigned i = 0; i < size; i += 2) {
+         jay_inst *I = jay_DESWIZZLE_16(b, 2 + i + 1, 2 + size + i);
+
+         /* Stall in between to avoid a write-after-read hazard */
+         if (i == 0) {
+            I->dep = (struct tgl_swsb) { 1, TGL_PIPE_INT };
+         }
+      }
+   }
+}
+
+static void
+jay_setup_payload(struct nir_to_jay_state *nj)
+{
+   jay_shader *s = nj->s;
+   jay_builder *b = &nj->bld;
+   nj->after_block = jay_create_block(nj);
+   b->cursor = jay_after_block(nj->after_block);
+
+   struct payload_builder p = { .b = &nj->bld };
+   nj->payload.u0 = read_vector_payload(&p, UGPR, jay_ugpr_per_grf(s));
+   nj->payload.sampler_state_pointer = jay_extract(nj->payload.u0, 3);
+
+   switch (s->stage) {
+   case MESA_SHADER_VERTEX:
+      setup_vertex_payload(nj, &p);
+      break;
+   case MESA_SHADER_FRAGMENT:
+      setup_fragment_payload(nj, &p);
+      break;
+   case MESA_SHADER_COMPUTE:
+   case MESA_SHADER_KERNEL:
+      setup_compute_payload(nj, &p);
+      break;
+   default:
+      UNREACHABLE("unimplemented shader stages");
+   }
+
+   /* Lane ID calculations require (W) and therefore are calculated in
+    * uniform control flow to sidestep RA problems. The easy solution is
+    * calculating the lane ID in the first block.
+    *
+    * XXX: This doesn't work for multi-function. Reconsider.
+    */
+   nj->payload.lane_id = jay_LANE_ID_8_u16(b);
+
+   for (unsigned i = 8; i < s->dispatch_width; i *= 2) {
+      nj->payload.lane_id = jay_LANE_ID_EXPAND_u16(b, nj->payload.lane_id, i);
+   }
+}
+
+/*
+ * NIR sometimes contains unreachable blocks (e.g. due to infinite loops).
+ * These blocks have no predecessors, but do have successors and can
+ * contribute to phis. They are dead and violate the IR invariant:
+ *
+ *    Live-in sources are live-out in all predecessors.
+ *
+ * ...which RA (validation) depends on. The simplest solution is to delete
+ * these dead blocks. Fortunately, because they are unreachable, this does
+ * not have any ill effects. Notably, this cannot introduce critical edges.
+ *
+ * Deleting a block may cause a successor to become unreachable, so we use a
+ * fixed-point algorithm to converge.
+ */
+static void
+jay_remove_unreachable_blocks(jay_function *func)
+{
+   bool progress;
+   do {
+      progress = false;
+
+      jay_foreach_block(func, pred) {
+         if (pred != jay_first_block(func) &&
+             jay_num_predecessors(pred) == 0 &&
+             jay_num_successors(pred) > 0) {
+
+            jay_foreach_successor(pred, succ) {
+               util_dynarray_delete_unordered(&succ->predecessors, jay_block *,
+                                              pred);
+            }
+
+            pred->successors[0] = NULL;
+            pred->successors[1] = NULL;
+            progress = true;
+         }
+      }
+   } while (progress);
+}
+
+static void
+jay_from_nir_function(const struct intel_device_info *devinfo,
+                      nir_shader *nir,
+                      jay_shader *s,
+                      nir_function_impl *impl)
+{
+   jay_function *f = jay_new_function(s);
+   f->is_entrypoint = impl->function->is_entrypoint;
+
+   struct nir_to_jay_state nj = {
+      .s = s,
+      .f = f,
+      .nir = nir,
+      .devinfo = devinfo,
+      .bld = (jay_builder) { .shader = s, .func = f },
+   };
+
+   /* Jay indices match NIR indices. Therefore the first impl->ssa_alloc
+    * indices are reserved. Our own temporaries go after.
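+    *
+    * For instance (illustrative numbers only): with impl->ssa_alloc == 100,
+    * index 0 stays the null value, NIR defs own indices 1..99, and the first
+    * Jay temporary is allocated index 100.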
+ */
+   f->ssa_alloc = impl->ssa_alloc;
+
+   if (f->is_entrypoint) {
+      jay_setup_payload(&nj);
+   }
+
+   jay_emit_cf_list(&nj, &impl->body);
+   jay_emit_eot(&nj);
+   jay_remove_unreachable_blocks(f);
+}
+
+static void
+jay_gather_stats(const jay_shader *s, struct genisa_stats *stats)
+{
+   jay_foreach_inst_in_shader(s, f, I) {
+      stats->instrs += I->op != JAY_OPCODE_SYNC;
+      stats->loops += I->op == JAY_OPCODE_WHILE;
+      stats->sends += I->op == JAY_OPCODE_SEND;
+
+      /* XXX: Write a real cycle model */
+      stats->cycles++;
+
+      /* Calculate register usage */
+      if (I->dst.file == GPR)
+         stats->grf_registers =
+            MAX2(stats->grf_registers, I->dst.reg + jay_num_values(I->dst));
+   }
+
+   stats->spills = s->spills;
+   stats->fills = s->fills;
+   stats->sends -= (s->spills + s->fills);
+}
+
+/*
+ * NIR-to-Jay translation relies on a careful indexing of defs: every 32-bit
+ * word has its own index. Vectors/64-bit use contiguous indices. We therefore
+ * run a modified version of nir_index_ssa_defs right before translating
+ * NIR->Jay.
+ */
+static bool
+index_ssa_def_cb(nir_def *def, void *state)
+{
+   unsigned *index = (unsigned *) state;
+   def->index = *index;
+   *index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32);
+   return true;
+}
+
+static void
+nj_index_ssa_defs(nir_shader *nir)
+{
+   nir_foreach_function_impl(impl, nir) {
+      /* The zero index means null in Jay, so start SSA indices at 1 */
+      unsigned index = 1;
+
+      nir_foreach_block_unstructured(block, impl) {
+         nir_foreach_instr(instr, block)
+            nir_foreach_def(instr, index_ssa_def_cb, &index);
+      }
+
+      impl->ssa_alloc = index;
+   }
+}
+
+static bool
+lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_)
+{
+   if (intr->intrinsic != nir_intrinsic_load_helper_invocation)
+      return false;
+
+   /* TODO: Is this right for multisampling? */
+   b->cursor = nir_before_instr(&intr->instr);
+   nir_def *active =
+      nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b)));
+
+   nir_def_replace(&intr->def, active);
+   return true;
+}
+
+static bool
+lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
+{
+   if (intr->intrinsic != nir_intrinsic_load_frag_coord &&
+       intr->intrinsic != nir_intrinsic_load_pixel_coord)
+      return false;
+
+   b->cursor = nir_before_instr(&intr->instr);
+   nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b));
+
+   if (intr->intrinsic == nir_intrinsic_load_frag_coord) {
+      c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)),
+                   nir_u2f32(b, nir_channel(b, c, 1)),
+                   nir_load_frag_coord_z(b),
+                   nir_frcp(b, nir_load_frag_coord_w_rcp(b)));
+   }
+
+   nir_def_replace(&intr->def, c);
+   return true;
+}
+
+static bool
+jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+   unsigned *simd_width = simd_;
+
+   /* mask & -mask isolates the lowest set bit in the mask.
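+    * Concretely: mask = 0b0110100 gives -mask = ...1001100, so the AND
+    * keeps only 0b0000100, the first active lane.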
*/
+   if (intr->intrinsic == nir_intrinsic_elect) {
+      nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b));
+      mask = nir_iand(b, mask, nir_ineg(b, mask));
+      nir_def_replace(&intr->def, nir_inverse_ballot(b, mask));
+      return true;
+   }
+
+   /* Ballots must match the SIMD size */
+   if (intr->intrinsic == nir_intrinsic_ballot ||
+       intr->intrinsic == nir_intrinsic_ballot_relaxed) {
+      unsigned old_bitsize = intr->def.bit_size;
+      intr->def.bit_size = *simd_width;
+      nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize);
+      nir_def_rewrite_uses_after(&intr->def, u2uN);
+      return true;
+   }
+
+   /* Note: we don't treat read_invocation specially because there's little
+    * benefit, and doing so would require expensive uniformizing in some
+    * cases.
+    */
+   if (intr->intrinsic != nir_intrinsic_shuffle &&
+       intr->intrinsic != nir_intrinsic_read_invocation)
+      return false;
+
+   nir_def *data = intr->src[0].ssa;
+   assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized");
+
+   nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4);
+   nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B));
+   return true;
+}
+
+struct frag_out_ctx {
+   nir_def *colour[8], *depth, *stencil, *sample_mask;
+};
+
+static bool
+collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_)
+{
+   struct frag_out_ctx *ctx = ctx_;
+   if (intr->intrinsic != nir_intrinsic_store_output)
+      return false;
+
+   unsigned wrmask = nir_intrinsic_write_mask(intr);
+   assert(nir_intrinsic_component(intr) == 0 && "component should be lowered");
+   assert(util_is_power_of_two_nonzero(wrmask + 1) &&
+          "complex writemasks should be lowered");
+
+   /* TODO: Optimize with write mask? */
+
+   gl_frag_result loc = nir_intrinsic_io_semantics(intr).location;
+   assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo");
+   nir_def **out;
+   if (loc == FRAG_RESULT_COLOR) {
+      out = &ctx->colour[0];
+   } else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
+      out = &ctx->colour[loc - FRAG_RESULT_DATA0];
+   } else if (loc == FRAG_RESULT_DEPTH) {
+      out = &ctx->depth;
+   } else if (loc == FRAG_RESULT_STENCIL) {
+      UNREACHABLE("todo");
+      out = &ctx->stencil;
+   } else if (loc == FRAG_RESULT_SAMPLE_MASK) {
+      UNREACHABLE("todo");
+      out = &ctx->sample_mask;
+   } else {
+      UNREACHABLE("invalid location");
+   }
+
+   assert((*out) == NULL && "each location written exactly once");
+   *out = intr->src[0].ssa;
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static void
+append_payload(nir_builder *b,
+               nir_def **payload,
+               unsigned *len,
+               unsigned max_len,
+               nir_def *value)
+{
+   if (value != NULL) {
+      for (unsigned i = 0; i < value->num_components; ++i) {
+         payload[*len] = nir_channel(b, value, i);
+         (*len)++;
+         assert((*len) <= max_len);
+      }
+   }
+}
+
+static void
+insert_rt_store(nir_builder *b,
+                const struct intel_device_info *devinfo,
+                signed target,
+                bool last,
+                nir_def *colour,
+                nir_def *src0_alpha,
+                nir_def *depth,
+                nir_def *stencil,
+                nir_def *sample_mask,
+                unsigned dispatch_width)
+{
+   bool null_rt = target < 0;
+   target = MAX2(target, 0);
+
+   if (!colour) {
+      colour = nir_undef(b, 4, 32);
+   }
+
+   colour = nir_pad_vec4(b, colour);
+
+   if (null_rt) {
+      /* Even if we don't write a RT, we still need to write alpha for
+       * alpha-to-coverage and alpha testing. Optimize the other channels out.
+       */
+      colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32),
+                                     nir_channel(b, colour, 3), 3);
+   }
+
+   /* TODO: Not sure I like this. We'll see what 2src looks like. */
+   unsigned op = dispatch_width == 32 ?
+ XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE : + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + uint64_t desc = + brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */); + + uint64_t ex_desc = 0; + if (devinfo->ver >= 20) { + ex_desc = target << 21 | + null_rt << 20 | + (src0_alpha ? (1 << 15) : 0) | + (stencil ? (1 << 14) : 0) | + (depth ? (1 << 13) : 0) | + (sample_mask ? (1 << 12) : 0); + } else if (devinfo->ver >= 11) { + /* Set the "Render Target Index" and "Src0 Alpha Present" fields + * in the extended message descriptor, in lieu of using a header. + */ + ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0); + } + + /* Build the payload */ + nir_def *payload[8] = { NULL }; + unsigned len = 0; + append_payload(b, payload, &len, ARRAY_SIZE(payload), colour); + append_payload(b, payload, &len, ARRAY_SIZE(payload), depth); + /* TODO */ + + nir_def *disable = b->shader->info.fs.uses_discard ? + nir_is_helper_invocation(b, 1) : + nir_imm_false(b); + + nir_store_render_target_intel(b, nir_vec(b, payload, len), + nir_imm_ivec2(b, desc, ex_desc), disable, + .eot = last); +} + +static void +lower_fragment_outputs(nir_function_impl *impl, + const struct intel_device_info *devinfo, + unsigned nr_color_regions, + unsigned dispatch_width) +{ + struct frag_out_ctx ctx = { { NULL } }; + nir_function_intrinsics_pass(impl, collect_fragment_output, + nir_metadata_control_flow, &ctx); + nir_builder b_ = nir_builder_at(nir_after_impl(impl)); + nir_builder *b = &b_; + assert(nr_color_regions <= ARRAY_SIZE(ctx.colour)); + + signed first = -1; + for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) { + if (ctx.colour[i]) { + first = i; + break; + } + } + + /* Do the later render targets first */ + for (unsigned i = first + 1; i < nr_color_regions; ++i) { + if (ctx.colour[i]) { + insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL, + NULL, dispatch_width); + } + } + + /* Finally do render target zero attaching all the sideband things and + * setting the LastRT bit. This needs to exist even if nothing is written + * since it also signals end-of-thread. + */ + insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true, + first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth, + ctx.stencil, ctx.sample_mask, dispatch_width); +} + +struct jay_shader_bin * +jay_compile(const struct intel_device_info *devinfo, + void *mem_ctx, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key) +{ + jay_debug = debug_get_option_jay_debug(); + enum mesa_shader_stage stage = nir->info.stage; + bool debug = INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage)); + struct brw_compiler compiler = { .devinfo = devinfo }; + unsigned nr_packed_regs = 0; + + brw_pass_tracker pt_ = { + .nir = nir, + .key = &key->base, + .dispatch_width = 0, + .compiler = &compiler, + .archiver = NULL, //params->base.archiver, + }, *pt = &pt_; + + BRW_NIR_SNAPSHOT("first"); + + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.stage = stage; + // TODO: Make the driver do this? + // prog_data->base.source_hash = params->source_hash; + prog_data->base.total_shared = nir->info.shared_size; + + /* TODO: Real heuristic */ + bool do_simd32 = INTEL_SIMD(FS, 32); + do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT; + unsigned simd_width = do_simd32 ? 
(nir->info.api_subgroup_size ?: 32) : 16; + + if (stage == MESA_SHADER_VERTEX) { + /* We only expect slot compaction to be disabled when using device + * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS + * programming. This should always be enabled together with VF component + * packing to minimize the size of the payload. + */ + assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing); + + /* When using Primitive Replication for multiview, each view gets its own + * position slot. + */ + const uint32_t pos_slots = + (nir->info.per_view_outputs & VARYING_BIT_POS) ? + MAX2(1, util_bitcount(key->base.view_mask)) : + 1; + + /* Only position is allowed to be per-view */ + assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS)); + + brw_compute_vue_map(devinfo, &prog_data->vue.vue_map, + nir->info.outputs_written, key->base.vue_layout, + pos_slots); + + brw_nir_apply_key(pt, &key->base, simd_width); + + prog_data->vs.inputs_read = nir->info.inputs_read; + prog_data->vs.double_inputs_read = nir->info.vs.double_inputs; + prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction; + + brw_nir_lower_vs_inputs(nir); + brw_nir_lower_vue_outputs(nir); + BRW_NIR_SNAPSHOT("after_lower_io"); + + memset(prog_data->vs.vf_component_packing, 0, + sizeof(prog_data->vs.vf_component_packing)); + if (key->vs.vf_component_packing) { + nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs); + } + + /* Get constant offsets out of the way for proper clip/cull handling */ + BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); + BRW_NIR_PASS(nir_opt_constant_folding); + BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo, + &prog_data->vue.vue_map, 0, 0); + } else if (stage == MESA_SHADER_FRAGMENT) { + assert(key->fs.mesh_input == INTEL_NEVER && "todo"); + assert(!key->fs.force_dual_color_blend && "todo"); + brw_nir_apply_key(pt, &key->base, 32); + brw_nir_lower_fs_inputs(nir, devinfo, &key->fs); + brw_nir_lower_fs_outputs(nir); + NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); + + if (!brw_can_coherent_fb_fetch(devinfo)) + NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs); + + NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord, + nir_metadata_control_flow, NULL); + NIR_PASS(_, nir, nir_opt_barycentric, true); + + lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo, + key->fs.nr_color_regions, simd_width); + NIR_PASS(_, nir, nir_lower_helper_writes, true); + NIR_PASS(_, nir, nir_lower_is_helper_invocation); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation, + nir_metadata_control_flow, NULL); + + if (key->fs.alpha_to_coverage != INTEL_NEVER) { + /* Run constant fold optimization in order to get the correct source + * offset to determine render target 0 store instruction in + * emit_alpha_to_coverage pass. 
+ */
+      NIR_PASS(_, nir, nir_opt_constant_folding);
+      NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage);
+   }
+
+   // TODO
+   // NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
+
+   if (!brw_fs_prog_key_is_dynamic(&key->fs)) {
+      uint32_t f = 0;
+
+      if (key->fs.multisample_fbo == INTEL_ALWAYS)
+         f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO;
+
+      if (key->fs.alpha_to_coverage == INTEL_ALWAYS)
+         f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE;
+
+      if (key->fs.provoking_vertex_last == INTEL_ALWAYS)
+         f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST;
+
+      if (key->fs.persample_interp == INTEL_ALWAYS) {
+         f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH |
+              INTEL_FS_CONFIG_PERSAMPLE_INTERP;
+      }
+
+      NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel,
+               f);
+   }
+   } else {
+      brw_nir_apply_key(pt, &key->base, simd_width);
+   }
+
+   brw_postprocess_nir_opts(pt);
+
+   NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd,
+            nir_metadata_control_flow, &simd_width);
+   NIR_PASS(_, nir, nir_opt_algebraic_late);
+   NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16);
+
+   /* Late postprocess while remaining in SSA */
+   /* Run fsign lowering again after the last time brw_nir_optimize is called.
+    * As is the case with conversion lowering (below), brw_nir_optimize can
+    * create additional fsign instructions.
+    */
+   NIR_PASS(_, nir, jay_nir_lower_fsign);
+   NIR_PASS(_, nir, jay_nir_lower_bool);
+   NIR_PASS(_, nir, nir_opt_cse);
+   NIR_PASS(_, nir, nir_opt_dce);
+   NIR_PASS(_, nir, jay_nir_opt_sel_zero);
+
+   /* Run nir_split_conversions only after the last time
+    * brw_nir_optimize is called. Various optimizations invoked there can
+    * rematerialize the conversions that the lowering pass eliminates.
+    */
+   const nir_split_conversions_options split_conv_opts = {
+      .callback = intel_nir_split_conversions_cb,
+   };
+   NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts);
+
+   /* Do this only after the last opt_gcm. GCM will undo this lowering. */
+   if (stage == MESA_SHADER_FRAGMENT) {
+      NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample);
+   }
+
+   NIR_PASS(_, nir, nir_opt_constant_folding);
+   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
+   NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
+   NIR_PASS(_, nir, nir_opt_copy_prop);
+   NIR_PASS(_, nir, nir_opt_dce);
+
+   /* Run divergence analysis at the end */
+   nir_sweep(nir);
+   nj_index_ssa_defs(nir);
+   nir_divergence_analysis(nir);
+
+   if (debug) {
+      /* We can't use nir_print_shader since it reindexes SSA defs. */
+      fprintf(stdout, "NIR right before from_nir:\n\n");
+      nir_print_shader_annotated(nir, stdout, NULL);
+      fflush(stdout);
+   }
+
+   if (stage == MESA_SHADER_VERTEX) {
+      populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs,
+                            nr_packed_regs, debug);
+   } else if (stage == MESA_SHADER_FRAGMENT) {
+      int per_primitive_offsets[VARYING_SLOT_MAX];
+      memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets));
+
+      populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs,
+                            NULL /* TODO: mue_map */, per_primitive_offsets);
+   }
+
+   jay_shader *s = jay_new_shader(NULL, stage);
+   s->dispatch_width = simd_width;
+   s->scratch_size = align(nir->scratch_size, 4) * s->dispatch_width;
+   s->devinfo = devinfo;
+   s->prog_data = prog_data;
+
+   nir_foreach_function_impl(impl, nir) {
+      jay_from_nir_function(devinfo, nir, s, impl);
+   }
+
+   /* Re-number block indices to be sequential and match the NIR. This
+    * ensures block indices are ordered with respect to the control flow
+    * graph, which is a convenient IR invariant.
+ */ + jay_foreach_function(s, f) { + unsigned index = 0; + + jay_foreach_block(f, b) { + b->index = index++; + } + } + + jay_validate(s, "NIR->Jay translation"); + + if (!(jay_debug & JAY_DBG_NOOPT)) { + JAY_PASS(s, jay_opt_propagate_forwards); + JAY_PASS(s, jay_opt_propagate_backwards); + JAY_PASS(s, jay_opt_dead_code); + } + + if (debug) { + fprintf(stdout, "Jay shader:\n\n"); + jay_print(stdout, s); + } + + JAY_PASS(s, jay_assign_flags); + if (!(jay_debug & JAY_DBG_NOOPT)) { + JAY_PASS(s, jay_opt_dead_code); + } + + JAY_PASS(s, jay_lower_pre_ra); + JAY_PASS(s, jay_partition_grf); + JAY_PASS(s, jay_register_allocate); + JAY_PASS(s, jay_lower_post_ra); + JAY_PASS(s, jay_insert_fp_mode, nir->info.float_controls_execution_mode, + nir->info.bit_sizes_float); + + if (!(jay_debug & JAY_DBG_NOOPT)) { + JAY_PASS(s, jay_opt_control_flow); + } + + JAY_PASS(s, jay_lower_scoreboard); + + if (debug) { + fprintf(stdout, "Jay shader (post-RA):\n\n"); + jay_print(stdout, s); + } + + struct jay_shader_bin *bin = + jay_to_binary(s, nir->constant_data, nir->constant_data_size); + assert(bin->kernel); + ralloc_steal(mem_ctx, bin); + + jay_gather_stats(s, &bin->stats); + bin->stats.code_size = bin->size; + + if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage))) { + if (nir->info.label) { + printf("%s - ", nir->info.label); + } + + const char *shader_name = + ralloc_asprintf(s, "%s SIMD%u", _mesa_shader_stage_to_abbrev(stage), + s->dispatch_width); + genisa_stats_fprintf(stdout, shader_name, &bin->stats); + } + + bin->stats.workgroup_memory_size = nir->info.shared_size; + bin->stats.dispatch_width = simd_width; + + if (stage == MESA_SHADER_FRAGMENT) { + if (simd_width == 8) { + prog_data->fs.dispatch_8 = true; + } else if (simd_width == 16) { + prog_data->fs.dispatch_16 = true; + prog_data->fs.prog_offset_16 = 0; + } else if (simd_width == 32) { + prog_data->fs.dispatch_32 = true; + prog_data->fs.prog_offset_32 = 0; + } + + prog_data->fs.has_side_effects = nir->info.writes_memory; + } else if (mesa_shader_stage_is_compute(stage)) { + unsigned i = simd_width == 8 ? 0 : simd_width == 16 ? 1 : 2; + prog_data->cs.prog_offset[i] = 0; + prog_data->cs.prog_mask = BITFIELD_BIT(i); + prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr; + prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr; + prog_data->cs.prog_spilled = s->scratch_size > 0; /* XXX */ + } + + prog_data->base.program_size = bin->size; + + if (s->scratch_size > 0) { + /* We currently only support up to 2MB of scratch space. If we + * need to support more eventually, the documentation suggests + * that we could allocate a larger buffer, and partition it out + * ourselves. We'd just have to undo the hardware's address + * calculation by subtracting (FFTID * Per Thread Scratch Space) + * and then add FFTID * (Larger Per Thread Scratch Space). + * + * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > + * Thread Group Tracking > Local Memory/Scratch Space. + */ + assert(s->scratch_size <= devinfo->max_scratch_size_per_thread && + "maximum scratch size"); + + /* Take the max of any previously compiled variant of the shader. In the + * case of bindless shaders with return parts, this will also take the + * max of all parts. 
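+       *
+       * A worked example with made-up sizes: a variant needing 5000 bytes of
+       * scratch rounds up to util_next_power_of_two(5000) = 8192; a later
+       * 3000-byte variant rounds to 4096 and the MAX2 keeps 8192.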
+ */ + prog_data->base.total_scratch = + MAX2(prog_data->base.total_scratch, + util_next_power_of_two(s->scratch_size)); + } + + if (stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_EVAL || + stage == MESA_SHADER_GEOMETRY || + stage == MESA_SHADER_MESH) { + + uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size); + uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size, + nir->info.cull_distance_array_size); + + if (stage == MESA_SHADER_MESH) { + prog_data->mesh.clip_distance_mask = clip_mask; + prog_data->mesh.cull_distance_mask = cull_mask; + } else { + prog_data->vue.clip_distance_mask = clip_mask; + prog_data->vue.cull_distance_mask = cull_mask; + } + } + + /* Scratch is allocated in 1KiB increments. */ + prog_data->base.total_scratch = align(prog_data->base.total_scratch, 1024); + + ralloc_free(s); + return bin; +} diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h new file mode 100644 index 00000000000..37d0b722319 --- /dev/null +++ b/src/intel/compiler/jay/jay_ir.h @@ -0,0 +1,1408 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "compiler/brw/brw_compiler.h" +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "compiler/shader_enums.h" +#include "util/bitset.h" +#include "util/list.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/sparse_bitset.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "jay_opcodes.h" + +/* TODO: switch to brw_conditional_mod */ +enum PACKED jay_conditional_mod { + JAY_CONDITIONAL_EQ = 1, /**< Equal to zero */ + JAY_CONDITIONAL_NE = 2, /**< Not equal to zero */ + JAY_CONDITIONAL_GT = 3, /**< Greater than zero */ + JAY_CONDITIONAL_LT = 5, /**< Less than zero */ + JAY_CONDITIONAL_GE = 4, /**< Greater than or equal to zero */ + JAY_CONDITIONAL_LE = 6, /**< Less than or equal to zero */ + JAY_CONDITIONAL_OV = 8, /**< Overflow has occurred */ + JAY_CONDITIONAL_NAN = 9, /**< Result is NaN */ +}; + +static inline enum jay_conditional_mod +jay_conditional_mod_swap_sources(enum jay_conditional_mod mod) +{ + /* clang-format off */ + switch (mod) { + case JAY_CONDITIONAL_GT: return JAY_CONDITIONAL_LT; + case JAY_CONDITIONAL_LT: return JAY_CONDITIONAL_GT; + case JAY_CONDITIONAL_GE: return JAY_CONDITIONAL_LE; + case JAY_CONDITIONAL_LE: return JAY_CONDITIONAL_GE; + default: return mod; + } + /* clang-format on */ +} + +enum PACKED jay_arf { + JAY_ARF_NULL = 0, + JAY_ARF_MASK = BRW_ARF_MASK, + JAY_ARF_CONTROL = BRW_ARF_CONTROL, + JAY_ARF_TIMESTAMP = BRW_ARF_TIMESTAMP, +}; + +enum PACKED jay_file { + /** Non-uniform general purpose registers: 32-bits per SIMT lane. */ + GPR, + + /** Uniform general purpose registers: 32-bit uniform values */ + UGPR, + + /** Memory registers representing spilled values: 32-bits per SIMT lane. 
*/
+   MEM,
+
+   /** Memory registers representing spilled values: 32-bit uniform values */
+   UMEM,
+
+   /** Non-uniform flags (predicates): 1-bit per SIMT lane */
+   FLAG,
+
+   /** Uniform flags (predicates): 1-bit uniform value */
+   UFLAG,
+
+   /** Address registers */
+   J_ADDRESS,
+
+   /* Non-SSA files below: */
+
+   /** Accumulators: 32-bits per SIMT lane */
+   ACCUM,
+
+   /** Uniform accumulators: 32-bit uniform value */
+   UACCUM,
+
+   /** Architecture registers: direct access scalar */
+   J_ARF,
+
+   /** Inputs within Jay unit tests */
+   TEST_FILE,
+
+   /* Immediate value */
+   J_IMM,
+
+   JAY_FILE_LAST = J_IMM,
+   JAY_NUM_SSA_FILES = J_ADDRESS + 1,
+
+   /* Set of files that the main RA (and not eg flag RA) allocates. */
+   JAY_NUM_RA_FILES = UMEM + 1,
+   JAY_NUM_GRF_FILES = UGPR + 1,
+};
+static_assert(JAY_FILE_LAST <= 0b1111, "must fit in 4 bits (see jay_def)");
+
+#define jay_foreach_ssa_file(file) \
+   for (enum jay_file file = 0; file < JAY_NUM_SSA_FILES; ++file)
+
+/* Value stuffed into the index field of instructions post-RA that are not
+ * null (0) but do not have an associated SSA index (as they are post-RA).
+ */
+#define JAY_SENTINEL (0xffffffffu)
+
+/* Maximum number of words in a jay_def */
+#define JAY_MAX_DEF_LENGTH (128)
+
+/* Maximum number of sources/destinations other than for phis */
+#define JAY_MAX_SRCS (16)
+#define JAY_MAX_DESTS (2)
+#define JAY_MAX_OPERANDS (JAY_MAX_SRCS + JAY_MAX_DESTS)
+#define JAY_MAX_FLAGS (8)
+#define JAY_MAX_SAMPLER_MESSAGE_SIZE (11)
+#define JAY_NUM_LAST_USE_BITS (32)
+#define JAY_NUM_PHYS_GRF (128)
+#define JAY_NUM_UGPR (1024)
+#define JAY_REG_BITS (17)
+
+/*
+ * A jay_def represents a contiguous array of registers or a 32-bit immediate.
+ * It is used for sources or (in restricted form) for destinations.
+ */
+typedef struct jay_def {
+   /* Mode-dependent payload.
+    *
+    *    File = J_IMM: Immediate.
+    *    Collect = false: Base SSA index.
+    *    Collect = true: Pointer to SSA indices.
+    *
+    * SSA indices must be unique even across register files, so that we can
+    * easily track them all in e.g. a bitfield without needing to have
+    * separate data structures for each file.
+    *
+    * Each index represents a single 32-bit (or 1-bit if a predicate) value in
+    * the specified register file. 64-bit or vec4 values use multiple indices.
+    *
+    * Index 0 is reserved as the null value.
+    */
+   uint32_t _payload;
+
+   /* After register allocation, the register assigned to this def.
+    *
+    * Also used for additional pointer bits for collect pre-RA, which is why
+    * this is as large as it is. Could be shrunk with more pointer compression.
+    */
+   unsigned reg:JAY_REG_BITS;
+
+   /* Post-RA only: access only the top half of the indexed 32-bit register */
+   bool hi:1;
+
+   /** The associated file (must be < JAY_NUM_SSA_FILES for SSA) */
+   enum jay_file file:4;
+
+   /* Represents either a negation or a bitwise inversion (depending on the
+    * instruction type).
+    */
+   bool negate:1;
+
+   /* Represents absolute value (on floating point sources) */
+   bool abs:1;
+
+   /* Number of values minus 1 */
+   unsigned num_values_m1:7;
+
+   /* If true, collects many discontiguous SSA indices into a single def.
+    * Requires file = GPR or file = UGPR. Cannot be used post-RA.
+    *
+    * Canonical form is required: the indices pointed to by the payload must
+    * NOT be contiguous. Also, the payload is not owned by the def: the def
+    * may be cheaply copied around, but mutating the payload requires
+    * copy-on-write and maintaining the canonical form.
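+    *
+    * An illustrative (hypothetical) pair of defs: gathering SSA words
+    * {7, 12, 3} needs collect = true with the payload pointing at that
+    * array, whereas {7, 8, 9} is contiguous and must instead be encoded
+    * with collect = false, _payload = 7 and num_values_m1 = 2.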
+ */
+   bool collect:1;
+} jay_def;
+static_assert(sizeof(jay_def) == 8, "packed");
+
+/*
+ * Construct a jay_def representing a bare register with no associated SSA
+ * index, for use post-RA only.
+ */
+static inline jay_def
+jay_bare_reg(enum jay_file file, uint16_t reg)
+{
+   return (jay_def) { ._payload = JAY_SENTINEL, .reg = reg, .file = file };
+}
+
+/*
+ * Set the register for a def (called by RA only). This drops the collect
+ * indices since we do not have space to encode both simultaneously.
+ */
+static inline void
+jay_set_reg(jay_def *d, unsigned r)
+{
+   if (d->collect) {
+      d->collect = false;
+      d->_payload = JAY_SENTINEL;
+   }
+
+   d->reg = r;
+}
+
+static inline uint32_t
+jay_base_index(jay_def d)
+{
+   assert(d.file != J_IMM && !d.collect);
+   return d._payload;
+}
+
+/**
+ * True if the value is null.
+ */
+static inline bool
+jay_is_null(jay_def d)
+{
+   return d._payload == 0 && d.file != J_IMM;
+}
+
+static inline bool
+jay_is_imm(jay_def d)
+{
+   return d.file == J_IMM;
+}
+
+/**
+ * True if the def is a 1-bit flag regardless of whether it is uniform.
+ */
+static inline bool
+jay_is_flag(jay_def d)
+{
+   return d.file == FLAG || d.file == UFLAG;
+}
+
+/**
+ * Return the number of SSA indices referenced by a jay_def.
+ */
+static inline unsigned
+jay_num_values(jay_def d)
+{
+   return jay_is_imm(d) || jay_is_null(d) ? 0 : (d.num_values_m1 + 1);
+}
+
+/**
+ * True if the def is an SSA def (and not, say, an arch register).
+ */
+static inline bool
+jay_is_ssa(jay_def d)
+{
+   return d.file < JAY_NUM_SSA_FILES;
+}
+
+#define jay_foreach_comp(def, c) \
+   for (unsigned c = 0; c < jay_num_values(def); ++c)
+
+#define jay_foreach_comp_rev(def, c) \
+   for (signed c = jay_num_values(def) - 1; c >= 0; --c)
+
+/*
+ * Alias for jay_base_index for use with scalar defs.
+ */
+static inline uint32_t
+jay_index(jay_def d)
+{
+   assert(jay_num_values(d) == 1);
+   return jay_base_index(d);
+}
+
+/**
+ * Return a reference to the array of indices of a collect source.
+ */
+static inline uint32_t *
+_jay_collect_indices(jay_def d)
+{
+   assert(d.collect);
+
+   /* reg has upper bits of the pointer */
+   uint64_t payload = (((uint64_t) d.reg) << 32) | d._payload;
+   return (uint32_t *) (uintptr_t) payload;
+}
+
+/**
+ * Return the n'th channel of an SSA def.
+ *
+ * Note: this is specifically read-only. To mutate, use jay_set_channel.
+ */
+static inline uint32_t
+jay_channel(jay_def d, unsigned c)
+{
+   assert(d.file != J_IMM);
+   assert(c <= d.num_values_m1);
+
+   if (likely(!d.collect)) {
+      return jay_base_index(d) + c;
+   } else {
+      return _jay_collect_indices(d)[c];
+   }
+}
+
+/**
+ * Build a contiguous jay_def.
+ */
+static inline jay_def
+jay_contiguous_def(enum jay_file file, uint32_t index, unsigned count)
+{
+   assert(count > 0 && count <= (1 << 7) && "max def width");
+
+   return (jay_def) {
+      ._payload = index,
+      .file = file,
+      .num_values_m1 = count - 1,
+   };
+}
+
+/*
+ * Replaces a source, preserving the negate/abs if present.
+ */
+static inline void
+jay_replace_src(jay_def *old, jay_def replacement)
+{
+   replacement.negate = old->negate;
+   replacement.abs = old->abs;
+   *old = replacement;
+}
+
+static inline jay_def
+jay_scalar(enum jay_file file, uint32_t index)
+{
+   return jay_contiguous_def(file, index, 1);
+}
+
+static inline jay_def
+jay_null(void)
+{
+   return jay_scalar(J_ARF, 0);
+}
+
+/**
+ * Return a contiguous subrange inside an SSA def.
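+ *
+ * For example, jay_extract_range(v, 2, 2) on a 4-value def yields a view of
+ * its third and fourth 32-bit words (the .zw pair of a vec4).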
+ */ +static inline jay_def +jay_extract_range(jay_def def, unsigned chan, unsigned count) +{ + assert(!jay_is_imm(def)); + assert((count == 1 || !def.collect) && "slicing collects unsupported"); + assert(chan + count <= jay_num_values(def)); + + uint32_t base = jay_channel(def, chan); + jay_replace_src(&def, jay_contiguous_def(def.file, base, count)); + return def; +} + +/** + * Return a scalar SSA def equal to a single channel from an SSA def. + */ +static inline jay_def +jay_extract(jay_def def, unsigned chan) +{ + return jay_extract_range(def, chan, 1); +} + +/** + * Like jay_extract but working on bare registers. This could be unified to + * preserve indices and such but meh. + */ +static inline jay_def +jay_extract_post_ra(jay_def def, unsigned chan) +{ + return jay_bare_reg(def.file, def.reg + chan); +} + +/** + * Construct an immediate source from a raw 32-bit data pattern. + */ +static inline jay_def +jay_imm(uint32_t imm) +{ + return (jay_def) { ._payload = imm, .file = J_IMM }; +} + +/** + * True if both jay_defs are equivalent up to source modifiers. + */ +static inline bool +jay_defs_equivalent(jay_def a, jay_def b) +{ + if (a.file != b.file || + a.num_values_m1 != b.num_values_m1 || + a.collect != b.collect) + return false; + + if (likely(!a.collect)) { + /* Contiguous or immediate */ + return a._payload == b._payload && a.reg == b.reg; + } else { + /* Collect. Component-wise compare. */ + return !memcmp(_jay_collect_indices(a), _jay_collect_indices(b), + sizeof(uint32_t) * jay_num_values(a)); + } +} + +/** + * True if both registers are equal (for use post-RA). + */ +static inline bool +jay_regs_equal(jay_def a, jay_def b) +{ + return a.file == b.file && + a.num_values_m1 == b.num_values_m1 && + a.reg == b.reg; +} + +/** + * Return a reference to the execution mask (mask0) architecture register. + */ +static inline jay_def +jay_exec_mask(void) +{ + return jay_scalar(J_ARF, JAY_ARF_MASK); +} + +/** + * Return a reference to the control (cr0) architecture register. + */ +static inline jay_def +jay_control(void) +{ + return jay_scalar(J_ARF, JAY_ARF_CONTROL); +} + +/** + * Construct an immediate from a floating point constant. + */ +static inline jay_def +jay_imm_f(float imm) +{ + return jay_imm(fui(imm)); +} + +/** + * Return the negation of a source. + */ +static inline jay_def +jay_negate(jay_def src) +{ + src.negate = !src.negate; + return src; +} + +/** + * Return the absolute value of a source. + */ +static inline jay_def +jay_abs(jay_def src) +{ + src.negate = false; + src.abs = true; + return src; +} + +/** + * Returns true if the given source reads the same value in all lanes. + */ +static inline bool +jay_is_uniform(jay_def d) +{ + return d.file == UGPR || + d.file == UFLAG || + d.file == UACCUM || + jay_is_imm(d); +} + +/** + * Returns true if the given definition represents a spilled variable. 
+ */
+static inline bool
+jay_is_mem(jay_def x)
+{
+   return x.file == MEM || x.file == UMEM;
+}
+
+static inline uint32_t
+jay_as_uint(jay_def src)
+{
+   assert(jay_is_imm(src));
+   return src._payload;
+}
+
+static inline bool
+jay_is_zero(jay_def src)
+{
+   return jay_is_imm(src) && jay_as_uint(src) == 0;
+}
+
+/* Chosen so that a sized type is the unsized type OR'd with the number of
+ * bits.
+ */
+#define JAY_TYPE_BASE_MASK (128 | 2 | 4)
+
+enum PACKED jay_type {
+   JAY_TYPE_UNTYPED = 0,
+   JAY_TYPE_U = 2,
+   JAY_TYPE_S = 4,
+   JAY_TYPE_F = 6,
+   JAY_TYPE_BF = 128,
+
+   /** Unsigned integers */
+   JAY_TYPE_U64 = JAY_TYPE_U | 64,
+   JAY_TYPE_U32 = JAY_TYPE_U | 32,
+   JAY_TYPE_U16 = JAY_TYPE_U | 16,
+   JAY_TYPE_U8 = JAY_TYPE_U | 8,
+   JAY_TYPE_U1 = JAY_TYPE_U | 1,
+
+   /** Signed integers */
+   JAY_TYPE_S64 = JAY_TYPE_S | 64,
+   JAY_TYPE_S32 = JAY_TYPE_S | 32,
+   JAY_TYPE_S16 = JAY_TYPE_S | 16,
+   JAY_TYPE_S8 = JAY_TYPE_S | 8,
+   JAY_TYPE_S1 = JAY_TYPE_S | 1,
+
+   /** IEEE floating point */
+   JAY_TYPE_F64 = JAY_TYPE_F | 64,
+   JAY_TYPE_F32 = JAY_TYPE_F | 32,
+   JAY_TYPE_F16 = JAY_TYPE_F | 16,
+
+   /** Other floating point variants */
+   JAY_TYPE_BF16 = JAY_TYPE_BF | 16,
+};
+static_assert(sizeof(enum jay_type) == 1);
+
+static inline enum jay_type
+jay_type(enum jay_type base, unsigned bits)
+{
+   /* Normalize booleans */
+   if (bits == 1) {
+      base = JAY_TYPE_U;
+   }
+
+   return (enum jay_type)(base | bits);
+}
+
+static inline enum jay_type
+jay_base_type(enum jay_type t)
+{
+   return (enum jay_type)(t & JAY_TYPE_BASE_MASK);
+}
+
+static inline unsigned
+jay_type_size_bits(enum jay_type t)
+{
+   return t & ~JAY_TYPE_BASE_MASK;
+}
+
+static inline enum jay_type
+jay_type_rebase(enum jay_type t, enum jay_type new_base)
+{
+   return jay_type(new_base, jay_type_size_bits(t));
+}
+
+static inline enum jay_type
+jay_type_resize(enum jay_type t, unsigned bits)
+{
+   return jay_type(jay_base_type(t), bits);
+}
+
+/**
+ * Returns the number of 32-bit values needed to hold a type t.
+ */
+static inline unsigned
+jay_type_vector_length(enum jay_type t)
+{
+   return jay_type_size_bits(t) == 64 ? 2 : 1;
+}
+
+static inline bool
+jay_type_is_any_float(enum jay_type t)
+{
+   return jay_base_type(t) == JAY_TYPE_F || jay_base_type(t) == JAY_TYPE_BF;
+}
+
+enum jay_predication : uint8_t {
+   /** No predication. */
+   JAY_NOT_PREDICATED = 0,
+
+   /**
+    * Predicated with no default value. Used post-RA and for instructions that
+    * do not write a destination.
+    */
+   JAY_PREDICATED = 1,
+
+   /** Predicated with 1 default value. Used pre-RA. */
+   JAY_PREDICATED_DEFAULT = 2,
+};
+
+/**
+ * Representation of a shader instruction in the Jay IR.
+ */
+typedef struct jay_inst {
+   struct list_head link;
+
+   /**
+    * Metadata calculated by liveness analysis: bit i is set if the i'th
+    * non-null SSA index read by the instruction is killed by that read.
+    */
+   BITSET_DECLARE(last_use, JAY_NUM_LAST_USE_BITS);
+
+   enum jay_opcode op;
+   enum jay_type type; /**< execution type of the instruction */
+
+   /** Software scoreboarding dependencies (for non-SYNC instructions) */
+   struct tgl_swsb dep;
+
+   /** Number of sources */
+   uint8_t num_srcs;
+
+   /**
+    * Indicates an instruction reading only uniform sources but writing a FLAG
+    * and no GPR/UGPR that expects the flag to replicate for all SIMD lanes.
+    * This is okay in our data model but cannot be inferred from the files, so
+    * we have a secondary bit to express this.
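+    *
+    * An illustrative (hypothetical) case: a compare of two UGPRs produces a
+    * single uniform result, yet a later predicated SIMD16 instruction needs
+    * that one bit replicated into every lane of the flag register.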
+ */ + bool broadcast_flag:1; + bool saturate :1; + + /** + * In a SIMD split instruction, whether the regdist dependency is replicated + * to each physical instruction. If false, only the first instruction waits. + * + * If decrement_dep is also set, the regdist is decremented by the macro + * length for each instruction (modelling cross-pipe dependencies). + */ + bool replicate_dep:1; + bool decrement_dep:1; + unsigned padding :12; + + enum jay_predication predication; + enum jay_conditional_mod conditional_mod; + + jay_def cond_flag; /**< conditional flag */ + jay_def dst; + + jay_def src[]; +} jay_inst; + +static_assert(sizeof(jay_inst) == 32 + (sizeof(uintptr_t) * 2), "packed"); + +/* + * Return the number of instruction set defined sources, ignoring implicit + * predication and accumulator sources. + */ +static inline unsigned +jay_num_isa_srcs(const jay_inst *I) +{ + return I->num_srcs - I->predication - (I->op == JAY_OPCODE_SEL); +} + +static inline bool +jay_uses_flag(const jay_inst *I) +{ + return I->predication || + !jay_is_null(I->cond_flag) || + I->op == JAY_OPCODE_SEL; +} + +static inline void +jay_remove_instruction(jay_inst *inst) +{ + list_del(&inst->link); +} + +static inline bool +jay_has_src_mods(jay_inst *I, unsigned s) +{ + return jay_opcode_infos[I->op].src_mods & BITFIELD_BIT(s); +} + +static inline bool +jay_inst_has_default(jay_inst *I) +{ + return I->predication >= JAY_PREDICATED_DEFAULT; +} + +static inline jay_def * +jay_inst_get_predicate(jay_inst *I) +{ + assert(I->predication); + return &I->src[I->num_srcs - I->predication]; +} + +static inline jay_def * +jay_inst_get_default(jay_inst *I) +{ + assert(jay_inst_has_default(I)); + return &I->src[I->num_srcs - 1]; +} + +/* Must be included late since it depends on jay_inst but the rest of this file + * depends on the inline functions it defines. + */ +#include "jay_extra_info.h" + +static inline enum jay_type +jay_src_type(const jay_inst *I, unsigned s) +{ + /* Predicates */ + if (s == (unsigned) (I->num_srcs - I->predication) || + (I->op == JAY_OPCODE_SEL && s == 2) || + (I->op == JAY_OPCODE_PHI_SRC && jay_is_flag(I->src[s]))) + return JAY_TYPE_U1; + + /* Conversions have an explicit source type, use that. */ + if (I->op == JAY_OPCODE_CVT) + return jay_cvt_src_type(I); + + /* 16-bit operand */ + if (I->op == JAY_OPCODE_MUL_32X16 && s == 1) + return jay_type_resize(I->type, jay_type_size_bits(I->type) / 2); + + if (I->op == JAY_OPCODE_SEND) { + if (s < 2) + return JAY_TYPE_U32; + else if (s < 4) + return s == 3 ? jay_send_type_1(I) : jay_send_type_0(I); + } + + if (I->op == JAY_OPCODE_CAST_CANONICAL_TO_FLAG) + return JAY_TYPE_U32; + + /* Shifts are always small even with 64-bit destinations */ + if ((I->op == JAY_OPCODE_SHL || + I->op == JAY_OPCODE_SHR || + I->op == JAY_OPCODE_ASR) && + s == 1) + return JAY_TYPE_U16; + + /* TODO: Do we want to allow zero-extension generally? */ + if (I->op == JAY_OPCODE_AND_U32_U16) + return JAY_TYPE_U16; + + /* Mixed-signedness integer dot product opcode */ + if (I->op == JAY_OPCODE_DP4A_SU && s == 2) + return JAY_TYPE_U32; + + /* Shuffle lane index distinct from data type */ + if (I->op == JAY_OPCODE_SHUFFLE && s == 1) + return JAY_TYPE_U32; + + /* Other instructions inherit the destination type. 
*/ + return I->type; +} + +enum jay_stride { + JAY_STRIDE_2 = 0, + JAY_STRIDE_4, + JAY_STRIDE_8, + JAY_NUM_STRIDES, +}; + +static inline unsigned +jay_stride_to_bits(enum jay_stride s) +{ + assert(s <= JAY_STRIDE_8); + return 16 << s; +} + +#define JAY_PARTITION_BLOCKS (3) + +struct jay_register_block { + uint16_t start, len; +}; + +struct jay_partition { + /** Consecutive ranges of GRFs in GPR/UGPRs. */ + struct jay_register_block blocks[JAY_NUM_GRF_FILES][JAY_PARTITION_BLOCKS]; + + /** Number of GPR/UGPRs per GRF, times 16. For example, 16 encodes SIMD16 + * 32-bit GPRs on Xe2 (1 GRF = 1 GPR). 256 encodes UGPRs (1 GRF = 16 UGPRs). + * 8 encodes SIMD32 32-bit GPRs on Xe2 (2 GRF = 1 GPR). + */ + unsigned units_x16[JAY_NUM_GRF_FILES]; + + /** Base GPR for each stride. The file is partitioned (4, 8, 2, 4=EOT). */ + unsigned base8, base2, base_eot; + + /** Region of the UGPR partition suitable for large UGPR vectors */ + struct jay_register_block large_ugpr_block; +}; + +static inline enum jay_stride +jay_gpr_to_stride(struct jay_partition *p, unsigned reg) +{ + return (reg < p->base8 || reg >= p->base_eot) ? JAY_STRIDE_4 : + reg >= p->base2 ? JAY_STRIDE_2 : + JAY_STRIDE_8; +} + +/** + * Representation of a shader in the Jay IR. + */ +typedef struct jay_shader { + mesa_shader_stage stage; + struct list_head functions; + const struct intel_device_info *devinfo; + union brw_any_prog_data *prog_data; + unsigned spills, fills; + unsigned scratch_size; + unsigned push_grfs; + + /** + * Ralloc linear context. Since we don't typically free as we go, + * most allocations should go through this context for efficiency. + */ + struct linear_ctx *lin_ctx; + + /* Dispatch width of the current compile: 8, 16, or 32. */ + unsigned dispatch_width; + + /** + * Number of GPR/UGPRs used across all functions in the shader. This is the + * limit that must be allocated for the shader. + */ + unsigned num_regs[JAY_NUM_RA_FILES]; + + /** + * Register file partition chosen for the whole shader. + */ + struct jay_partition partition; + + /** Current compilation phase (for printing & validation) */ + bool post_ra; +} jay_shader; + +static inline jay_shader * +jay_new_shader(void *memctx, mesa_shader_stage stage) +{ + jay_shader *s = rzalloc(NULL, jay_shader); + s->stage = stage; + s->lin_ctx = linear_context(s); + list_inithead(&s->functions); + return s; +} + +static inline unsigned +jay_ugpr_per_grf(jay_shader *s) +{ + unsigned B_per_unit = 32 /* see reg_unit */; + unsigned B_per_ugpr = 4; + + return reg_unit(s->devinfo) * (B_per_unit / B_per_ugpr); +} + +static inline unsigned +jay_grf_per_gpr(jay_shader *s) +{ + assert(reg_unit(s->devinfo) == 1 || reg_unit(s->devinfo) == 2); + return reg_unit(s->devinfo) == 2 ? (s->dispatch_width / 16) : + (s->dispatch_width / 8); +} + +static inline unsigned +jay_phys_flag_per_virt(jay_shader *s) +{ + /* TODO: Check if this holds on older platforms */ + return jay_grf_per_gpr(s); +} + +/* + * Returns whether an instruction will lower to a SEND post-RA: either a SEND or + * a spill/fill that has not yet been lowered. + */ +static inline bool +jay_is_send_like(const jay_inst *I) +{ + if (I->op == JAY_OPCODE_MOV) + return jay_is_mem(I->dst) || jay_is_mem(I->src[0]); + else + return I->op == JAY_OPCODE_SEND; +} + +/* + * Returns whether an instruction contains cross-lane access. 
+ */ +static inline bool +jay_is_shuffle_like(const jay_inst *I) +{ + return I->op == JAY_OPCODE_SHUFFLE || + I->op == JAY_OPCODE_QUAD_SWIZZLE || + I->op == JAY_OPCODE_BROADCAST_IMM; +} + +/* + * Return the required alignment for the register assigned to a given source. + */ +static inline unsigned +jay_src_alignment(jay_shader *shader, const jay_inst *I, unsigned s) +{ + /* SENDs operate on entire GRFs at a time, so align UGPRs to GRFs. This + * includes UGPR->UMEM moves which lower to SENDs. + */ + if ((I->op == JAY_OPCODE_SEND && I->src[s].file == UGPR) || + (I->dst.file == UMEM)) { + return jay_ugpr_per_grf(shader); + } + + /* If the destination is 64-bit, we need the sources to be aligned. Along + * with a suitable partitioning, this ensures only the aligned low half of + * a strided register is used, preventing invalid assembly like: + * + * mov.s64 g40, g42.1<2>:s32 + * + * ..which would violate the rule: + * + * Register Regioning patterns where register data bit location of the LSB + * of the channels are changed between source and destination are not + * supported except for broadcast of a scalar. + */ + return jay_type_vector_length(I->type); +} + +/* + * Return the required alignment for the register assigned to a destination. + */ +static inline unsigned +jay_dst_alignment(jay_shader *shader, const jay_inst *I) +{ + /* SENDs write entire GRFs, so align UGPRs to GRFs. Similarly for any + * instructions involving accumulators: + * + * Register Regioning patterns where register data bit locations are + * changed between source and destination are not supported when an + * accumulator is used as an implicit source or an explicit source in an + * instruction. (TODO) + */ + if (I->dst.file == UGPR && + (I->op == JAY_OPCODE_SEND || + (I->op == JAY_OPCODE_MOV && I->src[0].file == UMEM) || + I->op == JAY_OPCODE_MUL_32)) { + + return jay_ugpr_per_grf(shader); + } + + /* If any source is 64-bit, align the destination to 64-bit too. As above. */ + return jay_type_vector_length(jay_src_type(I, 0)); +} + +static inline bool +jay_inst_is_uniform(const jay_inst *I) +{ + if (I->op == JAY_OPCODE_SEND) + return jay_send_uniform(I); + + return jay_is_uniform(I->dst) || + (I->dst.file == J_ADDRESS && jay_is_uniform(I->src[0])) || + I->cond_flag.file == UFLAG || + I->op == JAY_OPCODE_SYNC || + I->dst.file == FLAG || + (I->dst.file == J_ARF && !jay_is_null(I->dst)); +} + +unsigned jay_simd_split(jay_shader *s, const jay_inst *I); + +static inline unsigned +jay_simd_width_logical(jay_shader *s, const jay_inst *I) +{ + unsigned base = jay_inst_is_uniform(I) ? 1 : s->dispatch_width; + + /* Handle vectors-of-UGPR operations with special care for 64-bit */ + unsigned vec_per_channel = jay_type_vector_length(I->type); + unsigned dst_size = jay_num_values(I->dst); + assert(util_is_aligned(dst_size, vec_per_channel)); + + if (base == 1 && dst_size > vec_per_channel && I->op != JAY_OPCODE_SEND) { + assert(util_is_power_of_two_nonzero(dst_size) && vec_per_channel == 1); + base = dst_size; + } + + return base; +} + +static inline unsigned +jay_simd_width_physical(jay_shader *s, const jay_inst *I) +{ + return jay_simd_width_logical(s, I) >> jay_simd_split(s, I); +} + +/* + * Returns the number of physical instructions emitted for each logical + * instruction not accounting for SIMD split. That is, the number of + * instructions that macros will expand to in jay_to_binary or 1 for non-macros. 
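+ *
+ * For example, MUL_32 is a 2-instruction macro, so with a SIMD split of 1
+ * a logical MUL_32 costs jay_macro_length(I) << jay_simd_split(s, I) =
+ * 2 << 1 = 4 physical instructions, as used when advancing IPs in the
+ * scoreboard pass.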
+ */ +static inline unsigned +jay_macro_length(const jay_inst *I) +{ + bool macro = (I->op == JAY_OPCODE_MUL_32 || + I->op == JAY_OPCODE_SHUFFLE || + I->op == JAY_OPCODE_LOOP_ONCE); + return macro ? 2 : 1; +} + +static inline bool +jay_is_no_mask(const jay_inst *I) +{ + return jay_inst_is_uniform(I) || + I->broadcast_flag || + I->op == JAY_OPCODE_QUAD_SWIZZLE || + I->op == JAY_OPCODE_DESWIZZLE_16 || + I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS || + I->op == JAY_OPCODE_LANE_ID_8 || + I->op == JAY_OPCODE_LANE_ID_EXPAND; +} + +/** + * Representation of an (implemented) function in the Jay IR. This corresponds + * to nir_function_impl in NIR. + */ +typedef struct jay_function { + struct list_head link; + + /* Parent pointer for convenience */ + struct jay_shader *shader; + + /* Set of SSA indices of defs that are dead immediately after being written + * (because they are never read but cannot be DCE'd). + */ + BITSET_WORD *dead_defs; + + /* Register demand metadata calculated & used in RA */ + unsigned demand[JAY_NUM_SSA_FILES]; + + unsigned num_blocks; + struct list_head blocks; + bool is_entrypoint; + + uint32_t ssa_alloc; +} jay_function; + +static inline jay_function * +jay_new_function(jay_shader *s) +{ + jay_function *f = rzalloc(s, jay_function); + list_inithead(&f->blocks); + + f->shader = s; + f->ssa_alloc = 1; /* skip null */ + + list_add(&f->link, &s->functions); + return f; +} + +static inline jay_function * +jay_shader_get_entrypoint(jay_shader *s) +{ + /* TODO: Multifunction shaders */ + assert(list_is_singular(&s->functions)); + return list_first_entry(&s->functions, jay_function, link); +} + +static inline unsigned +jay_num_regs(jay_shader *shader, enum jay_file file) +{ + assert(file < JAY_NUM_SSA_FILES); + + if (file < JAY_NUM_RA_FILES) + return shader->num_regs[file]; + else if (file == FLAG) + return shader->dispatch_width == 32 ? 4 : 8; + else if (file == UFLAG) + return 0; + else + return 1 /* TODO: We don't have address or accumulator RA yet */; +} + +static inline enum jay_stride +jay_def_stride(jay_shader *shader, jay_def x) +{ + assert(x.file == GPR); + return jay_gpr_to_stride(&shader->partition, x.reg); +} + +/* Represents an allocated register number with file in the top 3 bits. */ +typedef uint16_t jay_reg; + +/** Represents a set of registers that may be clobbered for lowering swaps */ +struct jay_temp_regs { + jay_reg gpr, gpr2, ugpr, ugpr2; +}; + +/** + * A basic block representation + */ +typedef struct jay_block { + struct list_head link; + struct list_head instructions; + + /** Control flow graph */ + struct jay_block *successors[2]; + struct util_dynarray predecessors; + + /** Index of the block in source order */ + unsigned index; + + /** Liveness analysis results */ + struct u_sparse_bitset live_in; + struct u_sparse_bitset live_out; + + /** + * After register allocation but before going out-of-SSA, registers that + * are free at the logical end of the block (before phi_src). These will + * be clobbered by the out-of-SSA pass. + */ + struct jay_temp_regs temps_out; + + /** + * Is this block a loop header? If not, all of its predecessors precede it + * in source order. 
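+ *
+ * Forward dataflow passes can therefore walk blocks in source order and
+ * see every predecessor of a non-header block before the block itself.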
+ */ + bool loop_header; + + /** True if all non-exited lanes execute this block together */ + bool uniform; + + /** Pretty printing based on original structured control flow */ + uint8_t indent; +} jay_block; + +static inline jay_block * +jay_new_block(jay_function *f) +{ + jay_block *block = rzalloc(f, jay_block); + + util_dynarray_init(&block->predecessors, block); + list_inithead(&block->instructions); + + block->index = f->num_blocks++; + return block; +} + +static inline bool +jay_op_is_control_flow(enum jay_opcode op) +{ + return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_LOOP_ONCE; +} + +/** + * Returns the control flow instruction at the end of a block or NULL. + */ +static inline jay_inst * +jay_block_ending_jump(jay_block *block) +{ + jay_inst *last = list_is_empty(&block->instructions) ? + NULL : + list_last_entry(&block->instructions, jay_inst, link); + return last && jay_op_is_control_flow(last->op) ? last : NULL; +} + +static inline unsigned +jay_num_predecessors(jay_block *block) +{ + return util_dynarray_num_elements(&block->predecessors, jay_block *); +} + +static inline unsigned +jay_num_successors(jay_block *block) +{ + static_assert(ARRAY_SIZE(block->successors) == 2); + return !!block->successors[0] + !!block->successors[1]; +} + +static inline jay_block * +jay_first_predecessor(jay_block *block) +{ + if (jay_num_predecessors(block) == 0) + return NULL; + + return *util_dynarray_element(&block->predecessors, struct jay_block *, 0); +} + +/* Block worklist helpers */ + +#define jay_worklist_push_head(w, block) u_worklist_push_head(w, block, index) +#define jay_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index) +#define jay_worklist_peek_head(w) u_worklist_peek_head(w, jay_block, index) +#define jay_worklist_pop_head(w) u_worklist_pop_head(w, jay_block, index) +#define jay_worklist_peek_tail(w) u_worklist_peek_tail(w, jay_block, index) +#define jay_worklist_pop_tail(w) u_worklist_pop_tail(w, jay_block, index) + +/* Iterators */ + +#define jay_foreach_function(s, v) \ + list_for_each_entry(jay_function, v, &s->functions, link) + +#define jay_foreach_block(f, v) \ + list_for_each_entry(jay_block, v, &f->blocks, link) + +#define jay_foreach_block_safe(f, v) \ + list_for_each_entry_safe(jay_block, v, &f->blocks, link) + +#define jay_foreach_block_rev(f, v) \ + list_for_each_entry_rev(jay_block, v, &f->blocks, link) + +#define jay_foreach_block_from(f, from, v) \ + list_for_each_entry_from(jay_block, v, from, &f->blocks, link) + +#define jay_foreach_block_from_rev(f, from, v) \ + list_for_each_entry_from_rev(jay_block, v, from, &f->blocks, link) + +#define jay_foreach_inst_in_block(block, v) \ + list_for_each_entry(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_rev(block, v) \ + list_for_each_entry_rev(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_safe(block, v) \ + list_for_each_entry_safe(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_safe_rev(block, v) \ + list_for_each_entry_safe_rev(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_from(block, v, from) \ + list_for_each_entry_from(jay_inst, v, from, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_from_rev(block, v, from) \ + list_for_each_entry_from_rev(jay_inst, v, from, &(block)->instructions, link) + +#define jay_foreach_inst_in_func(func, block, v) \ + jay_foreach_block(func, block) \ + jay_foreach_inst_in_block(block, v) + +#define 
jay_foreach_inst_in_func_rev(func, block, v) \
+ jay_foreach_block_rev(func, block) \
+ jay_foreach_inst_in_block_rev(block, v)
+
+#define jay_foreach_inst_in_func_safe(func, block, v) \
+ jay_foreach_block(func, block) \
+ jay_foreach_inst_in_block_safe(block, v)
+
+#define jay_foreach_inst_in_func_safe_rev(func, block, v) \
+ jay_foreach_block_rev(func, block) \
+ jay_foreach_inst_in_block_safe_rev(block, v)
+
+#define jay_foreach_inst_in_shader(s, func, inst) \
+ jay_foreach_function(s, func) \
+ jay_foreach_inst_in_func(func, v_block, inst)
+
+#define jay_foreach_inst_in_shader_safe(s, func, inst) \
+ jay_foreach_function(s, func) \
+ jay_foreach_inst_in_func_safe(func, v_block, inst)
+
+#define jay_foreach_successor(blk, v) \
+ jay_block *v; \
+ jay_block **_v; \
+ for (_v = (jay_block **) &blk->successors[0]; \
+ _v < (jay_block **) &blk->successors[2] && (v = *_v) != NULL; _v++)
+
+#define jay_foreach_predecessor(blk, v) \
+ util_dynarray_foreach(&blk->predecessors, jay_block *, v)
+
+#define jay_foreach_src(inst, s) for (unsigned s = 0; s < inst->num_srcs; ++s)
+
+#define jay_foreach_src_rev(inst, s) \
+ for (signed s = inst->num_srcs - 1; s >= 0; --s)
+
+#define jay_foreach_ssa_src(I, s) \
+ jay_foreach_src(I, s) \
+ if (jay_is_ssa(I->src[s]) && !jay_is_null(I->src[s]))
+
+#define jay_foreach_ssa_src_rev(I, s) \
+ jay_foreach_src_rev(I, s) \
+ if (jay_is_ssa(I->src[s]) && !jay_is_null(I->src[s]))
+
+#define jay_foreach_index(def, c, idx) \
+ jay_foreach_comp(def, c) \
+ for (uint32_t idx = jay_channel(def, c); idx != 0; idx = 0)
+
+#define jay_foreach_index_rev(def, c, idx) \
+ jay_foreach_comp_rev(def, c) \
+ for (uint32_t idx = jay_channel(def, c); idx != 0; idx = 0)
+
+#define jay_foreach_src_index(I, s, c, i) \
+ jay_foreach_ssa_src(I, s) \
+ jay_foreach_index(I->src[s], c, i)
+
+#define jay_foreach_src_index_rev(I, s, c, i) \
+ jay_foreach_ssa_src_rev(I, s) \
+ jay_foreach_index_rev(I->src[s], c, i)
+
+#define jay_foreach_dst(I, d) \
+ for (unsigned _d = 0; _d < 2; ++_d) \
+ for (jay_def d = (_d ? I->cond_flag : I->dst); !jay_is_null(d); \
+ d = jay_null())
+
+#define jay_foreach_dst_index(I, d, i) \
+ jay_foreach_dst(I, d) \
+ jay_foreach_index(d, _c, i)
+
+/*
+ * Phi iterators take advantage of the known position of phis in the block.
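+ *
+ * PHI_DSTs sit at the start of a block; PHI_SRCs sit at the end, before
+ * any terminating control flow. Both iterators therefore stop at the
+ * first non-phi instruction they see.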
+ */
+#define jay_foreach_phi_src_in_block(block, phi) \
+ jay_foreach_inst_in_block_safe_rev(block, phi) \
+ if (jay_op_is_control_flow(phi->op)) \
+ continue; \
+ else if (phi->op != JAY_OPCODE_PHI_SRC) \
+ break; \
+ else
+
+#define jay_foreach_phi_dst_in_block(block, phi) \
+ jay_foreach_inst_in_block(block, phi) \
+ if (phi->op != JAY_OPCODE_PHI_DST) \
+ break; \
+ else
+
+#define jay_foreach_preload(func, preload) \
+ jay_foreach_inst_in_block_safe(jay_first_block(func), preload) \
+ if (preload->op != JAY_OPCODE_PRELOAD) \
+ break; \
+ else
+
+static inline jay_block *
+jay_first_block(jay_function *f)
+{
+ assert(!list_is_empty(&f->blocks));
+ jay_block *first_block = list_first_entry(&f->blocks, jay_block, link);
+ assert(first_block->index == 0);
+ return first_block;
+}
+
+static inline jay_inst *
+jay_first_inst(jay_block *block)
+{
+ if (list_is_empty(&block->instructions))
+ return NULL;
+ else
+ return list_first_entry(&block->instructions, jay_inst, link);
+}
+
+static inline jay_block *
+jay_last_block(jay_function *f)
+{
+ if (list_is_empty(&f->blocks))
+ return NULL;
+ else
+ return list_last_entry(&f->blocks, jay_block, link);
+}
+
+static inline jay_inst *
+jay_last_inst(jay_block *block)
+{
+ if (list_is_empty(&block->instructions))
+ return NULL;
+ else
+ return list_last_entry(&block->instructions, jay_inst, link);
+}
+
+static inline jay_block *
+jay_next_block(jay_block *block)
+{
+ return list_first_entry(&(block->link), jay_block, link);
+}
+
+static inline void
+jay_block_add_successor(jay_block *block, jay_block *succ)
+{
+ unsigned i = block->successors[0] ? 1 : 0;
+
+ assert(succ && block->successors[0] != succ && block->successors[1] != succ);
+ assert(block->successors[i] == NULL && "at most 2 successors");
+
+ block->successors[i] = succ;
+ util_dynarray_append(&(succ->predecessors), block);
+}
+
+static inline unsigned
+jay_source_last_use_bit(const jay_def *srcs, unsigned src_idx)
+{
+ assert(jay_is_ssa(srcs[src_idx]) && "precondition");
+ unsigned i = 0;
+
+ for (unsigned s = 0; s < src_idx; ++s) {
+ jay_foreach_index(srcs[s], c, idx) {
+ i++;
+ }
+ }
+
+ return i;
+}
+
+#define jay_foreach_killed(I, s, c) \
+ for (unsigned _kill_idx = 0; _kill_idx == 0; _kill_idx = 1) \
+ jay_foreach_src_index(I, s, c, idx) \
+ for (unsigned _k = _kill_idx++; _k != ~0; _k = ~0) \
+ if (BITSET_TEST(I->last_use, _k))
+
+/* Helper to run a pass */
+#define JAY_PASS(shader, pass, ...) \
+ do { \
+ pass(shader, ##__VA_ARGS__); \
+ jay_validate(shader, #pass); \
+ } while (0)
+
+#define JAY_DEFINE_FUNCTION_PASS(name, per_func) \
+ void name(jay_shader *s) \
+ { \
+ jay_foreach_function(s, f) { \
+ per_func(f); \
+ } \
+ }
diff --git a/src/intel/compiler/jay/jay_liveness.c b/src/intel/compiler/jay/jay_liveness.c
new file mode 100644
index 00000000000..ebe89f7504f
--- /dev/null
+++ b/src/intel/compiler/jay/jay_liveness.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/bitset.h"
+#include "util/macros.h"
+#include "util/sparse_bitset.h"
+#include "util/u_math.h"
+#include "util/u_worklist.h"
+#include "jay_ir.h"
+#include "jay_opcodes.h"
+#include "jay_private.h"
+
+/* LiveIn = GEN + (LiveOut - KILL) */
+static void
+update_liveness_for_inst(BITSET_WORD *dead_defs,
+ struct u_sparse_bitset *live_in,
+ jay_inst *I)
+{
+ /* No destination is live-in before the instruction, but any destination not
+ * live-in after is immediately dead.
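+ *
+ * A worked example: visiting "r2 = add r0, r1" in reverse with
+ * live = {r2, r3}, r2 is cleared (definitions are not live-in) and r0/r1
+ * become live, giving {r0, r1, r3}. Had r2 not been live, it would be
+ * recorded in dead_defs instead.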
+ */ + jay_foreach_dst_index(I, _, def) { + if (u_sparse_bitset_test(live_in, def)) { + u_sparse_bitset_clear(live_in, def); + } else { + BITSET_SET(dead_defs, def); + } + } + + if (I->op == JAY_OPCODE_PHI_SRC) { + /* Phi sources do not require last-use bits. */ + jay_foreach_src_index(I, src_idx, comp, index) { + u_sparse_bitset_set(live_in, index); + } + } else { + BITSET_ZERO(I->last_use); + unsigned last_use_i = 0; + + jay_foreach_src_index(I, s, comp, index) { + /* If the source is not live after this instruction, but becomes + * live at this instruction, this is the last use. + */ + if (!u_sparse_bitset_test(live_in, index)) { + assert(last_use_i < JAY_NUM_LAST_USE_BITS); + BITSET_SET(I->last_use, last_use_i); + } + + u_sparse_bitset_set(live_in, index); + ++last_use_i; + } + } +} + +/** + * Calculate liveness information for SSA values. + * + * This populates the jay_block::live_in/live_out bitsets and last_use flags. + */ +void +jay_compute_liveness(jay_function *f) +{ + u_worklist worklist; + u_worklist_init(&worklist, f->num_blocks, NULL); + + ralloc_free(f->dead_defs); + f->dead_defs = BITSET_RZALLOC(f, f->ssa_alloc); + + jay_foreach_block(f, block) { + u_sparse_bitset_free(&block->live_in); + u_sparse_bitset_free(&block->live_out); + + u_sparse_bitset_init(&block->live_in, f->ssa_alloc, block); + u_sparse_bitset_init(&block->live_out, f->ssa_alloc, block); + + jay_worklist_push_head(&worklist, block); + } + + while (!u_worklist_is_empty(&worklist)) { + /* Pop in reverse order since liveness is a backwards pass */ + jay_block *block = jay_worklist_pop_head(&worklist); + + /* Update its liveness information: + * 1. Assume everything liveout from this block was live_in + * 2. Clear live_in for anything defined in this block + */ + u_sparse_bitset_dup(&block->live_in, &block->live_out); + + jay_foreach_inst_in_block_rev(block, inst) { + update_liveness_for_inst(f->dead_defs, &block->live_in, inst); + } + + /* Propagate block->live_in[] to the live_out[] of predecessors. Since + * phis are split, they are handled naturally without special cases. + */ + jay_foreach_predecessor(block, p) { + if (u_sparse_bitset_merge(&(*p)->live_out, &block->live_in)) { + jay_worklist_push_tail(&worklist, *p); + } + } + } + +#ifndef NDEBUG + jay_block *first_block = jay_first_block(f); + jay_block *last_block = list_last_entry(&f->blocks, jay_block, link); + + assert(u_sparse_bitset_count(&first_block->live_in) == 0 && "invariant"); + assert(u_sparse_bitset_count(&last_block->live_out) == 0 && "invariant"); +#endif + + u_worklist_fini(&worklist); +} + +/* + * Calculate the register demand for each SSA file using the previously + * calculated liveness analysis. SSA makes this exact in linear-time. + */ +void +jay_calculate_register_demands(jay_function *func) +{ + enum jay_file *files = calloc(func->ssa_alloc, sizeof(enum jay_file)); + BITSET_WORD *killed = BITSET_CALLOC(func->ssa_alloc); + unsigned *max_demand = func->demand; + memset(max_demand, 0, sizeof(func->demand)); + + jay_foreach_inst_in_func(func, block, I) { + jay_foreach_dst_index(I, def, index) { + files[index] = def.file; + } + } + + jay_foreach_block(func, block) { + unsigned demands[JAY_NUM_SSA_FILES] = {}; + + /* Everything live-in. 
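+ * Each SSA index is a single scalar channel, so every set bit contributes
+ * one register to its file's demand.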
*/ + U_SPARSE_BITSET_FOREACH_SET(&block->live_in, i) { + ++demands[files[i]]; + } + + jay_foreach_ssa_file(f) { + max_demand[f] = MAX2(demands[f], max_demand[f]); + } + + jay_foreach_inst_in_block(block, I) { + /* We must have enough register file space for the register payload */ + if (I->op == JAY_OPCODE_PRELOAD) { + uint32_t max = jay_preload_reg(I) + jay_num_values(I->dst); + max_demand[I->dst.file] = MAX2(max_demand[I->dst.file], max); + } + + /* Collect source values to kill */ + jay_foreach_killed(I, s, c) { + BITSET_SET(killed, jay_channel(I->src[s], c)); + } + + /* Make destinations live */ + jay_foreach_dst(I, d) { + demands[d.file] += util_next_power_of_two(jay_num_values(d)); + } + + /* Update maximum demands */ + jay_foreach_ssa_file(f) { + max_demand[f] = MAX2(demands[f], max_demand[f]); + } + + /* Dead destinations are those written by the instruction but killed + * immediately after the instruction finishes. + */ + jay_foreach_dst_index(I, d, index) { + if (BITSET_TEST(func->dead_defs, index)) { + assert(demands[d.file] > 0); + --demands[d.file]; + } + } + + jay_foreach_dst(I, d) { + unsigned n = jay_num_values(d); + demands[d.file] -= util_next_power_of_two(n) - n; + } + + /* Late-kill sources */ + jay_foreach_killed(I, s, c) { + uint32_t index = jay_channel(I->src[s], c); + + if (BITSET_TEST(killed, index)) { + BITSET_CLEAR(killed, index); + + assert(demands[I->src[s].file] > 0); + --demands[I->src[s].file]; + } + } + + if (jay_debug & JAY_DBG_PRINTDEMAND) { + printf("(LA) [G:%u\tU:%u] ", demands[GPR], demands[UGPR]); + jay_print_inst(stdout, I); + } + } + } + + free(files); + free(killed); +} diff --git a/src/intel/compiler/jay/jay_lower_post_ra.c b/src/intel/compiler/jay/jay_lower_post_ra.c new file mode 100644 index 00000000000..db8661b011d --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_post_ra.c @@ -0,0 +1,153 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/macros.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * If default != dest, we need to lower. Predicated moves generalize as SEL, + * with default in src0 to allow for immediates. + * + * For anything else, we have to insert a copy. + */ +static void +lower_non_tied_default(jay_builder *b, jay_inst *I, jay_def default_) +{ + jay_def not_pred = jay_negate(*jay_inst_get_predicate(I)); + assert(default_.file != FLAG && "we don't support this"); + + if (I->op == JAY_OPCODE_MOV) { + jay_SEL(b, I->type, I->dst, default_, I->src[0], not_pred); + jay_remove_instruction(I); + } else { + jay_foreach_comp(I->dst, c) { + jay_def dst = jay_extract_post_ra(I->dst, c); + jay_def src = jay_extract_post_ra(default_, c); + + jay_add_predicate(b, jay_MOV(b, dst, src), not_pred); + } + } +} + +static inline jay_def +hi(jay_def x) +{ + x.hi = true; + return x; +} + +static bool +lower(jay_builder *b, jay_inst *I) +{ + switch (I->op) { + case JAY_OPCODE_PRELOAD: + case JAY_OPCODE_PHI_DST: + case JAY_OPCODE_INDETERMINATE: + /* Delete instructions that only exist for RA. Uninitialized register + * contents is a perfectly cromulent indeterminate value. 
+ */ + return true; + + case JAY_OPCODE_MOV: { + /* Delete trivial moves */ + if (jay_regs_equal(I->dst, I->src[0]) && !I->predication) + return true; + + if (I->dst.file == GPR && I->src[0].file == GPR) { + jay_def dst = I->dst, src = I->src[0], tmp4 = jay_bare_reg(GPR, 0); + enum jay_stride dst_stride = jay_def_stride(b->shader, dst); + enum jay_stride src_stride = jay_def_stride(b->shader, src); + assert(jay_def_stride(b->shader, tmp4) == JAY_STRIDE_4 && "ABI"); + + if (dst_stride == JAY_STRIDE_8 && src_stride == JAY_STRIDE_2) { + jay_MOV(b, dst, tmp4); + jay_MOV(b, tmp4, src)->type = JAY_TYPE_U16; + jay_MOV(b, hi(tmp4), hi(src))->type = JAY_TYPE_U16; + + jay_XOR(b, JAY_TYPE_U32, dst, dst, tmp4); + jay_XOR(b, JAY_TYPE_U32, tmp4, dst, tmp4); + jay_XOR(b, JAY_TYPE_U32, dst, dst, tmp4); + return true; + } else if (dst_stride == JAY_STRIDE_2 && src_stride == JAY_STRIDE_8) { + jay_MOV(b, dst, tmp4)->type = JAY_TYPE_U16; + jay_MOV(b, hi(dst), hi(tmp4))->type = JAY_TYPE_U16; + jay_MOV(b, tmp4, src); + + for (unsigned i = 0; i < 3; ++i) { + jay_XOR(b, JAY_TYPE_U16, i == 1 ? tmp4 : dst, dst, tmp4); + jay_XOR(b, JAY_TYPE_U16, i == 1 ? hi(tmp4) : hi(dst), hi(dst), + hi(tmp4)); + } + + return true; + } + + /* Lower 4B<-->2B copies. To pack the register file, RA + * sometimes inserts 32-bit copies involving 16-bit strided sources like + * "mov.u32 r4 <32-bit>, r50 <16-bit>". This cannot be implemented in a + * single hardware instruction, so we split into two 16-bit copies. + */ + enum jay_stride min_stride = MIN2(dst_stride, src_stride); + unsigned stride_sz = jay_stride_to_bits(min_stride); + unsigned type_sz = jay_type_size_bits(I->type); + + if (stride_sz < type_sz) { + assert(stride_sz == 16 && type_sz == 32 && "no other case hit"); + I->type = JAY_TYPE_U16; + jay_MOV(b, hi(dst), hi(src))->type = JAY_TYPE_U16; + } + } + + return false; + } + + case JAY_OPCODE_SWAP: { + jay_def x = I->src[0], y = I->src[1]; + /* TODO: Need stride-aware lowering here too like MOV. Same ideas. */ + if (jay_def_stride(b->shader, x) != jay_def_stride(b->shader, y)) + UNREACHABLE("todo"); + + jay_XOR(b, JAY_TYPE_U32, x, y, x); + jay_XOR(b, JAY_TYPE_U32, y, x, y); + jay_XOR(b, JAY_TYPE_U32, x, y, x); + return true; + } + + case JAY_OPCODE_ZERO_FLAG: { + jay_MOV(b, jay_bare_reg(FLAG, jay_zero_flag_reg(I)), 0)->type = + JAY_TYPE_U32; + return true; + } + + default: + return false; + } +} + +void +jay_lower_post_ra(jay_shader *s) +{ + jay_foreach_inst_in_shader_safe(s, func, I) { + jay_builder b = jay_init_builder(func, jay_before_inst(I)); + + if (jay_inst_has_default(I)) { + if (!jay_regs_equal(I->dst, *jay_inst_get_default(I))) { + lower_non_tied_default(&b, I, *jay_inst_get_default(I)); + } + + /* Now just drop the default source */ + jay_shrink_sources(I, I->num_srcs - 1); + I->predication = JAY_PREDICATED; + } + + if (lower(&b, I)) { + jay_remove_instruction(I); + } + } +} diff --git a/src/intel/compiler/jay/jay_lower_pre_ra.c b/src/intel/compiler/jay/jay_lower_pre_ra.c new file mode 100644 index 00000000000..d71ea7c3711 --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_pre_ra.c @@ -0,0 +1,200 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/bitscan.h" +#include "util/hash_table.h" +#include "util/lut.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * Register allocation operates only on power-of-two vectors. 
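+ * (For example, a 3-component def is collected into a 4-component one
+ * whose extra channel is null.)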
+ * Pad out non-power-of-two vectors with null values to simplify RA.
+ */
+static jay_def
+lower_npot_vector(jay_builder *b, jay_def x)
+{
+ unsigned n = jay_num_values(x);
+
+ if (!util_is_power_of_two_or_zero(n)) {
+ uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
+
+ for (unsigned i = 0; i < n; ++i) {
+ indices[i] = jay_channel(x, i);
+ }
+
+ x = jay_collect(b, x.file, indices, util_next_power_of_two(n));
+ }
+
+ assert(util_is_power_of_two_or_zero(jay_num_values(x)) && "post-cond");
+ return x;
+}
+
+/**
+ * Vectors need to be allocated to contiguous registers. Furthermore, we
+ * require power-of-two sizes in certain cases; that's handled here too.
+ *
+ * This means that a value cannot appear in multiple channels of an
+ * instruction, as register allocation would need to assign the same value
+ * to two different locations at once. Scalars don't have this restriction,
+ * except for SENDs because the hardware bans repeated sources.
+ *
+ * If a value appears in multiple positions, we emit copies so that each
+ * copy can be register allocated in the correct position.
+ */
+static void
+lower_contiguous_sources(jay_builder *b, jay_inst *I)
+{
+ b->cursor = jay_before_inst(I);
+ uint32_t seen[JAY_MAX_DEF_LENGTH], nr_seen = 0;
+
+ jay_foreach_src(I, s) {
+ if (jay_num_values(I->src[s]) > 1 || I->op == JAY_OPCODE_SEND) {
+ jay_foreach_index(I->src[s], c, index) {
+ /* Search for the index */
+ unsigned i;
+ for (i = 0; i < nr_seen && seen[i] != index; ++i) {
+ }
+
+ if (i == nr_seen) {
+ /* Record a new index */
+ assert(nr_seen < ARRAY_SIZE(seen));
+ seen[nr_seen++] = index;
+ } else {
+ /* Insert a copy to access a duplicated index */
+ jay_def copy = jay_alloc_def(b, I->src[s].file, 1);
+ jay_MOV(b, copy, jay_extract(I->src[s], c));
+ jay_insert_channel(b, &I->src[s], c, copy);
+ }
+ }
+
+ jay_replace_src(&I->src[s], lower_npot_vector(b, I->src[s]));
+ }
+ }
}
+
+static jay_def
+lower_imm_to_ugpr(jay_builder *b,
+ jay_inst *I,
+ unsigned s,
+ struct hash_table_u64 *constants)
+{
+ /* Although only 32-bit constants are supported, 64-bit constants are
+ * separate in the key since they must be zero-extended. We could optimize
+ * this but it doesn't really matter.
+ */
+ uint32_t imm = jay_as_uint(I->src[s]);
+ bool is_64bit = jay_type_size_bits(jay_src_type(I, s)) == 64;
+ uint64_t key = imm | (is_64bit ? BITFIELD64_BIT(32) : 0);
+
+ jay_inst *mov = _mesa_hash_table_u64_search(constants, key);
+ if (mov)
+ return mov->dst;
+
+ /* Try to use source modifiers to reuse a constant if we can */
+ if (jay_src_type(I, s) == JAY_TYPE_F32 && jay_has_src_mods(I, s)) {
+ mov = _mesa_hash_table_u64_search(constants, fui(-uif(imm)));
+ if (mov)
+ return jay_negate(mov->dst);
+ }
+
+ /* If this is a new constant, insert a move and cache it. Currently, we pool
+ * constants per-function. Inserting everything at the start guarantees that
+ * these moves dominate all their uses, although it hurts register pressure.
+ * The spiller should rematerialize constants where necessary to ensure we
+ * don't lose the wave, but we could still probably optimize this.
+ */
+ jay_def x = jay_alloc_def(b, UGPR, is_64bit ? 2 : 1);
+ b->cursor = jay_before_function(b->func);
+ _mesa_hash_table_u64_insert(constants, key, jay_MOV(b, x, imm));
+ return x;
+}
+
+static bool
+try_swap_src01(jay_inst *I)
+{
+ if (I->op == JAY_OPCODE_SEL) {
+ /* sel(a, b, p) = sel(b, a, !p) */
+ I->src[2].negate ^= true;
+ } else if (I->op == JAY_OPCODE_CMP) {
+ I->conditional_mod = jay_conditional_mod_swap_sources(I->conditional_mod);
+ } else if (I->op == JAY_OPCODE_BFN) {
+ jay_set_bfn_ctrl(I, util_lut3_swap_sources(jay_bfn_ctrl(I), 0, 1));
+ } else if (!jay_opcode_infos[I->op]._2src_commutative) {
+ /* Commutative ops can swap freely, but otherwise we give up */
+ return false;
+ }
+
+ SWAP(I->src[0], I->src[1]);
+ return true;
+}
+
+/*
+ * Instructions can only encode immediates in certain positions. Lower
+ * immediates to moves where necessary.
+ */
+static void
+lower_immediates(jay_builder *b, jay_inst *I, struct hash_table_u64 *constants)
+{
+ /* Canonicalize compare-with-zero to increase freedom */
+ if (I->op == JAY_OPCODE_CMP &&
+ jay_is_zero(I->src[1]) &&
+ jay_is_null(I->dst) &&
+ I->type == JAY_TYPE_U32) {
+
+ assert(!jay_is_null(I->cond_flag) && !I->predication);
+ I->op = JAY_OPCODE_MOV;
+ jay_shrink_sources(I, 1);
+ }
+
+ /* One source supports immediates but the other does not, so swap. */
+ unsigned other = I->op == JAY_OPCODE_BFN ? 1 : 0;
+ if (jay_is_imm(I->src[other]) &&
+ !_mesa_hash_table_u64_search(constants, jay_as_uint(I->src[other]))) {
+
+ try_swap_src01(I);
+ }
+
+ /* Immediates are allowed only in certain cases; lower the rest */
+ jay_foreach_src(I, s) {
+ if (jay_is_imm(I->src[s])) {
+ uint32_t imm = jay_as_uint(I->src[s]);
+
+ bool last = s == (jay_num_isa_srcs(I) - 1);
+ bool allowed = s < 2 && (last || I->op == JAY_OPCODE_SEND);
+ allowed |= (I->op == JAY_OPCODE_BFN && s == 0 && imm < UINT16_MAX);
+
+ if (!allowed) {
+ I->src[s] = lower_imm_to_ugpr(b, I, s, constants);
+ }
+ }
+ }
+}
+
+void
+jay_lower_pre_ra(jay_shader *s)
+{
+ struct hash_table_u64 *constants = _mesa_hash_table_u64_create(NULL);
+
+ jay_foreach_function(s, f) {
+ /* Pool constants per function. */
+ _mesa_hash_table_u64_clear(constants);
+
+ jay_foreach_inst_in_func(f, block, I) {
+ jay_builder b = { .shader = s, .func = f };
+
+ /* lower_immediates must be last since it consumes I */
+ lower_contiguous_sources(&b, I);
+ lower_immediates(&b, I, constants);
+ }
+ }
+
+ _mesa_hash_table_u64_destroy(constants);
+}
diff --git a/src/intel/compiler/jay/jay_lower_scoreboard.c b/src/intel/compiler/jay/jay_lower_scoreboard.c
new file mode 100644
index 00000000000..305dfff57ba
--- /dev/null
+++ b/src/intel/compiler/jay/jay_lower_scoreboard.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdlib.h>
+#include "compiler/brw/brw_eu_defines.h"
+#include "util/bitset.h"
+#include "util/macros.h"
+#include "jay_builder.h"
+#include "jay_ir.h"
+#include "jay_opcodes.h"
+#include "jay_private.h"
+
+/* TODO: Shrink */
+#define MAX_KEYS (2 * JAY_NUM_UGPR)
+#define NUM_TOKENS (16)
+
+/** SEND scoreboarding */
+struct gpr_range {
+ unsigned base, width;
+};
+
+static inline struct gpr_range
+def_to_gpr(jay_function *func, jay_inst *I, jay_def x)
+{
+ if (x.file == GPR || x.file == UGPR) {
+ unsigned base = x.file == UGPR ? func->shader->num_regs[GPR] : 0;
+ return (struct gpr_range) { base + x.reg, jay_num_values(x) };
+ } else {
+ return (struct gpr_range) { 0, 0 };
+ }
+}
+
+static inline void
+sync_sbid(jay_function *func, jay_inst *I, uint32_t *busy, unsigned sbid)
+{
+ jay_builder b = jay_init_builder(func, jay_before_inst(I));
+ jay_SYNC(&b, TGL_SYNC_NOP)->dep = tgl_swsb_sbid(TGL_SBID_DST, sbid);
+ *busy &= ~BITFIELD_BIT(sbid);
+}
+
+static void
+lower_send_local(jay_function *func, jay_block *block)
+{
+ struct {
+ BITSET_DECLARE(reading, MAX_KEYS);
+ BITSET_DECLARE(writing, MAX_KEYS);
+ } tokens[NUM_TOKENS];
+
+ uint32_t busy = 0;
+ unsigned roundrobin = 0;
+
+ jay_foreach_inst_in_block_safe(block, I) {
+ /* Read-after-write */
+ jay_foreach_src(I, s) {
+ struct gpr_range src = def_to_gpr(func, I, I->src[s]);
+
+ u_foreach_bit(sbid, busy) {
+ if (BITSET_TEST_COUNT(tokens[sbid].writing, src.base, src.width)) {
+ sync_sbid(func, I, &busy, sbid);
+ }
+ }
+ }
+
+ /* Write-after-write & write-after-read */
+ jay_foreach_dst(I, d) {
+ struct gpr_range dst = def_to_gpr(func, I, d);
+
+ u_foreach_bit(sbid, busy) {
+ if (BITSET_TEST_COUNT(tokens[sbid].reading, dst.base, dst.width) ||
+ BITSET_TEST_COUNT(tokens[sbid].writing, dst.base, dst.width)) {
+ sync_sbid(func, I, &busy, sbid);
+ }
+ }
+ }
+
+ if (I->op == JAY_OPCODE_SEND && !jay_send_eot(I)) {
+ unsigned sbid = (roundrobin++) % NUM_TOKENS;
+ jay_set_send_sbid(I, sbid);
+
+ if (!(busy & BITFIELD_BIT(sbid))) {
+ busy |= BITFIELD_BIT(sbid);
+ BITSET_ZERO(tokens[sbid].writing);
+ BITSET_ZERO(tokens[sbid].reading);
+ }
+
+ struct gpr_range dst = def_to_gpr(func, I, I->dst);
+ BITSET_SET_COUNT(tokens[sbid].writing, dst.base, dst.width);
+
+ jay_foreach_src(I, s) {
+ struct gpr_range src = def_to_gpr(func, I, I->src[s]);
+ BITSET_SET_COUNT(tokens[sbid].reading, src.base, src.width);
+ }
+ }
+ }
+
+ /* Sync on block boundaries. */
+ if (block != jay_last_block(func)) {
+ jay_builder b = jay_init_builder(func, jay_before_jump(block));
+
+ u_foreach_bit(sbid, busy) {
+ jay_SYNC(&b, TGL_SYNC_NOP)->dep = tgl_swsb_sbid(TGL_SBID_DST, sbid);
+ }
+ }
+}
+
+/**
+ * Regdist scoreboarding
+ *
+ * Register access is tracked per pipe; slot 0 (NONE) holds data on the last
+ * writer, packed into a u32 with the following macros.
+ */
+#define make_writer(pipe, ip) (((uint32_t) ip << 3) | (uint32_t) (pipe))
+#define writer_ip(writer) (writer >> 3)
+#define writer_pipe(writer) (enum tgl_pipe)(writer & BITFIELD_MASK(3))
+
+#define TGL_NUM_PIPES (TGL_PIPE_ALL)
+typedef uint32_t u32_per_pipe[TGL_NUM_PIPES];
+
+struct swsb_state {
+ unsigned ip[TGL_NUM_PIPES];
+ unsigned last_shape[TGL_NUM_PIPES];
+
+ /* finished_ip[X][Y] = ip means from the perspective of pipe X, ip on pipe Y
+ * has already been waited on.
+ */
+ unsigned finished_ip[TGL_NUM_PIPES][TGL_NUM_PIPES];
+ u32_per_pipe *access;
+};
+
+static enum tgl_pipe
+inst_exec_pipe(const struct intel_device_info *devinfo, jay_inst *I)
+{
+ if (I->op == JAY_OPCODE_SEND || jay_op_is_control_flow(I->op) /* XXX */) {
+ return TGL_PIPE_NONE;
+ } else if (I->op == JAY_OPCODE_MATH) {
+ return TGL_PIPE_MATH;
+ } else if (I->type == JAY_TYPE_F64) {
+ return TGL_PIPE_LONG;
+ } else if (jay_type_is_any_float(I->type)) {
+ return TGL_PIPE_FLOAT;
+ } else {
+ return TGL_PIPE_INT;
+ }
+}
+
+/**
+ * Return the RegDist pipeline the hardware will synchronize with if no
+ * pipeline information is provided in the SWSB annotation of an
+ * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
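+ *
+ * For example: SENDs are unordered and report NONE; an instruction with an
+ * F64 source reports LONG (or NONE on platforms where FP64 runs on the
+ * MATH pipe); any integer source reports INT; otherwise FLOAT.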
+ */ +static enum tgl_pipe +inferred_sync_pipe(const struct intel_device_info *devinfo, const jay_inst *I) +{ + bool has_int_src = false, has_long_src = false; + + if (devinfo->verx10 >= 125) { + jay_foreach_src(I, s) { + has_int_src |= !jay_type_is_any_float(jay_src_type(I, s)); + has_long_src |= jay_src_type(I, s) == JAY_TYPE_F64; + } + + /* Avoid emitting (RegDist, SWSB) annotations for long instructions on + * platforms where they are unordered as they may not be allowed. + */ + if (devinfo->has_64bit_float_via_math_pipe && has_long_src) + return TGL_PIPE_NONE; + } + + return I->op == JAY_OPCODE_SEND ? TGL_PIPE_NONE : + has_long_src ? TGL_PIPE_LONG : + has_int_src ? TGL_PIPE_INT : + TGL_PIPE_FLOAT; +} + +static void +depend_on_writer(struct swsb_state *state, struct gpr_range r, unsigned *dep) +{ + for (unsigned i = 0; i < r.width; ++i) { + uint32_t w = state->access[r.base + i][0]; + dep[writer_pipe(w)] = MAX2(dep[writer_pipe(w)], writer_ip(w)); + } +} + +#define jay_foreach_pipe(pipe) \ + for (unsigned pipe = 1; pipe < TGL_NUM_PIPES; ++pipe) + +static void +lower_regdist_local(jay_function *func, jay_block *block, u32_per_pipe *access) +{ + struct swsb_state state = { .access = access }; + jay_inst *last_sync = NULL; + bool need_deswizzle_wait = false; + + jay_foreach_inst_in_block_safe(block, I) { + enum tgl_pipe exec_pipe = inst_exec_pipe(func->shader->devinfo, I); + unsigned dep[TGL_NUM_PIPES] = { 0 }; + if (I->op == JAY_OPCODE_SYNC) { + last_sync = I; + continue; + } else if (I->op == JAY_OPCODE_DESWIZZLE_16) { + need_deswizzle_wait = true; + state.ip[TGL_PIPE_INT]++; + continue; + } + + /* Force a wait on the deswizzles at the start of the program. XXX: Is + * there a cleaner way to deal with this? + */ + if (need_deswizzle_wait) { + dep[TGL_PIPE_INT] = state.ip[TGL_PIPE_INT]; + need_deswizzle_wait = false; + } + + /* Write-after-{write, read} */ + jay_foreach_dst(I, def) { + struct gpr_range r = def_to_gpr(func, I, def); + depend_on_writer(&state, r, dep); + + for (unsigned i = 0; i < r.width; ++i) { + jay_foreach_pipe(p) { + dep[p] = MAX2(dep[p], state.access[r.base + i][p]); + } + } + } + + /* Read-after-write */ + jay_foreach_src(I, s) { + depend_on_writer(&state, def_to_gpr(func, I, I->src[s]), dep); + } + + unsigned nr_waits = 0; + unsigned last_pipe = TGL_PIPE_NONE; + + /* If dependency P implies dependency Q, drop dependency Q to avoid + * unnecessary annotations. + */ + jay_foreach_pipe(p) { + if (dep[p]) { + jay_foreach_pipe(q) { + if (dep[q] && state.finished_ip[p][q] >= dep[q]) { + dep[q] = 0; + } + } + } + } + + unsigned min_delta = 7; + jay_foreach_pipe(p) { + if (dep[p] && (exec_pipe == TGL_PIPE_NONE /* TODO: Sends */ || + dep[p] > state.finished_ip[exec_pipe][p])) { + unsigned delta = state.ip[p] - dep[p] + 1; + min_delta = MIN2(min_delta, delta); + state.finished_ip[exec_pipe][p] = dep[p]; + nr_waits++; + last_pipe = p; + } + } + + /* If we're SIMD split the same way as our dependency, we can relax the + * dependency to have each half wait in parallel. We could do even better + * with more tracking but this should be good enough for now. 
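+ *
+ * Concretely, for a 2-way split of macro length 1 instructions, min_delta
+ * grows from 1 to 1 + (2 - 1) * 1 = 2, so each physical half waits on the
+ * matching half of its producer instead of serializing on the last half.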
+ */ + unsigned simd_split = jay_simd_split(func->shader, I); + unsigned shape = ((simd_split << 2) | jay_macro_length(I)) + 1; + bool same_shape = state.last_shape[last_pipe] == shape; + + if (simd_split && same_shape && nr_waits == 1 && min_delta == 1) { + min_delta += ((1 << simd_split) - 1) * jay_macro_length(I); + I->replicate_dep = true; + I->decrement_dep = last_pipe != exec_pipe; + } + + bool has_sbid = I->op == JAY_OPCODE_SEND && !jay_send_eot(I); + I->dep = (struct tgl_swsb) { + .sbid = has_sbid ? jay_send_sbid(I) : 0, + .mode = has_sbid ? TGL_SBID_SET : TGL_SBID_NULL, + .regdist = nr_waits ? min_delta : 0, + .pipe = nr_waits == 1 && (!has_sbid || + last_pipe == TGL_PIPE_FLOAT || + last_pipe == TGL_PIPE_INT) ? + last_pipe : + TGL_PIPE_ALL, + }; + + /* Fold the immediate preceding SYNC.nop into this instruction, allowing + * us to wait on both ALU and a SEND in the same annotation. + */ + if (last_sync && + jay_sync_op(last_sync) == TGL_SYNC_NOP && + I->dep.mode == TGL_SBID_NULL && + (I->dep.regdist == 0 || + inferred_sync_pipe(func->shader->devinfo, I) == I->dep.pipe)) { + + assert(last_sync->dep.regdist == 0); + assert(last_sync->dep.pipe == TGL_PIPE_NONE); + + I->dep.mode = last_sync->dep.mode; + I->dep.sbid = last_sync->dep.sbid; + + jay_remove_instruction(last_sync); + } + + if (exec_pipe != TGL_PIPE_NONE) { + /* Advance the IP by the number of physical instructions emitted */ + state.ip[exec_pipe] += + jay_macro_length(I) << jay_simd_split(func->shader, I); + + struct gpr_range r = def_to_gpr(func, I, I->dst); + uint32_t now = make_writer(exec_pipe, state.ip[exec_pipe]); + + for (unsigned i = 0; i < r.width; ++i) { + state.access[r.base + i][0] = now; + } + + jay_foreach_src(I, s) { + struct gpr_range r = def_to_gpr(func, I, I->src[s]); + for (unsigned i = 0; i < r.width; ++i) { + state.access[r.base + i][exec_pipe] = state.ip[exec_pipe]; + } + } + + state.last_shape[exec_pipe] = shape; + } + + last_sync = NULL; + } + + /* Sync on block boundaries. */ + jay_inst *first = jay_first_inst(block); + if (block != jay_first_block(func) && first && first->op != JAY_OPCODE_SEND) { + first->dep = tgl_swsb_regdist(1); + } +} + +/* + * Trivial scoreboard lowering pass for debugging use. Stalls after every + * instruction and assigns SBID zero to all messages. 
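+ *
+ * Selected with the JAY_DBG_SYNC debug flag; useful for ruling the real
+ * scoreboard pass in or out when debugging corruption or hangs.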
+ */ +static void +lower_trivial(jay_function *func) +{ + jay_foreach_inst_in_func_safe(func, block, I) { + if (I->op == JAY_OPCODE_SEND && !jay_send_eot(I)) { + I->dep = tgl_swsb_dst_dep(tgl_swsb_sbid(TGL_SBID_SET, 0), 1); + + jay_builder b = jay_init_builder(func, jay_after_inst(I)); + jay_SYNC(&b, TGL_SYNC_NOP)->dep = tgl_swsb_sbid(TGL_SBID_DST, 0); + } else { + I->dep = tgl_swsb_regdist(1); + } + } +} + +void +jay_lower_scoreboard(jay_shader *s) +{ + uint32_t nr_keys = s->num_regs[GPR] + s->num_regs[UGPR]; + assert(nr_keys <= MAX_KEYS && "SENDs use uninitialized stack allocation"); + u32_per_pipe *access = malloc(sizeof(*access) * nr_keys); + + jay_foreach_function(s, func) { + if (jay_debug & JAY_DBG_SYNC) { + lower_trivial(func); + } else { + jay_foreach_block(func, block) { + memset(access, 0, sizeof(*access) * nr_keys); + lower_send_local(func, block); + lower_regdist_local(func, block, access); + } + } + } + + free(access); +} diff --git a/src/intel/compiler/jay/jay_lower_spill.c b/src/intel/compiler/jay/jay_lower_spill.c new file mode 100644 index 00000000000..21fbac1777e --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_spill.c @@ -0,0 +1,156 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/brw/brw_eu_defines.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* We reserve an address register for spilling by ABI */ +#define ADDRESS_REG jay_bare_reg(J_ADDRESS, 2) + +static void +insert_spill_fill(jay_builder *b, + jay_def mem, + jay_def gpr, + jay_def sp, + bool load, + unsigned *sp_delta_B, + unsigned umem_base) +{ + assert(jay_is_mem(mem) && !jay_is_mem(gpr)); + + bool uniform = mem.file == UMEM; + unsigned offs_B = mem.reg * 4; + unsigned mem_reg_B = + uniform ? (umem_base + offs_B) : (offs_B * b->shader->dispatch_width); + + /* The stack pointer needs to be offset to the desired offset */ + signed sp_adjust_B = mem_reg_B - (*sp_delta_B); + if (sp_adjust_B) { + jay_ADD(b, JAY_TYPE_U32, sp, sp, sp_adjust_B); + *sp_delta_B = mem_reg_B; + } + + const struct intel_device_info *devinfo = b->shader->devinfo; + unsigned cache = load ? LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS) : + LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS); + uint32_t desc = lsc_msg_desc(devinfo, load ? LSC_OP_LOAD : LSC_OP_STORE, + LSC_ADDR_SURFTYPE_SS, LSC_ADDR_SIZE_A32, + LSC_DATA_SIZE_D32, 1, uniform, cache); + if (uniform) { + sp.num_values_m1 = 0; + } + + jay_def srcs[] = { sp, gpr }; + + jay_SEND(b, .sfid = BRW_SFID_UGM, .msg_desc = desc, .srcs = srcs, + .nr_srcs = load ? 1 : 2, .dst = load ? gpr : jay_null(), + .type = JAY_TYPE_U32, .uniform = uniform, .ex_desc = ADDRESS_REG); +} + +void +jay_lower_spill(jay_function *func) +{ + jay_builder b = jay_init_builder(func, jay_before_function(func)); + + /* We reserve the top UGPRs for spilling by ABI */ + unsigned ugpr_reservation = func->shader->num_regs[UGPR]; + assert(util_is_aligned(ugpr_reservation + 1, func->shader->dispatch_width)); + + jay_def surf = jay_bare_reg(UGPR, ugpr_reservation); + jay_def sp = jay_bare_reg(UGPR, ugpr_reservation + 1); + sp.num_values_m1 = func->shader->dispatch_width - 1; + + /* Calculate how much stack space we need */ + unsigned nr_mem = 0, nr_umem = 0; + jay_foreach_inst_in_func(func, block, I) { + if (I->op == JAY_OPCODE_MOV && jay_is_send_like(I)) { + jay_def mem = jay_is_mem(I->dst) ? I->dst : I->src[0]; + unsigned *nr = mem.file == UMEM ? 
&nr_umem : &nr_mem; + + *nr = MAX2(*nr, mem.reg + 1); + } + } + + assert((nr_umem > 0) || (nr_mem > 0)); + unsigned umem_base = (func->shader->dispatch_width * nr_mem * 4); + + /* We burn the address & stack pointer registers for all spills/fills in a + * shader. Preinitialize at the top using a scratch register. + * + * TODO: Need ABI for multi-function. + */ + assert(func->is_entrypoint); + jay_AND(&b, JAY_TYPE_U32, surf, jay_bare_reg(UGPR, 5), ~BITFIELD_MASK(10)); + jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, surf, 4); + + /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */ + jay_def tmp2 = jay_bare_reg(GPR, func->shader->partition.base2); + jay_LANE_ID_8(&b, tmp2); + for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) { + jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i); + } + + jay_SHL(&b, JAY_TYPE_U16, tmp2, tmp2, util_logbase2(4)); + jay_CVT(&b, JAY_TYPE_U32, sp, tmp2, JAY_TYPE_U16, JAY_ROUND, 0); + if (b.shader->scratch_size) { + jay_ADD(&b, JAY_TYPE_U32, sp, sp, b.shader->scratch_size); + } + + jay_foreach_block(func, block) { + /* We offset the stack pointer locally within a block to form offsets. By + * contract keep it in its canonical (unoffset) form at block boundaries. + */ + unsigned sp_delta_B = 0; + bool address_valid = true; + + jay_foreach_inst_in_block_safe(block, I) { + b.cursor = jay_before_inst(I); + + if (I->op == JAY_OPCODE_MOV && jay_is_send_like(I)) { + if (!address_valid) { + jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, surf, 4); + address_valid = true; + } + + if (jay_is_mem(I->dst)) { + insert_spill_fill(&b, I->dst, I->src[0], sp, false, &sp_delta_B, + umem_base); + func->shader->spills++; + } else { + insert_spill_fill(&b, I->src[0], I->dst, sp, true, &sp_delta_B, + umem_base); + func->shader->fills++; + } + + jay_remove_instruction(I); + } else if (I->op == JAY_OPCODE_SHUFFLE) { + /* Shuffles implicitly clobber the address register so we'll need to + * rematerialize the surface state (but be lazy). + */ + address_valid = false; + } + } + + /* Canonicalize our internal registers at block boundaries */ + if (jay_num_successors(block) > 0) { + if (!address_valid) { + jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, surf, 4); + } + + if (sp_delta_B > 0) { + jay_ADD(&b, JAY_TYPE_U32, sp, sp, -sp_delta_B); + } + } + } + + /* Note this is bogus with recursion, but recursion is not supported on any + * current graphics/compute API. + */ + func->shader->scratch_size += umem_base + (nr_umem * 4); +} diff --git a/src/intel/compiler/jay/jay_nir_algebraic.py b/src/intel/compiler/jay/jay_nir_algebraic.py new file mode 100644 index 00000000000..209f9585172 --- /dev/null +++ b/src/intel/compiler/jay/jay_nir_algebraic.py @@ -0,0 +1,95 @@ +# Copyright 2024 Intel Corporation +# SPDX-License-Identifier: MIT + +import argparse +import sys +from math import pi + +a = 'a' +b = 'b' +c = 'c' + +lower_fsign = [ + (('fsign', a), ('bcsel', ('!flt', 0, a), +1.0, + ('bcsel', ('!flt', a, 0), -1.0, 0.0))), + (('fceil', a), ('fneg', ('ffloor', ('fneg', a)))), + + # inot is free on and/or/xor sources but not dests. Apply De Morgan's. 
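+ # For example, inot(iand(inot(a), b)) becomes ior(a, inot(b)), leaving the
+ # one remaining inot on a source where it is free.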
+ (('inot', ('iand(is_used_once)', ('inot', a), b)), ('ior', a, ('inot', b))),
+ (('inot', ('ior(is_used_once)', ('inot', a), b)), ('iand', a, ('inot', b))),
+ (('inot', ('ixor(is_used_once)', ('inot', a), b)), ('ixor', a, b)),
+ (('inot', ('iand(is_used_once)', a, b)), ('ior', ('inot', a), ('inot', b))),
+ (('inot', ('ior(is_used_once)', a, b)), ('iand', ('inot', a), ('inot', b))),
+ (('inot', ('ixor(is_used_once)', a, b)), ('ixor', ('inot', a), b)),
+
+ # Remove the zeroing. Down-conversion is free but extracts are not.
+ (('u2f32', ('extract_u8', a, 0)), ('u2f32', ('u2u8', a))),
+ (('u2f32', ('extract_u16', a, 0)), ('u2f32', ('u2u16', a))),
+ (('i2f32', ('extract_i8', a, 0)), ('i2f32', ('i2i8', a))),
+ (('i2f32', ('extract_i16', a, 0)), ('i2f32', ('i2i16', a))),
+
+ (('pack_half_2x16_split', a, b),
+ ('pack_32_2x16_split', ('f2f16', a), ('f2f16', b))),
+
+ # Allows us to use more modifiers
+ (('bcsel', a, ('iadd(is_used_once)', b, c), b),
+ ('iadd', ('bcsel', a, c, 0), b)),
+]
+
+
+lower_bool = [
+ # Try to use conditional modifiers more
+ (('ieq', ('iand(is_used_once)', a, b), b),
+ ('ieq', ('iand', ('inot', a), b), 0)),
+ (('ine', ('iand(is_used_once)', a, b), b),
+ ('ine', ('iand', ('inot', a), b), 0)),
+]
+
+for T, sizes, one in [('f', [16, 32], 1.0),
+ ('i', [8, 16, 32], 1),
+ ('b', [8, 16, 32], -1)]:
+ for sz in sizes:
+ if T in ['f', 'i']:
+ lower_bool.extend([
+ ((f'{T}neg', (f'b2{T}{sz}', ('inot', 'a@1'))),
+ ('bcsel', a, 0, -one)),
+ ((f'{T}neg', (f'b2{T}{sz}', 'a@1')), ('bcsel', a, -one, 0)),
+ ])
+
+ lower_bool.extend([
+ ((f'b2{T}{sz}', ('inot', 'a@1')), ('bcsel', a, 0, one)),
+ ((f'b2{T}{sz}', 'a@1'), ('bcsel', a, one, 0)),
+ ])
+
+lower_bool.extend([
+ (('b2i64', 'a@1'), ('pack_64_2x32_split', ('bcsel', a, 1, 0), 0)),
+])
+
+opt_sel_zero = [
+ (('bcsel@32', a, 0, 1), ('iadd', ('bcsel', a, 0xffffffff, 0), 1)),
+ (('bcsel@32', a, 1, 0), ('ineg', ('bcsel', a, 0xffffffff, 0))),
+]
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-p', '--import-path', required=True)
+ parser.add_argument('output')
+ args = parser.parse_args()
+
+ sys.path.insert(0, args.import_path)
+ import nir_algebraic # pylint: disable=import-error
+
+ with open(args.output, 'w', encoding='utf-8') as f:
+ f.write('#include "jay_private.h"\n')
+
+ f.write(nir_algebraic.AlgebraicPass(
+ "jay_nir_lower_fsign", lower_fsign).render())
+ f.write(nir_algebraic.AlgebraicPass(
+ "jay_nir_lower_bool", lower_bool).render())
+ f.write(nir_algebraic.AlgebraicPass(
+ "jay_nir_opt_sel_zero", opt_sel_zero).render())
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py
new file mode 100644
index 00000000000..928d1e90b04
--- /dev/null
+++ b/src/intel/compiler/jay/jay_opcodes.py
@@ -0,0 +1,233 @@
+# Copyright 2026 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+from typing import TYPE_CHECKING
+from dataclasses import dataclass
+import argparse
+import enum
+import sys
+
+from mako import exceptions
+from mako.template import Template
+
+if TYPE_CHECKING:
+ from collections.abc import Mapping
+
+
+@dataclass
+class Opcode:
+ name: str
+ has_dest: bool
+ num_srcs: int
+ types: list[str]
+ negate: int
+ sat: bool
+ cmod: bool
+ side_effects: bool
+ _2src_commutative: bool
+ extra_struct: list[tuple[str, str]]
+
+
+@enum.unique
+class Props(enum.IntEnum):
+ NEGATE0 = 1 << 0
+ NEGATE1 = 1 << 1
+ NEGATE2 = 1 << 2
+ NEGATE3 = 1 << 3
+ SAT = 1 << 4
+ CMOD = 1 << 5
+ SIDE_EFFECTS = 1 << 6
+ COMMUTATIVE = 1 << 7
+ NO_DEST_ = 1 << 8
+ NEGATE = NEGATE0 | NEGATE1 | NEGATE2 | NEGATE3
+ NO_DEST = SIDE_EFFECTS |
NO_DEST_ + + +_opcodes: dict[str, Opcode] = {} + + +def op(name: str, num_srcs: int, types: str | None = None, + props: int = 0, extra_struct: str | list[str] | None = None) -> None: + types_ = types.split(' ') if types else ['untyped'] + + # We can always negate the predicate. + negate_mask = (props & Props.NEGATE) | (1 << num_srcs) + + if extra_struct is not None: + extra_struct_ = [(' '.join(x.split(' ')[0:-1]), x.split(' ')[-1]) + for x in extra_struct] + else: + extra_struct_ = [] + + _opcodes[name] = Opcode(name, not bool(props & Props.NO_DEST_), + num_srcs, types_, negate_mask, + bool(props & Props.SAT), bool(props & Props.CMOD), + bool(props & Props.SIDE_EFFECTS), + bool(props & Props.COMMUTATIVE), + extra_struct_) + + +op('and', 2, 'u1 u16 u32', Props.NEGATE | Props.CMOD | Props.COMMUTATIVE) +op('or', 2, 'u1 u16 u32', Props.NEGATE | Props.CMOD | Props.COMMUTATIVE) +op('xor', 2, 'u1 u16 u32', Props.NEGATE | Props.CMOD | Props.COMMUTATIVE) + +op('add', 2, 'u32 s32 u64 s64 f32 f64 f16 bf16 u16 s16', + Props.SAT | Props.CMOD | Props.COMMUTATIVE | Props.NEGATE) +op('add3', 3, 'u32 s32 u64 s64 u16 s16', Props.SAT | + Props.CMOD | Props.COMMUTATIVE | Props.NEGATE) +op('asr', 2, 's32 s64 s16', Props.CMOD | Props.NEGATE0) +op('avg', 2, 's16 s32 u16 u32', Props.NEGATE | Props.CMOD) +op('bfe', 3, 'u32 s32', Props.NEGATE0) +op('bfi1', 2, 'u32') +op('bfi2', 3, 'u32') +op('bfn', 3, 'u32', Props.CMOD, ['uint8_t ctrl']) +op('bfrev', 1, 'u32', Props.NEGATE) +op('cbit', 1, 'u32', Props.NEGATE | Props.CMOD) +op('cmp', 2, 'u32', Props.NEGATE | Props.CMOD) + + +# With an 8/16-bit type, `index` specifies the element index of the source +# within the 32-bit word. For example, if src_type == U16 and index == 1, this +# converts the upper 16-bits of the input. +op('cvt', 1, 'u8 s8 u16 s16 u32 s32 u64 s64 f32 f64 f16 bf16', Props.NEGATE | Props.SAT, [ + 'enum jay_type src_type', + 'enum jay_rounding_mode rounding_mode', + 'uint8_t index', + 'uint8_t pad' +]) + +op('fbh', 1, 'u32 s32') +op('fbl', 1, 'u32') +op('lzd', 1, 'u32') +op('frc', 1, 'f32 f64', Props.NEGATE | Props.CMOD) +op('mad', 3, 'u32 s32 u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.CMOD | Props.COMMUTATIVE) +op('max', 2, 'u32 s32 u64 s64 u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.COMMUTATIVE) +op('min', 2, 'u32 s32 u64 s64 u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.COMMUTATIVE) +op('mov', 1, 'u1 u16 u32 u64', Props.NEGATE0 | Props.CMOD) +op('modifier', 1, 'f32 f64 f16 s16 s32 s64 u16 u32 u64 s8', + Props.NEGATE | Props.SAT | Props.CMOD) +op('mul', 2, 'u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.CMOD | Props.COMMUTATIVE) +op('mul_high', 2, 'u32 s32', Props.COMMUTATIVE) +op('mul_32x16', 2, 'u32 s32') +op('mul_32', 2, 'u32 s32', Props.COMMUTATIVE, ['bool high']) +op('sel', 3, 'u32 f32 u1 u16', Props.NEGATE) +op('csel', 3, 'u32 s32 f32', Props.NEGATE) +op('dp4a_uu', 3, 'u32', Props.SAT) +op('dp4a_ss', 3, 's32', Props.SAT) +op('dp4a_su', 3, 's32', Props.SAT) +op('rndd', 1, 'f16 f32 f64', Props.NEGATE | Props.SAT) +op('rndz', 1, 'f16 f32 f64', Props.NEGATE | Props.SAT) +op('rnde', 1, 'f16 f32 f64', Props.NEGATE | Props.SAT) +op('math', 1, 'f16 f32', Props.NEGATE | Props.SAT, ['enum jay_math op']) + +for n in ['rol', 'ror', 'shl', 'shr']: + op(n, 2, 'u32 u64 u16 s16 s32 s64', Props.CMOD | Props.NEGATE0) + +op('quad_swizzle', 1, 'u1 u32', 0, ['enum jay_quad_swizzle swizzle']) +op('sync', 0, None, Props.NO_DEST, ['enum tgl_sync_function op']) + +for n in ['brd', 'illegal', 'goto', 
'join', 'if', 'else',
+ 'endif', 'while', 'break', 'cont', 'call', 'calla', 'jmpi', 'ret',
+ 'loop_once']:
+ op(n, 0, None, Props.NO_DEST)
+
+op('send', 4, None, Props.SIDE_EFFECTS, [
+ 'enum brw_sfid sfid',
+ 'uint8_t sbid',
+ 'bool eot',
+ 'bool check_tdr',
+ 'bool uniform',
+ 'bool bindless',
+ 'enum jay_type type_0',
+ 'enum jay_type type_1',
+ 'uint8_t ex_mlen',
+ 'uint32_t ex_desc_imm',
+])
+
+op('reloc', 0, 'u32 u64', 0, ['unsigned param', 'unsigned base'])
+op('preload', 0, 'u32', 0, ['unsigned reg'])
+op('deswizzle_16', 0, 'u32', Props.NO_DEST, ['unsigned dst', 'unsigned src'])
+
+# Calculating the lane ID requires multiple power-of-two steps each involving
+# complex architectural features not modelled in the IR.
+op('lane_id_8', 0, 'u16')
+op('lane_id_expand', 1, 'u16', 0, ['unsigned width'])
+
+# Sample ID calculation
+op('extract_byte_per_8lanes', 2, 'u32')
+op('shr_odd_subspans_by_4', 1, 'u16')
+op('and_u32_u16', 2, 'u32')
+
+# Pixel coord calculations. expand_quad replicates out the per-2x2 values from
+# its source g0.[10...13] and - in the case of SIMD32 - g1.[10...13] into a
+# per-lane value. Then offset_packed_pixel_coords adds the appropriate packed
+# 2x16-bit offset within each quad, giving 2x16-bit per-lane coordinates.
+op('expand_quad', 2, 'u32')
+op('offset_packed_pixel_coords', 1, 'u32')
+op('extract_layer', 2, 'u32')
+
+# Generated by RA and lowered after. Valid only for GPR/UGPR.
+op('swap', 2, 'u32', Props.NO_DEST)
+
+# Phi function representations
+#
+# Unlike in NIR, we represent Phi functions as a pair of opcodes, purely
+# for convenience since it makes many things easier to work with.
+#
+# Phis logically exist along control flow edges between blocks. PHI_DST
+# lives where 𝜙 would traditionally be written, at the point where the new
+# value is defined. A PHI_DST will have a corresponding PHI_SRC in each of
+# its predecessor blocks, representing the value coming in along that edge.
+# This ensures that source modifiers, scalar to vector promotion, or other
+# source evaluation happens in the predecessor block.
+#
+# The PHI_SRC refers to the SSA index of the PHI_DST. For example, 'if (..) r3 =
+# r1 else r3 = r2 endif' might look like:
+#
+# (following block) | (then block) | (else block)
+# START B3 | START B1 | START B2
+# r3 = PHI_DST | PHI_SRC r3, r1 | PHI_SRC r3, r2
+op('phi_dst', 0, 'u1 u16 u32 u64')
+op('phi_src', 1, 'u1 u16 u32 u64', 0, ['uint32_t phi'])
+
+OPCODES: 'Mapping[str, Opcode]' = _opcodes
+
+HEADER_TEMPLATE = """/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#pragma once
+
+#include "util/macros.h"
+
+enum PACKED jay_opcode {
+% for opcode in opcodes:
+ JAY_OPCODE_${opcode.upper()},
+% endfor
+ JAY_NUM_OPCODES
+};
+static_assert(sizeof(enum jay_opcode) == 1);
+
+struct jay_opcode_info {
+ const char *name;
+ unsigned num_srcs;
+
+ /** Bitfield of sources which support negation/abs */
+ uint8_t src_mods;
+
+ /** Which modifiers are broadly supported by the opcode. Note there may be
+ * further restrictions (e.g. based on types) not encoded here.
+ */
+ bool sat;
+ bool cmod;
+
+ /** Whether the operation has side effects not expressed in the SSA IR */
+ bool side_effects;
+
+ /** op(a, b, c, ...) = op(b, a, c, ...)
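+ * For example, add and mul set this; lower_immediates relies on it (via
+ * try_swap_src01) to move an immediate into a source slot that can
+ * encode it.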
+    */
+   bool _2src_commutative;
+};
+
+extern const struct jay_opcode_info jay_opcode_infos[JAY_NUM_OPCODES];
+"""
+
+CODE_TEMPLATE = """/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "jay_opcodes.h"
+
+const struct jay_opcode_info jay_opcode_infos[JAY_NUM_OPCODES] = {
+% for opcode, op in opcodes.items():
+   [JAY_OPCODE_${opcode.upper()}] = {
+      .name = "${opcode}",
+      .num_srcs = ${op.num_srcs},
+      .src_mods = ${bin(op.negate)},
+% for mod in ["sat", "cmod", "side_effects", "_2src_commutative"]:
+% if getattr(op, mod):
+      .${mod} = true,
+% endif
+% endfor
+   },
+% endfor
+};
+"""
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--code', action='store', default=None)
+    parser.add_argument('--header', action='store', default=None)
+    args = parser.parse_args()
+
+    if not (args.header or args.code):
+        parser.error('At least one of --code or --header is required')
+
+    try:
+        if args.code is not None:
+            with open(args.code, 'w', encoding='utf-8') as f:
+                f.write(Template(CODE_TEMPLATE).render(opcodes=OPCODES))
+        if args.header is not None:
+            with open(args.header, 'w', encoding='utf-8') as f:
+                f.write(Template(HEADER_TEMPLATE).render(opcodes=OPCODES))
+    except Exception:
+        print(exceptions.text_error_template().render())
+        return 1
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/intel/compiler/jay/jay_opt_control_flow.c b/src/intel/compiler/jay/jay_opt_control_flow.c
new file mode 100644
index 00000000000..1f337f37296
--- /dev/null
+++ b/src/intel/compiler/jay/jay_opt_control_flow.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * Copyright 2023 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/list.h"
+#include "jay_builder.h"
+#include "jay_ir.h"
+#include "jay_opcodes.h"
+#include "jay_private.h"
+
+/*
+ * Detect the block "else; endif" and remove the no-op else, effectively
+ * removing empty else blocks. Logically, that introduces critical edges, so
+ * this pass must run late (post-RA).
+ */
+static void
+opt_empty_else(jay_block *blk)
+{
+   unsigned i = 0;
+   enum jay_opcode ops[] = { JAY_OPCODE_ELSE, JAY_OPCODE_ENDIF };
+
+   jay_foreach_inst_in_block(blk, I) {
+      if (i >= ARRAY_SIZE(ops) || ops[i++] != I->op)
+         return;
+   }
+
+   if (i == ARRAY_SIZE(ops)) {
+      jay_remove_instruction(jay_first_inst(blk));
+   }
+}
+
+/*
+ * Replace short if-statements with predication. Assumes opt_empty_else already
+ * ran. TODO: Generalize.
+ */
+static void
+opt_predicate(jay_function *f, jay_block *block)
+{
+   jay_inst *if_ = jay_last_inst(block);
+   if (!if_ || if_->op != JAY_OPCODE_IF)
+      return;
+
+   /* The if falls through to the then block */
+   jay_block *then_block = jay_next_block(block);
+   assert(block->successors[0] == then_block && "successors for if");
+
+   /* We're looking for a then consisting of a single block, so the next
+    * block is the else */
+   jay_block *else_block = jay_next_block(then_block);
+   if (block->successors[1] != else_block ||
+       list_length(&then_block->instructions) > 3 ||
+       !list_is_singular(&else_block->instructions))
+      return;
+
+   /* We can only access one flag per instruction, so do not predicate anything
+    * accessing flags. This also ensures the if-condition flag is kept live.
+    *
+    * MIN/MAX turn into SEL which cannot be predicated despite not using flags.
+    *
+    * Predicating NoMask instructions doesn't work if we are electing a nonzero
+    * lane but the NoMask forces lane 0. This should be optimized later.
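+    *
+    * As a hypothetical sketch (not actual compiler output), the rewrite
+    * turns
+    *
+    *    (f1) if
+    *    r4 = add.u32 r5, r6
+    *    endif
+    *
+    * into
+    *
+    *    r4 = (f1) add.u32 r5, r6
+    *
+    * copying the if's predicate onto each then-block instruction and
+    * deleting the if/endif jumps.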
+ */ + jay_foreach_inst_in_block(then_block, I) { + if (jay_uses_flag(I) || + I->op == JAY_OPCODE_MIN || + I->op == JAY_OPCODE_MAX || + I->op == JAY_OPCODE_CSEL || + jay_is_no_mask(I)) + return; + } + + jay_inst *endif = jay_last_inst(else_block); + if (endif->op != JAY_OPCODE_ENDIF) + return; + + /* Rewrite with predication */ + jay_builder b = jay_init_builder(f, jay_after_block(block)); + assert(if_->predication == JAY_PREDICATED && "if's are always predicated"); + + jay_foreach_inst_in_block_safe(then_block, I) { + jay_add_predicate(&b, I, *jay_inst_get_predicate(if_)); + } + + /* Remove the jumps */ + jay_remove_instruction(if_); + jay_remove_instruction(endif); +} + +/* + * Optimize "(f0) break; while" to "(!f0) while". As break/while appear in + * different blocks, we optimize the entire function at a time. + */ +static void +opt_predicate_while(jay_function *func) +{ + jay_inst *prev_break = NULL; + + jay_foreach_block(func, block) { + if (list_is_empty(&block->instructions)) { + /* Ignore empty blocks */ + } else if (jay_last_inst(block)->op == JAY_OPCODE_BREAK) { + prev_break = jay_last_inst(block); + } else if (jay_first_inst(block)->op == JAY_OPCODE_WHILE && + prev_break && + prev_break->predication) { + assert(!jay_first_inst(block)->predication); + jay_inst_get_predicate(prev_break)->negate ^= true; + + jay_remove_instruction(jay_first_inst(block)); + jay_remove_instruction(prev_break); + + jay_builder b = jay_init_builder(func, jay_before_block(block)); + jay_builder_insert(&b, prev_break); + + prev_break->op = JAY_OPCODE_WHILE; + prev_break = NULL; + } else { + prev_break = NULL; + } + } +} + +void +jay_opt_control_flow(jay_shader *s) +{ + jay_foreach_function(s, f) { + /* Iterating blocks in reverse lets both opts converge in 1 pass */ + jay_foreach_block_rev(f, block) { + opt_empty_else(block); + opt_predicate(f, block); + } + + /* Do last: opt_predicate_while depends on both previous optimizations */ + opt_predicate_while(f); + } +} diff --git a/src/intel/compiler/jay/jay_opt_dead_code.c b/src/intel/compiler/jay/jay_opt_dead_code.c new file mode 100644 index 00000000000..da9d7299d57 --- /dev/null +++ b/src/intel/compiler/jay/jay_opt_dead_code.c @@ -0,0 +1,58 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/bitset.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +static void +pass(jay_function *f) +{ + BITSET_WORD *live_set = BITSET_CALLOC(f->ssa_alloc); + + jay_foreach_inst_in_func_safe_rev(f, block, I) { + /* TODO: Allow for atomics? */ + if (!BITSET_TEST_COUNT(live_set, jay_base_index(I->dst), + jay_num_values(I->dst)) && + I->op != JAY_OPCODE_SEND) { + I->dst = jay_null(); + } + + if (!jay_is_null(I->cond_flag) && + !BITSET_TEST(live_set, jay_index(I->cond_flag)) && + (I->op != JAY_OPCODE_CMP || jay_is_null(I->dst))) { + + I->cond_flag = jay_null(); + I->conditional_mod = 0; + } + + bool no_dest = jay_is_null(I->dst) && jay_is_null(I->cond_flag); + bool side_effects = jay_opcode_infos[I->op].side_effects; + + if (no_dest && !side_effects) { + jay_remove_instruction(I); + } else { + jay_foreach_src_index(I, s, _, index) { + BITSET_SET(live_set, index); + } + } + } + + /* Eliminate phis. This step may leave dead code but it's good enough in + * practice since NIR already eliminated dead phis. 
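+    * A PHI_SRC survives only if the PHI_DST it feeds (looked up via
+    * jay_phi_src_index) was itself marked live by the walk above.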
+ */ + jay_foreach_block(f, block) { + jay_foreach_phi_src_in_block(block, I) { + if (!BITSET_TEST(live_set, jay_phi_src_index(I))) { + jay_remove_instruction(I); + } + } + } + + free(live_set); +} + +JAY_DEFINE_FUNCTION_PASS(jay_opt_dead_code, pass) diff --git a/src/intel/compiler/jay/jay_opt_propagate.c b/src/intel/compiler/jay/jay_opt_propagate.c new file mode 100644 index 00000000000..25a58253d93 --- /dev/null +++ b/src/intel/compiler/jay/jay_opt_propagate.c @@ -0,0 +1,282 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/lut.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +static enum jay_type +canonicalize_for_bit_compare(enum jay_type type) +{ + enum jay_type base = jay_base_type(type); + return (base == JAY_TYPE_S) ? jay_type_rebase(type, JAY_TYPE_U) : type; +} + +static bool +propagate_cmod(jay_function *func, jay_inst *I, jay_inst **defs) +{ + enum jay_type cmp_type = I->type; + enum jay_conditional_mod cmod = I->conditional_mod; + jay_inst *def = NULL; + + /* TODO: Generalize cmod propagation */ + if (jay_type_size_bits(cmp_type) != 32) + return false; + + /* Pattern match `cmp ssa, 0` or `cmp 0, ssa`. */ + jay_foreach_ssa_src(I, s) { + if (jay_is_zero(I->src[1 - s])) { + def = defs[jay_base_index(I->src[s])]; + + /* Canonicalize the cmod to have the zero second */ + cmod = s == 1 ? jay_conditional_mod_swap_sources(cmod) : cmod; + break; + } + } + + /* Check if we can fold into the def */ + if (!def || !jay_is_null(def->cond_flag) || !jay_opcode_infos[def->op].cmod) + return false; + + /* "Neither Saturate nor conditional modifier allowed with DW integer + * multiply." + * + * Could be refined. + */ + if (def->op == JAY_OPCODE_MUL && !jay_type_is_any_float(def->type)) + return false; + + enum jay_type instr_type = def->type; + + if (cmod == JAY_CONDITIONAL_NE || cmod == JAY_CONDITIONAL_EQ) { + cmp_type = canonicalize_for_bit_compare(cmp_type); + instr_type = canonicalize_for_bit_compare(instr_type); + } + + if (instr_type != cmp_type) + return false; + + jay_builder b = jay_init_builder(func, jay_before_inst(I)); + jay_set_conditional_mod(&b, def, I->cond_flag, cmod); + return true; +} + +static jay_def +jay_compose_src(jay_def to, jay_def from) +{ + if (to.abs) { + from.negate = false; + from.abs = true; + } + + from.negate ^= to.negate; + return from; +} + +static bool +uses_modifiers(const jay_inst *I) +{ + jay_foreach_src(I, s) { + if (I->src[s].abs || I->src[s].negate) + return true; + } + + return I->saturate; +} + +static void +propagate_modifier(jay_inst *I, unsigned s, jay_inst *mod) +{ + /* Check if we can propagate abs/neg here in general */ + if (!jay_has_src_mods(I, s) || mod->saturate) + return; + + /* Try to make the types compatible. */ + if (jay_src_type(I, s) != mod->type) { + if (I->op == JAY_OPCODE_SEL && !uses_modifiers(I)) { + I->type = mod->type; + } else { + return; + } + } + + jay_replace_src(&I->src[s], mod->src[0]); + I->src[s] = jay_compose_src(I->src[s], mod->src[0]); +} + +static void +propagate_not(jay_inst *I, unsigned s, jay_inst *mod) +{ + /* Handle inot specially for predicates, and logic operations per bspec text: + * + * When used with logic instructions (and, not, or, xor), [the + * negate] field indicates whether the source bits are + * inverted... regardless of the source type. 
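+    *
+    * Hypothetical sketch: 'r2 = not.u32 r1' feeding 'r3 = and.u32 r2, r4'
+    * becomes 'r3 = and.u32 -r1, r4', where the negate on a logic op means
+    * bitwise inversion per the quote above.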
+    */
+   if ((s == I->num_srcs - I->predication) ||
+       I->op == JAY_OPCODE_AND ||
+       I->op == JAY_OPCODE_OR ||
+       I->op == JAY_OPCODE_XOR) {
+      jay_replace_src(&I->src[s], mod->src[0]);
+      I->src[s].negate ^= true;
+   } else if (I->op == JAY_OPCODE_BFN) {
+      jay_replace_src(&I->src[s], mod->src[0]);
+      jay_set_bfn_ctrl(I, util_lut3_invert_source(jay_bfn_ctrl(I), s));
+   }
+}
+
+static void
+propagate_forwards(jay_function *f)
+{
+   jay_inst **defs = calloc(f->ssa_alloc, sizeof(defs[0]));
+
+   jay_foreach_inst_in_func_safe(f, block, I) {
+      jay_builder b = jay_init_builder(f, jay_before_inst(I));
+
+      jay_foreach_dst_index(I, _, d) {
+         defs[d] = I;
+      }
+
+      /* Copy propagate individual components into vectors */
+      jay_foreach_src_index(I, s, c, idx) {
+         jay_inst *def = defs[idx];
+         assert(def != NULL && "SSA");
+
+         if (def->op == JAY_OPCODE_MOV &&
+             !def->predication &&
+             jay_num_values(def->dst) == 1 &&
+             jay_num_values(def->src[0]) == 1 &&
+             I->src[s].file == def->src[0].file) {
+
+            jay_insert_channel(&b, &I->src[s], c, def->src[0]);
+         }
+      }
+
+      /* Don't propagate into phis yet - TODO: File awareness */
+      if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND)
+         continue;
+
+      jay_foreach_ssa_src(I, s) {
+         /* Copy propagate whole vectors */
+         jay_def src = I->src[s];
+         if (src.collect)
+            continue;
+
+         jay_inst *def = defs[jay_base_index(src)];
+         assert(def != NULL && "SSA");
+
+         if (!jay_defs_equivalent(def->dst, src) || def->predication)
+            continue;
+
+         if (def->op == JAY_OPCODE_MOV) {
+            /* Default values must have the same file as their dest, so do
+             * not propagate anything invalid there. Also don't propagate
+             * inverse-ballots. Also only source 0 can read ARF (i.e.
+             * ballotted flags).
+             */
+            if ((I->src[s].file == def->src[0].file) ||
+                ((!jay_inst_has_default(I) ||
+                  &I->src[s] != jay_inst_get_default(I)) &&
+                 !(I->src[s].file == UFLAG && !jay_is_imm(def->src[0])) &&
+                 !(I->src[s].file == FLAG) &&
+                 (s == 0 || !jay_is_flag(def->src[0])) &&
+                 !(jay_is_imm(def->src[0]) && I->src[s].negate))) {
+
+               jay_replace_src(&I->src[s], def->src[0]);
+            }
+         } else if (def->op == JAY_OPCODE_MODIFIER && !jay_uses_flag(def)) {
+            propagate_modifier(I, s, def);
+         } else if (def->op == JAY_OPCODE_NOT && !jay_uses_flag(def)) {
+            propagate_not(I, s, def);
+         }
+      }
+
+      if (I->op == JAY_OPCODE_CMP && propagate_cmod(f, I, defs)) {
+         /* Even if we propagate the predicate write, there might be uses of
+          * the register value (TODO: Maybe check for this and skip
+          * propagating in that case?). So we cannot remove the compare, just
+          * strip the cond flag. Furthermore, the CMP always clobbers some
+          * predicate, so give it an immediately-dead one instead.
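+          *
+          * Hypothetical sketch: given 'r2 = add.u32 r0, r1' and
+          * 'r3, f0 = cmp.u32.ne r2, 0x0', the f0 write folds in as
+          * 'r2, f0 = add.u32.ne r0, r1' while the cmp lives on, now writing
+          * a fresh dead flag, to produce its r3 result.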
+ */ + I->cond_flag = jay_alloc_def(&b, I->cond_flag.file, 1); + continue; + } + } + + free(defs); +} + +static bool +propagate_fsat(jay_inst *I, jay_inst *fsat) +{ + if (fsat->op != JAY_OPCODE_MODIFIER || + fsat->predication || + fsat->src[0].negate || + fsat->src[0].abs || + (fsat->conditional_mod && !jay_opcode_infos[I->op].cmod) || + I->conditional_mod || + I->type != fsat->type || + !jay_type_is_any_float(fsat->type)) + return false; + + /* saturate(saturate(x)) = saturate(x) */ + I->saturate |= fsat->saturate; + I->dst = fsat->dst; + I->cond_flag = fsat->cond_flag; + I->conditional_mod = fsat->conditional_mod; + return true; +} + +static void +propagate_backwards(jay_function *f) +{ + jay_inst **uses = calloc(f->ssa_alloc, sizeof(uses[0])); + BITSET_WORD *multiple = BITSET_CALLOC(f->ssa_alloc); + + jay_foreach_inst_in_func_rev(f, block, I) { + /* Record uses */ + jay_foreach_src_index(I, s, c, ssa_index) { + if (uses[ssa_index]) + BITSET_SET(multiple, ssa_index); + else + uses[ssa_index] = I; + } + + /* TODO: f64 sat propagation */ + if (jay_num_values(I->dst) != 1) + continue; + + assert(jay_is_ssa(I->dst)); + + jay_inst *use = uses[jay_base_index(I->dst)]; + if (!use || BITSET_TEST(multiple, jay_base_index(I->dst))) + continue; + + if (jay_opcode_infos[I->op].sat && + jay_type_is_any_float(I->type) && + propagate_fsat(I, use)) { + + jay_remove_instruction(use); + continue; + } + + /* Fold UGPR->{GPR, FLAG} copies coming out of NIR */ + if (I->type == use->type && + I->op != JAY_OPCODE_PHI_DST && + use->op == JAY_OPCODE_MOV) { + + I->dst = use->dst; + jay_remove_instruction(use); + continue; + } + } + + free(multiple); + free(uses); +} + +JAY_DEFINE_FUNCTION_PASS(jay_opt_propagate_forwards, propagate_forwards) +JAY_DEFINE_FUNCTION_PASS(jay_opt_propagate_backwards, propagate_backwards) diff --git a/src/intel/compiler/jay/jay_print.c b/src/intel/compiler/jay/jay_print.c new file mode 100644 index 00000000000..3b8c3781d20 --- /dev/null +++ b/src/intel/compiler/jay/jay_print.c @@ -0,0 +1,309 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/brw/brw_eu_defines.h" +#include "util/lut.h" +#include "util/macros.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +#define ENUM_TO_STR(x, arr) \ + ({ \ + assert(x < ARRAY_SIZE(arr)); \ + arr[x]; \ + }) + +static const char *jay_conditional_mod_str[] = { + [JAY_CONDITIONAL_EQ] = ".eq", [JAY_CONDITIONAL_NE] = ".ne", + [JAY_CONDITIONAL_GT] = ".gt", [JAY_CONDITIONAL_LT] = ".lt", + [JAY_CONDITIONAL_GE] = ".ge", [JAY_CONDITIONAL_LE] = ".le", + [JAY_CONDITIONAL_OV] = ".ov", [JAY_CONDITIONAL_NAN] = ".nan", +}; + +static const char *jay_arf_str[] = { + [JAY_ARF_NULL] = "_", + [JAY_ARF_MASK] = "mask", + [JAY_ARF_CONTROL] = "ctrl", + [JAY_ARF_TIMESTAMP] = "timestamp", +}; + +static const char *jay_file_str[JAY_FILE_LAST + 1] = { + [GPR] = "r", [UGPR] = "u", [FLAG] = "f", [UFLAG] = "uf", + [J_ADDRESS] = "a", [ACCUM] = "acc", [UACCUM] = "uacc", [J_ARF] = "arf", + [MEM] = "m", [UMEM] = "um", [TEST_FILE] = "t", +}; + +static const char *jay_base_types[] = { + [JAY_TYPE_U] = "u", [JAY_TYPE_S] = "s", [JAY_TYPE_F] = "f", [JAY_TYPE_BF] = "bf" +}; + +void +jay_print_type(FILE *fp, enum jay_type t) +{ + fprintf(fp, ".%s%u", ENUM_TO_STR(jay_base_type(t), jay_base_types), + jay_type_size_bits(t)); +} + +static void +jay_print_def(FILE *fp, const jay_inst *I, int src) +{ + jay_def def = src == -2 ? I->cond_flag : src == -1 ? 
I->dst : I->src[src]; + unsigned len = jay_num_values(def); + const char *file = ENUM_TO_STR(def.file, jay_file_str); + bool has_lu = jay_is_ssa(def) && !jay_is_null(def) && src >= 0; + unsigned lu_bit = has_lu ? jay_source_last_use_bit(I->src, src) : 0; + + bool has_index = jay_channel(def, 0) != JAY_SENTINEL; + bool has_reg = !def.collect && def.reg && def.file != J_ARF; + + if (jay_is_null(def)) { + has_reg = false; + fprintf(fp, "_"); + } else if (def.file == J_ARF) { + fputs(ENUM_TO_STR(jay_base_index(def), jay_arf_str), fp); + } else if (def.collect) { + assert(has_index && "else would be contiguous"); + fprintf(fp, "("); + for (unsigned i = 0; i < len; ++i) { + if (i) + fprintf(fp, ", "); + + if (jay_channel(def, i)) { + if (has_lu && BITSET_TEST(I->last_use, lu_bit)) + fprintf(fp, "*"); + + fprintf(fp, "%s%u", file, jay_channel(def, i)); + ++lu_bit; + } else { + fprintf(fp, "_"); + } + } + fprintf(fp, ")"); + } else if (has_index) { + fprintf(fp, "%s%s%u", + has_lu && BITSET_TEST(I->last_use, lu_bit) ? "*" : "", file, + jay_channel(def, 0)); + if (len > 1) { + fprintf(fp, ":%s%u", file, jay_channel(def, len - 1)); + } + } + + if (has_reg) { + if (has_index) + fprintf(fp, "("); + + fprintf(fp, "%s%u%s", file, def.reg, def.hi ? "h" : ""); + if (len > 1) { + fprintf(fp, ":%s%u", file, def.reg + len - 1); + } + + if (has_index) + fprintf(fp, ")"); + } +} + +static void +jay_print_src(FILE *fp, jay_inst *I, unsigned s) +{ + jay_def src = I->src[s]; + fprintf(fp, "%s%s", src.negate ? "-" : "", src.abs ? "(abs)" : ""); + + if (jay_is_imm(src)) { + fprintf(fp, "0x%X", jay_as_uint(src)); + if (util_is_probably_float(jay_as_uint(src))) { + float f = uif(jay_as_uint(src)); + fprintf(fp, fabs(f) >= 1000000.0 ? " (%e)" : " (%f)", f); + } + } else { + jay_print_def(fp, I, s); + } +} + +/* XXX: copypaste of brw_print_swsb */ +static void +jay_print_swsb(FILE *f, const struct tgl_swsb swsb) +{ + if (swsb.regdist) { + fprintf(f, "%s@%d", + (swsb.pipe == TGL_PIPE_FLOAT ? "F" : + swsb.pipe == TGL_PIPE_INT ? "I" : + swsb.pipe == TGL_PIPE_LONG ? "L" : + swsb.pipe == TGL_PIPE_ALL ? "A" : + swsb.pipe == TGL_PIPE_MATH ? "M" : + swsb.pipe == TGL_PIPE_SCALAR ? "S" : + ""), + swsb.regdist); + } + + if (swsb.mode) { + if (swsb.regdist) + fprintf(f, " "); + + fprintf(f, "$%d%s", swsb.sbid, + (swsb.mode & TGL_SBID_SET ? "" : + swsb.mode & TGL_SBID_DST ? ".dst" : + ".src")); + } +} + +void +jay_print_inst(FILE *fp, jay_inst *I) +{ + const char *sep = ""; + + if (!jay_is_null(I->dst)) { + jay_print_def(fp, I, -1); + sep = ", "; + } + + if (!jay_is_null(I->cond_flag)) { + fprintf(fp, "%s", sep); + jay_print_def(fp, I, -2); + } + + if (!jay_is_null(I->dst) || !jay_is_null(I->cond_flag)) { + fprintf(fp, " = "); + } + + if (I->predication) { + fprintf(fp, "("); + jay_print_src(fp, I, jay_inst_get_predicate(I) - I->src); + + if (jay_inst_has_default(I)) { + fprintf(fp, "/"); + jay_print_src(fp, I, jay_inst_get_default(I) - I->src); + } + + fprintf(fp, ")"); + } + + if (I->op == JAY_OPCODE_MATH) { + jay_print_inst_info(fp, I, ""); + } else { + fprintf(fp, "%s", jay_opcode_infos[I->op].name); + } + + if (I->type != JAY_TYPE_UNTYPED) { + jay_print_type(fp, I->type); + } + + if (I->op == JAY_OPCODE_BFN) { + fprintf(fp, ".(%s)", util_lut3_to_str[jay_bfn_ctrl(I)]); + } + + const char *cmod = ENUM_TO_STR(I->conditional_mod, jay_conditional_mod_str); + fprintf(fp, "%s%s ", I->saturate ? ".sat" : "", cmod ? 
cmod : ""); + sep = ""; + + for (unsigned i = 0; i < I->num_srcs - I->predication; i++) { + fprintf(fp, "%s", sep); + jay_print_src(fp, I, i); + + enum jay_type T = jay_src_type(I, i); + if (T != I->type && !(T == JAY_TYPE_U1 && jay_is_flag(I->src[i]))) { + jay_print_type(fp, T); + } + + sep = ", "; + } + + if (I->op != JAY_OPCODE_MATH) { + sep = jay_print_inst_info(fp, I, sep); + } + + /* Software scoreboard dependency info */ + if (I->dep.regdist || I->dep.mode) { + fprintf(fp, "%s%s%s", strlen(sep) ? " {" : "{", + I->replicate_dep ? "*" : "", I->decrement_dep ? "+" : ""); + jay_print_swsb(fp, I->dep); + fprintf(fp, "}"); + } + + fprintf(fp, "\n"); +} + +static inline void +indent(FILE *fp, jay_block *block, bool interior) +{ + for (unsigned i = 0; i < block->indent + interior; i++) + fprintf(fp, " "); +} + +static void +comma_separate(FILE *fp, jay_block *block, bool *first) +{ + if (*first) { + indent(fp, block, true); + *first = false; + } else { + fprintf(fp, ", "); + } +} + +void +jay_print_block(FILE *fp, jay_block *block) +{ + indent(fp, block, false); + fprintf(fp, "B%d%s%s", block->index, block->uniform ? " [uniform]" : "", + block->loop_header ? " [loop header]" : ""); + bool first = true; + jay_foreach_predecessor(block, p) { + fprintf(fp, "%s B%d", first ? " <-" : "", (*p)->index); + first = false; + } + fprintf(fp, " {\n"); + + /* We group phi destinations/sources for legibility */ + first = true; + jay_foreach_phi_dst_in_block(block, phi) { + comma_separate(fp, block, &first); + jay_print_def(fp, phi, -1); + } + fprintf(fp, "%s", first ? "" : " = 𝜙\n"); + + jay_foreach_inst_in_block(block, inst) { + if (inst->op != JAY_OPCODE_PHI_DST && inst->op != JAY_OPCODE_PHI_SRC) { + indent(fp, block, true); + jay_print_inst(fp, inst); + } + } + + first = true; + jay_foreach_phi_src_in_block(block, phi) { + comma_separate(fp, block, &first); + fprintf(fp, "𝜙%u = ", jay_phi_src_index(phi)); + jay_print_def(fp, phi, 0); + } + fprintf(fp, "%s", first ? "" : "\n"); + + indent(fp, block, false); + fprintf(fp, "}"); + first = true; + jay_foreach_successor(block, succ) { + if (succ) { + fprintf(fp, "%s B%d", first ? 
" ->" : "", succ->index); + first = false; + } + } + fprintf(fp, "\n\n"); +} + +void +jay_print_func(FILE *fp, jay_function *f) +{ + fprintf(fp, "Jay function: \n\n"); + jay_foreach_block(f, block) { + jay_print_block(fp, block); + } +} + +void +jay_print(FILE *fp, jay_shader *s) +{ + jay_foreach_function(s, f) { + jay_print_func(fp, f); + } +} diff --git a/src/intel/compiler/jay/jay_private.h b/src/intel/compiler/jay/jay_private.h new file mode 100644 index 00000000000..2799eaa7b7b --- /dev/null +++ b/src/intel/compiler/jay/jay_private.h @@ -0,0 +1,72 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "jay_ir.h" +#include "nir.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define JAY_DBG_NOOPT BITFIELD_BIT(0) +#define JAY_DBG_PRINTDEMAND BITFIELD_BIT(1) +#define JAY_DBG_SPILL BITFIELD_BIT(2) +#define JAY_DBG_SYNC BITFIELD_BIT(3) +extern int jay_debug; + +bool jay_nir_lower_bool(nir_shader *nir); +bool jay_nir_opt_sel_zero(nir_shader *nir); +bool jay_nir_lower_fsign(nir_shader *nir); + +void jay_compute_liveness(jay_function *f); +void jay_calculate_register_demands(jay_function *f); + +void jay_spill(jay_function *func, enum jay_file file, unsigned limit); +void jay_partition_grf(jay_shader *shader); +void jay_register_allocate(jay_shader *s); +void jay_assign_flags(jay_shader *s); +void jay_repair_ssa(jay_function *func); + +const char *jay_file_to_string(enum jay_file file); +void jay_print_type(FILE *f, enum jay_type t); +void jay_print_inst(FILE *f, jay_inst *I); +void jay_print_block(FILE *f, jay_block *block); +void jay_print_func(FILE *fp, jay_function *func); +void jay_print(FILE *f, jay_shader *s); + +#ifndef NDEBUG +void jay_validate(jay_shader *s, const char *when); +void jay_validate_ra(jay_function *func); +#else +static inline void +jay_validate(jay_shader *s, const char *when) +{ +} + +static inline void +jay_validate_ra(jay_function *func) +{ +} +#endif + +void jay_opt_propagate_forwards(jay_shader *s); +void jay_opt_propagate_backwards(jay_shader *s); +void jay_opt_dead_code(jay_shader *s); +void jay_opt_control_flow(jay_shader *s); + +void jay_lower_pre_ra(jay_shader *s); +void jay_lower_post_ra(jay_shader *s); +void jay_lower_spill(jay_function *func); +void jay_lower_simd_width(jay_shader *s); +void jay_lower_scoreboard(jay_shader *s); + +struct jay_shader_bin * +jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size); + +#ifdef __cplusplus +} /* extern C */ +#endif diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c new file mode 100644 index 00000000000..65cbf05c080 --- /dev/null +++ b/src/intel/compiler/jay/jay_register_allocate.c @@ -0,0 +1,1659 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include +#include "util/bitscan.h" +#include "util/bitset.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/sparse_bitset.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" +#include "shader_enums.h" + +/** + * Register allocation for Jay shaders. + * + * We use a decoupled register allocation approach. First, we spill values + * until the register demand fits within the size of each register file. + * + * Secondly, we assign registers using a tree-scan algorithm similar to the + * one described in Colombet et al 2011: + * + * Q. Colombet, B. Boissinot, P. Brisk, S. Hack and F. 
Rastello,
+ *   "Graph-coloring and treescan register allocation using repairing,"
+ *   2011 Proceedings of the 14th International Conference on Compilers,
+ *   Architectures and Synthesis for Embedded Systems (CASES), Taipei,
+ *   Taiwan, 2011, pp. 45-54, doi: 10.1145/2038698.2038708.
+ *
+ * We also use a union-find set to construct equivalence classes for phi webs,
+ * and attempt to use the same registers for all values in a class, similar to
+ * the "Aggressive Pre-Coalescing" step described in that paper.
+ *
+ * Finally, we deconstruct SSA.
+ */
+
+static inline bool
+is_ra_src(jay_def d)
+{
+   return d.file < JAY_NUM_RA_FILES;
+}
+
+#define jay_foreach_ra_file(file) \
+   for (enum jay_file file = 0; file < JAY_NUM_RA_FILES; ++file)
+
+#define jay_foreach_ra_src(I, s) \
+   jay_foreach_src(I, s) \
+      if (is_ra_src(I->src[s]) && !jay_is_null(I->src[s]))
+
+static enum jay_stride
+jay_min_stride_for_type(enum jay_type T)
+{
+   unsigned bits = jay_type_size_bits(T);
+
+   /* We need at least enough contiguous bits per-lane to store a scalar */
+   if (bits == 64)
+      return JAY_STRIDE_8;
+   else if (bits == 32)
+      return JAY_STRIDE_4;
+   else
+      return JAY_STRIDE_2;
+}
+
+static enum jay_stride
+jay_max_stride_for_type(enum jay_type T)
+{
+   /* Horizontal stride can be at most 4 */
+   return (jay_type_size_bits(T) >= 16) ? JAY_STRIDE_8 : JAY_STRIDE_4;
+}
+
+static bool
+jay_restrict_mixed_strides(jay_inst *I, unsigned s)
+{
+   /* From the hardware spec section "Register Region Restrictions":
+    *
+    * "In case of all floating point data types used in destination:" and
+    *
+    * "In case where source or destination datatype is 64b or operation is
+    *  integer DWord multiply:" and
+    *
+    * "Src2 Restrictions"
+    *
+    *    Register Regioning patterns where register data bit location
+    *    of the LSB of the channels are changed between source and
+    *    destination are not supported on Src0 and Src1 except for
+    *    broadcast of a scalar.
+    *
+    * Therefore, ban mixed strides in these cases.
+    *
+    * Similarly, SENDs cannot do any regioning, so restrict that too.
+    */
+   return jay_type_is_any_float(I->type) ||
+          jay_type_size_bits(I->type) == 64 ||
+          jay_is_send_like(I) ||
+          I->op == JAY_OPCODE_MUL_32X16 ||
+          I->op == JAY_OPCODE_MUL_32 ||
+          s == 2;
+}
+
+static enum jay_stride
+jay_dst_stride_minmax(jay_inst *I, bool do_max)
+{
+   enum jay_stride min = jay_min_stride_for_type(I->type);
+   enum jay_stride max = jay_max_stride_for_type(I->type);
+
+   /* Destination stride must be equal to the ratio of the sizes of the
+    * execution data type to the destination type
+    */
+   if (I->op == JAY_OPCODE_CVT) {
+      min = MAX2(min, jay_min_stride_for_type(jay_src_type(I, 0)));
+   }
+
+   /* V/UV types are restricted */
+   if (I->op == JAY_OPCODE_SHR_ODD_SUBSPANS_BY_4) {
+      return JAY_STRIDE_2;
+   }
+
+   /* The src2 restriction quoted above effectively implies we should not
+    * stride destinations of 3-source instructions either.
+    */
+   if (jay_num_isa_srcs(I) >= 3) {
+      return min;
+   }
+
+   return (do_max && !jay_restrict_mixed_strides(I, 0)) ? max : min;
+}
+
+static enum jay_stride
+jay_src_stride_minmax(jay_inst *I, unsigned s, bool do_max)
+{
+   enum jay_stride min = jay_min_stride_for_type(jay_src_type(I, s));
+   enum jay_stride max = jay_max_stride_for_type(jay_src_type(I, s));
+
+   /* SENDs cannot do any regioning, so force exactly the types of the sources
+    * regardless of the type of the destination.
+    *
+    * Shuffles could theoretically support regioning, but it would be
+    * nontrivial and probably pointless most of the time.
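+    * Returning the minimum stride pins such sources to their natural packed
+    * layout.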
+ */ + if (jay_is_send_like(I) || jay_is_shuffle_like(I)) { + return min; + } + + /* While "add.u16 r0<2>, r1<4>" is legal, "add.u16 r0, r1<4>" is not. + * Conservatively assume the destination is packed and restrict the source + * stride accordingly. This satisfies the special restrictions. + */ + if (jay_type_size_bits(I->type) <= 16) { + max = JAY_STRIDE_4; + } + + /* "add.u16 r0.8, g1<2>" is not legal. We don't generate this normally yet + * (preferring to burn the upper bits) but it is used internally. + */ + if (I->op == JAY_OPCODE_LANE_ID_EXPAND) { + max = JAY_STRIDE_2; + } + + if (jay_restrict_mixed_strides(I, s) && + jay_type_size_bits(jay_src_type(I, s)) < jay_type_size_bits(I->type)) { + + return jay_dst_stride_minmax(I, do_max); + } + + return (do_max && !jay_restrict_mixed_strides(I, s)) ? max : min; +} + +struct affinity { + /** + * If there is a vector affinity defined for this SSA def, it is relative to + * some representative SSA index. Else 0 if there is no affinity. + */ + uint32_t repr; + + /** If the representative: offset in registers from the base. + * + * If not the representative: offset in registers from the representative. */ + signed offset:4; + + /** + * If true, this value is used in an end-of-thread SEND and requires high + * registers. + */ + bool eot:1; + + /** If true, this UGPR needs full GRF alignment */ + bool grf_align :1; + unsigned align_offs:4; + unsigned padding :22; +}; +static_assert(sizeof(struct affinity) == 8, "packed"); + +struct phi_web_node { + /* Parent index, or circular for root */ + uint32_t parent; + + /* If root, assigned register, or ~0 if no register assigned. */ + uint16_t reg; + + /* Rank, at most log2(n) so need ~5-bits */ + uint16_t rank; + + /* If root, affinity for the whole web */ + struct affinity affinity; +}; +static_assert(sizeof(struct phi_web_node) == 16, "packed"); + +static unsigned +phi_web_find(struct phi_web_node *web, unsigned x) +{ + if (web[x].parent == x) { + /* Root */ + return x; + } else { + /* Search up the tree */ + unsigned root = x; + while (web[root].parent != root) + root = web[root].parent; + + /* Compress path. Second pass ensures O(1) memory usage. */ + while (web[x].parent != x) { + unsigned temp = web[x].parent; + web[x].parent = root; + x = temp; + } + + return root; + } +} + +static void +phi_web_union(struct phi_web_node *web, unsigned x, unsigned y) +{ + x = phi_web_find(web, x); + y = phi_web_find(web, y); + + if (x == y) + return; + + /* Union-by-rank: ensure x.rank >= y.rank */ + if (web[x].rank < web[y].rank) { + SWAP(x, y); + } + + web[y].parent = x; + + /* Increment rank if necessary */ + if (web[x].rank == web[y].rank) { + web[x].rank++; + } +} + +#define NO_REG 0xFFFF + +static inline jay_reg +make_reg(enum jay_file file, uint16_t reg) +{ + return (((uint16_t) file) << 13) | reg; +} + +static inline unsigned +r_reg(jay_reg r) +{ + assert(r != NO_REG); + return r & BITFIELD_MASK(13); +} + +static inline enum jay_file +r_file(jay_reg r) +{ + assert(r != NO_REG); + assert((r >> 13) < JAY_NUM_RA_FILES); + return r >> 13; +} + +static jay_def +def_from_reg(jay_reg r) +{ + return jay_bare_reg(r_file(r), r_reg(r)); +} + +typedef struct jay_ra_state { + /** Size of each register file */ + unsigned num_regs[JAY_NUM_RA_FILES]; + + /** First GPR that may be used for EOT sends */ + unsigned eot_offs; + + /** Phi coalescing data structure */ + struct phi_web_node *phi_web; + + /** + * Global SSA index -> jay_reg map. Unlike reg_for_index, once a register + * is picked it will not be shuffled. 
+ */ + jay_reg *global_reg_for_index; + + /** + * Block currently being processed. ra_state is allocated once per + * function but the following fields are updated as we go through the + * program. This keeps RA linearish time. + */ + jay_block *block; + + /** Builder for inserting shuffle code */ + jay_builder bld; + + /** Local SSA index -> jay_reg map. Only defined for live indices. */ + jay_reg *reg_for_index; + + /** + * Value occupying a register (register -> uint32_t reverse maps) for + * registers that are not available. Undefined for available registers. + */ + uint32_t *index_for_reg[JAY_NUM_RA_FILES]; + + /** Set of registers that are available */ + BITSET_WORD *available_regs[JAY_NUM_RA_FILES]; + + /** + * Within assign_regs_for_inst, the set of registers that have been + * assigned and are therefore pinned. + * + * Invariant: zeroed on entry to assign_regs_for_inst. + */ + BITSET_WORD *pinned[JAY_NUM_RA_FILES]; + + /** Vector affinities for each def. */ + struct affinity *affinities; +} jay_ra_state; + +static inline jay_reg +current_reg(const jay_ra_state *ra, uint32_t index) +{ + assert(index > 0 && index < ra->bld.func->ssa_alloc); + jay_reg reg = ra->reg_for_index[index]; + + assert(reg != NO_REG); + assert(ra->index_for_reg[r_file(reg)][r_reg(reg)] == index); + return reg; +} + +/** (dst, src) pairs for use in parallel copies */ +struct jay_parallel_copy { + jay_reg dst, src; +}; + +static void +add_copy(struct util_dynarray *copies, jay_reg dst, jay_reg src) +{ + if (dst != src) { + assert(r_file(dst) == r_file(src)); + util_dynarray_append(copies, ((struct jay_parallel_copy) { dst, src })); + } +} + +static jay_def +push_temp(jay_builder *b, jay_reg reg, bool stride4) +{ + jay_def tmp = def_from_reg(reg); + + if (stride4 && jay_def_stride(b->shader, tmp) != JAY_STRIDE_4) { + jay_def new = def_from_reg(0); + jay_MOV(b, tmp, new); + tmp = new; + } + + return tmp; +} + +static void +pop_temp(jay_builder *b, struct jay_temp_regs t, jay_def temp) +{ + if (temp.file == GPR && temp.reg != t.gpr) { + jay_MOV(b, temp, def_from_reg(t.gpr)); + } +} + +/* + * Insert a single logical copy. Like jay_MOV but expands to multiple moves + * involving a temporary register in some cases. + */ +static void +mov(jay_builder *b, jay_def dst, jay_def src, struct jay_temp_regs temps) +{ + if (dst.file == MEM && src.file == MEM) { + assert(temps.gpr != NO_REG && "ensured by the spill limit"); + jay_def temp = push_temp(b, temps.gpr, true /* stride4 */); + jay_MOV(b, temp, src); + jay_MOV(b, dst, temp); + pop_temp(b, temps, temp); + } else if (dst.file == UMEM && src.file == UMEM) { + assert(temps.ugpr != NO_REG && "ensured by the spill limit"); + jay_MOV(b, def_from_reg(temps.ugpr), src); + jay_MOV(b, dst, def_from_reg(temps.ugpr)); + } else { + jay_MOV(b, dst, src); + } +} + +/* + * Sequentialize a parallel copy. temps are registers free *before* the + * parallel copy. A temporary might be the destination of a copy, but it + * cannot be the source of any copy (since copying a free register is + * undefined). Therefore it cannot be a part of a cycle, so it is free for use + * (only) when handling cycles, which must happen before sequential copies. + */ +static void +jay_emit_parallel_copies(jay_builder *b, + struct jay_parallel_copy *pcopies, + unsigned num_copies, + struct jay_temp_regs temps) +{ + /* Compact away trivial copies upfront to reduce runtime. 
+    */
+   unsigned new_num_copies = 0;
+   for (unsigned i = 0; i < num_copies; ++i) {
+      assert(r_file(pcopies[i].dst) == r_file(pcopies[i].src));
+
+      if (pcopies[i].dst != pcopies[i].src) {
+         pcopies[new_num_copies++] = pcopies[i];
+      }
+   }
+
+   num_copies = new_num_copies;
+   if (num_copies == 0)
+      return;
+
+   assert(num_copies < UINT16_MAX);
+   BITSET_WORD *done = BITSET_CALLOC(num_copies);
+   uint16_t *reg_use_count[JAY_NUM_RA_FILES];
+   jay_foreach_ra_file(f) {
+      reg_use_count[f] = calloc(b->shader->num_regs[f], sizeof(uint16_t));
+   }
+
+   struct jay_parallel_copy *simple = malloc(num_copies * sizeof(*simple));
+   unsigned num_simple = 0;
+
+#ifndef NDEBUG
+   BITSET_WORD *packed = BITSET_CALLOC(UINT16_MAX);
+
+   if (0) {
+      const char *files = "ruMm";
+      printf("[[\n");
+
+      for (unsigned i = 0; i < num_copies; i++) {
+         printf(" %c%u = %c%u\n", files[r_file(pcopies[i].dst)],
+                r_reg(pcopies[i].dst), files[r_file(pcopies[i].src)],
+                r_reg(pcopies[i].src));
+      }
+
+      printf("]]\n");
+   }
+
+   /**
+    * Assert that each parallel copy destination is unique: no reg can appear
+    * as the destination of two parallel copies.
+    */
+   for (unsigned i = 0; i < num_copies; i++) {
+      assert(!BITSET_TEST(packed, pcopies[i].dst));
+      BITSET_SET(packed, pcopies[i].dst);
+   }
+
+   free(packed);
+#endif
+
+   for (unsigned i = 0; i < num_copies; i++) {
+      ++reg_use_count[r_file(pcopies[i].src)][r_reg(pcopies[i].src)];
+   }
+
+   bool progress;
+   do {
+      progress = false;
+
+      /* Step 1: resolve paths in the transfer graph. This means finding
+       * copies whose destinations aren't blocked by something else and then
+       * emitting them, continuing this process until every copy is blocked
+       * and there are only cycles left.
+       *
+       * TODO: We should note that src is also available in dest to unblock
+       * cycles that src is involved in.
+       */
+      for (unsigned i = 0; i < num_copies; i++) {
+         struct jay_parallel_copy *copy = &pcopies[i];
+
+         if (!BITSET_TEST(done, i) &&
+             reg_use_count[r_file(copy->dst)][r_reg(copy->dst)] == 0) {
+
+            simple[num_simple++] = *copy;
+            BITSET_SET(done, i);
+            --reg_use_count[r_file(copy->src)][r_reg(copy->src)];
+            progress = true;
+         }
+      }
+   } while (progress);
+
+   /* Step 2: resolve cycles through swapping.
+    *
+    * At this point, the transfer graph should consist of only cycles.
+    * The reason is that, given any reg n_1 that's the source of a
+    * remaining entry, it has a destination n_2, which (because every
+    * copy is blocked) is the source of some other copy whose destination
+    * is n_3, and so we can follow the chain until we get a cycle. If we
+    * reached some other node than n_1:
+    *
+    *    n_1 -> n_2 -> ... -> n_i
+    *            ^             |
+    *            |-------------|
+    *
+    * then n_2 would be the destination of 2 copies, which is illegal
+    * (checked above in an assert). So n_1 must be part of a cycle:
+    *
+    *    n_1 -> n_2 -> ... -> n_i
+    *    ^                     |
+    *    |---------------------|
+    *
+    * and this must be the only cycle n_1 is involved in, because any other
+    * path starting from n_1 would also have to end in n_1, resulting in
+    * a node somewhere along the way being the destination of 2 copies
+    * when the 2 paths merge.
+    *
+    * The way we resolve the cycle is through picking a copy (n_1, n_2)
+    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
+    * out of the cycle:
+    *
+    *    n_1 -> ... -> n_i
+    *    ^              |
+    *    |--------------|
+    *
+    * and we can keep repeating this until the cycle is empty. After each
+    * swap, we update sources of blocking copies. At that point, every
+    * blocking copy's source should be contained within our destination.
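+    *
+    * Concretely (hypothetical registers): in the cycle r1 -> r2 -> r3 -> r1,
+    * swapping r1 and r2 puts r1's old value in its final home r2; rewriting
+    * the blocked sources leaves the smaller cycle r1 -> r3 -> r1, which one
+    * more swap resolves.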
+ */ + for (unsigned i = 0; i < num_copies; i++) { + struct jay_parallel_copy *copy = &pcopies[i]; + + if (!BITSET_TEST(done, i) && copy->dst != copy->src) { + jay_def dst = def_from_reg(copy->dst), src = def_from_reg(copy->src); + assert(dst.file == src.file); + enum jay_file file = dst.file; + jay_reg tmp = (file == GPR || file == MEM) ? temps.gpr : temps.ugpr; + + if (tmp != NO_REG) { + struct jay_temp_regs t = { .gpr = temps.gpr2, .ugpr = temps.ugpr2 }; + jay_def temp = push_temp(b, tmp, file == MEM /* stride4 */); + { + mov(b, temp, dst, t); + mov(b, dst, src, t); + mov(b, src, temp, t); + } + pop_temp(b, temps, temp); + } else { + jay_SWAP(b, dst, src); + } + + for (unsigned j = 0; j < num_copies; j++) { + if (pcopies[j].src == copy->dst) + pcopies[j].src = copy->src; + } + + /* Simple copies are deferred. Their destinations do not conflict with + * our swaps, but we need to swap their sources to sink. + */ + for (unsigned j = 0; j < num_simple; j++) { + assert(simple[j].dst != copy->src && simple[j].dst != copy->dst); + + if (simple[j].src == copy->src) + simple[j].src = copy->dst; + else if (simple[j].src == copy->dst) + simple[j].src = copy->src; + } + } + + BITSET_SET(done, i); + } + + /* Emit moves after swaps because they fan out and thus increase demand. + * This gives us more freedom around temporaries. The rewrite of simple + * copies above ensures correctness. + * + * Simiarly, we first emit memory-memory copies since those require + * temporaries but only register copies can clobber the temporaries. + */ + for (unsigned i = 0; i < num_simple; i++) { + jay_def dst = def_from_reg(simple[i].dst); + jay_def src = def_from_reg(simple[i].src); + + if (jay_is_mem(dst) && jay_is_mem(src)) { + mov(b, dst, src, temps); + } + } + + for (unsigned i = 0; i < num_simple; i++) { + jay_def dst = def_from_reg(simple[i].dst); + jay_def src = def_from_reg(simple[i].src); + + if (!(jay_is_mem(dst) && jay_is_mem(src))) { + mov(b, dst, src, temps); + + if (temps.gpr == simple[i].dst || temps.gpr == simple[i].src) { + temps.gpr = NO_REG; + } + + if (temps.ugpr == simple[i].dst || temps.ugpr == simple[i].src) { + temps.ugpr = NO_REG; + } + } + } + + jay_foreach_ra_file(f) { + free(reg_use_count[f]); + } + + free(simple); + free(done); +} + +static bool +reg_is_available(jay_ra_state *ra, jay_reg reg) +{ + assert(reg != NO_REG); + return BITSET_TEST(ra->available_regs[r_file(reg)], r_reg(reg)); +} + +static void +assign_reg_for_index(jay_ra_state *ra, uint32_t index, jay_reg reg) +{ + /* Update our data structures */ + ra->reg_for_index[index] = reg; + ra->index_for_reg[r_file(reg)][r_reg(reg)] = index; + BITSET_CLEAR(ra->available_regs[r_file(reg)], r_reg(reg)); + + /* Update the web to the most recent register. Heuristic from Colombet. */ + ra->phi_web[phi_web_find(ra->phi_web, index)].reg = reg; + + /* Post-conditions */ + assert(!reg_is_available(ra, reg)); + assert(current_reg(ra, index) == reg); +} + +static void +release_reg(jay_ra_state *ra, jay_reg reg) +{ + /* Update available_regs only - the reg<-->index maps are invalidated. 
*/ + BITSET_SET(ra->available_regs[r_file(reg)], r_reg(reg)); +} + +static unsigned +register_demand(jay_ra_state *ra, enum jay_file f) +{ + unsigned n = ra->num_regs[f]; + return n - __bitset_prefix_sum(ra->available_regs[f], n, BITSET_WORDS(n)); +} + +static jay_reg +try_find_free_reg(jay_ra_state *ra, enum jay_file file, unsigned except) +{ + unsigned i; + + /* Prefer stride 4 temporaries, since they are more compatible and thus + * should reduce swapping on average. + */ + if (file == GPR) { + BITSET_FOREACH_SET(i, ra->available_regs[file], ra->num_regs[file]) { + if (i != except && + jay_gpr_to_stride(&ra->bld.shader->partition, i) == JAY_STRIDE_4) { + return make_reg(file, i); + } + } + } + + BITSET_FOREACH_SET(i, ra->available_regs[file], ra->num_regs[file]) { + if (i != except) { + return make_reg(file, i); + } + } + + return NO_REG; +} + +static jay_reg +find_free_reg(jay_ra_state *ra, enum jay_file file, unsigned except) +{ + jay_reg reg = try_find_free_reg(ra, file, except); + + if (reg == NO_REG) { + fprintf(stderr, "file %u, current demand %u, target %u\n", file, + register_demand(ra, file), ra->num_regs[file]); + UNREACHABLE("there should have been a free register"); + } + + return reg; +} + +static inline struct jay_temp_regs +find_temp_regs(jay_ra_state *ra) +{ + jay_reg gpr = try_find_free_reg(ra, GPR, ~0); + jay_reg ugpr = try_find_free_reg(ra, UGPR, ~0); + + return (struct jay_temp_regs) { + .gpr = gpr, + .ugpr = ugpr, + .gpr2 = try_find_free_reg(ra, GPR, gpr), + .ugpr2 = try_find_free_reg(ra, UGPR, ugpr), + }; +} + +static unsigned +pick_regs(jay_ra_state *ra, + enum jay_file file, + unsigned size, + unsigned alignment, + enum jay_stride min_stride, + enum jay_stride max_stride, + jay_inst *I, + jay_def var, + jay_def *last_killed, + bool is_src) +{ + struct jay_partition *partition = &ra->bld.shader->partition; + unsigned first = 0, end = ra->num_regs[file]; + unsigned ugpr_per_grf = jay_ugpr_per_grf(ra->bld.shader); + bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND; + must_tie &= !is_src; + + /* Cross-lane access cannot be SIMD split if the source/destination registers + * overlap, but as long as we don't tie those destinations, we're ok. + */ + bool may_tie = !jay_is_shuffle_like(I); + + /* Ensure we do not cross partitions */ + if (file == UGPR && size > 16) { + first = partition->large_ugpr_block.start; + end = partition->large_ugpr_block.start + partition->large_ugpr_block.len; + } + + /* Sources used by end-of-thread sends must be at the end of the file */ + if (I->op == JAY_OPCODE_SEND && jay_send_eot(I)) { + first = ra->eot_offs; + } + + /* If possible, keep sources in place to avoid shuffles. */ + if (is_src && jay_channel(var, 0) != 0) { + unsigned cur = r_reg(ra->reg_for_index[jay_channel(var, 0)]); + enum jay_stride stride = jay_gpr_to_stride(partition, cur); + + if (!BITSET_TEST_COUNT(ra->pinned[file], cur, size) && + util_is_aligned(cur, alignment) && + cur >= first && + cur + size <= end && + (file != GPR || (min_stride <= stride && stride <= max_stride))) { + return cur; + } + } + + unsigned best_cost = UINT32_MAX; + unsigned best_reg = 0; + struct affinity affinity = + ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, 0))].affinity; + + assert(alignment >= size && "alignment must be a multiple of size"); + + for (unsigned r = first; r + size <= end; r += alignment) { + unsigned cost = 0; + bool tied = last_killed && last_killed->reg == r; + enum jay_stride stride = + file == GPR ? jay_gpr_to_stride(partition, r) : min_stride; + + if ((tied ? 
!may_tie : + (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size))) || + !(min_stride <= stride && stride <= max_stride)) + continue; + + /* Assigning a stride that is too big may result in SIMDness splitting. + * Model that cost so we prefer packed registers. + */ + cost += stride - min_stride; + + /* If we are used for end-of-thread and it is not in the appropriate + * register, we will need to insert 1 copy per channel at the end. + */ + if (affinity.eot && r < ra->eot_offs) + cost += size; + + /* If there are stricter alignment requirements later, model the cost of + * inserting copies for that. + */ + if (affinity.grf_align && + !util_is_aligned(r - affinity.align_offs, ugpr_per_grf)) + cost += size; + + if (affinity.repr == jay_channel(var, 0)) { + /* If we are the collect representative but the final collect won't + * actually be usable, the whole vector will need to be copied. + */ + if (!util_is_aligned(r - affinity.offset, 8) || + (affinity.eot && r - affinity.offset < ra->eot_offs)) { + cost += 8; + } + } else if (affinity.repr) { + /* If we are used for a collect but not in the right place, we will + * similarly insert copies. + */ + if (ra->reg_for_index[affinity.repr] != NO_REG && + r_reg(ra->reg_for_index[affinity.repr]) != r - affinity.offset) { + + cost += size; + } + } + + for (unsigned c = 0; c < size; ++c) { + unsigned i = r + c; + + /* If the register is unavailable, account for the cost of shuffling */ + if (!BITSET_TEST(ra->available_regs[file], i) && !tied) { + cost++; + + /* ..plus the cost of shuffling back. */ + if (u_sparse_bitset_test(&ra->block->live_out, + ra->index_for_reg[file][i])) + cost++; + } + + /* Model the cost of shuffling for phis */ + if (c < jay_num_values(var)) { + struct phi_web_node *phi_web = + &ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, c))]; + if (phi_web->reg != NO_REG && r_reg(phi_web->reg) != i) { + cost += 2; + } + } + + /* Choosing this register will pin it, leaving it unavailable to later + * smaller sources which will need to be shuffled. Account for those + * moves. + * + * TODO: Faster algorithm. + */ + jay_foreach_src_index(I, s, c, index) { + if (jay_num_values(I->src[s]) < size && + ra->reg_for_index[index] == make_reg(file, i)) { + cost++; + } + } + } + + if (cost < best_cost) { + best_cost = cost; + best_reg = r; + + /* If we find something with 0 cost, we are guaranteed to pick this + * register, so terminate early. This speeds up the search. + */ + if (cost == 0) { + break; + } + } + } + + assert(best_cost != UINT32_MAX && "we always find something"); + assert(best_reg + size <= ra->num_regs[file]); + return best_reg; +} + +struct window { + jay_reg base; + uint16_t length; +}; +static_assert(sizeof(struct window) == 4, "packed"); + +static void +assign_regs_for_inst(jay_ra_state *ra, jay_inst *I) +{ + jay_shader *shader = ra->bld.shader; + jay_def *vars[JAY_MAX_OPERANDS]; + jay_def *last_killed[JAY_NUM_RA_FILES] = { 0 }; + jay_def saved_srcs[JAY_MAX_SRCS]; + struct jay_parallel_copy copies[JAY_MAX_DEF_LENGTH * JAY_MAX_OPERANDS]; + uint32_t eviction_indices[JAY_MAX_DEF_LENGTH * JAY_MAX_OPERANDS]; + unsigned nr_vars = 0, nr_copies = 0; + + /* Gather temporary registers that are free /before/ any shuffling */ + struct jay_temp_regs temp_regs = find_temp_regs(ra); + + /* Save sources so we can get at last-use info even after munging */ + typed_memcpy(saved_srcs, I->src, I->num_srcs); + + /* Gather sources (in order) then destinations. 
This order (with a stable + * sort) ensures we see killed sources before same-size destinations, + * naturally tying the last source to the destination. Predicated default + * values rely on this invariant for correctness. + */ + jay_foreach_ra_src(I, s) { + /* Filter out duplicate scalar sources - they should only be assigned + * once. Duplicated vector sources are lowered away as a precondition. + */ + bool duplicate = false; + if (jay_num_values(I->src[s]) == 1) { + uint32_t index = jay_index(I->src[s]); + + for (unsigned i = 0; i < nr_vars; ++i) { + jay_def var = *(vars[i]); + duplicate |= (jay_num_values(var) == 1 && jay_index(var) == index); + } + } + + if (!duplicate) { + vars[nr_vars++] = &I->src[s]; + + /* Record the old registers as parallel copies to be filled in later. + * Then release the old registers to be reassigned. + */ + jay_foreach_index(I->src[s], _, index) { + jay_reg reg = current_reg(ra, index); + assert(reg != NO_REG); + + eviction_indices[nr_copies] = index; + copies[nr_copies++] = (struct jay_parallel_copy) { .src = reg }; + release_reg(ra, reg); + } + } + } + + if (!jay_is_null(I->dst) && I->dst.file < JAY_NUM_RA_FILES) { + vars[nr_vars++] = &I->dst; + } + + /* Sort variables by size in descending order. We use insertion sort + * because it is stable, adaptive, and faster than mergesort for small n. + * + * Algorithm from CLRS. + */ + for (unsigned i = 1; i < nr_vars; ++i) { + jay_def *pivot = vars[i]; + unsigned j, key = pivot->num_values_m1; + + for (j = i; j > 0 && key > vars[j - 1]->num_values_m1; --j) { + vars[j] = vars[j - 1]; + } + + vars[j] = pivot; + } + + /* Partition `copies` into "source shuffles" and "livethrough shuffles" */ + uint32_t first_eviction_copy = nr_copies; + + /* Choose registers for sources/destinations in order */ + for (unsigned i = 0; i < nr_vars; ++i) { + bool is_src = vars[i] >= I->src; + bool killed = false; + jay_def var = *(vars[i]); + unsigned size = jay_num_values(var); + if (is_src) { + assert(util_is_power_of_two_nonzero(size) && "NPOT sources lowered"); + } else { + size = util_next_power_of_two(size); + } + + unsigned alignment = I->op == JAY_OPCODE_EXPAND_QUAD ? 1 : size; + enum jay_file file = var.file; + enum jay_stride min_stride = JAY_STRIDE_2, max_stride = JAY_STRIDE_8; + + assert(size > 0 && file < JAY_NUM_RA_FILES && "filtered above"); + + if (is_src) { + /* If a source is duplicated, we need to take the most constrained + * version. This matters for 3-src restrictions. + */ + jay_foreach_src(I, s) { + if (jay_defs_equivalent(var, I->src[s])) { + alignment = MAX2(alignment, jay_src_alignment(shader, I, s)); + min_stride = + MAX2(jay_src_stride_minmax(I, s, false), min_stride); + max_stride = MIN2(jay_src_stride_minmax(I, s, true), max_stride); + } + } + + unsigned s = vars[i] - I->src; + + /* Sources are considered killed only if completely killed */ + unsigned lu = jay_source_last_use_bit(saved_srcs, s); + + killed = true; + for (unsigned i = 0; i < size; ++i) { + if (jay_channel(I->src[s], i) == 0 || + !BITSET_TEST(I->last_use, lu + i)) { + killed = false; + break; + } + } + } else { + alignment = MAX2(alignment, jay_dst_alignment(shader, I)); + min_stride = jay_dst_stride_minmax(I, false); + max_stride = jay_dst_stride_minmax(I, true); + } + + /* Choose registers satisfying the constraints and minimizing shuffles */ + unsigned base = + pick_regs(ra, file, size, alignment, min_stride, max_stride, I, var, + is_src ? 
NULL : last_killed[file], is_src); + jay_reg reg = make_reg(file, base); + + /* If we decided to tie, process that */ + if (!is_src && last_killed[file] && last_killed[file]->reg == base) { + /* Fully killed source so we can zero a contiguous range. Note we need + * to use the unpadded size to avoid leaking a register for vec3 + * destinations tied to vec4 sources. + */ + unsigned offs = + jay_source_last_use_bit(saved_srcs, last_killed[file] - I->src); + BITSET_CLEAR_COUNT(I->last_use, offs, jay_num_values(var)); + last_killed[file] = NULL; + } else { + /* Otherwise pin our choice */ + BITSET_SET_COUNT(ra->pinned[file], base, size); + + for (unsigned c = 0; c < size; ++c) { + /* Evict any livethrough value interfering with our choice */ + if (!(is_src && jay_channel(var, c) == 0) && + !reg_is_available(ra, reg + c)) { + uint32_t index = ra->index_for_reg[file][base + c]; + struct jay_parallel_copy copy = { .src = reg + c }; + eviction_indices[nr_copies] = index; + copies[nr_copies++] = copy; + release_reg(ra, reg + c); + } + } + } + + jay_set_reg(vars[i], base); + + jay_foreach_index(var, c, index) { + assign_reg_for_index(ra, index, reg + c); + } + + if (killed) { + last_killed[file] = vars[i]; + } + } + + /* Set .reg late so duplicated scalar sources are handled properly */ + jay_foreach_ra_src(I, s) { + if (I->src[s]._payload != JAY_SENTINEL) { + jay_set_reg(&I->src[s], + r_reg(ra->reg_for_index[jay_channel(I->src[s], 0)])); + } + } + + /* Look up where shuffled sources ended up */ + for (unsigned i = 0; i < first_eviction_copy; ++i) { + copies[i].dst = ra->reg_for_index[eviction_indices[i]]; + } + + /* Assign new registers for evicted values */ + for (unsigned i = first_eviction_copy; i < nr_copies; ++i) { + copies[i].dst = find_free_reg(ra, r_file(copies[i].src), ~0); + assign_reg_for_index(ra, eviction_indices[i], copies[i].dst); + } + + /* Shuffle everything */ + ra->bld.cursor = jay_before_inst(I); + jay_emit_parallel_copies(&ra->bld, copies, nr_copies, temp_regs); + + /* Reset data structures */ + for (unsigned i = 0; i < nr_vars; ++i) { + jay_def var = *(vars[i]); + BITSET_CLEAR_COUNT(ra->pinned[var.file], var.reg, + util_next_power_of_two(jay_num_values(var))); + } + + /* Sources selected for early-kill have had their last_use fields cleared. + * Anything else is late-killed. Release those registers. 
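+    * (An early kill is a fully-killed source that was tied to a destination
+    * above; the tie already cleared its last_use bits, so the loop below
+    * releases only the remaining late kills.)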
+    */
+   unsigned kill_idx = 0;
+   jay_foreach_ssa_src(I, s) {
+      jay_foreach_index(saved_srcs[s], c, idx) {
+         if (is_ra_src(I->src[s]) && BITSET_TEST(I->last_use, kill_idx)) {
+            release_reg(ra, make_reg(I->src[s].file, I->src[s].reg + c));
+         }
+
+         kill_idx++;
+      }
+   }
+}
+
+static void
+local_ra(jay_ra_state *ra, jay_block *block)
+{
+   ra->block = block;
+
+   /* Initialize local data structures based on global state */
+   jay_foreach_ra_file(file) {
+      BITSET_SET_COUNT(ra->available_regs[file], 0, ra->num_regs[file]);
+   }
+
+   U_SPARSE_BITSET_FOREACH_SET(&block->live_in, i) {
+      if (ra->global_reg_for_index[i] != NO_REG) {
+         assign_reg_for_index(ra, i, ra->global_reg_for_index[i]);
+      }
+   }
+
+   /* Assign registers locally */
+   jay_foreach_inst_in_block(block, I) {
+      if (I->op == JAY_OPCODE_PHI_SRC) {
+         break;
+      } else if (I->op == JAY_OPCODE_PHI_DST) {
+         /* Phis are special as we never shuffle them */
+         unsigned index = jay_index(I->dst);
+         jay_reg reg = ra->phi_web[phi_web_find(ra->phi_web, index)].reg;
+
+         if (reg == NO_REG || !reg_is_available(ra, reg)) {
+            reg = find_free_reg(ra, I->dst.file, ~0);
+         }
+
+         assign_reg_for_index(ra, jay_index(I->dst), reg);
+         I->dst.reg = r_reg(reg);
+      } else if (I->op == JAY_OPCODE_PRELOAD) {
+         /* Preloads always get what they want */
+         I->dst.reg = jay_preload_reg(I);
+         jay_reg base = make_reg(I->dst.file, I->dst.reg);
+
+         jay_foreach_comp(I->dst, c) {
+            assert(reg_is_available(ra, base + c) && "preloads always work");
+            assign_reg_for_index(ra, jay_channel(I->dst, c), base + c);
+         }
+      } else {
+         /* For normal instructions, assign registers. */
+         assign_regs_for_inst(ra, I);
+      }
+
+      /* Release registers for destinations that are immediately killed */
+      jay_foreach_index(I->dst, _, index) {
+         if (BITSET_TEST(ra->bld.func->dead_defs, index)) {
+            release_reg(ra, current_reg(ra, index));
+         }
+      }
+
+      if (jay_debug & JAY_DBG_PRINTDEMAND) {
+         printf("(RA) [G:%u\tU:%u] ", register_demand(ra, GPR),
+                register_demand(ra, UGPR));
+         jay_print_inst(stdout, I);
+      }
+   }
+
+   /* Gather temporary registers that are free /before/ any shuffling */
+   struct jay_temp_regs temp_regs = find_temp_regs(ra);
+
+   /* Reconcile local state with the global structures */
+   jay_foreach_ra_file(file) {
+      BITSET_SET_COUNT(ra->available_regs[file], 0, ra->num_regs[file]);
+   }
+
+   /* Extend live ranges for correctness. There might be a better solution,
+    * though.
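+    *
+    * Specifically, the loop below adds the sources of trailing PHI_SRC and
+    * control-flow instructions to live_out, so the end-of-block shuffles
+    * emitted later cannot reuse their registers.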
*/ + jay_foreach_inst_in_block_rev(block, I) { + if (I->op != JAY_OPCODE_PHI_SRC && !jay_op_is_control_flow(I->op)) { + break; + } + + jay_foreach_ra_src(I, s) { + u_sparse_bitset_set(&block->live_out, jay_index(I->src[s])); + } + } + + /* Already assigned global registers need to be shuffled back */ + struct util_dynarray copies = UTIL_DYNARRAY_INIT; + + U_SPARSE_BITSET_FOREACH_SET(&block->live_out, i) { + jay_reg lreg = ra->reg_for_index[i], greg = ra->global_reg_for_index[i]; + + if (lreg != NO_REG && greg != NO_REG) { + add_copy(&copies, greg, lreg); + assign_reg_for_index(ra, i, greg); + } + } + + /* Live-out variables defined in this block need global registers assigned */ + U_SPARSE_BITSET_FOREACH_SET(&block->live_out, i) { + jay_reg reg = ra->reg_for_index[i]; + + if (ra->global_reg_for_index[i] == NO_REG && reg != NO_REG) { + if (!reg_is_available(ra, reg)) { + jay_reg old = reg; + reg = find_free_reg(ra, r_file(reg), ~0); + add_copy(&copies, reg, old); + } + + assign_reg_for_index(ra, i, reg); + ra->global_reg_for_index[i] = reg; + } + } + + /* Gather temporary registers free after shuffling (before phis) */ + block->temps_out = find_temp_regs(ra); + + /* Handle the end of the block */ + ra->bld.cursor = jay_before_block(block); + + jay_foreach_inst_in_block_rev(block, I) { + if (I->op != JAY_OPCODE_PHI_SRC && !jay_op_is_control_flow(I->op)) { + ra->bld.cursor = jay_after_inst(I); + break; + } + + jay_foreach_ra_src(I, s) { + jay_set_reg(&I->src[s], + r_reg(ra->global_reg_for_index[jay_index(I->src[s])])); + } + } + + const unsigned num_pcopies = + util_dynarray_num_elements(&copies, struct jay_parallel_copy); + + jay_emit_parallel_copies(&ra->bld, copies.data, num_pcopies, temp_regs); + util_dynarray_fini(&copies); +} + +/* + * Record all phi webs. First initialize the union-find data structure + * with all SSA defs in their own singletons, then union together anything + * related by a phi. The resulting union-find structure will be the webs. + */ +static void +construct_phi_webs(struct phi_web_node *web, jay_function *f) +{ + for (unsigned i = 0; i < f->ssa_alloc; ++i) { + web[i] = (struct phi_web_node) { .parent = i, .reg = NO_REG }; + } + + jay_foreach_block(f, block) { + jay_foreach_phi_src_in_block(block, phi) { + phi_web_union(web, jay_index(phi->src[0]), jay_phi_src_index(phi)); + } + } +} + +static void +insert_parallel_copies_for_phis(jay_function *f) +{ + jay_reg *phi_dsts = calloc(f->ssa_alloc, sizeof(jay_reg)); + struct util_dynarray copies = UTIL_DYNARRAY_INIT; + memset(phi_dsts, 0xFF, sizeof(jay_reg) * f->ssa_alloc); + + jay_foreach_block(f, block) { + jay_foreach_phi_dst_in_block(block, I) { + phi_dsts[jay_index(I->dst)] = make_reg(I->dst.file, I->dst.reg); + } + } + + jay_foreach_block(f, block) { + jay_builder b = jay_init_builder(f, jay_before_jump(block)); + + /* Copy phi source to phi destination along the edge. 
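+ * Both sides already carry physical registers here (local RA has run), so + * each phi lowers to a plain register move; emitting the moves as parallel + * copies with the block's saved temps resolves any swap cycles.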
*/ + jay_foreach_phi_src_in_block(block, phi) { + jay_reg src = make_reg(phi->src[0].file, phi->src[0].reg); + add_copy(&copies, phi_dsts[jay_phi_src_index(phi)], src); + jay_remove_instruction(phi); + } + + const unsigned nr = + util_dynarray_num_elements(&copies, struct jay_parallel_copy); + + jay_emit_parallel_copies(&b, copies.data, nr, block->temps_out); + util_dynarray_clear(&copies); + } + + util_dynarray_fini(&copies); + free(phi_dsts); +} + +static struct jay_register_block +block_gpr_to_grf(struct jay_partition *p, enum jay_file file, unsigned block) +{ + assert(file == GPR || file == UGPR); + assert(((p->blocks[file][block].start * 16) % p->units_x16[file]) == 0); + assert(((p->blocks[file][block].len * 16) % p->units_x16[file]) == 0); + + return (struct jay_register_block) { + .start = (p->blocks[file][block].start * 16) / p->units_x16[file], + .len = (p->blocks[file][block].len * 16) / p->units_x16[file], + }; +} + +static void +print_partition(struct jay_partition *p) +{ + for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { + for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { + struct jay_register_block B = block_gpr_to_grf(p, f, b); + const char *file = f ? "UGPR" : "GPR"; + + if (B.len > 1) { + fprintf(stderr, "%s: %u-%u\n", file, B.start, B.start + B.len - 1); + } else if (B.len == 1) { + fprintf(stderr, "%s: %u\n", file, B.start); + } + } + } + + fprintf(stderr, "\n"); +} + +/* + * Verify that a register partition is a bijective mapping of the GRF file. + */ +static void +validate_partition(struct jay_partition *p, + unsigned stride4_header_size, + unsigned nonuniform_gprs) +{ + BITSET_DECLARE(regs, JAY_NUM_PHYS_GRF) = { 0 }; + + for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { + for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { + struct jay_register_block B = block_gpr_to_grf(p, f, b); + if (B.len) { + assert(B.start + B.len <= JAY_NUM_PHYS_GRF && "GRF file size"); + assert(!BITSET_TEST_COUNT(regs, B.start, B.len) && "uniqueness"); + + BITSET_SET_COUNT(regs, B.start, B.len); + } + } + } + + for (unsigned i = 0; i < JAY_NUM_PHYS_GRF; ++i) { + assert(BITSET_TEST(regs, i) && "all GRFs mapped"); + } + + assert(p->large_ugpr_block.len && "partition must have a large UGPR block"); + assert(p->base2 >= p->base8 && p->base_eot >= p->base2 && "monotonic"); + assert(p->base8 >= stride4_header_size && "header is big enough"); + assert(p->base_eot + p->units_x16[GPR] <= nonuniform_gprs && "EOT fits"); + assert(util_is_aligned(p->base8, 8) && "so vectors don't cross"); + assert(util_is_aligned(p->base2, 8) && "so vectors don't cross"); + assert(util_is_aligned(p->base_eot, 8) && "so vectors don't cross"); +} + +static void +build_partition(jay_shader *shader, unsigned *blocks, unsigned n) +{ + unsigned base = 0; + unsigned ugpr_base = 0; + struct jay_partition *p = &shader->partition; + + *p = (struct jay_partition) { + .units_x16[UGPR] = jay_ugpr_per_grf(shader) * 16, + .units_x16[GPR] = 16 / jay_grf_per_gpr(shader), + }; + + for (unsigned i = 0; i < n; ++i) { + enum jay_file file = (i & 1) ? 
GPR : UGPR; + unsigned file_i = i >> 1; + + p->blocks[file][file_i].start = (base * p->units_x16[file]) / 16; + p->blocks[file][file_i].len = (blocks[i] * p->units_x16[file]) / 16; + + if (file == UGPR && blocks[i] >= 8) { + p->large_ugpr_block = (struct jay_register_block) { + .start = (ugpr_base * p->units_x16[file]) / 16, + .len = p->blocks[file][file_i].len, + }; + } + + base += blocks[i]; + if (file == UGPR) { + ugpr_base += blocks[i]; + } + + /* GPR partition blocks must be vector size aligned to avoid crossing */ + if (file == GPR && i != (n - 1)) { + unsigned max_vec = 8; + assert(util_is_aligned(blocks[i], max_vec * jay_grf_per_gpr(shader))); + } + } +} + +/* + * Partition the register file for the entire shader. All functions must + * share the same partition for correctness with non-uniform function calls. + * For unlinked library functions, we must use the ABI partition (TODO). + */ +void +jay_partition_grf(jay_shader *shader) +{ + /* Calculate the maximum register demand across all functions in the shader. + * We will use this to choose a good partition. + */ + struct jay_partition *p = &shader->partition; + unsigned demand[JAY_NUM_GRF_FILES] = { 0 }; + + jay_foreach_function(shader, f) { + jay_compute_liveness(f); + jay_calculate_register_demands(f); + + demand[GPR] = MAX2(demand[GPR], f->demand[GPR]); + demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]); + } + + /* We must have enough register file space for the register payload, plus the + * reserved UGPRs in the case we spill. That UGPR interferes with everything + * we preload so it needs to be reserved specially here for the worst case. + */ + jay_foreach_preload(jay_shader_get_entrypoint(shader), I) { + unsigned end = jay_preload_reg(I) + jay_num_values(I->dst); + unsigned extra = I->dst.file == UGPR ? shader->dispatch_width + 1 : 0; + assert(I->dst.file < JAY_NUM_GRF_FILES); + demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra); + } + + /* Determine a good GPR/UGPR split informed by the demand calculation */ + unsigned ugpr_per_grf = jay_ugpr_per_grf(shader); + unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf); + + /* We must have enough for SIMD1 images (TODO: Check if this actually + * applies. Or if we could eliminate this with smarter partitioning even.) 
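+ * (Hence min_ugprs below; the MAX2 up to 256 reads like a conservative + * placeholder until that TODO is settled.)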
+ */ + unsigned min_ugprs = 16; + min_ugprs = MAX2(min_ugprs, 256); + + unsigned grf_block_alignment = 8 * jay_grf_per_gpr(shader); /* max_vec */ + + /* TODO: We could partition more cleverly */ + uniform_grfs = CLAMP(align(uniform_grfs, grf_block_alignment), + DIV_ROUND_UP(min_ugprs, ugpr_per_grf), + 128 - (32 * jay_grf_per_gpr(shader))); + unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs; + + /* Check the split */ + assert((uniform_grfs * ugpr_per_grf) >= min_ugprs); + assert(nonuniform_grfs >= 32 * jay_grf_per_gpr(shader)); + assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF); + + /* Partition GRFs between GPR & UGPR */ + unsigned dispatch_grf = 0; + unsigned stride4_header_size = 0; + + if (shader->stage == MESA_SHADER_VERTEX) { + unsigned attrib_grfs = shader->prog_data->vue.urb_read_length * 8; + unsigned blocks[] = { + 1, /* UGPR: g0 */ + 8, /* GPR: URB output handle */ + shader->push_grfs, /* UGPR: Push constants */ + attrib_grfs, /* GPR: Vertex inputs */ + uniform_grfs - (blocks[0] + blocks[2]), /* UGPR: * */ + nonuniform_grfs - (blocks[1] + blocks[3]), /* GPR: * and EOT */ + }; + + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + dispatch_grf = blocks[0] + blocks[1]; + stride4_header_size = blocks[1] + blocks[3]; + } else if (shader->stage == MESA_SHADER_FRAGMENT) { + unsigned len0 = jay_grf_per_gpr(shader); + unsigned blocks[] = { + len0, /* UGPR: g0 (and maybe g1) */ + len0 * 8, /* GPR: Barycentrics */ + uniform_grfs - len0, /* UGPR: Dispatch (eg push constants) & general */ + nonuniform_grfs - (len0 * 8), /* GPR: General & end-of-thread */ + }; + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + dispatch_grf = blocks[0] + blocks[1]; + stride4_header_size = blocks[1]; + } else { + unsigned blocks[] = { uniform_grfs - 4, nonuniform_grfs, 4 }; + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + } + + /* TODO: Make the stride partition smarter */ + unsigned nonuniform_gprs = nonuniform_grfs / jay_grf_per_gpr(shader); + unsigned eot_gprs = 16 / jay_grf_per_gpr(shader); + p->base8 = ROUND_DOWN_TO(nonuniform_gprs - (16 + eot_gprs), 8) + 0; + p->base2 = 8 + p->base8; + p->base_eot = 8 + p->base2; + + // print_partition(p); + validate_partition(p, stride4_header_size, nonuniform_gprs); + + if (shader->stage == MESA_SHADER_FRAGMENT && shader->dispatch_width == 32) { + shader->prog_data->fs.dispatch_grf_start_reg_32 = dispatch_grf; + } else if (shader->stage == MESA_SHADER_FRAGMENT && + shader->dispatch_width == 16) { + shader->prog_data->fs.dispatch_grf_start_reg_16 = dispatch_grf; + } else { + shader->prog_data->base.dispatch_grf_start_reg = dispatch_grf; + } + + /* By construction of our partition, the entire GRF is used. */ + shader->prog_data->base.grf_used = JAY_NUM_PHYS_GRF; + + /* Set the targets for the virtual register file accordingly */ + for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { + for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { + shader->num_regs[f] += p->blocks[f][b].len; + } + } + + /* TODO: These are arbitrary. Need to rework somehow, we have options. */ + shader->num_regs[MEM] = 512; + shader->num_regs[UMEM] = 2048; +} + +static void +spill_file(jay_function *f, enum jay_file file, bool *spilled) +{ + unsigned limit = f->shader->num_regs[file]; + + /* If testing spilling, set limit tightly. 
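+ * (13 is an arbitrary stress value; vertex shaders are presumably skipped + * because their preloaded attributes alone would overflow such a tight + * limit.)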
*/ + if ((jay_debug & JAY_DBG_SPILL) && + file == GPR && + f->shader->stage != MESA_SHADER_VERTEX) { + limit = 13; + } + + /* Ensures we don't XOR swap, XXX: TODO: FIXME */ + limit--; + + if (f->demand[file] > limit) { + /* In the worst case, we + * require 2 temporary registers to lower a memory-memory swap produced by + * parallel copy lowering, so adjust the limit to be num_regs - 2. + */ + limit--; + + /* If we spill, we need to reserve UGPRs for spilling */ + if (!(*spilled)) { + unsigned reservation = f->shader->dispatch_width + 1; + f->shader->num_regs[UGPR] -= reservation; + f->shader->partition.large_ugpr_block.len -= reservation; + } + + jay_spill(f, file, limit); + jay_validate(f->shader, "spilling"); + jay_compute_liveness(f); + jay_calculate_register_demands(f); + + if (f->demand[file] > limit) { + fprintf(stderr, "limit %u but demand %u\n", limit, f->demand[file]); + UNREACHABLE("spiller bug"); + } + + *spilled = true; + } +} + +static void +jay_register_allocate_function(jay_function *f) +{ + jay_shader *shader = f->shader; + jay_ra_state ra = { .bld.shader = shader, .bld.func = f }; + + /* Spill as needed to fit within the limits. We spill GPR before UGPR since + * spilling GPRs requires reserving a UGPR. + */ + bool spilled = false; + spill_file(f, GPR, &spilled); + spill_file(f, UGPR, &spilled); + + typed_memcpy(ra.num_regs, shader->num_regs, JAY_NUM_RA_FILES); + + /* The end of the register file is allowed for end-of-thread messages. + * Calculate the offset in GPRs. Compute shaders have this as UGPRs while + * fragment shaders have this as GPRs. + */ + if (mesa_shader_stage_is_compute(shader->stage)) { + ra.eot_offs = ROUND_DOWN_TO(ra.num_regs[UGPR], jay_ugpr_per_grf(shader)) - + jay_ugpr_per_grf(shader); + } else { + ra.eot_offs = ra.num_regs[GPR] - (16 / jay_grf_per_gpr(shader)); + } + + linear_ctx *lin_ctx = linear_context(shader); + + ra.reg_for_index = linear_alloc_array(lin_ctx, jay_reg, f->ssa_alloc); + ra.global_reg_for_index = linear_alloc_array(lin_ctx, jay_reg, f->ssa_alloc); + ra.affinities = linear_zalloc_array(lin_ctx, struct affinity, f->ssa_alloc); + + memset(ra.reg_for_index, 0xFF, sizeof(jay_reg) * f->ssa_alloc); + memset(ra.global_reg_for_index, 0xFF, sizeof(jay_reg) * f->ssa_alloc); + + jay_foreach_ra_file(file) { + const unsigned num_regs = ra.num_regs[file]; + ra.index_for_reg[file] = linear_zalloc_array(lin_ctx, uint32_t, num_regs); + ra.available_regs[file] = BITSET_LINEAR_ZALLOC(lin_ctx, num_regs); + ra.pinned[file] = BITSET_LINEAR_ZALLOC(lin_ctx, num_regs); + } + + ra.phi_web = linear_zalloc_array(lin_ctx, struct phi_web_node, f->ssa_alloc); + + /* Construct the phi equivalence classes using the union-find data + * structure. This associates all SSA values related to the same phi, + * and selects one of them as a canonical/representative value. + */ + construct_phi_webs(ra.phi_web, f); + + jay_foreach_inst_in_func(f, block, I) { + jay_foreach_src_index(I, s, c, index) { + if (jay_num_values(I->src[s]) > 1) { + uint32_t repr = UINT_MAX, repr_c = 0; + + /* Pick the representative with the smallest index, as it most + * likely dominates the other components. + */ + jay_foreach_comp(I->src[s], j) { + if (jay_channel(I->src[s], j) < repr) { + repr = jay_channel(I->src[s], j); + repr_c = j; + } + } + + ra.affinities[index].repr = repr; + ra.affinities[index].offset = repr == index ? 
c : c - repr_c; + } + + if (I->op == JAY_OPCODE_SEND && jay_send_eot(I)) { + ra.affinities[index].eot = true; + } + + if (jay_src_alignment(shader, I, s) >= jay_ugpr_per_grf(shader)) { + ra.affinities[index].grf_align = true; + ra.affinities[index].align_offs = c; + } + + ra.phi_web[phi_web_find(ra.phi_web, index)].affinity = + ra.affinities[index]; + } + } + + jay_foreach_block(f, block) { + local_ra(&ra, block); + } + + linear_free_context(lin_ctx); + + /* Validate the registers we picked before going out of SSA */ + jay_validate_ra(f); + + insert_parallel_copies_for_phis(f); + + /* Lower spills using the UGPRs we stole above. We need to update num_regs + * for correct scoreboarding calculations. + */ + if (spilled) { + jay_lower_spill(f); + f->shader->num_regs[UGPR] += f->shader->dispatch_width + 1; + } +} + +void +jay_register_allocate(jay_shader *s) +{ + jay_foreach_function(s, f) { + jay_register_allocate_function(f); + } + + s->post_ra = true; +} diff --git a/src/intel/compiler/jay/jay_repair_ssa.c b/src/intel/compiler/jay/jay_repair_ssa.c new file mode 100644 index 00000000000..794f3977cdf --- /dev/null +++ b/src/intel/compiler/jay/jay_repair_ssa.c @@ -0,0 +1,247 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2023 Alyssa Rosenzweig + * Copyright 2023 Valve Corporation + * Copyright 2022 Collabora Ltd. + * SPDX-License-Identifier: MIT + */ + +/* + * Implementation of "Simple and Efficient + * Construction of Static Single Assignment Form", also by Braun et al. + * https://link.springer.com/content/pdf/10.1007/978-3-642-37051-9_6.pdf + */ + +#include "util/bitset.h" +#include "util/hash_table.h" +#include "util/ralloc.h" +#include "util/u_dynarray.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +struct incomplete_phi { + jay_def old; + unsigned new; +}; + +struct phi { + jay_block *block; + unsigned *src; + jay_def old; + unsigned dst; +}; + +struct ctx { + /* Array of index->index maps with the remapped definition at block end */ + struct hash_table_u64 **defs; + struct hash_table_u64 *remap; + struct util_dynarray phis, indices, *incomplete_phis; + BITSET_WORD *sealed; + void *linctx; + unsigned alloc, idx_i; +}; + +#define jay_repair_foreach_phi(ctx, phi) \ + util_dynarray_foreach(&(ctx)->phis, struct phi, phi) \ + if (phi->block != NULL) + +static unsigned lookup(struct ctx *ctx, jay_block *block, jay_def def); + +static unsigned +remap_idx(struct ctx *ctx, unsigned idx) +{ + /* TODO: Switch to union-find */ + void *remapped; + while ((remapped = _mesa_hash_table_u64_search(ctx->remap, idx))) { + idx = (uintptr_t) remapped; + } + + return idx; +} + +static bool +try_remove_trivial_phi(struct ctx *ctx, struct phi *phi) +{ + unsigned same = 0; + for (unsigned i = 0; i < jay_num_predecessors(phi->block); ++i) { + unsigned src = remap_idx(ctx, phi->src[i]); + if (same && src != same && src != phi->dst) { + /* Nontrivial */ + return false; + } + + if (src != phi->dst) { + same = src; + } + } + + _mesa_hash_table_u64_insert(ctx->remap, phi->dst, (void *) (uintptr_t) same); + phi->block = NULL; + return true; +} + +static void +add_phi(struct ctx *ctx, jay_block *block, jay_def src, unsigned dst) +{ + unsigned i = 0, n = jay_num_predecessors(block); + unsigned *srcs = linear_alloc_array(ctx->linctx, unsigned, n); + jay_foreach_predecessor(block, pred) { + assert(i < n); + srcs[i++] = lookup(ctx, *pred, src); + } + + struct phi tmpl = { .block = block, .old = src, .dst = dst, .src = srcs 
}; + if (!try_remove_trivial_phi(ctx, &tmpl)) { + util_dynarray_append(&ctx->phis, tmpl); + } +} + +static unsigned +lookup(struct ctx *ctx, jay_block *block, jay_def def) +{ + /* Lookup within a block */ + struct hash_table_u64 *ht = ctx->defs[block->index]; + void *local = _mesa_hash_table_u64_search(ht, jay_index(def)); + if (local) { + return (uintptr_t) local; + } + + /* For a single predecessor, we can recurse without adding a phi. */ + bool insert_phi = jay_num_predecessors(block) > 1; + unsigned val = insert_phi ? ctx->alloc++ : + lookup(ctx, jay_first_predecessor(block), def); + + _mesa_hash_table_u64_insert(ctx->defs[block->index], jay_index(def), + (void *) (uintptr_t) val); + + if (block->loop_header && !BITSET_TEST(ctx->sealed, block->index)) { + struct incomplete_phi tmpl = { .old = def, .new = val }; + util_dynarray_append(&ctx->incomplete_phis[block->index], tmpl); + } else if (insert_phi) { + add_phi(ctx, block, def, val); + } + + return val; +} + +static void +remap(struct ctx *ctx, jay_builder *b, jay_def *inout) +{ + jay_def def = *inout; + unsigned reg = def.reg; + jay_foreach_index(def, c, index) { + unsigned el = ctx->idx_i++; + assert(el < util_dynarray_num_elements(&ctx->indices, unsigned)); + unsigned idx = *util_dynarray_element(&ctx->indices, unsigned, el); + idx = remap_idx(ctx, idx); + jay_insert_channel(b, inout, c, jay_scalar(def.file, idx)); + } + + /* We run after flag RA, so preserve flag registers */ + if (jay_is_flag(def)) { + inout->reg = reg; + } +} + +void +jay_repair_ssa(jay_function *func) +{ + jay_builder b = jay_init_builder(func, jay_before_function(func)); + void *memctx = ralloc_context(NULL); + void *linctx = linear_context(memctx); + BITSET_WORD *sealed = BITSET_LINEAR_ZALLOC(linctx, func->num_blocks); + struct ctx ctx = { .sealed = sealed, .alloc = 1, .linctx = linctx }; + unsigned *phi_remap = linear_zalloc_array(linctx, unsigned, func->ssa_alloc); + + ctx.remap = _mesa_hash_table_u64_create(memctx); + ctx.defs = + linear_alloc_array(linctx, struct hash_table_u64 *, func->num_blocks); + ctx.incomplete_phis = + linear_alloc_array(linctx, struct util_dynarray, func->num_blocks); + + jay_foreach_block(func, block) { + ctx.defs[block->index] = _mesa_hash_table_u64_create(memctx); + util_dynarray_init(&ctx.incomplete_phis[block->index], memctx); + } + + util_dynarray_init(&ctx.phis, memctx); + util_dynarray_init(&ctx.indices, memctx); + + jay_foreach_block(func, block) { + jay_foreach_inst_in_block(block, I) { + jay_foreach_src_index(I, s, c, index) { + unsigned val = lookup(&ctx, block, jay_extract(I->src[s], c)); + util_dynarray_append(&ctx.indices, val); + } + + jay_foreach_dst_index(I, d, index) { + unsigned val = ctx.alloc++; + util_dynarray_append(&ctx.indices, val); + if (I->op == JAY_OPCODE_PHI_DST) { + phi_remap[index] = val; + } + + _mesa_hash_table_u64_insert(ctx.defs[block->index], index, + (void *) (uintptr_t) val); + } + } + + /* Seal loop headers after processing the back edge */ + jay_foreach_successor(block, succ) { + if (succ->loop_header && succ->index <= block->index) { + util_dynarray_foreach(&ctx.incomplete_phis[succ->index], + struct incomplete_phi, el) { + add_phi(&ctx, succ, el->old, el->new); + } + + assert(!BITSET_TEST(sealed, succ->index) && "unique backedge"); + BITSET_SET(sealed, succ->index); + } + } + } + + /* Optimize trivial phis resulting from backedges. Use-lists would avoid the + * fixed point algorithm but this should be good enough for now. 
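+ * + * For example, a value defined before a loop and never redefined inside it + * yields phi(x, x) once the backedge is sealed; folding that back to x can + * make another phi trivial in turn, hence the fixed point.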
+ */ + bool progress; + do { + progress = false; + jay_repair_foreach_phi(&ctx, phi) { + progress |= try_remove_trivial_phi(&ctx, phi); + } + } while (progress); + + /* Now apply everything */ + jay_foreach_block(func, block) { + jay_foreach_phi_src_in_block(block, I) { + jay_set_phi_src_index(I, phi_remap[jay_phi_src_index(I)]); + } + + jay_foreach_inst_in_block(block, I) { + jay_foreach_ssa_src(I, s) { + remap(&ctx, &b, &I->src[s]); + } + + remap(&ctx, &b, &I->dst); + remap(&ctx, &b, &I->cond_flag); + } + } + + jay_repair_foreach_phi(&ctx, phi) { + b.cursor = jay_before_block(phi->block); + jay_PHI_DST(&b, jay_scalar(phi->old.file, phi->dst)); + + unsigned i = 0; + jay_foreach_predecessor(phi->block, pred) { + b.cursor = jay_before_jump(*pred); + unsigned idx = remap_idx(&ctx, phi->src[i++]); + jay_PHI_SRC_u32(&b, jay_scalar(phi->old.file, idx), phi->dst); + } + } + + func->ssa_alloc = ctx.alloc; + ralloc_free(memctx); +} diff --git a/src/intel/compiler/jay/jay_simd_width.c b/src/intel/compiler/jay/jay_simd_width.c new file mode 100644 index 00000000000..86a48ba320d --- /dev/null +++ b/src/intel/compiler/jay/jay_simd_width.c @@ -0,0 +1,63 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_ir.h" +#include "jay_opcodes.h" + +static unsigned +max_simd_width(jay_shader *shader, const jay_inst *I) +{ + /* Only certain "complex" quad swizzles require splitting down to SIMD4 */ + if (I->op == JAY_OPCODE_QUAD_SWIZZLE && + (jay_quad_swizzle_swizzle(I) == JAY_QUAD_SWIZZLE_XYXY || + jay_quad_swizzle_swizzle(I) == JAY_QUAD_SWIZZLE_ZWZW)) { + return 4; + } + + /* These special instructions need to be split for various reasons. */ + if (I->op == JAY_OPCODE_EXPAND_QUAD || + I->op == JAY_OPCODE_EXTRACT_LAYER || + I->op == JAY_OPCODE_EXTRACT_BYTE_PER_8LANES || + I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS || + I->op == JAY_OPCODE_MUL_32 || + I->op == JAY_OPCODE_SHUFFLE) { + return 16; + } + + if (I->op != JAY_OPCODE_SEND) { + /* If any source/destination is 64-bit strided, we must split to avoid + * crossing more than 2 GRFs. Note that SENDs don't have this restriction, + * we don't have to split A64 load/store. + */ + if (I->dst.file == GPR && + jay_def_stride(shader, I->dst) == JAY_STRIDE_8) { + return 16; + } + + jay_foreach_src(I, s) { + if (I->src[s].file == GPR && + jay_def_stride(shader, I->src[s]) == JAY_STRIDE_8) { + return 16; + } + } + } else { + /* TODO: Do we ever split SENDs? ..Can we even split SENDs given we don't + * have stride control? How is this supposed to work? + * + * XXX + */ + } + + return 32; +} + +unsigned +jay_simd_split(jay_shader *s, const jay_inst *I) +{ + unsigned actual = jay_simd_width_logical(s, I); + unsigned max = max_simd_width(s, I); + + return (actual > max) ? (util_logbase2(actual) - util_logbase2(max)) : 0; +} diff --git a/src/intel/compiler/jay/jay_spill.c b/src/intel/compiler/jay/jay_spill.c new file mode 100644 index 00000000000..f4c3b85789c --- /dev/null +++ b/src/intel/compiler/jay/jay_spill.c @@ -0,0 +1,849 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2023-2024 Alyssa Rosenzweig + * Copyright 2023-2024 Valve Corporation + * Copyright 2022 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT + */ + +#include "util/bitset.h" +#include "util/ralloc.h" +#include "util/sparse_bitset.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "util/u_qsort.h" +#include "util/u_worklist.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * An implementation of "Register Spilling and Live-Range Splitting for SSA-Form + * Programs" by Braun and Hack. + * + * Next-use distances are logically in ℤ ∪ {∞}, modelled as saturating uint32 + * and referred to as dist_t. Within a block, next-use data is dense. At block + * boundaries, next-use maps are stored as key-value pairs, where only variables + * with later uses (finite distance) are stored. That sparse representation + * ensures linear-time even for shaders with many blocks. + */ +#define DIST_INFINITY (UINT32_MAX) +typedef uint32_t dist_t; + +struct next_use { + uint32_t index; + dist_t dist; +}; + +static void +add_next_use(struct util_dynarray *nu, unsigned node, dist_t dist) +{ + struct next_use use = { .index = node, .dist = dist }; + util_dynarray_append(nu, use); +} + +#define foreach_next_use(nu, it) util_dynarray_foreach(nu, struct next_use, it) + +static dist_t +add_dist(dist_t A, dist_t B) +{ + return (A + B < A) ? DIST_INFINITY : (A + B); +} + +/* + * Calculate the minimum of two next-use sets. Values absent from one of the + * underlying sets are infinity so do not contribute to the minimum, instead + * acting like a set union. + */ +static bool +minimum_next_uses(struct util_dynarray *nu, + const struct util_dynarray *from, + dist_t *tmp_dist, + struct u_sparse_bitset *tmp_set) +{ + /* Convert "from" to be dense */ + u_sparse_bitset_clear_all(tmp_set); + + foreach_next_use(from, it) { + u_sparse_bitset_set(tmp_set, it->index); + tmp_dist[it->index] = it->dist; + } + + bool progress = false; + + /* Take the minimum of common elements */ + foreach_next_use(nu, it) { + if (u_sparse_bitset_test(tmp_set, it->index)) { + if (tmp_dist[it->index] < it->dist) { + it->dist = tmp_dist[it->index]; + progress = true; + } + + u_sparse_bitset_clear(tmp_set, it->index); + } + } + + /* Add elements that are only in "from" */ + U_SPARSE_BITSET_FOREACH_SET(tmp_set, index) { + add_next_use(nu, index, tmp_dist[index]); + progress = true; + } + + return progress; +} + +static uint32_t +inst_cycles(const jay_inst *I) +{ + return 1; +} + +struct spill_block { + /* W/S sets at the start/end of the block, see spill_ctx::{W,S} */ + struct u_sparse_bitset W_in, W_out, S_in, S_out; + + /* Next-use maps at the start/end of the block */ + struct util_dynarray next_use_in, next_use_out; + + /* Estimated cycle count of the block */ + uint32_t cycles; +}; + +struct spill_ctx { + jay_function *func; + + /* Register file being spilled */ + enum jay_file file; + + /* Set of values whose file equals `file` */ + BITSET_WORD *in_file; + + /* Set of values currently available in the register file */ + struct u_sparse_bitset W; + + /* For W-entry calculation, phis with a spilled source. For + * coupling calculation, phis defined along the given edge. + */ + struct u_sparse_bitset phi_set; + + /* |W| = Current register pressure */ + unsigned nW; + + /* For each variable in N, local IPs of next-use. Else, infinite. */ + struct u_sparse_bitset N; + dist_t *next_uses; + + /* Current local IP relative to the start of the block */ + uint32_t ip; + + /* Set of live values that have been spilled. 
Contrary to the paper, this + * is not a subset of W: the definition in the paper is bogus. + */ + struct u_sparse_bitset S; + + /* If a value is rematerializable or a phi, its definition. Else, NULL */ + jay_inst **defs; + + /* Maximum register pressure allowed */ + unsigned k; + + /* Number of variables */ + unsigned n; + + /* Information on blocks indexed in source order */ + struct spill_block *blocks; + + /* Preallocated array of candidates for calculating W entry */ + struct next_use *candidates; + struct util_dynarray next_ip; +}; + +static inline jay_def +jay_def_as_mem(struct spill_ctx *ctx, jay_def idx) +{ + assert(idx.file == GPR || idx.file == UGPR); + idx.file = idx.file == UGPR ? UMEM : MEM; + idx._payload = jay_base_index(idx) + ctx->n; + return idx; +} + +static bool +can_remat(jay_inst *I) +{ + /* TODO */ + return false; +} + +static bool +can_remat_node(struct spill_ctx *ctx, unsigned node) +{ + return ctx->defs[node] && ctx->defs[node]->op != JAY_OPCODE_PHI_DST; +} + +static jay_inst * +remat_to(jay_builder *b, jay_def dst, struct spill_ctx *ctx, unsigned node) +{ + jay_inst *I = ctx->defs[node]; + assert(can_remat(I)); + + UNREACHABLE("invalid remat"); +} + +static void +insert_spill(jay_builder *b, struct spill_ctx *ctx, unsigned node) +{ + if (!can_remat_node(ctx, node)) { + jay_def idx = jay_scalar(ctx->file, node); + jay_MOV(b, jay_def_as_mem(ctx, idx), idx); + } +} + +static void +insert_reload(struct spill_ctx *ctx, + jay_block *block, + jay_cursor cursor, + unsigned node) +{ + jay_builder b = jay_init_builder(ctx->func, cursor); + jay_def idx = jay_scalar(ctx->file, node); + + /* Reloading breaks SSA, but jay_repair_ssa will repair */ + if (can_remat_node(ctx, node)) { + remat_to(&b, idx, ctx, node); + } else { + jay_MOV(&b, idx, jay_def_as_mem(ctx, idx)); + } +} + +/* Insert into the register file */ +static void +insert_W(struct spill_ctx *ctx, unsigned v) +{ + assert(!u_sparse_bitset_test(&ctx->W, v)); + assert(BITSET_TEST(ctx->in_file, v)); + + u_sparse_bitset_set(&ctx->W, v); + ctx->nW++; +} + +/* Remove from the register file */ +static void +remove_W(struct spill_ctx *ctx, unsigned v) +{ + assert(u_sparse_bitset_test(&ctx->W, v)); + assert(BITSET_TEST(ctx->in_file, v)); + + u_sparse_bitset_clear(&ctx->W, v); + ctx->nW--; +} + +static int +nu_score(struct spill_ctx *ctx, struct next_use nu) +{ + /* We assume that rematerializing - even before every instruction - is + * cheaper than spilling. As long as one of the nodes is rematerializable + * (with distance > 0), we choose it over spilling. Within a class of nodes + * (rematerializable or not), compare by next-use-distance. + */ + bool remat = can_remat_node(ctx, nu.index) && nu.dist > 0; + return (remat ? 0 : 100000) + nu.dist; +} + +static int +cmp_dist(const void *left_, const void *right_, void *ctx) +{ + const struct next_use *left = left_; + const struct next_use *right = right_; + int l = nu_score(ctx, *left), r = nu_score(ctx, *right); + + return (l > r) - (l < r); +} + +/* + * Limit the register file W to maximum size m by evicting registers. + */ +static ATTRIBUTE_NOINLINE void +limit(struct spill_ctx *ctx, jay_inst *I, unsigned m) +{ + /* Nothing to do if we're already below the limit */ + if (ctx->nW <= m) { + return; + } + + /* Gather candidates for eviction. Note that next_uses gives IPs whereas + * cmp_dist expects relative distances. This requires us to subtract ctx->ip + * to ensure that cmp_dist works properly.
Even though logically it shouldn't + * affect the sorted order, practically this matters for correctness with + * rematerialization. See the dist=0 test in cmp_dist. + */ + struct next_use vars[JAY_NUM_UGPR]; + unsigned j = 0; + + U_SPARSE_BITSET_FOREACH_SET(&ctx->W, i) { + assert(ctx->next_uses[i] != DIST_INFINITY && "live in W"); + dist_t dist = ctx->next_uses[i] - ctx->ip; + + assert(j < ARRAY_SIZE(vars)); + vars[j++] = (struct next_use) { .index = i, .dist = dist }; + } + + /* Sort by next-use distance */ + util_qsort_r(vars, j, sizeof(struct next_use), cmp_dist, ctx); + + /* Evict what doesn't fit, inserting a spill for evicted values that we + * haven't spilled before with a future use. + */ + for (unsigned i = m; i < j; ++i) { + if (!u_sparse_bitset_test(&ctx->S, vars[i].index)) { + jay_builder b = jay_init_builder(ctx->func, jay_before_inst(I)); + insert_spill(&b, ctx, vars[i].index); + u_sparse_bitset_set(&ctx->S, vars[i].index); + } + + remove_W(ctx, vars[i].index); + } +} + +/* + * Insert coupling code on block boundaries. This must ensure: + * + * - anything live-in we expect to have spilled is spilled + * - anything live-in we expect to have filled is filled + * - phi sources are spilled if the destination is spilled + * - phi sources are filled if the destination is not spilled + * + * The latter two requirements ensure correct pressure calculations for phis. + */ +static ATTRIBUTE_NOINLINE void +insert_coupling_code(struct spill_ctx *ctx, jay_block *pred, jay_block *succ) +{ + jay_builder b = jay_init_builder(ctx->func, jay_before_function(ctx->func)); + struct spill_block *sp = &ctx->blocks[pred->index]; + struct spill_block *ss = &ctx->blocks[succ->index]; + + /* Insert spill/fill at phi sources to match their destination */ + jay_foreach_phi_src_in_block(pred, phi_src) { + jay_inst *phi_dst = ctx->defs[jay_phi_src_index(phi_src)]; + unsigned src = jay_index(phi_src->src[0]); + + if (phi_src->src[0].file == ctx->file) { + if (jay_is_mem(phi_dst->dst)) { + if (!u_sparse_bitset_test(&sp->S_out, src)) { + /* Spill the phi source. TODO: avoid redundant spills here */ + b.cursor = jay_after_block_logical(pred); + insert_spill(&b, ctx, src); + } + + if (can_remat_node(ctx, jay_index(phi_src->src[0]))) { + jay_def idx = jay_scalar(ctx->file, src); + jay_def tmp = jay_alloc_def(&b, ctx->file, 1); + + b.cursor = jay_before_function(ctx->func); + remat_to(&b, tmp, ctx, src); + jay_MOV(&b, jay_def_as_mem(ctx, idx), tmp); + } + + /* Use the spilled version */ + phi_src->src[0] = jay_def_as_mem(ctx, phi_src->src[0]); + jay_set_phi_src_index(phi_src, jay_index(phi_dst->dst)); + } else if (!u_sparse_bitset_test(&sp->W_out, src)) { + /* Fill the phi source in the predecessor */ + jay_block *reload_block = jay_edge_to_block(pred, succ); + insert_reload(ctx, reload_block, jay_along_edge(pred, succ), src); + } + } + } + + /* Anything assumed to be spilled in succ must be spilled along all edges. */ + U_SPARSE_BITSET_FOREACH_SET(&ss->S_in, v) { + if (!u_sparse_bitset_test(&sp->S_out, v)) { + b.cursor = jay_along_edge(pred, succ); + insert_spill(&b, ctx, v); + } + } + + jay_foreach_phi_dst_in_block(succ, phi) { + u_sparse_bitset_set(&ctx->phi_set, jay_index(phi->dst)); + } + + /* Variables in W at the start of succ must be defined along the edge. + * If not live at the end of the predecessor (and it's not a phi defined in + * the successor), insert a reload. 
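+ * ("Live" above means present in the predecessor's W_out, i.e. resident in + * a register: a value that is live in the dataflow sense but spilled still + * gets reloaded here.)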
+ */ + U_SPARSE_BITSET_FOREACH_SET(&ss->W_in, v) { + if (!u_sparse_bitset_test(&sp->W_out, v) && + !u_sparse_bitset_test(&ctx->phi_set, v)) { + + jay_block *reload_block = jay_edge_to_block(pred, succ); + insert_reload(ctx, reload_block, jay_along_edge(pred, succ), v); + } + } +} + +static dist_t +lookup_next_use(struct spill_ctx *ctx, unsigned v) +{ + return u_sparse_bitset_test(&ctx->N, v) ? ctx->next_uses[v] : DIST_INFINITY; +} + +/* + * Produce an array of next-use IPs relative to the start of the block. This is + * an array of dist_t scalars, representing the next-use IP of each SSA dest + * (right-to-left) and SSA source (left-to-right) of each instruction in the + * block (bottom-to-top). Its size equals the # of SSA sources in the block. + */ +static ATTRIBUTE_NOINLINE void +populate_local_next_use(struct spill_ctx *ctx, jay_block *block) +{ + struct spill_block *sb = &ctx->blocks[block->index]; + unsigned ip = sb->cycles; + + foreach_next_use(&sb->next_use_out, it) { + dist_t d = add_dist(it->dist, ip); + + if (d != DIST_INFINITY) { + u_sparse_bitset_set(&ctx->N, it->index); + ctx->next_uses[it->index] = d; + } + } + + jay_foreach_inst_in_block_rev(block, I) { + ip -= inst_cycles(I); + + jay_foreach_src_index(I, s, c, v) { + if (I->src[s].file == ctx->file) { + if (I->op != JAY_OPCODE_PHI_SRC) { + util_dynarray_append(&ctx->next_ip, lookup_next_use(ctx, v)); + } + + ctx->next_uses[v] = ip; + u_sparse_bitset_set(&ctx->N, v); + } + } + + if (I->dst.file == ctx->file) { + jay_foreach_index_rev(I->dst, _, v) { + util_dynarray_append(&ctx->next_ip, lookup_next_use(ctx, v)); + } + } + } + + assert(ip == 0 && "cycle counting is consistent"); +} + +/* + * Insert spills/fills for a single basic block, following Belady's algorithm. + * Corresponds to minAlgorithm from the paper. + */ +static ATTRIBUTE_NOINLINE void +min_algorithm(struct spill_ctx *ctx, + jay_block *block, + struct spill_block *sb, + dist_t *next_ips, + unsigned next_use_cursor) +{ + jay_foreach_inst_in_block(block, I) { + assert(ctx->nW <= ctx->k && "invariant"); + + /* Phis are special since they happen along the edge. When we initialized + * W and S, we implicitly chose which phis are spilled. So, here we just + * need to rewrite the phis to write into memory. + * + * Phi sources are handled later. + */ + if (I->op == JAY_OPCODE_PHI_DST) { + if (I->dst.file == ctx->file) { + if (!u_sparse_bitset_test(&ctx->W, jay_index(I->dst))) { + u_sparse_bitset_set(&ctx->S, jay_index(I->dst)); + I->dst = jay_def_as_mem(ctx, I->dst); + } + } + + ctx->ip += inst_cycles(I); + continue; + } else if (I->op == JAY_OPCODE_PHI_SRC) { + break; + } + + /* Any source that is not in W needs to be reloaded. Gather the set R of + * such values, and add them to the register file. + */ + unsigned R[JAY_MAX_SRCS], nR = 0; + + jay_foreach_src_index(I, s, c, v) { + if (I->src[s].file == ctx->file && !u_sparse_bitset_test(&ctx->W, v)) { + R[nR++] = v; + insert_W(ctx, v); + + assert(u_sparse_bitset_test(&ctx->S, v) && "must have spilled"); + assert(nR <= ARRAY_SIZE(R) && "maximum source count"); + } + } + + /* Limit W to make space for the operands. + * + * We need to round up to power-of-two destination sizes to match the + * rounding in demand calculation. + */ + bool has_dst = I->dst.file == ctx->file; + unsigned dst_size = util_next_power_of_two(jay_num_values(I->dst)); + limit(ctx, I, ctx->k - (has_dst ?
dst_size : 0)); + + /* Add destinations to the register file */ + if (I->dst.file == ctx->file) { + jay_foreach_index(I->dst, _, index) { + assert(next_use_cursor >= 1); + ctx->next_uses[index] = next_ips[--next_use_cursor]; + + if (ctx->next_uses[index] != DIST_INFINITY) { + insert_W(ctx, index); + } + } + } + + /* Update next-use distances for this instruction. Unlike the paper, we + * require W contain only live values (with finite next-use distance). + * + * This happens after the above limit() calls to model sources as + * late-kill. This is conservative and could be improved, but it matches + * how we currently estimate register demand. + */ + jay_foreach_src_index_rev(I, s, c, node) { + if (I->src[s].file == ctx->file) { + assert(next_use_cursor >= 1); + ctx->next_uses[node] = next_ips[--next_use_cursor]; + + if (ctx->next_uses[node] == DIST_INFINITY) { + remove_W(ctx, node); + } + } + } + + /* Add reloads for the sources in front of the instruction. */ + for (unsigned i = 0; i < nR; ++i) { + insert_reload(ctx, block, jay_before_inst(I), R[i]); + } + + ctx->ip += inst_cycles(I); + + if (jay_debug & JAY_DBG_PRINTDEMAND) { + printf("(SP) %u: ", ctx->nW); + jay_print_inst(stdout, I); + } + } + + assert(next_use_cursor == 0 && "exactly sized"); + + u_sparse_bitset_dup(&sb->W_out, &ctx->W); + u_sparse_bitset_dup(&sb->S_out, &ctx->S); +} + +/* + * TODO: Implement section 4.2 of the paper. + * + * For now, we implement the simpler heuristic in Hack's thesis: sort + * the live-in set (+ destinations of phis) by next-use distance. + */ +static ATTRIBUTE_NOINLINE void +compute_w_entry_loop_header(struct spill_ctx *ctx, jay_block *block) +{ + unsigned j = 0; + /* TODO: Account for phis too! */ + foreach_next_use(&ctx->blocks[block->index].next_use_in, it) { + assert(j < ctx->n); + ctx->candidates[j++] = *it; + } + + /* Take the best candidates sorted by next-use distance */ + unsigned n = MIN2(j, ctx->k - ctx->nW); + if (n < j) { + util_qsort_r(ctx->candidates, j, sizeof(struct next_use), cmp_dist, ctx); + } + + for (unsigned i = 0; i < n; ++i) { + insert_W(ctx, ctx->candidates[i].index); + } +} + +/* + * Compute W_entry for a block. Section 4.2 in the paper. + */ +static ATTRIBUTE_NOINLINE void +compute_w_entry(struct spill_ctx *ctx, jay_block *block) +{ + unsigned j = 0; + + /* Variables that are in all predecessors are assumed in W_entry. Phis and + * variables in some predecessors are scored by next-use. + */ + U_SPARSE_BITSET_FOREACH_SET(&ctx->N, i) { + bool all = true, any = false; + + jay_foreach_predecessor(block, P) { + bool in = u_sparse_bitset_test(&ctx->blocks[(*P)->index].W_out, i); + all &= in; + any |= in; + } + + if (all) { + insert_W(ctx, i); + } else if (any) { + ctx->candidates[j++] = + (struct next_use) { .index = i, .dist = ctx->next_uses[i] }; + } + } + + jay_foreach_predecessor(block, pred) { + jay_foreach_phi_src_in_block(*pred, I) { + if (!u_sparse_bitset_test(&ctx->blocks[(*pred)->index].W_out, + jay_index(I->src[0]))) { + + u_sparse_bitset_set(&ctx->phi_set, jay_phi_src_index(I)); + } + } + } + + /* Heuristic: if any phi source is spilled, spill the phi. While suboptimal, + * this reduces pointless spills/fills with massive phi webs.
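+ * (Phis excluded from the candidate list below never enter W, so + * min_algorithm rewrites them to memory and insert_coupling_code spills + * their sources to match.)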
+ */ + jay_foreach_phi_dst_in_block(block, I) { + if (!u_sparse_bitset_test(&ctx->phi_set, jay_index(I->dst))) { + ctx->candidates[j++] = (struct next_use) { + .index = jay_index(I->dst), + .dist = ctx->next_uses[jay_index(I->dst)], + }; + } + } + + /* Take the best candidates sorted by next-use distance */ + unsigned n = MIN2(j, ctx->k - ctx->nW); + if (n < j) { + util_qsort_r(ctx->candidates, j, sizeof(struct next_use), cmp_dist, ctx); + } + + for (unsigned i = 0; i < n; ++i) { + insert_W(ctx, ctx->candidates[i].index); + } +} + +/* + * We initialize S with the union of S at the exit of (forward edge) + * predecessors and the complement of W, intersected with the live-in set. The + * former propagates S forward. The latter ensures we spill along the edge when + * a live value is not selected for the entry W. + */ +static ATTRIBUTE_NOINLINE void +compute_s_entry(struct spill_ctx *ctx, jay_block *block) +{ + jay_foreach_predecessor(block, pred) { + U_SPARSE_BITSET_FOREACH_SET(&ctx->blocks[(*pred)->index].S_out, v) { + if (u_sparse_bitset_test(&block->live_in, v)) { + u_sparse_bitset_set(&ctx->S, v); + } + } + } + + U_SPARSE_BITSET_FOREACH_SET(&block->live_in, v) { + if (BITSET_TEST(ctx->in_file, v) && !u_sparse_bitset_test(&ctx->W, v)) { + u_sparse_bitset_set(&ctx->S, v); + } + } + + u_sparse_bitset_dup(&ctx->blocks[block->index].S_in, &ctx->S); +} + +static ATTRIBUTE_NOINLINE void +global_next_use_distances(struct spill_ctx *ctx, void *memctx) +{ + u_worklist worklist; + u_worklist_init(&worklist, ctx->func->num_blocks, NULL); + + jay_foreach_block(ctx->func, block) { + struct spill_block *sb = &ctx->blocks[block->index]; + + util_dynarray_init(&sb->next_use_in, memctx); + util_dynarray_init(&sb->next_use_out, memctx); + + jay_foreach_inst_in_block(block, I) { + sb->cycles += inst_cycles(I); + } + + jay_worklist_push_head(&worklist, block); + } + + /* Iterate the work list in reverse order since liveness is backwards */ + while (!u_worklist_is_empty(&worklist)) { + jay_block *block = jay_worklist_pop_head(&worklist); + struct spill_block *sb = &ctx->blocks[block->index]; + + /* Clear locally accessed set (W) */ + u_sparse_bitset_clear_all(&ctx->W); + util_dynarray_clear(&sb->next_use_in); + + uint32_t cycle = 0; + + /* Calculate dists */ + jay_foreach_inst_in_block(block, I) { + /* Record first use before def */ + jay_foreach_src_index(I, s, c, index) { + if (I->src[s].file == ctx->file && + !u_sparse_bitset_test(&ctx->W, index)) { + + add_next_use(&sb->next_use_in, index, cycle); + u_sparse_bitset_set(&ctx->W, index); + } + } + + /* Record defs */ + jay_foreach_index(I->dst, _, index) { + u_sparse_bitset_set(&ctx->W, index); + } + + cycle += inst_cycles(I); + } + + /* Apply transfer function to get our entry state. */ + foreach_next_use(&sb->next_use_out, it) { + if (!u_sparse_bitset_test(&ctx->W, it->index)) { + add_next_use(&sb->next_use_in, it->index, + add_dist(it->dist, sb->cycles)); + } + } + + /* Propagate successor live-in to pred live-out, joining with min */ + jay_foreach_predecessor(block, pred) { + if (minimum_next_uses(&ctx->blocks[(*pred)->index].next_use_out, + &sb->next_use_in, ctx->next_uses, + &ctx->phi_set)) { + jay_worklist_push_tail(&worklist, *pred); + } + } + } + + u_worklist_fini(&worklist); + +#ifndef NDEBUG + /* In debug builds, validate the following invariant: + * + * Next-use distance is finite iff live and in file. 
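+ * + * (The first loop checks "finite implies live and in file" per map entry + * while marking ctx->W; the second reuses ctx->W to verify the converse + * for every live in-file variable.)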
+ */ + jay_foreach_block(ctx->func, blk) { + struct spill_block *sb = &ctx->blocks[blk->index]; + + for (unsigned i = 0; i < 2; i++) { + struct util_dynarray *nu = i ? &sb->next_use_out : &sb->next_use_in; + struct u_sparse_bitset *live = i ? &blk->live_out : &blk->live_in; + + u_sparse_bitset_clear_all(&ctx->W); + + foreach_next_use(nu, it) { + assert(u_sparse_bitset_test(live, it->index) && + BITSET_TEST(ctx->in_file, it->index)); + + u_sparse_bitset_set(&ctx->W, it->index); + } + + U_SPARSE_BITSET_FOREACH_SET(live, i) { + if (BITSET_TEST(ctx->in_file, i)) { + assert(u_sparse_bitset_test(&ctx->W, i)); + } + } + } + } +#endif +} + +void +jay_spill(jay_function *func, enum jay_file file, unsigned k) +{ + void *memctx = ralloc_context(NULL); + void *linctx = linear_context(memctx); + struct spill_ctx ctx = { .func = func, .file = file, .k = k }; + + ctx.n = func->ssa_alloc; + ctx.in_file = BITSET_LINEAR_ZALLOC(linctx, ctx.n); + ctx.defs = linear_zalloc_array(linctx, jay_inst *, ctx.n); + ctx.next_uses = linear_alloc_array(linctx, dist_t, ctx.n); + ctx.candidates = linear_alloc_array(linctx, struct next_use, ctx.n); + ctx.blocks = + linear_zalloc_array(linctx, struct spill_block, func->num_blocks); + + jay_foreach_inst_in_func(func, block, I) { + if (can_remat(I) || I->op == JAY_OPCODE_PHI_DST) { + ctx.defs[jay_index(I->dst)] = I; + } + + if (I->dst.file == file) { + BITSET_SET_COUNT(ctx.in_file, jay_base_index(I->dst), + jay_num_values(I->dst)); + } + } + + u_sparse_bitset_init(&ctx.W, ctx.n, memctx); + u_sparse_bitset_init(&ctx.S, ctx.n, memctx); + u_sparse_bitset_init(&ctx.N, ctx.n, memctx); + u_sparse_bitset_init(&ctx.phi_set, ctx.n, memctx); + util_dynarray_init(&ctx.next_ip, memctx); + + global_next_use_distances(&ctx, memctx); + + /* Reserve a memory variable for every regular variable */ + func->ssa_alloc *= 2; + + jay_foreach_block(func, block) { + ctx.nW = 0; + ctx.ip = 0; + + u_sparse_bitset_clear_all(&ctx.W); + u_sparse_bitset_clear_all(&ctx.S); + u_sparse_bitset_clear_all(&ctx.N); + util_dynarray_clear(&ctx.next_ip); + + populate_local_next_use(&ctx, block); + + struct spill_block *sb = &ctx.blocks[block->index]; + dist_t *next_ips = util_dynarray_element(&ctx.next_ip, dist_t, 0); + unsigned nu_cursor = util_dynarray_num_elements(&ctx.next_ip, dist_t); + + /* Populate next-use with phi destinations, which are not in the + * next_use_in set but are accounted for when computing W_entry. 
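+ * (populate_local_next_use appended destinations bottom-to-top, so phi + * destinations at the top of the block sit at the end of the array and + * are popped first by the cursor.)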
+ */ + jay_foreach_phi_dst_in_block(block, I) { + if (I->dst.file == file) { + assert(nu_cursor >= 1); + ctx.next_uses[jay_index(I->dst)] = next_ips[--nu_cursor]; + u_sparse_bitset_set(&ctx.N, jay_index(I->dst)); + } + } + + if (block->loop_header) { + compute_w_entry_loop_header(&ctx, block); + } else if (jay_num_predecessors(block) /* skip start blocks */) { + compute_w_entry(&ctx, block); + } + + assert(ctx.nW <= ctx.k && "invariant"); + u_sparse_bitset_dup(&sb->W_in, &ctx.W); + + compute_s_entry(&ctx, block); + min_algorithm(&ctx, block, sb, next_ips, nu_cursor); + } + + /* Now that all blocks are processed separately, stitch it together */ + jay_foreach_block(func, block) { + jay_foreach_predecessor(block, pred) { + u_sparse_bitset_clear_all(&ctx.phi_set); + insert_coupling_code(&ctx, *pred, block); + } + } + + ralloc_free(memctx); + + /* Spilling breaks SSA, so we need to repair before validating */ + jay_repair_ssa(func); + jay_validate(func->shader, "Spilling"); + + /* Remat can introduce dead code */ + jay_opt_dead_code(func->shader); +} diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c new file mode 100644 index 00000000000..935ae4d2727 --- /dev/null +++ b/src/intel/compiler/jay/jay_to_binary.c @@ -0,0 +1,576 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include +#include "compiler/brw/brw_disasm_info.h" +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "compiler/brw/brw_eu_inst.h" +#include "compiler/brw/brw_reg.h" +#include "compiler/brw/brw_reg_type.h" +#include "dev/intel_debug.h" +#include "util/macros.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "jay.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +static inline enum brw_reg_type +to_brw_reg_type(enum jay_type type) +{ + /* clang-format off */ + switch (type) { + case JAY_TYPE_UNTYPED: + case JAY_TYPE_U8: return BRW_TYPE_UB; + case JAY_TYPE_U16: return BRW_TYPE_UW; + case JAY_TYPE_U32: return BRW_TYPE_UD; + case JAY_TYPE_U64: return BRW_TYPE_UQ; + case JAY_TYPE_S8: return BRW_TYPE_B; + case JAY_TYPE_S16: return BRW_TYPE_W; + case JAY_TYPE_S32: return BRW_TYPE_D; + case JAY_TYPE_S64: return BRW_TYPE_Q; + case JAY_TYPE_F16: return BRW_TYPE_HF; + case JAY_TYPE_F32: return BRW_TYPE_F; + case JAY_TYPE_F64: return BRW_TYPE_DF; + case JAY_TYPE_BF16: return BRW_TYPE_BF; + default: UNREACHABLE("invalid type"); + } + /* clang-format on */ +} + +static inline unsigned +to_def_grf_16(struct jay_partition *p, jay_def d) +{ + unsigned count = jay_num_values(d); + if (count == 0 || !(d.file == GPR || d.file == UGPR)) { + return d.reg; + } + + unsigned base = 0; + for (unsigned i = 0; i < JAY_PARTITION_BLOCKS; ++i) { + unsigned offset = d.reg - base; + + if (offset < p->blocks[d.file][i].len) { + assert(offset + count <= p->blocks[d.file][i].len && + "vectors must not cross partition boundaries"); + + return (p->blocks[d.file][i].start + offset) * 2 + d.hi; + } + + base += p->blocks[d.file][i].len; + } + + UNREACHABLE("virtual register must be in a block"); +} + +static inline brw_reg +to_brw_reg(jay_function *f, + const jay_inst *I, + signed idx, + unsigned simd_offs, + bool force_hi) +{ + bool is_dest = idx < 0; + enum jay_type type = is_dest ? I->type : jay_src_type(I, idx); + jay_def d = is_dest ? 
I->dst : I->src[idx]; + d.hi |= force_hi; + + struct brw_reg R; + unsigned reg = to_def_grf_16(&f->shader->partition, d), offset_B = 0; + + if (jay_is_imm(d)) { + /* Immediates have size restrictions but can zero extend */ + if (jay_type_size_bits(type) == 64) { + type = jay_type_resize(type, 32); + } else if (I->op == JAY_OPCODE_BFN) { + assert(jay_as_uint(d) < UINT16_MAX); + type = JAY_TYPE_U16; + } + + R = brw_imm_ud(jay_as_uint(d)); + } else if (jay_is_null(d)) { + R = brw_null_reg(); + } else if (d.file == UGPR) { + unsigned grf = (reg >> 1) / 8; + offset_B = ((reg >> 1) % 8) * 4; + + if (d.file == UGPR) { + R = brw_ud1_grf(grf, 0); + } else { + R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (grf * 2), 0); + } + + /* Handle 3-src restrictions and vectorized uniform code. */ + if (is_dest || jay_num_values(d) >= 8) { + R = vec8(R); + } + + /* Some operations have special restrictions on the destination stride, + * but if we write a single UGPR the stride is ignored. Specify + * whatever stride is needed to satisfy the rules. + */ + if (is_dest) { + /* BSpec 56640 "Special Restrictions" says: + * + * "Conversion between HF and Integer must be DWord-aligned + * and strided by a DWord on the destination." + */ + enum jay_type src0_type = jay_src_type(I, 0); + if ((I->type == JAY_TYPE_F16 && !jay_type_is_any_float(src0_type)) || + (src0_type == JAY_TYPE_F16 && !jay_type_is_any_float(I->type))) { + assert(jay_num_values(d) == 1 && "must not vectorize HF<->Int"); + R = stride(R, 8, 2, 4); + } + + /* Packed floats have restrictions on mixed sizes. Use <2>. */ + if (jay_type_size_bits(I->type) == 16 && + jay_type_size_bits(jay_src_type(I, 0)) != 16) { + assert(jay_num_values(d) == 1 && "must not vectorize mixed float"); + R = stride(R, 4, 2, 2); + } + } + } else if (d.file == GPR) { + enum jay_stride def_stride = jay_def_stride(f->shader, d); + uint32_t type_bits = jay_type_size_bits(type); + unsigned stride_bits = jay_stride_to_bits(def_stride); + unsigned simd_width = jay_simd_width_physical(f->shader, I); + + unsigned grf; + if (def_stride == JAY_STRIDE_2) { + /* Bit 0 selects between lo/hi halves of the GPR */ + grf = (reg / 2) * jay_grf_per_gpr(f->shader); + offset_B = (reg & 1) * 2 * f->shader->dispatch_width; + } else { + /* Low bits are an offset in 2-byte words into the GRF */ + unsigned mask = BITFIELD_MASK(stride_bits / 32); + grf = ((reg & ~mask) / 2) * jay_grf_per_gpr(f->shader); + offset_B = (reg & mask) * 2; + } + + R = byte_offset(xe2_vec8_grf(grf, 0), + simd_offs * simd_width * stride_bits / 8); + + if (stride_bits == (type_bits * 4)) { + R = stride(R, 8, 2, 4); + } else if (stride_bits == (type_bits * 2)) { + R = stride(R, 4, 2, 2); + } else { + assert(stride_bits == type_bits); + } + + /* Broadcast is equivalent to <8, 8, 1> for SIMD1 instructions. Use that + * instead due to regioning restrictions. + */ + if (simd_width == 1) { + R = vec1(R); + } + } else if (jay_is_flag(d)) { + /* Explicit flags act like UGPRs. As sources they broadcast to all lanes, + * so we may ignore the SIMD offset. As destinations, they are written by + * SIMD1 instructions and are never SIMD split.
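+ * + * (E.g. at dispatch width 16 a virtual flag is 16 bits, so virtual flag 1 + * lands at byte offset 2, i.e. brw_flag_subreg(1).)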
+ */ + assert(simd_offs == 0 || idx >= 0); + unsigned offs_B = d.reg * (f->shader->dispatch_width / 8); + R = brw_flag_subreg(offs_B / 2); + } else if (d.file == J_ADDRESS) { + R = brw_address_reg(d.reg); + } else if (d.file == J_ARF) { + R = brw_ud1_reg(ARF, jay_base_index(d), 0); + } else { + UNREACHABLE("unexpected file"); + } + + R.negate = d.negate; + R.abs = d.abs; + return byte_offset(retype(R, to_brw_reg_type(type)), offset_B); +} + +#define SRC(i) to_brw_reg(f, I, i, simd_offs, false) + +#define OP0(hw) \ + case JAY_OPCODE_##hw: \ + brw_##hw(p); \ + break; + +#define OP1(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu1(p, BRW_OPCODE_##hw, dst, SRC(0)); \ + break; + +#define OP2(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu2(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1)); \ + break; + +#define OP3(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1), SRC(2)); \ + break; + +#define OP3_SWAP(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(2), SRC(1), SRC(0)); \ + break; + +static struct brw_reg +quad_swizzle(struct brw_reg r, const jay_inst *I) +{ + /* clang-format off */ + switch (jay_quad_swizzle_swizzle(I)) { + case JAY_QUAD_SWIZZLE_XXXX: return suboffset(stride(r, 4, 4, 0), 0); + case JAY_QUAD_SWIZZLE_YYYY: return suboffset(stride(r, 4, 4, 0), 1); + case JAY_QUAD_SWIZZLE_ZZZZ: return suboffset(stride(r, 4, 4, 0), 2); + case JAY_QUAD_SWIZZLE_WWWW: return suboffset(stride(r, 4, 4, 0), 3); + case JAY_QUAD_SWIZZLE_XXZZ: return suboffset(stride(r, 2, 2, 0), 0); + case JAY_QUAD_SWIZZLE_YYWW: return suboffset(stride(r, 2, 2, 0), 1); + case JAY_QUAD_SWIZZLE_XYXY: return suboffset(stride(r, 0, 2, 1), 0); + case JAY_QUAD_SWIZZLE_ZWZW: return suboffset(stride(r, 0, 2, 1), 2); + } + /* clang-format on */ + + UNREACHABLE("invalid quad swizzle"); +} + +/* Runs once per SIMD-split, so must not modify the instruction! */ +static void +emit(struct brw_codegen *p, + jay_function *f, + const jay_inst *I, + unsigned simd_offs) +{ + ASSERTED unsigned nr_ins_before = p->nr_insn; + unsigned exec_size = jay_simd_width_physical(f->shader, I); + // jay_print_inst(stdout, (jay_inst *) I); + + /* Fix up SWSB dependencies for SIMD split instructions. The latter + * instructions do not need to redundantly wait on an SBID but might + * replicate their regdists. + */ + struct tgl_swsb dep = + simd_offs && !I->replicate_dep ? tgl_swsb_null() : I->dep; + dep.mode = simd_offs ? TGL_SBID_NULL : dep.mode; + + if (I->decrement_dep) { + unsigned delta = simd_offs * jay_macro_length(I); + assert(dep.regdist > delta); + dep.regdist -= delta; + } + + brw_set_default_exec_size(p, util_logbase2(exec_size)); + brw_set_default_mask_control(p, jay_is_no_mask(I)); + brw_set_default_swsb(p, dep); + brw_set_default_saturate(p, I->saturate); + + /* Quad swizzle can get split down to SIMD4 even on Xe2 where we don't have + * NibCtrl. Fortunately, it's NoMask so it doesn't matter. + */ + if (I->op != JAY_OPCODE_QUAD_SWIZZLE) { + brw_set_default_group(p, simd_offs * exec_size); + } + + /* Grab the hardware predicate, corresponding either to a logical predicate + * or SEL's selector. + */ + const jay_def *pred = I->predication ? jay_inst_get_predicate((void *) I) : + I->op == JAY_OPCODE_SEL ? &I->src[2] : + NULL; + + brw_set_default_predicate_control(p, pred ? 
BRW_PREDICATE_NORMAL : + BRW_PREDICATE_NONE); + brw_set_default_predicate_inverse(p, pred && pred->negate); + + /* Jay/brw enums line up by construction */ + enum brw_conditional_mod cmod = + (enum brw_conditional_mod) I->conditional_mod; + + if (!jay_is_null(I->cond_flag)) { + assert(!(pred && pred->reg != I->cond_flag.reg) && "must be tied"); + pred = &I->cond_flag; + } + + if (pred) { + unsigned reg = pred->reg * jay_phys_flag_per_virt(f->shader); + brw_set_default_flag_reg(p, reg / 2, reg % 2); + } + + if (I->op == JAY_OPCODE_MIN) { + cmod = BRW_CONDITIONAL_L; + } else if (I->op == JAY_OPCODE_MAX) { + cmod = BRW_CONDITIONAL_GE; + } + + struct brw_reg dst = to_brw_reg(f, I, -1, simd_offs, false); + + switch (I->op) { + OP0(ELSE) + OP0(ENDIF) + OP0(WHILE) + OP0(BREAK) + OP1(MOV, MOV) + OP1(MODIFIER, MOV) + OP1(RNDD, RNDD) + OP1(RNDZ, RNDZ) + OP1(RNDE, RNDE) + OP1(FRC, FRC) + OP1(BFREV, BFREV) + OP1(CBIT, CBIT) + OP1(NOT, NOT) + OP1(FBL, FBL) + OP1(FBH, FBH) + OP1(LZD, LZD) + OP2(ROL, ROL) + OP2(AVG, AVG) + OP2(ADD, ADD) + OP2(MUL, MUL) + OP2(SEL, SEL) + OP2(MIN, SEL) + OP2(MAX, SEL) + OP2(MUL_32X16, MUL) + OP2(AND, AND) + OP2(AND_U32_U16, AND) + OP2(OR, OR) + OP2(XOR, XOR) + OP2(ASR, ASR) + OP2(SHR, SHR) + OP2(SHL, SHL) + OP2(BFI1, BFI1) + OP3(BFI2, BFI2) + OP3(ADD3, ADD3) + OP3(CSEL, CSEL) + OP3(DP4A_UU, DP4A) + OP3(DP4A_SS, DP4A) + OP3(DP4A_SU, DP4A) + OP3_SWAP(MAD, MAD) + OP3_SWAP(BFE, BFE) + + case JAY_OPCODE_LOOP_ONCE: + /* TODO: Is there a better way to do this? */ + brw_BREAK(p); + brw_WHILE(p); + break; + + case JAY_OPCODE_IF: + brw_IF(p, util_logbase2(exec_size)); + break; + + case JAY_OPCODE_MATH: + gfx6_math(p, dst, jay_math_op(I), SRC(0), + retype(brw_null_reg(), to_brw_reg_type(I->type))); + break; + + case JAY_OPCODE_BFN: + brw_BFN(p, dst, SRC(0), SRC(1), SRC(2), brw_imm_ud(jay_bfn_ctrl(I))); + break; + + case JAY_OPCODE_DESWIZZLE_16: + brw_set_default_exec_size(p, BRW_EXECUTE_16); + brw_MOV(p, retype(xe2_vec8_grf(jay_deswizzle_16_dst(I), 0), BRW_TYPE_UD), + retype(xe2_vec8_grf(jay_deswizzle_16_src(I), 0), BRW_TYPE_UD)); + break; + + case JAY_OPCODE_CVT: { + unsigned index = jay_cvt_index(I); + bool force_hi = false; + + /* We apply a suboffset to select the specific subword being converted. + * When the source has a subword (16-bit) stride, accesses to the upper + * half will instead land in a discontiguous GRF, so we have to fix up. + * This affects u8->u32 conversions.
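+ *
+ * For example, converting byte 2 of a u8 source laid out with a 2-byte
+ * stride: index_B = 2 >= stride_B = 2, so the byte lives in the hi half;
+ * we set force_hi and the in-half index becomes (2 % 2) / 1 = 0.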
+ */ + if (I->src[0].file == GPR) { + unsigned type_size_B = jay_type_size_bits(jay_cvt_src_type(I)) / 8; + unsigned index_B = index * type_size_B; + unsigned stride_B = + jay_stride_to_bits(jay_def_stride(f->shader, I->src[0])) / 8; + + if (index_B >= stride_B) { + assert(stride_B == 2 && index_B <= 4 && !I->src[0].hi); + force_hi = true; + index = (index_B % stride_B) / type_size_B; + } + } + + brw_MOV(p, dst, + suboffset(to_brw_reg(f, I, 0, simd_offs, force_hi), index)); + break; + } + + case JAY_OPCODE_SYNC: + brw_SYNC(p, jay_sync_op(I)); + break; + + case JAY_OPCODE_CMP: + brw_CMP(p, dst, I->conditional_mod, SRC(0), SRC(1)); + break; + + case JAY_OPCODE_MOV_IMM64: + brw_MOV(p, dst, brw_imm_u64(jay_mov_imm64_imm(I))); + break; + + case JAY_OPCODE_RELOC: + brw_MOV_reloc_imm(p, dst, BRW_TYPE_UD, jay_reloc_param(I), + jay_reloc_base(I)); + break; + + case JAY_OPCODE_QUAD_SWIZZLE: + brw_MOV(p, dst, quad_swizzle(SRC(0), I)); + break; + + case JAY_OPCODE_BROADCAST_IMM: + brw_MOV(p, dst, get_element(SRC(0), jay_broadcast_imm_lane(I))); + break; + + case JAY_OPCODE_SEND: + brw_SEND(p, jay_send_sfid(I), dst, SRC(2), SRC(3), SRC(0), SRC(1), + jay_send_ex_desc_imm(I), jay_send_ex_mlen(I), + jay_send_bindless(I), jay_send_eot(I), false /* gather */); + if (jay_send_check_tdr(I)) { + brw_eu_inst_set_opcode(p->isa, brw_eu_last_inst(p), BRW_OPCODE_SENDC); + } + break; + + /* Gfx20+ has separate Render Target Array indices for each pair of subspans + * to support multiple polygons, so we need a <1;8,0> region to select the + * word for each channel. + */ + case JAY_OPCODE_EXTRACT_LAYER: + brw_AND(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UW), 1, 8, 0), + brw_imm_uw(0x7ff)); + break; + + case JAY_OPCODE_EXPAND_QUAD: + brw_MOV(p, dst, stride(SRC(simd_offs), 1, 4, 0)); + break; + + case JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS: + brw_set_default_exec_size(p, BRW_EXECUTE_32); + brw_set_default_group(p, 0); + brw_ADD(p, retype(dst, BRW_TYPE_UW), retype(SRC(0), BRW_TYPE_UW), + brw_imm_uv(0x11100100)); + break; + + case JAY_OPCODE_LANE_ID_8: + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_MOV(p, dst, brw_imm_uv(0x76543210)); + break; + + case JAY_OPCODE_LANE_ID_EXPAND: + brw_set_default_exec_size(p, util_logbase2(jay_lane_id_expand_width(I))); + brw_ADD(p, suboffset(dst, jay_lane_id_expand_width(I)), SRC(0), + brw_imm_uw(jay_lane_id_expand_width(I))); + break; + + case JAY_OPCODE_EXTRACT_BYTE_PER_8LANES: + brw_MOV(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UB), 1, 8, 0)); + break; + + case JAY_OPCODE_SHR_ODD_SUBSPANS_BY_4: + brw_SHR(p, dst, SRC(0), brw_imm_uv(0x44440000)); + break; + + case JAY_OPCODE_MUL_32: { + brw_MUL(p, retype(brw_acc_reg(1), to_brw_reg_type(I->type)), SRC(0), + subscript(SRC(1), BRW_TYPE_UW, 0)); + + brw_set_default_swsb(p, tgl_swsb_null()); + brw_alu2(p, jay_mul_32_high(I) ?
BRW_OPCODE_MACH : BRW_OPCODE_MACL, dst, + SRC(0), SRC(1)); + break; + } + + case JAY_OPCODE_SHUFFLE: { + struct brw_reg a0 = brw_address_reg(0); + unsigned grf_16 = to_def_grf_16(&f->shader->partition, I->src[0]); + unsigned offset_B = grf_16 * 2 * f->shader->dispatch_width; + + brw_ADD(p, a0, subscript(SRC(1), BRW_TYPE_UW, 0), brw_imm_uw(offset_B)); + brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), BRW_TYPE_UD)); + break; + } + + default: + jay_print_inst(stderr, (jay_inst *) I); + UNREACHABLE("Unhandled opcode"); + } + + if (cmod != BRW_CONDITIONAL_NONE) { + brw_eu_inst_set_cond_modifier(p->devinfo, brw_eu_last_inst(p), cmod); + } + + assert(p->nr_insn == (nr_ins_before + jay_macro_length(I)) && + "Jay instructions must map 1:n to GEN instructions"); +} + +struct jay_shader_bin * +jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size) +{ + struct jay_shader_bin *bin = rzalloc(s, struct jay_shader_bin); + + struct util_dynarray prog; + util_dynarray_init(&prog, bin); + + struct brw_isa_info isa; + struct brw_codegen p; + + brw_init_isa_info(&isa, s->devinfo); + brw_init_codegen(&isa, &p, bin); + int start_offset = p.next_insn_offset; + + /* TODO: Multifunction properly */ + jay_foreach_function(s, f) { + jay_foreach_block(f, block) { + if (block->loop_header) { + brw_DO(&p, 0); + } + + jay_foreach_inst_in_block(block, I) { + for (unsigned i = 0; i < (1 << jay_simd_split(s, I)); ++i) { + emit(&p, f, I, i); + } + } + } + } + + int final_halt_offset = -1 /* TODO */; + brw_set_uip_jip(&p, start_offset, final_halt_offset); + + struct disasm_info *disasm = disasm_initialize(p.isa, NULL); + + disasm_new_inst_group(disasm, 0); + disasm_new_inst_group(disasm, p.next_insn_offset); + + UNUSED bool valid = true; +#ifndef NDEBUG + valid = + brw_validate_instructions(p.isa, p.store, 0, p.next_insn_offset, disasm); +#endif + + brw_compact_instructions(&p, start_offset, disasm); + + if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(s->stage)) || !valid) { + dump_assembly(p.store, 0, p.next_insn_offset, disasm, NULL, stdout); + } + + if (!valid) { + UNREACHABLE("invalid assembly"); + } + + struct brw_stage_prog_data *prog_data = &s->prog_data->base; + + assert(prog_data->const_data_size == 0); + if (const_data_size > 0) { + prog_data->const_data_size = const_data_size; + prog_data->const_data_offset = + brw_append_data(&p, const_data, const_data_size, 32); + } + + bin->kernel = brw_get_program(&p, &bin->size); + s->prog_data->base.relocs = + brw_get_shader_relocs(&p, &s->prog_data->base.num_relocs); + + return bin; +} diff --git a/src/intel/compiler/jay/jay_validate.c b/src/intel/compiler/jay/jay_validate.c new file mode 100644 index 00000000000..7a3a6953fb7 --- /dev/null +++ b/src/intel/compiler/jay/jay_validate.c @@ -0,0 +1,328 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +#ifndef NDEBUG + +enum validate_block_state { + STATE_PHI_DST, + STATE_NORMAL, + STATE_LATE, +}; + +struct validate_state { + bool failed; + bool post_ra; + const char *when; + jay_inst *I; + jay_block *block; + jay_function *func; + BITSET_WORD *defs; + enum jay_file *files; + enum validate_block_state block_state; +}; + +static enum validate_block_state +block_state_for_inst(jay_inst *I) +{ + if (I->op == JAY_OPCODE_PHI_DST || I->op == JAY_OPCODE_PRELOAD) { + return STATE_PHI_DST; + } else if (I->op == JAY_OPCODE_PHI_SRC || + (jay_op_is_control_flow(I->op) && I->op != JAY_OPCODE_ELSE)) { + return 
STATE_LATE; + } else { + return STATE_NORMAL; + } +} + +static void +chirp(struct validate_state *validate, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + + if (!validate->failed) { + fprintf(stderr, "jay shader validation failed (after %s):\n", + validate->when); + validate->failed = true; + } + if (validate->I) { + fprintf(stderr, + " invalid instruction in block %d: ", validate->block->index); + jay_print_inst(stderr, validate->I); + } + fprintf(stderr, " "); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n\n"); + + va_end(args); +} + +#define CHECK(cond) \ + if (!(cond)) { \ + chirp(validate, "assertion failed at %s:%u\n %s", __FILE__, __LINE__, \ + #cond); \ + } + +static void +validate_flagness(struct validate_state *validate, + jay_def def, + enum jay_type type, + const char *name) +{ + CHECK(type != JAY_TYPE_U1 || jay_is_flag(def) || jay_is_null(def)); +} + +static unsigned +get_src_words(struct validate_state *validate, jay_inst *I, unsigned s) +{ + if (I->op == JAY_OPCODE_EXPAND_QUAD) { + return 4; + } + + bool vectorized = I->dst.file == UGPR && + jay_num_values(I->dst) > jay_type_vector_length(I->type) && + I->op != JAY_OPCODE_SEND && + jay_num_values(I->src[s]) > 1; + + unsigned elsize = jay_type_vector_length(jay_src_type(I, s)); + unsigned words = elsize * (vectorized ? jay_num_values(I->dst) : 1); + + if (vectorized && I->src[s].file == GPR) { + CHECK(words == validate->func->shader->dispatch_width); + return 1; + } else { + return words; + } +} + +/* + * Validate the fundamental invariants of static single assignment form. + */ +static void +validate_ssa(struct validate_state *validate, jay_inst *I) +{ + jay_foreach_src_index(I, src_index, _, ssa_index) { + CHECK(BITSET_TEST(validate->defs, ssa_index) && "defs dominate uses"); + CHECK(validate->files[ssa_index] == I->src[src_index].file && + "consistent files"); + } + + jay_foreach_dst_index(I, d, ssa_index) { + CHECK(!BITSET_TEST(validate->defs, ssa_index) && "single definition"); + BITSET_SET(validate->defs, ssa_index); + validate->files[ssa_index] = d.file; + } +} + +/* + * Validate the invariants of jay_def. + */ +static void +validate_def(struct validate_state *validate, jay_def def, const char *kind) +{ + CHECK(!jay_is_null(def) || !def.reg); + + if (def.collect) { + CHECK(jay_num_values(def) >= 2); + CHECK(def.file == GPR || def.file == UGPR); + + bool contiguous = true; + jay_foreach_comp(def, c) { + uint32_t index = jay_channel(def, c); + contiguous &= index == (jay_channel(def, 0) + c); + CHECK(index != JAY_SENTINEL); + } + + CHECK(!contiguous); + } else if (def.file == J_IMM) { + CHECK(!def.reg); + CHECK(!def.num_values_m1); + CHECK(!def.negate); + CHECK(!def.abs); + } else if (def.file == ACCUM || def.file == UACCUM || def.hi) { + CHECK(validate->post_ra); + } else { + CHECK(jay_base_index(def) != JAY_SENTINEL || validate->post_ra); + } + + if (jay_is_ssa(def) && jay_channel(def, 0) != JAY_SENTINEL) { + jay_foreach_comp(def, c) { + CHECK(jay_channel(def, c) < validate->func->ssa_alloc); + } + } + + CHECK(jay_num_values(def) == 1 || !jay_is_flag(def)); +} + +/** + * Validate an instruction. + */ +static void +validate_inst(struct validate_state *validate, jay_inst *I) +{ + validate->I = I; + + /* Block states are monotonic. 
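+ * PHI_DST/PRELOAD must open the block, ordinary instructions follow, and
+ * PHI_SRC plus control flow (other than ELSE) close it, so the state
+ * computed per instruction may only increase.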
*/ + enum validate_block_state state = block_state_for_inst(I); + CHECK(state >= validate->block_state); + validate->block_state = state; + + const struct jay_opcode_info *opinfo = &jay_opcode_infos[I->op]; + + validate_def(validate, I->dst, "dst"); + validate_def(validate, I->cond_flag, "cond_flag"); + + jay_foreach_src(I, s) { + validate_def(validate, I->src[s], "source"); + } + + if (!validate->post_ra) { + validate_ssa(validate, I); + } + + CHECK(I->num_srcs <= JAY_MAX_SRCS); + + validate_flagness(validate, I->dst, I->type, "destination"); + validate_flagness(validate, I->cond_flag, JAY_TYPE_U1, "cond_flag"); + + CHECK(!I->conditional_mod || + !jay_is_null(I->cond_flag) || + I->op == JAY_OPCODE_CSEL); + + /* These assumptions are baked into the definition of broadcast_flag and + * required to ensure correctness with the lane masking. + */ + CHECK(!I->broadcast_flag || + (!jay_is_null(I->cond_flag) && + jay_is_null(I->dst) && + I->cond_flag.file == FLAG && + (I->op == JAY_OPCODE_CMP || I->op == JAY_OPCODE_MOV))); + + /* Standard modifiers only allowed on some instructions */ + CHECK(!I->conditional_mod || opinfo->cmod || I->op == JAY_OPCODE_CSEL); + CHECK(!I->saturate || opinfo->sat); + + unsigned num_srcs = I->num_srcs; + + if (I->predication) { + CHECK(num_srcs >= I->predication); + + if (jay_inst_has_default(I)) { + CHECK(jay_inst_get_default(I)->file == I->dst.file); + } + + CHECK(jay_is_flag(*jay_inst_get_predicate(I))); + CHECK(!jay_is_null(*jay_inst_get_predicate(I))); + + num_srcs -= I->predication; + } + + if (validate->post_ra) { + CHECK(jay_simd_width_logical(validate->func->shader, I) > 0); + CHECK(jay_simd_width_physical(validate->func->shader, I) > 0); + } + + /* Number of sources should match for our opcode. If opinfo->num_srcs + * is zero, then it may actually take a variable number of sources. + */ + CHECK(num_srcs == opinfo->num_srcs || opinfo->num_srcs == 0); + + for (unsigned s = 0; s < num_srcs; s++) { + if (jay_is_ssa(I->src[s]) && !jay_is_null(I->src[s])) { + unsigned expected = get_src_words(validate, I, s); + unsigned words = jay_num_values(I->src[s]); + if (I->op != JAY_OPCODE_SEND || s < 2) { + CHECK(expected == words); + } + + validate_flagness(validate, I->src[s], jay_src_type(I, s), "source"); + } + + CHECK(!I->src[s].negate || jay_has_src_mods(I, s)); + } + + switch (I->op) { + case JAY_OPCODE_SEL: + CHECK(jay_is_flag(I->src[2]) && "SEL src[2] (selector) must be a flag"); + break; + case JAY_OPCODE_SWAP: + CHECK(I->src[0].file == I->src[1].file && "SWAP files must match"); + break; + default: + break; + } +} + +static void +jay_validate_function(struct validate_state *validate) +{ + validate->defs = BITSET_CALLOC(validate->func->ssa_alloc); + validate->files = + calloc(validate->func->ssa_alloc, sizeof(validate->files[0])); + + jay_foreach_block(validate->func, block) { + validate->block = block; + validate->I = NULL; + + CHECK(block->successors[0] || !block->successors[1]); + + /* Post-RA we can remove physical jumps though they exist logically */ + if (block->successors[1] && !validate->post_ra) { + CHECK(jay_block_ending_jump(block) != NULL); + } + + /* If a block has multiple successors, and one of them has multiple + * predecessors, then we've detected a critical edge. 
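+ * For example, with B0 -> {B1, B2} and B1 -> {B2}, the edge B0->B2 is
+ * critical: a copy for it placed in B0 would also run on the B0->B1 path,
+ * and one placed in B2 would also run for B1->B2. Passes that insert
+ * copies on edges (e.g. phi lowering) therefore need these edges split.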
+ */ + if (jay_num_successors(block) > 1 && !validate->post_ra) { + jay_foreach_successor(block, succ) { + if (jay_num_predecessors(succ) > 1) { + chirp(validate, "Critical edge (B%u -> B%u) is not allowed", + block->index, succ->index); + } + } + } + + validate->block_state = 0; + jay_foreach_inst_in_block(block, inst) { + validate_inst(validate, inst); + } + } + + /* Validate that there are no dead phis. RA relies on this. */ + if (!validate->post_ra) { + jay_foreach_block(validate->func, block) { + jay_foreach_phi_src_in_block(block, phi) { + CHECK(BITSET_TEST(validate->defs, jay_phi_src_index(phi))); + } + } + } + + free(validate->defs); + free(validate->files); +} + +void +jay_validate(jay_shader *s, const char *when) +{ + struct validate_state validate = { .when = when, .post_ra = s->post_ra }; + + jay_foreach_function(s, f) { + validate.func = f; + jay_validate_function(&validate); + } + + if (validate.failed) { + fprintf(stderr, "jay shader that failed validation:\n"); + jay_print(stderr, s); + abort(); + } +} + +#endif diff --git a/src/intel/compiler/jay/jay_validate_ra.c b/src/intel/compiler/jay/jay_validate_ra.c new file mode 100644 index 00000000000..02bd20b57bd --- /dev/null +++ b/src/intel/compiler/jay/jay_validate_ra.c @@ -0,0 +1,217 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2024 Alyssa Rosenzweig + * SPDX-License-Identifier: MIT + */ + +#include "util/ralloc.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* Validation doesn't make sense in release builds */ +#ifndef NDEBUG + +struct regfile { + /* For each register in each file, records the SSA index currently stored + * in that register (or zero if the contents are undefined). + */ + uint32_t *r[JAY_NUM_SSA_FILES]; + + /* Size of each register file */ + size_t n[JAY_NUM_SSA_FILES]; +}; + +static uint32_t * +reg(struct regfile *rf, enum jay_file file, uint32_t reg) +{ + /* FLAG and UFLAG share their registers. TODO: Rework?
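+ * (UFLAG is the uniform counterpart of FLAG over the same physical
+ * registers, so both files are tracked in the single FLAG array.)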
*/ + if (file == UFLAG) { + file = FLAG; + } + + assert(file < JAY_NUM_SSA_FILES); + assert(reg < rf->n[file]); + return &rf->r[file][reg]; +} + +static uint32_t * +def_reg(struct regfile *rf, jay_def x, uint32_t component) +{ + return reg(rf, x.file, x.reg + component); +} + +static void +print_regfile(struct regfile *rf, FILE *fp) +{ + fprintf(fp, "regfile: \n"); + jay_foreach_ssa_file(file) { + for (unsigned i = 0; i < rf->n[file]; ++i) { + uint32_t v = *reg(rf, file, i); + const char *prefixes = "ruf"; /* XXX: share with jay_print */ + + if (v) { + fprintf(fp, " %c%u = %u\n", prefixes[file], i, v); + } + } + } + fprintf(fp, "\n"); +} + +static bool +validate_src(struct jay_partition *partition, + jay_inst *I, + unsigned s, + struct regfile *rf, + jay_def def) +{ + jay_foreach_comp(def, c) { + uint32_t actual = *def_reg(rf, def, c); + + if (def.file == GPR) { + assert(jay_gpr_to_stride(partition, def.reg) == + jay_gpr_to_stride(partition, def.reg + c)); + } + + if (actual == 0 || actual != jay_channel(def, c)) { + fprintf(stderr, "invalid RA for source %u, channel %u.\n", s, c); + + fprintf(stderr, "expected index %u but", jay_channel(def, c)); + if (actual) + fprintf(stderr, " got index %u\n", actual); + else + fprintf(stderr, " register is undefined\n"); + + jay_print_inst(stderr, I); + print_regfile(rf, stderr); + return false; + } + } + + return true; +} + +static bool +validate_block(jay_function *func, jay_block *block, struct regfile *blocks) +{ + struct regfile *rf = &blocks[block->index]; + bool success = true; + + /* Pathological shaders can end up with loop headers that have only a + * single predecessor and act like normal blocks. Validate them as such, + * since RA treats them as such implicitly. Affects: + * + * dEQP-VK.graphicsfuzz.spv-stable-mergesort-dead-code + */ + bool loop_header = block->loop_header && jay_num_predecessors(block) > 1; + + /* Initialize the register file with the exit state of any one + * predecessor. + */ + jay_block *first_pred = jay_first_predecessor(block); + if (first_pred) { + struct regfile *pred_rf = &blocks[first_pred->index]; + + jay_foreach_ssa_file(f) { + memcpy(rf->r[f], pred_rf->r[f], rf->n[f] * sizeof(uint32_t)); + } + } + + /* TODO: Handle loop header validation better */ + if (!loop_header) { + /* Intersect with the remaining predecessors. If a register has different + * values coming in from each block, it is considered undefined at the + * start of the block.
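+ * This is a conservative meet: uses in this block then only validate
+ * against values that every predecessor agrees on.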
+ */ + jay_foreach_predecessor(block, pred) { + struct regfile *pred_rf = &blocks[(*pred)->index]; + + jay_foreach_ssa_file(file) { + for (unsigned r = 0; r < rf->n[file]; ++r) { + if (*reg(rf, file, r) != *reg(pred_rf, file, r)) { + *reg(rf, file, r) = 0; + } + } + } + } + } + + jay_foreach_inst_in_block(block, I) { + /* Validate sources */ + jay_foreach_ssa_src(I, s) { + if (jay_channel(I->src[s], 0) != JAY_SENTINEL) { + success &= + validate_src(&func->shader->partition, I, s, rf, I->src[s]); + } + } + + /* Record destinations */ + jay_foreach_dst(I, dst) { + if (jay_channel(dst, 0) != JAY_SENTINEL) { + jay_foreach_comp(dst, c) { + *def_reg(rf, dst, c) = jay_channel(dst, c); + + if (dst.file == GPR) { + struct jay_partition *p = &func->shader->partition; + assert(jay_gpr_to_stride(p, dst.reg) == + jay_gpr_to_stride(p, dst.reg + c)); + } + } + } + } + + if (I->op == JAY_OPCODE_MOV && + jay_channel(I->dst, 0) == JAY_SENTINEL && + jay_is_ssa(I->src[0]) && + jay_channel(I->src[0], 0) == JAY_SENTINEL) { + + /* Lowered live-range splits have no SSA indices associated, so handle + * them directly at the register level. + */ + assert(jay_num_values(I->dst) == jay_num_values(I->src[0])); + + jay_foreach_comp(I->dst, c) { + *def_reg(rf, I->dst, c) = *def_reg(rf, I->src[0], c); + } + } else if (I->op == JAY_OPCODE_SWAP) { + assert(jay_num_values(I->src[0]) == jay_num_values(I->src[1])); + + jay_foreach_comp(I->src[0], c) { + SWAP(*def_reg(rf, I->src[0], c), *def_reg(rf, I->src[1], c)); + } + } + } + + return success; +} + +void +jay_validate_ra(jay_function *func) +{ + bool succ = true; + linear_ctx *lin_ctx = linear_context(func->shader); + struct regfile *blocks = + linear_zalloc_array(lin_ctx, struct regfile, func->num_blocks); + + jay_foreach_block(func, block) { + struct regfile *b = &blocks[block->index]; + assert(block->index < func->num_blocks); + + jay_foreach_ssa_file(file) { + b->n[file] = jay_num_regs(func->shader, file); + b->r[file] = linear_zalloc_array(lin_ctx, uint32_t, b->n[file]); + } + } + + jay_foreach_block(func, block) { + succ &= validate_block(func, block, blocks); + } + + if (!succ) { + jay_print_func(stderr, func); + UNREACHABLE("invalid RA"); + } + + linear_free_context(lin_ctx); +} + +#endif /* NDEBUG */ diff --git a/src/intel/compiler/jay/meson.build b/src/intel/compiler/jay/meson.build new file mode 100644 index 00000000000..e9c47ada78c --- /dev/null +++ b/src/intel/compiler/jay/meson.build @@ -0,0 +1,109 @@ +# Copyright 2017 Intel Corporation +# SPDX-License-Identifier: MIT + +jay_opcodes = custom_target( + input : ['jay_opcodes_gen.py'], + output : ['jay_opcodes.c', 'jay_opcodes.h'], + command : [prog_python, '@INPUT@', '--code', '@OUTPUT0@', '--header', '@OUTPUT1@'], + depend_files : files('jay_opcodes.py'), +) + +idep_jay_opcodes_h = declare_dependency( + sources : [jay_opcodes[1]], + include_directories : include_directories('.'), +) + +jay_extra_info_h = custom_target( + input : ['jay_extra_info.h.py'], + output : 'jay_extra_info.h', + command : [prog_python, '@INPUT@', '@OUTPUT@'], + depend_files : files('jay_opcodes.py'), +) + +idep_jay_extra_info_h = declare_dependency( + sources : [jay_extra_info_h], + include_directories : include_directories('.'), +) + +jay_builder_opcodes_h = custom_target( + input : 'jay_builder_opcodes.h.py', + output : 'jay_builder_opcodes.h', + command : [prog_python, '@INPUT@', '@OUTPUT@'], + depend_files : files('jay_opcodes.py'), +) + +idep_jay_builder_opcodes_h = declare_dependency( + sources : [jay_builder_opcodes_h], + include_directories :
include_directories('.'), +) + +jay_nir_algebraic = custom_target( + 'jay_nir_algebraic.c', + input : ['jay_nir_algebraic.py'], + output : 'jay_nir_algebraic.c', + command : [prog_python, '@INPUT@', '@OUTPUT@', '-p', dir_compiler_nir], + depend_files : nir_algebraic_depends, +) + +libintel_compiler_jay_files = files( + 'jay.h', + 'jay_assign_flags.c', + 'jay_from_nir.c', + 'jay_ir.h', + 'jay_liveness.c', + 'jay_lower_post_ra.c', + 'jay_lower_pre_ra.c', + 'jay_lower_scoreboard.c', + 'jay_lower_spill.c', + 'jay_opt_dead_code.c', + 'jay_opt_control_flow.c', + 'jay_opt_propagate.c', + 'jay_print.c', + 'jay_private.h', + 'jay_repair_ssa.c', + 'jay_register_allocate.c', + 'jay_simd_width.c', + 'jay_spill.c', + 'jay_to_binary.c', + 'jay_validate.c', + 'jay_validate_ra.c', +) + +libintel_compiler_jay = static_library( + 'intel_compiler_jay', + [libintel_compiler_jay_files, jay_nir_algebraic, jay_opcodes[0]], + include_directories : [inc_include, inc_src, inc_intel], + c_args : [no_override_init_args, '-Wno-c23-extensions', '-Wno-array-bounds'], + gnu_symbol_visibility : 'hidden', + dependencies : [idep_nir_headers, idep_jay_opcodes_h, idep_jay_builder_opcodes_h, idep_jay_extra_info_h, idep_mesautil, idep_intel_dev], + build_by_default : false, +) + +idep_intel_compiler_jay = declare_dependency( + link_with : [libintel_compiler_jay], + dependencies : [ + idep_nir, + idep_vtn, + ], +) + +if with_tests + test( + 'jay_tests', + executable( + 'jay_tests', + files( + 'test/test-lower-post-ra.cpp', + 'test/test-optimizer.cpp', + 'test/test-repair-ssa.cpp', + ), + c_args : [c_msvc_compat_args, no_override_init_args], + gnu_symbol_visibility : 'hidden', + include_directories : [inc_include, inc_src, inc_intel], + dependencies: [idep_gtest, idep_nir, idep_jay_opcodes_h, idep_jay_builder_opcodes_h, idep_jay_extra_info_h, idep_mesautil, idep_intel_dev], + link_with : [libintel_compiler_jay], + ), + suite : ['intel'], + protocol : 'gtest', + ) +endif diff --git a/src/intel/compiler/jay/register-file.md b/src/intel/compiler/jay/register-file.md new file mode 100644 index 00000000000..b2053ccf348 --- /dev/null +++ b/src/intel/compiler/jay/register-file.md @@ -0,0 +1,57 @@ +# Glossary + +**lane**: A single work-item. + +**subgroup**: A collection of 8, 16, or 32 lanes executing in lockstep. +Avoid using the term _thread_ as it is ambiguous. + +**uniform**: A value that is the same in every active lane of a subgroup. +Sometimes called _convergent_. Opposite of "non-uniform". + +**non-uniform**: A value that may differ between active lanes within a +subgroup. Sometimes called _divergent_. Opposite of "uniform". + +**GPR**: General-purpose register, a single non-uniform value viewed from the +perspective of a single lane. This is a 'virtual' or 'logical' register within +the SIMT programming model. It does not represent a physical machine +register. For that, see "GRF". + +**UGPR**: Uniform general-purpose register, a single uniform value. This is +again a virtual or logical register. + +**GRF**: A physical Intel GPU register. On Xe2+, a GRF is 512 bits. On older +platforms, a GRF is 256 bits. Depending on the platform and the SIMD width, +different numbers of GRFs are required to store a single GPR, and different +numbers of UGPRs fit into a single GRF. In SIMD32 mode on Xe2, 1 GPR requires +2 GRFs, and 16 UGPRs fit into 1 GRF. + +**scalar**: A single value from the perspective of a single lane; a single GPR +or UGPR. Note that a scalar may be either uniform or non-uniform.
Opposite of +"vector". + +**vector**: A collection of multiple values from the perspective of a single +lane. All scalars within the vector must identically be GPRs or UGPRs. + +# Introduction + +Jay separates the logical register files (GPR and UGPR) from the +unified physical register file. We assign registers independently for each +logical file, and then post-RA we remap to physical GRFs. This simplifies RA. + +We decide a static GPR/UGPR split up front. Ideally, we'd just use the +first N registers for GPRs and the rest for UGPRs, or something like +that. Unfortunately, several hardware issues complicate this scheme... + +# End-of-thread SENDs + +End-of-thread SENDs require that their source be in r112-r127. As their source +will always be per-thread, we want to make sure these are GPRs. + +# Payloads + +At the start of each thread, the register file is preloaded with a payload. +Parts of the payload act like UGPRs, parts act like GPRs, and parts act like... +something weird and in between. To minimize copying, we want to assign UGPRs to +the UGPR parts of the payload and GPRs to the GPR parts. As for the weird cases, +we model them as UGPR vectors and use special opcodes (lowered late to +regioning) to unpack to GPRs for normal handling. diff --git a/src/intel/compiler/jay/test/jay_test.h b/src/intel/compiler/jay/test/jay_test.h new file mode 100644 index 00000000000..43cc48b87ef --- /dev/null +++ b/src/intel/compiler/jay/test/jay_test.h @@ -0,0 +1,141 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_private.h" +#include "shader_enums.h" + +static inline jay_block * +jay_test_block(jay_function *f) +{ + jay_block *blk = jay_new_block(f); + list_addtail(&blk->link, &f->blocks); + return blk; +} + +/* Helper to generate a jay_builder suitable for creating test instructions */ +static inline jay_builder * +jay_test_builder(void *memctx) +{ + jay_shader *s = jay_new_shader(memctx, MESA_SHADER_COMPUTE); + jay_function *f = jay_new_function(s); + s->partition.base8 = 8; + + struct intel_device_info *devinfo = + rzalloc(memctx, struct intel_device_info); + s->devinfo = devinfo; + s->dispatch_width = 32; + + unsigned verx10 = 200; + devinfo->verx10 = verx10; + devinfo->ver = verx10 / 10; + assert(devinfo->ver > 0); + + /* We'll use low indices for test values */ + f->ssa_alloc = 10; + + jay_builder *b = rzalloc(memctx, jay_builder); + *b = jay_init_builder(f, jay_after_block(jay_test_block(f))); + return b; +} + +/* Helper to compare instructions for logical equality. We skip the list + * pointers, then compare the raw data. + */ +static inline bool +jay_inst_equal(jay_inst *A, jay_inst *B) +{ + /* Check the plain old data portion of jay_inst. */ + unsigned header = sizeof(struct list_head); + if (memcmp((uint8_t *) A + header, (uint8_t *) B + header, + sizeof(jay_inst) - header)) + return false; + + /* All of the sizes are plain data. They match, so do a deep compare.
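+ * The source array and the opcode-specific info are laid out contiguously
+ * starting at src, so a single memcmp covers both.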
*/ + size_t size = (A->num_srcs * sizeof(jay_def)) + jay_inst_info_size(A); + return !memcmp(A->src, B->src, size); +} + +static inline bool +jay_block_equal(jay_block *A, jay_block *B) +{ + if (list_length(&A->instructions) != list_length(&B->instructions)) + return false; + + list_pair_for_each_entry(jay_inst, I, J, &A->instructions, &B->instructions, + link) { + if (!jay_inst_equal(I, J)) { + return false; + } + } + + return true; +} + +static inline bool +jay_function_equal(jay_function *A, jay_function *B) +{ + if (list_length(&A->blocks) != list_length(&B->blocks)) + return false; + + list_pair_for_each_entry(jay_block, blockA, blockB, &A->blocks, &B->blocks, + link) { + if (!jay_block_equal(blockA, blockB)) + return false; + } + + return true; +} + +static inline bool +jay_shader_equal(jay_shader *A, jay_shader *B) +{ + if (list_length(&A->functions) != list_length(&B->functions)) + return false; + + list_pair_for_each_entry(jay_function, functionA, functionB, &A->functions, + &B->functions, link) { + if (!jay_function_equal(functionA, functionB)) + return false; + } + + return true; +} + +#define ASSERT_SHADER_EQUAL(A, B) \ + if (!jay_shader_equal(A, B)) { \ + ADD_FAILURE(); \ + fprintf(stderr, "Pass produced unexpected results\n"); \ + fprintf(stderr, " Actual:\n"); \ + jay_print(stderr, A); \ + fprintf(stderr, " Expected:\n"); \ + jay_print(stderr, B); \ + fprintf(stderr, "\n"); \ + } + +#define INSTRUCTION_CASE_GEN(instr, expected, pass, validate) \ + do { \ + jay_builder *A = jay_test_builder(mem_ctx); \ + jay_builder *B = jay_test_builder(mem_ctx); \ + { \ + jay_builder *b = A; \ + instr; \ + } \ + if (validate) \ + jay_validate(A->shader, "test setup"); \ + { \ + jay_builder *b = B; \ + expected; \ + } \ + JAY_PASS(A->shader, pass); \ + ASSERT_SHADER_EQUAL(A->shader, B->shader); \ + } while (0) + +#define INSTRUCTION_CASE(instr, expected, pass) \ + INSTRUCTION_CASE_GEN(instr, expected, pass, true) diff --git a/src/intel/compiler/jay/test/test-lower-post-ra.cpp b/src/intel/compiler/jay/test/test-lower-post-ra.cpp new file mode 100644 index 00000000000..209d944f347 --- /dev/null +++ b/src/intel/compiler/jay/test/test-lower-post-ra.cpp @@ -0,0 +1,82 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_test.h" + +#include <gtest/gtest.h> + +#define CASE(instr, expected) \ + INSTRUCTION_CASE( \ + { \ + A->shader->post_ra = true; \ + instr; \ + }, \ + { \ + B->shader->post_ra = true; \ + expected; \ + }, \ + jay_lower_post_ra) + +#define PRE jay_add_predicate_else +#define POST jay_add_predicate +#define CFLAG jay_set_cond_flag + +#define NEGCASE(x) CASE(x, x) + +class LowerPostRA : public testing::Test { + protected: + LowerPostRA() + { + mem_ctx = ralloc_context(NULL); + + x = jay_bare_reg(GPR, 1); + y = jay_bare_reg(GPR, 2); + z = jay_bare_reg(GPR, 3); + u4 = jay_bare_reg(UGPR, 4); + f0 = jay_bare_reg(FLAG, 0); + f1 = jay_bare_reg(FLAG, 1); + f2 = jay_bare_reg(FLAG, 2); + } + + ~LowerPostRA() + { + ralloc_free(mem_ctx); + } + + jay_inst *I; + void *mem_ctx; + jay_def x, y, z, u4, f0, f1, f2, nul = jay_null(); +}; + +TEST_F(LowerPostRA, Tied) +{ + CASE(PRE(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0, z), + POST(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0)); + + CASE(PRE(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), jay_negate(f0), z), + POST(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), jay_negate(f0))); +} + +TEST_F(LowerPostRA, InsertMove) +{ + CASE(PRE(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0, x), { + POST(b, jay_MOV(b, z,
x), jay_negate(f0)); + POST(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0); + }); +} + +TEST_F(LowerPostRA, RewriteToSel) +{ + CASE(PRE(b, jay_MOV(b, z, y), f0, x), + jay_SEL(b, JAY_TYPE_U32, z, x, y, jay_negate(f0))); +} + +TEST_F(LowerPostRA, CopyUGPR) +{ + NEGCASE(jay_MOV(b, x, u4)); + NEGCASE(jay_MOV(b, u4, x)); +} diff --git a/src/intel/compiler/jay/test/test-optimizer.cpp b/src/intel/compiler/jay/test/test-optimizer.cpp new file mode 100644 index 00000000000..739a2d15610 --- /dev/null +++ b/src/intel/compiler/jay/test/test-optimizer.cpp @@ -0,0 +1,312 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/lut.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_private.h" +#include "jay_test.h" + +#include <gtest/gtest.h> + +static void +jay_optimize_and_dce(jay_shader *shader) +{ + JAY_PASS(shader, jay_opt_propagate_forwards); + JAY_PASS(shader, jay_opt_propagate_backwards); + JAY_PASS(shader, jay_opt_dead_code); +} + +#define CASE(instr, expected) \ + INSTRUCTION_CASE( \ + { \ + instr; \ + jay_UNIT_TEST_u32(b, out); \ + }, \ + { \ + expected; \ + jay_UNIT_TEST_u32(b, out); \ + }, \ + jay_optimize_and_dce) + +#define NEGCASE(instr) CASE(instr, instr) +#define UNIT jay_UNIT_TEST_u32 + +#define NEG(x) jay_negate(x) + +#define MOV(T, src0) \ + ({ \ + jay_def dst = jay_alloc_def(b, GPR, 1); \ + jay_MODIFIER(b, T, dst, src0); \ + dst; \ + }) + +class Optimizer : public testing::Test { + protected: + Optimizer() + { + mem_ctx = ralloc_context(NULL); + + out = jay_scalar(GPR, 8); + wx = jay_scalar(TEST_FILE, 1); + wy = jay_scalar(TEST_FILE, 1); + wz = jay_scalar(TEST_FILE, 1); + } + + ~Optimizer() + { + ralloc_free(mem_ctx); + } + + void *mem_ctx; + + jay_def out, wx, wy, wz; +}; + +static enum jay_type float_types[] = { + JAY_TYPE_F16, + JAY_TYPE_F32, +}; + +TEST_F(Optimizer, Copyprop) +{ + CASE(jay_ADD(b, JAY_TYPE_U32, out, wx, jay_MOV_u32(b, wy)), + jay_ADD(b, JAY_TYPE_U32, out, wx, wy)); +} + +TEST_F(Optimizer, FusedNeg) +{ + for (unsigned i = 0; i < ARRAY_SIZE(float_types); ++i) { + enum jay_type T = float_types[i]; + + CASE(jay_ADD(b, T, out, wx, MOV(T, NEG(wy))), + jay_ADD(b, T, out, wx, NEG(wy))); + + CASE(jay_MUL(b, T, out, MOV(T, NEG(wy)), NEG(wx)), + jay_MUL(b, T, out, NEG(wy), NEG(wx))); + + CASE(jay_MAD(b, T, out, MOV(T, NEG(wy)), wz, NEG(MOV(T, NEG(wx)))), + jay_MAD(b, T, out, NEG(wy), wz, wx)); + } +} + +TEST_F(Optimizer, SELToFloat) +{ + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 3, x); + jay_SEL(b, JAY_TYPE_U32, out, wx, MOV(JAY_TYPE_F32, NEG(wy)), flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 3, x); + jay_SEL(b, JAY_TYPE_F32, out, wx, NEG(wy), flag); + }); +} + +TEST_F(Optimizer, FusedNot) +{ + CASE(jay_BFN(b, out, wx, jay_NOT_u32(b, wy), 0, UTIL_LUT3(a & b)), + jay_BFN(b, out, wx, wy, 0, UTIL_LUT3(a & ~b))); + + CASE(jay_AND(b, JAY_TYPE_U32, out, wx, jay_NOT_u32(b, wy)), + jay_AND(b, JAY_TYPE_U32, out, wx, jay_negate(wy))); + + CASE(jay_XOR(b, JAY_TYPE_U32, out, jay_NOT_u32(b, wx), wy), + jay_XOR(b, JAY_TYPE_U32, out, jay_negate(wx), wy)); + + CASE(jay_OR(b, JAY_TYPE_U32, out, jay_NOT_u32(b, wx), jay_NOT_u32(b, wy)), + jay_OR(b,
JAY_TYPE_U32, out, jay_negate(wx), jay_negate(wy))); +} + +TEST_F(Optimizer, NegativeFusedFneg) +{ + for (unsigned i = 0; i < ARRAY_SIZE(float_types); ++i) { + enum jay_type T = float_types[i]; + NEGCASE(jay_ADD(b, JAY_TYPE_U32, out, wx, MOV(T, NEG(wy)))); + NEGCASE(jay_ADD(b, JAY_TYPE_S32, out, wx, MOV(T, NEG(wy)))); + } +} + +/* TODO: test fneg with f64 */ + +TEST_F(Optimizer, FusedSat) +{ + for (unsigned i = 0; i < ARRAY_SIZE(float_types); ++i) { + enum jay_type T = float_types[i]; + + CASE( + { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, T, x, wx, MOV(T, NEG(wy))); + jay_MODIFIER(b, T, out, x)->saturate = true; + }, + { jay_ADD(b, T, out, wx, NEG(wy))->saturate = true; }); + + CASE( + { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_MUL(b, T, x, wx, MOV(T, NEG(wy))); + jay_MODIFIER(b, T, out, x)->saturate = true; + }, + { jay_MUL(b, T, out, wx, NEG(wy))->saturate = true; }); + + CASE( + { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_MAX(b, T, x, wx, MOV(T, NEG(wy)))->saturate = true; + jay_MODIFIER(b, T, out, x)->saturate = true; + }, + { jay_MAX(b, T, out, wx, NEG(wy))->saturate = true; }); + } +} + +TEST_F(Optimizer, InverseBallotPropagate) +{ + CASE( + { + jay_def x = jay_alloc_def(b, UGPR, 1); + jay_def f = jay_alloc_def(b, FLAG, 1); + jay_ADD(b, JAY_TYPE_U32, x, wx, wy); + jay_MOV(b, f, x); + jay_SEL(b, JAY_TYPE_U32, out, wx, wy, f); + }, + { + UNUSED jay_def x = jay_alloc_def(b, UGPR, 1); + jay_def f = jay_alloc_def(b, FLAG, 1); + jay_ADD(b, JAY_TYPE_U32, f, wx, wy); + jay_SEL(b, JAY_TYPE_U32, out, wx, wy, f); + }); +} + +TEST_F(Optimizer, GtZero) +{ + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 0, x); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *add = jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_set_conditional_mod(b, add, flag, JAY_CONDITIONAL_GT); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); +} + +TEST_F(Optimizer, MultipleCmp) +{ + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def flag2 = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 0, x); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_GT, flag2, 0, x); + jay_SEL(b, JAY_TYPE_U32, out, x, jay_SEL_u32(b, x, 123, flag), flag2); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def flag2 = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *add = jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_set_conditional_mod(b, add, flag, JAY_CONDITIONAL_GT); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_GT, flag2, 0, x); + jay_SEL(b, JAY_TYPE_U32, out, x, jay_SEL_u32(b, x, 123, flag), flag2); + }); +} + +TEST_F(Optimizer, TypeNeutralConditionalMods) +{ + enum jay_conditional_mod mods[] = { + JAY_CONDITIONAL_NE, + JAY_CONDITIONAL_EQ, + }; + + for (unsigned i = 0; i < 2; ++i) { + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_CMP(b, JAY_TYPE_S32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *bfn3 = jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_set_conditional_mod(b, bfn3, flag, 
mods[i]); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_AND(b, JAY_TYPE_U32, x, wx, wy); + jay_CMP(b, JAY_TYPE_S32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *an = jay_AND(b, JAY_TYPE_U32, x, wx, wy); + jay_set_conditional_mod(b, an, flag, mods[i]); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + } +} + +TEST_F(Optimizer, SignednessMismatchConditionalMods) +{ + enum jay_conditional_mod mods[] = { + JAY_CONDITIONAL_LE, + JAY_CONDITIONAL_GT, + }; + + for (unsigned i = 0; i < 2; ++i) { + NEGCASE({ + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_CMP(b, JAY_TYPE_S32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + } +} + +TEST_F(Optimizer, FloatMismatchConditionalMods) +{ + enum jay_conditional_mod mods[] = { + JAY_CONDITIONAL_NAN, + JAY_CONDITIONAL_EQ, + JAY_CONDITIONAL_NE, + JAY_CONDITIONAL_LT, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(mods); ++i) { + NEGCASE({ + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_CMP(b, JAY_TYPE_F32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + } +} diff --git a/src/intel/compiler/jay/test/test-repair-ssa.cpp b/src/intel/compiler/jay/test/test-repair-ssa.cpp new file mode 100644 index 00000000000..8d117746eee --- /dev/null +++ b/src/intel/compiler/jay/test/test-repair-ssa.cpp @@ -0,0 +1,213 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022 Collabora, Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_test.h" + +#include <gtest/gtest.h> + +JAY_DEFINE_FUNCTION_PASS(pass, jay_repair_ssa) + +#define CASE(instr) \ + INSTRUCTION_CASE_GEN( \ + { \ + UNUSED bool repaired = false; \ + b->func->ssa_alloc = 1; \ + instr \ + }, \ + { \ + UNUSED bool repaired = true; \ + b->func->ssa_alloc = 1; \ + instr \ + }, \ + pass, false) + +class RepairSSA : public testing::Test { + protected: + RepairSSA() + { + mem_ctx = ralloc_context(NULL); + } + + ~RepairSSA() + { + ralloc_free(mem_ctx); + } + + void *mem_ctx; +}; + +static jay_def +jay_phi_2(jay_builder *b, jay_block *p1, jay_def v1, jay_block *p2, jay_def v2) +{ + assert(v2.file == v1.file || jay_is_null(v2)); + jay_def idx = jay_alloc_def(b, v1.file, 1); + jay_PHI_DST(b, idx); + jay_cursor saved = b->cursor; + + b->cursor = jay_after_block(p1); + jay_PHI_SRC_u32(b, v1, jay_index(idx)); + + b->cursor = jay_after_block(p2); + jay_PHI_SRC_u32(b, jay_is_null(v2) ?
idx : v2, jay_index(idx)); + + b->cursor = saved; + return idx; +} + +TEST_F(RepairSSA, Local) +{ + CASE({ + jay_def x = jay_MOV_u32(b, 0xcafe); + jay_def y = jay_MOV_u32(b, 0xefac); + + if (repaired) { + jay_UNIT_TEST(b, jay_ADD_f32(b, y, x)); + } else { + jay_ADD(b, JAY_TYPE_F32, x, y, x); + jay_UNIT_TEST(b, x); + } + }); +} + +/* A + * / \ + * B C + * \ / + * D + */ +TEST_F(RepairSSA, IfElse) +{ + CASE({ + jay_block *A = jay_first_block(b->func); + jay_block *B = jay_test_block(b->func); + jay_block *C = jay_test_block(b->func); + jay_block *D = jay_test_block(b->func); + + jay_block_add_successor(A, B); + jay_block_add_successor(A, C); + + jay_block_add_successor(B, D); + jay_block_add_successor(C, D); + + b->cursor = jay_after_block(A); + jay_IF(b); + + b->cursor = jay_after_block(B); + jay_def x = jay_MOV_u32(b, 0xcafe); + jay_def y = jay_MOV_u32(b, 0xbade); + + b->cursor = jay_after_block(C); + jay_ELSE(b); + jay_def x2 = repaired ? jay_alloc_def(b, UGPR, 1) : x; + jay_MOV(b, x2, 0xefac); + jay_def y2 = jay_MOV_u32(b, 0xbaee); + jay_ENDIF(b); + + b->cursor = jay_after_block(D); + jay_def y3 = jay_phi_2(b, B, y, C, y2); + if (repaired) + x = jay_phi_2(b, B, x, C, x2); + + jay_UNIT_TEST(b, jay_ADD_f32(b, x, y3)); + }); +} + +/* + * H + * | + * A---| + * / \ | + * B C | + * | / | + * | D---- + * | + * |-E + */ +TEST_F(RepairSSA, Loop) +{ + CASE({ + jay_block *H = jay_first_block(b->func); + jay_block *A = jay_test_block(b->func); + jay_block *B = jay_test_block(b->func); + jay_block *C = jay_test_block(b->func); + jay_block *D = jay_test_block(b->func); + jay_block *E = jay_test_block(b->func); + + jay_block_add_successor(H, A); + jay_block_add_successor(A, B); + jay_block_add_successor(A, C); + jay_block_add_successor(B, E); + jay_block_add_successor(C, D); + jay_block_add_successor(D, A); + + A->loop_header = true; + + b->cursor = jay_after_block(H); + jay_def x = jay_MOV_u32(b, 0xcafe); + + b->cursor = jay_after_block(A); + jay_def x_in = repaired ? jay_alloc_def(b, UGPR, 1) : x; + jay_def x_out = repaired ? 
jay_alloc_def(b, UGPR, 1) : x; + if (repaired) { + jay_PHI_DST(b, x_in); + } + jay_IF(b); + + b->cursor = jay_after_block(H); + if (repaired) { + jay_PHI_SRC_u32(b, x, jay_index(x_in)); + } + + b->cursor = jay_after_block(B); + jay_BREAK(b); + + b->cursor = jay_after_block(D); + jay_ADD(b, JAY_TYPE_U32, x_out, x_in, 1); + if (repaired) { + jay_PHI_SRC_u32(b, x_out, jay_index(x_in)); + } + jay_WHILE(b); + + b->cursor = jay_after_block(E); + jay_UNIT_TEST(b, x_in); + }); +} + +/* Same setup as IfElse */ +TEST_F(RepairSSA, TrivialPhisOptimized) +{ + CASE({ + jay_block *A = jay_first_block(b->func); + jay_block *B = jay_test_block(b->func); + jay_block *C = jay_test_block(b->func); + jay_block *D = jay_test_block(b->func); + + jay_block_add_successor(A, B); + jay_block_add_successor(A, C); + + jay_block_add_successor(B, D); + jay_block_add_successor(C, D); + + b->cursor = jay_after_block(A); + jay_def x = jay_MOV_u32(b, 0xcafe); + jay_IF(b); + + b->cursor = jay_after_block(C); + jay_ELSE(b); + jay_ENDIF(b); + + b->cursor = jay_after_block(D); + if (repaired) { + b->func->ssa_alloc++; + } + + jay_UNIT_TEST(b, jay_ADD_f32(b, x, x)); + }); +} diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index cdfdd00d5f8..0a2c0c1f66a 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -35,6 +35,7 @@ brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c', command : [prog_python, '@INPUT0@', '--out', '@OUTPUT@']) subdir('brw') +subdir('jay') if with_intel_elk subdir('elk')