From 4eb838eb48a8e489978c6d6fbf1a636da807c434 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 14 Apr 2026 11:25:11 -0400 Subject: [PATCH] jay: split up jay_from_nir.c Big monolithic file, split it up into the relevant pieces. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/jay/jay_from_nir.c | 1101 +------------------ src/intel/compiler/jay/jay_insert_fp_mode.c | 85 ++ src/intel/compiler/jay/jay_nir.c | 462 ++++++++ src/intel/compiler/jay/jay_private.h | 11 + src/intel/compiler/jay/jay_prog_data.c | 581 ++++++++++ src/intel/compiler/jay/meson.build | 3 + 6 files changed, 1149 insertions(+), 1094 deletions(-) create mode 100644 src/intel/compiler/jay/jay_insert_fp_mode.c create mode 100644 src/intel/compiler/jay/jay_nir.c create mode 100644 src/intel/compiler/jay/jay_prog_data.c diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c index a194586a86d..aed974af0de 100644 --- a/src/intel/compiler/jay/jay_from_nir.c +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -7,7 +7,6 @@ #include "compiler/brw/brw_eu.h" #include "compiler/brw/brw_eu_defines.h" #include "compiler/brw/brw_nir.h" -#include "compiler/brw/brw_private.h" #include "compiler/brw/brw_sampler.h" #include "compiler/intel_nir.h" #include "compiler/intel_shader_enums.h" @@ -28,7 +27,6 @@ #include "jay_private.h" #include "nir.h" #include "nir_builder.h" -#include "nir_builder_opcodes.h" #include "nir_defines.h" #include "nir_intrinsics.h" #include "nir_intrinsics_indices.h" @@ -2310,85 +2308,6 @@ jay_emit_eot(struct nir_to_jay_state *nj) } } -static void -set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired) -{ - /* Only touch cr0 if we are changing bits */ - if ((*cr0) != desired) { - jay_builder b = jay_init_builder(f, cursor); - jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired); - *cr0 = desired; - } -} - -static void -jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes) -{ - /* First, work out the global float control mode for the shader */ - uint32_t global = 0x0; - - /* Initially fp16 denorms are flushed-to-zero, handle preserve. */ - if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) { - global |= BRW_CR0_FP16_DENORM_PRESERVE; - } - - /* Initially fp32 denorms are flushed-to-zero, handle preserve. - * - * TODO: Optimize this, we have a dispatch bit. - */ - if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) { - global |= BRW_CR0_FP32_DENORM_PRESERVE; - } - - /* Initially fp64 denorms are flushed to zero, handle preserve. */ - if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) { - global |= BRW_CR0_FP64_DENORM_PRESERVE; - } - - /* By default, we are in round-to-even mode. Note we do not permit setting - * round mode separately by bitsize but this is ok for current APIs. The - * Vulkan driver sets roundingModeIndependence = NONE. - * - * TODO: Optimize this, there is a command buffer bit for it. 
- */ - if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) || - ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) || - ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) { - global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT); - } - - uint32_t cr0 = 0; - jay_function *entrypoint = jay_shader_get_entrypoint(shader); - set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global); - - /* Now handle per-instruction deltas to the global mode */ - jay_foreach_function(shader, func) { - jay_foreach_block(func, block) { - uint32_t current = cr0; - - jay_foreach_inst_in_block(block, I) { - uint32_t required = cr0; - enum jay_rounding_mode round = - (I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND; - - if (round != JAY_ROUND) { - required &= ~BRW_CR0_RND_MODE_MASK; - required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT); - } - - if (jay_type_is_any_float(I->type)) { - set_cr0(func, jay_before_inst(I), &current, required); - } - } - - /* Restore to global state on block boundaries */ - if (jay_num_successors(block) > 0) { - set_cr0(func, jay_after_block(block), &current, cr0); - } - } - } -} - struct payload_builder { jay_builder *b; unsigned offsets[JAY_NUM_SSA_FILES]; @@ -2467,542 +2386,6 @@ setup_compute_payload(struct nir_to_jay_state *nj, struct payload_builder *p) read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); } -static inline enum intel_barycentric_mode -brw_barycentric_mode(const struct brw_fs_prog_key *key, - nir_intrinsic_instr *intr) -{ - const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr); - - /* Barycentric modes don't make sense for flat inputs. */ - assert(mode != INTERP_MODE_FLAT); - - unsigned bary; - switch (intr->intrinsic) { - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_at_offset: - /* When per sample interpolation is dynamic, assume sample interpolation. - * We'll dynamically remap things so that the FS payload is not affected. - */ - bary = key->persample_interp == INTEL_SOMETIMES ?
- INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE : - INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL; - break; - case nir_intrinsic_load_barycentric_centroid: - bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID; - break; - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_at_sample: - bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE; - break; - default: - UNREACHABLE("invalid intrinsic"); - } - - if (mode == INTERP_MODE_NOPERSPECTIVE) - bary += 3; - - return (enum intel_barycentric_mode) bary; -} - -struct fs_info_ctx { - const struct brw_fs_prog_key *key; - struct brw_fs_prog_data *prog_data; - const struct intel_device_info *devinfo; -}; - -static bool -gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data) -{ - struct fs_info_ctx *ctx = data; - struct brw_fs_prog_data *prog_data = ctx->prog_data; - - switch (intr->intrinsic) { - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - prog_data->barycentric_interp_modes |= - 1 << brw_barycentric_mode(ctx->key, intr); - break; - - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: { - unsigned mode = brw_barycentric_mode(ctx->key, intr); - prog_data->barycentric_interp_modes |= 1 << mode; - prog_data->uses_sample_offsets |= - mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE || - mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE; - - if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) - prog_data->uses_npc_bary_coefficients = true; - else - prog_data->uses_pc_bary_coefficients = true; - break; - } - - case nir_intrinsic_load_frag_coord_z: - prog_data->uses_src_depth = true; - break; - - case nir_intrinsic_load_frag_coord_w_rcp: - prog_data->uses_src_w = true; - break; - - case nir_intrinsic_load_sample_mask_in: - /* TODO: Sample masks are broken and discards are broken and simd32 - * layouts are broken too. XXX. - */ - // prog_data->uses_sample_mask = true; - break; - - case nir_intrinsic_load_pixel_coord_intel: - BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); - break; - - default: - break; - } - - return false; -} - -static void -brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data, - const nir_shader *shader) -{ - prog_data->flat_inputs = 0; - - nir_foreach_shader_in_variable(var, shader) { - if (var->data.interpolation != INTERP_MODE_FLAT || - var->data.per_primitive) - continue; - - unsigned slots = glsl_count_attribute_slots(var->type, false); - for (unsigned s = 0; s < slots; s++) { - int input_index = prog_data->urb_setup[var->data.location + s]; - - if (input_index >= 0) - prog_data->flat_inputs |= 1 << input_index; - } - } -} - -static uint8_t -computed_depth_mode(const nir_shader *shader) -{ - if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { - switch (shader->info.fs.depth_layout) { - case FRAG_DEPTH_LAYOUT_NONE: - case FRAG_DEPTH_LAYOUT_ANY: - return BRW_PSCDEPTH_ON; - case FRAG_DEPTH_LAYOUT_GREATER: - return BRW_PSCDEPTH_ON_GE; - case FRAG_DEPTH_LAYOUT_LESS: - return BRW_PSCDEPTH_ON_LE; - case FRAG_DEPTH_LAYOUT_UNCHANGED: - /* We initially set this to OFF, but having the shader write the - * depth means we allocate register space in the SEND message. The - * difference between the SEND register count and the OFF state - * programming makes the HW hang. - * - * Removing the depth writes also leads to test failures. So use - * LesserThanOrEqual, which fits writing the same value - * (unchanged/equal). 
- * - */ - return BRW_PSCDEPTH_ON_LE; - } - } - return BRW_PSCDEPTH_OFF; -} - -/* - * Build up an array of indices into the urb_setup array that - * references the active entries of the urb_setup array. - * Used to accelerate walking the active entries of the urb_setup array - * on each upload. - */ -static void -brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data) -{ - /* TODO(mesh): Review usage of this in the context of Mesh, we may want to - * skip per-primitive attributes here. - */ - - /* Make sure uint8_t is sufficient */ - static_assert(VARYING_SLOT_MAX <= 0xff); - uint8_t index = 0; - for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) { - if (fs_prog_data->urb_setup[attr] >= 0) { - fs_prog_data->urb_setup_attribs[index++] = attr; - } - } - fs_prog_data->urb_setup_attribs_count = index; -} - -static void -calculate_urb_setup(const struct intel_device_info *devinfo, - const struct brw_fs_prog_key *key, - struct brw_fs_prog_data *prog_data, - nir_shader *nir, - const struct brw_mue_map *mue_map, - int *per_primitive_offsets) -{ - memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); - int urb_next = 0; /* in vec4s */ - - /* Figure out where the PrimitiveID lives, either in the per-vertex block - * or in the per-primitive block or both. - */ - const uint64_t per_vert_primitive_id = - key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID; - const uint64_t per_prim_primitive_id = - key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID; - const uint64_t inputs_read = - nir->info.inputs_read & - (~nir->info.per_primitive_inputs | per_vert_primitive_id); - const uint64_t per_primitive_header_bits = - VARYING_BIT_PRIMITIVE_SHADING_RATE | - VARYING_BIT_LAYER | - VARYING_BIT_VIEWPORT | - VARYING_BIT_CULL_PRIMITIVE; - const uint64_t per_primitive_inputs = - nir->info.inputs_read & - (nir->info.per_primitive_inputs | per_prim_primitive_id) & - ~per_primitive_header_bits; - struct intel_vue_map vue_map; - uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX; - - if (mue_map != NULL) { - memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map)); - memcpy(per_primitive_offsets, mue_map->per_primitive_offsets, - sizeof(mue_map->per_primitive_offsets)); - - if (!mue_map->wa_18019110168_active) { - u_foreach_bit64(location, per_primitive_inputs) { - assert(per_primitive_offsets[location] != -1); - - first_read_offset = - MIN2(first_read_offset, - (uint32_t) per_primitive_offsets[location]); - per_primitive_stride = - MAX2((uint32_t) per_primitive_offsets[location] + 16, - per_primitive_stride); - } - } else { - first_read_offset = per_primitive_stride = 0; - } - } else { - brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout, - 1 /* pos_slots, TODO */); - brw_compute_per_primitive_map(per_primitive_offsets, - &per_primitive_stride, &first_read_offset, - 0, nir, nir_var_shader_in, - per_primitive_inputs, - true /* separate_shader */); - } - - if (per_primitive_stride > first_read_offset) { - first_read_offset = ROUND_DOWN_TO(first_read_offset, 32); - - /* Remove the first few unused registers */ - for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { - if (per_primitive_offsets[i] == -1) - continue; - per_primitive_offsets[i] -= first_read_offset; - } - - prog_data->num_per_primitive_inputs = - 2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32); - } else { - prog_data->num_per_primitive_inputs = 0; - } - - /* Now do the per-vertex stuff (what used to be legacy pipeline) */ - - /* If Mesh is involved, we cannot do any 
packing. Documentation doesn't say - * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when - * using Mesh. - */ - if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) { - /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do - * arbitrary rearrangement of the first 16 varying inputs, so we can put - * them wherever we want. Just put them in order. - * - * This is useful because it means that (a) inputs not used by the - * fragment shader won't take up valuable register space, and (b) we - * won't have to recompile the fragment shader if it gets paired with a - * different vertex (or geometry) shader. - */ - for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (inputs_read & BITFIELD64_BIT(i)) { - prog_data->urb_setup[i] = urb_next++; - } - } - } else { - /* We have enough input varyings that the SF/SBE pipeline stage can't - * arbitrarily rearrange them to suit our whim; we have to put them in - * an order that matches the output of the previous pipeline stage - * (geometry or vertex shader). - */ - int first_slot = 0; - for (int i = 0; i < vue_map.num_slots; i++) { - int varying = vue_map.slot_to_varying[i]; - if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) { - first_slot = ROUND_DOWN_TO(i, 2); - break; - } - } - - for (int slot = first_slot; slot < vue_map.num_slots; slot++) { - int varying = vue_map.slot_to_varying[slot]; - if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) { - prog_data->urb_setup[varying] = slot - first_slot; - } - } - urb_next = vue_map.num_slots - first_slot; - } - - prog_data->num_varying_inputs = urb_next; - prog_data->inputs = inputs_read; - prog_data->per_primitive_inputs = per_primitive_inputs; - - brw_compute_urb_setup_index(prog_data); -} - -static void -populate_fs_prog_data(nir_shader *shader, - const struct intel_device_info *devinfo, - const struct brw_fs_prog_key *key, - struct brw_fs_prog_data *prog_data, - const struct brw_mue_map *mue_map, - int *per_primitive_offsets) -{ - struct fs_info_ctx ctx = { - .key = key, - .prog_data = prog_data, - .devinfo = devinfo, - }; - nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx); - - prog_data->uses_kill = shader->info.fs.uses_discard; - prog_data->uses_omask = - !key->ignore_sample_mask_out && - (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); - prog_data->max_polygons = 1; - prog_data->computed_depth_mode = computed_depth_mode(shader); - prog_data->computed_stencil = - shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); - - prog_data->sample_shading = shader->info.fs.uses_sample_shading; - prog_data->api_sample_shading = key->api_sample_shading; - prog_data->min_sample_shading = key->min_sample_shading; - - assert(key->multisample_fbo != INTEL_NEVER || - key->persample_interp == INTEL_NEVER); - - prog_data->persample_dispatch = key->persample_interp; - if (prog_data->sample_shading) - prog_data->persample_dispatch = INTEL_ALWAYS; - - /* We can only persample dispatch if we have a multisample FBO */ - prog_data->persample_dispatch = - MIN2(prog_data->persample_dispatch, key->multisample_fbo); - - /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If - * persample_dispatch & multisample_fbo are not dynamic, Anv should be able - * to definitively tell whether alpha_to_coverage is on or off. 
- */ - prog_data->alpha_to_coverage = key->alpha_to_coverage; - - assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER); - prog_data->mesh_input = key->mesh_input; - - assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER); - prog_data->provoking_vertex_last = key->provoking_vertex_last; - - /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: - * - * "MSDISPMODE_PERSAMPLE is required in order to select - * POSOFFSET_SAMPLE" - * - * So we can only really get sample positions if we are doing real - * per-sample dispatch. If we need gl_SamplePosition and we don't have - * persample dispatch, we hard-code it to 0.5. - */ - prog_data->uses_pos_offset = - prog_data->persample_dispatch != INTEL_NEVER && - (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) || - BITSET_TEST(shader->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); - - prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; - prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage; - prog_data->inner_coverage = shader->info.fs.inner_coverage; - - /* From the BDW PRM documentation for 3DSTATE_WM: - * - * "MSDISPMODE_PERSAMPLE is required in order to select Perspective - * Sample or Non- perspective Sample barycentric coordinates." - * - * So cleanup any potentially set sample barycentric mode when not in per - * sample dispatch. - */ - if (prog_data->persample_dispatch == INTEL_NEVER) { - prog_data->barycentric_interp_modes &= - ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE); - } - - if (devinfo->ver >= 20) { - prog_data->vertex_attributes_bypass = - brw_needs_vertex_attributes_bypass(shader); - } - - prog_data->uses_nonperspective_interp_modes = - (prog_data->barycentric_interp_modes & - INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) || - prog_data->uses_npc_bary_coefficients; - - /* The current VK_EXT_graphics_pipeline_library specification requires - * coarse to specified at compile time. But per sample interpolation can be - * dynamic. So we should never be in a situation where coarse & - * persample_interp are both respectively true & INTEL_ALWAYS. - * - * Coarse will dynamically turned off when persample_interp is active. - */ - assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS); - - prog_data->coarse_pixel_dispatch = - intel_sometimes_invert(prog_data->persample_dispatch); - if (!key->coarse_pixel || - /* DG2 should support this, but Wa_22012766191 says there are issues - * with CPS 1x1 + MSAA + FS writing to oMask. - */ - (devinfo->verx10 < 200 && - (prog_data->uses_omask || prog_data->uses_sample_mask)) || - prog_data->sample_shading || - (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) || - prog_data->computed_stencil || - devinfo->ver < 11) { - prog_data->coarse_pixel_dispatch = INTEL_NEVER; - } - - /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater, - * Message Descriptor : - * - * "Message Type. Specifies the type of message being sent when - * pixel-rate evaluation is requested : - * - * Format = U2 - * 0: Per Message Offset (eval_snapped with immediate offset) - * 1: Sample Position Offset (eval_sindex) - * 2: Centroid Position Offset (eval_centroid) - * 3: Per Slot Offset (eval_snapped with register offset) - * - * Message Type. 
Specifies the type of message being sent when - * coarse-rate evaluation is requested : - * - * Format = U2 - * 0: Coarse to Pixel Mapping Message (internal message) - * 1: Reserved - * 2: Coarse Centroid Position (eval_centroid) - * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" - * - * The Sample Position Offset is marked as reserved for coarse rate - * evaluation and leads to hangs if we try to use it. So disable coarse - * pixel shading if we have any intrinsic that will result in a pixel - * interpolater message at sample. - */ - if (intel_nir_pulls_at_sample(shader)) - prog_data->coarse_pixel_dispatch = INTEL_NEVER; - - /* We choose to always enable VMask prior to XeHP, as it would cause - * us to lose out on the eliminate_find_live_channel() optimization. - */ - prog_data->uses_vmask = - devinfo->verx10 < 125 || - shader->info.fs.needs_coarse_quad_helper_invocations || - shader->info.uses_wide_subgroup_intrinsics || - prog_data->coarse_pixel_dispatch != INTEL_NEVER; - - prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients; - - if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) { - prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth; - prog_data->uses_src_depth = false; - } - - calculate_urb_setup(devinfo, key, prog_data, shader, mue_map, - per_primitive_offsets); - brw_compute_flat_inputs(prog_data, shader); -} - -static void -populate_vs_prog_data(nir_shader *nir, - const struct intel_device_info *devinfo, - const struct brw_vs_prog_key *key, - struct brw_vs_prog_data *prog_data, - unsigned nr_packed_regs, - bool debug) -{ - unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read); - BITSET_WORD *sysvals = nir->info.system_values_read; - - /* gl_VertexID and gl_InstanceID are system values, but arrive via an - * incoming vertex attribute. So, add an extra slot. - */ - if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) || - BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) || - BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) || - BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) { - nr_attribute_slots++; - } - - /* gl_DrawID and IsIndexedDraw share its very own vec4 */ - if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) || - BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) { - nr_attribute_slots++; - } - - const struct { - bool *data; - gl_system_value val; - } bool_sysvals[] = { - { &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW }, - { &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX }, - { &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE }, - { &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE }, - { &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID }, - { &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID }, - }; - - for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) { - *bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val); - } - - unsigned nr_attribute_regs; - if (key->vf_component_packing) { - prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8); - nr_attribute_regs = nr_packed_regs; - } else { - prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2); - nr_attribute_regs = 4 * nr_attribute_slots; - } - - /* Since vertex shaders reuse the same VUE entry for inputs and outputs - * (overwriting the original contents), we need to make sure the size is - * the larger of the two. 
- */ - const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4), - prog_data->base.vue_map.num_slots); - prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; - - if (unlikely(debug)) { - fprintf(stderr, "VS Output "); - brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX); - } -} - static void setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p) { @@ -3196,275 +2579,6 @@ jay_gather_stats(const jay_shader *s, struct genisa_stats *stats) stats->sends -= (s->spills + s->fills); } -/* - * Jay-to-NIR relies on a careful indexing of defs: every 32-bit word has - * its own index. Vectors/64-bit use contiguous indices. We therefore run a - * modified version of nir_index_ssa_defs right before translating NIR->Jay. - */ -static bool -index_ssa_def_cb(nir_def *def, void *state) -{ - unsigned *index = (unsigned *) state; - def->index = *index; - *index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32); - return true; -} - -static void -nj_index_ssa_defs(nir_shader *nir) -{ - nir_foreach_function_impl(impl, nir) { - /* The zero index means null in Jay, so start SSA indices at 1 */ - unsigned index = 1; - - nir_foreach_block_unstructured(block, impl) { - nir_foreach_instr(instr, block) - nir_foreach_def(instr, index_ssa_def_cb, &index); - } - - impl->ssa_alloc = index; - } -} - -static bool -lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_) -{ - if (intr->intrinsic != nir_intrinsic_load_helper_invocation) - return false; - - /* TODO: Is this right for multisampling? */ - b->cursor = nir_before_instr(&intr->instr); - nir_def *active = - nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b))); - - nir_def_replace(&intr->def, active); - return true; -} - -static bool -lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) -{ - if (intr->intrinsic != nir_intrinsic_load_frag_coord && - intr->intrinsic != nir_intrinsic_load_pixel_coord) - return false; - - b->cursor = nir_before_instr(&intr->instr); - nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b)); - - if (intr->intrinsic == nir_intrinsic_load_frag_coord) { - c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)), - nir_u2f32(b, nir_channel(b, c, 1)), nir_load_frag_coord_z(b), - nir_frcp(b, nir_load_frag_coord_w_rcp(b))); - } - - nir_def_replace(&intr->def, c); - return true; -} - -static bool -jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) -{ - b->cursor = nir_after_instr(&intr->instr); - unsigned *simd_width = simd_; - - /* mask & -mask isolates the lowest set bit in the mask. */ - if (intr->intrinsic == nir_intrinsic_elect) { - nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b)); - mask = nir_iand(b, mask, nir_ineg(b, mask)); - nir_def_replace(&intr->def, nir_inverse_ballot(b, mask)); - return true; - } - - /* Ballots must match the SIMD size */ - if (intr->intrinsic == nir_intrinsic_ballot || - intr->intrinsic == nir_intrinsic_ballot_relaxed) { - unsigned old_bitsize = intr->def.bit_size; - intr->def.bit_size = *simd_width; - nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize); - nir_def_rewrite_uses_after(&intr->def, u2uN); - return true; - } - - /* Note: we don't treat read_invocation specially because there's little - * benefit but doing so would require expensive uniformizing in some cases. 
- */ - if (intr->intrinsic != nir_intrinsic_shuffle && - intr->intrinsic != nir_intrinsic_read_invocation) - return false; - - nir_def *data = intr->src[0].ssa; - assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized"); - - nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4); - nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B)); - return true; -} - -struct frag_out_ctx { - nir_def *colour[8], *depth, *stencil, *sample_mask; -}; - -static bool -collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_) -{ - struct frag_out_ctx *ctx = ctx_; - if (intr->intrinsic != nir_intrinsic_store_output) - return false; - - unsigned wrmask = nir_intrinsic_write_mask(intr); - assert(nir_intrinsic_component(intr) == 0 && "component should be lowered"); - assert(util_is_power_of_two_nonzero(wrmask + 1) && - "complex writemasks should be lowered"); - - /* TODO: Optimize with write mask? */ - - gl_frag_result loc = nir_intrinsic_io_semantics(intr).location; - assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo"); - nir_def **out; - if (loc == FRAG_RESULT_COLOR) { - out = &ctx->colour[0]; - } else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) { - out = &ctx->colour[loc - FRAG_RESULT_DATA0]; - } else if (loc == FRAG_RESULT_DEPTH) { - out = &ctx->depth; - } else if (loc == FRAG_RESULT_STENCIL) { - UNREACHABLE("todo"); - out = &ctx->stencil; - } else if (loc == FRAG_RESULT_SAMPLE_MASK) { - UNREACHABLE("todo"); - out = &ctx->sample_mask; - } else { - UNREACHABLE("invalid location"); - } - - assert((*out) == NULL && "each location written exactly once"); - *out = intr->src[0].ssa; - - nir_instr_remove(&intr->instr); - return true; -} - -static void -append_payload(nir_builder *b, - nir_def **payload, - unsigned *len, - unsigned max_len, - nir_def *value) -{ - if (value != NULL) { - for (unsigned i = 0; i < value->num_components; ++i) { - payload[*len] = nir_channel(b, value, i); - (*len)++; - assert((*len) <= max_len); - } - } -} - -static void -insert_rt_store(nir_builder *b, - const struct intel_device_info *devinfo, - signed target, - bool last, - nir_def *colour, - nir_def *src0_alpha, - nir_def *depth, - nir_def *stencil, - nir_def *sample_mask, - unsigned dispatch_width) -{ - bool null_rt = target < 0; - target = MAX2(target, 0); - - if (!colour) { - colour = nir_undef(b, 4, 32); - } - - colour = nir_pad_vec4(b, colour); - - if (null_rt) { - /* Even if we don't write a RT, we still need to write alpha for - * alpha-to-coverage and alpha testing. Optimize the other channels out. - */ - colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32), - nir_channel(b, colour, 3), 3); - } - - /* TODO: Not sure I like this. We'll see what 2src looks like. */ - unsigned op = dispatch_width == 32 ? - XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE : - BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; - uint64_t desc = - brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */); - - uint64_t ex_desc = 0; - if (devinfo->ver >= 20) { - ex_desc = target << 21 | - null_rt << 20 | - (src0_alpha ? (1 << 15) : 0) | - (stencil ? (1 << 14) : 0) | - (depth ? (1 << 13) : 0) | - (sample_mask ? (1 << 12) : 0); - } else if (devinfo->ver >= 11) { - /* Set the "Render Target Index" and "Src0 Alpha Present" fields - * in the extended message descriptor, in lieu of using a header. - */ - ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? 
(1 << 15) : 0); - } - - /* Build the payload */ - nir_def *payload[8] = { NULL }; - unsigned len = 0; - append_payload(b, payload, &len, ARRAY_SIZE(payload), colour); - append_payload(b, payload, &len, ARRAY_SIZE(payload), depth); - /* TODO */ - - nir_def *disable = b->shader->info.fs.uses_discard ? - nir_is_helper_invocation(b, 1) : - nir_imm_false(b); - - nir_store_render_target_intel(b, nir_vec(b, payload, len), - nir_imm_ivec2(b, desc, ex_desc), disable, - .eot = last); -} - -static void -lower_fragment_outputs(nir_function_impl *impl, - const struct intel_device_info *devinfo, - unsigned nr_color_regions, - unsigned dispatch_width) -{ - struct frag_out_ctx ctx = { { NULL } }; - nir_function_intrinsics_pass(impl, collect_fragment_output, - nir_metadata_control_flow, &ctx); - nir_builder b_ = nir_builder_at(nir_after_impl(impl)); - nir_builder *b = &b_; - assert(nr_color_regions <= ARRAY_SIZE(ctx.colour)); - - signed first = -1; - for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) { - if (ctx.colour[i]) { - first = i; - break; - } - } - - /* Do the later render targets first */ - for (unsigned i = first + 1; i < nr_color_regions; ++i) { - if (ctx.colour[i]) { - insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL, - NULL, dispatch_width); - } - } - - /* Finally do render target zero attaching all the sideband things and - * setting the LastRT bit. This needs to exist even if nothing is written - * since it also signals end-of-thread. - */ - insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true, - first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth, - ctx.stencil, ctx.sample_mask, dispatch_width); -} - struct jay_shader_bin * jay_compile(const struct intel_device_info *devinfo, void *mem_ctx, @@ -3473,177 +2587,8 @@ jay_compile(const struct intel_device_info *devinfo, union brw_any_prog_key *key) { jay_debug = debug_get_option_jay_debug(); - enum mesa_shader_stage stage = nir->info.stage; - bool debug = INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage)); - struct brw_compiler compiler = { .devinfo = devinfo }; - unsigned nr_packed_regs = 0; - - brw_pass_tracker pt_ = { - .nir = nir, - .key = &key->base, - .dispatch_width = 0, - .compiler = &compiler, - .archiver = NULL, //params->base.archiver, - }, *pt = &pt_; - - BRW_NIR_SNAPSHOT("first"); - - prog_data->base.ray_queries = nir->info.ray_queries; - prog_data->base.stage = stage; - // TODO: Make the driver do this? - // prog_data->base.source_hash = params->source_hash; - prog_data->base.total_shared = nir->info.shared_size; - - /* TODO: Real heuristic */ - bool do_simd32 = INTEL_SIMD(FS, 32); - do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT; - unsigned simd_width = do_simd32 ? (nir->info.api_subgroup_size ?: 32) : 16; - - if (stage == MESA_SHADER_VERTEX) { - /* We only expect slot compaction to be disabled when using device - * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS - * programming. This should always be enabled together with VF component - * packing to minimize the size of the payload. - */ - assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing); - - /* When using Primitive Replication for multiview, each view gets its own - * position slot. - */ - const uint32_t pos_slots = - (nir->info.per_view_outputs & VARYING_BIT_POS) ? 
- MAX2(1, util_bitcount(key->base.view_mask)) : - 1; - - /* Only position is allowed to be per-view */ - assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS)); - - brw_compute_vue_map(devinfo, &prog_data->vue.vue_map, - nir->info.outputs_written, key->base.vue_layout, - pos_slots); - - brw_nir_apply_key(pt, &key->base, simd_width); - - prog_data->vs.inputs_read = nir->info.inputs_read; - prog_data->vs.double_inputs_read = nir->info.vs.double_inputs; - prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction; - - brw_nir_lower_vs_inputs(nir); - brw_nir_lower_vue_outputs(nir); - BRW_NIR_SNAPSHOT("after_lower_io"); - - memset(prog_data->vs.vf_component_packing, 0, - sizeof(prog_data->vs.vf_component_packing)); - if (key->vs.vf_component_packing) { - nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs); - } - - /* Get constant offsets out of the way for proper clip/cull handling */ - BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); - BRW_NIR_PASS(nir_opt_constant_folding); - BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo, - &prog_data->vue.vue_map, 0, 0); - } else if (stage == MESA_SHADER_FRAGMENT) { - assert(key->fs.mesh_input == INTEL_NEVER && "todo"); - assert(!key->fs.force_dual_color_blend && "todo"); - brw_nir_apply_key(pt, &key->base, 32); - brw_nir_lower_fs_inputs(nir, devinfo, &key->fs); - brw_nir_lower_fs_outputs(nir); - NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); - - if (!brw_can_coherent_fb_fetch(devinfo)) - NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs); - - NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord); - NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord, - nir_metadata_control_flow, NULL); - NIR_PASS(_, nir, nir_opt_barycentric, true); - - lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo, - key->fs.nr_color_regions, simd_width); - NIR_PASS(_, nir, nir_lower_helper_writes, true); - NIR_PASS(_, nir, nir_lower_is_helper_invocation); - NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation, - nir_metadata_control_flow, NULL); - - if (key->fs.alpha_to_coverage != INTEL_NEVER) { - /* Run constant fold optimization in order to get the correct source - * offset to determine render target 0 store instruction in - * emit_alpha_to_coverage pass. - */ - NIR_PASS(_, nir, nir_opt_constant_folding); - NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage); - } - - // TODO - // NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); - - if (!brw_fs_prog_key_is_dynamic(&key->fs)) { - uint32_t f = 0; - - if (key->fs.multisample_fbo == INTEL_ALWAYS) - f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO; - - if (key->fs.alpha_to_coverage == INTEL_ALWAYS) - f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE; - - if (key->fs.provoking_vertex_last == INTEL_ALWAYS) - f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST; - - if (key->fs.persample_interp == INTEL_ALWAYS) { - f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH | - INTEL_FS_CONFIG_PERSAMPLE_INTERP; - } - - NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel, - f); - } - } else { - brw_nir_apply_key(pt, &key->base, simd_width); - } - - brw_postprocess_nir_opts(pt); - - NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd, - nir_metadata_control_flow, &simd_width); - NIR_PASS(_, nir, nir_opt_algebraic_late); - NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16); - - /* Late postprocess while remaining in SSA */ - /* Run fsign lowering again after the last time brw_nir_optimize is called. 
- * As is the case with conversion lowering (below), brw_nir_optimize can - * create additional fsign instructions. - */ - NIR_PASS(_, nir, jay_nir_lower_fsign); - NIR_PASS(_, nir, jay_nir_lower_bool); - NIR_PASS(_, nir, nir_opt_cse); - NIR_PASS(_, nir, nir_opt_dce); - NIR_PASS(_, nir, jay_nir_opt_sel_zero); - - /* Run nir_split_conversions only after the last tiem - * brw_nir_optimize is called. Various optimizations invoked there can - * rematerialize the conversions that the lowering pass eliminates. - */ - const nir_split_conversions_options split_conv_opts = { - .callback = intel_nir_split_conversions_cb, - }; - NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts); - - /* Do this only after the last opt_gcm. GCM will undo this lowering. */ - if (stage == MESA_SHADER_FRAGMENT) { - NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample); - } - - NIR_PASS(_, nir, nir_opt_constant_folding); - NIR_PASS(_, nir, nir_lower_load_const_to_scalar); - NIR_PASS(_, nir, nir_lower_all_phis_to_scalar); - NIR_PASS(_, nir, nir_opt_copy_prop); - NIR_PASS(_, nir, nir_opt_dce); - - /* Run divergence analysis at the end */ - nir_sweep(nir); - nj_index_ssa_defs(nir); - nir_divergence_analysis(nir); + bool debug = INTEL_DEBUG(intel_debug_flag_for_shader_stage(nir->info.stage)); + unsigned simd_width = jay_process_nir(devinfo, nir, prog_data, key); if (debug) { /* We can't use nir_print_shader since it reindexes SSA defs. */ @@ -3652,18 +2597,7 @@ jay_compile(const struct intel_device_info *devinfo, fflush(stdout); } - if (stage == MESA_SHADER_VERTEX) { - populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs, - nr_packed_regs, debug); - } else if (stage == MESA_SHADER_FRAGMENT) { - int per_primitive_offsets[VARYING_SLOT_MAX]; - memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); - - populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs, - NULL /* TODO: mue_map */, per_primitive_offsets); - } - - jay_shader *s = jay_new_shader(NULL, stage); + jay_shader *s = jay_new_shader(NULL, nir->info.stage); s->dispatch_width = simd_width; s->scratch_size = align(nir->scratch_size, 4) * s->dispatch_width; s->devinfo = devinfo; @@ -3729,13 +2663,13 @@ jay_compile(const struct intel_device_info *devinfo, jay_gather_stats(s, &bin->stats); bin->stats.code_size = bin->size; - if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage))) { + if (debug) { if (nir->info.label) { printf("%s - ", nir->info.label); } const char *shader_name = - ralloc_asprintf(s, "%s SIMD%u", _mesa_shader_stage_to_abbrev(stage), + ralloc_asprintf(s, "%s SIMD%u", _mesa_shader_stage_to_abbrev(s->stage), s->dispatch_width); genisa_stats_fprintf(stdout, shader_name, &bin->stats); } @@ -3743,7 +2677,7 @@ jay_compile(const struct intel_device_info *devinfo, bin->stats.workgroup_memory_size = nir->info.shared_size; bin->stats.dispatch_width = simd_width; - if (stage == MESA_SHADER_FRAGMENT) { + if (s->stage == MESA_SHADER_FRAGMENT) { if (simd_width == 8) { prog_data->fs.dispatch_8 = true; } else if (simd_width == 16) { @@ -3754,13 +2688,10 @@ jay_compile(const struct intel_device_info *devinfo, prog_data->fs.prog_offset_32 = 0; } - prog_data->fs.has_side_effects = nir->info.writes_memory; - } else if (mesa_shader_stage_is_compute(stage)) { + } else if (mesa_shader_stage_is_compute(s->stage)) { unsigned i = simd_width == 8 ? 0 : simd_width == 16 ? 
1 : 2; prog_data->cs.prog_offset[i] = 0; prog_data->cs.prog_mask = BITFIELD_BIT(i); - prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr; - prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr; prog_data->cs.prog_spilled = s->scratch_size > 0; /* XXX */ } @@ -3789,24 +2720,6 @@ jay_compile(const struct intel_device_info *devinfo, util_next_power_of_two(s->scratch_size)); } - if (stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_EVAL || - stage == MESA_SHADER_GEOMETRY || - stage == MESA_SHADER_MESH) { - - uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size); - uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size, - nir->info.cull_distance_array_size); - - if (stage == MESA_SHADER_MESH) { - prog_data->mesh.clip_distance_mask = clip_mask; - prog_data->mesh.cull_distance_mask = cull_mask; - } else { - prog_data->vue.clip_distance_mask = clip_mask; - prog_data->vue.cull_distance_mask = cull_mask; - } - } - /* Scratch is allocated in 1KiB increments. */ prog_data->base.total_scratch = align(prog_data->base.total_scratch, 1024); diff --git a/src/intel/compiler/jay/jay_insert_fp_mode.c b/src/intel/compiler/jay/jay_insert_fp_mode.c new file mode 100644 index 00000000000..f7fbc82b31d --- /dev/null +++ b/src/intel/compiler/jay/jay_insert_fp_mode.c @@ -0,0 +1,85 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ +#include "jay_builder.h" +#include "jay_ir.h" + +static void +set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired) +{ + /* Only touch cr0 if we are changing bits */ + if ((*cr0) != desired) { + jay_builder b = jay_init_builder(f, cursor); + jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired); + *cr0 = desired; + } +} + +void +jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes) +{ + /* First, work out the global float control mode for the shader */ + uint32_t global = 0x0; + + /* Initially fp16 denorms are flushed-to-zero, handle preserve. */ + if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) { + global |= BRW_CR0_FP16_DENORM_PRESERVE; + } + + /* Initially fp32 denorms are flushed-to-zero, handle preserve. + * + * TODO: Optimize this, we have a dispatch bit. + */ + if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) { + global |= BRW_CR0_FP32_DENORM_PRESERVE; + } + + /* Initially fp64 denorms are flushed to zero, handle preserve. */ + if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) { + global |= BRW_CR0_FP64_DENORM_PRESERVE; + } + + /* By default, we are in round-to-even mode. Note we do not permit setting + * round mode separately by bitsize but this is ok for current APIs. The + * Vulkan driver sets roundingModeIndependence = NONE. + * + * TODO: Optimize this, there is a command buffer bit for it. 
+ */ + if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) || + ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) || + ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) { + global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT); + } + + uint32_t cr0 = 0; + jay_function *entrypoint = jay_shader_get_entrypoint(shader); + set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global); + + /* Now handle per-instruction deltas to the global mode */ + jay_foreach_function(shader, func) { + jay_foreach_block(func, block) { + uint32_t current = cr0; + + jay_foreach_inst_in_block(block, I) { + uint32_t required = cr0; + enum jay_rounding_mode round = + (I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND; + + if (round != JAY_ROUND) { + required &= ~BRW_CR0_RND_MODE_MASK; + required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT); + } + + if (jay_type_is_any_float(I->type)) { + set_cr0(func, jay_before_inst(I), &current, required); + } + } + + /* Restore to global state on block boundaries */ + if (jay_num_successors(block) > 0) { + set_cr0(func, jay_after_block(block), &current, cr0); + } + } + } +} diff --git a/src/intel/compiler/jay/jay_nir.c b/src/intel/compiler/jay/jay_nir.c new file mode 100644 index 00000000000..b50b98248fd --- /dev/null +++ b/src/intel/compiler/jay/jay_nir.c @@ -0,0 +1,462 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "compiler/brw/brw_nir.h" +#include "compiler/brw/brw_private.h" +#include "compiler/intel_nir.h" +#include "jay_private.h" +#include "nir.h" +#include "nir_builder.h" + +/* + * Jay-to-NIR relies on a careful indexing of defs: every 32-bit word has + * its own index. Vectors/64-bit use contiguous indices. We therefore run a + * modified version of nir_index_ssa_defs right before translating NIR->Jay. + */ +static bool +index_ssa_def_cb(nir_def *def, void *state) +{ + unsigned *index = (unsigned *) state; + def->index = *index; + *index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32); + return true; +} + +static void +nj_index_ssa_defs(nir_shader *nir) +{ + nir_foreach_function_impl(impl, nir) { + /* The zero index means null in Jay, so start SSA indices at 1 */ + unsigned index = 1; + + nir_foreach_block_unstructured(block, impl) { + nir_foreach_instr(instr, block) + nir_foreach_def(instr, index_ssa_def_cb, &index); + } + + impl->ssa_alloc = index; + } +} + +static bool +lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_) +{ + if (intr->intrinsic != nir_intrinsic_load_helper_invocation) + return false; + + /* TODO: Is this right for multisampling?
*/ + b->cursor = nir_before_instr(&intr->instr); + nir_def *active = + nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b))); + + nir_def_replace(&intr->def, active); + return true; +} + +static bool +lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) +{ + if (intr->intrinsic != nir_intrinsic_load_frag_coord && + intr->intrinsic != nir_intrinsic_load_pixel_coord) + return false; + + b->cursor = nir_before_instr(&intr->instr); + nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b)); + + if (intr->intrinsic == nir_intrinsic_load_frag_coord) { + c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)), + nir_u2f32(b, nir_channel(b, c, 1)), nir_load_frag_coord_z(b), + nir_frcp(b, nir_load_frag_coord_w_rcp(b))); + } + + nir_def_replace(&intr->def, c); + return true; +} + +static bool +jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) +{ + b->cursor = nir_after_instr(&intr->instr); + unsigned *simd_width = simd_; + + /* mask & -mask isolates the lowest set bit in the mask. */ + if (intr->intrinsic == nir_intrinsic_elect) { + nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b)); + mask = nir_iand(b, mask, nir_ineg(b, mask)); + nir_def_replace(&intr->def, nir_inverse_ballot(b, mask)); + return true; + } + + /* Ballots must match the SIMD size */ + if (intr->intrinsic == nir_intrinsic_ballot || + intr->intrinsic == nir_intrinsic_ballot_relaxed) { + unsigned old_bitsize = intr->def.bit_size; + intr->def.bit_size = *simd_width; + nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize); + nir_def_rewrite_uses_after(&intr->def, u2uN); + return true; + } + + /* Note: we don't treat read_invocation specially because there's little + * benefit but doing so would require expensive uniformizing in some cases. + */ + if (intr->intrinsic != nir_intrinsic_shuffle && + intr->intrinsic != nir_intrinsic_read_invocation) + return false; + + nir_def *data = intr->src[0].ssa; + assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized"); + + nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4); + nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B)); + return true; +} + +struct frag_out_ctx { + nir_def *colour[8], *depth, *stencil, *sample_mask; +}; + +static bool +collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_) +{ + struct frag_out_ctx *ctx = ctx_; + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + unsigned wrmask = nir_intrinsic_write_mask(intr); + assert(nir_intrinsic_component(intr) == 0 && "component should be lowered"); + assert(util_is_power_of_two_nonzero(wrmask + 1) && + "complex writemasks should be lowered"); + + /* TODO: Optimize with write mask? 
*/ + + gl_frag_result loc = nir_intrinsic_io_semantics(intr).location; + assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo"); + nir_def **out; + if (loc == FRAG_RESULT_COLOR) { + out = &ctx->colour[0]; + } else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) { + out = &ctx->colour[loc - FRAG_RESULT_DATA0]; + } else if (loc == FRAG_RESULT_DEPTH) { + out = &ctx->depth; + } else if (loc == FRAG_RESULT_STENCIL) { + UNREACHABLE("todo"); + out = &ctx->stencil; + } else if (loc == FRAG_RESULT_SAMPLE_MASK) { + UNREACHABLE("todo"); + out = &ctx->sample_mask; + } else { + UNREACHABLE("invalid location"); + } + + assert((*out) == NULL && "each location written exactly once"); + *out = intr->src[0].ssa; + + nir_instr_remove(&intr->instr); + return true; +} + +static void +append_payload(nir_builder *b, + nir_def **payload, + unsigned *len, + unsigned max_len, + nir_def *value) +{ + if (value != NULL) { + for (unsigned i = 0; i < value->num_components; ++i) { + payload[*len] = nir_channel(b, value, i); + (*len)++; + assert((*len) <= max_len); + } + } +} + +static void +insert_rt_store(nir_builder *b, + const struct intel_device_info *devinfo, + signed target, + bool last, + nir_def *colour, + nir_def *src0_alpha, + nir_def *depth, + nir_def *stencil, + nir_def *sample_mask, + unsigned dispatch_width) +{ + bool null_rt = target < 0; + target = MAX2(target, 0); + + if (!colour) { + colour = nir_undef(b, 4, 32); + } + + colour = nir_pad_vec4(b, colour); + + if (null_rt) { + /* Even if we don't write a RT, we still need to write alpha for + * alpha-to-coverage and alpha testing. Optimize the other channels out. + */ + colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32), + nir_channel(b, colour, 3), 3); + } + + /* TODO: Not sure I like this. We'll see what 2src looks like. */ + unsigned op = dispatch_width == 32 ? + XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE : + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + uint64_t desc = + brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */); + + uint64_t ex_desc = 0; + if (devinfo->ver >= 20) { + ex_desc = target << 21 | + null_rt << 20 | + (src0_alpha ? (1 << 15) : 0) | + (stencil ? (1 << 14) : 0) | + (depth ? (1 << 13) : 0) | + (sample_mask ? (1 << 12) : 0); + } else if (devinfo->ver >= 11) { + /* Set the "Render Target Index" and "Src0 Alpha Present" fields + * in the extended message descriptor, in lieu of using a header. + */ + ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0); + } + + /* Build the payload */ + nir_def *payload[8] = { NULL }; + unsigned len = 0; + append_payload(b, payload, &len, ARRAY_SIZE(payload), colour); + append_payload(b, payload, &len, ARRAY_SIZE(payload), depth); + /* TODO */ + + nir_def *disable = b->shader->info.fs.uses_discard ? 
+ nir_is_helper_invocation(b, 1) : + nir_imm_false(b); + + nir_store_render_target_intel(b, nir_vec(b, payload, len), + nir_imm_ivec2(b, desc, ex_desc), disable, + .eot = last); +} + +static void +lower_fragment_outputs(nir_function_impl *impl, + const struct intel_device_info *devinfo, + unsigned nr_color_regions, + unsigned dispatch_width) +{ + struct frag_out_ctx ctx = { { NULL } }; + nir_function_intrinsics_pass(impl, collect_fragment_output, + nir_metadata_control_flow, &ctx); + nir_builder b_ = nir_builder_at(nir_after_impl(impl)); + nir_builder *b = &b_; + assert(nr_color_regions <= ARRAY_SIZE(ctx.colour)); + + signed first = -1; + for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) { + if (ctx.colour[i]) { + first = i; + break; + } + } + + /* Do the later render targets first */ + for (unsigned i = first + 1; i < nr_color_regions; ++i) { + if (ctx.colour[i]) { + insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL, + NULL, dispatch_width); + } + } + + /* Finally do render target zero attaching all the sideband things and + * setting the LastRT bit. This needs to exist even if nothing is written + * since it also signals end-of-thread. + */ + insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true, + first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth, + ctx.stencil, ctx.sample_mask, dispatch_width); +} + +unsigned +jay_process_nir(const struct intel_device_info *devinfo, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key) +{ + enum mesa_shader_stage stage = nir->info.stage; + struct brw_compiler compiler = { .devinfo = devinfo }; + unsigned nr_packed_regs = 0; + + brw_pass_tracker pt_ = { + .nir = nir, + .key = &key->base, + .dispatch_width = 0, + .compiler = &compiler, + .archiver = NULL, //params->base.archiver, + }, *pt = &pt_; + + BRW_NIR_SNAPSHOT("first"); + + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.stage = stage; + // TODO: Make the driver do this? + // prog_data->base.source_hash = params->source_hash; + prog_data->base.total_shared = nir->info.shared_size; + + /* TODO: Real heuristic */ + bool do_simd32 = INTEL_SIMD(FS, 32); + do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT; + unsigned simd_width = do_simd32 ? (nir->info.api_subgroup_size ?: 32) : 16; + + if (stage == MESA_SHADER_VERTEX) { + /* We only expect slot compaction to be disabled when using device + * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS + * programming. This should always be enabled together with VF component + * packing to minimize the size of the payload. + */ + assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing); + + /* When using Primitive Replication for multiview, each view gets its own + * position slot. + */ + const uint32_t pos_slots = + (nir->info.per_view_outputs & VARYING_BIT_POS) ? 
+ MAX2(1, util_bitcount(key->base.view_mask)) : + 1; + + /* Only position is allowed to be per-view */ + assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS)); + + brw_compute_vue_map(devinfo, &prog_data->vue.vue_map, + nir->info.outputs_written, key->base.vue_layout, + pos_slots); + + brw_nir_apply_key(pt, &key->base, simd_width); + + prog_data->vs.inputs_read = nir->info.inputs_read; + prog_data->vs.double_inputs_read = nir->info.vs.double_inputs; + prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction; + + brw_nir_lower_vs_inputs(nir); + brw_nir_lower_vue_outputs(nir); + BRW_NIR_SNAPSHOT("after_lower_io"); + + memset(prog_data->vs.vf_component_packing, 0, + sizeof(prog_data->vs.vf_component_packing)); + if (key->vs.vf_component_packing) { + nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs); + } + + /* Get constant offsets out of the way for proper clip/cull handling */ + BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); + BRW_NIR_PASS(nir_opt_constant_folding); + BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo, + &prog_data->vue.vue_map, 0, 0); + } else if (stage == MESA_SHADER_FRAGMENT) { + assert(key->fs.mesh_input == INTEL_NEVER && "todo"); + assert(!key->fs.force_dual_color_blend && "todo"); + brw_nir_apply_key(pt, &key->base, 32); + brw_nir_lower_fs_inputs(nir, devinfo, &key->fs); + brw_nir_lower_fs_outputs(nir); + NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); + + if (!brw_can_coherent_fb_fetch(devinfo)) + NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs); + + NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord, + nir_metadata_control_flow, NULL); + NIR_PASS(_, nir, nir_opt_barycentric, true); + + lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo, + key->fs.nr_color_regions, simd_width); + NIR_PASS(_, nir, nir_lower_helper_writes, true); + NIR_PASS(_, nir, nir_lower_is_helper_invocation); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation, + nir_metadata_control_flow, NULL); + + if (key->fs.alpha_to_coverage != INTEL_NEVER) { + /* Run constant fold optimization in order to get the correct source + * offset to determine render target 0 store instruction in + * emit_alpha_to_coverage pass. + */ + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage); + } + + // TODO + // NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); + + if (!brw_fs_prog_key_is_dynamic(&key->fs)) { + uint32_t f = 0; + + if (key->fs.multisample_fbo == INTEL_ALWAYS) + f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO; + + if (key->fs.alpha_to_coverage == INTEL_ALWAYS) + f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE; + + if (key->fs.provoking_vertex_last == INTEL_ALWAYS) + f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST; + + if (key->fs.persample_interp == INTEL_ALWAYS) { + f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH | + INTEL_FS_CONFIG_PERSAMPLE_INTERP; + } + + NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel, + f); + } + } else { + brw_nir_apply_key(pt, &key->base, simd_width); + } + + brw_postprocess_nir_opts(pt); + + NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd, + nir_metadata_control_flow, &simd_width); + NIR_PASS(_, nir, nir_opt_algebraic_late); + NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16); + + /* Late postprocess while remaining in SSA */ + /* Run fsign lowering again after the last time brw_nir_optimize is called. 
+ * As is the case with conversion lowering (below), brw_nir_optimize can
+ * create additional fsign instructions.
+ */
+ NIR_PASS(_, nir, jay_nir_lower_fsign);
+ NIR_PASS(_, nir, jay_nir_lower_bool);
+ NIR_PASS(_, nir, nir_opt_cse);
+ NIR_PASS(_, nir, nir_opt_dce);
+ NIR_PASS(_, nir, jay_nir_opt_sel_zero);
+
+ /* Run nir_split_conversions only after the last time
+ * brw_nir_optimize is called. Various optimizations invoked there can
+ * rematerialize the conversions that the lowering pass eliminates.
+ */
+ const nir_split_conversions_options split_conv_opts = {
+ .callback = intel_nir_split_conversions_cb,
+ };
+ NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts);
+
+ /* Do this only after the last opt_gcm. GCM will undo this lowering. */
+ if (stage == MESA_SHADER_FRAGMENT) {
+ NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample);
+ }
+
+ NIR_PASS(_, nir, nir_opt_constant_folding);
+ NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
+ NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
+ NIR_PASS(_, nir, nir_opt_copy_prop);
+ NIR_PASS(_, nir, nir_opt_dce);
+
+ /* Run divergence analysis at the end */
+ nir_sweep(nir);
+ nj_index_ssa_defs(nir);
+ nir_divergence_analysis(nir);
+
+ jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs);
+ return simd_width;
+}
diff --git a/src/intel/compiler/jay/jay_private.h b/src/intel/compiler/jay/jay_private.h
index 2799eaa7b7b..e0ceaebdaed 100644
--- a/src/intel/compiler/jay/jay_private.h
+++ b/src/intel/compiler/jay/jay_private.h
@@ -22,6 +22,16 @@ bool jay_nir_lower_bool(nir_shader *nir);
 bool jay_nir_opt_sel_zero(nir_shader *nir);
 bool jay_nir_lower_fsign(nir_shader *nir);
+void jay_populate_prog_data(const struct intel_device_info *devinfo,
+ nir_shader *nir,
+ union brw_any_prog_data *prog_data,
+ union brw_any_prog_key *key,
+ unsigned nr_packed_regs);
+unsigned jay_process_nir(const struct intel_device_info *devinfo,
+ nir_shader *nir,
+ union brw_any_prog_data *prog_data,
+ union brw_any_prog_key *key);
+
 void jay_compute_liveness(jay_function *f);
 void jay_calculate_register_demands(jay_function *f);
@@ -63,6 +73,7 @@ void jay_lower_post_ra(jay_shader *s);
 void jay_lower_spill(jay_function *func);
 void jay_lower_simd_width(jay_shader *s);
 void jay_lower_scoreboard(jay_shader *s);
+void jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes);
 struct jay_shader_bin *
 jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size);
diff --git a/src/intel/compiler/jay/jay_prog_data.c b/src/intel/compiler/jay/jay_prog_data.c
new file mode 100644
index 00000000000..bc56c13dae6
--- /dev/null
+++ b/src/intel/compiler/jay/jay_prog_data.c
@@ -0,0 +1,581 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "compiler/brw/brw_compiler.h"
+#include "compiler/brw/brw_nir.h"
+#include "compiler/intel_nir.h"
+#include "jay_private.h"
+#include "nir.h"
+
+static inline enum intel_barycentric_mode
+brw_barycentric_mode(const struct brw_fs_prog_key *key,
+ nir_intrinsic_instr *intr)
+{
+ const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr);
+
+ /* Barycentric modes don't make sense for flat inputs. */
+ assert(mode != INTERP_MODE_FLAT);
+
+ unsigned bary;
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_barycentric_pixel:
+ case nir_intrinsic_load_barycentric_at_offset:
+ /* When per sample interpolation is dynamic, assume sample interpolation.
+ * We'll dynamically remap things so that the FS payload is not affected.
+ */ + bary = key->persample_interp == INTEL_SOMETIMES ? + INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE : + INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL; + break; + case nir_intrinsic_load_barycentric_centroid: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID; + break; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE; + break; + default: + UNREACHABLE("invalid intrinsic"); + } + + if (mode == INTERP_MODE_NOPERSPECTIVE) + bary += 3; + + return (enum intel_barycentric_mode) bary; +} + +struct fs_info_ctx { + const struct brw_fs_prog_key *key; + struct brw_fs_prog_data *prog_data; + const struct intel_device_info *devinfo; +}; + +static bool +gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + struct fs_info_ctx *ctx = data; + struct brw_fs_prog_data *prog_data = ctx->prog_data; + + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + prog_data->barycentric_interp_modes |= + 1 << brw_barycentric_mode(ctx->key, intr); + break; + + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { + unsigned mode = brw_barycentric_mode(ctx->key, intr); + prog_data->barycentric_interp_modes |= 1 << mode; + prog_data->uses_sample_offsets |= + mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE || + mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE; + + if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) + prog_data->uses_npc_bary_coefficients = true; + else + prog_data->uses_pc_bary_coefficients = true; + break; + } + + case nir_intrinsic_load_frag_coord_z: + prog_data->uses_src_depth = true; + break; + + case nir_intrinsic_load_frag_coord_w_rcp: + prog_data->uses_src_w = true; + break; + + case nir_intrinsic_load_sample_mask_in: + /* TODO: Sample masks are broken and discards are broken and simd32 + * layouts are broken too. XXX. + */ + // prog_data->uses_sample_mask = true; + break; + + case nir_intrinsic_load_pixel_coord_intel: + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + break; + + default: + break; + } + + return false; +} + +static void +brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data, + const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_shader_in_variable(var, shader) { + if (var->data.interpolation != INTERP_MODE_FLAT || + var->data.per_primitive) + continue; + + unsigned slots = glsl_count_attribute_slots(var->type, false); + for (unsigned s = 0; s < slots; s++) { + int input_index = prog_data->urb_setup[var->data.location + s]; + + if (input_index >= 0) + prog_data->flat_inputs |= 1 << input_index; + } + } +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + /* We initially set this to OFF, but having the shader write the + * depth means we allocate register space in the SEND message. The + * difference between the SEND register count and the OFF state + * programming makes the HW hang. + * + * Removing the depth writes also leads to test failures. 
So use + * LesserThanOrEqual, which fits writing the same value + * (unchanged/equal). + * + */ + return BRW_PSCDEPTH_ON_LE; + } + } + return BRW_PSCDEPTH_OFF; +} + +/* + * Build up an array of indices into the urb_setup array that + * references the active entries of the urb_setup array. + * Used to accelerate walking the active entries of the urb_setup array + * on each upload. + */ +static void +brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data) +{ + /* TODO(mesh): Review usage of this in the context of Mesh, we may want to + * skip per-primitive attributes here. + */ + + /* Make sure uint8_t is sufficient */ + static_assert(VARYING_SLOT_MAX <= 0xff); + uint8_t index = 0; + for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (fs_prog_data->urb_setup[attr] >= 0) { + fs_prog_data->urb_setup_attribs[index++] = attr; + } + } + fs_prog_data->urb_setup_attribs_count = index; +} + +static void +calculate_urb_setup(const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + nir_shader *nir, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); + int urb_next = 0; /* in vec4s */ + + /* Figure out where the PrimitiveID lives, either in the per-vertex block + * or in the per-primitive block or both. + */ + const uint64_t per_vert_primitive_id = + key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t per_prim_primitive_id = + key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t inputs_read = + nir->info.inputs_read & + (~nir->info.per_primitive_inputs | per_vert_primitive_id); + const uint64_t per_primitive_header_bits = + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; + const uint64_t per_primitive_inputs = + nir->info.inputs_read & + (nir->info.per_primitive_inputs | per_prim_primitive_id) & + ~per_primitive_header_bits; + struct intel_vue_map vue_map; + uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX; + + if (mue_map != NULL) { + memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map)); + memcpy(per_primitive_offsets, mue_map->per_primitive_offsets, + sizeof(mue_map->per_primitive_offsets)); + + if (!mue_map->wa_18019110168_active) { + u_foreach_bit64(location, per_primitive_inputs) { + assert(per_primitive_offsets[location] != -1); + + first_read_offset = + MIN2(first_read_offset, + (uint32_t) per_primitive_offsets[location]); + per_primitive_stride = + MAX2((uint32_t) per_primitive_offsets[location] + 16, + per_primitive_stride); + } + } else { + first_read_offset = per_primitive_stride = 0; + } + } else { + brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout, + 1 /* pos_slots, TODO */); + brw_compute_per_primitive_map(per_primitive_offsets, + &per_primitive_stride, &first_read_offset, + 0, nir, nir_var_shader_in, + per_primitive_inputs, + true /* separate_shader */); + } + + if (per_primitive_stride > first_read_offset) { + first_read_offset = ROUND_DOWN_TO(first_read_offset, 32); + + /* Remove the first few unused registers */ + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_primitive_offsets[i] == -1) + continue; + per_primitive_offsets[i] -= first_read_offset; + } + + prog_data->num_per_primitive_inputs = + 2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32); + } else { + prog_data->num_per_primitive_inputs = 0; + } + + /* Now do the per-vertex 
stuff (what used to be legacy pipeline) */ + + /* If Mesh is involved, we cannot do any packing. Documentation doesn't say + * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when + * using Mesh. + */ + if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) { + /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do + * arbitrary rearrangement of the first 16 varying inputs, so we can put + * them wherever we want. Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with a + * different vertex (or geometry) shader. + */ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them in + * an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). + */ + int first_slot = 0; + for (int i = 0; i < vue_map.num_slots; i++) { + int varying = vue_map.slot_to_varying[i]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) { + first_slot = ROUND_DOWN_TO(i, 2); + break; + } + } + + for (int slot = first_slot; slot < vue_map.num_slots; slot++) { + int varying = vue_map.slot_to_varying[slot]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = vue_map.num_slots - first_slot; + } + + prog_data->num_varying_inputs = urb_next; + prog_data->inputs = inputs_read; + prog_data->per_primitive_inputs = per_primitive_inputs; + + brw_compute_urb_setup_index(prog_data); +} + +static void +populate_fs_prog_data(nir_shader *shader, + const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + struct fs_info_ctx ctx = { + .key = key, + .prog_data = prog_data, + .devinfo = devinfo, + }; + nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx); + + prog_data->uses_kill = shader->info.fs.uses_discard; + prog_data->uses_omask = + !key->ignore_sample_mask_out && + (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->max_polygons = 1; + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->sample_shading = shader->info.fs.uses_sample_shading; + prog_data->api_sample_shading = key->api_sample_shading; + prog_data->min_sample_shading = key->min_sample_shading; + + assert(key->multisample_fbo != INTEL_NEVER || + key->persample_interp == INTEL_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = INTEL_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = + MIN2(prog_data->persample_dispatch, key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. 
+ */
+ prog_data->alpha_to_coverage = key->alpha_to_coverage;
+
+ assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER);
+ prog_data->mesh_input = key->mesh_input;
+
+ assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER);
+ prog_data->provoking_vertex_last = key->provoking_vertex_last;
+
+ /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select
+ * POSOFFSET_SAMPLE"
+ *
+ * So we can only really get sample positions if we are doing real
+ * per-sample dispatch. If we need gl_SamplePosition and we don't have
+ * persample dispatch, we hard-code it to 0.5.
+ */
+ prog_data->uses_pos_offset =
+ prog_data->persample_dispatch != INTEL_NEVER &&
+ (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
+ BITSET_TEST(shader->info.system_values_read,
+ SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
+
+ prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
+ prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
+ prog_data->inner_coverage = shader->info.fs.inner_coverage;
+
+ /* From the BDW PRM documentation for 3DSTATE_WM:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
+ * Sample or Non- perspective Sample barycentric coordinates."
+ *
+ * So clean up any potentially set sample barycentric mode when not in per
+ * sample dispatch.
+ */
+ if (prog_data->persample_dispatch == INTEL_NEVER) {
+ prog_data->barycentric_interp_modes &=
+ ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
+ }
+
+ if (devinfo->ver >= 20) {
+ prog_data->vertex_attributes_bypass =
+ brw_needs_vertex_attributes_bypass(shader);
+ }
+
+ prog_data->uses_nonperspective_interp_modes =
+ (prog_data->barycentric_interp_modes &
+ INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
+ prog_data->uses_npc_bary_coefficients;
+
+ /* The current VK_EXT_graphics_pipeline_library specification requires
+ * coarse to be specified at compile time. But per sample interpolation can
+ * be dynamic. So we should never be in a situation where coarse &
+ * persample_interp are both respectively true & INTEL_ALWAYS.
+ *
+ * Coarse will be dynamically turned off when persample_interp is active.
+ */
+ assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS);
+
+ prog_data->coarse_pixel_dispatch =
+ intel_sometimes_invert(prog_data->persample_dispatch);
+ if (!key->coarse_pixel ||
+ /* DG2 should support this, but Wa_22012766191 says there are issues
+ * with CPS 1x1 + MSAA + FS writing to oMask.
+ */
+ (devinfo->verx10 < 200 &&
+ (prog_data->uses_omask || prog_data->uses_sample_mask)) ||
+ prog_data->sample_shading ||
+ (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
+ prog_data->computed_stencil ||
+ devinfo->ver < 11) {
+ prog_data->coarse_pixel_dispatch = INTEL_NEVER;
+ }
+
+ /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
+ * Message Descriptor :
+ *
+ * "Message Type. Specifies the type of message being sent when
+ * pixel-rate evaluation is requested :
+ *
+ * Format = U2
+ * 0: Per Message Offset (eval_snapped with immediate offset)
+ * 1: Sample Position Offset (eval_sindex)
+ * 2: Centroid Position Offset (eval_centroid)
+ * 3: Per Slot Offset (eval_snapped with register offset)
+ *
+ * Message Type. 
Specifies the type of message being sent when + * coarse-rate evaluation is requested : + * + * Format = U2 + * 0: Coarse to Pixel Mapping Message (internal message) + * 1: Reserved + * 2: Coarse Centroid Position (eval_centroid) + * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" + * + * The Sample Position Offset is marked as reserved for coarse rate + * evaluation and leads to hangs if we try to use it. So disable coarse + * pixel shading if we have any intrinsic that will result in a pixel + * interpolater message at sample. + */ + if (intel_nir_pulls_at_sample(shader)) + prog_data->coarse_pixel_dispatch = INTEL_NEVER; + + /* We choose to always enable VMask prior to XeHP, as it would cause + * us to lose out on the eliminate_find_live_channel() optimization. + */ + prog_data->uses_vmask = + devinfo->verx10 < 125 || + shader->info.fs.needs_coarse_quad_helper_invocations || + shader->info.uses_wide_subgroup_intrinsics || + prog_data->coarse_pixel_dispatch != INTEL_NEVER; + + prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients; + + if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) { + prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth; + prog_data->uses_src_depth = false; + } + + calculate_urb_setup(devinfo, key, prog_data, shader, mue_map, + per_primitive_offsets); + brw_compute_flat_inputs(prog_data, shader); + + prog_data->has_side_effects = shader->info.writes_memory; +} + +static void +populate_vs_prog_data(nir_shader *nir, + const struct intel_device_info *devinfo, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + unsigned nr_packed_regs) +{ + unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read); + BITSET_WORD *sysvals = nir->info.system_values_read; + + /* gl_VertexID and gl_InstanceID are system values, but arrive via an + * incoming vertex attribute. So, add an extra slot. + */ + if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) || + BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) || + BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) || + BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) { + nr_attribute_slots++; + } + + /* gl_DrawID and IsIndexedDraw share its very own vec4 */ + if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) || + BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) { + nr_attribute_slots++; + } + + const struct { + bool *data; + gl_system_value val; + } bool_sysvals[] = { + { &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW }, + { &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX }, + { &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE }, + { &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE }, + { &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID }, + { &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID }, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) { + *bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val); + } + + unsigned nr_attribute_regs; + if (key->vf_component_packing) { + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8); + nr_attribute_regs = nr_packed_regs; + } else { + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2); + nr_attribute_regs = 4 * nr_attribute_slots; + } + + /* Since vertex shaders reuse the same VUE entry for inputs and outputs + * (overwriting the original contents), we need to make sure the size is + * the larger of the two. 
+ */ + const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4), + prog_data->base.vue_map.num_slots); + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; +} + +void +jay_populate_prog_data(const struct intel_device_info *devinfo, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key, + unsigned nr_packed_regs) +{ + if (nir->info.stage == MESA_SHADER_VERTEX) { + populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs, + nr_packed_regs); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + int per_primitive_offsets[VARYING_SLOT_MAX]; + memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); + + populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs, + NULL /* TODO: mue_map */, per_primitive_offsets); + } else if (mesa_shader_stage_is_compute(nir->info.stage)) { + prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr; + prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr; + } + + if (nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_TESS_EVAL || + nir->info.stage == MESA_SHADER_GEOMETRY || + nir->info.stage == MESA_SHADER_MESH) { + + uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size); + uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size, + nir->info.cull_distance_array_size); + + if (nir->info.stage == MESA_SHADER_MESH) { + prog_data->mesh.clip_distance_mask = clip_mask; + prog_data->mesh.cull_distance_mask = cull_mask; + } else { + prog_data->vue.clip_distance_mask = clip_mask; + prog_data->vue.cull_distance_mask = cull_mask; + } + } +} diff --git a/src/intel/compiler/jay/meson.build b/src/intel/compiler/jay/meson.build index e9c47ada78c..492d04c8bb2 100644 --- a/src/intel/compiler/jay/meson.build +++ b/src/intel/compiler/jay/meson.build @@ -50,16 +50,19 @@ libintel_compiler_jay_files = files( 'jay_assign_flags.c', 'jay_from_nir.c', 'jay_ir.h', + 'jay_insert_fp_mode.c', 'jay_liveness.c', 'jay_lower_post_ra.c', 'jay_lower_pre_ra.c', 'jay_lower_scoreboard.c', 'jay_lower_spill.c', + 'jay_nir.c', 'jay_opt_dead_code.c', 'jay_opt_control_flow.c', 'jay_opt_propagate.c', 'jay_print.c', 'jay_private.h', + 'jay_prog_data.c', 'jay_repair_ssa.c', 'jay_register_allocate.c', 'jay_simd_width.c',