jay: split up jay_from_nir.c

Big monolithic file, split it up into the relevant pieces.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40960>
Alyssa Rosenzweig 2026-04-14 11:25:11 -04:00 committed by Marge Bot
parent 6925d9ee23
commit 4eb838eb48
6 changed files with 1149 additions and 1094 deletions

jay_from_nir.c: file diff suppressed because it is too large

jay_insert_fp_mode.c

@@ -0,0 +1,85 @@
/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "jay_builder.h"
#include "jay_ir.h"
static void
set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired)
{
/* Only touch cr0 if we are changing bits */
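/* Writing (*cr0 ^ desired) as the XOR immediate flips exactly the bits that
* differ between the current and desired state, so e.g. changing only the
* rounding mode leaves the denorm-preserve bits alone.
*/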
if ((*cr0) != desired) {
jay_builder b = jay_init_builder(f, cursor);
jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired);
*cr0 = desired;
}
}
void
jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes)
{
/* First, work out the global float control mode for the shader */
uint32_t global = 0x0;
/* Initially fp16 denorms are flushed-to-zero, handle preserve. */
if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) {
global |= BRW_CR0_FP16_DENORM_PRESERVE;
}
/* Initially fp32 denorms are flushed-to-zero, handle preserve.
*
* TODO: Optimize this, we have a dispatch bit.
*/
if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) {
global |= BRW_CR0_FP32_DENORM_PRESERVE;
}
/* Initially fp64 denorms are flushed to zero, handle preserve. */
if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) {
global |= BRW_CR0_FP64_DENORM_PRESERVE;
}
/* By default, we are in round-to-even mode. Note we do not permit setting
* round mode separately by bitsize, but this is OK for current APIs. The
* Vulkan driver sets roundingModeIndependence = NONE.
*
* TODO: Optimize this, there is a command buffer bit for it.
*/
if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) ||
((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) ||
((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) {
global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT);
}
uint32_t cr0 = 0;
jay_function *entrypoint = jay_shader_get_entrypoint(shader);
set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global);
/* Now handle per-instruction deltas to the global mode */
jay_foreach_function(shader, func) {
jay_foreach_block(func, block) {
uint32_t current = cr0;
jay_foreach_inst_in_block(block, I) {
uint32_t required = cr0;
enum jay_rounding_mode round =
(I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND;
if (round != JAY_ROUND) {
required &= ~BRW_CR0_RND_MODE_MASK;
required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT);
}
if (jay_type_is_any_float(I->type)) {
set_cr0(func, jay_before_inst(I), &current, required);
}
}
/* Restore to global state on block boundaries */
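/* Successor blocks can then assume cr0 holds the shader-wide default,
* regardless of which predecessor ran last.
*/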
if (jay_num_successors(block) > 0) {
set_cr0(func, jay_after_block(block), &current, cr0);
}
}
}
}

jay_nir.c

@@ -0,0 +1,462 @@
/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/brw/brw_eu.h"
#include "compiler/brw/brw_eu_defines.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/brw/brw_private.h"
#include "compiler/intel_nir.h"
#include "jay_private.h"
#include "nir.h"
#include "nir_builder.h"
/*
* NIR-to-Jay translation relies on a careful indexing of defs: every 32-bit word has
* its own index. Vectors/64-bit use contiguous indices. We therefore run a
* modified version of nir_index_ssa_defs right before translating NIR->Jay.
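* For example, a 32-bit vec4 takes four consecutive indices, a 64-bit scalar
* takes two, and an 8- or 16-bit scalar still takes one, since sub-32-bit
* sizes round up to a full word in index_ssa_def_cb below.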
*/
static bool
index_ssa_def_cb(nir_def *def, void *state)
{
unsigned *index = (unsigned *) state;
def->index = *index;
*index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32);
return true;
}
static void
nj_index_ssa_defs(nir_shader *nir)
{
nir_foreach_function_impl(impl, nir) {
/* The zero index means null in Jay, so start SSA indices at 1 */
unsigned index = 1;
nir_foreach_block_unstructured(block, impl) {
nir_foreach_instr(instr, block)
nir_foreach_def(instr, index_ssa_def_cb, &index);
}
impl->ssa_alloc = index;
}
}
static bool
lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_)
{
if (intr->intrinsic != nir_intrinsic_load_helper_invocation)
return false;
/* TODO: Is this right for multisampling? */
b->cursor = nir_before_instr(&intr->instr);
nir_def *active =
nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b)));
nir_def_replace(&intr->def, active);
return true;
}
static bool
lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
{
if (intr->intrinsic != nir_intrinsic_load_frag_coord &&
intr->intrinsic != nir_intrinsic_load_pixel_coord)
return false;
b->cursor = nir_before_instr(&intr->instr);
nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b));
if (intr->intrinsic == nir_intrinsic_load_frag_coord) {
c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)),
nir_u2f32(b, nir_channel(b, c, 1)), nir_load_frag_coord_z(b),
nir_frcp(b, nir_load_frag_coord_w_rcp(b)));
}
nir_def_replace(&intr->def, c);
return true;
}
static bool
jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
{
b->cursor = nir_after_instr(&intr->instr);
unsigned *simd_width = simd_;
/* mask & -mask isolates the lowest set bit in the mask. */
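/* e.g. 0b01100 & -0b01100 = 0b00100, so after inverse_ballot only the
* lowest-numbered live invocation sees true, which is exactly elect().
*/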
if (intr->intrinsic == nir_intrinsic_elect) {
nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b));
mask = nir_iand(b, mask, nir_ineg(b, mask));
nir_def_replace(&intr->def, nir_inverse_ballot(b, mask));
return true;
}
/* Ballots must match the SIMD size */
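/* e.g. a 64-bit ballot in a SIMD16 shader becomes a 16-bit def here and is
* then zero-extended back to 64 bits for its existing users.
*/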
if (intr->intrinsic == nir_intrinsic_ballot ||
intr->intrinsic == nir_intrinsic_ballot_relaxed) {
unsigned old_bitsize = intr->def.bit_size;
intr->def.bit_size = *simd_width;
nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize);
nir_def_rewrite_uses_after(&intr->def, u2uN);
return true;
}
/* Note: we don't treat read_invocation specially because there's little
* benefit and doing so would require expensive uniformizing in some cases.
*/
if (intr->intrinsic != nir_intrinsic_shuffle &&
intr->intrinsic != nir_intrinsic_read_invocation)
return false;
nir_def *data = intr->src[0].ssa;
assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized");
nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4);
nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B));
return true;
}
struct frag_out_ctx {
nir_def *colour[8], *depth, *stencil, *sample_mask;
};
static bool
collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_)
{
struct frag_out_ctx *ctx = ctx_;
if (intr->intrinsic != nir_intrinsic_store_output)
return false;
unsigned wrmask = nir_intrinsic_write_mask(intr);
assert(nir_intrinsic_component(intr) == 0 && "component should be lowered");
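/* wrmask + 1 being a power of two means the write mask is a contiguous run
* of components starting at x, e.g. 0x1, 0x3, 0x7 or 0xf.
*/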
assert(util_is_power_of_two_nonzero(wrmask + 1) &&
"complex writemasks should be lowered");
/* TODO: Optimize with write mask? */
gl_frag_result loc = nir_intrinsic_io_semantics(intr).location;
assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo");
nir_def **out;
if (loc == FRAG_RESULT_COLOR) {
out = &ctx->colour[0];
} else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
out = &ctx->colour[loc - FRAG_RESULT_DATA0];
} else if (loc == FRAG_RESULT_DEPTH) {
out = &ctx->depth;
} else if (loc == FRAG_RESULT_STENCIL) {
UNREACHABLE("todo");
out = &ctx->stencil;
} else if (loc == FRAG_RESULT_SAMPLE_MASK) {
UNREACHABLE("todo");
out = &ctx->sample_mask;
} else {
UNREACHABLE("invalid location");
}
assert((*out) == NULL && "each location written exactly once");
*out = intr->src[0].ssa;
nir_instr_remove(&intr->instr);
return true;
}
static void
append_payload(nir_builder *b,
nir_def **payload,
unsigned *len,
unsigned max_len,
nir_def *value)
{
if (value != NULL) {
for (unsigned i = 0; i < value->num_components; ++i) {
payload[*len] = nir_channel(b, value, i);
(*len)++;
assert((*len) <= max_len);
}
}
}
static void
insert_rt_store(nir_builder *b,
const struct intel_device_info *devinfo,
signed target,
bool last,
nir_def *colour,
nir_def *src0_alpha,
nir_def *depth,
nir_def *stencil,
nir_def *sample_mask,
unsigned dispatch_width)
{
bool null_rt = target < 0;
target = MAX2(target, 0);
if (!colour) {
colour = nir_undef(b, 4, 32);
}
colour = nir_pad_vec4(b, colour);
if (null_rt) {
/* Even if we don't write a RT, we still need to write alpha for
* alpha-to-coverage and alpha testing. Optimize the other channels out.
*/
colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32),
nir_channel(b, colour, 3), 3);
}
/* TODO: Not sure I like this. We'll see what 2src looks like. */
unsigned op = dispatch_width == 32 ?
XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
uint64_t desc =
brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */);
uint64_t ex_desc = 0;
if (devinfo->ver >= 20) {
ex_desc = target << 21 |
null_rt << 20 |
(src0_alpha ? (1 << 15) : 0) |
(stencil ? (1 << 14) : 0) |
(depth ? (1 << 13) : 0) |
(sample_mask ? (1 << 12) : 0);
} else if (devinfo->ver >= 11) {
/* Set the "Render Target Index" and "Src0 Alpha Present" fields
* in the extended message descriptor, in lieu of using a header.
*/
ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0);
}
/* Build the payload */
nir_def *payload[8] = { NULL };
unsigned len = 0;
append_payload(b, payload, &len, ARRAY_SIZE(payload), colour);
append_payload(b, payload, &len, ARRAY_SIZE(payload), depth);
/* TODO */
nir_def *disable = b->shader->info.fs.uses_discard ?
nir_is_helper_invocation(b, 1) :
nir_imm_false(b);
nir_store_render_target_intel(b, nir_vec(b, payload, len),
nir_imm_ivec2(b, desc, ex_desc), disable,
.eot = last);
}
static void
lower_fragment_outputs(nir_function_impl *impl,
const struct intel_device_info *devinfo,
unsigned nr_color_regions,
unsigned dispatch_width)
{
struct frag_out_ctx ctx = { { NULL } };
nir_function_intrinsics_pass(impl, collect_fragment_output,
nir_metadata_control_flow, &ctx);
nir_builder b_ = nir_builder_at(nir_after_impl(impl));
nir_builder *b = &b_;
assert(nr_color_regions <= ARRAY_SIZE(ctx.colour));
signed first = -1;
for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) {
if (ctx.colour[i]) {
first = i;
break;
}
}
/* Do the later render targets first */
for (unsigned i = first + 1; i < nr_color_regions; ++i) {
if (ctx.colour[i]) {
insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL,
NULL, dispatch_width);
}
}
/* Finally do render target zero, attaching all the sideband things and
* setting the LastRT bit. This needs to exist even if nothing is written
* since it also signals end-of-thread.
*/
insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true,
first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth,
ctx.stencil, ctx.sample_mask, dispatch_width);
}
unsigned
jay_process_nir(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key)
{
enum mesa_shader_stage stage = nir->info.stage;
struct brw_compiler compiler = { .devinfo = devinfo };
unsigned nr_packed_regs = 0;
brw_pass_tracker pt_ = {
.nir = nir,
.key = &key->base,
.dispatch_width = 0,
.compiler = &compiler,
.archiver = NULL, //params->base.archiver,
}, *pt = &pt_;
BRW_NIR_SNAPSHOT("first");
prog_data->base.ray_queries = nir->info.ray_queries;
prog_data->base.stage = stage;
// TODO: Make the driver do this?
// prog_data->base.source_hash = params->source_hash;
prog_data->base.total_shared = nir->info.shared_size;
/* TODO: Real heuristic */
bool do_simd32 = INTEL_SIMD(FS, 32);
do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT;
unsigned simd_width = do_simd32 ? (nir->info.api_subgroup_size ?: 32) : 16;
if (stage == MESA_SHADER_VERTEX) {
/* We only expect slot compaction to be disabled when using device
* generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS
* programming. This should always be enabled together with VF component
* packing to minimize the size of the payload.
*/
assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing);
/* When using Primitive Replication for multiview, each view gets its own
* position slot.
*/
const uint32_t pos_slots =
(nir->info.per_view_outputs & VARYING_BIT_POS) ?
MAX2(1, util_bitcount(key->base.view_mask)) :
1;
/* Only position is allowed to be per-view */
assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS));
brw_compute_vue_map(devinfo, &prog_data->vue.vue_map,
nir->info.outputs_written, key->base.vue_layout,
pos_slots);
brw_nir_apply_key(pt, &key->base, simd_width);
prog_data->vs.inputs_read = nir->info.inputs_read;
prog_data->vs.double_inputs_read = nir->info.vs.double_inputs;
prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction;
brw_nir_lower_vs_inputs(nir);
brw_nir_lower_vue_outputs(nir);
BRW_NIR_SNAPSHOT("after_lower_io");
memset(prog_data->vs.vf_component_packing, 0,
sizeof(prog_data->vs.vf_component_packing));
if (key->vs.vf_component_packing) {
nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs);
}
/* Get constant offsets out of the way for proper clip/cull handling */
BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
BRW_NIR_PASS(nir_opt_constant_folding);
BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo,
&prog_data->vue.vue_map, 0, 0);
} else if (stage == MESA_SHADER_FRAGMENT) {
assert(key->fs.mesh_input == INTEL_NEVER && "todo");
assert(!key->fs.force_dual_color_blend && "todo");
brw_nir_apply_key(pt, &key->base, 32);
brw_nir_lower_fs_inputs(nir, devinfo, &key->fs);
brw_nir_lower_fs_outputs(nir);
NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
if (!brw_can_coherent_fb_fetch(devinfo))
NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs);
NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord,
nir_metadata_control_flow, NULL);
NIR_PASS(_, nir, nir_opt_barycentric, true);
lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo,
key->fs.nr_color_regions, simd_width);
NIR_PASS(_, nir, nir_lower_helper_writes, true);
NIR_PASS(_, nir, nir_lower_is_helper_invocation);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation,
nir_metadata_control_flow, NULL);
if (key->fs.alpha_to_coverage != INTEL_NEVER) {
/* Run constant fold optimization in order to get the correct source
* offset to determine render target 0 store instruction in
* emit_alpha_to_coverage pass.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage);
}
// TODO
// NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
if (!brw_fs_prog_key_is_dynamic(&key->fs)) {
uint32_t f = 0;
if (key->fs.multisample_fbo == INTEL_ALWAYS)
f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO;
if (key->fs.alpha_to_coverage == INTEL_ALWAYS)
f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE;
if (key->fs.provoking_vertex_last == INTEL_ALWAYS)
f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST;
if (key->fs.persample_interp == INTEL_ALWAYS) {
f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH |
INTEL_FS_CONFIG_PERSAMPLE_INTERP;
}
NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel,
f);
}
} else {
brw_nir_apply_key(pt, &key->base, simd_width);
}
brw_postprocess_nir_opts(pt);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd,
nir_metadata_control_flow, &simd_width);
NIR_PASS(_, nir, nir_opt_algebraic_late);
NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16);
/* Late postprocess while remaining in SSA */
/* Run fsign lowering again after the last time brw_nir_optimize is called.
* As is the case with conversion lowering (below), brw_nir_optimize can
* create additional fsign instructions.
*/
NIR_PASS(_, nir, jay_nir_lower_fsign);
NIR_PASS(_, nir, jay_nir_lower_bool);
NIR_PASS(_, nir, nir_opt_cse);
NIR_PASS(_, nir, nir_opt_dce);
NIR_PASS(_, nir, jay_nir_opt_sel_zero);
/* Run nir_split_conversions only after the last time
* brw_nir_optimize is called. Various optimizations invoked there can
* rematerialize the conversions that the lowering pass eliminates.
*/
const nir_split_conversions_options split_conv_opts = {
.callback = intel_nir_split_conversions_cb,
};
NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts);
/* Do this only after the last opt_gcm. GCM will undo this lowering. */
if (stage == MESA_SHADER_FRAGMENT) {
NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample);
}
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
NIR_PASS(_, nir, nir_opt_copy_prop);
NIR_PASS(_, nir, nir_opt_dce);
/* Run divergence analysis at the end */
nir_sweep(nir);
nj_index_ssa_defs(nir);
nir_divergence_analysis(nir);
jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs);
return simd_width;
}

jay_private.h

@@ -22,6 +22,16 @@ bool jay_nir_lower_bool(nir_shader *nir);
bool jay_nir_opt_sel_zero(nir_shader *nir);
bool jay_nir_lower_fsign(nir_shader *nir);
void jay_populate_prog_data(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key,
unsigned nr_packed_regs);
unsigned jay_process_nir(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key);
void jay_compute_liveness(jay_function *f);
void jay_calculate_register_demands(jay_function *f);
@@ -63,6 +73,7 @@ void jay_lower_post_ra(jay_shader *s);
void jay_lower_spill(jay_function *func);
void jay_lower_simd_width(jay_shader *s);
void jay_lower_scoreboard(jay_shader *s);
void jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes);
struct jay_shader_bin *
jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size);

jay_prog_data.c

@@ -0,0 +1,581 @@
/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/brw/brw_compiler.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/intel_nir.h"
#include "jay_private.h"
#include "nir.h"
static inline enum intel_barycentric_mode
brw_barycentric_mode(const struct brw_fs_prog_key *key,
nir_intrinsic_instr *intr)
{
const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr);
/* Barycentric modes don't make sense for flat inputs. */
assert(mode != INTERP_MODE_FLAT);
unsigned bary;
switch (intr->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_at_offset:
/* When per sample interpolation is dynamic, assume sample interpolation.
* We'll dynamically remap things so that the FS payload is not affected.
*/
bary = key->persample_interp == INTEL_SOMETIMES ?
INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
break;
case nir_intrinsic_load_barycentric_centroid:
bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
break;
case nir_intrinsic_load_barycentric_sample:
case nir_intrinsic_load_barycentric_at_sample:
bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
break;
default:
UNREACHABLE("invalid intrinsic");
}
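/* The nonperspective barycentric modes follow their perspective counterparts
* in enum intel_barycentric_mode, three entries apart, which is what makes
* the fixed offset below work.
*/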
if (mode == INTERP_MODE_NOPERSPECTIVE)
bary += 3;
return (enum intel_barycentric_mode) bary;
}
struct fs_info_ctx {
const struct brw_fs_prog_key *key;
struct brw_fs_prog_data *prog_data;
const struct intel_device_info *devinfo;
};
static bool
gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct fs_info_ctx *ctx = data;
struct brw_fs_prog_data *prog_data = ctx->prog_data;
switch (intr->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
prog_data->barycentric_interp_modes |=
1 << brw_barycentric_mode(ctx->key, intr);
break;
case nir_intrinsic_load_barycentric_at_sample:
case nir_intrinsic_load_barycentric_at_offset: {
unsigned mode = brw_barycentric_mode(ctx->key, intr);
prog_data->barycentric_interp_modes |= 1 << mode;
prog_data->uses_sample_offsets |=
mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE ||
mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE;
if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS)
prog_data->uses_npc_bary_coefficients = true;
else
prog_data->uses_pc_bary_coefficients = true;
break;
}
case nir_intrinsic_load_frag_coord_z:
prog_data->uses_src_depth = true;
break;
case nir_intrinsic_load_frag_coord_w_rcp:
prog_data->uses_src_w = true;
break;
case nir_intrinsic_load_sample_mask_in:
/* TODO: Sample masks are broken and discards are broken and simd32
* layouts are broken too. XXX.
*/
// prog_data->uses_sample_mask = true;
break;
case nir_intrinsic_load_pixel_coord_intel:
BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
break;
default:
break;
}
return false;
}
static void
brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data,
const nir_shader *shader)
{
prog_data->flat_inputs = 0;
nir_foreach_shader_in_variable(var, shader) {
if (var->data.interpolation != INTERP_MODE_FLAT ||
var->data.per_primitive)
continue;
unsigned slots = glsl_count_attribute_slots(var->type, false);
for (unsigned s = 0; s < slots; s++) {
int input_index = prog_data->urb_setup[var->data.location + s];
if (input_index >= 0)
prog_data->flat_inputs |= 1 << input_index;
}
}
}
static uint8_t
computed_depth_mode(const nir_shader *shader)
{
if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
switch (shader->info.fs.depth_layout) {
case FRAG_DEPTH_LAYOUT_NONE:
case FRAG_DEPTH_LAYOUT_ANY:
return BRW_PSCDEPTH_ON;
case FRAG_DEPTH_LAYOUT_GREATER:
return BRW_PSCDEPTH_ON_GE;
case FRAG_DEPTH_LAYOUT_LESS:
return BRW_PSCDEPTH_ON_LE;
case FRAG_DEPTH_LAYOUT_UNCHANGED:
/* We initially set this to OFF, but having the shader write the
* depth means we allocate register space in the SEND message. The
* difference between the SEND register count and the OFF state
* programming makes the HW hang.
*
* Removing the depth writes also leads to test failures. So use
* LesserThanOrEqual, which fits writing the same value
* (unchanged/equal).
*
*/
return BRW_PSCDEPTH_ON_LE;
}
}
return BRW_PSCDEPTH_OFF;
}
/*
* Build up an array of indices into the urb_setup array that
* references the active entries of the urb_setup array.
* Used to accelerate walking the active entries of the urb_setup array
* on each upload.
*/
static void
brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data)
{
/* TODO(mesh): Review usage of this in the context of Mesh, we may want to
* skip per-primitive attributes here.
*/
/* Make sure uint8_t is sufficient */
static_assert(VARYING_SLOT_MAX <= 0xff);
uint8_t index = 0;
for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
if (fs_prog_data->urb_setup[attr] >= 0) {
fs_prog_data->urb_setup_attribs[index++] = attr;
}
}
fs_prog_data->urb_setup_attribs_count = index;
}
static void
calculate_urb_setup(const struct intel_device_info *devinfo,
const struct brw_fs_prog_key *key,
struct brw_fs_prog_data *prog_data,
nir_shader *nir,
const struct brw_mue_map *mue_map,
int *per_primitive_offsets)
{
memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
int urb_next = 0; /* in vec4s */
/* Figure out where the PrimitiveID lives, either in the per-vertex block
* or in the per-primitive block or both.
*/
const uint64_t per_vert_primitive_id =
key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID;
const uint64_t per_prim_primitive_id =
key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID;
const uint64_t inputs_read =
nir->info.inputs_read &
(~nir->info.per_primitive_inputs | per_vert_primitive_id);
const uint64_t per_primitive_header_bits =
VARYING_BIT_PRIMITIVE_SHADING_RATE |
VARYING_BIT_LAYER |
VARYING_BIT_VIEWPORT |
VARYING_BIT_CULL_PRIMITIVE;
const uint64_t per_primitive_inputs =
nir->info.inputs_read &
(nir->info.per_primitive_inputs | per_prim_primitive_id) &
~per_primitive_header_bits;
struct intel_vue_map vue_map;
uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX;
if (mue_map != NULL) {
memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map));
memcpy(per_primitive_offsets, mue_map->per_primitive_offsets,
sizeof(mue_map->per_primitive_offsets));
if (!mue_map->wa_18019110168_active) {
u_foreach_bit64(location, per_primitive_inputs) {
assert(per_primitive_offsets[location] != -1);
first_read_offset =
MIN2(first_read_offset,
(uint32_t) per_primitive_offsets[location]);
per_primitive_stride =
MAX2((uint32_t) per_primitive_offsets[location] + 16,
per_primitive_stride);
}
} else {
first_read_offset = per_primitive_stride = 0;
}
} else {
brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout,
1 /* pos_slots, TODO */);
brw_compute_per_primitive_map(per_primitive_offsets,
&per_primitive_stride, &first_read_offset,
0, nir, nir_var_shader_in,
per_primitive_inputs,
true /* separate_shader */);
}
if (per_primitive_stride > first_read_offset) {
first_read_offset = ROUND_DOWN_TO(first_read_offset, 32);
/* Remove the first few unused registers */
for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) {
if (per_primitive_offsets[i] == -1)
continue;
per_primitive_offsets[i] -= first_read_offset;
}
prog_data->num_per_primitive_inputs =
2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32);
} else {
prog_data->num_per_primitive_inputs = 0;
}
/* Now do the per-vertex stuff (what used to be legacy pipeline) */
/* If Mesh is involved, we cannot do any packing. Documentation doesn't say
* anything about this but 3DSTATE_SBE_SWIZ does not appear to work when
* using Mesh.
*/
if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) {
/* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do
* arbitrary rearrangement of the first 16 varying inputs, so we can put
* them wherever we want. Just put them in order.
*
* This is useful because it means that (a) inputs not used by the
* fragment shader won't take up valuable register space, and (b) we
* won't have to recompile the fragment shader if it gets paired with a
* different vertex (or geometry) shader.
*/
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
if (inputs_read & BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++;
}
}
} else {
/* We have enough input varyings that the SF/SBE pipeline stage can't
* arbitrarily rearrange them to suit our whim; we have to put them in
* an order that matches the output of the previous pipeline stage
* (geometry or vertex shader).
*/
int first_slot = 0;
for (int i = 0; i < vue_map.num_slots; i++) {
int varying = vue_map.slot_to_varying[i];
if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) {
first_slot = ROUND_DOWN_TO(i, 2);
break;
}
}
for (int slot = first_slot; slot < vue_map.num_slots; slot++) {
int varying = vue_map.slot_to_varying[slot];
if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) {
prog_data->urb_setup[varying] = slot - first_slot;
}
}
urb_next = vue_map.num_slots - first_slot;
}
prog_data->num_varying_inputs = urb_next;
prog_data->inputs = inputs_read;
prog_data->per_primitive_inputs = per_primitive_inputs;
brw_compute_urb_setup_index(prog_data);
}
static void
populate_fs_prog_data(nir_shader *shader,
const struct intel_device_info *devinfo,
const struct brw_fs_prog_key *key,
struct brw_fs_prog_data *prog_data,
const struct brw_mue_map *mue_map,
int *per_primitive_offsets)
{
struct fs_info_ctx ctx = {
.key = key,
.prog_data = prog_data,
.devinfo = devinfo,
};
nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx);
prog_data->uses_kill = shader->info.fs.uses_discard;
prog_data->uses_omask =
!key->ignore_sample_mask_out &&
(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
prog_data->max_polygons = 1;
prog_data->computed_depth_mode = computed_depth_mode(shader);
prog_data->computed_stencil =
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
prog_data->sample_shading = shader->info.fs.uses_sample_shading;
prog_data->api_sample_shading = key->api_sample_shading;
prog_data->min_sample_shading = key->min_sample_shading;
assert(key->multisample_fbo != INTEL_NEVER ||
key->persample_interp == INTEL_NEVER);
prog_data->persample_dispatch = key->persample_interp;
if (prog_data->sample_shading)
prog_data->persample_dispatch = INTEL_ALWAYS;
/* We can only persample dispatch if we have a multisample FBO */
prog_data->persample_dispatch =
MIN2(prog_data->persample_dispatch, key->multisample_fbo);
/* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
* persample_dispatch & multisample_fbo are not dynamic, Anv should be able
* to definitively tell whether alpha_to_coverage is on or off.
*/
prog_data->alpha_to_coverage = key->alpha_to_coverage;
assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER);
prog_data->mesh_input = key->mesh_input;
assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER);
prog_data->provoking_vertex_last = key->provoking_vertex_last;
/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
*
* "MSDISPMODE_PERSAMPLE is required in order to select
* POSOFFSET_SAMPLE"
*
* So we can only really get sample positions if we are doing real
* per-sample dispatch. If we need gl_SamplePosition and we don't have
* persample dispatch, we hard-code it to 0.5.
*/
prog_data->uses_pos_offset =
prog_data->persample_dispatch != INTEL_NEVER &&
(BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
BITSET_TEST(shader->info.system_values_read,
SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
prog_data->inner_coverage = shader->info.fs.inner_coverage;
/* From the BDW PRM documentation for 3DSTATE_WM:
*
* "MSDISPMODE_PERSAMPLE is required in order to select Perspective
* Sample or Non- perspective Sample barycentric coordinates."
*
* So cleanup any potentially set sample barycentric mode when not in per
* sample dispatch.
*/
if (prog_data->persample_dispatch == INTEL_NEVER) {
prog_data->barycentric_interp_modes &=
~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
}
if (devinfo->ver >= 20) {
prog_data->vertex_attributes_bypass =
brw_needs_vertex_attributes_bypass(shader);
}
prog_data->uses_nonperspective_interp_modes =
(prog_data->barycentric_interp_modes &
INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
prog_data->uses_npc_bary_coefficients;
/* The current VK_EXT_graphics_pipeline_library specification requires
* coarse to be specified at compile time. But per-sample interpolation can be
* dynamic. So we should never be in a situation where coarse &
* persample_interp are both respectively true & INTEL_ALWAYS.
*
* Coarse will be dynamically turned off when persample_interp is active.
*/
assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS);
prog_data->coarse_pixel_dispatch =
intel_sometimes_invert(prog_data->persample_dispatch);
if (!key->coarse_pixel ||
/* DG2 should support this, but Wa_22012766191 says there are issues
* with CPS 1x1 + MSAA + FS writing to oMask.
*/
(devinfo->verx10 < 200 &&
(prog_data->uses_omask || prog_data->uses_sample_mask)) ||
prog_data->sample_shading ||
(prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
prog_data->computed_stencil ||
devinfo->ver < 11) {
prog_data->coarse_pixel_dispatch = INTEL_NEVER;
}
/* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
* Message Descriptor :
*
* "Message Type. Specifies the type of message being sent when
* pixel-rate evaluation is requested :
*
* Format = U2
* 0: Per Message Offset (eval_snapped with immediate offset)
* 1: Sample Position Offset (eval_sindex)
* 2: Centroid Position Offset (eval_centroid)
* 3: Per Slot Offset (eval_snapped with register offset)
*
* Message Type. Specifies the type of message being sent when
* coarse-rate evaluation is requested :
*
* Format = U2
* 0: Coarse to Pixel Mapping Message (internal message)
* 1: Reserved
* 2: Coarse Centroid Position (eval_centroid)
* 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
*
* The Sample Position Offset is marked as reserved for coarse rate
* evaluation and leads to hangs if we try to use it. So disable coarse
* pixel shading if we have any intrinsic that will result in a pixel
* interpolater message at sample.
*/
if (intel_nir_pulls_at_sample(shader))
prog_data->coarse_pixel_dispatch = INTEL_NEVER;
/* We choose to always enable VMask prior to XeHP, as it would cause
* us to lose out on the eliminate_find_live_channel() optimization.
*/
prog_data->uses_vmask =
devinfo->verx10 < 125 ||
shader->info.fs.needs_coarse_quad_helper_invocations ||
shader->info.uses_wide_subgroup_intrinsics ||
prog_data->coarse_pixel_dispatch != INTEL_NEVER;
prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients;
if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) {
prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth;
prog_data->uses_src_depth = false;
}
calculate_urb_setup(devinfo, key, prog_data, shader, mue_map,
per_primitive_offsets);
brw_compute_flat_inputs(prog_data, shader);
prog_data->has_side_effects = shader->info.writes_memory;
}
static void
populate_vs_prog_data(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct brw_vs_prog_key *key,
struct brw_vs_prog_data *prog_data,
unsigned nr_packed_regs)
{
unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
BITSET_WORD *sysvals = nir->info.system_values_read;
/* gl_VertexID and gl_InstanceID are system values, but arrive via an
* incoming vertex attribute. So, add an extra slot.
*/
if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) ||
BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) ||
BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) {
nr_attribute_slots++;
}
/* gl_DrawID and IsIndexedDraw share their very own vec4 */
if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) ||
BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) {
nr_attribute_slots++;
}
const struct {
bool *data;
gl_system_value val;
} bool_sysvals[] = {
{ &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW },
{ &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX },
{ &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE },
{ &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE },
{ &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID },
{ &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID },
};
for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) {
*bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val);
}
unsigned nr_attribute_regs;
if (key->vf_component_packing) {
prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8);
nr_attribute_regs = nr_packed_regs;
} else {
prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2);
nr_attribute_regs = 4 * nr_attribute_slots;
}
/* Since vertex shaders reuse the same VUE entry for inputs and outputs
* (overwriting the original contents), we need to make sure the size is
* the larger of the two.
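* For example, 6 unpacked attribute slots give 24 attribute regs (6 vec4s);
* with a 10-slot VUE map, vue_entries = MAX2(6, 10) = 10 and the entry size
* rounds up to DIV_ROUND_UP(10, 4) = 3.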
*/
const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4),
prog_data->base.vue_map.num_slots);
prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
}
void
jay_populate_prog_data(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key,
unsigned nr_packed_regs)
{
if (nir->info.stage == MESA_SHADER_VERTEX) {
populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs,
nr_packed_regs);
} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
int per_primitive_offsets[VARYING_SLOT_MAX];
memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets));
populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs,
NULL /* TODO: mue_map */, per_primitive_offsets);
} else if (mesa_shader_stage_is_compute(nir->info.stage)) {
prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr;
prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr;
}
if (nir->info.stage == MESA_SHADER_VERTEX ||
nir->info.stage == MESA_SHADER_TESS_EVAL ||
nir->info.stage == MESA_SHADER_GEOMETRY ||
nir->info.stage == MESA_SHADER_MESH) {
uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size);
uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size,
nir->info.cull_distance_array_size);
if (nir->info.stage == MESA_SHADER_MESH) {
prog_data->mesh.clip_distance_mask = clip_mask;
prog_data->mesh.cull_distance_mask = cull_mask;
} else {
prog_data->vue.clip_distance_mask = clip_mask;
prog_data->vue.cull_distance_mask = cull_mask;
}
}
}

meson.build

@@ -50,16 +50,19 @@ libintel_compiler_jay_files = files(
'jay_assign_flags.c',
'jay_from_nir.c',
'jay_ir.h',
'jay_insert_fp_mode.c',
'jay_liveness.c',
'jay_lower_post_ra.c',
'jay_lower_pre_ra.c',
'jay_lower_scoreboard.c',
'jay_lower_spill.c',
'jay_nir.c',
'jay_opt_dead_code.c',
'jay_opt_control_flow.c',
'jay_opt_propagate.c',
'jay_print.c',
'jay_private.h',
'jay_prog_data.c',
'jay_repair_ssa.c',
'jay_register_allocate.c',
'jay_simd_width.c',