diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c
index e94adf7b89c..c98a061ab11 100644
--- a/src/intel/compiler/jay/jay_from_nir.c
+++ b/src/intel/compiler/jay/jay_from_nir.c
@@ -98,7 +98,6 @@ struct nir_to_jay_state {
       jay_def sampler_state_pointer, scratch_surface;
       jay_def inline_data;
       jay_def push_data[512];
-      jay_def lane_id;
       jay_def urb_handle;
 
       union {
@@ -1365,11 +1364,19 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
       break;
    }
 
-   case nir_intrinsic_load_subgroup_invocation:
+   case nir_intrinsic_load_subgroup_invocation: {
+      jay_def lid = jay_alloc_def(b, UGPR, s->dispatch_width / 2);
+      jay_LANE_ID_8(b, jay_extract_range(lid, 0, 4));
+
+      for (unsigned i = 8; i < s->dispatch_width; i *= 2) {
+         jay_ADD(b, JAY_TYPE_U16, jay_extract_range(lid, i / 2, i / 2),
+                 jay_extract_range(lid, 0, i / 2), i);
+      }
+
       /* TODO: Lower this in NIR? */
-      jay_CVT(b, JAY_TYPE_U32, dst, nj->payload.lane_id, JAY_TYPE_U16,
-              JAY_ROUND, 0);
+      jay_CVT(b, JAY_TYPE_U32, dst, lid, JAY_TYPE_U16, JAY_ROUND, 0);
       break;
+   }
 
    case nir_intrinsic_demote:
    case nir_intrinsic_demote_if:
@@ -1422,20 +1429,17 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
    }
 
    /* We prefer to inverse_ballot by copying a UGPR to the flag. If we have a
-    * GPR input, we could uniformize (as behaviour is undefined for
-    * non-uniform inputs) but a lowered bit extract is cheaper than uniformize.
+    * GPR input, we uniformize it, since behaviour is undefined for non-uniform
+    * inputs. TODO: a lowered bit extract is cheaper; maybe lower this in NIR?
     */
    case nir_intrinsic_inverse_ballot: {
       assert(dst.file == FLAG);
       jay_def x = nj_src(intr->src[0]);
       if (x.file == GPR) {
-         jay_def shr = jay_SHR_u32(b, x, nj->payload.lane_id);
-         jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), shr, 1);
-         jay_set_conditional_mod(b, and, dst, GEN_CONDITION_NE);
-      } else {
-         jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width;
+         x = emit_uniformize(nj, x);
       }
 
+      jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width;
       break;
    }
 
@@ -2765,18 +2769,6 @@ jay_setup_payload(struct nir_to_jay_state *nj)
    }
 
    s->payload_gprs = p.offsets[GPR];
-
-   /* Lane ID calculations require &W and therefore are calculated in
-    * uniform control flow to sidestep RA problems. The easy solution is
-    * calculating the lane ID in the first block.
-    *
-    * XXX: This doesn't work for multi-function. Reconsider.
-    */
-   nj->payload.lane_id = jay_LANE_ID_8_u16(b);
-
-   for (unsigned i = 8; i < s->dispatch_width; i *= 2) {
-      nj->payload.lane_id = jay_LANE_ID_EXPAND_u16(b, nj->payload.lane_id, i);
-   }
 }
 
 /*
diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h
index 0a7fc576f5d..163767cd013 100644
--- a/src/intel/compiler/jay/jay_ir.h
+++ b/src/intel/compiler/jay/jay_ir.h
@@ -945,7 +945,7 @@ jay_simd_width_logical(const jay_shader *s, const jay_inst *I)
    bool simd1 = jay_inst_is_uniform(I) && !I->broadcast_flag;
    unsigned base = simd1 ? 1 : s->dispatch_width;
 
-   /* Handle vectors-of-UGPR operations with special care for 64-bit */
+   /* Handle vectors-of-UGPR operations with special care for bitsizes */
    unsigned vec_per_channel = jay_type_vector_length(I->type);
    unsigned dst_size = jay_num_values(I->dst);
    assert(util_is_aligned(dst_size, vec_per_channel));
@@ -953,6 +953,12 @@ jay_simd_width_logical(const jay_shader *s, const jay_inst *I)
    if (base == 1 && dst_size > vec_per_channel && I->op != JAY_OPCODE_SEND) {
       assert(util_is_power_of_two_nonzero(dst_size) && vec_per_channel == 1);
       base = dst_size;
+
+      if (jay_type_size_bits(I->type) == 8) {
+         base *= 4;
+      } else if (jay_type_size_bits(I->type) == 16) {
+         base *= 2;
+      }
    }
 
    return base;
@@ -985,9 +991,7 @@ jay_is_no_mask(const jay_inst *I)
           I->op == JAY_OPCODE_QUAD_SWIZZLE ||
           I->op == JAY_OPCODE_DESWIZZLE_EVEN ||
           I->op == JAY_OPCODE_DESWIZZLE_ODD ||
-          I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS ||
-          I->op == JAY_OPCODE_LANE_ID_8 ||
-          I->op == JAY_OPCODE_LANE_ID_EXPAND;
+          I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS;
 }
 
 /**
diff --git a/src/intel/compiler/jay/jay_lower_spill.c b/src/intel/compiler/jay/jay_lower_spill.c
index 76f15b4cdc9..48071637b35 100644
--- a/src/intel/compiler/jay/jay_lower_spill.c
+++ b/src/intel/compiler/jay/jay_lower_spill.c
@@ -50,17 +50,9 @@ void
 jay_lower_spill(jay_function *func)
 {
    jay_builder b = jay_init_builder(func, jay_before_function(func));
+   signed ugpr_reservation = -1;
 
    /* We reserved a block of UGPRs for our use */
-   signed ugpr_reservation = -1, gpr2 = -1;
-   for (unsigned i = 0; i < func->shader->partition.nr_blocks[GPR]; ++i) {
-      struct jay_register_block B = func->shader->partition.blocks[GPR][i];
-
-      if (B.stride == JAY_STRIDE_2) {
-         gpr2 = B.start_gpr;
-      }
-   }
-
    for (unsigned i = 0; i < func->shader->partition.nr_blocks[UGPR]; ++i) {
       struct jay_register_block B = func->shader->partition.blocks[UGPR][i];
 
@@ -70,7 +62,6 @@ jay_lower_spill(jay_function *func)
    }
 
    assert(ugpr_reservation >= 0 && "must have reserved something");
-   assert(gpr2 >= 0 && "must have a stride-2 gpr");
 
    jay_def sp = jay_bare_reg(UGPR, ugpr_reservation);
    sp.num_values_m1 = func->shader->dispatch_width - 1;
@@ -97,14 +88,17 @@ jay_lower_spill(jay_function *func)
    jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, tmpu, 4);
 
    /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */
-   jay_def tmp2 = jay_bare_reg(GPR, gpr2);
-   jay_LANE_ID_8(&b, tmp2);
-   for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) {
-      jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i);
+   unsigned disp_width = b.shader->dispatch_width;
+   jay_LANE_ID_8(&b, jay_extract_range_post_ra(sp, 0, 4));
+
+   for (unsigned i = 8; i < disp_width; i *= 2) {
+      jay_ADD(&b, JAY_TYPE_U16, jay_extract_range_post_ra(sp, i / 2, i / 2),
+              jay_extract_range_post_ra(sp, 0, i / 2), i);
    }
 
-   jay_SHL(&b, JAY_TYPE_U16, tmp2, tmp2, util_logbase2(4));
-   jay_CVT(&b, JAY_TYPE_U32, sp, tmp2, JAY_TYPE_U16, JAY_ROUND, 0);
+   jay_def lid = jay_extract_range_post_ra(sp, 0, disp_width / 2);
+   jay_SHL(&b, JAY_TYPE_U16, lid, lid, util_logbase2(4));
+   jay_CVT(&b, JAY_TYPE_U32, sp, lid, JAY_TYPE_U16, JAY_ROUND, 0);
    if (b.shader->scratch_size) {
       jay_ADD(&b, JAY_TYPE_U32, sp, sp, b.shader->scratch_size);
    }
diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py
index 13c218279a0..f3a15e7096b 100644
--- a/src/intel/compiler/jay/jay_opcodes.py
+++ b/src/intel/compiler/jay/jay_opcodes.py
@@ -156,10 +156,8 @@ op('preload', 0, 'u32',     0, ['unsigned reg'])
 op('deswizzle_odd', 2, 'f32', 0, ['bool src2_hi'])
 op('deswizzle_even', 1, 'f32', 0, ['bool src_hi'])
 
-# Calculating the lane ID requires multiple power-of-two steps each involving
-# complex architectural features not modelled in the IR.
+# Return the UGPR[4] vector (0, 1, 2, 3, 4, 5, 6, 7) as packed 16-bit.
 op('lane_id_8', 0, 'u16')
-op('lane_id_expand', 1, 'u16', 0, ['unsigned width'])
 
 # Fill a scalar GPR from a contiguous UGPR[16] range containing words or bytes.
 # src_type can be either U8 or U16 (only).  For U8, stride can be 1 or 2, and
diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c
index 40d09cd9a53..d793cdd90b0 100644
--- a/src/intel/compiler/jay/jay_register_allocate.c
+++ b/src/intel/compiler/jay/jay_register_allocate.c
@@ -643,9 +643,6 @@ pick_regs_from_block(jay_ra_state *ra,
                      unsigned *best_reg,
                      unsigned first)
 {
-   bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND;
-   must_tie &= !is_src;
-
    /* Cross-lane access cannot be SIMD split if the source/destination registers
     * overlap, but as long as we don't tie those destinations, we're ok.
     */
@@ -657,9 +654,7 @@ pick_regs_from_block(jay_ra_state *ra,
 
       unsigned cost = block_cost;
       bool tied = !is_src && BITSET_TEST(ra->killed[file], r);
-
-      if (tied ? !may_tie :
-                 (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size)))
+      if (tied ? !may_tie : BITSET_TEST_COUNT(ra->pinned[file], r, size))
          continue;
 
       /* Try to tie predicated default values, otherwise post-RA lowering needs
diff --git a/src/intel/compiler/jay/jay_stride.c b/src/intel/compiler/jay/jay_stride.c
index 87d565143ec..ecdfa504c5f 100644
--- a/src/intel/compiler/jay/jay_stride.c
+++ b/src/intel/compiler/jay/jay_stride.c
@@ -110,13 +110,6 @@ jay_src_stride_minmax(jay_inst *I, unsigned s, bool do_max)
       max = JAY_STRIDE_4;
    }
 
-   /* "add.u16 r0.8, g1<2>" is not legal. We don't generate this normally yet
-    * (preferring to burn the upper bits) but it is used internally.
-    */
-   if (I->op == JAY_OPCODE_LANE_ID_EXPAND) {
-      max = JAY_STRIDE_2;
-   }
-
    if (restrict_mixed_strides(I, s) &&
        jay_type_size_bits(jay_src_type(I, s)) < jay_type_size_bits(I->type)) {
 
diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c
index 75f1f2a41b2..41a91fb3ca5 100644
--- a/src/intel/compiler/jay/jay_to_binary.c
+++ b/src/intel/compiler/jay/jay_to_binary.c
@@ -195,7 +195,7 @@ to_gen_operand(jay_function *f,
       R = gen_retype(gen_restride(R, 0, 1, 0), GEN_TYPE_UD);
 
       /* Handle 3-src restrictions and vectorized uniform code. */
-      if (is_dest || jay_num_values(d) >= 8) {
+      if (is_dest || jay_num_values(d) >= 4) {
          R = gen_restride(R, 8, 8, 1);
       }
 
@@ -203,7 +203,7 @@ to_gen_operand(jay_function *f,
        * but if we write a single UGPR the stride is ignored..  Specify
        * whatever stride is needed to satisfy the rules.
        */
-      if (is_dest) {
+      if (is_dest && I->num_srcs > 0) {
          /* BSpec 56640 "Special Restrictions" says:
           *
           *    "Conversion between HF and Integer must be DWord-aligned
@@ -636,19 +636,9 @@ emit(struct jay_codegen *jc,
       break;
 
    case JAY_OPCODE_LANE_ID_8:
-      jc->state.exec_size = 8;
       jc_MOV(jc, dst, gen_imm_uv(0x76543210));
       break;
 
-   case JAY_OPCODE_LANE_ID_EXPAND: {
-      unsigned width = jay_lane_id_expand_width(I);
-      jc->state.exec_size = width;
-      jc_append2(GEN_OP_ADD,
-                 gen_element_offset(jc->devinfo, dst, width),
-                 SRC(0), gen_imm_uw(width));
-      break;
-   }
-
    case JAY_OPCODE_GPR_FROM_UGPRS:
       jc_MOV(jc, dst,
              gen_byte_offset(jc->devinfo,
diff --git a/src/intel/compiler/jay/jay_validate.c b/src/intel/compiler/jay/jay_validate.c
index c21bcb232b4..d6ef88b2b18 100644
--- a/src/intel/compiler/jay/jay_validate.c
+++ b/src/intel/compiler/jay/jay_validate.c
@@ -78,9 +78,16 @@ validate_flagness(struct validate_state *validate,
    CHECK(type != JAY_TYPE_U1 || jay_is_flag(def) || jay_is_null(def));
 }
 
+static unsigned
+adjust_width_for_type(unsigned width, enum jay_type type)
+{
+   return (width * jay_type_size_bits(type)) / 32; /* in 32-bit words */
+}
+
 static unsigned
 get_src_words(struct validate_state *validate, jay_inst *I, unsigned s)
 {
+   /* TODO: I think this can be simplified */
    if (I->op == JAY_OPCODE_EXPAND_QUAD) {
       return 4;
    }
@@ -89,19 +96,19 @@ get_src_words(struct validate_state *validate, jay_inst *I, unsigned s)
       return jay_ugpr_per_grf(validate->func->shader);
    }
 
-   bool vectorized = I->dst.file == UGPR &&
-                     jay_num_values(I->dst) > jay_type_vector_length(I->type) &&
-                     I->op != JAY_OPCODE_SEND &&
-                     jay_num_values(I->src[s]) > 1;
-
+   unsigned simd_width = jay_simd_width_logical(validate->func->shader, I);
    unsigned elsize = jay_type_vector_length(jay_src_type(I, s));
-   unsigned words = elsize * (vectorized ? jay_num_values(I->dst) : 1);
 
-   if (vectorized && I->src[s].file == GPR) {
-      CHECK(words == validate->func->shader->dispatch_width);
+   if (I->src[s].file == GPR && I->dst.file == UGPR) {
+      CHECK(jay_num_values(I->dst) ==
+               adjust_width_for_type(simd_width, I->type) ||
+            I->op == JAY_OPCODE_SEND);
+
       return 1;
+   } else if (I->src[s].file == UGPR && jay_num_values(I->src[s]) > elsize) {
+      return adjust_width_for_type(simd_width, jay_src_type(I, s));
    } else {
-      return words;
+      return elsize;
    }
 }