From 1ce2d57a31decac0d6ee9619fccae391ef2cb437 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Date: Fri, 29 May 2026 11:24:03 -0400
Subject: [PATCH] jay: rework lane ID calculations

Previously we had special ops doing data-model-breaking things on GPRs. But
there's no real reason for that, we can calculate lane IDs as UGPR vectors
within the Jay data model just fine. Adjust jay_ir/jay_validate to define packed
16-bit UGPR vectors, giving them the natural semantics, then use that to
calculate lane IDs, peeling back all the hacks we added along the way.

This also unfortunately pessimizes inverse_ballot() but only in a corner case
that could be revisited later. Stats are net positive.

In addition to the code cleanup, this has 3 other benefits:

* Now that we can rematerialize the lane ID code anywhere we want, we could
  theoretically reduce register pressure in some scenarios. Stats show this
  doesn't help in the current implementation, though.

* Now that we can calculate lane IDs in control flow, the issues with divergent
  function calls all go away. (Well, the lane ID issue. There are other issues.)

* Now that we use UGPRs for this, we don't need a stride=16 GRF in shaders that
  don't actually use 16-bit math, meaning less shuffling from bad partitions.
  That's reflected in the positive stats here.

SIMD16:
   Totals from 1643 (62.07% of 2647) affected shaders:
   Instrs: 2227750 -> 2221032 (-0.30%); split: -0.44%, +0.14%
   CodeSize: 33138416 -> 33034224 (-0.31%); split: -0.52%, +0.20%

SIMD32:
   Totals from 1643 (62.07% of 2647) affected shaders:
   Instrs: 2864583 -> 2806217 (-2.04%); split: -2.22%, +0.19%
   CodeSize: 43088064 -> 42171504 (-2.13%); split: -2.29%, +0.17%

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41872>
---
 src/intel/compiler/jay/jay_from_nir.c         | 38 ++++++++-----------
 src/intel/compiler/jay/jay_ir.h               | 12 ++++--
 src/intel/compiler/jay/jay_lower_spill.c      | 26 +++++--------
 src/intel/compiler/jay/jay_opcodes.py         |  4 +-
 .../compiler/jay/jay_register_allocate.c      |  7 +---
 src/intel/compiler/jay/jay_stride.c           |  7 ----
 src/intel/compiler/jay/jay_to_binary.c        | 14 +------
 src/intel/compiler/jay/jay_validate.c         | 25 +++++++-----
 8 files changed, 53 insertions(+), 80 deletions(-)

diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c
index e94adf7b89c..c98a061ab11 100644
--- a/src/intel/compiler/jay/jay_from_nir.c
+++ b/src/intel/compiler/jay/jay_from_nir.c
@@ -98,7 +98,6 @@ struct nir_to_jay_state {
       jay_def sampler_state_pointer, scratch_surface;
       jay_def inline_data;
       jay_def push_data[512];
-      jay_def lane_id;
       jay_def urb_handle;
 
       union {
@@ -1365,11 +1364,19 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
       break;
    }
 
-   case nir_intrinsic_load_subgroup_invocation:
+   case nir_intrinsic_load_subgroup_invocation: {
+      jay_def lid = jay_alloc_def(b, UGPR, s->dispatch_width / 2);
+      jay_LANE_ID_8(b, jay_extract_range(lid, 0, 4));
+
+      for (unsigned i = 8; i < s->dispatch_width; i *= 2) {
+         jay_ADD(b, JAY_TYPE_U16, jay_extract_range(lid, i / 2, i / 2),
+                 jay_extract_range(lid, 0, i / 2), i);
+      }
+
       /* TODO: Lower this in NIR? */
-      jay_CVT(b, JAY_TYPE_U32, dst, nj->payload.lane_id, JAY_TYPE_U16,
-              JAY_ROUND, 0);
+      jay_CVT(b, JAY_TYPE_U32, dst, lid, JAY_TYPE_U16, JAY_ROUND, 0);
       break;
+   }
 
    case nir_intrinsic_demote:
    case nir_intrinsic_demote_if:
@@ -1422,20 +1429,17 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
    }
 
    /* We prefer to inverse_ballot by copying a UGPR to the flag. If we have a
-    * GPR input, we could uniformize (as behaviour is undefined for
-    * non-uniform inputs) but a lowered bit extract is cheaper than uniformize.
+    * GPR input, we uniformize (behaviour is undefined for non-uniform inputs).
+    * TODO: a lowered bit extract would be cheaper; maybe lower that in NIR?
     */
    case nir_intrinsic_inverse_ballot: {
       assert(dst.file == FLAG);
       jay_def x = nj_src(intr->src[0]);
       if (x.file == GPR) {
-         jay_def shr = jay_SHR_u32(b, x, nj->payload.lane_id);
-         jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), shr, 1);
-         jay_set_conditional_mod(b, and, dst, GEN_CONDITION_NE);
-      } else {
-         jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width;
+         x = emit_uniformize(nj, x);
       }
 
+      jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width;
       break;
    }
 
@@ -2765,18 +2769,6 @@ jay_setup_payload(struct nir_to_jay_state *nj)
    }
 
    s->payload_gprs = p.offsets[GPR];
-
-   /* Lane ID calculations require &W and therefore are calculated in
-    * uniform control flow to sidestep RA problems. The easy solution is
-    * calculating the lane ID in the first block.
-    *
-    * XXX: This doesn't work for multi-function. Reconsider.
-    */
-   nj->payload.lane_id = jay_LANE_ID_8_u16(b);
-
-   for (unsigned i = 8; i < s->dispatch_width; i *= 2) {
-      nj->payload.lane_id = jay_LANE_ID_EXPAND_u16(b, nj->payload.lane_id, i);
-   }
 }
 
 /*
diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h
index 0a7fc576f5d..163767cd013 100644
--- a/src/intel/compiler/jay/jay_ir.h
+++ b/src/intel/compiler/jay/jay_ir.h
@@ -945,7 +945,7 @@ jay_simd_width_logical(const jay_shader *s, const jay_inst *I)
    bool simd1 = jay_inst_is_uniform(I) && !I->broadcast_flag;
    unsigned base = simd1 ? 1 : s->dispatch_width;
 
-   /* Handle vectors-of-UGPR operations with special care for 64-bit */
+   /* Handle vectors-of-UGPR operations with special care for bitsizes */
    unsigned vec_per_channel = jay_type_vector_length(I->type);
    unsigned dst_size = jay_num_values(I->dst);
    assert(util_is_aligned(dst_size, vec_per_channel));
@@ -953,6 +953,12 @@ jay_simd_width_logical(const jay_shader *s, const jay_inst *I)
    if (base == 1 && dst_size > vec_per_channel && I->op != JAY_OPCODE_SEND) {
       assert(util_is_power_of_two_nonzero(dst_size) && vec_per_channel == 1);
       base = dst_size;
+
+      if (jay_type_size_bits(I->type) == 8) {
+         base *= 4;
+      } else if (jay_type_size_bits(I->type) == 16) {
+         base *= 2;
+      }
    }
 
    return base;
@@ -985,9 +991,7 @@ jay_is_no_mask(const jay_inst *I)
           I->op == JAY_OPCODE_QUAD_SWIZZLE ||
           I->op == JAY_OPCODE_DESWIZZLE_EVEN ||
           I->op == JAY_OPCODE_DESWIZZLE_ODD ||
-          I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS ||
-          I->op == JAY_OPCODE_LANE_ID_8 ||
-          I->op == JAY_OPCODE_LANE_ID_EXPAND;
+          I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS;
 }
 
 /**
diff --git a/src/intel/compiler/jay/jay_lower_spill.c b/src/intel/compiler/jay/jay_lower_spill.c
index 76f15b4cdc9..48071637b35 100644
--- a/src/intel/compiler/jay/jay_lower_spill.c
+++ b/src/intel/compiler/jay/jay_lower_spill.c
@@ -50,17 +50,9 @@ void
 jay_lower_spill(jay_function *func)
 {
    jay_builder b = jay_init_builder(func, jay_before_function(func));
+   signed ugpr_reservation = -1;
 
    /* We reserved a block of UGPRs for our use */
-   signed ugpr_reservation = -1, gpr2 = -1;
-   for (unsigned i = 0; i < func->shader->partition.nr_blocks[GPR]; ++i) {
-      struct jay_register_block B = func->shader->partition.blocks[GPR][i];
-
-      if (B.stride == JAY_STRIDE_2) {
-         gpr2 = B.start_gpr;
-      }
-   }
-
    for (unsigned i = 0; i < func->shader->partition.nr_blocks[UGPR]; ++i) {
       struct jay_register_block B = func->shader->partition.blocks[UGPR][i];
 
@@ -70,7 +62,6 @@ jay_lower_spill(jay_function *func)
    }
 
    assert(ugpr_reservation >= 0 && "must have reserved something");
-   assert(gpr2 >= 0 && "must have a stride-2 gpr");
 
    jay_def sp = jay_bare_reg(UGPR, ugpr_reservation);
    sp.num_values_m1 = func->shader->dispatch_width - 1;
@@ -97,14 +88,17 @@ jay_lower_spill(jay_function *func)
    jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, tmpu, 4);
 
    /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */
-   jay_def tmp2 = jay_bare_reg(GPR, gpr2);
-   jay_LANE_ID_8(&b, tmp2);
-   for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) {
-      jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i);
+   unsigned disp_width = b.shader->dispatch_width;
+   jay_LANE_ID_8(&b, jay_extract_range_post_ra(sp, 0, 4));
+
+   for (unsigned i = 8; i < disp_width; i *= 2) {
+      jay_ADD(&b, JAY_TYPE_U16, jay_extract_range_post_ra(sp, i / 2, i / 2),
+              jay_extract_range_post_ra(sp, 0, i / 2), i);
    }
 
-   jay_SHL(&b, JAY_TYPE_U16, tmp2, tmp2, util_logbase2(4));
-   jay_CVT(&b, JAY_TYPE_U32, sp, tmp2, JAY_TYPE_U16, JAY_ROUND, 0);
+   jay_def lid = jay_extract_range_post_ra(sp, 0, disp_width / 2);
+   jay_SHL(&b, JAY_TYPE_U16, lid, lid, util_logbase2(4));
+   jay_CVT(&b, JAY_TYPE_U32, sp, lid, JAY_TYPE_U16, JAY_ROUND, 0);
    if (b.shader->scratch_size) {
       jay_ADD(&b, JAY_TYPE_U32, sp, sp, b.shader->scratch_size);
    }
diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py
index 13c218279a0..f3a15e7096b 100644
--- a/src/intel/compiler/jay/jay_opcodes.py
+++ b/src/intel/compiler/jay/jay_opcodes.py
@@ -156,10 +156,8 @@ op('preload', 0, 'u32',     0, ['unsigned reg'])
 op('deswizzle_odd', 2, 'f32', 0, ['bool src2_hi'])
 op('deswizzle_even', 1, 'f32', 0, ['bool src_hi'])
 
-# Calculating the lane ID requires multiple power-of-two steps each involving
-# complex architectural features not modelled in the IR.
+# Return the UGPR[4] vector (0, 1, 2, 3, 4, 5, 6, 7) as packed 16-bit.
 op('lane_id_8', 0, 'u16')
-op('lane_id_expand', 1, 'u16', 0, ['unsigned width'])
 
 # Fill a scalar GPR from a contiguous UGPR[16] range containing words or bytes.
 # src_type can be either U8 or U16 (only).  For U8, stride can be 1 or 2, and
diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c
index 40d09cd9a53..d793cdd90b0 100644
--- a/src/intel/compiler/jay/jay_register_allocate.c
+++ b/src/intel/compiler/jay/jay_register_allocate.c
@@ -643,9 +643,6 @@ pick_regs_from_block(jay_ra_state *ra,
                      unsigned *best_reg,
                      unsigned first)
 {
-   bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND;
-   must_tie &= !is_src;
-
    /* Cross-lane access cannot be SIMD split if the source/destination registers
     * overlap, but as long as we don't tie those destinations, we're ok.
     */
@@ -657,9 +654,7 @@ pick_regs_from_block(jay_ra_state *ra,
 
       unsigned cost = block_cost;
       bool tied = !is_src && BITSET_TEST(ra->killed[file], r);
-
-      if (tied ? !may_tie :
-                 (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size)))
+      if (tied ? !may_tie : BITSET_TEST_COUNT(ra->pinned[file], r, size))
          continue;
 
       /* Try to tie predicated default values, otherwise post-RA lowering needs
diff --git a/src/intel/compiler/jay/jay_stride.c b/src/intel/compiler/jay/jay_stride.c
index 87d565143ec..ecdfa504c5f 100644
--- a/src/intel/compiler/jay/jay_stride.c
+++ b/src/intel/compiler/jay/jay_stride.c
@@ -110,13 +110,6 @@ jay_src_stride_minmax(jay_inst *I, unsigned s, bool do_max)
       max = JAY_STRIDE_4;
    }
 
-   /* "add.u16 r0.8, g1<2>" is not legal. We don't generate this normally yet
-    * (preferring to burn the upper bits) but it is used internally.
-    */
-   if (I->op == JAY_OPCODE_LANE_ID_EXPAND) {
-      max = JAY_STRIDE_2;
-   }
-
    if (restrict_mixed_strides(I, s) &&
        jay_type_size_bits(jay_src_type(I, s)) < jay_type_size_bits(I->type)) {
 
diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c
index 75f1f2a41b2..41a91fb3ca5 100644
--- a/src/intel/compiler/jay/jay_to_binary.c
+++ b/src/intel/compiler/jay/jay_to_binary.c
@@ -195,7 +195,7 @@ to_gen_operand(jay_function *f,
       R = gen_retype(gen_restride(R, 0, 1, 0), GEN_TYPE_UD);
 
       /* Handle 3-src restrictions and vectorized uniform code. */
-      if (is_dest || jay_num_values(d) >= 8) {
+      if (is_dest || jay_num_values(d) >= 4) {
          R = gen_restride(R, 8, 8, 1);
       }
 
@@ -203,7 +203,7 @@ to_gen_operand(jay_function *f,
        * but if we write a single UGPR the stride is ignored..  Specify
        * whatever stride is needed to satisfy the rules.
        */
-      if (is_dest) {
+      if (is_dest && I->num_srcs > 0) {
          /* BSpec 56640 "Special Restrictions" says:
           *
           *    "Conversion between HF and Integer must be DWord-aligned
@@ -636,19 +636,9 @@ emit(struct jay_codegen *jc,
       break;
 
    case JAY_OPCODE_LANE_ID_8:
-      jc->state.exec_size = 8;
       jc_MOV(jc, dst, gen_imm_uv(0x76543210));
       break;
 
-   case JAY_OPCODE_LANE_ID_EXPAND: {
-      unsigned width = jay_lane_id_expand_width(I);
-      jc->state.exec_size = width;
-      jc_append2(GEN_OP_ADD,
-                 gen_element_offset(jc->devinfo, dst, width),
-                 SRC(0), gen_imm_uw(width));
-      break;
-   }
-
    case JAY_OPCODE_GPR_FROM_UGPRS:
       jc_MOV(jc, dst,
              gen_byte_offset(jc->devinfo,
diff --git a/src/intel/compiler/jay/jay_validate.c b/src/intel/compiler/jay/jay_validate.c
index c21bcb232b4..d6ef88b2b18 100644
--- a/src/intel/compiler/jay/jay_validate.c
+++ b/src/intel/compiler/jay/jay_validate.c
@@ -78,9 +78,16 @@ validate_flagness(struct validate_state *validate,
    CHECK(type != JAY_TYPE_U1 || jay_is_flag(def) || jay_is_null(def));
 }
 
+static unsigned
+adjust_width_for_type(unsigned width, enum jay_type type)
+{
+   return (width * jay_type_size_bits(type)) / 32;
+}
+
 static unsigned
 get_src_words(struct validate_state *validate, jay_inst *I, unsigned s)
 {
+   /* TODO: I think this can be simplified */
    if (I->op == JAY_OPCODE_EXPAND_QUAD) {
       return 4;
    }
@@ -89,19 +96,19 @@ get_src_words(struct validate_state *validate, jay_inst *I, unsigned s)
       return jay_ugpr_per_grf(validate->func->shader);
    }
 
-   bool vectorized = I->dst.file == UGPR &&
-                     jay_num_values(I->dst) > jay_type_vector_length(I->type) &&
-                     I->op != JAY_OPCODE_SEND &&
-                     jay_num_values(I->src[s]) > 1;
-
+   unsigned simd_width = jay_simd_width_logical(validate->func->shader, I);
    unsigned elsize = jay_type_vector_length(jay_src_type(I, s));
-   unsigned words = elsize * (vectorized ? jay_num_values(I->dst) : 1);
 
-   if (vectorized && I->src[s].file == GPR) {
-      CHECK(words == validate->func->shader->dispatch_width);
+   if (I->src[s].file == GPR && I->dst.file == UGPR) {
+      CHECK(jay_num_values(I->dst) ==
+               adjust_width_for_type(simd_width, I->type) ||
+            I->op == JAY_OPCODE_SEND);
+
       return 1;
+   } else if (I->src[s].file == UGPR && jay_num_values(I->src[s]) > elsize) {
+      return adjust_width_for_type(simd_width, jay_src_type(I, s));
    } else {
-      return words;
+      return elsize;
    }
 }