jay: rework lane ID calculations

Previously we had special ops doing data model breaking things on GPRs. But
there's no real reason for that, we can calculate lane IDs as UGPR vectors
within the Jay data model just fine. Adjust jay_ir/jay_validate to define
packed 16-bit UGPR vectors, giving them the natural semantics, then use that to
calculate lane IDs, peeling back all the hacks we added along the way.

This also unfortunately pessimizes inverse_ballot() but only in a corner case
that could be revisited later. Stats are net positive.

In addition to the code cleanup, this has 3 other benefits:

* Now that we can rematerialize the lane ID code anywhere we want, we could
  theoretically reduce register pressure in some scenarios. Stats show this
  doesn't help in the current implementation, though.

* Now that we can calculate lane IDs in control flow, the issues with divergent
  function calls all go away. (Well, the lane ID issue. There are other
  issues.)

* Now that we use UGPRs for this, we don't need a stride=16 GRF in shaders that
  don't actually use 16-bit math, meaning less shuffling from bad partitions.
  That's reflected in the positive stats here.

SIMD16:
Totals from 1643 (62.07% of 2647) affected shaders:
Instrs: 2227750 -> 2221032 (-0.30%); split: -0.44%, +0.14%
CodeSize: 33138416 -> 33034224 (-0.31%); split: -0.52%, +0.20%

SIMD32:
Totals from 1643 (62.07% of 2647) affected shaders:
Instrs: 2864583 -> 2806217 (-2.04%); split: -2.22%, +0.19%
CodeSize: 43088064 -> 42171504 (-2.13%); split: -2.29%, +0.17%

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41872>
2026-06-10 01:18:18 +02:00 · 2026-05-29 11:24:03 -04:00 · 2026-05-29 11:24:03 -04:00 · 1ce2d57a31
commit 1ce2d57a31
parent 3ededec51c
8 changed files with 53 additions and 80 deletions
--- a/src/intel/compiler/jay/jay_from_nir.c
+++ b/src/intel/compiler/jay/jay_from_nir.c
@ -98,7 +98,6 @@ struct nir_to_jay_state {
      jay_def sampler_state_pointer, scratch_surface;
      jay_def inline_data;
      jay_def push_data[512];
-      jay_def lane_id;
      jay_def urb_handle;

      union {
@ -1365,11 +1364,19 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
      break;
   }

-   case nir_intrinsic_load_subgroup_invocation:
+   case nir_intrinsic_load_subgroup_invocation: {
+      jay_def lid = jay_alloc_def(b, UGPR, s->dispatch_width / 2);
+      jay_LANE_ID_8(b, jay_extract_range(lid, 0, 4));
+
+      for (unsigned i = 8; i < s->dispatch_width; i *= 2) {
+         jay_ADD(b, JAY_TYPE_U16, jay_extract_range(lid, i / 2, i / 2),
+                 jay_extract_range(lid, 0, i / 2), i);
+      }
+
      /* TODO: Lower this in NIR? */
-      jay_CVT(b, JAY_TYPE_U32, dst, nj->payload.lane_id, JAY_TYPE_U16,
-              JAY_ROUND, 0);
+      jay_CVT(b, JAY_TYPE_U32, dst, lid, JAY_TYPE_U16, JAY_ROUND, 0);
      break;
+   }

   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
@ -1422,20 +1429,17 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
   }

   /* We prefer to inverse_ballot by copying a UGPR to the flag. If we have a
-    * GPR input, we could uniformize (as behaviour is undefined for
-    * non-uniform inputs) but a lowered bit extract is cheaper than uniformize.
+    * GPR input, behaviour is undefined for non-uniform inputs. TODO: a lowered
+    * bit extract is cheaper than uniformize, but maybe lower in NIR..?
    */
   case nir_intrinsic_inverse_ballot: {
      assert(dst.file == FLAG);
      jay_def x = nj_src(intr->src[0]);
      if (x.file == GPR) {
-         jay_def shr = jay_SHR_u32(b, x, nj->payload.lane_id);
-         jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), shr, 1);
-         jay_set_conditional_mod(b, and, dst, GEN_CONDITION_NE);
-      } else {
-         jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width;
+         x = emit_uniformize(nj, x);
      }

+      jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width;
      break;
   }

@ -2765,18 +2769,6 @@ jay_setup_payload(struct nir_to_jay_state *nj)
   }

   s->payload_gprs = p.offsets[GPR];
-
-   /* Lane ID calculations require &W and therefore are calculated in
-    * uniform control flow to sidestep RA problems. The easy solution is
-    * calculating the lane ID in the first block.
-    *
-    * XXX: This doesn't work for multi-function. Reconsider.
-    */
-   nj->payload.lane_id = jay_LANE_ID_8_u16(b);
-
-   for (unsigned i = 8; i < s->dispatch_width; i *= 2) {
-      nj->payload.lane_id = jay_LANE_ID_EXPAND_u16(b, nj->payload.lane_id, i);
-   }
 }

 /*
--- a/src/intel/compiler/jay/jay_ir.h
+++ b/src/intel/compiler/jay/jay_ir.h
@ -945,7 +945,7 @@ jay_simd_width_logical(const jay_shader *s, const jay_inst *I)
   bool simd1 = jay_inst_is_uniform(I) && !I->broadcast_flag;
   unsigned base = simd1 ? 1 : s->dispatch_width;

-   /* Handle vectors-of-UGPR operations with special care for 64-bit */
+   /* Handle vectors-of-UGPR operations with special care for bitsizes */
   unsigned vec_per_channel = jay_type_vector_length(I->type);
   unsigned dst_size = jay_num_values(I->dst);
   assert(util_is_aligned(dst_size, vec_per_channel));
@ -953,6 +953,12 @@ jay_simd_width_logical(const jay_shader *s, const jay_inst *I)
   if (base == 1 && dst_size > vec_per_channel && I->op != JAY_OPCODE_SEND) {
      assert(util_is_power_of_two_nonzero(dst_size) && vec_per_channel == 1);
      base = dst_size;
+
+      if (jay_type_size_bits(I->type) == 8) {
+         base *= 4;
+      } else if (jay_type_size_bits(I->type) == 16) {
+         base *= 2;
+      }
   }

   return base;
@ -985,9 +991,7 @@ jay_is_no_mask(const jay_inst *I)
          I->op == JAY_OPCODE_QUAD_SWIZZLE ||
          I->op == JAY_OPCODE_DESWIZZLE_EVEN ||
          I->op == JAY_OPCODE_DESWIZZLE_ODD ||
-          I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS ||
-          I->op == JAY_OPCODE_LANE_ID_8 ||
-          I->op == JAY_OPCODE_LANE_ID_EXPAND;
+          I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS;
 }

 /**
--- a/src/intel/compiler/jay/jay_lower_spill.c
+++ b/src/intel/compiler/jay/jay_lower_spill.c
@ -50,17 +50,9 @@ void
 jay_lower_spill(jay_function *func)
 {
   jay_builder b = jay_init_builder(func, jay_before_function(func));
+   signed ugpr_reservation = -1;

   /* We reserved a block of UGPRs for our use */
-   signed ugpr_reservation = -1, gpr2 = -1;
-   for (unsigned i = 0; i < func->shader->partition.nr_blocks[GPR]; ++i) {
-      struct jay_register_block B = func->shader->partition.blocks[GPR][i];
-
-      if (B.stride == JAY_STRIDE_2) {
-         gpr2 = B.start_gpr;
-      }
-   }
-
   for (unsigned i = 0; i < func->shader->partition.nr_blocks[UGPR]; ++i) {
      struct jay_register_block B = func->shader->partition.blocks[UGPR][i];

@ -70,7 +62,6 @@ jay_lower_spill(jay_function *func)
   }

   assert(ugpr_reservation >= 0 && "must have reserved something");
-   assert(gpr2 >= 0 && "must have a stride-2 gpr");

   jay_def sp = jay_bare_reg(UGPR, ugpr_reservation);
   sp.num_values_m1 = func->shader->dispatch_width - 1;
@ -97,14 +88,17 @@ jay_lower_spill(jay_function *func)
   jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, tmpu, 4);

   /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */
-   jay_def tmp2 = jay_bare_reg(GPR, gpr2);
-   jay_LANE_ID_8(&b, tmp2);
-   for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) {
-      jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i);
+   unsigned disp_width = b.shader->dispatch_width;
+   jay_LANE_ID_8(&b, jay_extract_range_post_ra(sp, 0, 4));
+
+   for (unsigned i = 8; i < disp_width; i *= 2) {
+      jay_ADD(&b, JAY_TYPE_U16, jay_extract_range_post_ra(sp, i / 2, i / 2),
+              jay_extract_range_post_ra(sp, 0, i / 2), i);
   }

-   jay_SHL(&b, JAY_TYPE_U16, tmp2, tmp2, util_logbase2(4));
-   jay_CVT(&b, JAY_TYPE_U32, sp, tmp2, JAY_TYPE_U16, JAY_ROUND, 0);
+   jay_def lid = jay_extract_range_post_ra(sp, 0, disp_width / 2);
+   jay_SHL(&b, JAY_TYPE_U16, lid, lid, util_logbase2(4));
+   jay_CVT(&b, JAY_TYPE_U32, sp, lid, JAY_TYPE_U16, JAY_ROUND, 0);
   if (b.shader->scratch_size) {
      jay_ADD(&b, JAY_TYPE_U32, sp, sp, b.shader->scratch_size);
   }
--- a/src/intel/compiler/jay/jay_opcodes.py
+++ b/src/intel/compiler/jay/jay_opcodes.py
@ -156,10 +156,8 @@ op('preload', 0, 'u32',     0, ['unsigned reg'])
 op('deswizzle_odd', 2, 'f32', 0, ['bool src2_hi'])
 op('deswizzle_even', 1, 'f32', 0, ['bool src_hi'])

-# Calculating the lane ID requires multiple power-of-two steps each involving
-# complex architectural features not modelled in the IR.
+# Return the UGPR[4] vector (0, 1, 2, 3, 4, 5, 6, 7) as packed 16-bit.
 op('lane_id_8', 0, 'u16')
-op('lane_id_expand', 1, 'u16', 0, ['unsigned width'])

 # Fill a scalar GPR from a contiguous UGPR[16] range containing words or bytes.
 # src_type can be either U8 or U16 (only).  For U8, stride can be 1 or 2, and
--- a/src/intel/compiler/jay/jay_register_allocate.c
+++ b/src/intel/compiler/jay/jay_register_allocate.c
@ -643,9 +643,6 @@ pick_regs_from_block(jay_ra_state *ra,
                     unsigned *best_reg,
                     unsigned first)
 {
-   bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND;
-   must_tie &= !is_src;
-
   /* Cross-lane access cannot be SIMD split if the source/destination registers
    * overlap, but as long as we don't tie those destinations, we're ok.
    */
@ -657,9 +654,7 @@ pick_regs_from_block(jay_ra_state *ra,

      unsigned cost = block_cost;
      bool tied = !is_src && BITSET_TEST(ra->killed[file], r);
-
-      if (tied ? !may_tie :
-                 (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size)))
+      if (tied ? !may_tie : BITSET_TEST_COUNT(ra->pinned[file], r, size))
         continue;

      /* Try to tie predicated default values, otherwise post-RA lowering needs
--- a/src/intel/compiler/jay/jay_stride.c
+++ b/src/intel/compiler/jay/jay_stride.c
@ -110,13 +110,6 @@ jay_src_stride_minmax(jay_inst *I, unsigned s, bool do_max)
      max = JAY_STRIDE_4;
   }

-   /* "add.u16 r0.8, g1<2>" is not legal. We don't generate this normally yet
-    * (preferring to burn the upper bits) but it is used internally.
-    */
-   if (I->op == JAY_OPCODE_LANE_ID_EXPAND) {
-      max = JAY_STRIDE_2;
-   }
-
   if (restrict_mixed_strides(I, s) &&
       jay_type_size_bits(jay_src_type(I, s)) < jay_type_size_bits(I->type)) {

--- a/src/intel/compiler/jay/jay_to_binary.c
+++ b/src/intel/compiler/jay/jay_to_binary.c
@ -195,7 +195,7 @@ to_gen_operand(jay_function *f,
      R = gen_retype(gen_restride(R, 0, 1, 0), GEN_TYPE_UD);

      /* Handle 3-src restrictions and vectorized uniform code. */
-      if (is_dest || jay_num_values(d) >= 8) {
+      if (is_dest || jay_num_values(d) >= 4) {
         R = gen_restride(R, 8, 8, 1);
      }

@ -203,7 +203,7 @@ to_gen_operand(jay_function *f,
       * but if we write a single UGPR the stride is ignored..  Specify
       * whatever stride is needed to satisfy the rules.
       */
-      if (is_dest) {
+      if (is_dest && I->num_srcs > 0) {
         /* BSpec 56640 "Special Restrictions" says:
          *
          *    "Conversion between HF and Integer must be DWord-aligned
@ -636,19 +636,9 @@ emit(struct jay_codegen *jc,
      break;

   case JAY_OPCODE_LANE_ID_8:
-      jc->state.exec_size = 8;
      jc_MOV(jc, dst, gen_imm_uv(0x76543210));
      break;

-   case JAY_OPCODE_LANE_ID_EXPAND: {
-      unsigned width = jay_lane_id_expand_width(I);
-      jc->state.exec_size = width;
-      jc_append2(GEN_OP_ADD,
-                 gen_element_offset(jc->devinfo, dst, width),
-                 SRC(0), gen_imm_uw(width));
-      break;
-   }
-
   case JAY_OPCODE_GPR_FROM_UGPRS:
      jc_MOV(jc, dst,
             gen_byte_offset(jc->devinfo,
--- a/src/intel/compiler/jay/jay_validate.c
+++ b/src/intel/compiler/jay/jay_validate.c
@ -78,9 +78,16 @@ validate_flagness(struct validate_state *validate,
   CHECK(type != JAY_TYPE_U1 || jay_is_flag(def) || jay_is_null(def));
 }

+static unsigned
+adjust_width_for_type(unsigned width, enum jay_type type)
+{
+   return (width * jay_type_size_bits(type)) / 32;
+}
+
 static unsigned
 get_src_words(struct validate_state *validate, jay_inst *I, unsigned s)
 {
+   /* TODO: I think this can be simplified */
   if (I->op == JAY_OPCODE_EXPAND_QUAD) {
      return 4;
   }
@ -89,19 +96,19 @@ get_src_words(struct validate_state *validate, jay_inst *I, unsigned s)
      return jay_ugpr_per_grf(validate->func->shader);
   }

-   bool vectorized = I->dst.file == UGPR &&
-                     jay_num_values(I->dst) > jay_type_vector_length(I->type) &&
-                     I->op != JAY_OPCODE_SEND &&
-                     jay_num_values(I->src[s]) > 1;
-
+   unsigned simd_width = jay_simd_width_logical(validate->func->shader, I);
   unsigned elsize = jay_type_vector_length(jay_src_type(I, s));
-   unsigned words = elsize * (vectorized ? jay_num_values(I->dst) : 1);

-   if (vectorized && I->src[s].file == GPR) {
-      CHECK(words == validate->func->shader->dispatch_width);
+   if (I->src[s].file == GPR && I->dst.file == UGPR) {
+      CHECK(jay_num_values(I->dst) ==
+               adjust_width_for_type(simd_width, I->type) ||
+            I->op == JAY_OPCODE_SEND);
+
      return 1;
+   } else if (I->src[s].file == UGPR && jay_num_values(I->src[s]) > elsize) {
+      return adjust_width_for_type(simd_width, jay_src_type(I, s));
   } else {
-      return words;
+      return elsize;
   }
 }