mesa/src/intel/compiler/jay/jay_from_nir.c

/*
 * Copyright 2026 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "compiler/brw/brw_compiler.h"
#include "compiler/brw/brw_eu.h"
#include "compiler/brw/brw_eu_defines.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/brw/brw_sampler.h"
#include "compiler/intel_nir.h"
#include "compiler/intel_shader_enums.h"
#include "compiler/list.h"
#include "intel/dev/intel_debug.h"
#include "util/bitpack_helpers.h"
#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/lut.h"
#include "util/macros.h"
#include "util/u_math.h"
#include "intel_device_info_gen.h"
#include "jay.h"
#include "jay_builder.h"
#include "jay_builder_opcodes.h"
#include "jay_ir.h"
#include "jay_opcodes.h"
#include "jay_private.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_defines.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "nir_opcodes.h"
#include "nir_search_helpers.h"
#include "shader_enums.h"
#include "shader_stats.h"

/* Table of flag names recognised by the "JAY_DEBUG" option. Each entry pairs
 * a user-visible name with its JAY_DBG_* bit and a short description; the
 * table is terminated by DEBUG_NAMED_VALUE_END.
 */
static const struct debug_named_value jay_debug_options[] = {
   { "noopt",       JAY_DBG_NOOPT,       "Disable backend optimizer"             },
   { "printdemand", JAY_DBG_PRINTDEMAND, "Print demand per instruction"          },
   { "spill",       JAY_DBG_SPILL,       "Shrink register file to test spilling" },
   { "sync",        JAY_DBG_SYNC,        "Sync after every instruction"          },
   { "noacc",       JAY_DBG_NOACC,       "Disable accumulator substitution"      },
   DEBUG_NAMED_VALUE_END
};

/* Declares the accessor for the "JAY_DEBUG" flags option, parsed against
 * jay_debug_options with a default value of 0.
 */
DEBUG_GET_ONCE_FLAGS_OPTION(jay_debug, "JAY_DEBUG", jay_debug_options, 0)
/* Global debug flag word; no flags are set initially. NOTE(review): presumably
 * filled in from the accessor above during compiler setup -- confirm.
 */
int jay_debug = 0;

/* Vertex shader payload defs. */
typedef struct jay_vs_payload {
   /* "the maximum limit is 30 elements per vertex" (bspec 56124).
    * Sized as 30 * 4 defs -- presumably 4 components per element.
    */
   jay_def attributes[30 * 4];
} jay_vs_payload;

/* Compute shader payload defs. */
typedef struct jay_cs_payload {
   jay_def local_invocation_ids;
} jay_cs_payload;

/* Fragment shader payload defs. */
typedef struct jay_fs_payload {
   /* One def per barycentric mode, indexed by enum intel_barycentric_mode. */
   jay_def bary[INTEL_BARYCENTRIC_MODE_COUNT];

   struct {
      jay_def xy, z, w;
   } coord;

   jay_def pixel_sample_mask;
   jay_def deltas[64];
} jay_fs_payload;

/* State threaded through the NIR -> Jay translation of a shader. */
struct nir_to_jay_state {
   jay_shader *s;
   jay_function *f;
   const nir_shader *nir;
   const struct intel_device_info *devinfo;

   /* Builder used to emit the translated instructions. */
   jay_builder bld;

   jay_block *current_block;
   jay_block *after_block;
   jay_block *break_block;

   unsigned indent;

   /* We cache ballot(true), ctz(ballot(true)), and 4*ctz(ballot(true)) within a
    * block. If we had competent backend CSE - or emitted uniformize in NIR and
    * taught NIR's CSE about ballots - we could remove this kludge.
    */
   jay_def active_lane_mask, active_lane, active_lane_x4;

   /* These defs contain the extracted payload. They are only valid while
    * translating NIR->Jay since they aren't maintained by Jay passes.
    */
   struct {
      jay_def u0, u1;
      jay_def sampler_state_pointer, scratch_surface;
      jay_def inline_data;
      jay_def push_data[512];
      jay_def lane_id;
      jay_def urb_handle;

      /* Stage-specific payload; only the member for the current stage is
       * meaningful.
       */
      union {
         jay_vs_payload vs;
         jay_cs_payload cs;
         jay_fs_payload fs;
      };
   } payload;
};

/**
 * Returns `len` values of the u1 payload starting at index `idx`, or a null
 * def when the u1 payload itself is null.
 */
static jay_def
payload_u1(struct nir_to_jay_state *nj, unsigned idx, unsigned len)
{
   const jay_def u1 = nj->payload.u1;

   return jay_is_null(u1) ? jay_null() : jay_extract_range(u1, idx, len);
}

static jay_def
emit_active_lane_mask(struct nir_to_jay_state *nj)
{
   /* Note that we don't use mask0 since it needs fixups. Just ballot(true). */
   if (jay_is_null(nj->active_lane_mask)) {
      nj->active_lane_mask = jay_alloc_def(&nj->bld, FLAG, 1);
      jay_MOV(&nj->bld, nj->active_lane_mask, 1);
   }

   return nj->active_lane_mask;
}

static jay_def
emit_active_lane(struct nir_to_jay_state *nj)
{
   /* For this instruction to execute, some lane must be active. Therefore there
    * is a 1 in the lower [dispatch width] bits of the lane mask, so we may
    * equivalently use fbl.u32 instead of fbl.u[dispatch width].
    */
   if (jay_is_null(nj->active_lane)) {
      nj->active_lane = jay_alloc_def(&nj->bld, UGPR, 1);
      jay_FBL(&nj->bld, nj->active_lane, emit_active_lane_mask(nj));
   }

   return nj->active_lane;
}

/**
 * Produces a uniform copy of `x` by shuffling from the first active lane.
 *
 * Values that do not live in the GPR or FLAG files are returned unchanged.
 * Otherwise a new UGPR (or UFLAG, for FLAG inputs) def is allocated and
 * filled with a SHUFFLE indexed by 4 * (first active lane), which is cached
 * in the translation state.
 */
static jay_def
emit_uniformize(struct nir_to_jay_state *nj, jay_def x)
{
   const bool needs_shuffle = x.file == GPR || x.file == FLAG;
   if (!needs_shuffle) {
      return x;
   }

   jay_builder *b = &nj->bld;

   /* Lazily compute the shuffle index (active lane << 2). */
   if (jay_is_null(nj->active_lane_x4)) {
      jay_def lane = emit_active_lane(nj);
      nj->active_lane_x4 = jay_SHL_u32(b, lane, 2);
   }

   jay_def uniform = jay_alloc_def(b, x.file == FLAG ? UFLAG : UGPR, 1);
   jay_SHUFFLE(b, uniform, x, nj->active_lane_x4);
   return uniform;
}

/* Forward declaration; the definition is not in this part of the file. */
static jay_block *jay_emit_cf_list(struct nir_to_jay_state *nj,
                                   struct exec_list *list);

/**
 * Returns true if the entire compute workgroup fits in a single subgroup.
 *
 * This requires a stage that uses workgroups, a workgroup size that is not
 * variable, and a static size no larger than the shader's dispatch width.
 */
static bool
jay_workgroup_is_one_subgroup(jay_builder *b, const nir_shader *nir)
{
   if (!mesa_shader_stage_uses_workgroup(nir->info.stage)) {
      return false;
   }

   if (nir->info.workgroup_size_variable) {
      return false;
   }

   return nir_static_workgroup_size(nir) <= b->shader->dispatch_width;
}

/**
 * Maps the base type of a NIR ALU type to the Jay base type. Booleans are
 * treated as signed integers, like nir_type_int.
 */
static enum jay_type
jay_base_type_for_nir(nir_alu_type nir_type)
{
   const nir_alu_type base = nir_alu_type_get_base_type(nir_type);

   switch (base) {
   case nir_type_float:
      return JAY_TYPE_F;

   case nir_type_uint:
      return JAY_TYPE_U;

   case nir_type_int:
   case nir_type_bool:
      return JAY_TYPE_S;

   default:
      UNREACHABLE("invalid NIR type");
   }
}

/**
 * Picks the register file for a NIR def: 1-bit defs go to a flag file and
 * everything else to a GPR file, using the uniform variant (UFLAG/UGPR) when
 * the def is not divergent.
 */
static enum jay_file
jay_file_for_def(const nir_def *def)
{
   const bool is_flag = def->bit_size == 1;

   if (def->divergent) {
      return is_flag ? FLAG : GPR;
   }

   return is_flag ? UFLAG : UGPR;
}

/**
 * Returns a jay_type for the ALU op's i-th source, combining the base type
 * declared for that input in nir_op_infos with the source's actual bit size.
 * (Useful for conversions and comparisons.)
 */
static enum jay_type
jay_alu_source_type(nir_alu_instr *alu, unsigned i)
{
   const unsigned bit_size = nir_src_bit_size(alu->src[i].src);

   return jay_type(
      jay_base_type_for_nir(nir_op_infos[alu->op].input_types[i]), bit_size);
}

/**
 * Returns the Jay def corresponding to a NIR def: a contiguous range in the
 * def's register file, starting at the NIR index, with one 32-bit word per
 * component for sizes up to 32 bits and proportionally more above that.
 */
static inline jay_def
nj_def(nir_def *def)
{
   /* Components narrower than 32 bits still occupy a full word. */
   const unsigned bits_per_component = MAX2(def->bit_size, 32);
   const unsigned total_bits = def->num_components * bits_per_component;
   const unsigned words = DIV_ROUND_UP(total_bits, 32);

   return jay_contiguous_def(jay_file_for_def(def), def->index, words);
}

/** Returns the Jay def for the SSA value referenced by a NIR source. */
static inline jay_def
nj_src(nir_src src)
{
   nir_def *def = src.ssa;

   return nj_def(def);
}

/**
 * Translates one NIR ALU instruction into Jay instructions.
 *
 * The destination type is derived from the opcode's declared output type and
 * the destination bit size. Only swizzle[0] of each source is consulted, so
 * every source is treated as a scalar (1 word, or 2 words for 64-bit).
 * vecN opcodes are handled in the default case by copying each source into
 * its slot of the destination. Any other unhandled opcode is printed to
 * stderr and hits UNREACHABLE.
 */
static void
jay_emit_alu(struct nir_to_jay_state *nj, nir_alu_instr *alu)
{
   jay_builder *b = &nj->bld;
   jay_def dst = nj_def(&alu->def);

   nir_alu_type nir_type = nir_op_infos[alu->op].output_type;
   enum jay_type base_type = jay_base_type_for_nir(nir_type);
   enum jay_type type = jay_type(base_type, alu->def.bit_size);

   /* Pick the scalar selected by swizzle[0]; 64-bit sources span two words. */
   jay_def src[NIR_ALU_MAX_INPUTS];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
      unsigned len = nir_src_bit_size(alu->src[i].src) == 64 ? 2 : 1;
      src[i] = jay_extract_range(nj_src(alu->src[i].src),
                                 len * alu->src[i].swizzle[0], len);
   }

   switch (alu->op) {
   /* Helper macros for opcodes that map 1:1 onto a builder call. They are
    * all #undef'd again once the table below has been expanded.
    */
#define CMP(op, jay)                                                           \
   case nir_op_##op:                                                           \
      jay_CMP(b, jay_alu_source_type(alu, 0), JAY_CONDITIONAL_##jay, dst,      \
              src[0], src[1]);                                                 \
      break;

#define UNOP(nir, jay_op)                                                      \
   case nir_op_##nir:                                                          \
      jay_##jay_op(b, type, dst, src[0]);                                      \
      break;

#define MATH(nir, jay_op)                                                      \
   case nir_op_##nir:                                                          \
      jay_MATH(b, type, dst, src[0], JAY_MATH_##jay_op);                       \
      break;

#define UNOP_UNTYPED(nir, jay_op)                                              \
   case nir_op_##nir:                                                          \
      jay_##jay_op(b, dst, src[0]);                                            \
      break;

#define BINOP(nir, jay_op)                                                     \
   case nir_op_##nir:                                                          \
      jay_##jay_op(b, type, dst, src[0], src[1]);                              \
      break;

#define DP4A(nir, jay_op, sat_)                                                \
   case nir_op_##nir:                                                          \
      jay_DP4A_##jay_op(b, dst, src[2], src[0], src[1])->saturate = sat_;      \
      break;

      CMP(flt, LT)
      CMP(ilt, LT)
      CMP(ult, LT)
      CMP(fge, GE)
      CMP(ige, GE)
      CMP(uge, GE)
      CMP(feq, EQ)
      CMP(ieq, EQ)
      CMP(fneu, NE)
      CMP(ine, NE)

      MATH(frcp, INV)
      MATH(fexp2, EXP)
      MATH(flog2, LOG)
      MATH(fsin, SIN)
      MATH(fcos, COS)
      MATH(fsqrt, SQRT)
      MATH(frsq, RSQ)
      UNOP(ffract, FRC)
      UNOP(ftrunc, RNDZ)
      UNOP(ffloor, RNDD)
      UNOP(fround_even, RNDE)

      UNOP_UNTYPED(mov, copy)
      UNOP_UNTYPED(unpack_32_2x16_split_x, MOV)
      UNOP_UNTYPED(b2b1, CAST_CANONICAL_TO_FLAG)
      UNOP_UNTYPED(inot, NOT)
      UNOP_UNTYPED(bitfield_reverse, BFREV)
      UNOP_UNTYPED(bit_count, CBIT)
      UNOP_UNTYPED(uclz, LZD)
      UNOP_UNTYPED(find_lsb, FBL)

      BINOP(imin, MIN)
      BINOP(umin, MIN)
      BINOP(fmin, MIN)
      BINOP(imax, MAX)
      BINOP(umax, MAX)
      BINOP(fmax, MAX)
      BINOP(fadd, ADD)
      BINOP(iadd, ADD)
      BINOP(fmul, MUL)
      BINOP(imul_32x16, MUL_32X16)
      BINOP(umul_32x16, MUL_32X16)
      BINOP(ishl, SHL)
      BINOP(ishr, ASR)
      BINOP(ushr, SHR)
      BINOP(urol, ROL)
      BINOP(uror, ROR)
      BINOP(urhadd, AVG)
      BINOP(irhadd, AVG)
      BINOP(iand, AND)
      BINOP(ior, OR)
      BINOP(ixor, XOR)

      DP4A(sdot_4x8_iadd, SS, false)
      DP4A(sdot_4x8_iadd_sat, SS, true)
      DP4A(udot_4x8_uadd, UU, false)
      DP4A(udot_4x8_uadd_sat, UU, true)
      DP4A(sudot_4x8_iadd, SU, false)
      DP4A(sudot_4x8_iadd_sat, SU, true)

#undef CMP
#undef UNOP
#undef MATH
#undef UNOP_UNTYPED
#undef BINOP
#undef DP4A

   case nir_op_imul:
      if (jay_type_size_bits(type) == 32) {
         jay_MUL_32(b, type, dst, src[0], src[1], false);
      } else {
         jay_MUL(b, type, dst, src[0], src[1]);
      }

      break;

   case nir_op_imul_high:
   case nir_op_umul_high:
      jay_MUL_32(b, type, dst, src[0], src[1], true);
      break;

   case nir_op_bfm:
      jay_BFI1(b, dst, src[0], src[1]);
      break;

   case nir_op_b2f64:
      /* 0x3ff00000 is the high word of the double 1.0; the low word is 0. */
      jay_SEL(b, JAY_TYPE_U32, jay_extract(dst, 1), 0x3ff00000, 0, src[0]);
      jay_MOV(b, jay_extract(dst, 0), 0);
      break;

   case nir_op_ufind_msb_rev:
   case nir_op_ifind_msb_rev:
      jay_FBH(b, jay_alu_source_type(alu, 0), dst, src[0]);
      break;

   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
      assert(nir_src_bit_size(alu->src[0].src) > 1 &&
             "predicate conversions are lowered");

      if (alu->def.bit_size <= nir_src_bit_size(alu->src[0].src)) {
         /* Downconversion. Upper bits garbage convention makes this a no-op.
          * The extract handles 64->32 narrowing conversions.
          */
         jay_MOV(b, dst, jay_extract(src[0], 0));
         break;
      }

      /* Widening integer conversions share the generic CVT path below. */
      FALLTHROUGH;
   case nir_op_i2f64:
   case nir_op_i2i64:
   case nir_op_u2u64:
   case nir_op_u2f64:
   case nir_op_f2f64:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_f2i32:
   case nir_op_f2u32:
   case nir_op_f2i32_sat:
   case nir_op_f2u32_sat:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_f2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_f2f16:
   case nir_op_f2i16:
   case nir_op_f2u16:
   case nir_op_f2i8:
   case nir_op_f2u8: {
      enum jay_type src_type = jay_alu_source_type(alu, 0);

      /* UGPR byte to float is not supported. Do it in 2 steps. */
      if (jay_type_size_bits(src_type) == 8 &&
          jay_base_type(type) == JAY_TYPE_F &&
          dst.file == UGPR) {

         enum jay_type integer = jay_type_rebase(type, jay_base_type(src_type));
         jay_def tmp = jay_alloc_def(b, UGPR, 1);
         jay_CVT(b, integer, tmp, src[0], src_type, JAY_ROUND, 0);
         jay_CVT(b, type, dst, tmp, integer, JAY_ROUND, 0);
      } else {
         jay_CVT(b, type, dst, src[0], src_type, JAY_ROUND, 0);
      }

      break;
   }

   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
      jay_CVT(b, JAY_TYPE_F16, dst, src[0], jay_alu_source_type(alu, 0),
              alu->op == nir_op_f2f16_rtz ? JAY_RTZ : JAY_RNE, 0);
      break;

   case nir_op_fsat:
      jay_MODIFIER(b, type, dst, src[0])->saturate = true;
      break;

   case nir_op_fneg:
   case nir_op_ineg:
      jay_MODIFIER(b, type, dst, jay_negate(src[0]));
      break;

   case nir_op_fabs:
   case nir_op_iabs:
      jay_MODIFIER(b, type, dst, jay_abs(src[0]));
      break;

   case nir_op_iadd3:
      jay_ADD3(b, type, dst, src[0], src[1], src[2]);
      break;

   case nir_op_uadd_sat:
   case nir_op_iadd_sat:
      jay_ADD(b, type, dst, src[0], src[1])->saturate = true;
      break;

   case nir_op_usub_sat:
   case nir_op_isub_sat:
      jay_ADD(b, type, dst, src[0], jay_negate(src[1]))->saturate = true;
      break;

   case nir_op_ihadd:
   case nir_op_uhadd: {
      /* AVG(x, y) - ((x ^ y) & 1) */
      jay_def avg = jay_alloc_def(b, dst.file, 1);
      jay_def bfn = jay_alloc_def(b, dst.file, 1);
      jay_AVG(b, type, avg, src[0], src[1]);
      jay_BFN(b, bfn, 1, src[0], src[1], UTIL_LUT3(a & (b ^ c)));
      jay_ADD(b, type, dst, avg, jay_negate(bfn));
      break;
   }

   case nir_op_unpack_64_2x32_split_x:
      jay_MOV(b, dst, jay_extract(src[0], 0));
      break;
   case nir_op_unpack_64_2x32_split_y:
      jay_MOV(b, dst, jay_extract(src[0], 1));
      break;
   case nir_op_unpack_32_2x16_split_y:
      jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND, 1);
      break;

   case nir_op_pack_32_4x8_split: {
      /* TODO: Optimize */
      jay_def r = jay_BFI2_u32(b, 0x0000ff00, src[1], src[0]);
      r = jay_BFI2_u32(b, 0x00ff0000, src[2], r);
      jay_BFI2(b, dst, 0xff000000, src[3], r);
      break;
   }

   case nir_op_pack_32_2x16_split:
      if (nir_src_is_const(alu->src[0].src) &&
          nir_alu_src_as_uint(alu->src[0]) == 0) {

         /* pack_32_2x16_split(0, x) is just a shift. This saves a constant. */
         jay_SHL(b, JAY_TYPE_U32, dst, src[1], 16);
      } else {
         /* TODO: Optimize */
         jay_BFI2(b, dst, 0xffff0000, src[1], src[0]);
      }
      break;

   case nir_op_pack_64_2x32_split:
      jay_MOV(b, jay_extract(dst, 0), src[0]);
      jay_MOV(b, jay_extract(dst, 1), src[1]);
      break;

   case nir_op_bitfield_select:
      assert(jay_type_size_bits(type) <= 32);
      jay_BFN(b, dst, src[0], src[1], src[2], UTIL_LUT3((a & b) | (~a & c)));
      break;

   case nir_op_ubfe:
   case nir_op_ibfe:
      jay_BFE(b, type, dst, src[0], src[1], src[2]);
      break;
   case nir_op_bfi:
      jay_BFI2(b, dst, src[0], src[1], src[2]);
      break;

   case nir_op_ffma:
      jay_MAD(b, type, dst, src[0], src[1], src[2]);
      break;

   case nir_op_fcsel:
      jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod =
         JAY_CONDITIONAL_NE;
      break;

   case nir_op_fcsel_gt:
   case nir_op_i32csel_gt:
      jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod =
         JAY_CONDITIONAL_GT;
      break;

   case nir_op_fcsel_ge:
   case nir_op_i32csel_ge:
      jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod =
         JAY_CONDITIONAL_GE;
      break;

   case nir_op_bcsel:
      assert(alu->def.bit_size < 64);
      assert(jay_is_flag(src[0]));

      /* sel.s32 can propagate more modifiers than sel.u32 with no drawback */
      type = jay_type_rebase(type, JAY_TYPE_S);

      /* b2i8 gets lowered into 8-bit csel. Just use the upper bits garbage
       * convention to implement with SEL.u16 instead.
       */
      if (type == JAY_TYPE_S8) {
         type = JAY_TYPE_S16;
      }

      /* SEL.f32 flushes denorms but SEL.u32 does not, so we can only use the
       * float types when we are used only as a float. We care about the uses
       * and not the sources here, to ensure we pick u32 instead of f32 for:
       *
       *    ieq(1, bcsel(a, fneg(b), c))
       *
       * Picking sel.f32 would incorrectly "flush" the integer c. However, when
       * we can use sel.f32, we prefer it since it usually gives more
       * flexibility for modifiers and saturation.
       */
      if (is_only_used_as_float(alu)) {
         type = jay_type_rebase(type, JAY_TYPE_F);
      }

      jay_SEL(b, type, dst, src[1], src[2], src[0]);
      break;

   case nir_op_extract_u8:
      jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U8, JAY_ROUND,
              nir_alu_src_as_uint(alu->src[1]));
      break;

   case nir_op_extract_i8:
      jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S8, JAY_ROUND,
              nir_alu_src_as_uint(alu->src[1]));
      break;

   case nir_op_extract_u16:
      jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND,
              nir_alu_src_as_uint(alu->src[1]));
      break;

   case nir_op_extract_i16:
      jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S16, JAY_ROUND,
              nir_alu_src_as_uint(alu->src[1]));
      break;

   default:
      /* vecN: copy each scalar source into its slot of the destination. */
      if (nir_op_is_vec(alu->op)) {
         for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
            unsigned len = jay_type_vector_length(type);
            jay_copy(b, jay_extract_range(dst, len * i, len), src[i]);
         }

         break;
      }

      nir_print_instr(&alu->instr, stderr);
      fprintf(stderr, "\n");
      UNREACHABLE("unhandled instruction");
   }
}

/**
 * Emits a scalar NIR constant. A 64-bit constant with a non-zero upper half
 * uses MOV_IMM64; everything else is a plain MOV of the low 32 bits.
 */
static void
jay_emit_load_const(struct nir_to_jay_state *nj, nir_load_const_instr *lc)
{
   jay_builder *b = &nj->bld;
   jay_def dst = nj_def(&lc->def);
   assert(lc->def.num_components == 1 && "must be scalarized");

   const bool needs_imm64 =
      lc->def.bit_size == 64 && (lc->value[0].u64 >> 32) != 0;

   if (!needs_imm64) {
      jay_MOV(b, dst, lc->value[0].u32);
      return;
   }

   jay_MOV_IMM64(b, dst, lc->value[0].u64);
}

/**
 * Resolves a NIR resource source into either a constant index or a def.
 *
 * A NULL `nsrc` yields a null def. Otherwise:
 *  - A constant source is stored in *bti_const and a null def is returned.
 *  - A source not produced by resource_intel is returned as-is.
 *  - For resource_intel, the internal/bindless access flags are reported via
 *    the optional `internal` / `bindless` outputs, and the intrinsic's second
 *    source is resolved the same way (constant -> *bti_const, else its def).
 */
static jay_def
jay_resource_handle(jay_builder *b,
                    nir_src *nsrc,
                    unsigned *bti_const,
                    bool *internal,
                    bool *bindless)
{
   if (!nsrc) {
      return jay_null();
   }

   if (nir_src_is_const(*nsrc)) {
      *bti_const = nir_src_as_uint(*nsrc);
      return jay_null();
   }

   nir_intrinsic_instr *res = nir_src_as_intrinsic(*nsrc);
   const bool is_resource =
      res && res->intrinsic == nir_intrinsic_resource_intel;

   if (!is_resource) {
      return nj_src(*nsrc);
   }

   const uint32_t access = nir_intrinsic_resource_access_intel(res);

   if (internal) {
      *internal = (access & nir_resource_intel_internal) != 0;
   }

   if (bindless) {
      *bindless = (access & nir_resource_intel_bindless) != 0;
   }

   nir_src *handle = &res->src[1];
   if (!nir_src_is_const(*handle)) {
      return nj_src(*handle);
   }

   *bti_const = nir_src_as_uint(*handle);
   return jay_null();
}

/**
 * Maps the intrinsic's NIR memory semantics to an LSC flush type:
 * acquire -> invalidate, release -> clean, acquire+release -> evict.
 * Any other value (including make-available / make-visible) is unexpected.
 */
static inline enum lsc_flush_type
translate_flush_type(nir_intrinsic_instr *intr)
{
   const unsigned semantics = nir_intrinsic_memory_semantics(intr);

   if (semantics == NIR_MEMORY_ACQUIRE) {
      return LSC_FLUSH_TYPE_INVALIDATE;
   }

   if (semantics == NIR_MEMORY_RELEASE) {
      return LSC_FLUSH_TYPE_CLEAN;
   }

   if (semantics == NIR_MEMORY_ACQ_REL) {
      return LSC_FLUSH_TYPE_EVICT;
   }

   UNREACHABLE("unexpected memory semantic");
}

/**
 * Emits an LSC fence SEND on the given SFID for a barrier intrinsic.
 *
 * The fence scope is TILE when the NIR memory scope is at least
 * SCOPE_QUEUE_FAMILY, and THREADGROUP otherwise. SLM fences use no flush;
 * other SFIDs derive the flush type from the intrinsic's memory semantics.
 */
static void
emit_lsc_fence(struct nir_to_jay_state *nj,
               nir_intrinsic_instr *intr,
               enum brw_sfid sfid)
{
   jay_builder *b = &nj->bld;

   enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP;
   if (nir_intrinsic_memory_scope(intr) >= SCOPE_QUEUE_FAMILY) {
      scope = LSC_FENCE_TILE;
   }

   enum lsc_flush_type flush = LSC_FLUSH_TYPE_NONE;
   if (sfid != BRW_SFID_SLM) {
      flush = translate_flush_type(intr);
   }

   /* The fence writes a notification; give it a GRF-sized uniform def. */
   jay_def notif = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(nj->s));
   uint32_t desc = lsc_fence_msg_desc(nj->s->devinfo, scope, flush, false);

   jay_SEND(b, .sfid = sfid, .msg_desc = desc, .srcs = &nj->payload.u0,
            .nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true, .dst = notif);
}

/**
 * Emits the LSC fences required by a NIR memory barrier, one per affected
 * memory kind: TGM for images, UGM for SSBO/global, URB for shader outputs
 * and task payload, and SLM for shared memory. The SLM fence is skipped when
 * the whole workgroup is a single subgroup.
 */
static void
jay_emit_memory_barrier(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
{
   const nir_variable_mode modes = nir_intrinsic_memory_modes(intr);

   const bool image = modes & nir_var_image;
   const bool untyped = modes & (nir_var_mem_ssbo | nir_var_mem_global);
   const bool urb = modes & (nir_var_shader_out | nir_var_mem_task_payload);
   const bool shared = modes & nir_var_mem_shared;

   if (image) {
      emit_lsc_fence(nj, intr, BRW_SFID_TGM);
      assert(!nj->nir->info.use_lowered_image_to_global && "fix common code");
   }

   if (untyped) {
      emit_lsc_fence(nj, intr, BRW_SFID_UGM);
   }

   if (urb) {
      emit_lsc_fence(nj, intr, BRW_SFID_URB);
   }

   if (shared && !jay_workgroup_is_one_subgroup(&nj->bld, nj->nir)) {
      emit_lsc_fence(nj, intr, BRW_SFID_SLM);
   }
}

/**
 * Emits the message-gateway SEND that signals a workgroup barrier.
 */
static void
jay_emit_signal_barrier(jay_builder *b, struct nir_to_jay_state *nj)
{
   /* Signal barrier / Active threads only (BSpec 72052).
    *
    * Source 0 carries the number of subgroups in [31:24]; it is taken from
    * bits [31:24] of the u0.2 payload. Isolate that byte and replicate it
    * into [23:16] as well.
    *
    * TODO: This can be done faster with a SIMD2 8-bit move.
    */
   jay_def u0_2 = jay_extract(nj->payload.u0, 2);
   jay_def count_hi = jay_AND_u32(b, u0_2, 0xff000000);
   jay_def count_lo = jay_SHR_u32(b, count_hi, 8);
   jay_def payload = jay_OR_u32(b, count_hi, count_lo);

   /* Use an active threads only barrier. TODO: I think we can optimize. */
   if (b->shader->devinfo->ver >= 20) {
      payload = jay_OR_u32(b, payload, BITFIELD_BIT(8));
   }

   /* Build a 3-value message whose third value is the payload; the first
    * two indices stay zero.
    */
   uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
   indices[2] = jay_index(payload);
   jay_def msg = jay_collect(b, UGPR, indices, 3);

   jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY,
            .msg_desc = BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG, .srcs = &msg,
            .nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true);
}

/**
 * Emits a screen-space derivative as the difference of two quad swizzles of
 * the intrinsic's source:
 *
 *    dst = quad_swizzle(val, swz1) - quad_swizzle(val, swz0)
 *
 * Only 32-bit values are supported for now.
 */
static void
jay_emit_derivative(jay_builder *b,
                    jay_def dst,
                    nir_intrinsic_instr *intr,
                    enum jay_quad_swizzle swz0,
                    enum jay_quad_swizzle swz1)
{
   assert(intr->def.bit_size == 32 && "todo");
   jay_def val = nj_src(intr->src[0]);

   /* Emit the swizzles as separate statements. Nesting both builder calls
    * as arguments of the ADD would leave their relative emission order (and
    * hence SSA numbering) up to the compiler, since C does not specify the
    * order in which function arguments are evaluated.
    */
   jay_def minuend = jay_QUAD_SWIZZLE_u32(b, val, swz1);
   jay_def subtrahend = jay_QUAD_SWIZZLE_u32(b, val, swz0);

   jay_ADD(b, JAY_TYPE_F32, dst, minuend, jay_negate(subtrahend));
}

/**
 * Emits a render-cache SEND for a framebuffer write intrinsic.
 *
 * src[0] is the payload vector (one GPR source per component), src[1] is a
 * 2-component constant giving the low and high 32 bits of the message
 * descriptor, and src[2] is the "disable" predicate: the SEND executes only
 * where it is false.
 */
static void
jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr)
{
   jay_def data = nj_src(intr->src[0]);
   jay_def srcs[8];

   /* The payload is copied into a fixed-size array; make sure it fits. */
   const unsigned nr_srcs = nir_src_num_components(intr->src[0]);
   assert(nr_srcs <= ARRAY_SIZE(srcs) && "fb write payload too large");

   /* Optimize unconditional discards. Should probably do this in NIR. */
   bool trivial =
      nir_src_is_const(intr->src[2]) && nir_src_as_bool(intr->src[2]);

   for (unsigned i = 0; i < nr_srcs; ++i) {
      srcs[i] =
         trivial ? jay_UNDEF_u32(b) : jay_as_gpr(b, jay_extract(data, i));
   }

   /* Assemble the 64-bit descriptor from the two constant halves. */
   const uint64_t desc_lo = nir_scalar_as_uint(
      nir_scalar_chase_movs(nir_get_scalar(intr->src[1].ssa, 0)));
   const uint64_t desc_hi = nir_scalar_as_uint(
      nir_scalar_chase_movs(nir_get_scalar(intr->src[1].ssa, 1)));

   jay_inst *send =
      jay_SEND(b, .sfid = BRW_SFID_RENDER_CACHE, .check_tdr = true,
               .msg_desc = desc_lo | (desc_hi << 32), .srcs = srcs,
               .nr_srcs = nr_srcs, .type = JAY_TYPE_U32,
               .eot = nir_intrinsic_eot(intr));

   /* Handle the disable predicate. It is logically inverted. A constant
    * false predicate needs no predication at all.
    */
   if (!nir_src_is_const(intr->src[2]) || trivial) {
      jay_add_predicate(b, send, jay_negate(nj_src(intr->src[2])));
   }
}

/**
 * Maps a bit size to the LSC data size encoding, keyed on the whole number
 * of bytes (bit_size / 8). 8- and 16-bit sizes use the variants that are
 * widened to 32 bits (D8U32 / D16U32).
 */
static enum lsc_data_size
lsc_bits_to_data_size(unsigned bit_size)
{
   const unsigned bytes = bit_size / 8;

   switch (bytes) {
   case 1:
      return LSC_DATA_SIZE_D8U32;
   case 2:
      return LSC_DATA_SIZE_D16U32;
   case 4:
      return LSC_DATA_SIZE_D32;
   case 8:
      return LSC_DATA_SIZE_D64;
   default:
      UNREACHABLE("Unsupported data size.");
   }
}

/**
 * Maps a NIR atomic operation to the corresponding LSC atomic opcode.
 * Note that xchg is implemented with LSC_OP_ATOMIC_STORE.
 */
static enum lsc_opcode
lsc_op_for_atomic(nir_atomic_op op)
{
   switch (op) {
   /* Integer arithmetic / min / max */
   case nir_atomic_op_iadd:
      return LSC_OP_ATOMIC_ADD;
   case nir_atomic_op_imin:
      return LSC_OP_ATOMIC_MIN;
   case nir_atomic_op_umin:
      return LSC_OP_ATOMIC_UMIN;
   case nir_atomic_op_imax:
      return LSC_OP_ATOMIC_MAX;
   case nir_atomic_op_umax:
      return LSC_OP_ATOMIC_UMAX;

   /* Bitwise */
   case nir_atomic_op_iand:
      return LSC_OP_ATOMIC_AND;
   case nir_atomic_op_ior:
      return LSC_OP_ATOMIC_OR;
   case nir_atomic_op_ixor:
      return LSC_OP_ATOMIC_XOR;

   /* Exchange */
   case nir_atomic_op_xchg:
      return LSC_OP_ATOMIC_STORE;
   case nir_atomic_op_cmpxchg:
      return LSC_OP_ATOMIC_CMPXCHG;

   /* Floating point */
   case nir_atomic_op_fmin:
      return LSC_OP_ATOMIC_FMIN;
   case nir_atomic_op_fmax:
      return LSC_OP_ATOMIC_FMAX;
   case nir_atomic_op_fcmpxchg:
      return LSC_OP_ATOMIC_FCMPXCHG;
   case nir_atomic_op_fadd:
      return LSC_OP_ATOMIC_FADD;

   default:
      UNREACHABLE("Unsupported NIR atomic");
   }
}

/**
 * Prepares a SEND source for the given destination file.
 *
 * For a GPR destination the source is simply converted to a GPR. For a UGPR
 * destination the (uniform) source is first copied into UGPRs if needed and
 * then re-collected so that each element of `element_sz` values starts at a
 * multiple of jay_ugpr_per_grf(); the index slots in between are left zero.
 */
static jay_def
jay_src_as_strided(jay_builder *b,
                   jay_def x,
                   unsigned element_sz,
                   enum jay_file dst_file)
{
   if (dst_file != UGPR) {
      /* Could be a GPR or UGPR source */
      assert(dst_file == GPR);
      return jay_as_gpr(b, x);
   }

   assert(jay_is_uniform(x) && "Uniform dests require uniform sources");

   /* Move the value into the UGPR file if it is not already there. */
   if (x.file != UGPR) {
      jay_def tmp = jay_alloc_def(b, UGPR, jay_num_values(x));
      jay_copy(b, tmp, x);
      x = tmp;
   }

   const unsigned stride = jay_ugpr_per_grf(b->shader);
   const unsigned nr_values = jay_num_values(x);
   const unsigned nr_elements = nr_values / element_sz;

   uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
   unsigned nr = nr_values * stride;
   assert(nr < ARRAY_SIZE(indices));

   for (unsigned elem = 0; elem < nr_elements; ++elem) {
      const unsigned dst_base = elem * stride;
      const unsigned src_base = elem * element_sz;

      for (unsigned c = 0; c < element_sz; ++c) {
         indices[dst_base + c] = jay_channel(x, src_base + c);
      }
   }

   return jay_collect(b, UGPR, indices, nr);
}

/**
 * Returns the cached scratch surface def, computing it on first use as
 * u0.5 with its low 10 bits cleared. The computation is emitted with a
 * builder positioned by jay_before_function(), not at the current cursor.
 */
static jay_def
jay_scratch_surface(struct nir_to_jay_state *nj)
{
   if (!jay_is_null(nj->payload.scratch_surface)) {
      return nj->payload.scratch_surface;
   }

   jay_function *func = nj->f;
   assert(func->is_entrypoint && "todo: this needs ABI");

   jay_builder b = jay_init_builder(func, jay_before_function(func));
   jay_def u0_5 = jay_extract(nj->payload.u0, 5);
   jay_def surface = jay_AND_u32(&b, u0_5, ~BITFIELD_MASK(10));

   nj->payload.scratch_surface = surface;
   return surface;
}

/**
 * Emits the LSC SEND for a NIR memory intrinsic (load, store or atomic).
 *
 * The target SFID is chosen from the intrinsic (SLM for shared access, TGM
 * for image intrinsics, URB for the URB stores, UGM otherwise). The LSC
 * opcode, surface type, address size, cache controls and immediate base
 * offset are then packed into the message descriptor, with the surface
 * handle going either into the descriptor or the extended descriptor.
 *
 * Loads whose destination is a UGPR are sent as "uniform" messages into a
 * padded temporary and copied back to the real destination afterwards.
 */
static void
jay_emit_mem_access(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
{
   jay_builder *b = &nj->bld;
   bool slm = nir_is_shared_access(intr);
   bool tgm = nir_intrinsic_has_image_dim(intr);
   bool urb = intr->intrinsic == nir_intrinsic_store_urb_lsc_intel ||
              intr->intrinsic == nir_intrinsic_store_urb_vec4_intel;
   enum brw_sfid sfid = slm ? BRW_SFID_SLM :
                        tgm ? BRW_SFID_TGM :
                        urb ? BRW_SFID_URB :
                              BRW_SFID_UGM;

   /* A data source is present for stores and atomics; NULL means a load. */
   nir_src *data_src = nir_get_io_data_src(intr);
   bool scratch = intr->intrinsic == nir_intrinsic_load_scratch_intel ||
                  intr->intrinsic == nir_intrinsic_store_scratch_intel;

   enum lsc_opcode op;
   if (nir_intrinsic_has_atomic_op(intr))
      op = lsc_op_for_atomic(nir_intrinsic_atomic_op(intr));
   else if (sfid == BRW_SFID_TGM)
      op = data_src ? LSC_OP_STORE_CMASK : LSC_OP_LOAD_CMASK;
   else
      op = data_src ? LSC_OP_STORE : LSC_OP_LOAD;

   nir_src *bti = nir_get_io_index_src(intr), *ubo = NULL;
   nir_src *offset_src = tgm ? &intr->src[1] : nir_get_io_offset_src(intr);

   /* UBO loads keep their index source in `ubo` instead of `bti`. */
   if (intr->intrinsic == nir_intrinsic_load_ubo ||
       intr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel) {
      ubo = bti;
      bti = NULL;
      b->shader->prog_data->base.has_ubo_pull = true;
   }

   const struct intel_device_info *devinfo = b->shader->devinfo;
   bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest;
   jay_def data = data_src ? nj_src(*data_src) : jay_null();
   unsigned bti_const = 0;
   bool internal = false;
   bool bindless = false;
   jay_def bti_indirect =
      jay_resource_handle(b, bti ?: ubo, &bti_const, &internal, &bindless);
   jay_def offset = nj_src(*offset_src);
   /* The NIR value that describes the data: stored value or loaded def. */
   nir_def *ndata = data_src ? data_src->ssa : &intr->def;
   jay_def dst = has_dest ? nj_def(&intr->def) : jay_null();
   int32_t base_offset =
      nir_intrinsic_has_base(intr) ? nir_intrinsic_base(intr) : 0;

   /* Optimize increment/decrement */
   if (op == LSC_OP_ATOMIC_ADD && nir_src_is_const(*data_src)) {
      int64_t add_val = nir_src_as_int(*data_src);
      if (add_val == 1 || add_val == -1) {
         op = add_val == 1 ? LSC_OP_ATOMIC_INC : LSC_OP_ATOMIC_DEC;
         data = jay_null();
      }
   }

   /* Pack the coordinates. TODO: MSAA */
   if (tgm) {
      unsigned nr = nir_image_intrinsic_coord_components(intr);
      offset = jay_extract_range(offset, 0, nr);
   }

   /* Scratch accesses always use the SS surface type. */
   internal |= scratch;
   enum lsc_addr_surface_type surf_type = internal     ? LSC_ADDR_SURFTYPE_SS :
                                          bindless     ? LSC_ADDR_SURFTYPE_BSS :
                                          (bti || ubo) ? LSC_ADDR_SURFTYPE_BTI :
                                                         LSC_ADDR_SURFTYPE_FLAT;

   /* Only flat UGM accesses use 64-bit addresses. */
   bool a64 = surf_type == LSC_ADDR_SURFTYPE_FLAT && sfid == BRW_SFID_UGM;
   enum lsc_addr_size addr_size = a64 ? LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32;
   enum jay_type offset_type = a64 ? JAY_TYPE_U64 : JAY_TYPE_U32;

   bool cmask = op == LSC_OP_LOAD_CMASK || op == LSC_OP_STORE_CMASK;
   /* With a destination, the message is uniform iff the destination is a
    * UGPR. Without one, the checks below refine this from the sources.
    */
   bool uniform = !(has_dest && dst.file != UGPR);

   if (nir_intrinsic_has_align(intr)) {
      assert(nir_intrinsic_align(intr) >= (ndata->bit_size / 8));
   }

   if (!has_dest) {
      uniform &= jay_is_null(data) || data.file == UGPR;
      uniform &= jay_is_null(offset) || offset.file == UGPR;
      uniform &= !(cmask || urb);
   }

   /* Per bspec 57330, 8-bit/16-bit are not supported for transpose */
   bool transpose = uniform && !cmask && ndata->bit_size >= 32;
   bool scalar_uniform = uniform && !cmask && ndata->bit_size < 32;

   if (!uniform) {
      offset = jay_as_gpr(b, offset);
   } else if (!transpose) {
      offset = jay_src_as_strided(b, offset, a64 ? 2 : 1, UGPR);
   }

   if (!jay_is_null(data) && !transpose && !scalar_uniform)
      data = jay_as_gpr(b, data);

   unsigned access =
      nir_intrinsic_has_access(intr) ? nir_intrinsic_access(intr) : 0;

   bool volatile_access = access & ACCESS_VOLATILE;
   bool coherent_access = access & ACCESS_COHERENT;

   /* Bspec: Atomic instruction -> Cache section:
    *
    *    Atomic messages are always forced to "un-cacheable" in the L1
    *    cache.
    *
    * Bspec: Overview of memory Access:
    *
    *   If a read from a Null tile gets a cache-hit in a virtually-addressed
    *   GPU cache, then the read may not return zeroes.
    *
    * If a shader writes to a null tile and wants to be able to read it back
    * as zero, it will use the 'volatile' decoration for the access, otherwise
    * the compiler may choose to optimize things out, breaking the
    * residencyNonResidentStrict guarantees. Due to the above, we need to make
    * these operations uncached.
    */
   unsigned cache =
      urb ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) :
      lsc_opcode_is_atomic(op) ?
            LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
      volatile_access ?
            (devinfo->ver >= 20 ?
                /* Xe2 has a better L3 that can deal with null tiles.*/
                (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
                             LSC_CACHE(devinfo, LOAD, L1UC_L3C)) :
                /* On older platforms, all caches have to be bypassed. */
                (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) :
                             LSC_CACHE(devinfo, LOAD, L1UC_L3UC))) :
            /* Skip L1 for coherent accesses */
         coherent_access ? (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
                                        LSC_CACHE(devinfo, LOAD, L1UC_L3C)) :
      !has_dest          ? LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) :
                           LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS);

   /* The base offset must fit the immediate offset field for this surface
    * type, and typed (TGM) messages must not carry one at all.
    */
   unsigned max_imm_bits = brw_max_immediate_offset_bits(surf_type);
   assert(base_offset >= u_intN_min(max_imm_bits));
   assert(base_offset <= u_intN_max(max_imm_bits));
   assert(base_offset == 0 || sfid != BRW_SFID_TGM);

   const unsigned base_offs_bits =
      util_bitpack_sint(base_offset, 0, max_imm_bits - 1);

   unsigned nr = ndata->num_components;
   uint64_t desc =
      lsc_msg_desc(devinfo, op, surf_type, addr_size,
                   lsc_bits_to_data_size(ndata->bit_size),
                   cmask ? BITFIELD_MASK(nr) : nr, transpose, cache);

   /* Unlike most SENDs, we may skip the destination of atomics. We do this here
    * instead of DCE so we don't need to fix up message descriptors later.
    */
   if (nir_intrinsic_has_atomic_op(intr) && nir_def_is_unused(&intr->def)) {
      dst = jay_null();
   }

   /* The SEND writes `tmp`; for UGPR destinations this is a padded
    * temporary that is copied into `dst` after the SEND.
    */
   jay_def tmp = dst;

   if (dst.file == UGPR) {
      if (transpose) {
         /* Transpose writes whole GRFs, so round up */
         tmp = jay_alloc_def(b, UGPR,
                             ALIGN_POT(jay_num_values(dst),
                                       jay_ugpr_per_grf(b->shader)));
      } else {
         /* Without transpose we write at GRF granularity. Pad out. */
         tmp = jay_alloc_def(b, UGPR,
                             jay_ugpr_per_grf(b->shader) * jay_num_values(dst));
      }
   }

   jay_def srcs[] = { offset, data };

   /* Second data source immediately follows the first */
   if (op == LSC_OP_ATOMIC_CMPXCHG || op == LSC_OP_ATOMIC_FCMPXCHG) {
      jay_def data2 = nj_src(*(data_src + 1));

      if (!transpose) {
         data2 = jay_as_gpr(b, data2);
      }

      srcs[1] = jay_collect_two(b, data, data2);
   }

   /* Select where the surface handle and base offset are encoded: a
    * register extended descriptor (scratch and indirect handles) or the
    * upper 32 bits of the immediate descriptor (flat and constant BTI).
    */
   jay_def ex_desc = jay_null();
   uint32_t ex_desc_imm = 0;
   if (scratch) {
      /* TODO: Once we have an address register RA, we should CSE these */
      ex_desc = jay_alloc_def(b, J_ADDRESS, 1);
      jay_SHR(b, JAY_TYPE_U32, ex_desc, jay_scratch_surface(nj), 4);

      /* Scratch loads are counted as fills, scratch stores as spills. */
      if (has_dest) {
         b->shader->fills++;
      } else {
         b->shader->spills++;
      }
   } else if (surf_type == LSC_ADDR_SURFTYPE_FLAT) {
      desc |= ((uint64_t) lsc_flat_ex_desc(devinfo, base_offs_bits) << 32);
   } else if (jay_is_null(bti_indirect)) {
      desc |=
         ((uint64_t) lsc_bti_ex_desc(devinfo, bti_const, base_offs_bits) << 32);
   } else if (!jay_is_null(bti_indirect)) {
      ex_desc = bti_indirect;

      if (surf_type == LSC_ADDR_SURFTYPE_SS ||
          surf_type == LSC_ADDR_SURFTYPE_BSS) {
         ex_desc_imm = SET_BITS(GET_BITS(base_offs_bits, 16, 4), 31, 19) |
                       SET_BITS(GET_BITS(base_offs_bits, 3, 0), 15, 12);
      } else {
         /* TODO: Move the SHL to NIR for CSE? */
         assert(surf_type == LSC_ADDR_SURFTYPE_BTI);
         assert(base_offs_bits == 0);
         ex_desc = jay_SHL_u32(b, bti_indirect, 24);
      }
   }

   /* Data narrower than 32 bits is handled as 32-bit values in registers. */
   enum jay_type data_type = jay_type(JAY_TYPE_U, MAX2(ndata->bit_size, 32));
   jay_SEND(b, .sfid = sfid, .msg_desc = desc, .srcs = srcs,
            .nr_srcs = jay_is_null(data) ? 1 : 2, .dst = tmp, .type = data_type,
            .src_type = { offset_type, data_type }, .uniform = uniform,
            .bindless = surf_type == LSC_ADDR_SURFTYPE_BSS, .ex_desc = ex_desc,
            .ex_desc_imm = ex_desc_imm);

   /* Copy the padded temporary back into the real destination. */
   if (has_dest && !jay_defs_equivalent(tmp, dst)) {
      jay_copy_strided(b, dst, tmp, !transpose);
   }
}

/**
 * Copy the payload barycentrics selected by \p mode into the destination of
 * a load_barycentric_* intrinsic.
 *
 * \p mode names the perspective variant.  When the intrinsic asks for
 * noperspective interpolation, the matching entry of nj->payload.fs.bary is
 * found by offsetting the index by INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL.
 * Only smooth and noperspective interpolation are accepted.
 */
static void
jay_emit_barycentric(struct nir_to_jay_state *nj,
                     nir_intrinsic_instr *intr,
                     enum intel_barycentric_mode mode)
{
   assert(nj->s->stage == MESA_SHADER_FRAGMENT);

   const enum glsl_interp_mode interp = nir_intrinsic_interp_mode(intr);
   assert(interp == INTERP_MODE_NOPERSPECTIVE || interp == INTERP_MODE_SMOOTH);

   /* Index bias that moves from the perspective to the nonperspective slot */
   const unsigned bias = interp == INTERP_MODE_NOPERSPECTIVE ?
                            INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL :
                            0;

   jay_copy(&nj->bld, nj_def(&intr->def), nj->payload.fs.bary[mode + bias]);
}

/**
 * Translate a single NIR intrinsic into Jay IR using the builder in \p nj.
 *
 * Memory-like intrinsics (loads, stores, atomics, image access) are forwarded
 * to jay_emit_mem_access().  The remaining cases are lowered inline, mostly by
 * reading values out of the thread payload recorded in nj->payload.
 *
 * An intrinsic without a case here is a compiler bug: in debug builds its name
 * is printed to stderr before UNREACHABLE is hit.
 */
static void
jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
{
   jay_shader *s = nj->s;
   jay_function *f = nj->f;
   jay_builder *b = &nj->bld;

   /* Only non-NULL for compute-like stages; cases that need it assert(cs). */
   jay_cs_payload *cs =
      mesa_shader_stage_is_compute(s->stage) ? &nj->payload.cs : NULL;

   /* Intrinsics without a destination get a null def. */
   const bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest;
   jay_def dst = has_dest ? nj_def(&intr->def) : jay_null();

   switch (intr->intrinsic) {
   case nir_intrinsic_resource_intel:
      /* No code to generate here */
      break;

   case nir_intrinsic_global_atomic:
   case nir_intrinsic_global_atomic_swap:
   case nir_intrinsic_image_atomic:
   case nir_intrinsic_image_atomic_swap:
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_store:
   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_global_constant_uniform_block_intel:
   case nir_intrinsic_load_scratch_intel:
   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_shared_uniform_block_intel:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_intel:
   case nir_intrinsic_load_ssbo_uniform_block_intel:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_uniform_block_intel:
   case nir_intrinsic_shared_atomic:
   case nir_intrinsic_shared_atomic_swap:
   case nir_intrinsic_ssbo_atomic:
   case nir_intrinsic_ssbo_atomic_swap:
   case nir_intrinsic_store_global:
   case nir_intrinsic_store_urb_lsc_intel:
   case nir_intrinsic_store_scratch_intel:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_ssbo_intel:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_bindless_image_store:
   case nir_intrinsic_bindless_image_atomic:
   case nir_intrinsic_bindless_image_atomic_swap:
      jay_emit_mem_access(nj, intr);
      break;

   case nir_intrinsic_load_push_data_intel: {
      /* Component size in bytes */
      unsigned sz = intr->def.bit_size / 8;
      unsigned base_offset = nir_intrinsic_base(intr);
      assert(util_is_aligned(base_offset, sz));

      if (nir_src_is_const(intr->src[0])) {
         unsigned load_offset = nir_src_as_uint(intr->src[0]);
         unsigned offs = base_offset + load_offset;
         assert(util_is_aligned(load_offset, sz));

         if (sz >= 4) {
            /* push_data[] is indexed in 32-bit units */
            jay_foreach_comp(dst, c) {
               jay_MOV(b, jay_extract(dst, c),
                       nj->payload.push_data[(offs / 4) + c]);
            }
         } else {
            /* Sub-dword components: a dword-aligned component is a plain
             * move, anything else is converted out of the containing dword
             * with the sub-dword index as the last operand.
             */
            jay_foreach_comp(dst, c) {
               unsigned comp_offs = offs + c * sz;
               if (util_is_aligned(comp_offs, 4)) {
                  jay_MOV(b, jay_extract(dst, c),
                          nj->payload.push_data[comp_offs / 4]);
               } else {
                  jay_CVT(b, JAY_TYPE_U32, jay_extract(dst, c),
                          nj->payload.push_data[comp_offs / 4],
                          JAY_TYPE_U | intr->def.bit_size, JAY_ROUND,
                          (comp_offs % 4) / sz);
               }
            }
         }
      } else {
         UNREACHABLE("todo: indirect push data");
      }
      break;
   }

   case nir_intrinsic_barrier: {
      jay_SCHEDULE_BARRIER(b);

      if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) {
         jay_emit_memory_barrier(nj, intr);
      }

      /* A workgroup execution barrier is only signalled for compute stages
       * whose workgroup spans more than one subgroup.
       */
      if ((cs && nir_intrinsic_execution_scope(intr) == SCOPE_WORKGROUP) &&
          !jay_workgroup_is_one_subgroup(b, nj->nir)) {

         jay_emit_signal_barrier(b, nj);
         s->prog_data->cs.uses_barrier = true;
      }

      break;
   }

   case nir_intrinsic_begin_invocation_interlock:
   case nir_intrinsic_end_invocation_interlock:
      UNREACHABLE("TODO");

   case nir_intrinsic_load_reloc_const_intel:
      jay_RELOC(b, dst, nir_intrinsic_param_idx(intr),
                nir_intrinsic_base(intr));
      break;

   case nir_intrinsic_store_render_target_intel:
      assert(nj->nir->info.stage == MESA_SHADER_FRAGMENT);
      jay_emit_fb_write(b, intr);
      break;

   case nir_intrinsic_shader_clock:
      /* We must access the timestamp register atomically, but 64-bit
       * instructions cannot read ARF. Instead use a 2x32-bit vectorized move.
       */
      assert(dst.file == UGPR && "required for vectorization");
      jay_MOV(b, dst, jay_contiguous_def(J_ARF, JAY_ARF_TIMESTAMP, 2))->type =
         JAY_TYPE_U32;
      break;

   case nir_intrinsic_load_sample_mask_in: {
      jay_def mask = jay_extract(nj->payload.u0, 15);

      if (nj->s->dispatch_width == 32) {
         /* TODO: Optimize */
         jay_def hi = jay_extract(nj->payload.u1, 15);
         mask = jay_BFI2_u32(b, 0xffff0000, hi, mask);
      }

      jay_MOV(b, dst, mask);
      break;
   }

   case nir_intrinsic_load_subgroup_invocation:
      /* TODO: Lower this in NIR? */
      jay_CVT(b, JAY_TYPE_U32, dst, nj->payload.lane_id, JAY_TYPE_U16,
              JAY_ROUND, 0);
      break;

   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
      /* TODO: Already lowered, but need to implement for performance. */
      break;

   case nir_intrinsic_ddx:
   case nir_intrinsic_ddx_coarse:
      jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX,
                          JAY_QUAD_SWIZZLE_YYYY);
      break;
   case nir_intrinsic_ddx_fine:
      jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXZZ,
                          JAY_QUAD_SWIZZLE_YYWW);
      break;

   case nir_intrinsic_ddy:
   case nir_intrinsic_ddy_coarse:
      jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX,
                          JAY_QUAD_SWIZZLE_ZZZZ);
      break;
   case nir_intrinsic_ddy_fine:
      jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XYXY,
                          JAY_QUAD_SWIZZLE_ZWZW);
      break;

   case nir_intrinsic_first_invocation:
      jay_MOV(b, dst, emit_active_lane(nj));
      break;

   case nir_intrinsic_read_first_invocation:
      jay_MOV(b, dst, emit_uniformize(nj, nj_src(intr->src[0])));
      break;

   case nir_intrinsic_ballot:
   case nir_intrinsic_ballot_relaxed: {
      jay_def val = nj_src(intr->src[0]);
      if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
         /* ballot(true) is just the mask of active lanes */
         val = emit_active_lane_mask(nj);
      } else if (val.file == UFLAG) {
         /* Move to a FLAG temporary so we can ballot it. */
         val = jay_MOV(b, jay_alloc_def(b, FLAG, 1), val)->dst;
      } else {
         assert(val.file == FLAG);
      }

      assert(intr->def.bit_size == b->shader->dispatch_width);
      jay_MOV(b, dst, val);
      break;
   }

   /* We prefer to inverse_ballot by copying a UGPR to the flag. If we have a
    * GPR input, we could uniformize (as behaviour is undefined for
    * non-uniform inputs) but a lowered bit extract is cheaper than uniformize.
    */
   case nir_intrinsic_inverse_ballot: {
      assert(dst.file == FLAG);
      jay_def x = nj_src(intr->src[0]);
      if (x.file == GPR) {
         /* Per lane: flag = ((x >> lane_id) & 1) != 0 */
         jay_def shr = jay_SHR_u32(b, x, nj->payload.lane_id);
         jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), shr, 1);
         jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_NE);
      } else {
         jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width;
      }

      break;
   }

   case nir_intrinsic_load_local_invocation_id:
      assert(cs);
      UNREACHABLE("todo: implement me from payload");
      jay_copy(b, dst, cs->local_invocation_ids);
      break;

   case nir_intrinsic_load_barycentric_pixel:
      jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL);
      break;

   case nir_intrinsic_load_barycentric_sample:
      jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
      break;

   case nir_intrinsic_load_barycentric_centroid:
      jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID);
      break;

   case nir_intrinsic_load_pixel_coord_intel:
      jay_MOV(b, dst, nj->payload.fs.coord.xy);
      break;

   case nir_intrinsic_load_frag_coord_z:
      jay_MOV(b, dst, nj->payload.fs.coord.z);
      break;

   case nir_intrinsic_load_frag_coord_w_rcp:
      jay_MOV(b, dst, nj->payload.fs.coord.w);
      break;

   case nir_intrinsic_load_urb_output_handle_intel:
      jay_MOV(b, dst, nj->payload.urb_handle);
      break;

   case nir_intrinsic_load_layer_id:
      jay_EXTRACT_LAYER(b, dst, jay_extract(nj->payload.u0, 9),
                        payload_u1(nj, 9, 1));
      break;

   case nir_intrinsic_load_front_face: {
      /* Bit 11 is facingness for the first polygon. TODO: Multipolygon. */
      jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(),
                              jay_extract(nj->payload.u0, 9), BITFIELD_BIT(11));

      /* The bit is actually backfacingness so check for equality with 0 */
      jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_EQ);
      break;
   }

   /* Sample ID comes in as 4-bit numbers in g1.0:
    *
    *    15:12 Slot 3 SampleID
    *     11:8 Slot 2 SampleID
    *      7:4 Slot 1 SampleID
    *      3:0 Slot 0 SampleID
    *
    * Each slot corresponds to four channels, so we want to replicate each
    * half-byte value to 4 channels in a row:
    *
    *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
    *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
    *
    *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0
    *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
    *
    * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
    * channels to read the first byte (7:0), and the second group of 8
    * channels to read the second byte (15:8).  Then, we shift right by
    * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
    * values into place.  Finally, we AND with 0xf to keep the low nibble.
    *
    * According to the "PS Thread Payload for Normal Dispatch"
    * pages on the BSpec, the sample ids are stored in R0.8/R1.8
    * on gfx20+ and in R1.0/R2.0 on gfx8+.
    */
   case nir_intrinsic_load_sample_id: {
      jay_def x = jay_alloc_def(b, GPR, 1);
      jay_EXTRACT_BYTE_PER_8LANES(b, x, jay_extract(nj->payload.u0, 8),
                                  payload_u1(nj, 8, 1));
      jay_AND_U32_U16(b, dst, jay_SHR_ODD_SUBSPANS_BY_4_u16(b, x), 0xF);
      break;
   }

   case nir_intrinsic_load_input:
      if (s->stage == MESA_SHADER_VERTEX) {
         /* Vertex attributes are stored as 4 scalars per base slot */
         unsigned offs = nir_intrinsic_base(intr) * 4;
         offs += nir_intrinsic_component(intr);
         assert(intr->def.bit_size == 32 && "todo");

         jay_copy(b, dst,
                  jay_collect_vectors(b, nj->payload.vs.attributes + offs,
                                      intr->def.num_components));
         break;
      }

      /* Non-vertex load_input shares the fragment path below */
      FALLTHROUGH;
   case nir_intrinsic_load_fs_input_interp_deltas: {
      assert(s->stage == MESA_SHADER_FRAGMENT);
      unsigned location = nir_intrinsic_io_semantics(intr).location +
                          nir_src_as_uint(intr->src[0]);
      unsigned i = (s->prog_data->fs.urb_setup[location] * 4) +
                   nir_intrinsic_component(intr);

      if (intr->intrinsic == nir_intrinsic_load_input) {
         assert(intr->def.num_components == 1 && "should be scalarized");
      }

      /* Zeroth delta is the flat value */
      jay_copy(b, dst, nj->payload.fs.deltas[i]);
      break;
   }

   case nir_intrinsic_load_subgroup_id:
      assert(cs && f->is_entrypoint && "todo: this needs ABI");
      /* Subgroup ID in Thread Group is u0.2 bits 7:0 */
      jay_AND(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 0xFF);
      break;

   case nir_intrinsic_load_num_subgroups:
      assert(cs && f->is_entrypoint && "todo: this needs ABI");
      /* Number of subgroups in Thread Group is u0.2 bits 31:24 */
      jay_SHR(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 24);
      break;

   case nir_intrinsic_load_workgroup_id:
      assert(cs && f->is_entrypoint && "todo: this needs ABI");
      /* x, y, z are read from u0 components 1, 6 and 7 respectively */
      jay_MOV(b, jay_extract(dst, 0), jay_extract(nj->payload.u0, 1));
      jay_MOV(b, jay_extract(dst, 1), jay_extract(nj->payload.u0, 6));
      jay_MOV(b, jay_extract(dst, 2), jay_extract(nj->payload.u0, 7));
      break;

   case nir_intrinsic_shuffle_intel: {
      jay_def data = nj_src(intr->src[0]);

      if (nir_src_is_const(intr->src[1])) {
         /* Broadcast takes a lane index, with only 32-bit registers */
         jay_BROADCAST_IMM(b, dst, data, nir_src_as_uint(intr->src[1]) / 4);
      } else {
         /* Shuffle takes a byte index */
         jay_SHUFFLE(b, dst, data, nj_src(intr->src[1]));
      }

      break;
   }

   case nir_intrinsic_quad_broadcast:
      jay_QUAD_SWIZZLE(b, dst, nj_src(intr->src[0]),
                       JAY_QUAD_SWIZZLE_XXXX + nir_src_as_uint(intr->src[1]));
      break;

   case nir_intrinsic_load_inline_data_intel: {
      assert(cs && f->is_entrypoint && "todo: this needs ABI");
      assert(nir_src_as_uint(intr->src[0]) == 0 && "TODO: indirects");

      /* The base is in bytes; inline_data is indexed in 32-bit units */
      unsigned offset = nir_intrinsic_base(intr) / 4;
      unsigned nr = jay_num_values(dst);
      jay_copy(b, dst, jay_extract_range(nj->payload.inline_data, offset, nr));
      break;
   }

   default:
#ifndef NDEBUG
      assert(intr->intrinsic < nir_num_intrinsics);
      /* Use stderr rather than stdout: we are about to abort, and a buffered
       * stdout may never be flushed, losing the name of the offender.
       */
      fprintf(stderr, "intrinsic: %s\n",
              nir_intrinsic_infos[intr->intrinsic].name);
#endif
      UNREACHABLE("unknown intrinsic");
   }
}

/**
 * Report whether a sampler message must carry a header purely because of the
 * opcode being issued (other reasons are evaluated by the caller).
 *
 * \param op       Backend sampler opcode.
 * \param nir_op   Originating NIR texture opcode.
 * \param devinfo  Device the shader is compiled for.
 */
static bool
sampler_needs_header(enum brw_sampler_opcode op,
                     nir_texop nir_op,
                     const struct intel_device_info *devinfo)
{
   if (op == BRW_SAMPLER_OPCODE_SAMPLEINFO) {
      return true;
   }

   if (op == BRW_SAMPLER_OPCODE_LD || op == BRW_SAMPLER_OPCODE_LD_LZ) {
      /* Xe3 HW does not seem to work unless we force a header. */
      return devinfo->ver >= 30;
   }

   /* Every other opcode only needs the header when it is a gather */
   return nir_op == nir_texop_tg4;
}

/**
 * Translate a NIR texture instruction into a Jay sampler SEND.
 *
 * The work proceeds in stages:
 *  1. Resolve the surface and sampler to an immediate or a uniformized
 *     handle, recording whether each is bindless.
 *  2. Gather the message payload in the order given by the opcode's
 *     brw_sampler_payload_desc, zero-filling sources that NIR did not supply.
 *  3. Decide whether a message header is needed and which components are
 *     returned, allocating a temporary destination when the returned data
 *     does not map 1:1 onto the NIR destination.
 *  4. Build the header and the (extended) descriptor, emit the SEND and copy
 *     the results out of the temporary.
 */
static void
jay_emit_texture(struct nir_to_jay_state *nj, nir_tex_instr *tex)
{
   /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
    *
    *    "The Pixel Null Mask field, when enabled via the Pixel Null Mask
    *     Enable will be incorect for sample_c when applied to a surface with
    *     64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
    *     Enable may incorrectly report pixels as referencing a Null surface."
    *
    * We'll take care of this in NIR.
    */
   assert(!tex->is_sparse ||
          nir_tex_instr_src_index(tex, nir_tex_src_comparator) == -1);

   jay_builder *b = &nj->bld;
   jay_def dst = nj_def(&tex->def);
   jay_def tmp = dst;

   const enum brw_sampler_opcode op = (enum brw_sampler_opcode)(
      tex->backend_flags & ~BRW_TEX_INSTR_FUSED_EU_DISABLE);
   const struct brw_sampler_payload_desc *payload_desc =
      brw_get_sampler_payload_desc(op);

   /* First deal with surface & sampler */
   unsigned payload_type_bit_size = 0;
   bool surface_bindless = false;
   bool sampler_bindless = false;
   jay_def surface, sampler, packed_offsets = jay_null();
   jay_def payload[JAY_MAX_SAMPLER_MESSAGE_SIZE];
   int i;
   if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle)) >= 0) {
      unsigned x;
      surface =
         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless);
      if (jay_is_null(surface))
         surface = jay_imm(x);
      assert(tex->texture_index == 0);
   } else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset)) >=
              0) {
      unsigned x;
      surface =
         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless);
      if (jay_is_null(surface))
         surface = jay_imm(x + tex->texture_index);
      else if (tex->texture_index)
         surface = jay_ADD_u32(b, surface, tex->texture_index);
   } else {
      surface = jay_imm(tex->texture_index);
   }

   if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle)) >= 0) {
      unsigned x;
      sampler =
         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless);
      /* A null handle means the sampler resolved to the constant index x.
       * This must update the sampler, not the surface resolved above.
       */
      if (jay_is_null(sampler))
         sampler = jay_imm(x);
      assert(tex->sampler_index == 0);
   } else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset)) >=
              0) {
      unsigned x;
      sampler =
         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless);
      if (jay_is_null(sampler))
         sampler = jay_imm(x + tex->sampler_index);
      else
         sampler = jay_ADD_u32(b, sampler, tex->sampler_index);
   } else {
      sampler = jay_imm(tex->sampler_index);
   }

   surface = emit_uniformize(nj, surface);
   sampler = emit_uniformize(nj, sampler);

   /* Now the sampler payload */
   bool has_offset_in_payload = false;
   bool payload_uniform = true;
   /* NOTE(review): payload[] is indexed from 0 below, so this initial value
    * only survives if no source in the loop bumps n_sources - confirm that
    * every payload layout has at least one required or present source.
    */
   uint32_t n_sources = TEX_LOGICAL_SRC_PAYLOAD0;
   for (uint32_t i = 0;
        payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID;
        i++) {
      nir_tex_src_type nir_source;
      unsigned nir_comp;

#define P(name) BRW_SAMPLER_PAYLOAD_PARAM_##name
#define S(name, component)                                                     \
   do {                                                                        \
      nir_source = nir_tex_src_##name;                                         \
      nir_comp = component;                                                    \
   } while (0)

      struct brw_sampler_payload_src sampler_src = payload_desc->sources[i];

      /* Map each hardware payload parameter to (NIR source, component) */
      switch (sampler_src.param) {
      case P(U):
         S(coord, 0);
         break;
      case P(V):
         S(coord, 1);
         break;
      case P(R):
         S(coord, 2);
         break;
      case P(AI):
         S(coord, 3);
         break;
      case P(BIAS):
         S(bias, 0);
         break;
      case P(LOD):
         S(lod, 0);
         break;
      case P(MLOD):
         S(min_lod, 0);
         break;
      case P(REF):
         S(comparator, 0);
         break;
      case P(DUDX):
         S(ddx, 0);
         break;
      case P(DUDY):
         S(ddy, 0);
         break;
      case P(DVDX):
         S(ddx, 1);
         break;
      case P(DVDY):
         S(ddy, 1);
         break;
      case P(DRDX):
         S(ddx, 2);
         break;
      case P(DRDY):
         S(ddy, 2);
         break;
      case P(SI):
         S(ms_index, 0);
         break;
      case P(MCSL):
         S(ms_mcs_intel, 0);
         break;
      case P(MCSH):
         S(ms_mcs_intel, 1);
         break;
      case P(MCS0):
         S(ms_mcs_intel, 0);
         break;
      case P(MCS1):
         S(ms_mcs_intel, 1);
         break;
      case P(MCS2):
         S(ms_mcs_intel, 2);
         break;
      case P(MCS3):
         S(ms_mcs_intel, 3);
         break;

      case P(OFFU):
         S(offset, 0);
         has_offset_in_payload = true;
         break;
      case P(OFFV):
         S(offset, 1);
         has_offset_in_payload = true;
         break;
      case P(OFFUV4):
      case P(OFFUVR4):
      case P(OFFUV6):
      case P(OFFUVR6):
      case P(BIAS_OFFUV6):
      case P(BIAS_OFFUVR4):
      case P(LOD_OFFUV6):
      case P(LOD_OFFUVR4):
      case P(OFFUV4_R):
      case P(OFFUV6_R):
      case P(OFFUVR4_R):
         /* There is no payload with 2 packed entries, so backend1 is always
          * the one payload parameter packed. */
         S(backend1, 0);
         has_offset_in_payload = true;
         break;

      case P(BIAS_AI):
      case P(LOD_AI):
      case P(MLOD_R):
         /* There is no payload with 2 packed entries, so backend1 is always
          * the one payload parameter packed. */
         S(backend1, 0);
         break;

      default:
         UNREACHABLE("unhandled sampler param");
      }

#undef P
#undef S

      jay_def param_val = jay_null();

      int j = nir_tex_instr_src_index(tex, nir_source);
      if (j >= 0 && nir_comp < tex->src[j].src.ssa->num_components) {
         param_val = jay_extract(nj_src(tex->src[j].src), nir_comp);

         /* All payload sources must share a single bit size */
         unsigned bitsize = nir_src_bit_size(tex->src[j].src);
         assert(payload_type_bit_size == 0 || payload_type_bit_size == bitsize);
         payload_type_bit_size = bitsize;
      }

      /* The hardware requires a LOD for buffer textures */
      if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF &&
          sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_LOD) {
         sampler_src.optional = false;
      }

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       *
       * Compiler should send U,V,R parameters even if V,R are 0.
       */
      if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
          intel_needs_workaround(nj->devinfo, 14012688258) &&
          (sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_U ||
           sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_V ||
           sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_R)) {
         sampler_src.optional = false;
      }

      /* The last source present in the payload dictates the number of
       * sources, unless it's required.
       *
       * We can skip the last source if it's zero.
       */
      if (!sampler_src.optional || !jay_is_null(param_val))
         n_sources = i + 1;

      /* Missing sources are sent as zero */
      if (jay_is_null(param_val)) {
         param_val = jay_alloc_def(b, dst.file, 1);
         jay_MOV(b, param_val, 0);
      }

      payload[i] = param_val;
      payload_uniform &= jay_is_uniform(payload[i]);
   }

   i = nir_tex_instr_src_index(tex, nir_tex_src_backend2);
   if (i >= 0) {
      packed_offsets = nj_src(tex->src[i].src);
   }

   /* Xe2+ should never used packed offsets since it has enough opcodes to
    * handle any programmable offset.
    */
   assert(jay_is_null(packed_offsets) || nj->devinfo->ver < 20);

   /* If the NIR instruction has an offset param but the sampler payload
    * doesn't, we can put the offset into the header of the message.
    *
    * The restriction though is that it should be a constant value.
    */
   int offs_idx = nir_tex_instr_src_index(tex, nir_tex_src_offset);
   bool has_const_offsets = offs_idx != -1 && !has_offset_in_payload;

   /* Samplers that are not a known index below 16 need a header */
   bool is_high_sampler = !jay_is_imm(sampler) || jay_as_uint(sampler) >= 16;
   bool residency = tex->is_sparse;
   unsigned null_mask_component = 0;

   const bool needs_header = sampler_needs_header(op, tex->op, nj->devinfo) ||
                             has_const_offsets ||
                             !jay_is_null(packed_offsets) ||
                             sampler_bindless ||
                             is_high_sampler ||
                             residency;

   uint8_t component_mask;
   if (tex->op == nir_texop_tg4) {
      component_mask = WRITEMASK_XYZW;
   } else if (residency) {
      /* intel_nir_lower_sparse guarantees that texturing operations only
       * read the data, or the sparse residency code, but not both at once.
       *
       * We need to use UGPRs for the residency result because the sampler
       * returns the null pixel mask in lane 0, regardless of lanemasking.
       *
       * Unfortunately, the sampler doesn't allow us to writemask out all
       * four colour channels, so we have to needlessly return red.  This
       * isn't uniform data, but we store it in an array of UGPRs anyway
       * in order to have a consistent def file.  The colour data will be
       * immediately dead anyway.
       */
      assert(tex->op == nir_texop_sparse_residency_intel ||
             tex->op == nir_texop_sparse_residency_txf_intel);
      assert(nir_def_components_read(&tex->def) == WRITEMASK_Y);
      component_mask = WRITEMASK_X;
      unsigned red_grfs = payload_uniform ? 1 : jay_grf_per_gpr(b->shader);
      unsigned grfs = red_grfs + 1;
      tmp = jay_alloc_def(b, UGPR, grfs * jay_ugpr_per_grf(b->shader));
      /* The null mask lives in the GRF that follows the red data */
      null_mask_component = red_grfs * jay_ugpr_per_grf(b->shader);
   } else {
      component_mask = nir_def_components_read(&tex->def);

      /* We can reduce the return length of the message to drop unused
       * trailing components, but shrinking with a discontiguous mask
       * requires a message header.  We only do that if we need a header
       * for other reasons, as it's more expensive than writing extra data.
       */
      if (!needs_header) {
         component_mask =
            (uint8_t) BITFIELD_MASK(util_last_bit(component_mask));
      }

      /* TODO: Shrink 16-bit textures too. Shrinking is problematic for some
       * component masks due to 32-bit granularity of ISA registers.
       */
      if (tex->def.bit_size != 32 || (jay_debug & JAY_DBG_NOOPT))
         component_mask = nir_component_mask(tex->def.num_components);

      /* If we shrunk the destination, we need a temporary */
      if (component_mask != BITFIELD_MASK(tex->def.num_components)) {
         tmp = jay_alloc_def(b, GPR, util_bitcount(component_mask));
      }
   }

   /* SENDs always write entire GRFs so we need to pad out for uniform dests */
   if (dst.file == UGPR && !residency) {
      unsigned nr = jay_ugpr_per_grf(b->shader) * jay_num_values(tmp);
      tmp = jay_alloc_def(b, UGPR, nr);
   }

   if (tex->op == nir_texop_texture_samples) {
      assert(needs_header);
      payload_type_bit_size = 32;
      n_sources = 0;
   }

   jay_def header = jay_null();
   if (needs_header) {
      uint32_t header2;
      if (tex->op == nir_texop_tg4) {
         /* Gathers have a component but no write mask */
         header2 = (tex->component << 16);
      } else {
         /* If present, the header write mask are inverted compared to NIR */
         header2 = (~component_mask & 0xf) << 12;
      }

      if (residency)
         header2 |= 1 << 23; /* g0.2 bit 23: Pixel Null Mask Enable */

      if (has_const_offsets) {
         const unsigned num_components = nir_tex_instr_src_size(tex, offs_idx);
         for (unsigned i = 0; i < num_components; i++) {
            nir_scalar s = nir_get_scalar(tex->src[offs_idx].src.ssa, i);
            s = nir_scalar_chase_movs(s);
            assert(nir_scalar_is_const(s));
            int offset = nir_scalar_as_int(s);

            /* Offsets are 4-bits, reversed order */
            header2 |= (offset & 0xf) << ((2 - i) * 4);
         }
      }

      /* Vectorized zeroing of the header. TODO: This can be optimized more. */
      jay_def zeroes = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader));
      jay_MOV(b, zeroes, 0);

      jay_def ugprs[JAY_MAX_DEF_LENGTH];
      jay_foreach_comp(zeroes, i) {
         ugprs[i] = jay_extract(zeroes, i);
      }

      /* Set the main immediate part of the header */
      if (header2 != 0) {
         ugprs[2] = jay_MOV_u32(b, header2);
      }

      if (sampler_bindless) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         ugprs[3] = sampler;
      } else {
         /* Select the default dynamic state base address + offset */
         jay_def sampler_ptr = nj->payload.sampler_state_pointer;

         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         if (b->shader->devinfo->ver >= 11) {
            sampler_ptr = jay_AND_u32(b, sampler_ptr, INTEL_MASK(31, 5));
         }

         /* TODO: We should probably lower this in NIR. */
         if (is_high_sampler) {
            /* Advance the pointer to the group of 16 sampler states that
             * holds the requested sampler.
             */
            if (jay_is_imm(sampler)) {
               unsigned s = jay_as_uint(sampler);
               const int sampler_state_size_B = 16;
               unsigned offs_B = ROUND_DOWN_TO(s, 16) * sampler_state_size_B;
               assert(offs_B > 0 && "since s > 0");
               sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B);
            } else {
               jay_def offs_B =
                  jay_SHL_u32(b, jay_AND_u32(b, sampler, 0xf0), 4);
               sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B);
            }
         }

         ugprs[3] = sampler_ptr;
      }
      /* Zip it all up into a vector of UGPRs which will RA to a single GRF */
      header = jay_collect_vectors(b, ugprs, jay_num_values(zeroes));
   }

   assert(payload_type_bit_size == 16 || payload_type_bit_size == 32);
   unsigned simd_mode = 0;
   /* A fully uniform payload is sent as a single lane */
   unsigned simd_width = payload_uniform ? 1 : nj->s->dispatch_width;
   if (nj->devinfo->ver < 20) {
      if (payload_type_bit_size == 16) {
         assert(nj->devinfo->ver >= 11);
         simd_mode = simd_width <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
                                       GFX10_SAMPLER_SIMD_MODE_SIMD16H;
      } else {
         simd_mode = simd_width <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                       BRW_SAMPLER_SIMD_MODE_SIMD16;
      }
   } else {
      if (payload_type_bit_size == 16) {
         simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
                                        XE2_SAMPLER_SIMD_MODE_SIMD32H;
      } else {
         simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
                                        XE2_SAMPLER_SIMD_MODE_SIMD32;
      }
   }

   uint64_t desc = 0;
   jay_def desc_src = jay_null(), desc_ex_src = jay_null();

   /* Sampler index within the group of 16 selected by the header */
   unsigned sampler_imm = 0;
   if (jay_is_imm(sampler) && !sampler_bindless) {
      sampler_imm = jay_as_uint(sampler) % 16;
   }

   const unsigned msg_type = brw_get_sampler_hw_opcode(op);
   bool is_16 = false; /* TODO */
   unsigned ret_type = is_16 ? GFX8_SAMPLER_RETURN_FORMAT_16BITS :
                               GFX8_SAMPLER_RETURN_FORMAT_32BITS;

   if (!surface_bindless &&
       jay_is_imm(surface) &&
       (jay_is_imm(sampler) || sampler_bindless)) {
      /* Everything is known at compile time: fully immediate descriptor */
      desc = brw_sampler_desc(nj->devinfo, jay_as_uint(surface), sampler_imm,
                              msg_type, simd_mode, ret_type);
   } else if (surface_bindless) {
      /* Bindless surface */
      desc = brw_sampler_desc(nj->devinfo, GFX9_BTI_BINDLESS, sampler_imm,
                              msg_type, simd_mode, ret_type);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (!sampler_bindless && !jay_is_imm(sampler)) {
         desc_src = jay_SHL_u32(b, sampler, 8);
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      desc_ex_src = jay_alloc_def(b, J_ADDRESS, 1);
      jay_MOV(b, desc_ex_src, surface);
   } else {
      /* Immediate portion of the descriptor */
      desc = brw_sampler_desc(nj->devinfo, 0, 0, msg_type, simd_mode, ret_type);

      /* Dynamic portion: surface in the low bits, sampler shifted up by 8 */
      if (sampler_bindless) {
         desc_src = surface;
      } else if (!sampler_bindless && jay_is_imm(sampler)) {
         desc_src = jay_OR_u32(b, surface, jay_as_uint(sampler) << 8);
      } else {
         desc_src = jay_OR_u32(b, jay_SHL_u32(b, sampler, 8), surface);
      }

      desc_src = jay_AND_u32(b, desc_src, 0xfff);
   }

   if (n_sources > 2 || !jay_is_null(header)) {
      for (unsigned i = 0; i < n_sources; ++i) {
         payload[i] =
            jay_src_as_strided(b, payload[i], 1, payload_uniform ? UGPR : GPR);
      }
   }

   enum jay_type src_type = jay_type(JAY_TYPE_U, payload_type_bit_size);
   jay_SEND(b, .sfid = BRW_SFID_SAMPLER, .msg_desc = desc, .desc = desc_src,
            .ex_desc = desc_ex_src, .header = header, .srcs = payload,
            .nr_srcs = n_sources, .type = JAY_TYPE_U32,
            .src_type = { src_type }, .dst = tmp, .uniform = payload_uniform,
            .bindless = surface_bindless);

   /* If we sampled into a temporary, copy out to the final */
   if (residency) {
      jay_MOV(b, jay_extract(dst, 1), jay_extract(tmp, null_mask_component));
   } else if (!jay_defs_equivalent(dst, tmp)) {
      unsigned i = 0;
      /* Uniform results were padded to one GRF per returned component */
      unsigned tmp_stride = dst.file == UGPR ? jay_ugpr_per_grf(b->shader) : 1;

      u_foreach_bit(c, component_mask) {
         jay_MOV(b, jay_extract(dst, c), jay_extract(tmp, (i++) * tmp_stride));
      }
   }

   if (mesa_shader_stage_is_compute(b->shader->stage)) {
      b->shader->prog_data->cs.uses_sampler |= !nir_tex_instr_is_query(tex);
   }
}

/*
 * Translate a NIR jump instruction.
 *
 * Only `break` is implemented: it records a logical edge from the current
 * block to the enclosing loop's break block and emits a BREAK. `halt` is a
 * known TODO, and every other jump type is treated as unreachable.
 */
static void
jay_emit_jump(struct nir_to_jay_state *nj, nir_jump_instr *instr)
{
   if (instr->type == nir_jump_break) {
      jay_block_add_successor(nj->current_block, nj->break_block, GPR);
      jay_BREAK(&nj->bld);
      return;
   }

   if (instr->type == nir_jump_halt) {
      /* TODO: Do we want a predicated EOT here, or a jump to the end? */
      assert(!"TODO: implement HALT");
      return;
   }

   /* nir_jump_return should have been lowered; nothing else is expected. */
   UNREACHABLE("unknown jump");
}

/*
 * Dispatch a single NIR instruction to the matching emitter.
 *
 * Phis and undefs are handled inline: one PHI_DST / UNDEF is emitted per
 * component of the instruction's def.
 */
static void
jay_emit_instr(struct nir_to_jay_state *nj, jay_block *block, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      jay_emit_alu(nj, nir_instr_as_alu(instr));
      return;

   case nir_instr_type_intrinsic:
      jay_emit_intrinsic(nj, nir_instr_as_intrinsic(instr));
      return;

   case nir_instr_type_tex:
      jay_emit_texture(nj, nir_instr_as_tex(instr));
      return;

   case nir_instr_type_load_const:
      jay_emit_load_const(nj, nir_instr_as_load_const(instr));
      return;

   case nir_instr_type_phi: {
      jay_def def = nj_def(nir_instr_def(instr));

      jay_foreach_comp(def, c) {
         jay_PHI_DST(&nj->bld, jay_extract(def, c));
      }

      return;
   }

   case nir_instr_type_undef: {
      jay_def def = nj_def(nir_instr_def(instr));

      jay_foreach_comp(def, c) {
         jay_UNDEF(&nj->bld, jay_extract(def, c));
      }

      return;
   }

   case nir_instr_type_jump:
      jay_emit_jump(nj, nir_instr_as_jump(instr));
      return;

   case nir_instr_type_deref:
      UNREACHABLE("All derefs should've been lowered");

   default:
      UNREACHABLE("unknown instruction type");
   }
}

/*
 * Allocate a new block in the current function and tag it with the current
 * value of nj->indent.
 */
static jay_block *
jay_create_block(struct nir_to_jay_state *nj)
{
   jay_block *fresh = jay_new_block(nj->f);

   fresh->indent = nj->indent;

   return fresh;
}

/*
 * Return the jump that ends `block` if it exists and carries no predication,
 * otherwise NULL.
 */
static jay_inst *
jay_block_ending_unconditional_jump(jay_block *block)
{
   jay_inst *last_jump = jay_block_ending_jump(block);

   if (!last_jump || last_jump->predication)
      return NULL;

   return last_jump;
}

/*
 * Translate a NIR if/else into Jay blocks.
 *
 * Emits both arms, wires up the CFG edges between the block preceding the if,
 * the two arms and the join block, and finally inserts the IF / ELSE / ENDIF
 * instructions. On return, nj->after_block is the join block, so the next NIR
 * block emitted lands there.
 */
static void
jay_emit_if(struct nir_to_jay_state *nj, nir_if *nif)
{
   jay_builder *b = &nj->bld;
   jay_def condition = nj_src(nif->condition);

   /* The join block is created before the indent is bumped, so it gets the
    * same indent as the code surrounding the if.
    */
   jay_block *before_block = nj->current_block;
   jay_block *after_block = jay_create_block(nj);

   /* Push */
   ++nj->indent;

   /* The first else block is created up front so edges and the ELSE can
    * target it; it is handed to jay_emit_block through nj->after_block below.
    */
   jay_block *else_first = jay_create_block(nj);

   jay_block *then_first = jay_emit_cf_list(nj, &nif->then_list);
   jay_block *then_last = nj->current_block;

   nj->after_block = else_first;

   jay_block *else_first_2 = jay_emit_cf_list(nj, &nif->else_list);
   jay_block *else_last = nj->current_block;
   assert(else_first == else_first_2);

   /* Pop */
   --nj->indent;

   bool uniform = jay_is_uniform(condition);

   /* Logical CFG edges */
   jay_block_add_successor(before_block, then_first, GPR);
   jay_block_add_successor(before_block, else_first, GPR);

   /* An arm that ends in an unpredicated jump never reaches the join block */
   if (!jay_block_ending_unconditional_jump(then_last))
      jay_block_add_successor(then_last, after_block, GPR);

   if (!jay_block_ending_unconditional_jump(else_last))
      jay_block_add_successor(else_last, after_block, GPR);

   /* For a non-uniform IF, we fall through both sides in the physical CFG */
   if (!uniform) {
      jay_block_add_successor(then_last, else_first, UGPR);
   }

   /* The NIR block following the if is emitted into the join block */
   nj->after_block = after_block;

   /* Emit the if-else-endif sequence: IF (predicated on the condition) at the
    * end of the preceding block, ELSE at the start of the else arm, ENDIF at
    * the end of the else arm.
    */
   b->cursor = jay_after_block(before_block);
   jay_add_predicate(b, jay_IF(b), condition);

   b->cursor = jay_before_block(else_first);
   jay_ELSE(b);

   b->cursor = jay_after_block(else_last);
   jay_ENDIF(b);
}

/*
 * Translate a NIR loop into Jay blocks.
 *
 * Loops with a continue construct are not supported (asserted). The enclosing
 * loop's break target is saved and restored around the body, so nested loops
 * each break to their own exit block. On return, nj->after_block is this
 * loop's exit block.
 */
static void
jay_emit_loop(struct nir_to_jay_state *nj, nir_loop *nloop)
{
   assert(!nir_loop_has_continue_construct(nloop));

   jay_builder *b = &nj->bld;
   jay_block *saved_break = nj->break_block;

   /* Make the block that will be after the loop exit. It is created before
    * the indent is bumped, so it shares the indent of the surrounding code.
    */
   nj->break_block = jay_create_block(nj);
   ++nj->indent;

   /* Make a block for the loop body, which is also the loop header */
   jay_block *loop_header = jay_create_block(nj);
   loop_header->loop_header = true;

   /* The current block falls through to the start of the loop */
   jay_block_add_successor(nj->current_block, loop_header, GPR);

   /* Emit the loop body; its first NIR block is placed in loop_header */
   nj->after_block = loop_header;
   jay_emit_cf_list(nj, &nloop->body);

   /* Emit the backedge. If the body's last block already ends in a BREAK, the
    * loop never iterates again: rewrite that BREAK to LOOP_ONCE and add no
    * backedge. Otherwise add the edge back to the header and emit WHILE at the
    * builder's current cursor.
    */
   jay_inst *jump = jay_block_ending_jump(nj->current_block);
   if (jump && jump->op == JAY_OPCODE_BREAK) {
      jump->op = JAY_OPCODE_LOOP_ONCE;
   } else {
      jay_block_add_successor(nj->current_block, loop_header, GPR);
      jay_WHILE(b);
   }

   /* Pop: code after the loop goes into the exit block, and the enclosing
    * loop's break target is restored.
    */
   --nj->indent;
   nj->after_block = nj->break_block;
   nj->break_block = saved_break;

   b->cursor = jay_after_block(nj->after_block);
}

/*
 * Translate one NIR block into a Jay block and append it to the function's
 * block list.
 *
 * If a caller left a pre-created block in nj->after_block, that block is
 * consumed; otherwise a fresh block is created. Besides the block's own
 * instructions, this also emits the PHI_SRC instructions feeding phis in the
 * NIR successors. Returns the Jay block that was filled.
 */
static jay_block *
jay_emit_block(struct nir_to_jay_state *nj, nir_block *nb)
{
   jay_builder *b = &nj->bld;

   /* Use the pending block from an enclosing if/loop/payload setup, if any */
   if (nj->after_block) {
      nj->current_block = nj->after_block;
      nj->after_block = NULL;
   } else {
      nj->current_block = jay_create_block(nj);
   }

   jay_block *block = nj->current_block;
   block->uniform = !nb->divergent;
   list_addtail(&block->link, &nj->f->blocks);

   b->cursor = jay_after_block(block);

   /* Emit the contents of the block */
   nir_foreach_instr(instr, nb) {
      jay_emit_instr(nj, block, instr);
   }

   /* Look in the current NIR block's successors for any phis. Each of them
    * should have a source corresponding to a value coming from our current
    * block. Create PHI_SRC opcodes in the current block for those values.
    * The corresponding PHI_DST may not have been emitted yet, but that's ok.
    */
   for (unsigned bs = 0; bs < ARRAY_SIZE(nb->successors); ++bs) {
      nir_block *nb_successor = nb->successors[bs];
      if (!nb_successor)
         continue;

      nir_foreach_phi(nphi, nb_successor) {
         jay_def val = nj_src(nir_phi_get_src_from_block(nphi, nb)->src);

         /* The phi def might be nonuniform but have uniform source (like a
          * constant). Move to the correct file in the source block and
          * reference that in PHI_SRC.
          */
         if (jay_file_for_def(&nphi->def) != val.file) {
            b->cursor = jay_after_block_logical(block);
            jay_def tmp = val;
            val = jay_alloc_def(b, jay_file_for_def(&nphi->def),
                                jay_num_values(val));
            jay_copy(b, val, tmp);
         }

         /* One PHI_SRC per component, placed ahead of any jump ending the
          * block. Component c is tagged with index (phi def index + c).
          */
         jay_foreach_comp(val, c) {
            b->cursor = jay_before_jump(block);
            jay_PHI_SRC(b, JAY_TYPE_U32, jay_extract(val, c),
                        nphi->def.index + c);
         }
      }
   }

   /* Leave the cursor at the end of the block and clear the lane-related
    * defs held in the state (presumably per-block cached values -- TODO
    * confirm against their users).
    */
   b->cursor = jay_after_block(block);
   nj->active_lane_mask = jay_null();
   nj->active_lane = jay_null();
   nj->active_lane_x4 = jay_null();

   return block;
}

/*
 * Emit every control-flow node of a NIR CF list in order.
 *
 * Returns the Jay block produced for the first plain block node in the list,
 * or NULL if the list contains no block node.
 */
static jay_block *
jay_emit_cf_list(struct nir_to_jay_state *nj, struct exec_list *list)
{
   jay_block *head = NULL;

   foreach_list_typed(nir_cf_node, node, node, list) {
      if (node->type == nir_cf_node_block) {
         jay_block *emitted = jay_emit_block(nj, nir_cf_node_as_block(node));

         /* Only the first emitted block is remembered */
         if (head == NULL)
            head = emitted;
      } else if (node->type == nir_cf_node_if) {
         jay_emit_if(nj, nir_cf_node_as_if(node));
      } else if (node->type == nir_cf_node_loop) {
         jay_emit_loop(nj, nir_cf_node_as_loop(node));
      } else {
         UNREACHABLE("Unknown NIR control flow node");
      }
   }

   return head;
}

/*
 * Terminate the thread.
 *
 * Compute stages get an explicit EOT message to the message gateway. Vertex
 * shaders instead flag their final URB send as EOT. Other stages are left
 * untouched here.
 */
static void
jay_emit_eot(struct nir_to_jay_state *nj)
{
   jay_builder *b = &nj->bld;

   if (mesa_shader_stage_is_compute(nj->nir->info.stage)) {
      /* Copy u0 as one vector into the EOT register. This is not needed for
       * correctness, but it stops RA from inserting 16 scalar copies.
       */
      jay_def copy = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader));
      jay_MOV(b, copy, nj->payload.u0);

      jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY, .eot = true, .msg_desc = 0,
               .srcs = &copy, .nr_srcs = 1, .type = JAY_TYPE_U32,
               .uniform = true);
      return;
   }

   if (nj->nir->info.stage != MESA_SHADER_VERTEX)
      return;

   jay_inst *I = jay_last_inst(jay_last_block(nj->f));

   /* TODO: What if this isn't the case? Do we need a no-op store...? */
   assert(I && I->op == JAY_OPCODE_SEND && jay_send_sfid(I) == BRW_SFID_URB);
   jay_set_send_eot(I, true);
}

/* Cursor state used while reading the thread payload, tracked per SSA file. */
struct payload_builder {
   /* Builder used to emit the PRELOAD instructions */
   jay_builder *b;

   /* Number of payload values read so far from each file */
   unsigned offsets[JAY_NUM_SSA_FILES];

   /* Most recently preloaded vector for each file; read_payload hands out
    * its components one at a time.
    */
   jay_def vecs[JAY_NUM_SSA_FILES];
};

/*
 * Read the next payload value from `file`.
 *
 * Values are preloaded in chunks (16 at a time for UGPR, one at a time
 * otherwise). A new chunk is preloaded whenever the running offset reaches a
 * chunk boundary; the requested value is then extracted from the chunk.
 */
static jay_def
read_payload(struct payload_builder *b, enum jay_file file)
{
   const unsigned chunk = (file == UGPR) ? 16 : 1;
   const unsigned index = b->offsets[file];
   const unsigned lane = index % chunk;

   /* Crossing into a new chunk: allocate it and preload from this offset */
   if (lane == 0) {
      b->vecs[file] = jay_alloc_def(b->b, file, chunk);
      jay_PRELOAD(b->b, b->vecs[file], index);
   }

   b->offsets[file] = index + 1;

   return jay_extract(b->vecs[file], lane);
}

/*
 * Read `len` consecutive payload values from `file` and gather them into a
 * single vector def.
 *
 * `len` may be at most JAY_MAX_DEF_LENGTH, the capacity of the scratch array
 * used to collect the components.
 */
static jay_def
read_vector_payload(struct payload_builder *b, enum jay_file file, unsigned len)
{
   jay_def defs[JAY_MAX_DEF_LENGTH];

   /* The loop below writes defs[0..len-1], so a vector of exactly
    * ARRAY_SIZE(defs) components still fits.
    */
   assert(len <= ARRAY_SIZE(defs));

   for (unsigned i = 0; i < len; ++i) {
      defs[i] = read_payload(b, file);
   }

   return jay_collect_vectors(b->b, defs, len);
}

/*
 * Read the push constants out of the payload.
 *
 * The total push size is the sum of all push ranges in prog_data. Each dword
 * becomes one UGPR payload read, and the number of GRFs consumed is recorded
 * in the shader.
 */
static void
setup_payload_push(struct nir_to_jay_state *nj, struct payload_builder *p)
{
   unsigned push_size_B = 0;
   unsigned range = 0;

   while (range < ARRAY_SIZE(nj->s->prog_data->base.push_sizes)) {
      push_size_B += nj->s->prog_data->base.push_sizes[range];
      range++;
   }

   assert(util_is_aligned(push_size_B, 32));

   const unsigned nr_dwords = push_size_B / 4;
   for (unsigned dw = 0; dw < nr_dwords; ++dw) {
      nj->payload.push_data[dw] = read_payload(p, UGPR);
   }

   nj->s->push_grfs = push_size_B / (4 * jay_ugpr_per_grf(nj->s));
}

/*
 * Read the vertex shader payload: the URB handle, the vertex attributes
 * (8 GPRs per unit of urb_read_length) and finally the push constants.
 */
static void
setup_vertex_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
{
   nj->payload.urb_handle = read_payload(p, GPR);

   /* XXX: This is a hack to line up with the partition chosen in RA. This whole
    * thing needs an overhaul. Need to think harder about partitioning.
    */
   p->offsets[GPR] += 7;

   const unsigned nr_attr_regs = 8 * nj->s->prog_data->vue.urb_read_length;
   unsigned i = 0;

   while (i < nr_attr_regs) {
      assert(i < ARRAY_SIZE(nj->payload.vs.attributes));
      nj->payload.vs.attributes[i] = read_payload(p, GPR);
      ++i;
   }

   setup_payload_push(nj, p);
}

/*
 * Read the compute shader payload: one GRF's worth of UGPRs of inline data.
 * Generated local IDs and BTD stack IDs are not supported (asserted).
 */
static void
setup_compute_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
{
   assert(!nj->s->prog_data->cs.generate_local_id);
   assert(!nj->s->prog_data->cs.uses_btd_stack_ids);

   const unsigned inline_len = jay_ugpr_per_grf(nj->s);

   nj->payload.inline_data = read_vector_payload(p, UGPR, inline_len);
}

/*
 * Read the fragment shader payload.
 *
 * Values are read in this order: the second uniform header (SIMD32 only),
 * push constants, barycentrics for each enabled interpolation mode, source
 * depth / W if used, and the per-attribute deltas. The fragment coordinate is
 * then derived from the uniform headers if the shader reads it. For SIMD32,
 * the GPR preloads are renumbered afterwards and a DESWIZZLE is inserted.
 */
static void
setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
{
   jay_fs_payload *fs = &nj->payload.fs;

   /* SIMD32 has a second uniform vector following u0 (u0 itself is read by
    * the caller before this function runs).
    */
   if (nj->s->dispatch_width == 32) {
      nj->payload.u1 = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s));
   }

   setup_payload_push(nj, p);

   /* Two GPRs per enabled barycentric mode */
   u_foreach_bit(i, nj->s->prog_data->fs.barycentric_interp_modes) {
      fs->bary[i] = read_vector_payload(p, GPR, 2);
   }

   if (nj->s->prog_data->fs.uses_src_depth) {
      fs->coord.z = read_payload(p, GPR);
   }

   if (nj->s->prog_data->fs.uses_src_w) {
      fs->coord.w = read_payload(p, GPR);
   }

   /* Each attribute delta is three UGPRs. After every fifth one an extra
    * UGPR is read and discarded, so five deltas plus padding span 16 UGPRs.
    */
   unsigned nr_attribs = 16 * 4; /* TODO */
   for (unsigned i = 0; i < nr_attribs; ++i) {
      jay_def comps[] = { read_payload(p, UGPR), read_payload(p, UGPR),
                          read_payload(p, UGPR) };

      /* The .yz components are swizzled in the hardware compared to NIR. */
      SWAP(comps[1], comps[2]);
      fs->deltas[i] = jay_collect_vectors(&nj->bld, comps, ARRAY_SIZE(comps));

      /* Padding */
      if ((i % 5) == 4) {
         read_payload(p, UGPR);
      }
   }

   /* Build the XY fragment coordinate from a range of u0 and the matching
    * range supplied by payload_u1() (presumably the u1 counterpart -- TODO
    * confirm against payload_u1).
    */
   if (BITSET_TEST(nj->nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
      jay_def t = jay_alloc_def(&nj->bld, GPR, 1);
      jay_def lo = jay_extract_range(nj->payload.u0, 10, 4);
      jay_EXPAND_QUAD(&nj->bld, t, lo, payload_u1(nj, 10, 4));
      fs->coord.xy = jay_OFFSET_PACKED_PIXEL_COORDS_u32(&nj->bld, t);
   }

   /* Due to complexities of the physical payload, the logical payload is split
    * into even/odd halves. Fix up the offsets and insert copies.
    *
    * Every GPR PRELOAD in the payload block is renumbered: even registers map
    * to reg / 2, odd registers to (total GPRs read) + reg / 2. A DESWIZZLE
    * carrying the total GPR count is then inserted at the top of the block.
    */
   if (nj->s->dispatch_width == 32) {
      jay_builder *b = &nj->bld;
      jay_foreach_inst_in_block(nj->after_block, I) {
         if (I->op == JAY_OPCODE_PRELOAD && I->dst.file == GPR) {
            unsigned base = (jay_preload_reg(I) % 2) ? p->offsets[GPR] : 0;
            jay_set_preload_reg(I, base + (jay_preload_reg(I) / 2));
         }
      }

      b->cursor = jay_before_block(nj->after_block);
      jay_DESWIZZLE(b, p->offsets[GPR]);
   }
}

/*
 * Create the entry block and fill it with the payload preloads.
 *
 * The block is left in nj->after_block, so the first NIR block is emitted
 * into it. The common uniform header u0 is read first, then the
 * stage-specific payload, and finally the lane ID is computed.
 */
static void
jay_setup_payload(struct nir_to_jay_state *nj)
{
   jay_builder *b = &nj->bld;
   jay_shader *s = nj->s;

   nj->after_block = jay_create_block(nj);
   b->cursor = jay_after_block(nj->after_block);

   struct payload_builder p = { .b = &nj->bld };

   /* u0 comes first in every stage; component 3 is the sampler state pointer */
   nj->payload.u0 = read_vector_payload(&p, UGPR, jay_ugpr_per_grf(s));
   nj->payload.sampler_state_pointer = jay_extract(nj->payload.u0, 3);

   if (s->stage == MESA_SHADER_VERTEX) {
      setup_vertex_payload(nj, &p);
   } else if (s->stage == MESA_SHADER_FRAGMENT) {
      setup_fragment_payload(nj, &p);
   } else if (s->stage == MESA_SHADER_COMPUTE ||
              s->stage == MESA_SHADER_KERNEL) {
      setup_compute_payload(nj, &p);
   } else {
      UNREACHABLE("unimplemented shader stages");
   }

   /* Lane ID calculations require &W and therefore are calculated in
    * uniform control flow to sidestep RA problems. The easy solution is
    * calculating the lane ID in the first block.
    *
    * XXX: This doesn't work for multi-function. Reconsider.
    */
   nj->payload.lane_id = jay_LANE_ID_8_u16(b);

   /* Widen the 8-lane ID step by step, doubling until the dispatch width */
   unsigned width = 8;
   while (width < s->dispatch_width) {
      nj->payload.lane_id = jay_LANE_ID_EXPAND_u16(b, nj->payload.lane_id, width);
      width *= 2;
   }
}

/*
 * NIR may hand us blocks that are logically unreachable (for example after an
 * infinite loop). Such a block has no predecessors, yet it still has
 * successors and may feed phis. That breaks the IR invariant
 *
 *    Live-in sources are live-out in all predecessors.
 *
 * which RA (validation) relies on. Since the block can never execute, it is
 * safe to cut it out of the CFG: it is removed from the predecessor lists of
 * all its successors and its own successor slots are cleared. This cannot
 * introduce critical edges.
 *
 * Disconnecting one block may leave a successor without predecessors, so the
 * scan repeats until nothing changes.
 */
static void
jay_remove_unreachable_blocks(jay_function *func)
{
   bool changed = true;

   while (changed) {
      changed = false;

      jay_foreach_block(func, pred) {
         /* The entry block legitimately has no predecessors */
         if (pred == jay_first_block(func))
            continue;

         if (jay_num_predecessors(pred, GPR) != 0)
            continue;

         /* Already disconnected (or a genuine leaf): nothing to unlink */
         if (jay_num_successors(pred, GPR) == 0)
            continue;

         jay_foreach_successor(pred, succ, GPR) {
            util_dynarray_delete_unordered(&succ->logical_preds, jay_block *,
                                           pred);
         }

         jay_foreach_successor(pred, succ, UGPR) {
            util_dynarray_delete_unordered(&succ->physical_preds,
                                           jay_block *, pred);
         }

         pred->logical_succs[0] = NULL;
         pred->logical_succs[1] = NULL;
         pred->physical_succs[0] = NULL;
         pred->physical_succs[1] = NULL;

         changed = true;
      }
   }
}

/*
 * Translate one NIR function implementation into a new Jay function.
 *
 * For the entrypoint, the payload is set up first. The body is then emitted,
 * followed by the end-of-thread handling and the removal of unreachable
 * blocks.
 */
static void
jay_from_nir_function(const struct intel_device_info *devinfo,
                      nir_shader *nir,
                      jay_shader *s,
                      nir_function_impl *impl)
{
   jay_function *f = jay_new_function(s);

   /* Jay indices match NIR indices. Therefore the first impl->ssa_alloc
    * indices are reserved. Our own temporaries go after.
    */
   f->ssa_alloc = impl->ssa_alloc;
   f->is_entrypoint = impl->function->is_entrypoint;

   struct nir_to_jay_state nj = {
      .devinfo = devinfo,
      .nir = nir,
      .s = s,
      .f = f,
      .bld = (jay_builder) { .shader = s, .func = f },
   };

   if (f->is_entrypoint)
      jay_setup_payload(&nj);

   jay_emit_cf_list(&nj, &impl->body);
   jay_emit_eot(&nj);
   jay_remove_unreachable_blocks(f);
}

/*
 * Accumulate per-shader statistics into `stats`.
 *
 * Every instruction except SYNC contributes to the instruction count. WHILE
 * counts as a loop and SEND as a send. Sends attributable to spills and fills
 * are subtracted from the send count at the end.
 */
static void
jay_gather_stats(const jay_shader *s, struct genisa_stats *stats)
{
   jay_foreach_inst_in_shader(s, f, I) {
      if (I->op != JAY_OPCODE_SYNC) {
         stats->instrs += jay_macro_length(I) << jay_simd_split(s, I);
      }

      if (I->op == JAY_OPCODE_WHILE) {
         stats->loops += 1;
      }

      if (I->op == JAY_OPCODE_SEND) {
         stats->sends += 1;
      }

      /* XXX: Write a real cycle model */
      stats->cycles += 1;
   }

   stats->fills = s->fills;
   stats->spills = s->spills;
   stats->sends -= (s->spills + s->fills);
}

/*
 * Compile a NIR shader with the Jay backend.
 *
 * Runs NIR processing, translates every function to Jay IR, runs the
 * optimization / lowering / register allocation pipeline and encodes the
 * result. Stage-specific dispatch information, program size and scratch
 * requirements are written back into `prog_data`.
 *
 * The returned binary is reparented to `mem_ctx`; the intermediate Jay shader
 * is freed before returning.
 */
struct jay_shader_bin *
jay_compile(const struct intel_device_info *devinfo,
            void *mem_ctx,
            nir_shader *nir,
            union brw_any_prog_data *prog_data,
            union brw_any_prog_key *key)
{
   jay_debug = debug_get_option_jay_debug();
   bool debug =
      INTEL_DEBUG(intel_debug_flag_for_shader_stage(nir->info.stage)) &&
      !(nir->info.internal || NIR_DEBUG(PRINT_INTERNAL));

   unsigned simd_width = jay_process_nir(devinfo, nir, prog_data, key);

   if (debug) {
      /* We can't use nir_print_shader since it reindexes SSA defs. */
      fprintf(stdout, "NIR right before from_nir:\n\n");
      nir_print_shader_annotated(nir, stdout, NULL);
      fflush(stdout);
   }

   jay_shader *s = jay_new_shader(NULL, nir->info.stage);
   s->dispatch_width = simd_width;
   s->scratch_size = align(nir->scratch_size, 4) * s->dispatch_width;
   s->devinfo = devinfo;
   s->prog_data = prog_data;

   nir_foreach_function_impl(impl, nir) {
      jay_from_nir_function(devinfo, nir, s, impl);
   }

   /* Re-number block indices to be sequential and match the NIR. This ensures
    * block indices are ordered with respect to the control flow graph which is
    * a convenient IR invariant.
    */
   jay_foreach_function(s, f) {
      unsigned index = 0;

      jay_foreach_block(f, b) {
         b->index = index++;
      }
   }

   jay_validate(s, "NIR->Jay translation");

   /* After each propagation pass, eliminate dead code. This ensures use counts
    * are correct in jay_opt_propagate_backwards which allows more progress. We
    * don't do a progress loop - just run DCE an extra time. DCE is cheap.
    */
   if (!(jay_debug & JAY_DBG_NOOPT)) {
      JAY_PASS(s, jay_opt_propagate_forwards);
      JAY_PASS(s, jay_opt_dead_code);

      JAY_PASS(s, jay_opt_propagate_backwards);
      JAY_PASS(s, jay_opt_dead_code);
   }

   if (debug) {
      fprintf(stdout, "Jay shader:\n\n");
      jay_print(stdout, s);
   }

   JAY_PASS(s, jay_assign_flags);
   if (!(jay_debug & JAY_DBG_NOOPT)) {
      JAY_PASS(s, jay_opt_dead_code);
   }

   JAY_PASS(s, jay_lower_pre_ra);
   JAY_PASS(s, jay_partition_grf);
   JAY_PASS(s, jay_register_allocate);
   JAY_PASS(s, jay_lower_post_ra);
   JAY_PASS(s, jay_insert_fp_mode, nir->info.float_controls_execution_mode,
            nir->info.bit_sizes_float);

   if (!(jay_debug & JAY_DBG_NOOPT)) {
      /* jay_assign_accumulators uses a conservative liveness analysis for
       * predication, so assign accumulators before predicating for better
       * results.
       */
      if (!(jay_debug & JAY_DBG_NOACC)) {
         JAY_PASS(s, jay_assign_accumulators);
      }

      JAY_PASS(s, jay_opt_predicate);
   }

   if (jay_debug & JAY_DBG_SYNC) {
      JAY_PASS(s, jay_lower_scoreboard_trivial);
   } else {
      JAY_PASS(s, jay_lower_scoreboard);
   }

   if (debug) {
      fprintf(stdout, "Jay shader (post-RA):\n\n");
      jay_print(stdout, s);
   }

   struct jay_shader_bin *bin =
      jay_to_binary(s, nir->constant_data, nir->constant_data_size, debug);
   assert(bin->kernel);
   ralloc_steal(mem_ctx, bin);

   /* Fill in every stats field before the debug dump below, so the dump is
    * given the same stats that are returned to the caller.
    */
   jay_gather_stats(s, &bin->stats);
   bin->stats.code_size = bin->size;
   bin->stats.workgroup_memory_size = nir->info.shared_size;
   bin->stats.dispatch_width = simd_width;

   if (debug) {
      if (nir->info.label) {
         fprintf(stdout, "%s - ", nir->info.label);
      }

      const char *shader_name =
         ralloc_asprintf(s, "%s SIMD%u", _mesa_shader_stage_to_abbrev(s->stage),
                         s->dispatch_width);
      genisa_stats_fprintf(stdout, shader_name, &bin->stats);
   }

   /* Report which dispatch width was compiled. Only one variant is produced,
    * so its program offset is always zero.
    */
   if (s->stage == MESA_SHADER_FRAGMENT) {
      if (simd_width == 8) {
         prog_data->fs.dispatch_8 = true;
      } else if (simd_width == 16) {
         prog_data->fs.dispatch_16 = true;
         prog_data->fs.prog_offset_16 = 0;
      } else if (simd_width == 32) {
         prog_data->fs.dispatch_32 = true;
         prog_data->fs.prog_offset_32 = 0;
      }

   } else if (mesa_shader_stage_is_compute(s->stage)) {
      /* Slot 0 / 1 / 2 corresponds to SIMD8 / SIMD16 / anything wider */
      unsigned i = simd_width == 8 ? 0 : simd_width == 16 ? 1 : 2;
      prog_data->cs.prog_offset[i] = 0;
      prog_data->cs.prog_mask = BITFIELD_BIT(i);
      prog_data->cs.prog_spilled = s->scratch_size > 0; /* XXX */
   }

   prog_data->base.program_size = bin->size;

   if (s->scratch_size > 0) {
      /* We currently only support up to 2MB of scratch space.  If we
       * need to support more eventually, the documentation suggests
       * that we could allocate a larger buffer, and partition it out
       * ourselves.  We'd just have to undo the hardware's address
       * calculation by subtracting (FFTID * Per Thread Scratch Space)
       * and then add FFTID * (Larger Per Thread Scratch Space).
       *
       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
       * Thread Group Tracking > Local Memory/Scratch Space.
       */
      assert(s->scratch_size <= devinfo->max_scratch_size_per_thread &&
             "maximum scratch size");

      /* Take the max of any previously compiled variant of the shader. In the
       * case of bindless shaders with return parts, this will also take the
       * max of all parts.
       */
      prog_data->base.total_scratch =
         MAX2(prog_data->base.total_scratch,
              util_next_power_of_two(s->scratch_size));
   }

   /* Scratch is allocated in 1KiB increments. */
   prog_data->base.total_scratch = align(prog_data->base.total_scratch, 1024);

   ralloc_free(s);
   return bin;
}