mesa/src/intel/compiler/jay/jay_from_nir.c
Alyssa Rosenzweig 52224bb597 jay/lower_scoreboard: refactor
no functional change, just reshuffling code for next commit.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41510>
2026-05-12 22:46:33 +00:00

2788 lines
89 KiB
C

/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/brw/brw_compiler.h"
#include "compiler/brw/brw_eu.h"
#include "compiler/brw/brw_eu_defines.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/brw/brw_sampler.h"
#include "compiler/intel_nir.h"
#include "compiler/intel_shader_enums.h"
#include "compiler/list.h"
#include "intel/dev/intel_debug.h"
#include "util/bitpack_helpers.h"
#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/lut.h"
#include "util/macros.h"
#include "util/u_math.h"
#include "intel_device_info_gen.h"
#include "jay.h"
#include "jay_builder.h"
#include "jay_builder_opcodes.h"
#include "jay_ir.h"
#include "jay_opcodes.h"
#include "jay_private.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_defines.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "nir_opcodes.h"
#include "nir_search_helpers.h"
#include "shader_enums.h"
#include "shader_stats.h"
static const struct debug_named_value jay_debug_options[] = {
{ "noopt", JAY_DBG_NOOPT, "Disable backend optimizer" },
{ "printdemand", JAY_DBG_PRINTDEMAND, "Print demand per instruction" },
{ "spill", JAY_DBG_SPILL, "Shrink register file to test spilling" },
{ "sync", JAY_DBG_SYNC, "Sync after every instruction" },
{ "noacc", JAY_DBG_NOACC, "Disable accumulator substitution" },
DEBUG_NAMED_VALUE_END
};
DEBUG_GET_ONCE_FLAGS_OPTION(jay_debug, "JAY_DEBUG", jay_debug_options, 0)
int jay_debug = 0;
typedef struct jay_vs_payload {
/* "the maximum limit is 30 elements per vertex" (bspec 56124) */
jay_def attributes[30 * 4];
} jay_vs_payload;
typedef struct jay_cs_payload {
jay_def local_invocation_ids;
} jay_cs_payload;
typedef struct jay_fs_payload {
jay_def bary[INTEL_BARYCENTRIC_MODE_COUNT];
struct {
jay_def xy, z, w;
} coord;
jay_def pixel_sample_mask;
jay_def deltas[64];
} jay_fs_payload;
struct nir_to_jay_state {
jay_shader *s;
jay_function *f;
const nir_shader *nir;
const struct intel_device_info *devinfo;
jay_builder bld;
jay_block *current_block;
jay_block *after_block;
jay_block *break_block;
unsigned indent;
/* We cache ballot(true), ctz(ballot(true)), and 4*ctz(ballot(true)) within a
* block. If we had competent backend CSE - or emitted uniformize in NIR and
* taught NIR's CSE about ballots - we could remove this kludge.
*/
jay_def active_lane_mask, active_lane, active_lane_x4;
/* These defs contain the extracted payload. They are only valid while
* translating NIR->Jay since they aren't maintained by Jay passes.
*/
struct {
jay_def u0, u1;
jay_def sampler_state_pointer, scratch_surface;
jay_def inline_data;
jay_def push_data[512];
jay_def lane_id;
jay_def urb_handle;
union {
jay_vs_payload vs;
jay_cs_payload cs;
jay_fs_payload fs;
};
} payload;
};
static jay_def
payload_u1(struct nir_to_jay_state *nj, unsigned idx, unsigned len)
{
if (jay_is_null(nj->payload.u1))
return jay_null();
else
return jay_extract_range(nj->payload.u1, idx, len);
}
static jay_def
emit_active_lane_mask(struct nir_to_jay_state *nj)
{
/* Note that we don't use mask0 since it needs fixups. Just ballot(true). */
if (jay_is_null(nj->active_lane_mask)) {
nj->active_lane_mask = jay_alloc_def(&nj->bld, FLAG, 1);
jay_MOV(&nj->bld, nj->active_lane_mask, 1);
}
return nj->active_lane_mask;
}
static jay_def
emit_active_lane(struct nir_to_jay_state *nj)
{
/* For this instruction to execute, some lane must be active. Therefore there
* is a 1 in the lower [dispatch width] bits of the lane mask, so we may
* equivalently use fbl.u32 instead of fbl.u[dispatch width].
*/
if (jay_is_null(nj->active_lane)) {
nj->active_lane = jay_alloc_def(&nj->bld, UGPR, 1);
jay_FBL(&nj->bld, nj->active_lane, emit_active_lane_mask(nj));
}
return nj->active_lane;
}
static jay_def
emit_uniformize(struct nir_to_jay_state *nj, jay_def x)
{
jay_builder *b = &nj->bld;
if (x.file != GPR && x.file != FLAG) {
return x;
}
if (jay_is_null(nj->active_lane_x4)) {
nj->active_lane_x4 = jay_SHL_u32(b, emit_active_lane(nj), 2);
}
jay_def u = jay_alloc_def(b, x.file == FLAG ? UFLAG : UGPR, 1);
jay_SHUFFLE(b, u, x, nj->active_lane_x4);
return u;
}
static jay_block *jay_emit_cf_list(struct nir_to_jay_state *nj,
struct exec_list *list);
/** Returns true if the entire compute workgroup fits in a single subgroup. */
static bool
jay_workgroup_is_one_subgroup(jay_builder *b, const nir_shader *nir)
{
return mesa_shader_stage_uses_workgroup(nir->info.stage) &&
!nir->info.workgroup_size_variable &&
nir_static_workgroup_size(nir) <= b->shader->dispatch_width;
}
static enum jay_type
jay_base_type_for_nir(nir_alu_type nir_type)
{
/* clang-format off */
switch (nir_alu_type_get_base_type(nir_type)) {
case nir_type_int: return JAY_TYPE_S;
case nir_type_uint: return JAY_TYPE_U;
case nir_type_bool: return JAY_TYPE_S;
case nir_type_float: return JAY_TYPE_F;
default: UNREACHABLE("invalid NIR type");
}
/* clang-format on */
}
static enum jay_file
jay_file_for_def(const nir_def *def)
{
return def->bit_size == 1 ? (def->divergent ? FLAG : UFLAG) :
(def->divergent ? GPR : UGPR);
}
/**
* Returns an jay_type for the ALU op's i-th source.
* (Useful for conversions and comparisons.)
*/
static enum jay_type
jay_alu_source_type(nir_alu_instr *alu, unsigned i)
{
return jay_type(jay_base_type_for_nir(nir_op_infos[alu->op].input_types[i]),
nir_src_bit_size(alu->src[i].src));
}
static inline jay_def
nj_def(nir_def *def)
{
unsigned bits = def->num_components * MAX2(def->bit_size, 32);
unsigned words = DIV_ROUND_UP(bits, 32);
return jay_contiguous_def(jay_file_for_def(def), def->index, words);
}
static inline jay_def
nj_src(nir_src src)
{
return nj_def(src.ssa);
}
static void
jay_emit_alu(struct nir_to_jay_state *nj, nir_alu_instr *alu)
{
jay_builder *b = &nj->bld;
jay_def dst = nj_def(&alu->def);
nir_alu_type nir_type = nir_op_infos[alu->op].output_type;
enum jay_type base_type = jay_base_type_for_nir(nir_type);
enum jay_type type = jay_type(base_type, alu->def.bit_size);
jay_def src[NIR_ALU_MAX_INPUTS];
for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
unsigned len = nir_src_bit_size(alu->src[i].src) == 64 ? 2 : 1;
src[i] = jay_extract_range(nj_src(alu->src[i].src),
len * alu->src[i].swizzle[0], len);
}
switch (alu->op) {
#define CMP(op, jay) \
case nir_op_##op: \
jay_CMP(b, jay_alu_source_type(alu, 0), JAY_CONDITIONAL_##jay, dst, \
src[0], src[1]); \
break;
#define UNOP(nir, jay_op) \
case nir_op_##nir: \
jay_##jay_op(b, type, dst, src[0]); \
break;
#define MATH(nir, jay_op) \
case nir_op_##nir: \
jay_MATH(b, type, dst, src[0], JAY_MATH_##jay_op); \
break;
#define UNOP_UNTYPED(nir, jay_op) \
case nir_op_##nir: \
jay_##jay_op(b, dst, src[0]); \
break;
#define BINOP(nir, jay_op) \
case nir_op_##nir: \
jay_##jay_op(b, type, dst, src[0], src[1]); \
break;
#define DP4A(nir, jay_op, sat_) \
case nir_op_##nir: \
jay_DP4A_##jay_op(b, dst, src[2], src[0], src[1])->saturate = sat_; \
break;
CMP(flt, LT)
CMP(ilt, LT)
CMP(ult, LT)
CMP(fge, GE)
CMP(ige, GE)
CMP(uge, GE)
CMP(feq, EQ)
CMP(ieq, EQ)
CMP(fneu, NE)
CMP(ine, NE)
MATH(frcp, INV)
MATH(fexp2, EXP)
MATH(flog2, LOG)
MATH(fsin, SIN)
MATH(fcos, COS)
MATH(fsqrt, SQRT)
MATH(frsq, RSQ)
UNOP(ffract, FRC)
UNOP(ftrunc, RNDZ)
UNOP(ffloor, RNDD)
UNOP(fround_even, RNDE)
UNOP_UNTYPED(mov, copy)
UNOP_UNTYPED(unpack_32_2x16_split_x, MOV)
UNOP_UNTYPED(b2b1, CAST_CANONICAL_TO_FLAG)
UNOP_UNTYPED(inot, NOT)
UNOP_UNTYPED(bitfield_reverse, BFREV)
UNOP_UNTYPED(bit_count, CBIT)
UNOP_UNTYPED(uclz, LZD)
UNOP_UNTYPED(find_lsb, FBL)
BINOP(imin, MIN)
BINOP(umin, MIN)
BINOP(fmin, MIN)
BINOP(imax, MAX)
BINOP(umax, MAX)
BINOP(fmax, MAX)
BINOP(fadd, ADD)
BINOP(iadd, ADD)
BINOP(fmul, MUL)
BINOP(imul_32x16, MUL_32X16)
BINOP(umul_32x16, MUL_32X16)
BINOP(ishl, SHL)
BINOP(ishr, ASR)
BINOP(ushr, SHR)
BINOP(urol, ROL)
BINOP(uror, ROR)
BINOP(urhadd, AVG)
BINOP(irhadd, AVG)
BINOP(iand, AND)
BINOP(ior, OR)
BINOP(ixor, XOR)
DP4A(sdot_4x8_iadd, SS, false)
DP4A(sdot_4x8_iadd_sat, SS, true)
DP4A(udot_4x8_uadd, UU, false)
DP4A(udot_4x8_uadd_sat, UU, true)
DP4A(sudot_4x8_iadd, SU, false)
DP4A(sudot_4x8_iadd_sat, SU, true)
#undef CMP
#undef UNOP
#undef UNOP_UNTYPED
#undef BINOP
#undef DP4A
case nir_op_imul:
if (jay_type_size_bits(type) == 32) {
jay_MUL_32(b, type, dst, src[0], src[1], false);
} else {
jay_MUL(b, type, dst, src[0], src[1]);
}
break;
case nir_op_imul_high:
case nir_op_umul_high:
jay_MUL_32(b, type, dst, src[0], src[1], true);
break;
case nir_op_bfm:
jay_BFI1(b, dst, src[0], src[1]);
break;
case nir_op_b2f64:
jay_SEL(b, JAY_TYPE_U32, jay_extract(dst, 1), 0x3ff00000, 0, src[0]);
jay_MOV(b, jay_extract(dst, 0), 0);
break;
case nir_op_ufind_msb_rev:
case nir_op_ifind_msb_rev:
jay_FBH(b, jay_alu_source_type(alu, 0), dst, src[0]);
break;
case nir_op_u2u8:
case nir_op_u2u16:
case nir_op_u2u32:
case nir_op_i2i8:
case nir_op_i2i16:
case nir_op_i2i32:
assert(nir_src_bit_size(alu->src[0].src) > 1 &&
"predicate conversions are lowered");
if (alu->def.bit_size <= nir_src_bit_size(alu->src[0].src)) {
/* Downconversion. Upper bits garbage convention makes this a no-op.
* The extract handles 64->32 narrowing conversions.
*/
jay_MOV(b, dst, jay_extract(src[0], 0));
break;
}
FALLTHROUGH;
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2u64:
case nir_op_u2f64:
case nir_op_f2f64:
case nir_op_f2i64:
case nir_op_f2u64:
case nir_op_f2i32:
case nir_op_f2u32:
case nir_op_f2i32_sat:
case nir_op_f2u32_sat:
case nir_op_i2f32:
case nir_op_u2f32:
case nir_op_f2f32:
case nir_op_i2f16:
case nir_op_u2f16:
case nir_op_f2f16:
case nir_op_f2i16:
case nir_op_f2u16:
case nir_op_f2i8:
case nir_op_f2u8: {
enum jay_type src_type = jay_alu_source_type(alu, 0);
/* UGPR byte to float is not supported. Do it in 2 steps. */
if (jay_type_size_bits(src_type) == 8 &&
jay_base_type(type) == JAY_TYPE_F &&
dst.file == UGPR) {
enum jay_type integer = jay_type_rebase(type, jay_base_type(src_type));
jay_def tmp = jay_alloc_def(b, UGPR, 1);
jay_CVT(b, integer, tmp, src[0], src_type, JAY_ROUND, 0);
jay_CVT(b, type, dst, tmp, integer, JAY_ROUND, 0);
} else {
jay_CVT(b, type, dst, src[0], src_type, JAY_ROUND, 0);
}
break;
}
case nir_op_f2f16_rtne:
case nir_op_f2f16_rtz:
jay_CVT(b, JAY_TYPE_F16, dst, src[0], jay_alu_source_type(alu, 0),
alu->op == nir_op_f2f16_rtz ? JAY_RTZ : JAY_RNE, 0);
break;
case nir_op_fsat:
jay_MODIFIER(b, type, dst, src[0])->saturate = true;
break;
case nir_op_fneg:
case nir_op_ineg:
jay_MODIFIER(b, type, dst, jay_negate(src[0]));
break;
case nir_op_fabs:
case nir_op_iabs:
jay_MODIFIER(b, type, dst, jay_abs(src[0]));
break;
case nir_op_iadd3:
jay_ADD3(b, type, dst, src[0], src[1], src[2]);
break;
case nir_op_uadd_sat:
case nir_op_iadd_sat:
jay_ADD(b, type, dst, src[0], src[1])->saturate = true;
break;
case nir_op_usub_sat:
case nir_op_isub_sat:
jay_ADD(b, type, dst, src[0], jay_negate(src[1]))->saturate = true;
break;
case nir_op_ihadd:
case nir_op_uhadd: {
/* AVG(x, y) - ((x ^ y) & 1) */
jay_def avg = jay_alloc_def(b, dst.file, 1);
jay_def bfn = jay_alloc_def(b, dst.file, 1);
jay_AVG(b, type, avg, src[0], src[1]);
jay_BFN(b, bfn, 1, src[0], src[1], UTIL_LUT3(a & (b ^ c)));
jay_ADD(b, type, dst, avg, jay_negate(bfn));
break;
}
case nir_op_unpack_64_2x32_split_x:
jay_MOV(b, dst, jay_extract(src[0], 0));
break;
case nir_op_unpack_64_2x32_split_y:
jay_MOV(b, dst, jay_extract(src[0], 1));
break;
case nir_op_unpack_32_2x16_split_y:
jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND, 1);
break;
case nir_op_pack_32_4x8_split: {
/* TODO: Optimize */
jay_def r = jay_BFI2_u32(b, 0x0000ff00, src[1], src[0]);
r = jay_BFI2_u32(b, 0x00ff0000, src[2], r);
jay_BFI2(b, dst, 0xff000000, src[3], r);
break;
}
case nir_op_pack_32_2x16_split:
if (nir_src_is_const(alu->src[0].src) &&
nir_alu_src_as_uint(alu->src[0]) == 0) {
/* pack_32_2x16_split(0, x) is just a shift. This saves a constant. */
jay_SHL(b, JAY_TYPE_U32, dst, src[1], 16);
} else {
/* TODO: Optimize */
jay_BFI2(b, dst, 0xffff0000, src[1], src[0]);
}
break;
case nir_op_pack_64_2x32_split:
jay_MOV(b, jay_extract(dst, 0), src[0]);
jay_MOV(b, jay_extract(dst, 1), src[1]);
break;
case nir_op_bitfield_select:
assert(jay_type_size_bits(type) <= 32);
jay_BFN(b, dst, src[0], src[1], src[2], UTIL_LUT3((a & b) | (~a & c)));
break;
case nir_op_ubfe:
case nir_op_ibfe:
jay_BFE(b, type, dst, src[0], src[1], src[2]);
break;
case nir_op_bfi:
jay_BFI2(b, dst, src[0], src[1], src[2]);
break;
case nir_op_ffma:
jay_MAD(b, type, dst, src[0], src[1], src[2]);
break;
case nir_op_fcsel:
jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod =
JAY_CONDITIONAL_NE;
break;
case nir_op_fcsel_gt:
case nir_op_i32csel_gt:
jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod =
JAY_CONDITIONAL_GT;
break;
case nir_op_fcsel_ge:
case nir_op_i32csel_ge:
jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod =
JAY_CONDITIONAL_GE;
break;
case nir_op_bcsel:
assert(alu->def.bit_size < 64);
assert(jay_is_flag(src[0]));
/* sel.s32 can propagate more modifiers than sel.u32 with no drawback */
type = jay_type_rebase(type, JAY_TYPE_S);
/* b2i8 gets lowered into 8-bit csel. Just use the upper bits garbage
* convention to implement with SEL.u16 instead.
*/
if (type == JAY_TYPE_S8) {
type = JAY_TYPE_S16;
}
/* SEL.f32 flushes denorms but SEL.u32 does not, so we can only use the
* float types when we are used only as a float. We care about the uses
* and not the sources here, to ensure we pick u32 instead of f32 for:
*
* ieq(1, bcsel(a, fneg(b), c))
*
* Picking sel.f32 would incorrectly "flush" the integer c. However, when
* we can use sel.f32, we prefer it since it usually gives more
* flexibility for modifiers and saturation.
*/
if (is_only_used_as_float(alu)) {
type = jay_type_rebase(type, JAY_TYPE_F);
}
jay_SEL(b, type, dst, src[1], src[2], src[0]);
break;
case nir_op_extract_u8:
jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U8, JAY_ROUND,
nir_alu_src_as_uint(alu->src[1]));
break;
case nir_op_extract_i8:
jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S8, JAY_ROUND,
nir_alu_src_as_uint(alu->src[1]));
break;
case nir_op_extract_u16:
jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND,
nir_alu_src_as_uint(alu->src[1]));
break;
case nir_op_extract_i16:
jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S16, JAY_ROUND,
nir_alu_src_as_uint(alu->src[1]));
break;
default:
if (nir_op_is_vec(alu->op)) {
for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
unsigned len = jay_type_vector_length(type);
jay_copy(b, jay_extract_range(dst, len * i, len), src[i]);
}
break;
}
nir_print_instr(&alu->instr, stderr);
fprintf(stderr, "\n");
UNREACHABLE("unhandled instruction");
}
}
static void
jay_emit_load_const(struct nir_to_jay_state *nj, nir_load_const_instr *lc)
{
jay_builder *b = &nj->bld;
jay_def dst = nj_def(&lc->def);
assert(lc->def.num_components == 1 && "must be scalarized");
if (lc->def.bit_size == 64 && lc->value[0].u64 >> 32) {
jay_MOV_IMM64(b, dst, lc->value[0].u64);
} else {
jay_MOV(b, dst, lc->value[0].u32);
}
}
static jay_def
jay_resource_handle(jay_builder *b,
nir_src *nsrc,
unsigned *bti_const,
bool *internal,
bool *bindless)
{
if (!nsrc) {
return jay_null();
}
nir_intrinsic_instr *rin = nir_src_as_intrinsic(*nsrc);
if (nir_src_is_const(*nsrc)) {
*bti_const = nir_src_as_uint(*nsrc);
return jay_null();
} else if (!rin || rin->intrinsic != nir_intrinsic_resource_intel) {
return nj_src(*nsrc);
}
uint32_t flags = nir_intrinsic_resource_access_intel(rin);
if (internal) {
*internal = !!(flags & nir_resource_intel_internal);
}
if (bindless) {
*bindless = !!(flags & nir_resource_intel_bindless);
}
if (nir_src_is_const(rin->src[1])) {
*bti_const = nir_src_as_uint(rin->src[1]);
return jay_null();
} else {
return nj_src(rin->src[1]);
}
}
static inline enum lsc_flush_type
translate_flush_type(nir_intrinsic_instr *intr)
{
switch (nir_intrinsic_memory_semantics(intr)) {
case NIR_MEMORY_ACQUIRE:
return LSC_FLUSH_TYPE_INVALIDATE;
case NIR_MEMORY_RELEASE:
return LSC_FLUSH_TYPE_CLEAN;
case NIR_MEMORY_ACQ_REL:
return LSC_FLUSH_TYPE_EVICT;
case NIR_MEMORY_MAKE_AVAILABLE:
case NIR_MEMORY_MAKE_VISIBLE:
default:
UNREACHABLE("unexpected memory semantic");
}
}
static void
emit_lsc_fence(struct nir_to_jay_state *nj,
nir_intrinsic_instr *intr,
enum brw_sfid sfid)
{
bool device = nir_intrinsic_memory_scope(intr) >= SCOPE_QUEUE_FAMILY;
enum lsc_fence_scope scope = device ? LSC_FENCE_TILE : LSC_FENCE_THREADGROUP;
enum lsc_flush_type type =
sfid == BRW_SFID_SLM ? LSC_FLUSH_TYPE_NONE : translate_flush_type(intr);
jay_def notif = jay_alloc_def(&nj->bld, UGPR, jay_ugpr_per_grf(nj->s));
uint32_t desc = lsc_fence_msg_desc(nj->s->devinfo, scope, type, false);
jay_SEND(&nj->bld, .sfid = sfid, .msg_desc = desc, .srcs = &nj->payload.u0,
.nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true, .dst = notif);
}
static void
jay_emit_memory_barrier(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
{
nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
if (modes & nir_var_image) {
emit_lsc_fence(nj, intr, BRW_SFID_TGM);
assert(!nj->nir->info.use_lowered_image_to_global && "fix common code");
}
if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
emit_lsc_fence(nj, intr, BRW_SFID_UGM);
}
if (modes & (nir_var_shader_out | nir_var_mem_task_payload)) {
emit_lsc_fence(nj, intr, BRW_SFID_URB);
}
if ((modes & nir_var_mem_shared) &&
!jay_workgroup_is_one_subgroup(&nj->bld, nj->nir)) {
emit_lsc_fence(nj, intr, BRW_SFID_SLM);
}
}
static void
jay_emit_signal_barrier(jay_builder *b, struct nir_to_jay_state *nj)
{
/* Signal barrier / Active threads only (BSpec 72052).
*
* Source 0 is the number of subgroups in [31:24], which comes from the u0.2
* payload in [31:24]. Mask out the other bits, then replicate to [23:15].
*
* TODO: This can be done faster with a SIMD2 8-bit move.
*/
jay_def a = jay_AND_u32(b, jay_extract(nj->payload.u0, 2), 0xff000000);
jay_def m2 = jay_OR_u32(b, a, jay_SHR_u32(b, a, 8));
/* Use an active threads only barrier. TODO: I think we can optimize. */
if (b->shader->devinfo->ver >= 20) {
m2 = jay_OR_u32(b, m2, BITFIELD_BIT(8));
}
uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
indices[2] = jay_index(m2);
jay_def zipped = jay_collect(b, UGPR, indices, 3);
jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY,
.msg_desc = BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG, .srcs = &zipped,
.nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true);
}
static void
jay_emit_derivative(jay_builder *b,
jay_def dst,
nir_intrinsic_instr *intr,
enum jay_quad_swizzle swz0,
enum jay_quad_swizzle swz1)
{
assert(intr->def.bit_size == 32 && "todo");
jay_def val = nj_src(intr->src[0]);
jay_ADD(b, JAY_TYPE_F32, dst, jay_QUAD_SWIZZLE_u32(b, val, swz1),
jay_negate(jay_QUAD_SWIZZLE_u32(b, val, swz0)));
}
static void
jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr)
{
jay_def data = nj_src(intr->src[0]);
jay_def srcs[8];
/* Optimize unconditional discards. Should probably do this in NIR. */
bool trivial =
nir_src_is_const(intr->src[2]) && nir_src_as_bool(intr->src[2]);
for (unsigned i = 0; i < nir_src_num_components(intr->src[0]); ++i) {
srcs[i] =
trivial ? jay_UNDEF_u32(b) : jay_as_gpr(b, jay_extract(data, i));
}
jay_inst *send =
jay_SEND(b, .sfid = BRW_SFID_RENDER_CACHE, .check_tdr = true,
.msg_desc = nir_scalar_as_uint(nir_scalar_chase_movs(
nir_get_scalar(intr->src[1].ssa, 0))) |
(nir_scalar_as_uint(nir_scalar_chase_movs(
nir_get_scalar(intr->src[1].ssa, 1)))
<< 32),
.srcs = srcs, .nr_srcs = nir_src_num_components(intr->src[0]),
.type = JAY_TYPE_U32, .eot = nir_intrinsic_eot(intr));
/* Handle the disable predicate. It is logically inverted. */
if (!nir_src_is_const(intr->src[2]) || nir_src_as_bool(intr->src[2])) {
jay_add_predicate(b, send, jay_negate(nj_src(intr->src[2])));
}
}
static enum lsc_data_size
lsc_bits_to_data_size(unsigned bit_size)
{
/* clang-format off */
switch (bit_size / 8) {
case 1: return LSC_DATA_SIZE_D8U32;
case 2: return LSC_DATA_SIZE_D16U32;
case 4: return LSC_DATA_SIZE_D32;
case 8: return LSC_DATA_SIZE_D64;
default: UNREACHABLE("Unsupported data size.");
}
/* clang-format on */
}
static enum lsc_opcode
lsc_op_for_atomic(nir_atomic_op op)
{
/* clang-format off */
switch (op) {
case nir_atomic_op_iadd: return LSC_OP_ATOMIC_ADD;
case nir_atomic_op_imin: return LSC_OP_ATOMIC_MIN;
case nir_atomic_op_umin: return LSC_OP_ATOMIC_UMIN;
case nir_atomic_op_imax: return LSC_OP_ATOMIC_MAX;
case nir_atomic_op_umax: return LSC_OP_ATOMIC_UMAX;
case nir_atomic_op_iand: return LSC_OP_ATOMIC_AND;
case nir_atomic_op_ior: return LSC_OP_ATOMIC_OR;
case nir_atomic_op_ixor: return LSC_OP_ATOMIC_XOR;
case nir_atomic_op_xchg: return LSC_OP_ATOMIC_STORE;
case nir_atomic_op_cmpxchg: return LSC_OP_ATOMIC_CMPXCHG;
case nir_atomic_op_fmin: return LSC_OP_ATOMIC_FMIN;
case nir_atomic_op_fmax: return LSC_OP_ATOMIC_FMAX;
case nir_atomic_op_fcmpxchg: return LSC_OP_ATOMIC_FCMPXCHG;
case nir_atomic_op_fadd: return LSC_OP_ATOMIC_FADD;
default: UNREACHABLE("Unsupported NIR atomic");
}
/* clang-format on */
}
static jay_def
jay_src_as_strided(jay_builder *b,
jay_def x,
unsigned element_sz,
enum jay_file dst_file)
{
if (dst_file == UGPR) {
assert(jay_is_uniform(x) && "Uniform dests require uniform sources");
if (x.file != UGPR) {
jay_def tmp = jay_alloc_def(b, UGPR, jay_num_values(x));
jay_copy(b, tmp, x);
x = tmp;
}
uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
unsigned nr = jay_num_values(x) * jay_ugpr_per_grf(b->shader);
assert(nr < ARRAY_SIZE(indices));
for (unsigned i = 0; i < jay_num_values(x) / element_sz; ++i) {
for (unsigned j = 0; j < element_sz; ++j) {
indices[(i * jay_ugpr_per_grf(b->shader)) + j] =
jay_channel(x, (i * element_sz) + j);
}
}
return jay_collect(b, UGPR, indices, nr);
} else {
/* Could be a GPR or UGPR source */
assert(dst_file == GPR);
return jay_as_gpr(b, x);
}
}
static jay_def
jay_scratch_surface(struct nir_to_jay_state *nj)
{
if (jay_is_null(nj->payload.scratch_surface)) {
jay_function *func = nj->f;
assert(func->is_entrypoint && "todo: this needs ABI");
jay_builder b = jay_init_builder(func, jay_before_function(func));
jay_def u0_5 = jay_extract(nj->payload.u0, 5);
nj->payload.scratch_surface = jay_AND_u32(&b, u0_5, ~BITFIELD_MASK(10));
}
return nj->payload.scratch_surface;
}
static void
jay_emit_mem_access(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
{
jay_builder *b = &nj->bld;
bool slm = nir_is_shared_access(intr);
bool tgm = nir_intrinsic_has_image_dim(intr);
bool urb = intr->intrinsic == nir_intrinsic_store_urb_lsc_intel ||
intr->intrinsic == nir_intrinsic_store_urb_vec4_intel;
enum brw_sfid sfid = slm ? BRW_SFID_SLM :
tgm ? BRW_SFID_TGM :
urb ? BRW_SFID_URB :
BRW_SFID_UGM;
nir_src *data_src = nir_get_io_data_src(intr);
bool scratch = intr->intrinsic == nir_intrinsic_load_scratch_intel ||
intr->intrinsic == nir_intrinsic_store_scratch_intel;
enum lsc_opcode op;
if (nir_intrinsic_has_atomic_op(intr))
op = lsc_op_for_atomic(nir_intrinsic_atomic_op(intr));
else if (sfid == BRW_SFID_TGM)
op = data_src ? LSC_OP_STORE_CMASK : LSC_OP_LOAD_CMASK;
else
op = data_src ? LSC_OP_STORE : LSC_OP_LOAD;
nir_src *bti = nir_get_io_index_src(intr), *ubo = NULL;
nir_src *offset_src = tgm ? &intr->src[1] : nir_get_io_offset_src(intr);
if (intr->intrinsic == nir_intrinsic_load_ubo ||
intr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel) {
ubo = bti;
bti = NULL;
b->shader->prog_data->base.has_ubo_pull = true;
}
const struct intel_device_info *devinfo = b->shader->devinfo;
bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest;
jay_def data = data_src ? nj_src(*data_src) : jay_null();
unsigned bti_const = 0;
bool internal = false;
bool bindless = false;
jay_def bti_indirect =
jay_resource_handle(b, bti ?: ubo, &bti_const, &internal, &bindless);
jay_def offset = nj_src(*offset_src);
nir_def *ndata = data_src ? data_src->ssa : &intr->def;
jay_def dst = has_dest ? nj_def(&intr->def) : jay_null();
int32_t base_offset =
nir_intrinsic_has_base(intr) ? nir_intrinsic_base(intr) : 0;
/* Optimize increment/decrement */
if (op == LSC_OP_ATOMIC_ADD && nir_src_is_const(*data_src)) {
int64_t add_val = nir_src_as_int(*data_src);
if (add_val == 1 || add_val == -1) {
op = add_val == 1 ? LSC_OP_ATOMIC_INC : LSC_OP_ATOMIC_DEC;
data = jay_null();
}
}
/* Pack the coordinates. TODO: MSAA */
if (tgm) {
unsigned nr = nir_image_intrinsic_coord_components(intr);
offset = jay_extract_range(offset, 0, nr);
}
internal |= scratch;
enum lsc_addr_surface_type surf_type = internal ? LSC_ADDR_SURFTYPE_SS :
bindless ? LSC_ADDR_SURFTYPE_BSS :
(bti || ubo) ? LSC_ADDR_SURFTYPE_BTI :
LSC_ADDR_SURFTYPE_FLAT;
bool a64 = surf_type == LSC_ADDR_SURFTYPE_FLAT && sfid == BRW_SFID_UGM;
enum lsc_addr_size addr_size = a64 ? LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32;
enum jay_type offset_type = a64 ? JAY_TYPE_U64 : JAY_TYPE_U32;
bool cmask = op == LSC_OP_LOAD_CMASK || op == LSC_OP_STORE_CMASK;
bool uniform = !(has_dest && dst.file != UGPR);
if (nir_intrinsic_has_align(intr)) {
assert(nir_intrinsic_align(intr) >= (ndata->bit_size / 8));
}
if (!has_dest) {
uniform &= jay_is_null(data) || data.file == UGPR;
uniform &= jay_is_null(offset) || offset.file == UGPR;
uniform &= !(cmask || urb);
}
/* Per bspec 57330, 8-bit/16-bit are not supported for transpose */
bool transpose = uniform && !cmask && ndata->bit_size >= 32;
bool scalar_uniform = uniform && !cmask && ndata->bit_size < 32;
if (!uniform) {
offset = jay_as_gpr(b, offset);
} else if (!transpose) {
offset = jay_src_as_strided(b, offset, a64 ? 2 : 1, UGPR);
}
if (!jay_is_null(data) && !transpose && !scalar_uniform)
data = jay_as_gpr(b, data);
unsigned access =
nir_intrinsic_has_access(intr) ? nir_intrinsic_access(intr) : 0;
bool volatile_access = access & ACCESS_VOLATILE;
bool coherent_access = access & ACCESS_COHERENT;
/* Bspec: Atomic instruction -> Cache section:
*
* Atomic messages are always forced to "un-cacheable" in the L1
* cache.
*
* Bspec: Overview of memory Access:
*
* If a read from a Null tile gets a cache-hit in a virtually-addressed
* GPU cache, then the read may not return zeroes.
*
* If a shader writes to a null tile and wants to be able to read it back
* as zero, it will use the 'volatile' decoration for the access, otherwise
* the compiler may choose to optimize things out, breaking the
* residencyNonResidentStrict guarantees. Due to the above, we need to make
* these operations uncached.
*/
unsigned cache =
urb ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) :
lsc_opcode_is_atomic(op) ?
LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
volatile_access ?
(devinfo->ver >= 20 ?
/* Xe2 has a better L3 that can deal with null tiles.*/
(!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
LSC_CACHE(devinfo, LOAD, L1UC_L3C)) :
/* On older platforms, all caches have to be bypassed. */
(!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) :
LSC_CACHE(devinfo, LOAD, L1UC_L3UC))) :
/* Skip L1 for coherent accesses */
coherent_access ? (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
LSC_CACHE(devinfo, LOAD, L1UC_L3C)) :
!has_dest ? LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) :
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS);
unsigned max_imm_bits = brw_max_immediate_offset_bits(surf_type);
assert(base_offset >= u_intN_min(max_imm_bits));
assert(base_offset <= u_intN_max(max_imm_bits));
assert(base_offset == 0 || sfid != BRW_SFID_TGM);
const unsigned base_offs_bits =
util_bitpack_sint(base_offset, 0, max_imm_bits - 1);
unsigned nr = ndata->num_components;
uint64_t desc =
lsc_msg_desc(devinfo, op, surf_type, addr_size,
lsc_bits_to_data_size(ndata->bit_size),
cmask ? BITFIELD_MASK(nr) : nr, transpose, cache);
/* Unlike most SENDs, we may skip the destination of atomics. We do this here
* instead of DCE so we don't need to fix up message descriptors later.
*/
if (nir_intrinsic_has_atomic_op(intr) && nir_def_is_unused(&intr->def)) {
dst = jay_null();
}
jay_def tmp = dst;
if (dst.file == UGPR) {
if (transpose) {
/* Transpose writes whole GRFs, so round up */
tmp = jay_alloc_def(b, UGPR,
ALIGN_POT(jay_num_values(dst),
jay_ugpr_per_grf(b->shader)));
} else {
/* Without transpose we write at GRF granularity. Pad out. */
tmp = jay_alloc_def(b, UGPR,
jay_ugpr_per_grf(b->shader) * jay_num_values(dst));
}
}
jay_def srcs[] = { offset, data };
/* Second data source immediately follows the first */
if (op == LSC_OP_ATOMIC_CMPXCHG || op == LSC_OP_ATOMIC_FCMPXCHG) {
jay_def data2 = nj_src(*(data_src + 1));
if (!transpose) {
data2 = jay_as_gpr(b, data2);
}
srcs[1] = jay_collect_two(b, data, data2);
}
jay_def ex_desc = jay_null();
uint32_t ex_desc_imm = 0;
if (scratch) {
/* TODO: Once we have an address register RA, we should CSE these */
ex_desc = jay_alloc_def(b, J_ADDRESS, 1);
jay_SHR(b, JAY_TYPE_U32, ex_desc, jay_scratch_surface(nj), 4);
if (has_dest) {
b->shader->fills++;
} else {
b->shader->spills++;
}
} else if (surf_type == LSC_ADDR_SURFTYPE_FLAT) {
desc |= ((uint64_t) lsc_flat_ex_desc(devinfo, base_offs_bits) << 32);
} else if (jay_is_null(bti_indirect)) {
desc |=
((uint64_t) lsc_bti_ex_desc(devinfo, bti_const, base_offs_bits) << 32);
} else if (!jay_is_null(bti_indirect)) {
ex_desc = bti_indirect;
if (surf_type == LSC_ADDR_SURFTYPE_SS ||
surf_type == LSC_ADDR_SURFTYPE_BSS) {
ex_desc_imm = SET_BITS(GET_BITS(base_offs_bits, 16, 4), 31, 19) |
SET_BITS(GET_BITS(base_offs_bits, 3, 0), 15, 12);
} else {
/* TODO: Move the SHL to NIR for CSE? */
assert(surf_type == LSC_ADDR_SURFTYPE_BTI);
assert(base_offs_bits == 0);
ex_desc = jay_SHL_u32(b, bti_indirect, 24);
}
}
enum jay_type data_type = jay_type(JAY_TYPE_U, MAX2(ndata->bit_size, 32));
jay_SEND(b, .sfid = sfid, .msg_desc = desc, .srcs = srcs,
.nr_srcs = jay_is_null(data) ? 1 : 2, .dst = tmp, .type = data_type,
.src_type = { offset_type, data_type }, .uniform = uniform,
.bindless = surf_type == LSC_ADDR_SURFTYPE_BSS, .ex_desc = ex_desc,
.ex_desc_imm = ex_desc_imm);
if (has_dest && !jay_defs_equivalent(tmp, dst)) {
jay_copy_strided(b, dst, tmp, !transpose);
}
}
static void
jay_emit_barycentric(struct nir_to_jay_state *nj,
nir_intrinsic_instr *intr,
enum intel_barycentric_mode mode)
{
assert(nj->s->stage == MESA_SHADER_FRAGMENT);
enum glsl_interp_mode glsl_mode = nir_intrinsic_interp_mode(intr);
if (glsl_mode == INTERP_MODE_NOPERSPECTIVE) {
mode += INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL;
} else {
assert(glsl_mode == INTERP_MODE_SMOOTH);
}
jay_copy(&nj->bld, nj_def(&intr->def), nj->payload.fs.bary[mode]);
}
static void
jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
{
jay_shader *s = nj->s;
jay_function *f = nj->f;
jay_builder *b = &nj->bld;
jay_cs_payload *cs =
mesa_shader_stage_is_compute(s->stage) ? &nj->payload.cs : NULL;
const bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest;
jay_def dst = has_dest ? nj_def(&intr->def) : jay_null();
switch (intr->intrinsic) {
case nir_intrinsic_resource_intel:
/* No code to generate here */
break;
case nir_intrinsic_global_atomic:
case nir_intrinsic_global_atomic_swap:
case nir_intrinsic_image_atomic:
case nir_intrinsic_image_atomic_swap:
case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
case nir_intrinsic_load_global_constant_uniform_block_intel:
case nir_intrinsic_load_scratch_intel:
case nir_intrinsic_load_shared:
case nir_intrinsic_load_shared_uniform_block_intel:
case nir_intrinsic_load_ssbo:
case nir_intrinsic_load_ssbo_intel:
case nir_intrinsic_load_ssbo_uniform_block_intel:
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ubo_uniform_block_intel:
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_swap:
case nir_intrinsic_ssbo_atomic:
case nir_intrinsic_ssbo_atomic_swap:
case nir_intrinsic_store_global:
case nir_intrinsic_store_urb_lsc_intel:
case nir_intrinsic_store_scratch_intel:
case nir_intrinsic_store_shared:
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_ssbo_intel:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_bindless_image_store:
case nir_intrinsic_bindless_image_atomic:
case nir_intrinsic_bindless_image_atomic_swap:
jay_emit_mem_access(nj, intr);
break;
case nir_intrinsic_load_push_data_intel: {
unsigned sz = intr->def.bit_size / 8;
unsigned base_offset = nir_intrinsic_base(intr);
assert(util_is_aligned(base_offset, sz));
if (nir_src_is_const(intr->src[0])) {
unsigned load_offset = nir_src_as_uint(intr->src[0]);
unsigned offs = base_offset + load_offset;
assert(util_is_aligned(load_offset, sz));
if (sz >= 4) {
jay_foreach_comp(dst, c) {
jay_MOV(b, jay_extract(dst, c),
nj->payload.push_data[(offs / 4) + c]);
}
} else {
jay_foreach_comp(dst, c) {
unsigned comp_offs = offs + c * sz;
if (util_is_aligned(comp_offs, 4)) {
jay_MOV(b, jay_extract(dst, c),
nj->payload.push_data[comp_offs / 4]);
} else {
jay_CVT(b, JAY_TYPE_U32, jay_extract(dst, c),
nj->payload.push_data[comp_offs / 4],
JAY_TYPE_U | intr->def.bit_size, JAY_ROUND,
(comp_offs % 4) / sz);
}
}
}
} else {
UNREACHABLE("todo: indirect push data");
}
break;
}
case nir_intrinsic_barrier: {
jay_SCHEDULE_BARRIER(b);
if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) {
jay_emit_memory_barrier(nj, intr);
}
if ((cs && nir_intrinsic_execution_scope(intr) == SCOPE_WORKGROUP) &&
!jay_workgroup_is_one_subgroup(b, nj->nir)) {
jay_emit_signal_barrier(b, nj);
s->prog_data->cs.uses_barrier = true;
}
break;
}
case nir_intrinsic_begin_invocation_interlock:
case nir_intrinsic_end_invocation_interlock:
UNREACHABLE("TODO");
case nir_intrinsic_load_reloc_const_intel:
jay_RELOC(b, dst, nir_intrinsic_param_idx(intr),
nir_intrinsic_base(intr));
break;
case nir_intrinsic_store_render_target_intel:
assert(nj->nir->info.stage == MESA_SHADER_FRAGMENT);
jay_emit_fb_write(b, intr);
break;
case nir_intrinsic_shader_clock:
/* We must access the timestamp register atomically, but 64-bit
* instructions cannot read ARF. Instead use a 2x32-bit vectorized move.
*/
assert(dst.file == UGPR && "required for vectorization");
jay_MOV(b, dst, jay_contiguous_def(J_ARF, JAY_ARF_TIMESTAMP, 2))->type =
JAY_TYPE_U32;
break;
case nir_intrinsic_load_sample_mask_in: {
jay_def mask = jay_extract(nj->payload.u0, 15);
if (nj->s->dispatch_width == 32) {
/* TODO: Optimize */
jay_def hi = jay_extract(nj->payload.u1, 15);
mask = jay_BFI2_u32(b, 0xffff0000, hi, mask);
}
jay_MOV(b, dst, mask);
break;
}
case nir_intrinsic_load_subgroup_invocation:
/* TODO: Lower this in NIR? */
jay_CVT(b, JAY_TYPE_U32, dst, nj->payload.lane_id, JAY_TYPE_U16,
JAY_ROUND, 0);
break;
case nir_intrinsic_demote:
case nir_intrinsic_demote_if:
/* TODO: Already lowered, but need to implement for performance. */
break;
case nir_intrinsic_ddx:
case nir_intrinsic_ddx_coarse:
jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX,
JAY_QUAD_SWIZZLE_YYYY);
break;
case nir_intrinsic_ddx_fine:
jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXZZ,
JAY_QUAD_SWIZZLE_YYWW);
break;
case nir_intrinsic_ddy:
case nir_intrinsic_ddy_coarse:
jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX,
JAY_QUAD_SWIZZLE_ZZZZ);
break;
case nir_intrinsic_ddy_fine:
jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XYXY,
JAY_QUAD_SWIZZLE_ZWZW);
break;
case nir_intrinsic_first_invocation:
jay_MOV(b, dst, emit_active_lane(nj));
break;
case nir_intrinsic_read_first_invocation:
jay_MOV(b, dst, emit_uniformize(nj, nj_src(intr->src[0])));
break;
case nir_intrinsic_ballot:
case nir_intrinsic_ballot_relaxed: {
jay_def val = nj_src(intr->src[0]);
if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
val = emit_active_lane_mask(nj);
} else if (val.file == UFLAG) {
/* Move to a FLAG temporary so we can ballot it. */
val = jay_MOV(b, jay_alloc_def(b, FLAG, 1), val)->dst;
} else {
assert(val.file == FLAG);
}
assert(intr->def.bit_size == b->shader->dispatch_width);
jay_MOV(b, dst, val);
break;
}
/* We prefer to inverse_ballot by copying a UGPR to the flag. If we have a
* GPR input, we could uniformize (as behaviour is undefined for
* non-uniform inputs) but a lowered bit extract is cheaper than uniformize.
*/
case nir_intrinsic_inverse_ballot: {
assert(dst.file == FLAG);
jay_def x = nj_src(intr->src[0]);
if (x.file == GPR) {
jay_def shr = jay_SHR_u32(b, x, nj->payload.lane_id);
jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), shr, 1);
jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_NE);
} else {
jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width;
}
break;
}
case nir_intrinsic_load_local_invocation_id:
assert(cs);
UNREACHABLE("todo: implement me from payload");
jay_copy(b, dst, cs->local_invocation_ids);
break;
case nir_intrinsic_load_barycentric_pixel:
jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL);
break;
case nir_intrinsic_load_barycentric_sample:
jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
break;
case nir_intrinsic_load_barycentric_centroid:
jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID);
break;
case nir_intrinsic_load_pixel_coord_intel:
jay_MOV(b, dst, nj->payload.fs.coord.xy);
break;
case nir_intrinsic_load_frag_coord_z:
jay_MOV(b, dst, nj->payload.fs.coord.z);
break;
case nir_intrinsic_load_frag_coord_w_rcp:
jay_MOV(b, dst, nj->payload.fs.coord.w);
break;
case nir_intrinsic_load_urb_output_handle_intel:
jay_MOV(b, dst, nj->payload.urb_handle);
break;
case nir_intrinsic_load_layer_id:
jay_EXTRACT_LAYER(b, dst, jay_extract(nj->payload.u0, 9),
payload_u1(nj, 9, 1));
break;
case nir_intrinsic_load_front_face: {
/* Bit 11 is facingness for the first polygon. TODO: Multipolygon. */
jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(),
jay_extract(nj->payload.u0, 9), BITFIELD_BIT(11));
/* The bit is actually backfacingness so check for equality with 0 */
jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_EQ);
break;
}
/* Sample ID comes in as 4-bit numbers in g1.0:
*
* 15:12 Slot 3 SampleID
* 11:8 Slot 2 SampleID
* 7:4 Slot 1 SampleID
* 3:0 Slot 0 SampleID
*
* Each slot corresponds to four channels, so we want to replicate each
* half-byte value to 4 channels in a row:
*
* dst+0: .7 .6 .5 .4 .3 .2 .1 .0
* 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
*
* dst+1: .7 .6 .5 .4 .3 .2 .1 .0
* 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
*
* First, we read g1.0 with a <1,8,0>UB region, causing the first 8
* channels to read the first byte (7:0), and the second group of 8
* channels to read the second byte (15:8). Then, we shift right by
* a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
* values into place. Finally, we AND with 0xf to keep the low nibble.
*
* According to the "PS Thread Payload for Normal Dispatch"
* pages on the BSpec, the sample ids are stored in R0.8/R1.8
* on gfx20+ and in R1.0/R2.0 on gfx8+.
*/
case nir_intrinsic_load_sample_id: {
jay_def x = jay_alloc_def(b, GPR, 1);
jay_EXTRACT_BYTE_PER_8LANES(b, x, jay_extract(nj->payload.u0, 8),
payload_u1(nj, 8, 1));
jay_AND_U32_U16(b, dst, jay_SHR_ODD_SUBSPANS_BY_4_u16(b, x), 0xF);
break;
}
case nir_intrinsic_load_input:
if (s->stage == MESA_SHADER_VERTEX) {
unsigned offs = nir_intrinsic_base(intr) * 4;
offs += nir_intrinsic_component(intr);
assert(intr->def.bit_size == 32 && "todo");
jay_copy(b, dst,
jay_collect_vectors(b, nj->payload.vs.attributes + offs,
intr->def.num_components));
break;
}
FALLTHROUGH;
case nir_intrinsic_load_fs_input_interp_deltas: {
assert(s->stage == MESA_SHADER_FRAGMENT);
unsigned location = nir_intrinsic_io_semantics(intr).location +
nir_src_as_uint(intr->src[0]);
unsigned i = (s->prog_data->fs.urb_setup[location] * 4) +
nir_intrinsic_component(intr);
if (intr->intrinsic == nir_intrinsic_load_input) {
assert(intr->def.num_components == 1 && "should be scalarized");
}
/* Zeroth delta is the flat value */
jay_copy(b, dst, nj->payload.fs.deltas[i]);
break;
}
case nir_intrinsic_load_subgroup_id:
assert(cs && f->is_entrypoint && "todo: this needs ABI");
/* Subgroup ID in Thread Group is u0.2 bits 7:0 */
jay_AND(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 0xFF);
break;
case nir_intrinsic_load_num_subgroups:
assert(cs && f->is_entrypoint && "todo: this needs ABI");
/* Number of subgroups in Thread Group is u0.2 bits 31:24 */
jay_SHR(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 24);
break;
case nir_intrinsic_load_workgroup_id:
assert(cs && f->is_entrypoint && "todo: this needs ABI");
jay_MOV(b, jay_extract(dst, 0), jay_extract(nj->payload.u0, 1));
jay_MOV(b, jay_extract(dst, 1), jay_extract(nj->payload.u0, 6));
jay_MOV(b, jay_extract(dst, 2), jay_extract(nj->payload.u0, 7));
break;
case nir_intrinsic_shuffle_intel: {
jay_def data = nj_src(intr->src[0]);
if (nir_src_is_const(intr->src[1])) {
/* Broadcast takes a lane index, with only 32-bit registers */
jay_BROADCAST_IMM(b, dst, data, nir_src_as_uint(intr->src[1]) / 4);
} else {
/* Shuffle takes a byte index */
jay_SHUFFLE(b, dst, data, nj_src(intr->src[1]));
}
break;
}
case nir_intrinsic_quad_broadcast:
jay_QUAD_SWIZZLE(b, dst, nj_src(intr->src[0]),
JAY_QUAD_SWIZZLE_XXXX + nir_src_as_uint(intr->src[1]));
break;
case nir_intrinsic_load_inline_data_intel: {
assert(cs && f->is_entrypoint && "todo: this needs ABI");
assert(nir_src_as_uint(intr->src[0]) == 0 && "TODO: indirects");
unsigned offset = nir_intrinsic_base(intr) / 4;
unsigned nr = jay_num_values(dst);
jay_copy(b, dst, jay_extract_range(nj->payload.inline_data, offset, nr));
break;
}
default:
#ifndef NDEBUG
assert(intr->intrinsic < nir_num_intrinsics);
fprintf(stdout, "intrinsic: %s\n",
nir_intrinsic_infos[intr->intrinsic].name);
#endif
UNREACHABLE("unknown intrinsic");
}
}
static bool
sampler_needs_header(enum brw_sampler_opcode op,
nir_texop nir_op,
const struct intel_device_info *devinfo)
{
switch (op) {
case BRW_SAMPLER_OPCODE_SAMPLEINFO:
return true;
case BRW_SAMPLER_OPCODE_LD:
case BRW_SAMPLER_OPCODE_LD_LZ:
/* Xe3 HW does not seem to work unless we force a header. */
return devinfo->ver >= 30;
default:
return nir_op == nir_texop_tg4;
}
}
static void
jay_emit_texture(struct nir_to_jay_state *nj, nir_tex_instr *tex)
{
/* SKL PRMs: Volume 7: 3D-Media-GPGPU:
*
* "The Pixel Null Mask field, when enabled via the Pixel Null Mask
* Enable will be incorect for sample_c when applied to a surface with
* 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
* Enable may incorrectly report pixels as referencing a Null surface."
*
* We'll take care of this in NIR.
*/
assert(!tex->is_sparse ||
nir_tex_instr_src_index(tex, nir_tex_src_comparator) == -1);
jay_builder *b = &nj->bld;
jay_def dst = nj_def(&tex->def);
jay_def tmp = dst;
const enum brw_sampler_opcode op = (enum brw_sampler_opcode)(
tex->backend_flags & ~BRW_TEX_INSTR_FUSED_EU_DISABLE);
const struct brw_sampler_payload_desc *payload_desc =
brw_get_sampler_payload_desc(op);
/* First deal with surface & sampler */
unsigned payload_type_bit_size = 0;
bool surface_bindless = false;
bool sampler_bindless = false;
jay_def surface, sampler, packed_offsets = jay_null();
jay_def payload[JAY_MAX_SAMPLER_MESSAGE_SIZE];
int i;
if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle)) >= 0) {
unsigned x;
surface =
jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless);
if (jay_is_null(surface))
surface = jay_imm(x);
assert(tex->texture_index == 0);
} else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset)) >=
0) {
unsigned x;
surface =
jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless);
if (jay_is_null(surface))
surface = jay_imm(x + tex->texture_index);
else if (tex->texture_index)
surface = jay_ADD_u32(b, surface, tex->texture_index);
} else {
surface = jay_imm(tex->texture_index);
}
if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle)) >= 0) {
unsigned x;
sampler =
jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless);
if (jay_is_null(sampler))
surface = jay_imm(x);
assert(tex->sampler_index == 0);
} else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset)) >=
0) {
unsigned x;
sampler =
jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless);
if (jay_is_null(sampler))
sampler = jay_imm(x + tex->sampler_index);
else
sampler = jay_ADD_u32(b, sampler, tex->sampler_index);
} else {
sampler = jay_imm(tex->sampler_index);
}
surface = emit_uniformize(nj, surface);
sampler = emit_uniformize(nj, sampler);
/* Now the sampler payload */
bool has_offset_in_payload = false;
bool payload_uniform = true;
uint32_t n_sources = TEX_LOGICAL_SRC_PAYLOAD0;
for (uint32_t i = 0;
payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID;
i++) {
nir_tex_src_type nir_source;
unsigned nir_comp;
#define P(name) BRW_SAMPLER_PAYLOAD_PARAM_##name
#define S(name, component) \
do { \
nir_source = nir_tex_src_##name; \
nir_comp = component; \
} while (0)
struct brw_sampler_payload_src sampler_src = payload_desc->sources[i];
switch (sampler_src.param) {
case P(U):
S(coord, 0);
break;
case P(V):
S(coord, 1);
break;
case P(R):
S(coord, 2);
break;
case P(AI):
S(coord, 3);
break;
case P(BIAS):
S(bias, 0);
break;
case P(LOD):
S(lod, 0);
break;
case P(MLOD):
S(min_lod, 0);
break;
case P(REF):
S(comparator, 0);
break;
case P(DUDX):
S(ddx, 0);
break;
case P(DUDY):
S(ddy, 0);
break;
case P(DVDX):
S(ddx, 1);
break;
case P(DVDY):
S(ddy, 1);
break;
case P(DRDX):
S(ddx, 2);
break;
case P(DRDY):
S(ddy, 2);
break;
case P(SI):
S(ms_index, 0);
break;
case P(MCSL):
S(ms_mcs_intel, 0);
break;
case P(MCSH):
S(ms_mcs_intel, 1);
break;
case P(MCS0):
S(ms_mcs_intel, 0);
break;
case P(MCS1):
S(ms_mcs_intel, 1);
break;
case P(MCS2):
S(ms_mcs_intel, 2);
break;
case P(MCS3):
S(ms_mcs_intel, 3);
break;
case P(OFFU):
S(offset, 0);
has_offset_in_payload = true;
break;
case P(OFFV):
S(offset, 1);
has_offset_in_payload = true;
break;
case P(OFFUV4):
case P(OFFUVR4):
case P(OFFUV6):
case P(OFFUVR6):
case P(BIAS_OFFUV6):
case P(BIAS_OFFUVR4):
case P(LOD_OFFUV6):
case P(LOD_OFFUVR4):
case P(OFFUV4_R):
case P(OFFUV6_R):
case P(OFFUVR4_R):
/* There is no payload with 2 packed entries, so backend1 is always
* the one payload parameter packed. */
S(backend1, 0);
has_offset_in_payload = true;
break;
case P(BIAS_AI):
case P(LOD_AI):
case P(MLOD_R):
/* There is no payload with 2 packed entries, so backend1 is always
* the one payload parameter packed. */
S(backend1, 0);
break;
default:
UNREACHABLE("unhandled sampler param");
}
#undef P
#undef S
jay_def param_val = jay_null();
int j = nir_tex_instr_src_index(tex, nir_source);
if (j >= 0 && nir_comp < tex->src[j].src.ssa->num_components) {
param_val = jay_extract(nj_src(tex->src[j].src), nir_comp);
unsigned bitsize = nir_src_bit_size(tex->src[j].src);
assert(payload_type_bit_size == 0 || payload_type_bit_size == bitsize);
payload_type_bit_size = bitsize;
}
/* The hardware requires a LOD for buffer textures */
if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF &&
sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_LOD) {
sampler_src.optional = false;
}
/* Wa_14012688258:
*
* Don't trim zeros at the end of payload for sample operations
* in cube and cube arrays.
*
* Compiler should send U,V,R parameters even if V,R are 0.
*/
if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
intel_needs_workaround(nj->devinfo, 14012688258) &&
(sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_U ||
sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_V ||
sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_R)) {
sampler_src.optional = false;
}
/* The last source present in the payload dictates the number of
* sources, unless it's required.
*
* We can skip the last source if it's zero.
*/
if (!sampler_src.optional || !jay_is_null(param_val))
n_sources = i + 1;
if (jay_is_null(param_val)) {
param_val = jay_alloc_def(b, dst.file, 1);
jay_MOV(b, param_val, 0);
}
payload[i] = param_val;
payload_uniform &= jay_is_uniform(payload[i]);
}
i = nir_tex_instr_src_index(tex, nir_tex_src_backend2);
if (i >= 0) {
packed_offsets = nj_src(tex->src[i].src);
}
/* Xe2+ should never used packed offsets since it has enough opcodes to
* handle any programmable offset.
*/
assert(jay_is_null(packed_offsets) || nj->devinfo->ver < 20);
/* If the NIR instruction has an offset param but the sampler payload
* doesn't, we can put the offset into the header of the message.
*
* The restriction though is that it should be a constant value.
*/
int offs_idx = nir_tex_instr_src_index(tex, nir_tex_src_offset);
bool has_const_offsets = offs_idx != -1 && !has_offset_in_payload;
bool is_high_sampler = !jay_is_imm(sampler) || jay_as_uint(sampler) >= 16;
bool residency = tex->is_sparse;
unsigned null_mask_component = 0;
const bool needs_header = sampler_needs_header(op, tex->op, nj->devinfo) ||
has_const_offsets ||
!jay_is_null(packed_offsets) ||
sampler_bindless ||
is_high_sampler ||
residency;
uint8_t component_mask;
if (tex->op == nir_texop_tg4) {
component_mask = WRITEMASK_XYZW;
} else if (residency) {
/* intel_nir_lower_sparse guarantees that texturing operations only
* read the data, or the sparse residency code, but not both at once.
*
* We need to use UGPRs for the residency result because the sampler
* returns the null pixel mask in lane 0, regardless of lanemasking.
*
* Unfortunately, the sampler doesn't allow us to writemask out all
* four colour channels, so we have to needlessly return red. This
* isn't uniform data, but we store it in an array of UGPRs anyway
* in order to have a consistent def file. The colour data will be
* immediately dead anyway.
*/
assert(tex->op == nir_texop_sparse_residency_intel ||
tex->op == nir_texop_sparse_residency_txf_intel);
assert(nir_def_components_read(&tex->def) == WRITEMASK_Y);
component_mask = WRITEMASK_X;
unsigned red_grfs = payload_uniform ? 1 : jay_grf_per_gpr(b->shader);
unsigned grfs = red_grfs + 1;
tmp = jay_alloc_def(b, UGPR, grfs * jay_ugpr_per_grf(b->shader));
null_mask_component = red_grfs * jay_ugpr_per_grf(b->shader);
} else {
component_mask = nir_def_components_read(&tex->def);
/* We can reduce the return length of the message to drop unused
* trailing components, but shrinking with a discontiguous mask
* requires a message header. We only do that if we need a header
* for other reasons, as it's more expensive than writing extra data.
*/
if (!needs_header) {
component_mask =
(uint8_t) BITFIELD_MASK(util_last_bit(component_mask));
}
/* TODO: Shrink 16-bit textures too. Shrinking is problematic for some
* component masks due to 32-bit granularity of ISA registers.
*/
if (tex->def.bit_size != 32 || (jay_debug & JAY_DBG_NOOPT))
component_mask = nir_component_mask(tex->def.num_components);
/* If we shrunk the destination, we need a temporary */
if (component_mask != BITFIELD_MASK(tex->def.num_components)) {
tmp = jay_alloc_def(b, GPR, util_bitcount(component_mask));
}
}
/* SENDs always write entire GRFs so we need to pad out for uniform dests */
if (dst.file == UGPR && !residency) {
unsigned nr = jay_ugpr_per_grf(b->shader) * jay_num_values(tmp);
tmp = jay_alloc_def(b, UGPR, nr);
}
if (tex->op == nir_texop_texture_samples) {
assert(needs_header);
payload_type_bit_size = 32;
n_sources = 0;
}
jay_def header = jay_null();
if (needs_header) {
uint32_t header2;
if (tex->op == nir_texop_tg4) {
/* Gathers have a component but no write mask */
header2 = (tex->component << 16);
} else {
/* If present, the header write mask are inverted compared to NIR */
header2 = (~component_mask & 0xf) << 12;
}
if (residency)
header2 |= 1 << 23; /* g0.2 bit 23: Pixel Null Mask Enable */
if (has_const_offsets) {
const unsigned num_components = nir_tex_instr_src_size(tex, offs_idx);
for (unsigned i = 0; i < num_components; i++) {
nir_scalar s = nir_get_scalar(tex->src[offs_idx].src.ssa, i);
s = nir_scalar_chase_movs(s);
assert(nir_scalar_is_const(s));
int offset = nir_scalar_as_int(s);
/* Offsets are 4-bits, reversed order */
header2 |= (offset & 0xf) << ((2 - i) * 4);
}
}
/* Vectorized zeroing of the header. TODO: This can be optimized more. */
jay_def zeroes = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader));
jay_MOV(b, zeroes, 0);
jay_def ugprs[JAY_MAX_DEF_LENGTH];
jay_foreach_comp(zeroes, i) {
ugprs[i] = jay_extract(zeroes, i);
}
/* Set the main immediate part of the header */
if (header2 != 0) {
ugprs[2] = jay_MOV_u32(b, header2);
}
if (sampler_bindless) {
/* Bindless sampler handles aren't relative to the sampler state
* pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
* Instead, it's an absolute pointer relative to dynamic state base
* address.
*
* Sampler states are 16 bytes each and the pointer we give here has
* to be 32-byte aligned. In order to avoid more indirect messages
* than required, we assume that all bindless sampler states are
* 32-byte aligned. This sacrifices a bit of general state base
* address space but means we can do something more efficient in the
* shader.
*/
ugprs[3] = sampler;
} else {
/* Select the default dynamic state base address + offset */
jay_def sampler_ptr = nj->payload.sampler_state_pointer;
/* Gfx11+ sampler message headers include bits in 4:0 which conflict
* with the ones included in g0.3 bits 4:0. Mask them out.
*/
if (b->shader->devinfo->ver >= 11) {
sampler_ptr = jay_AND_u32(b, sampler_ptr, INTEL_MASK(31, 5));
}
/* TODO: We should probably lower this in NIR. */
if (is_high_sampler) {
if (jay_is_imm(sampler)) {
unsigned s = jay_as_uint(sampler);
const int sampler_state_size_B = 16;
unsigned offs_B = ROUND_DOWN_TO(s, 16) * sampler_state_size_B;
assert(offs_B > 0 && "since s > 0");
sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B);
} else {
jay_def offs_B =
jay_SHL_u32(b, jay_AND_u32(b, sampler, 0xf0), 4);
sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B);
}
}
ugprs[3] = sampler_ptr;
}
/* Zip it all up into a vector of UGPRs which will RA to a single GRF */
header = jay_collect_vectors(b, ugprs, jay_num_values(zeroes));
}
assert(payload_type_bit_size == 16 || payload_type_bit_size == 32);
unsigned simd_mode = 0;
unsigned simd_width = payload_uniform ? 1 : nj->s->dispatch_width;
if (nj->devinfo->ver < 20) {
if (payload_type_bit_size == 16) {
assert(nj->devinfo->ver >= 11);
simd_mode = simd_width <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
GFX10_SAMPLER_SIMD_MODE_SIMD16H;
} else {
simd_mode = simd_width <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
BRW_SAMPLER_SIMD_MODE_SIMD16;
}
} else {
if (payload_type_bit_size == 16) {
simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
XE2_SAMPLER_SIMD_MODE_SIMD32H;
} else {
simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
XE2_SAMPLER_SIMD_MODE_SIMD32;
}
}
uint64_t desc = 0;
jay_def desc_src = jay_null(), desc_ex_src = jay_null();
unsigned sampler_imm = 0;
if (jay_is_imm(sampler) && !sampler_bindless) {
sampler_imm = jay_as_uint(sampler) % 16;
}
const unsigned msg_type = brw_get_sampler_hw_opcode(op);
bool is_16 = false; /* TODO */
unsigned ret_type = is_16 ? GFX8_SAMPLER_RETURN_FORMAT_16BITS :
GFX8_SAMPLER_RETURN_FORMAT_32BITS;
if (!surface_bindless &&
jay_is_imm(surface) &&
(jay_is_imm(sampler) || sampler_bindless)) {
desc = brw_sampler_desc(nj->devinfo, jay_as_uint(surface), sampler_imm,
msg_type, simd_mode, ret_type);
} else if (surface_bindless) {
/* Bindless surface */
desc = brw_sampler_desc(nj->devinfo, GFX9_BTI_BINDLESS, sampler_imm,
msg_type, simd_mode, ret_type);
/* For bindless samplers, the entire address is included in the message
* header so we can leave the portion in the message descriptor 0.
*/
if (!sampler_bindless && !jay_is_imm(sampler)) {
desc_src = jay_SHL_u32(b, sampler, 8);
}
/* We assume that the driver provided the handle in the top 20 bits so
* we can use the surface handle directly as the extended descriptor.
*/
desc_ex_src = jay_alloc_def(b, J_ADDRESS, 1);
jay_MOV(b, desc_ex_src, surface);
} else {
/* Immediate portion of the descriptor */
desc = brw_sampler_desc(nj->devinfo, 0, 0, msg_type, simd_mode, ret_type);
if (sampler_bindless) {
desc_src = surface;
} else if (!sampler_bindless && jay_is_imm(sampler)) {
desc_src = jay_OR_u32(b, surface, jay_as_uint(sampler) << 8);
} else {
desc_src = jay_OR_u32(b, jay_SHL_u32(b, sampler, 8), surface);
}
desc_src = jay_AND_u32(b, desc_src, 0xfff);
}
if (n_sources > 2 || !jay_is_null(header)) {
for (unsigned i = 0; i < n_sources; ++i) {
payload[i] =
jay_src_as_strided(b, payload[i], 1, payload_uniform ? UGPR : GPR);
}
}
enum jay_type src_type = jay_type(JAY_TYPE_U, payload_type_bit_size);
jay_SEND(b, .sfid = BRW_SFID_SAMPLER, .msg_desc = desc, .desc = desc_src,
.ex_desc = desc_ex_src, .header = header, .srcs = payload,
.nr_srcs = n_sources, .type = JAY_TYPE_U32,
.src_type = { src_type }, .dst = tmp, .uniform = payload_uniform,
.bindless = surface_bindless);
/* If we sampled into a temporary, copy out to the final */
if (residency) {
jay_MOV(b, jay_extract(dst, 1), jay_extract(tmp, null_mask_component));
} else if (!jay_defs_equivalent(dst, tmp)) {
unsigned i = 0;
unsigned tmp_stride = dst.file == UGPR ? jay_ugpr_per_grf(b->shader) : 1;
u_foreach_bit(c, component_mask) {
jay_MOV(b, jay_extract(dst, c), jay_extract(tmp, (i++) * tmp_stride));
}
}
if (mesa_shader_stage_is_compute(b->shader->stage)) {
b->shader->prog_data->cs.uses_sampler |= !nir_tex_instr_is_query(tex);
}
}
static void
jay_emit_jump(struct nir_to_jay_state *nj, nir_jump_instr *instr)
{
switch (instr->type) {
case nir_jump_break:
jay_block_add_successor(nj->current_block, nj->break_block, GPR);
jay_BREAK(&nj->bld);
break;
case nir_jump_halt:
// TODO: Do we want a predicated EOT here, or a jump to the end?
assert(!"TODO: implement HALT");
break;
case nir_jump_return:
/* Should be lowered */
default:
UNREACHABLE("unknown jump");
}
}
static void
jay_emit_instr(struct nir_to_jay_state *nj, jay_block *block, nir_instr *instr)
{
switch (instr->type) {
case nir_instr_type_alu:
jay_emit_alu(nj, nir_instr_as_alu(instr));
break;
case nir_instr_type_intrinsic:
jay_emit_intrinsic(nj, nir_instr_as_intrinsic(instr));
break;
case nir_instr_type_tex:
jay_emit_texture(nj, nir_instr_as_tex(instr));
break;
case nir_instr_type_load_const:
jay_emit_load_const(nj, nir_instr_as_load_const(instr));
break;
case nir_instr_type_phi:
case nir_instr_type_undef: {
jay_def def = nj_def(nir_instr_def(instr));
jay_foreach_comp(def, c) {
if (instr->type == nir_instr_type_phi) {
jay_PHI_DST(&nj->bld, jay_extract(def, c));
} else {
jay_UNDEF(&nj->bld, jay_extract(def, c));
}
}
break;
}
case nir_instr_type_jump:
jay_emit_jump(nj, nir_instr_as_jump(instr));
break;
case nir_instr_type_deref:
UNREACHABLE("All derefs should've been lowered");
default:
UNREACHABLE("unknown instruction type");
}
}
static jay_block *
jay_create_block(struct nir_to_jay_state *nj)
{
jay_block *block = jay_new_block(nj->f);
block->indent = nj->indent;
return block;
}
static jay_inst *
jay_block_ending_unconditional_jump(jay_block *block)
{
jay_inst *jump = jay_block_ending_jump(block);
return jump && !jump->predication ? jump : NULL;
}
static void
jay_emit_if(struct nir_to_jay_state *nj, nir_if *nif)
{
jay_builder *b = &nj->bld;
jay_def condition = nj_src(nif->condition);
jay_block *before_block = nj->current_block;
jay_block *after_block = jay_create_block(nj);
/* Push */
++nj->indent;
jay_block *else_first = jay_create_block(nj);
jay_block *then_first = jay_emit_cf_list(nj, &nif->then_list);
jay_block *then_last = nj->current_block;
nj->after_block = else_first;
jay_block *else_first_2 = jay_emit_cf_list(nj, &nif->else_list);
jay_block *else_last = nj->current_block;
assert(else_first == else_first_2);
/* Pop */
--nj->indent;
bool uniform = jay_is_uniform(condition);
/* Logical CFG edges */
jay_block_add_successor(before_block, then_first, GPR);
jay_block_add_successor(before_block, else_first, GPR);
if (!jay_block_ending_unconditional_jump(then_last))
jay_block_add_successor(then_last, after_block, GPR);
if (!jay_block_ending_unconditional_jump(else_last))
jay_block_add_successor(else_last, after_block, GPR);
/* For a non-uniform IF, we fall through both sides in the physical CFG */
if (!uniform) {
jay_block_add_successor(then_last, else_first, UGPR);
}
nj->after_block = after_block;
/* Emit the if-else-endif sequence */
b->cursor = jay_after_block(before_block);
jay_add_predicate(b, jay_IF(b), condition);
b->cursor = jay_before_block(else_first);
jay_ELSE(b);
b->cursor = jay_after_block(else_last);
jay_ENDIF(b);
}
static void
jay_emit_loop(struct nir_to_jay_state *nj, nir_loop *nloop)
{
assert(!nir_loop_has_continue_construct(nloop));
jay_builder *b = &nj->bld;
jay_block *saved_break = nj->break_block;
/* Make the block that will be after the loop exit */
nj->break_block = jay_create_block(nj);
++nj->indent;
/* Make a block for the loop body, which is also the loop header */
jay_block *loop_header = jay_create_block(nj);
loop_header->loop_header = true;
/* The current block falls through to the start of the loop */
jay_block_add_successor(nj->current_block, loop_header, GPR);
/* Emit the loop body */
nj->after_block = loop_header;
jay_emit_cf_list(nj, &nloop->body);
/* Emit the backedge */
jay_inst *jump = jay_block_ending_jump(nj->current_block);
if (jump && jump->op == JAY_OPCODE_BREAK) {
jump->op = JAY_OPCODE_LOOP_ONCE;
} else {
jay_block_add_successor(nj->current_block, loop_header, GPR);
jay_WHILE(b);
}
/* Pop */
--nj->indent;
nj->after_block = nj->break_block;
nj->break_block = saved_break;
b->cursor = jay_after_block(nj->after_block);
}
static jay_block *
jay_emit_block(struct nir_to_jay_state *nj, nir_block *nb)
{
jay_builder *b = &nj->bld;
if (nj->after_block) {
nj->current_block = nj->after_block;
nj->after_block = NULL;
} else {
nj->current_block = jay_create_block(nj);
}
jay_block *block = nj->current_block;
block->uniform = !nb->divergent;
list_addtail(&block->link, &nj->f->blocks);
b->cursor = jay_after_block(block);
/* Emit the contents of the block */
nir_foreach_instr(instr, nb) {
jay_emit_instr(nj, block, instr);
}
/* Look in the current NIR block's successors for any phis. Each of them
* should have a source corresponding to a value coming from our current
* block. Create PHI_SRC opcodes in the current block for those values.
* The corresponding PHI_DST may not have been emitted yet, but that's ok.
*/
for (unsigned bs = 0; bs < ARRAY_SIZE(nb->successors); ++bs) {
nir_block *nb_successor = nb->successors[bs];
if (!nb_successor)
continue;
nir_foreach_phi(nphi, nb_successor) {
jay_def val = nj_src(nir_phi_get_src_from_block(nphi, nb)->src);
/* The phi def might be nonuniform but have uniform source (like a
* constant). Move to the correct file in the the source block and
* reference that in PHI_SRC.
*/
if (jay_file_for_def(&nphi->def) != val.file) {
b->cursor = jay_after_block_logical(block);
jay_def tmp = val;
val = jay_alloc_def(b, jay_file_for_def(&nphi->def),
jay_num_values(val));
jay_copy(b, val, tmp);
}
jay_foreach_comp(val, c) {
b->cursor = jay_before_jump(block);
jay_PHI_SRC(b, JAY_TYPE_U32, jay_extract(val, c),
nphi->def.index + c);
}
}
}
b->cursor = jay_after_block(block);
nj->active_lane_mask = jay_null();
nj->active_lane = jay_null();
nj->active_lane_x4 = jay_null();
return block;
}
static jay_block *
jay_emit_cf_list(struct nir_to_jay_state *nj, struct exec_list *list)
{
jay_block *start_block = NULL;
foreach_list_typed(nir_cf_node, node, node, list) {
switch (node->type) {
case nir_cf_node_block: {
jay_block *block = jay_emit_block(nj, nir_cf_node_as_block(node));
if (!start_block)
start_block = block;
break;
}
case nir_cf_node_if:
jay_emit_if(nj, nir_cf_node_as_if(node));
break;
case nir_cf_node_loop:
jay_emit_loop(nj, nir_cf_node_as_loop(node));
break;
default:
UNREACHABLE("Unknown NIR control flow node");
}
}
return start_block;
}
static void
jay_emit_eot(struct nir_to_jay_state *nj)
{
jay_builder *b = &nj->bld;
if (mesa_shader_stage_is_compute(nj->nir->info.stage)) {
/* Vectorized copy into the EOT register. Not necessary for correctness
* but keeps RA from inserting 16 scalar copies instead.
*/
jay_def copy = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader));
jay_MOV(b, copy, nj->payload.u0);
jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY, .eot = true, .msg_desc = 0,
.srcs = &copy, .nr_srcs = 1, .type = JAY_TYPE_U32,
.uniform = true);
} else if (nj->nir->info.stage == MESA_SHADER_VERTEX) {
jay_block *block = jay_last_block(nj->f);
jay_inst *I = jay_last_inst(block);
/* TODO: What if this isn't the case? Do we need a no-op store...? */
assert(I && I->op == JAY_OPCODE_SEND && jay_send_sfid(I) == BRW_SFID_URB);
jay_set_send_eot(I, true);
}
}
struct payload_builder {
jay_builder *b;
unsigned offsets[JAY_NUM_SSA_FILES];
jay_def vecs[JAY_NUM_SSA_FILES];
};
static jay_def
read_payload(struct payload_builder *b, enum jay_file file)
{
unsigned granularity = file == UGPR ? 16 : 1;
unsigned channel = b->offsets[file] % granularity;
if (channel == 0) {
b->vecs[file] = jay_alloc_def(b->b, file, granularity);
jay_PRELOAD(b->b, b->vecs[file], b->offsets[file]);
}
b->offsets[file]++;
return jay_extract(b->vecs[file], channel);
}
static jay_def
read_vector_payload(struct payload_builder *b, enum jay_file file, unsigned len)
{
jay_def defs[JAY_MAX_DEF_LENGTH];
assert(len < ARRAY_SIZE(defs));
for (unsigned i = 0; i < len; ++i) {
defs[i] = read_payload(b, file);
}
return jay_collect_vectors(b->b, defs, len);
}
static void
setup_payload_push(struct nir_to_jay_state *nj, struct payload_builder *p)
{
unsigned push_size_B = 0;
for (int i = 0; i < ARRAY_SIZE(nj->s->prog_data->base.push_sizes); i++) {
push_size_B += nj->s->prog_data->base.push_sizes[i];
}
assert(util_is_aligned(push_size_B, 32));
for (unsigned i = 0; i < (push_size_B / 4); ++i) {
nj->payload.push_data[i] = read_payload(p, UGPR);
}
nj->s->push_grfs = push_size_B / (4 * jay_ugpr_per_grf(nj->s));
}
static void
setup_vertex_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
{
nj->payload.urb_handle = read_payload(p, GPR);
/* XXX: This is a hack to line up with the partition chosen in RA. This whole
* thing needs an overhaul. Need to think harder about partitioning.
*/
p->offsets[GPR] += 7;
for (unsigned i = 0; i < (8 * nj->s->prog_data->vue.urb_read_length); ++i) {
assert(i < ARRAY_SIZE(nj->payload.vs.attributes));
nj->payload.vs.attributes[i] = read_payload(p, GPR);
}
setup_payload_push(nj, p);
}
static void
setup_compute_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
{
assert(!nj->s->prog_data->cs.generate_local_id);
assert(!nj->s->prog_data->cs.uses_btd_stack_ids);
nj->payload.inline_data =
read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s));
}
static void
setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
{
jay_fs_payload *fs = &nj->payload.fs;
if (nj->s->dispatch_width == 32) {
nj->payload.u1 = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s));
}
setup_payload_push(nj, p);
u_foreach_bit(i, nj->s->prog_data->fs.barycentric_interp_modes) {
fs->bary[i] = read_vector_payload(p, GPR, 2);
}
if (nj->s->prog_data->fs.uses_src_depth) {
fs->coord.z = read_payload(p, GPR);
}
if (nj->s->prog_data->fs.uses_src_w) {
fs->coord.w = read_payload(p, GPR);
}
unsigned nr_attribs = 16 * 4; /* TODO */
for (unsigned i = 0; i < nr_attribs; ++i) {
jay_def comps[] = { read_payload(p, UGPR), read_payload(p, UGPR),
read_payload(p, UGPR) };
/* The .yz components are swizzled in the hardware compared to NIR. */
SWAP(comps[1], comps[2]);
fs->deltas[i] = jay_collect_vectors(&nj->bld, comps, ARRAY_SIZE(comps));
/* Padding */
if ((i % 5) == 4) {
read_payload(p, UGPR);
}
}
if (BITSET_TEST(nj->nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
jay_def t = jay_alloc_def(&nj->bld, GPR, 1);
jay_def lo = jay_extract_range(nj->payload.u0, 10, 4);
jay_EXPAND_QUAD(&nj->bld, t, lo, payload_u1(nj, 10, 4));
fs->coord.xy = jay_OFFSET_PACKED_PIXEL_COORDS_u32(&nj->bld, t);
}
/* Due to complexities of the physical payload, the logical payload is split
* into even/odd halves. Fix up the offsets and insert copies.
*/
if (nj->s->dispatch_width == 32) {
jay_builder *b = &nj->bld;
jay_foreach_inst_in_block(nj->after_block, I) {
if (I->op == JAY_OPCODE_PRELOAD && I->dst.file == GPR) {
unsigned base = (jay_preload_reg(I) % 2) ? p->offsets[GPR] : 0;
jay_set_preload_reg(I, base + (jay_preload_reg(I) / 2));
}
}
b->cursor = jay_before_block(nj->after_block);
jay_DESWIZZLE(b, p->offsets[GPR]);
}
}
static void
jay_setup_payload(struct nir_to_jay_state *nj)
{
jay_shader *s = nj->s;
jay_builder *b = &nj->bld;
nj->after_block = jay_create_block(nj);
b->cursor = jay_after_block(nj->after_block);
struct payload_builder p = { .b = &nj->bld };
nj->payload.u0 = read_vector_payload(&p, UGPR, jay_ugpr_per_grf(s));
nj->payload.sampler_state_pointer = jay_extract(nj->payload.u0, 3);
switch (s->stage) {
case MESA_SHADER_VERTEX:
setup_vertex_payload(nj, &p);
break;
case MESA_SHADER_FRAGMENT:
setup_fragment_payload(nj, &p);
break;
case MESA_SHADER_COMPUTE:
case MESA_SHADER_KERNEL:
setup_compute_payload(nj, &p);
break;
default:
UNREACHABLE("unimplemented shader stages");
}
/* Lane ID calculations require &W and therefore are calculated in
* uniform control flow to sidestep RA problems. The easy solution is
* calculating the lane ID in the first block.
*
* XXX: This doesn't work for multi-function. Reconsider.
*/
nj->payload.lane_id = jay_LANE_ID_8_u16(b);
for (unsigned i = 8; i < s->dispatch_width; i *= 2) {
nj->payload.lane_id = jay_LANE_ID_EXPAND_u16(b, nj->payload.lane_id, i);
}
}
/*
* NIR sometimes contains logically unreachable blocks (e.g. due to infinite
* loops). These blocks have no predecessors, but do have successors and can
* contribute to phis. They are dead and violate the IR invariant:
*
* Live-in sources are live-out in all predecessors.
*
* ...which RA (validation) depends on. The simplest solution is to simply
* delete these dead blocks. Fortunately, because they are unreachable, this
* does not have any ill effects. Notably, this cannot introduce critical edges.
*
* Deleting a block may cause a successor to become unreachable, so we use a
* fixed-point algorithm to converge.
*/
static void
jay_remove_unreachable_blocks(jay_function *func)
{
bool progress;
do {
progress = false;
jay_foreach_block(func, pred) {
if (pred != jay_first_block(func) &&
jay_num_predecessors(pred, GPR) == 0 &&
jay_num_successors(pred, GPR) > 0) {
jay_foreach_successor(pred, succ, GPR) {
util_dynarray_delete_unordered(&succ->logical_preds, jay_block *,
pred);
}
jay_foreach_successor(pred, succ, UGPR) {
util_dynarray_delete_unordered(&succ->physical_preds,
jay_block *, pred);
}
pred->logical_succs[0] = NULL;
pred->logical_succs[1] = NULL;
pred->physical_succs[0] = NULL;
pred->physical_succs[1] = NULL;
progress = true;
}
}
} while (progress);
}
static void
jay_from_nir_function(const struct intel_device_info *devinfo,
nir_shader *nir,
jay_shader *s,
nir_function_impl *impl)
{
jay_function *f = jay_new_function(s);
f->is_entrypoint = impl->function->is_entrypoint;
struct nir_to_jay_state nj = {
.s = s,
.f = f,
.nir = nir,
.devinfo = devinfo,
.bld = (jay_builder) { .shader = s, .func = f },
};
/* Jay indices match NIR indices. Therefore the first impl->ssa_alloc
* indices are reserved. Our own temporaries go after.
*/
f->ssa_alloc = impl->ssa_alloc;
if (f->is_entrypoint) {
jay_setup_payload(&nj);
}
jay_emit_cf_list(&nj, &impl->body);
jay_emit_eot(&nj);
jay_remove_unreachable_blocks(f);
}
static void
jay_gather_stats(const jay_shader *s, struct genisa_stats *stats)
{
jay_foreach_inst_in_shader(s, f, I) {
if (I->op != JAY_OPCODE_SYNC) {
stats->instrs += jay_macro_length(I) << jay_simd_split(s, I);
}
stats->loops += I->op == JAY_OPCODE_WHILE;
stats->sends += I->op == JAY_OPCODE_SEND;
/* XXX: Write a real cycle model */
stats->cycles++;
}
stats->spills = s->spills;
stats->fills = s->fills;
stats->sends -= (s->spills + s->fills);
}
struct jay_shader_bin *
jay_compile(const struct intel_device_info *devinfo,
void *mem_ctx,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key)
{
jay_debug = debug_get_option_jay_debug();
bool debug =
INTEL_DEBUG(intel_debug_flag_for_shader_stage(nir->info.stage)) &&
!(nir->info.internal || NIR_DEBUG(PRINT_INTERNAL));
unsigned simd_width = jay_process_nir(devinfo, nir, prog_data, key);
if (debug) {
/* We can't use nir_print_shader since it reindexes SSA defs. */
fprintf(stdout, "NIR right before from_nir:\n\n");
nir_print_shader_annotated(nir, stdout, NULL);
fflush(stdout);
}
jay_shader *s = jay_new_shader(NULL, nir->info.stage);
s->dispatch_width = simd_width;
s->scratch_size = align(nir->scratch_size, 4) * s->dispatch_width;
s->devinfo = devinfo;
s->prog_data = prog_data;
nir_foreach_function_impl(impl, nir) {
jay_from_nir_function(devinfo, nir, s, impl);
}
/* Re-number block indices to be sequential and match the NIR. This ensures
* block indices are ordered with respect to the control flow graph which is
* a convenient IR invariant.
*/
jay_foreach_function(s, f) {
unsigned index = 0;
jay_foreach_block(f, b) {
b->index = index++;
}
}
jay_validate(s, "NIR->Jay translation");
/* After each propagation pass, eliminate dead code. This ensures use counts
* are correct in jay_opt_propagate_backwards which allows more progress. We
* don't do a progress loop - just run DCE an extra time. DCE is cheap.
*/
if (!(jay_debug & JAY_DBG_NOOPT)) {
JAY_PASS(s, jay_opt_propagate_forwards);
JAY_PASS(s, jay_opt_dead_code);
JAY_PASS(s, jay_opt_propagate_backwards);
JAY_PASS(s, jay_opt_dead_code);
}
if (debug) {
fprintf(stdout, "Jay shader:\n\n");
jay_print(stdout, s);
}
JAY_PASS(s, jay_assign_flags);
if (!(jay_debug & JAY_DBG_NOOPT)) {
JAY_PASS(s, jay_opt_dead_code);
}
JAY_PASS(s, jay_lower_pre_ra);
JAY_PASS(s, jay_partition_grf);
JAY_PASS(s, jay_register_allocate);
JAY_PASS(s, jay_lower_post_ra);
JAY_PASS(s, jay_insert_fp_mode, nir->info.float_controls_execution_mode,
nir->info.bit_sizes_float);
if (!(jay_debug & JAY_DBG_NOOPT)) {
/* jay_assign_accumulators uses a conservative liveness analysis for
* predication, so assign accumulators before predicating for better
* results.
*/
if (!(jay_debug & JAY_DBG_NOACC)) {
JAY_PASS(s, jay_assign_accumulators);
}
JAY_PASS(s, jay_opt_predicate);
}
if (jay_debug & JAY_DBG_SYNC) {
JAY_PASS(s, jay_lower_scoreboard_trivial);
} else {
JAY_PASS(s, jay_lower_scoreboard);
}
if (debug) {
fprintf(stdout, "Jay shader (post-RA):\n\n");
jay_print(stdout, s);
}
struct jay_shader_bin *bin =
jay_to_binary(s, nir->constant_data, nir->constant_data_size, debug);
assert(bin->kernel);
ralloc_steal(mem_ctx, bin);
jay_gather_stats(s, &bin->stats);
bin->stats.code_size = bin->size;
if (debug) {
if (nir->info.label) {
printf("%s - ", nir->info.label);
}
const char *shader_name =
ralloc_asprintf(s, "%s SIMD%u", _mesa_shader_stage_to_abbrev(s->stage),
s->dispatch_width);
genisa_stats_fprintf(stdout, shader_name, &bin->stats);
}
bin->stats.workgroup_memory_size = nir->info.shared_size;
bin->stats.dispatch_width = simd_width;
if (s->stage == MESA_SHADER_FRAGMENT) {
if (simd_width == 8) {
prog_data->fs.dispatch_8 = true;
} else if (simd_width == 16) {
prog_data->fs.dispatch_16 = true;
prog_data->fs.prog_offset_16 = 0;
} else if (simd_width == 32) {
prog_data->fs.dispatch_32 = true;
prog_data->fs.prog_offset_32 = 0;
}
} else if (mesa_shader_stage_is_compute(s->stage)) {
unsigned i = simd_width == 8 ? 0 : simd_width == 16 ? 1 : 2;
prog_data->cs.prog_offset[i] = 0;
prog_data->cs.prog_mask = BITFIELD_BIT(i);
prog_data->cs.prog_spilled = s->scratch_size > 0; /* XXX */
}
prog_data->base.program_size = bin->size;
if (s->scratch_size > 0) {
/* We currently only support up to 2MB of scratch space. If we
* need to support more eventually, the documentation suggests
* that we could allocate a larger buffer, and partition it out
* ourselves. We'd just have to undo the hardware's address
* calculation by subtracting (FFTID * Per Thread Scratch Space)
* and then add FFTID * (Larger Per Thread Scratch Space).
*
* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
* Thread Group Tracking > Local Memory/Scratch Space.
*/
assert(s->scratch_size <= devinfo->max_scratch_size_per_thread &&
"maximum scratch size");
/* Take the max of any previously compiled variant of the shader. In the
* case of bindless shaders with return parts, this will also take the
* max of all parts.
*/
prog_data->base.total_scratch =
MAX2(prog_data->base.total_scratch,
util_next_power_of_two(s->scratch_size));
}
/* Scratch is allocated in 1KiB increments. */
prog_data->base.total_scratch = align(prog_data->base.total_scratch, 1024);
ralloc_free(s);
return bin;
}