mesa/src/intel/compiler/jay/jay_to_binary.c

604 lines
19 KiB
C

/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include <stdint.h>
#include "compiler/brw/brw_disasm_info.h"
#include "compiler/brw/brw_eu.h"
#include "compiler/brw/brw_eu_defines.h"
#include "compiler/brw/brw_eu_inst.h"
#include "compiler/brw/brw_reg.h"
#include "compiler/brw/brw_reg_type.h"
#include "dev/intel_debug.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "jay.h"
#include "jay_ir.h"
#include "jay_opcodes.h"
#include "jay_private.h"
static inline enum brw_reg_type
to_brw_reg_type(enum jay_type type)
{
/* clang-format off */
switch (type) {
case JAY_TYPE_UNTYPED:
case JAY_TYPE_U8: return BRW_TYPE_UB;
case JAY_TYPE_U16: return BRW_TYPE_UW;
case JAY_TYPE_U32: return BRW_TYPE_UD;
case JAY_TYPE_U64: return BRW_TYPE_UQ;
case JAY_TYPE_S8: return BRW_TYPE_B;
case JAY_TYPE_S16: return BRW_TYPE_W;
case JAY_TYPE_S32: return BRW_TYPE_D;
case JAY_TYPE_S64: return BRW_TYPE_Q;
case JAY_TYPE_F16: return BRW_TYPE_HF;
case JAY_TYPE_F32: return BRW_TYPE_F;
case JAY_TYPE_F64: return BRW_TYPE_DF;
case JAY_TYPE_BF16: return BRW_TYPE_BF;
default: UNREACHABLE("invalid type");
}
/* clang-format on */
}
static inline unsigned
to_def_grf_16(struct jay_partition *p, jay_def d)
{
unsigned count = jay_num_values(d);
if (count == 0 || !(d.file == GPR || d.file == UGPR)) {
return d.reg;
}
unsigned base = 0;
for (unsigned i = 0; i < JAY_PARTITION_BLOCKS; ++i) {
unsigned offset = d.reg - base;
if (offset < p->blocks[d.file][i].len) {
assert(offset + count <= p->blocks[d.file][i].len &&
"vectors must not cross partition boundaries");
return (p->blocks[d.file][i].start + offset) * 2 + d.hi;
}
base += p->blocks[d.file][i].len;
}
UNREACHABLE("virtual register must be in a block");
}
static inline brw_reg
to_brw_reg(jay_function *f,
const jay_inst *I,
signed idx,
unsigned simd_offs,
bool force_hi)
{
bool is_dest = idx < 0;
enum jay_type type = is_dest ? I->type : jay_src_type(I, idx);
jay_def d = is_dest ? I->dst : I->src[idx];
d.hi |= force_hi;
struct brw_reg R;
unsigned reg = to_def_grf_16(&f->shader->partition, d), offset_B = 0;
if (jay_is_imm(d)) {
/* Immediates have size restrictions but can zero extend */
if (jay_type_size_bits(type) == 64) {
type = jay_type_resize(type, 32);
} else if (I->op == JAY_OPCODE_BFN) {
assert(jay_as_uint(d) < UINT16_MAX);
type = JAY_TYPE_U16;
}
R = brw_imm_ud(jay_as_uint(d));
} else if (jay_is_null(d)) {
R = brw_null_reg();
} else if (d.file == UGPR || d.file == UACCUM) {
unsigned grf = (reg >> 1) / 8;
offset_B = ((reg >> 1) % 8) * 4;
if (d.file == UGPR) {
R = brw_ud1_grf(grf, 0);
} else {
R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (grf * 2), 0);
}
/* Handle 3-src restrictions and vectorized uniform code. */
if (is_dest || jay_num_values(d) >= 8) {
R = vec8(R);
}
/* Some operations have special restrictions on the destination stride,
* but if we write a single UGPR the stride is ignored.. Specify
* whatever stride is needed to satisfy the rules.
*/
if (is_dest) {
/* BSpec 56640 "Special Restrictions" says:
*
* "Conversion between HF and Integer must be DWord-aligned
* and strided by a DWord on the destination."
*/
enum jay_type src0_type = jay_src_type(I, 0);
if ((I->type == JAY_TYPE_F16 && !jay_type_is_any_float(src0_type)) ||
(src0_type == JAY_TYPE_F16 && !jay_type_is_any_float(I->type))) {
assert(jay_num_values(d) == 1 && "must not vectorize HF<->Int");
R = stride(R, 8, 2, 4);
}
/* Packed floats have restrictions on mixed sizes. Use <2>. */
if (jay_type_size_bits(I->type) == 16 &&
jay_type_size_bits(jay_src_type(I, 0)) != 16) {
assert(jay_num_values(d) == 1 && "must not vectorize mixed float");
R = stride(R, 4, 2, 2);
}
}
} else if (d.file == GPR || d.file == ACCUM) {
enum jay_stride def_stride =
d.file == GPR ? jay_def_stride(f->shader, d) : JAY_STRIDE_4;
uint32_t type_bits = jay_type_size_bits(type);
unsigned stride_bits = jay_stride_to_bits(def_stride);
unsigned simd_width = jay_simd_width_physical(f->shader, I);
unsigned grf;
if (def_stride == JAY_STRIDE_2) {
/* Bit 0 selects between lo/hi halves of the GPR */
grf = (reg / 2) * jay_grf_per_gpr(f->shader);
offset_B = (reg & 1) * 2 * f->shader->dispatch_width;
} else {
/* Low bits are an offset in 2-byte words into the GRF */
unsigned mask = BITFIELD_MASK(stride_bits / 32);
grf = ((reg & ~mask) / 2) * jay_grf_per_gpr(f->shader);
offset_B = (reg & mask) * 2;
}
if (d.file == GPR) {
R = byte_offset(xe2_vec8_grf(grf, 0),
simd_offs * simd_width * stride_bits / 8);
} else {
R = brw_vecn_reg(8, ARF, BRW_ARF_ACCUMULATOR + (grf * 2), 0);
}
if (stride_bits == (type_bits * 4)) {
R = stride(R, 8, 2, 4);
} else if (stride_bits == (type_bits * 2)) {
R = stride(R, 4, 2, 2);
} else {
assert(stride_bits == type_bits);
}
/* Broadcast is equivalent to <8, 8, 1> for SIMD1 instructions. Use that
* instead due to regioning restrictions.
*/
if (simd_width == 1) {
R = vec1(R);
}
} else if (jay_is_flag(d)) {
/* Explicit flags act like UGPRs. As sources they broadcast to all lanes,
* so we may ignore the SIMD offset. As destinations, they are written by
* SIMD1 instructions and are never SIMD split.
*/
assert(simd_offs == 0 || idx >= 0);
unsigned offs_B = d.reg * (f->shader->dispatch_width / 8);
R = brw_flag_subreg(offs_B / 2);
} else if (d.file == J_ADDRESS) {
R = brw_address_reg(d.reg);
} else if (d.file == J_ARF) {
R = brw_ud1_reg(ARF, jay_base_index(d), 0);
} else {
UNREACHABLE("unexpected file");
}
R.negate = d.negate;
R.abs = d.abs;
return byte_offset(retype(R, to_brw_reg_type(type)), offset_B);
}
#define SRC(i) to_brw_reg(f, I, i, simd_offs, false)
#define OP0(hw) \
case JAY_OPCODE_##hw: \
brw_##hw(p); \
break;
#define OP1(jay, hw) \
case JAY_OPCODE_##jay: \
brw_alu1(p, BRW_OPCODE_##hw, dst, SRC(0)); \
break;
#define OP2(jay, hw) \
case JAY_OPCODE_##jay: \
brw_alu2(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1)); \
break;
#define OP3(jay, hw) \
case JAY_OPCODE_##jay: \
brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1), SRC(2)); \
break;
#define OP3_SWAP(jay, hw) \
case JAY_OPCODE_##jay: \
brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(2), SRC(1), SRC(0)); \
break;
static struct brw_reg
quad_swizzle(struct brw_reg r, const jay_inst *I)
{
/* clang-format off */
switch (jay_quad_swizzle_swizzle(I)) {
case JAY_QUAD_SWIZZLE_XXXX: return suboffset(stride(r, 4, 4, 0), 0);
case JAY_QUAD_SWIZZLE_YYYY: return suboffset(stride(r, 4, 4, 0), 1);
case JAY_QUAD_SWIZZLE_ZZZZ: return suboffset(stride(r, 4, 4, 0), 2);
case JAY_QUAD_SWIZZLE_WWWW: return suboffset(stride(r, 4, 4, 0), 3);
case JAY_QUAD_SWIZZLE_XXZZ: return suboffset(stride(r, 2, 2, 0), 0);
case JAY_QUAD_SWIZZLE_YYWW: return suboffset(stride(r, 2, 2, 0), 1);
case JAY_QUAD_SWIZZLE_XYXY: return suboffset(stride(r, 0, 2, 1), 0);
case JAY_QUAD_SWIZZLE_ZWZW: return suboffset(stride(r, 0, 2, 1), 2);
}
/* clang-format on */
UNREACHABLE("invalid quad swizzle");
}
/* Runs once per SIMD-split, so must not modify the instruction! */
static void
emit(struct brw_codegen *p,
jay_function *f,
const jay_inst *I,
unsigned simd_offs)
{
ASSERTED unsigned nr_ins_before = p->nr_insn;
unsigned exec_size = jay_simd_width_physical(f->shader, I);
// jay_print_inst(stdout, (jay_inst *) I);
/* Replicate the SWSB regdist for SIMD split instructions if needed */
struct tgl_swsb dep =
simd_offs && !I->replicate_dep ? tgl_swsb_null() : I->dep;
/* We do not allow SBID dependencies on SIMD split instructions since
* individual groups could get shot down. This would require more tracking
* and is unclear whether it's beneficial.
*/
assert(simd_offs == 0 || I->dep.mode == TGL_SBID_NULL);
if (I->decrement_dep) {
unsigned delta = simd_offs * jay_macro_length(I);
assert(dep.regdist > delta);
dep.regdist -= delta;
}
brw_set_default_exec_size(p, util_logbase2(exec_size));
brw_set_default_mask_control(p, jay_is_no_mask(I));
brw_set_default_group(p, simd_offs * exec_size);
brw_set_default_swsb(p, dep);
brw_set_default_saturate(p, I->saturate);
/* Grab the hardware predicate, corresponding either to a logical predicate
* or SEL's selector.
*/
const jay_def *pred = I->predication ? jay_inst_get_predicate((void *) I) :
I->op == JAY_OPCODE_SEL ? &I->src[2] :
NULL;
brw_set_default_predicate_control(p, pred ? BRW_PREDICATE_NORMAL :
BRW_PREDICATE_NONE);
brw_set_default_predicate_inverse(p, pred && pred->negate);
/* Jay/brw enums line up by construction */
enum brw_conditional_mod cmod =
(enum brw_conditional_mod) I->conditional_mod;
if (!jay_is_null(I->cond_flag)) {
assert(!(pred && pred->reg != I->cond_flag.reg) && "must be tied");
pred = &I->cond_flag;
}
if (pred) {
unsigned reg = pred->reg * jay_phys_flag_per_virt(f->shader);
brw_set_default_flag_reg(p, reg / 2, reg % 2);
}
if (I->op == JAY_OPCODE_MIN) {
cmod = BRW_CONDITIONAL_L;
} else if (I->op == JAY_OPCODE_MAX) {
cmod = BRW_CONDITIONAL_GE;
}
struct brw_reg dst = to_brw_reg(f, I, -1, simd_offs, false);
switch (I->op) {
OP0(ELSE)
OP0(ENDIF)
OP0(WHILE)
OP0(BREAK)
OP1(MOV, MOV)
OP1(MODIFIER, MOV)
OP1(RNDD, RNDD)
OP1(RNDZ, RNDZ)
OP1(RNDE, RNDE)
OP1(FRC, FRC)
OP1(BFREV, BFREV)
OP1(CBIT, CBIT)
OP1(NOT, NOT)
OP1(FBL, FBL)
OP1(FBH, FBH)
OP1(LZD, LZD)
OP2(ROL, ROL)
OP2(ROR, ROR)
OP2(AVG, AVG)
OP2(ADD, ADD)
OP2(MUL, MUL)
OP2(SEL, SEL)
OP2(MIN, SEL)
OP2(MAX, SEL)
OP2(MUL_32X16, MUL)
OP2(AND, AND)
OP2(AND_U32_U16, AND)
OP2(OR, OR)
OP2(XOR, XOR)
OP2(ASR, ASR)
OP2(SHR, SHR)
OP2(SHL, SHL)
OP2(BFI1, BFI1)
OP3(BFI2, BFI2)
OP3(ADD3, ADD3)
OP3(CSEL, CSEL)
OP3(DP4A_UU, DP4A)
OP3(DP4A_SS, DP4A)
OP3(DP4A_SU, DP4A)
OP3_SWAP(MAD, MAD)
OP3_SWAP(BFE, BFE)
case JAY_OPCODE_LOOP_ONCE:
/* TODO: Is there a better way to do this? */
brw_BREAK(p);
brw_WHILE(p);
break;
case JAY_OPCODE_IF:
brw_IF(p, util_logbase2(exec_size));
break;
case JAY_OPCODE_MATH:
gfx6_math(p, dst, jay_math_op(I), SRC(0),
retype(brw_null_reg(), to_brw_reg_type(I->type)));
break;
case JAY_OPCODE_BFN:
brw_BFN(p, dst, SRC(0), SRC(1), SRC(2), brw_imm_ud(jay_bfn_ctrl(I)));
break;
case JAY_OPCODE_DESWIZZLE_ODD: {
bool hi = simd_offs == 0 ? true : jay_deswizzle_odd_src2_hi(I);
brw_set_default_group(p, 0);
brw_MOV(p, dst,
byte_offset(to_brw_reg(f, I, simd_offs, 0, false), hi ? 64 : 0));
break;
}
case JAY_OPCODE_DESWIZZLE_EVEN:
brw_set_default_exec_size(p, BRW_EXECUTE_16);
brw_MOV(p, byte_offset(dst, 64),
byte_offset(SRC(0), jay_deswizzle_even_src_hi(I) * 64));
break;
case JAY_OPCODE_CVT: {
unsigned index = jay_cvt_index(I);
bool force_hi = false;
/* We will apply a suboffset for the specific subword being converted. In
* the case where we have a subword (16-bit) stride, accesses to the upper
* half will be instead to a discontiguous GRF so we have to fix up. This
* affects u8->u32 conversions.
*/
if (I->src[0].file == GPR) {
unsigned type_size_B = jay_type_size_bits(jay_cvt_src_type(I)) / 8;
unsigned index_B = index * type_size_B;
unsigned stride_B =
jay_stride_to_bits(jay_def_stride(f->shader, I->src[0])) / 8;
if (index_B >= stride_B) {
assert(stride_B == 2 && index_B <= 4 && !I->src[0].hi);
force_hi = true;
index = (index_B % stride_B) / type_size_B;
}
}
brw_MOV(p, dst,
suboffset(to_brw_reg(f, I, 0, simd_offs, force_hi), index));
break;
}
case JAY_OPCODE_SYNC:
brw_SYNC(p, jay_sync_op(I));
break;
case JAY_OPCODE_CMP:
brw_CMP(p, dst, I->conditional_mod, SRC(0), SRC(1));
break;
case JAY_OPCODE_MOV_IMM64:
brw_MOV(p, dst, brw_imm_u64(jay_mov_imm64_imm(I)));
break;
case JAY_OPCODE_RELOC:
brw_MOV_reloc_imm(p, dst, BRW_TYPE_UD, jay_reloc_param(I),
jay_reloc_base(I));
break;
case JAY_OPCODE_QUAD_SWIZZLE:
/* Quad swizzle can get split down to SIMD4 even on Xe2 where we don't
* have NibCtrl. Fortunately, it's NoMask so it doesn't matter.
*/
brw_set_default_group(p, 0);
brw_MOV(p, dst, quad_swizzle(SRC(0), I));
break;
case JAY_OPCODE_BROADCAST_IMM:
brw_MOV(p, dst, get_element(SRC(0), jay_broadcast_imm_lane(I)));
break;
case JAY_OPCODE_SEND:
brw_SEND(p, jay_send_sfid(I), dst, SRC(2), SRC(3), SRC(0), SRC(1),
jay_send_ex_desc_imm(I), jay_send_ex_mlen(I),
jay_send_bindless(I), jay_send_eot(I), false /* gather */);
if (jay_send_check_tdr(I)) {
brw_eu_inst_set_opcode(p->isa, brw_eu_last_inst(p), BRW_OPCODE_SENDC);
}
break;
/* Gfx20+ has separate Render Target Array indices for each pair of subspans
* in order to support multiple polygons, so we need to use a <1;8,0> region
* in order to select the word for each channel.
*/
case JAY_OPCODE_EXTRACT_LAYER:
brw_AND(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UW), 1, 8, 0),
brw_imm_uw(0x7ff));
break;
case JAY_OPCODE_EXPAND_QUAD:
brw_MOV(p, dst, stride(SRC(simd_offs), 1, 4, 0));
break;
case JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS:
brw_set_default_exec_size(p, BRW_EXECUTE_32);
brw_set_default_group(p, 0);
brw_ADD(p, retype(dst, BRW_TYPE_UW), retype(SRC(0), BRW_TYPE_UW),
brw_imm_uv(0x11100100));
break;
case JAY_OPCODE_LANE_ID_8:
brw_set_default_exec_size(p, BRW_EXECUTE_8);
brw_MOV(p, dst, brw_imm_uv(0x76543210));
break;
case JAY_OPCODE_LANE_ID_EXPAND:
brw_set_default_exec_size(p, util_logbase2(jay_lane_id_expand_width(I)));
brw_ADD(p, suboffset(dst, jay_lane_id_expand_width(I)), SRC(0),
brw_imm_uw(jay_lane_id_expand_width(I)));
break;
case JAY_OPCODE_EXTRACT_BYTE_PER_8LANES:
brw_MOV(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UB), 1, 8, 0));
break;
case JAY_OPCODE_SHR_ODD_SUBSPANS_BY_4:
brw_SHR(p, dst, SRC(0), brw_imm_uv(0x44440000));
break;
case JAY_OPCODE_MUL_32: {
brw_MUL(p, retype(brw_acc_reg(1), to_brw_reg_type(I->type)), SRC(0),
subscript(SRC(1), BRW_TYPE_UW, 0));
brw_set_default_swsb(p, tgl_swsb_null());
brw_alu2(p, jay_mul_32_high(I) ? BRW_OPCODE_MACH : BRW_OPCODE_MACL, dst,
SRC(0), SRC(1));
break;
}
case JAY_OPCODE_SHUFFLE: {
struct brw_reg a0 = brw_address_reg(0);
unsigned grf_16 = to_def_grf_16(&f->shader->partition, I->src[0]);
unsigned offset_B = grf_16 * 2 * f->shader->dispatch_width;
brw_ADD(p, a0, subscript(SRC(1), BRW_TYPE_UW, 0), brw_imm_uw(offset_B));
brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), BRW_TYPE_UD));
break;
}
default:
jay_print_inst(stderr, (jay_inst *) I);
UNREACHABLE("Unhandled opcode");
}
if (cmod != BRW_CONDITIONAL_NONE) {
if (I->op != JAY_OPCODE_BFN) {
brw_eu_inst_set_cond_modifier(p->devinfo, brw_eu_last_inst(p), cmod);
} else {
unsigned cc = cmod == BRW_CONDITIONAL_L ? 3 :
cmod == BRW_CONDITIONAL_G ? 2 :
cmod == BRW_CONDITIONAL_Z ? 1 :
cmod == BRW_CONDITIONAL_NONE ? 0 :
-1;
assert(cc < 4 && "invalid cmod for bfn");
brw_eu_inst_set_boolean_func_cond_modifier(p->devinfo,
brw_eu_last_inst(p), cc);
}
}
assert(p->nr_insn == (nr_ins_before + jay_macro_length(I)) &&
"Jay instructions must map 1:n to GEN instructions");
}
struct jay_shader_bin *
jay_to_binary(jay_shader *s,
void *const_data,
size_t const_data_size,
bool debug)
{
struct jay_shader_bin *bin = rzalloc(s, struct jay_shader_bin);
struct util_dynarray prog;
util_dynarray_init(&prog, bin);
struct brw_isa_info isa;
struct brw_codegen p;
brw_init_isa_info(&isa, s->devinfo);
brw_init_codegen(&isa, &p, bin);
int start_offset = p.next_insn_offset;
/* TODO: Multifunction properly */
jay_foreach_function(s, f) {
jay_foreach_block(f, block) {
if (block->loop_header) {
brw_DO(&p, 0);
}
jay_foreach_inst_in_block(block, I) {
for (unsigned i = 0; i < (1 << jay_simd_split(s, I)); ++i) {
emit(&p, f, I, i);
}
}
}
}
int final_halt_offset = -1 /* TODO */;
brw_set_uip_jip(&p, start_offset, final_halt_offset);
struct disasm_info *disasm = disasm_initialize(p.isa, NULL);
disasm_new_inst_group(disasm, 0);
disasm_new_inst_group(disasm, p.next_insn_offset);
UNUSED bool valid = true;
#ifndef NDEBUG
valid =
brw_validate_instructions(p.isa, p.store, 0, p.next_insn_offset, disasm);
#endif
brw_compact_instructions(&p, start_offset, disasm);
if (debug || !valid) {
dump_assembly(p.store, 0, p.next_insn_offset, disasm, NULL, stdout);
}
if (!valid) {
UNREACHABLE("invalid assembly");
}
struct brw_stage_prog_data *prog_data = &s->prog_data->base;
assert(prog_data->const_data_size == 0);
if (const_data_size > 0) {
prog_data->const_data_size = const_data_size;
prog_data->const_data_offset =
brw_append_data(&p, const_data, const_data_size, 32);
}
bin->kernel = brw_get_program(&p, &bin->size);
s->prog_data->base.relocs =
brw_get_shader_relocs(&p, &s->prog_data->base.num_relocs);
return bin;
}