/* * Copyright 2026 Intel Corporation * SPDX-License-Identifier: MIT */ #include #include "compiler/brw/brw_disasm_info.h" #include "compiler/brw/brw_eu.h" #include "compiler/brw/brw_eu_defines.h" #include "compiler/brw/brw_eu_inst.h" #include "compiler/brw/brw_reg.h" #include "compiler/brw/brw_reg_type.h" #include "dev/intel_debug.h" #include "util/macros.h" #include "util/u_dynarray.h" #include "util/u_math.h" #include "jay.h" #include "jay_ir.h" #include "jay_opcodes.h" #include "jay_private.h" static inline enum brw_reg_type to_brw_reg_type(enum jay_type type) { /* clang-format off */ switch (type) { case JAY_TYPE_UNTYPED: case JAY_TYPE_U8: return BRW_TYPE_UB; case JAY_TYPE_U16: return BRW_TYPE_UW; case JAY_TYPE_U32: return BRW_TYPE_UD; case JAY_TYPE_U64: return BRW_TYPE_UQ; case JAY_TYPE_S8: return BRW_TYPE_B; case JAY_TYPE_S16: return BRW_TYPE_W; case JAY_TYPE_S32: return BRW_TYPE_D; case JAY_TYPE_S64: return BRW_TYPE_Q; case JAY_TYPE_F16: return BRW_TYPE_HF; case JAY_TYPE_F32: return BRW_TYPE_F; case JAY_TYPE_F64: return BRW_TYPE_DF; case JAY_TYPE_BF16: return BRW_TYPE_BF; default: UNREACHABLE("invalid type"); } /* clang-format on */ } static inline unsigned to_def_grf_16(struct jay_partition *p, jay_def d) { unsigned count = jay_num_values(d); if (count == 0 || !(d.file == GPR || d.file == UGPR)) { return d.reg; } unsigned base = 0; for (unsigned i = 0; i < JAY_PARTITION_BLOCKS; ++i) { unsigned offset = d.reg - base; if (offset < p->blocks[d.file][i].len) { assert(offset + count <= p->blocks[d.file][i].len && "vectors must not cross partition boundaries"); return (p->blocks[d.file][i].start + offset) * 2 + d.hi; } base += p->blocks[d.file][i].len; } UNREACHABLE("virtual register must be in a block"); } static inline brw_reg to_brw_reg(jay_function *f, const jay_inst *I, signed idx, unsigned simd_offs, bool force_hi) { bool is_dest = idx < 0; enum jay_type type = is_dest ? I->type : jay_src_type(I, idx); jay_def d = is_dest ? I->dst : I->src[idx]; d.hi |= force_hi; struct brw_reg R; unsigned reg = to_def_grf_16(&f->shader->partition, d), offset_B = 0; if (jay_is_imm(d)) { /* Immediates have size restrictions but can zero extend */ if (jay_type_size_bits(type) == 64) { type = jay_type_resize(type, 32); } else if (I->op == JAY_OPCODE_BFN) { assert(jay_as_uint(d) <= UINT16_MAX); type = JAY_TYPE_U16; } R = brw_imm_ud(jay_as_uint(d)); } else if (jay_is_null(d)) { R = brw_null_reg(); } else if (d.file == UGPR || d.file == UACCUM) { unsigned phys_reg = (reg >> 1) / 8; offset_B = ((reg >> 1) % 8) * 4; if (d.file == UGPR) { R = brw_ud1_grf(phys_reg, 0); } else { R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (phys_reg * 2), 0); } /* Handle 3-src restrictions and vectorized uniform code. */ if (is_dest || jay_num_values(d) >= 8) { R = vec8(R); } /* Some operations have special restrictions on the destination stride, * but if we write a single UGPR the stride is ignored.. Specify * whatever stride is needed to satisfy the rules. */ if (is_dest) { /* BSpec 56640 "Special Restrictions" says: * * "Conversion between HF and Integer must be DWord-aligned * and strided by a DWord on the destination." */ enum jay_type src0_type = jay_src_type(I, 0); if ((I->type == JAY_TYPE_F16 && !jay_type_is_any_float(src0_type)) || (src0_type == JAY_TYPE_F16 && !jay_type_is_any_float(I->type))) { assert(jay_num_values(d) == 1 && "must not vectorize HF<->Int"); R = stride(R, 8, 2, 4); } /* Packed floats have restrictions on mixed sizes. Use <2>. */ if (jay_type_size_bits(I->type) == 16 && jay_type_size_bits(jay_src_type(I, 0)) != 16) { assert(jay_num_values(d) == 1 && "must not vectorize mixed float"); R = stride(R, 4, 2, 2); } } } else if (d.file == GPR || d.file == ACCUM) { enum jay_stride def_stride = d.file == GPR ? jay_def_stride(f->shader, d) : JAY_STRIDE_4; uint32_t type_bits = jay_type_size_bits(type); unsigned stride_bits = jay_stride_to_bits(def_stride); unsigned simd_width = jay_simd_width_physical(f->shader, I); unsigned phys_reg; if (def_stride == JAY_STRIDE_2) { /* Bit 0 selects between lo/hi halves of the GPR */ phys_reg = (reg / 2) * jay_grf_per_gpr(f->shader); offset_B = (reg & 1) * 2 * f->shader->dispatch_width; } else { /* Low bits are an offset in 2-byte words into the GRF */ unsigned mask = BITFIELD_MASK(stride_bits / 32); phys_reg = ((reg & ~mask) / 2) * jay_grf_per_gpr(f->shader); offset_B = (reg & mask) * 2; } if (d.file == GPR) { R = xe2_vec8_grf(phys_reg, 0); } else { R = brw_vecn_reg(8, ARF, BRW_ARF_ACCUMULATOR + (phys_reg * 2), 0); } R = byte_offset(R, simd_offs * simd_width * stride_bits / 8); if (stride_bits == (type_bits * 4)) { R = stride(R, 8, 2, 4); } else if (stride_bits == (type_bits * 2)) { R = stride(R, 4, 2, 2); } else { assert(stride_bits == type_bits); } /* Broadcast is equivalent to <8, 8, 1> for SIMD1 instructions. Use that * instead due to regioning restrictions. */ if (simd_width == 1) { R = vec1(R); } } else if (jay_is_flag(d)) { /* Explicit flags act like UGPRs. As sources they broadcast to all lanes, * so we may ignore the SIMD offset. As destinations, they are written by * SIMD1 instructions and are never SIMD split. */ assert(simd_offs == 0 || idx >= 0); unsigned offs_B = d.reg * (f->shader->dispatch_width / 8); R = brw_flag_subreg(offs_B / 2); } else if (d.file == J_ADDRESS) { R = brw_address_reg(d.reg); } else if (d.file == J_ARF) { R = brw_ud1_reg(ARF, jay_base_index(d), 0); } else { UNREACHABLE("unexpected file"); } R.negate = d.negate; R.abs = d.abs; return byte_offset(retype(R, to_brw_reg_type(type)), offset_B); } #define SRC(i) to_brw_reg(f, I, i, simd_offs, false) #define OP0(hw) \ case JAY_OPCODE_##hw: \ brw_##hw(p); \ break; #define OP1(jay, hw) \ case JAY_OPCODE_##jay: \ brw_alu1(p, BRW_OPCODE_##hw, dst, SRC(0)); \ break; #define OP2(jay, hw) \ case JAY_OPCODE_##jay: \ brw_alu2(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1)); \ break; #define OP3(jay, hw) \ case JAY_OPCODE_##jay: \ brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1), SRC(2)); \ break; #define OP3_SWAP(jay, hw) \ case JAY_OPCODE_##jay: \ brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(2), SRC(1), SRC(0)); \ break; static struct brw_reg quad_swizzle(struct brw_reg r, const jay_inst *I) { /* clang-format off */ switch (jay_quad_swizzle_swizzle(I)) { case JAY_QUAD_SWIZZLE_XXXX: return suboffset(stride(r, 4, 4, 0), 0); case JAY_QUAD_SWIZZLE_YYYY: return suboffset(stride(r, 4, 4, 0), 1); case JAY_QUAD_SWIZZLE_ZZZZ: return suboffset(stride(r, 4, 4, 0), 2); case JAY_QUAD_SWIZZLE_WWWW: return suboffset(stride(r, 4, 4, 0), 3); case JAY_QUAD_SWIZZLE_XXZZ: return suboffset(stride(r, 2, 2, 0), 0); case JAY_QUAD_SWIZZLE_YYWW: return suboffset(stride(r, 2, 2, 0), 1); case JAY_QUAD_SWIZZLE_XYXY: return suboffset(stride(r, 0, 2, 1), 0); case JAY_QUAD_SWIZZLE_ZWZW: return suboffset(stride(r, 0, 2, 1), 2); } /* clang-format on */ UNREACHABLE("invalid quad swizzle"); } /* Runs once per SIMD-split, so must not modify the instruction! */ static void emit(struct brw_codegen *p, jay_function *f, const jay_inst *I, unsigned simd_offs) { ASSERTED unsigned nr_ins_before = p->nr_insn; unsigned exec_size = jay_simd_width_physical(f->shader, I); // jay_print_inst(stdout, (jay_inst *) I); /* Replicate the SWSB regdist for SIMD split instructions if needed */ struct tgl_swsb dep = simd_offs && !I->replicate_dep ? tgl_swsb_null() : I->dep; /* We do not allow SBID dependencies on SIMD split instructions since * individual groups could get shot down. This would require more tracking * and is unclear whether it's beneficial. */ assert(simd_offs == 0 || I->dep.mode == TGL_SBID_NULL); if (I->decrement_dep) { unsigned delta = simd_offs * jay_macro_length(I); assert(dep.regdist > delta); dep.regdist -= delta; } brw_set_default_exec_size(p, util_logbase2(exec_size)); brw_set_default_mask_control(p, jay_is_no_mask(I)); brw_set_default_group(p, simd_offs * exec_size); brw_set_default_swsb(p, dep); brw_set_default_saturate(p, I->saturate); /* Grab the hardware predicate, corresponding either to a logical predicate * or SEL's selector. */ const jay_def *pred = I->predication ? jay_inst_get_predicate((void *) I) : I->op == JAY_OPCODE_SEL ? &I->src[2] : NULL; brw_set_default_predicate_control(p, pred ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE); brw_set_default_predicate_inverse(p, pred && pred->negate); /* Jay/brw enums line up by construction */ enum brw_conditional_mod cmod = (enum brw_conditional_mod) I->conditional_mod; if (!jay_is_null(I->cond_flag)) { assert(!(pred && pred->reg != I->cond_flag.reg) && "must be tied"); pred = &I->cond_flag; } if (pred) { unsigned reg = pred->reg * jay_phys_flag_per_virt(f->shader); brw_set_default_flag_reg(p, reg / 2, reg % 2); } if (I->op == JAY_OPCODE_MIN) { cmod = BRW_CONDITIONAL_L; } else if (I->op == JAY_OPCODE_MAX) { cmod = BRW_CONDITIONAL_GE; } struct brw_reg dst = to_brw_reg(f, I, -1, simd_offs, false); switch (I->op) { OP0(ELSE) OP0(ENDIF) OP0(WHILE) OP0(BREAK) OP1(MOV, MOV) OP1(MODIFIER, MOV) OP1(RNDD, RNDD) OP1(RNDZ, RNDZ) OP1(RNDE, RNDE) OP1(FRC, FRC) OP1(BFREV, BFREV) OP1(CBIT, CBIT) OP1(NOT, NOT) OP1(FBL, FBL) OP1(FBH, FBH) OP1(LZD, LZD) OP2(ROL, ROL) OP2(ROR, ROR) OP2(AVG, AVG) OP2(ADD, ADD) OP2(MUL, MUL) OP2(SEL, SEL) OP2(MIN, SEL) OP2(MAX, SEL) OP2(MUL_32X16, MUL) OP2(AND, AND) OP2(AND_U32_U16, AND) OP2(OR, OR) OP2(XOR, XOR) OP2(ASR, ASR) OP2(SHR, SHR) OP2(SHL, SHL) OP2(BFI1, BFI1) OP2(MAC, MAC) OP3(BFI2, BFI2) OP3(ADD3, ADD3) OP3(CSEL, CSEL) OP3(DP4A_UU, DP4A) OP3(DP4A_SS, DP4A) OP3(DP4A_SU, DP4A) OP3_SWAP(MAD, MAD) OP3_SWAP(BFE, BFE) case JAY_OPCODE_LOOP_ONCE: /* TODO: Is there a better way to do this? */ brw_BREAK(p); brw_WHILE(p); break; case JAY_OPCODE_IF: brw_IF(p, util_logbase2(exec_size)); break; case JAY_OPCODE_MATH: gfx6_math(p, dst, jay_math_op(I), SRC(0), retype(brw_null_reg(), to_brw_reg_type(I->type))); break; case JAY_OPCODE_BFN: brw_BFN(p, dst, SRC(0), SRC(1), SRC(2), brw_imm_ud(jay_bfn_ctrl(I))); break; case JAY_OPCODE_DESWIZZLE_ODD: { bool hi = simd_offs == 0 ? true : jay_deswizzle_odd_src2_hi(I); brw_set_default_group(p, 0); brw_MOV(p, dst, byte_offset(to_brw_reg(f, I, simd_offs, 0, false), hi ? 64 : 0)); break; } case JAY_OPCODE_DESWIZZLE_EVEN: brw_set_default_exec_size(p, BRW_EXECUTE_16); brw_MOV(p, byte_offset(dst, 64), byte_offset(SRC(0), jay_deswizzle_even_src_hi(I) * 64)); break; case JAY_OPCODE_CVT: { unsigned index = jay_cvt_index(I); bool force_hi = false; /* We will apply a suboffset for the specific subword being converted. In * the case where we have a subword (16-bit) stride, accesses to the upper * half will be instead to a discontiguous GRF so we have to fix up. This * affects u8->u32 conversions. */ if (I->src[0].file == GPR) { unsigned type_size_B = jay_type_size_bits(jay_cvt_src_type(I)) / 8; unsigned index_B = index * type_size_B; unsigned stride_B = jay_stride_to_bits(jay_def_stride(f->shader, I->src[0])) / 8; if (index_B >= stride_B) { assert(stride_B == 2 && index_B <= 4 && !I->src[0].hi); force_hi = true; index = (index_B % stride_B) / type_size_B; } } brw_MOV(p, dst, suboffset(to_brw_reg(f, I, 0, simd_offs, force_hi), index)); break; } case JAY_OPCODE_SYNC: brw_SYNC(p, jay_sync_op(I)); if (!jay_is_null(I->src[0])) { brw_set_src0(p, brw_eu_last_inst(p), stride(SRC(0), 0, 1, 0)); } break; case JAY_OPCODE_CMP: brw_CMP(p, dst, I->conditional_mod, SRC(0), SRC(1)); break; case JAY_OPCODE_MOV_IMM64: brw_MOV(p, dst, brw_imm_u64(jay_mov_imm64_imm(I))); break; case JAY_OPCODE_RELOC: brw_MOV_reloc_imm(p, dst, BRW_TYPE_UD, jay_reloc_param(I), jay_reloc_base(I)); break; case JAY_OPCODE_QUAD_SWIZZLE: /* Quad swizzle can get split down to SIMD4 even on Xe2 where we don't * have NibCtrl. Fortunately, it's NoMask so it doesn't matter. */ brw_set_default_group(p, 0); brw_MOV(p, dst, quad_swizzle(SRC(0), I)); break; case JAY_OPCODE_BROADCAST_IMM: brw_MOV(p, dst, get_element(SRC(0), jay_broadcast_imm_lane(I))); break; case JAY_OPCODE_SEND: brw_SEND(p, jay_send_sfid(I), dst, SRC(2), SRC(3), SRC(0), SRC(1), jay_send_ex_desc_imm(I), jay_send_ex_mlen(I), jay_send_bindless(I), jay_send_eot(I), false /* gather */); if (jay_send_check_tdr(I)) { brw_eu_inst_set_opcode(p->isa, brw_eu_last_inst(p), BRW_OPCODE_SENDC); } break; /* Gfx20+ has separate Render Target Array indices for each pair of subspans * in order to support multiple polygons, so we need to use a <1;8,0> region * in order to select the word for each channel. */ case JAY_OPCODE_EXTRACT_LAYER: brw_AND(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UW), 1, 8, 0), brw_imm_uw(0x7ff)); break; case JAY_OPCODE_EXPAND_QUAD: brw_MOV(p, dst, stride(SRC(simd_offs), 1, 4, 0)); break; case JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS: brw_set_default_exec_size(p, BRW_EXECUTE_32); brw_set_default_group(p, 0); brw_ADD(p, retype(dst, BRW_TYPE_UW), retype(SRC(0), BRW_TYPE_UW), brw_imm_uv(0x11100100)); break; case JAY_OPCODE_LANE_ID_8: brw_set_default_exec_size(p, BRW_EXECUTE_8); brw_MOV(p, dst, brw_imm_uv(0x76543210)); break; case JAY_OPCODE_LANE_ID_EXPAND: brw_set_default_exec_size(p, util_logbase2(jay_lane_id_expand_width(I))); brw_ADD(p, suboffset(dst, jay_lane_id_expand_width(I)), SRC(0), brw_imm_uw(jay_lane_id_expand_width(I))); break; case JAY_OPCODE_EXTRACT_BYTE_PER_8LANES: brw_MOV(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UB), 1, 8, 0)); break; case JAY_OPCODE_BYTE_PACK: brw_MOV(p, stride(retype(dst, BRW_TYPE_UB), 1, 1, 0), stride(retype(SRC(0), BRW_TYPE_UB), 4, 1, 0)); break; case JAY_OPCODE_SHR_ODD_SUBSPANS_BY_4: brw_SHR(p, dst, SRC(0), brw_imm_uv(0x44440000)); break; case JAY_OPCODE_MUL_32: { brw_MUL(p, retype(brw_acc_reg(1), to_brw_reg_type(I->type)), SRC(0), subscript(SRC(1), BRW_TYPE_UW, 0)); brw_set_default_swsb(p, tgl_swsb_null()); brw_alu2(p, jay_mul_32_high(I) ? BRW_OPCODE_MACH : BRW_OPCODE_MACL, dst, SRC(0), SRC(1)); break; } case JAY_OPCODE_SHUFFLE: { struct brw_reg a0 = brw_address_reg(0); unsigned grf_16 = to_def_grf_16(&f->shader->partition, I->src[0]); unsigned offset_B = grf_16 * 2 * f->shader->dispatch_width; brw_ADD(p, a0, subscript(SRC(1), BRW_TYPE_UW, 0), brw_imm_uw(offset_B)); brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), BRW_TYPE_UD)); break; } default: jay_print_inst(stderr, (jay_inst *) I); UNREACHABLE("Unhandled opcode"); } if (cmod != BRW_CONDITIONAL_NONE) { if (I->op != JAY_OPCODE_BFN) { brw_eu_inst_set_cond_modifier(p->devinfo, brw_eu_last_inst(p), cmod); } else { unsigned cc = cmod == BRW_CONDITIONAL_L ? 3 : cmod == BRW_CONDITIONAL_G ? 2 : cmod == BRW_CONDITIONAL_Z ? 1 : cmod == BRW_CONDITIONAL_NONE ? 0 : -1; assert(cc < 4 && "invalid cmod for bfn"); brw_eu_inst_set_boolean_func_cond_modifier(p->devinfo, brw_eu_last_inst(p), cc); } } assert(p->nr_insn == (nr_ins_before + jay_macro_length(I)) && "Jay instructions must map 1:n to GEN instructions"); } struct jay_shader_bin * jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size, bool debug) { struct jay_shader_bin *bin = rzalloc(s, struct jay_shader_bin); struct util_dynarray prog; util_dynarray_init(&prog, bin); struct brw_isa_info isa; struct brw_codegen p; brw_init_isa_info(&isa, s->devinfo); brw_init_codegen(&isa, &p, bin); int start_offset = p.next_insn_offset; /* TODO: Multifunction properly */ jay_foreach_function(s, f) { jay_foreach_block(f, block) { if (block->loop_header) { brw_DO(&p, 0); } jay_foreach_inst_in_block(block, I) { for (unsigned i = 0; i < (1 << jay_simd_split(s, I)); ++i) { emit(&p, f, I, i); } } } } int final_halt_offset = -1 /* TODO */; brw_set_uip_jip(&p, start_offset, final_halt_offset); struct disasm_info *disasm = disasm_initialize(p.isa, NULL); disasm_new_inst_group(disasm, 0); disasm_new_inst_group(disasm, p.next_insn_offset); UNUSED bool valid = true; #ifndef NDEBUG valid = brw_validate_instructions(p.isa, p.store, 0, p.next_insn_offset, disasm); #endif brw_compact_instructions(&p, start_offset, disasm); if (debug || !valid) { dump_assembly(p.store, 0, p.next_insn_offset, disasm, NULL, stdout); } if (!valid) { UNREACHABLE("invalid assembly"); } struct brw_stage_prog_data *prog_data = &s->prog_data->base; assert(prog_data->const_data_size == 0); if (const_data_size > 0) { prog_data->const_data_size = const_data_size; prog_data->const_data_offset = brw_append_data(&p, const_data, const_data_size, 32); } bin->kernel = brw_get_program(&p, &bin->size); s->prog_data->base.relocs = brw_get_shader_relocs(&p, &s->prog_data->base.num_relocs); return bin; }