From 0aff5e006c3d8fc773c05effa547b3ddccfec077 Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Fri, 29 May 2026 21:41:39 -0700 Subject: [PATCH] jay: Handle dpas_intel intrinsic Reviewed-by: Alyssa Rosenzweig Part-of: --- .../compiler/jay/jay_assign_accumulators.c | 3 +- .../compiler/jay/jay_builder_opcodes.h.py | 9 ++- src/intel/compiler/jay/jay_from_nir.c | 58 +++++++++++++++++++ src/intel/compiler/jay/jay_ir.h | 3 +- src/intel/compiler/jay/jay_lower_scoreboard.c | 48 +++++++++++++-- src/intel/compiler/jay/jay_opcodes.py | 11 ++++ src/intel/compiler/jay/jay_opt_predicate.c | 4 +- src/intel/compiler/jay/jay_to_binary.c | 16 +++++ src/intel/compiler/jay/jay_validate.c | 25 +++++++- src/intel/vulkan/anv_physical_device.c | 4 +- 10 files changed, 166 insertions(+), 15 deletions(-) diff --git a/src/intel/compiler/jay/jay_assign_accumulators.c b/src/intel/compiler/jay/jay_assign_accumulators.c index dff099edc91..709a95a903a 100644 --- a/src/intel/compiler/jay/jay_assign_accumulators.c +++ b/src/intel/compiler/jay/jay_assign_accumulators.c @@ -136,7 +136,8 @@ can_access_accum(jay_shader *shader, jay_inst *I, signed src) /* "No Accumulator usage for Control Flow, Math, Send, DPAS instructions." 
*/ if (jay_op_is_control_flow(I->op) || I->op == JAY_OPCODE_MATH || - I->op == JAY_OPCODE_SEND) { + I->op == JAY_OPCODE_SEND || + I->op == JAY_OPCODE_DPAS) { return false; } diff --git a/src/intel/compiler/jay/jay_builder_opcodes.h.py b/src/intel/compiler/jay/jay_builder_opcodes.h.py index 992970ba4fb..46d2a900c44 100644 --- a/src/intel/compiler/jay/jay_builder_opcodes.h.py +++ b/src/intel/compiler/jay/jay_builder_opcodes.h.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: def infer_type(op: 'Opcode') -> bool: return op.has_dest and (set(op.types) <= set(["u1", "u32", "u64"]) or - op.name == 'mov') + op.name == 'mov') and op.name != 'dpas' def signature(op: 'Opcode', with_dest: bool = True, with_types: bool = False, @@ -107,6 +107,8 @@ _jay_${OPCODE}(${signature(op, with_types = True)}) #define jay_${OPCODE}(${signature(op, with_types = True, mode = 'call')}) _jay_${OPCODE}(${signature(op, with_types = True, src = 'JAY_BUILD_SRC({})', mode='call')}) +% if op.name not in no_typed_wrappers: + % for type in op.types: static inline ${'jay_def' if op.has_dest else 'void'} _jay_${OPCODE}_${type}(${signature(op, with_dest = False)}) @@ -123,6 +125,8 @@ _jay_${OPCODE}_${type}(${signature(op, with_dest = False)}) 'call')}) _jay_${OPCODE}_${type}(${signature(op, src='JAY_BUILD_SRC({})', mode = 'call', with_dest = False)}) % endfor +% endif + % endfor #undef type_assert @@ -141,7 +145,8 @@ def main() -> int: f.write(Template(TEMPLATE).render( opcodes=ops, signature=signature, - infer_type=infer_type)) + infer_type=infer_type, + no_typed_wrappers={'dpas'})) except Exception: print(exceptions.text_error_template().render()) return 1 diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c index de5d144a914..93d14b0d09d 100644 --- a/src/intel/compiler/jay/jay_from_nir.c +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -212,6 +212,27 @@ jay_alu_source_type(nir_alu_instr *alu, unsigned i) nir_src_bit_size(alu->src[i].src)); } +static enum jay_type 
+jay_type_for_glsl_base_type(enum glsl_base_type t) +{ + switch (t) { + case GLSL_TYPE_UINT: return JAY_TYPE_U32; + case GLSL_TYPE_INT: return JAY_TYPE_S32; + case GLSL_TYPE_FLOAT: return JAY_TYPE_F32; + case GLSL_TYPE_FLOAT16: return JAY_TYPE_F16; + case GLSL_TYPE_BFLOAT16: return JAY_TYPE_BF16; + case GLSL_TYPE_DOUBLE: return JAY_TYPE_F64; + case GLSL_TYPE_UINT16: return JAY_TYPE_U16; + case GLSL_TYPE_INT16: return JAY_TYPE_S16; + case GLSL_TYPE_UINT8: return JAY_TYPE_U8; + case GLSL_TYPE_INT8: return JAY_TYPE_S8; + case GLSL_TYPE_UINT64: return JAY_TYPE_U64; + case GLSL_TYPE_INT64: return JAY_TYPE_S64; + default: + UNREACHABLE("invalid base type"); + } +} + static inline jay_def nj_def(nir_def *def) { @@ -1335,6 +1356,39 @@ jay_emit_rt_trace_ray(struct nir_to_jay_state *nj, nir_intrinsic_instr *instr) jay_SCHEDULE_BARRIER(&nj->bld); } +static void +jay_emit_dpas(struct nir_to_jay_state *nj, + nir_intrinsic_instr *intr) +{ + assert(mesa_shader_stage_uses_workgroup(nj->nir->info.stage)); + + /* An all-zero accumulator source can be replaced by the null register. */ + bool src0_use_null = true; + for (unsigned c = 0; c < nir_src_num_components(intr->src[0]); c++) { + nir_scalar val = nir_scalar_resolved(intr->src[0].ssa, c); + src0_use_null &= nir_scalar_is_zero(val); + } + + jay_builder *b = &nj->bld; + jay_def dst = nj_def(&intr->def); + jay_def src[3] = { + src0_use_null ? jay_null() + : jay_as_gpr(b, nj_src(intr->src[0])), + jay_as_gpr(b, nj_src(intr->src[1])), + jay_as_gpr(b, nj_src(intr->src[2])), + }; + + /* Jay follows HW source order. 
*/ + jay_DPAS(b, dst, src[0], src[2], src[1], + nir_intrinsic_systolic_depth(intr), + nir_intrinsic_repeat_count(intr), + jay_type_for_glsl_base_type(nir_intrinsic_dest_base_type(intr)), + jay_type_for_glsl_base_type(nir_intrinsic_src_base_type(intr)), + /* sbid */ 0)->saturate = nir_intrinsic_saturate(intr); + + nj->s->prog_data->cs.uses_systolic = true; +} + static void jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) { @@ -1836,6 +1890,10 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) jay_emit_rt_trace_ray(nj, intr); break; + case nir_intrinsic_dpas_intel: + jay_emit_dpas(nj, intr); + break; + default: #ifndef NDEBUG assert(intr->intrinsic < nir_num_intrinsics); diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h index a87e7bd3f2e..18d1a2a11a8 100644 --- a/src/intel/compiler/jay/jay_ir.h +++ b/src/intel/compiler/jay/jay_ir.h @@ -981,7 +981,8 @@ jay_is_no_mask(const jay_inst *I) I->op == JAY_OPCODE_QUAD_SWIZZLE || I->op == JAY_OPCODE_DESWIZZLE_EVEN || I->op == JAY_OPCODE_DESWIZZLE_ODD || - I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS; + I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS || + I->op == JAY_OPCODE_DPAS; } /** diff --git a/src/intel/compiler/jay/jay_lower_scoreboard.c b/src/intel/compiler/jay/jay_lower_scoreboard.c index 7ca6d979623..cdcb96a0c3f 100644 --- a/src/intel/compiler/jay/jay_lower_scoreboard.c +++ b/src/intel/compiler/jay/jay_lower_scoreboard.c @@ -54,25 +54,31 @@ sync_sbids(jay_builder *b, uint32_t mask, gen_sbid_mode mode) static inline bool jay_inst_is_unordered(const jay_inst *I) { - return I->op == JAY_OPCODE_SEND; + return I->op == JAY_OPCODE_SEND || + I->op == JAY_OPCODE_DPAS; } static inline bool jay_inst_has_sbid(const jay_inst *I) { - return I->op == JAY_OPCODE_SEND && !jay_send_eot(I); + return jay_inst_is_unordered(I) && + !(I->op == JAY_OPCODE_SEND && jay_send_eot(I)); } static inline unsigned jay_inst_sbid(const jay_inst *I) { - return 
jay_send_sbid(I); + return I->op == JAY_OPCODE_SEND ? jay_send_sbid(I) + : jay_dpas_sbid(I); } static inline void jay_inst_set_sbid(jay_inst *I, unsigned sbid) { - jay_set_send_sbid(I, sbid); + if (I->op == JAY_OPCODE_SEND) + jay_set_send_sbid(I, sbid); + else + jay_set_dpas_sbid(I, sbid); } static void @@ -218,6 +224,9 @@ inferred_sync_pipe(const struct intel_device_info *devinfo, const jay_inst *I) if (I->op == JAY_OPCODE_SEND) { return GEN_PIPE_NONE; + } else if (I->op == JAY_OPCODE_DPAS) { + return jay_type_is_any_float(jay_dpas_acc_type(I)) ? GEN_PIPE_FLOAT + : GEN_PIPE_INT; } else if (devinfo->verx10 >= 125 && type == JAY_TYPE_F64) { /* Avoid emitting (RegDist, SWSB) annotations for long instructions on * platforms where they are unordered as they may not be allowed. @@ -286,6 +295,10 @@ lower_regdist(jay_function *func, jay_inst *I, struct swsb_state *ctx) ctx->last_sync = I; uint32_t sbid_mask = 0; if (jay_sync_op(I) == TGL_SYNC_NOP) { + /* The RegDist-only SYNC.nops added by this function are inserted + * *before* the instruction, so they are not seen here. + */ + assert(I->dep.mode != GEN_SBID_NULL); sbid_mask = BITFIELD_BIT(I->dep.sbid); } else if (jay_sync_op(I) == TGL_SYNC_ALLRD || jay_sync_op(I) == TGL_SYNC_ALLWR) { @@ -415,6 +428,24 @@ lower_regdist(jay_function *func, jay_inst *I, struct swsb_state *ctx) GEN_PIPE_NONE, }; + /* DPAS can only represent in-order dependency for its inferred pipe, + * so if it depends on something else, add an extra SYNC.nop for that. 
+ */ + if (I->op == JAY_OPCODE_DPAS && + wait_pipes && + (!single_wait || + last_pipe != inferred_sync_pipe(func->shader->devinfo, I))) { + assert(I->dep.regdist > 0); + jay_builder b = jay_init_builder(func, jay_before_inst(I)); + + jay_inst *sync = jay_SYNC(&b, jay_null(), TGL_SYNC_NOP); + sync->dep.regdist = I->dep.regdist; + sync->dep.pipe = I->dep.pipe; + + I->dep.regdist = 0; + I->dep.pipe = GEN_PIPE_NONE; + } + /* Fold the immediate preceding SYNC.nop into this instruction, allowing * us to wait on both ALU and a SBID in the same annotation. We cannot do * this safely in the presence of predication or SIMD splitting that could @@ -475,7 +506,14 @@ jay_lower_scoreboard_trivial(jay_shader *shader) { jay_foreach_inst_in_shader_safe(shader, func, I) { if (jay_inst_has_sbid(I)) { - I->dep = gen_swsb_dst_dep(gen_swsb_sbid(GEN_SBID_SET, 0), 1); + if (I->op == JAY_OPCODE_DPAS) { + /* DPAS can't have an A@1, so insert an extra SYNC.nop. */ + jay_builder before = jay_init_builder(func, jay_before_inst(I)); + jay_SYNC(&before, jay_null(), TGL_SYNC_NOP)->dep = gen_swsb_regdist(1); + I->dep = gen_swsb_sbid(GEN_SBID_SET, 0); + } else { + I->dep = gen_swsb_dst_dep(gen_swsb_sbid(GEN_SBID_SET, 0), 1); + } jay_builder b = jay_init_builder(func, jay_after_inst(I)); sync_sbids(&b, BITFIELD_BIT(0), GEN_SBID_DST); diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py index 995dd8fcfc5..78b73446620 100644 --- a/src/intel/compiler/jay/jay_opcodes.py +++ b/src/intel/compiler/jay/jay_opcodes.py @@ -222,6 +222,17 @@ op('shuffle', 2, 'u1 u32') # Shuffle with a constant lane index. op('broadcast_imm', 1, 'u1 u32', 0, ['unsigned lane']) +# Follows hardware source order: C B A. Data is already packed into u32 +# slots by NIR; the types are used when making the gen_inst. 
+op('dpas', 3, 'u32', 0, [ + 'uint8_t sdepth', + 'uint8_t rcount', + 'enum jay_type acc_type', + 'enum jay_type src_type', + 'uint8_t sbid', + 'uint8_t pad[3]', +]) + OPCODES = _opcodes ENUMS: 'Mapping[str, tuple[str, list[str]]]' = { diff --git a/src/intel/compiler/jay/jay_opt_predicate.c b/src/intel/compiler/jay/jay_opt_predicate.c index 329c9f3fe87..cc88a489b42 100644 --- a/src/intel/compiler/jay/jay_opt_predicate.c +++ b/src/intel/compiler/jay/jay_opt_predicate.c @@ -25,8 +25,10 @@ predicate_block(jay_builder *b, */ jay_foreach_inst_in_block(block, I) { if (jay_uses_flag(I) || - (I->op == JAY_OPCODE_MIN || I->op == JAY_OPCODE_MAX) || + I->op == JAY_OPCODE_MIN || + I->op == JAY_OPCODE_MAX || I->op == JAY_OPCODE_CSEL || + I->op == JAY_OPCODE_DPAS || (condition.file != UFLAG && jay_is_no_mask(I)) || (--limit) < 0) return false; diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c index 018b94e0e65..0b5fd14a232 100644 --- a/src/intel/compiler/jay/jay_to_binary.c +++ b/src/intel/compiler/jay/jay_to_binary.c @@ -259,6 +259,7 @@ static const struct { OP(DP4A_SS, DP4A, 3), OP(DP4A_SU, DP4A, 3), OP(DP4A_UU, DP4A, 3), + OP(DPAS, DPAS, 3), OP(ELSE, ELSE, 0), OP(ENDIF, ENDIF, 0), OP(EXPAND_QUAD, MOV, 2), @@ -592,6 +593,21 @@ emit(struct jay_codegen *jc, gen->opcode = GEN_OP_HALT; break; + case JAY_OPCODE_DPAS: { + gen_reg_type acc_type = to_gen_reg_type(jay_dpas_acc_type(I)); + gen_reg_type src_type = to_gen_reg_type(jay_dpas_src_type(I)); + + gen->dst = gen_retype(gen->dst, acc_type); + gen->src[0] = gen_retype(gen->src[0], acc_type); + gen->src[1] = gen_retype(gen->src[1], src_type); + gen->src[2] = gen_retype(gen->src[2], src_type); + + gen->dpas.sdepth = jay_dpas_sdepth(I); + gen->dpas.rcount = jay_dpas_rcount(I); + gen->exec_size = jc->devinfo->ver >= 20 ? 
16 : 8; + break; + } + default: break; } diff --git a/src/intel/compiler/jay/jay_validate.c b/src/intel/compiler/jay/jay_validate.c index 8e8e1837ff3..13bd0889ce8 100644 --- a/src/intel/compiler/jay/jay_validate.c +++ b/src/intel/compiler/jay/jay_validate.c @@ -87,13 +87,32 @@ adjust_width_for_type(unsigned width, enum jay_type type) static unsigned get_src_words(struct validate_state *validate, jay_inst *I, unsigned s) { + jay_shader *shader = validate->func->shader; + /* TODO: I think this can be simplified */ if (I->op == JAY_OPCODE_EXPAND_QUAD) { return 4; } if (I->op == JAY_OPCODE_ZIP_UGPR16) { - return jay_ugpr_per_grf(validate->func->shader); + return jay_ugpr_per_grf(shader); + } + + if (I->op == JAY_OPCODE_DPAS) { + const unsigned dpas_exec_size = 8 * reg_unit(shader->devinfo); + const unsigned grf_size = shader->devinfo->grf_size; + const unsigned acc_size_B = jay_type_size_bits(jay_dpas_acc_type(I)) / 8; + + unsigned bytes; + switch (s) { + case 0: bytes = jay_dpas_rcount(I) * dpas_exec_size * acc_size_B; break; + case 1: bytes = jay_dpas_sdepth(I) * grf_size; break; + case 2: bytes = jay_dpas_rcount(I) * jay_dpas_sdepth(I) * 4; break; + default: + UNREACHABLE("invalid DPAS source"); + } + + return bytes / (shader->dispatch_width * 4); } unsigned simd_width = jay_simd_width_logical(validate->func->shader, I); @@ -263,7 +282,9 @@ validate_inst(struct validate_state *validate, jay_inst *I) CHECK(!I->src[s].negate || jay_has_src_mods(I, s)); } - if (I->op == JAY_OPCODE_SEL) { + if (I->op == JAY_OPCODE_DPAS) { + CHECK(jay_num_values(I->dst) == get_src_words(validate, I, 0)); + } else if (I->op == JAY_OPCODE_SEL) { CHECK(jay_is_flag(I->src[2]) && "SEL src[2] (selector) must be a flag"); } else if (I->op == JAY_OPCODE_SYNC) { CHECK(validate->post_ra && "SYNC does not exist while scheduling"); diff --git a/src/intel/vulkan/anv_physical_device.c b/src/intel/vulkan/anv_physical_device.c index 9788a1bf037..eae97522782 100644 --- 
a/src/intel/vulkan/anv_physical_device.c +++ b/src/intel/vulkan/anv_physical_device.c @@ -2900,9 +2900,7 @@ anv_physical_device_try_create(struct vk_instance *vk_instance, goto fail_base; device->has_cooperative_matrix = - (device->info.has_systolic || - debug_get_bool_option("INTEL_LOWER_DPAS", false)) && - !intel_use_jay_any_stage(&device->info); + device->info.has_systolic || debug_get_bool_option("INTEL_LOWER_DPAS", false); /* Because of Xe2 PAT selected compression and the Vulkan spec requirement * to always return the same memory types for Images with same properties