jay: Handle dpas_intel intrinsic
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41913>
This commit is contained in:
Caio Oliveira 2026-05-29 21:41:39 -07:00 committed by Marge Bot
parent 0f309dbfe5
commit 0aff5e006c
10 changed files with 166 additions and 15 deletions

View file

@ -136,7 +136,8 @@ can_access_accum(jay_shader *shader, jay_inst *I, signed src)
/* "No Accumulator usage for Control Flow, Math, Send, DPAS instructions." */
if (jay_op_is_control_flow(I->op) ||
I->op == JAY_OPCODE_MATH ||
I->op == JAY_OPCODE_SEND) {
I->op == JAY_OPCODE_SEND ||
I->op == JAY_OPCODE_DPAS) {
return false;
}

View file

@ -16,7 +16,7 @@ if TYPE_CHECKING:
def infer_type(op: 'Opcode') -> bool:
return op.has_dest and (set(op.types) <= set(["u1", "u32", "u64"]) or
op.name == 'mov')
op.name == 'mov') and op.name != 'dpas'
def signature(op: 'Opcode', with_dest: bool = True, with_types: bool = False,
@ -107,6 +107,8 @@ _jay_${OPCODE}(${signature(op, with_types = True)})
#define jay_${OPCODE}(${signature(op, with_types = True, mode = 'call')}) _jay_${OPCODE}(${signature(op, with_types = True, src = 'JAY_BUILD_SRC({})', mode='call')})
% if op.name not in no_typed_wrappers:
% for type in op.types:
static inline ${'jay_def' if op.has_dest else 'void'}
_jay_${OPCODE}_${type}(${signature(op, with_dest = False)})
@ -123,6 +125,8 @@ _jay_${OPCODE}_${type}(${signature(op, with_dest = False)})
'call')}) _jay_${OPCODE}_${type}(${signature(op, src='JAY_BUILD_SRC({})', mode = 'call', with_dest = False)})
% endfor
% endif
% endfor
#undef type_assert
@ -141,7 +145,8 @@ def main() -> int:
f.write(Template(TEMPLATE).render(
opcodes=ops,
signature=signature,
infer_type=infer_type))
infer_type=infer_type,
no_typed_wrappers={'dpas'}))
except Exception:
print(exceptions.text_error_template().render())
return 1

View file

@ -212,6 +212,27 @@ jay_alu_source_type(nir_alu_instr *alu, unsigned i)
nir_src_bit_size(alu->src[i].src));
}
/**
 * Translate a GLSL base type into the corresponding Jay type.
 *
 * Only the scalar integer and floating-point base types listed below are
 * handled; any other value reaches UNREACHABLE.
 */
static enum jay_type
jay_type_for_glsl_base_type(enum glsl_base_type t)
{
   switch (t) {
   /* Floating-point types. */
   case GLSL_TYPE_BFLOAT16:
      return JAY_TYPE_BF16;
   case GLSL_TYPE_FLOAT16:
      return JAY_TYPE_F16;
   case GLSL_TYPE_FLOAT:
      return JAY_TYPE_F32;
   case GLSL_TYPE_DOUBLE:
      return JAY_TYPE_F64;

   /* Unsigned integer types. */
   case GLSL_TYPE_UINT8:
      return JAY_TYPE_U8;
   case GLSL_TYPE_UINT16:
      return JAY_TYPE_U16;
   case GLSL_TYPE_UINT:
      return JAY_TYPE_U32;
   case GLSL_TYPE_UINT64:
      return JAY_TYPE_U64;

   /* Signed integer types. */
   case GLSL_TYPE_INT8:
      return JAY_TYPE_S8;
   case GLSL_TYPE_INT16:
      return JAY_TYPE_S16;
   case GLSL_TYPE_INT:
      return JAY_TYPE_S32;
   case GLSL_TYPE_INT64:
      return JAY_TYPE_S64;

   default:
      UNREACHABLE("invalid base type");
   }
}
static inline jay_def
nj_def(nir_def *def)
{
@ -1335,6 +1356,39 @@ jay_emit_rt_trace_ray(struct nir_to_jay_state *nj, nir_intrinsic_instr *instr)
jay_SCHEDULE_BARRIER(&nj->bld);
}
/**
 * Emit a Jay DPAS instruction for a nir_intrinsic_dpas_intel intrinsic.
 *
 * NIR source 0 is the accumulator; when every one of its components resolves
 * to zero, the null register is passed in its place. NIR sources 1 and 2 are
 * swapped when building the instruction because Jay follows the hardware
 * source order. The systolic depth, repeat count, base types and saturate
 * flag are all read from the intrinsic. The shader's compute program data is
 * also marked as using systolic.
 */
static void
jay_emit_dpas(struct nir_to_jay_state *nj,
              nir_intrinsic_instr *intr)
{
   assert(mesa_shader_stage_uses_workgroup(nj->nir->info.stage));

   /* For Accumulator source we can use null register, provided that all of
    * its components are zero.
    */
   const unsigned acc_comps = nir_src_num_components(intr->src[0]);
   bool acc_is_zero = true;
   for (unsigned i = 0; i < acc_comps; i++) {
      if (!nir_scalar_is_zero(nir_scalar_resolved(intr->src[0].ssa, i))) {
         acc_is_zero = false;
      }
   }

   jay_builder *b = &nj->bld;
   jay_def dst = nj_def(&intr->def);

   jay_def acc = acc_is_zero ? jay_null()
                             : jay_as_gpr(b, nj_src(intr->src[0]));
   jay_def nir_src1 = jay_as_gpr(b, nj_src(intr->src[1]));
   jay_def nir_src2 = jay_as_gpr(b, nj_src(intr->src[2]));

   const enum jay_type acc_type =
      jay_type_for_glsl_base_type(nir_intrinsic_dest_base_type(intr));
   const enum jay_type src_type =
      jay_type_for_glsl_base_type(nir_intrinsic_src_base_type(intr));

   /* Jay follows HW source order, hence NIR source 2 precedes NIR source 1. */
   jay_inst *dpas = jay_DPAS(b, dst, acc, nir_src2, nir_src1,
                             nir_intrinsic_systolic_depth(intr),
                             nir_intrinsic_repeat_count(intr),
                             acc_type,
                             src_type,
                             /* sbid */ 0);
   dpas->saturate = nir_intrinsic_saturate(intr);

   nj->s->prog_data->cs.uses_systolic = true;
}
static void
jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
{
@ -1836,6 +1890,10 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
jay_emit_rt_trace_ray(nj, intr);
break;
case nir_intrinsic_dpas_intel:
jay_emit_dpas(nj, intr);
break;
default:
#ifndef NDEBUG
assert(intr->intrinsic < nir_num_intrinsics);

View file

@ -981,7 +981,8 @@ jay_is_no_mask(const jay_inst *I)
I->op == JAY_OPCODE_QUAD_SWIZZLE ||
I->op == JAY_OPCODE_DESWIZZLE_EVEN ||
I->op == JAY_OPCODE_DESWIZZLE_ODD ||
I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS;
I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS ||
I->op == JAY_OPCODE_DPAS;
}
/**

View file

@ -54,25 +54,31 @@ sync_sbids(jay_builder *b, uint32_t mask, gen_sbid_mode mode)
static inline bool
jay_inst_is_unordered(const jay_inst *I)
{
return I->op == JAY_OPCODE_SEND;
return I->op == JAY_OPCODE_SEND ||
I->op == JAY_OPCODE_DPAS;
}
static inline bool
jay_inst_has_sbid(const jay_inst *I)
{
return I->op == JAY_OPCODE_SEND && !jay_send_eot(I);
return jay_inst_is_unordered(I) &&
!(I->op == JAY_OPCODE_SEND && jay_send_eot(I));
}
static inline unsigned
jay_inst_sbid(const jay_inst *I)
{
return jay_send_sbid(I);
return I->op == JAY_OPCODE_SEND ? jay_send_sbid(I)
: jay_dpas_sbid(I);
}
static inline void
jay_inst_set_sbid(jay_inst *I, unsigned sbid)
{
jay_set_send_sbid(I, sbid);
if (I->op == JAY_OPCODE_SEND)
jay_set_send_sbid(I, sbid);
else
jay_set_dpas_sbid(I, sbid);
}
static void
@ -218,6 +224,9 @@ inferred_sync_pipe(const struct intel_device_info *devinfo, const jay_inst *I)
if (I->op == JAY_OPCODE_SEND) {
return GEN_PIPE_NONE;
} else if (I->op == JAY_OPCODE_DPAS) {
return jay_type_is_any_float(jay_dpas_acc_type(I)) ? GEN_PIPE_FLOAT
: GEN_PIPE_INT;
} else if (devinfo->verx10 >= 125 && type == JAY_TYPE_F64) {
/* Avoid emitting (RegDist, SWSB) annotations for long instructions on
* platforms where they are unordered as they may not be allowed.
@ -286,6 +295,10 @@ lower_regdist(jay_function *func, jay_inst *I, struct swsb_state *ctx)
ctx->last_sync = I;
uint32_t sbid_mask = 0;
if (jay_sync_op(I) == TGL_SYNC_NOP) {
/* The SYNC.nops added by this function that are RegDist-only, are
* added *before* the instruction so are not seen here.
*/
assert(I->dep.mode != GEN_SBID_NULL);
sbid_mask = BITFIELD_BIT(I->dep.sbid);
} else if (jay_sync_op(I) == TGL_SYNC_ALLRD ||
jay_sync_op(I) == TGL_SYNC_ALLWR) {
@ -415,6 +428,24 @@ lower_regdist(jay_function *func, jay_inst *I, struct swsb_state *ctx)
GEN_PIPE_NONE,
};
/* DPAS can only represent in-order dependency for its inferred pipe,
* so if it depends on something else, add an extra SYNC.nop for that.
*/
if (I->op == JAY_OPCODE_DPAS &&
wait_pipes &&
(!single_wait ||
last_pipe != inferred_sync_pipe(func->shader->devinfo, I))) {
assert(I->dep.regdist > 0);
jay_builder b = jay_init_builder(func, jay_before_inst(I));
jay_inst *sync = jay_SYNC(&b, jay_null(), TGL_SYNC_NOP);
sync->dep.regdist = I->dep.regdist;
sync->dep.pipe = I->dep.pipe;
I->dep.regdist = 0;
I->dep.pipe = GEN_PIPE_NONE;
}
/* Fold the immediate preceding SYNC.nop into this instruction, allowing
* us to wait on both ALU and a SBID in the same annotation. We cannot do
* this safely in the presence of predication or SIMD splitting that could
@ -475,7 +506,14 @@ jay_lower_scoreboard_trivial(jay_shader *shader)
{
jay_foreach_inst_in_shader_safe(shader, func, I) {
if (jay_inst_has_sbid(I)) {
I->dep = gen_swsb_dst_dep(gen_swsb_sbid(GEN_SBID_SET, 0), 1);
if (I->op == JAY_OPCODE_DPAS) {
/* DPAS can't have an A@1, so insert an extra SYNC.nop. */
jay_builder before = jay_init_builder(func, jay_before_inst(I));
jay_SYNC(&before, jay_null(), TGL_SYNC_NOP)->dep = gen_swsb_regdist(1);
I->dep = gen_swsb_sbid(GEN_SBID_SET, 0);
} else {
I->dep = gen_swsb_dst_dep(gen_swsb_sbid(GEN_SBID_SET, 0), 1);
}
jay_builder b = jay_init_builder(func, jay_after_inst(I));
sync_sbids(&b, BITFIELD_BIT(0), GEN_SBID_DST);

View file

@ -222,6 +222,17 @@ op('shuffle', 2, 'u1 u32')
# Shuffle with a constant lane index.
op('broadcast_imm', 1, 'u1 u32', 0, ['unsigned lane'])
# DPAS opcode: three sources, following hardware source order: C B A.
# The data is already packed into u32 slots by NIR, so the opcode itself is
# declared as u32; acc_type and src_type are used when making the gen_inst.
#
# Extra fields:
#   sdepth, rcount     - systolic depth and repeat count of the operation.
#   acc_type, src_type - Jay types for the accumulator/destination and for
#                        the other two sources.
#   sbid               - scoreboard ID of the instruction.
#   pad[3]             - presumably explicit padding of the field struct;
#                        TODO confirm against the generated header.
op('dpas', 3, 'u32', 0, [
'uint8_t sdepth',
'uint8_t rcount',
'enum jay_type acc_type',
'enum jay_type src_type',
'uint8_t sbid',
'uint8_t pad[3]',
])
OPCODES = _opcodes
ENUMS: 'Mapping[str, tuple[str, list[str]]]' = {

View file

@ -25,8 +25,10 @@ predicate_block(jay_builder *b,
*/
jay_foreach_inst_in_block(block, I) {
if (jay_uses_flag(I) ||
(I->op == JAY_OPCODE_MIN || I->op == JAY_OPCODE_MAX) ||
I->op == JAY_OPCODE_MIN ||
I->op == JAY_OPCODE_MAX ||
I->op == JAY_OPCODE_CSEL ||
I->op == JAY_OPCODE_DPAS ||
(condition.file != UFLAG && jay_is_no_mask(I)) ||
(--limit) < 0)
return false;

View file

@ -259,6 +259,7 @@ static const struct {
OP(DP4A_SS, DP4A, 3),
OP(DP4A_SU, DP4A, 3),
OP(DP4A_UU, DP4A, 3),
OP(DPAS, DPAS, 3),
OP(ELSE, ELSE, 0),
OP(ENDIF, ENDIF, 0),
OP(EXPAND_QUAD, MOV, 2),
@ -592,6 +593,21 @@ emit(struct jay_codegen *jc,
gen->opcode = GEN_OP_HALT;
break;
case JAY_OPCODE_DPAS: {
gen_reg_type acc_type = to_gen_reg_type(jay_dpas_acc_type(I));
gen_reg_type src_type = to_gen_reg_type(jay_dpas_src_type(I));
gen->dst = gen_retype(gen->dst, acc_type);
gen->src[0] = gen_retype(gen->src[0], acc_type);
gen->src[1] = gen_retype(gen->src[1], src_type);
gen->src[2] = gen_retype(gen->src[2], src_type);
gen->dpas.sdepth = jay_dpas_sdepth(I);
gen->dpas.rcount = jay_dpas_rcount(I);
gen->exec_size = jc->devinfo->ver >= 20 ? 16 : 8;
break;
}
default:
break;
}

View file

@ -87,13 +87,32 @@ adjust_width_for_type(unsigned width, enum jay_type type)
static unsigned
get_src_words(struct validate_state *validate, jay_inst *I, unsigned s)
{
jay_shader *shader = validate->func->shader;
/* TODO: I think this can be simplified */
if (I->op == JAY_OPCODE_EXPAND_QUAD) {
return 4;
}
if (I->op == JAY_OPCODE_ZIP_UGPR16) {
return jay_ugpr_per_grf(validate->func->shader);
return jay_ugpr_per_grf(shader);
}
if (I->op == JAY_OPCODE_DPAS) {
const unsigned dpas_exec_size = 8 * reg_unit(shader->devinfo);
const unsigned grf_size = shader->devinfo->grf_size;
const unsigned acc_size_B = jay_type_size_bits(jay_dpas_acc_type(I)) / 8;
unsigned bytes;
switch (s) {
case 0: bytes = jay_dpas_rcount(I) * dpas_exec_size * acc_size_B; break;
case 1: bytes = jay_dpas_sdepth(I) * grf_size; break;
case 2: bytes = jay_dpas_rcount(I) * jay_dpas_sdepth(I) * 4; break;
default:
UNREACHABLE("invalid DPAS source");
}
return bytes / (shader->dispatch_width * 4);
}
unsigned simd_width = jay_simd_width_logical(validate->func->shader, I);
@ -263,7 +282,9 @@ validate_inst(struct validate_state *validate, jay_inst *I)
CHECK(!I->src[s].negate || jay_has_src_mods(I, s));
}
if (I->op == JAY_OPCODE_SEL) {
if (I->op == JAY_OPCODE_DPAS) {
CHECK(jay_num_values(I->dst) == get_src_words(validate, I, 0));
} else if (I->op == JAY_OPCODE_SEL) {
CHECK(jay_is_flag(I->src[2]) && "SEL src[2] (selector) must be a flag");
} else if (I->op == JAY_OPCODE_SYNC) {
CHECK(validate->post_ra && "SYNC does not exist while scheduling");

View file

@ -2900,9 +2900,7 @@ anv_physical_device_try_create(struct vk_instance *vk_instance,
goto fail_base;
device->has_cooperative_matrix =
(device->info.has_systolic ||
debug_get_bool_option("INTEL_LOWER_DPAS", false)) &&
!intel_use_jay_any_stage(&device->info);
device->info.has_systolic || debug_get_bool_option("INTEL_LOWER_DPAS", false);
/* Because of Xe2 PAT selected compression and the Vulkan spec requirement
* to always return the same memory types for Images with same properties