mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-10 01:18:18 +02:00
jay: Handle dpas_intel intrinsic
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41913>
This commit is contained in:
parent
0f309dbfe5
commit
0aff5e006c
10 changed files with 166 additions and 15 deletions
|
|
@ -136,7 +136,8 @@ can_access_accum(jay_shader *shader, jay_inst *I, signed src)
|
|||
/* "No Accumulator usage for Control Flow, Math, Send, DPAS instructions." */
|
||||
if (jay_op_is_control_flow(I->op) ||
|
||||
I->op == JAY_OPCODE_MATH ||
|
||||
I->op == JAY_OPCODE_SEND) {
|
||||
I->op == JAY_OPCODE_SEND ||
|
||||
I->op == JAY_OPCODE_DPAS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ if TYPE_CHECKING:
|
|||
|
||||
def infer_type(op: 'Opcode') -> bool:
|
||||
return op.has_dest and (set(op.types) <= set(["u1", "u32", "u64"]) or
|
||||
op.name == 'mov')
|
||||
op.name == 'mov') and op.name != 'dpas'
|
||||
|
||||
|
||||
def signature(op: 'Opcode', with_dest: bool = True, with_types: bool = False,
|
||||
|
|
@ -107,6 +107,8 @@ _jay_${OPCODE}(${signature(op, with_types = True)})
|
|||
|
||||
#define jay_${OPCODE}(${signature(op, with_types = True, mode = 'call')}) _jay_${OPCODE}(${signature(op, with_types = True, src = 'JAY_BUILD_SRC({})', mode='call')})
|
||||
|
||||
% if op.name not in no_typed_wrappers:
|
||||
|
||||
% for type in op.types:
|
||||
static inline ${'jay_def' if op.has_dest else 'void'}
|
||||
_jay_${OPCODE}_${type}(${signature(op, with_dest = False)})
|
||||
|
|
@ -123,6 +125,8 @@ _jay_${OPCODE}_${type}(${signature(op, with_dest = False)})
|
|||
'call')}) _jay_${OPCODE}_${type}(${signature(op, src='JAY_BUILD_SRC({})', mode = 'call', with_dest = False)})
|
||||
% endfor
|
||||
|
||||
% endif
|
||||
|
||||
% endfor
|
||||
|
||||
#undef type_assert
|
||||
|
|
@ -141,7 +145,8 @@ def main() -> int:
|
|||
f.write(Template(TEMPLATE).render(
|
||||
opcodes=ops,
|
||||
signature=signature,
|
||||
infer_type=infer_type))
|
||||
infer_type=infer_type,
|
||||
no_typed_wrappers={'dpas'}))
|
||||
except Exception:
|
||||
print(exceptions.text_error_template().render())
|
||||
return 1
|
||||
|
|
|
|||
|
|
@ -212,6 +212,27 @@ jay_alu_source_type(nir_alu_instr *alu, unsigned i)
|
|||
nir_src_bit_size(alu->src[i].src));
|
||||
}
|
||||
|
||||
/*
 * Map a GLSL base type to the jay type used to describe the same scalar.
 *
 * Handles the 8/16/32/64-bit signed and unsigned integer types plus
 * float, float16, bfloat16 and double. Every other glsl_base_type value
 * falls into the default case and hits UNREACHABLE.
 */
static enum jay_type
|
||||
jay_type_for_glsl_base_type(enum glsl_base_type t)
|
||||
{
|
||||
switch (t) {
|
||||
case GLSL_TYPE_UINT: return JAY_TYPE_U32;
|
||||
case GLSL_TYPE_INT: return JAY_TYPE_S32;
|
||||
case GLSL_TYPE_FLOAT: return JAY_TYPE_F32;
|
||||
case GLSL_TYPE_FLOAT16: return JAY_TYPE_F16;
|
||||
case GLSL_TYPE_BFLOAT16: return JAY_TYPE_BF16;
|
||||
case GLSL_TYPE_DOUBLE: return JAY_TYPE_F64;
|
||||
case GLSL_TYPE_UINT16: return JAY_TYPE_U16;
|
||||
case GLSL_TYPE_INT16: return JAY_TYPE_S16;
|
||||
case GLSL_TYPE_UINT8: return JAY_TYPE_U8;
|
||||
case GLSL_TYPE_INT8: return JAY_TYPE_S8;
|
||||
case GLSL_TYPE_UINT64: return JAY_TYPE_U64;
|
||||
case GLSL_TYPE_INT64: return JAY_TYPE_S64;
|
||||
/* Any base type not listed above has no jay equivalent here. */
default:
|
||||
UNREACHABLE("invalid base type");
|
||||
}
|
||||
}
|
||||
|
||||
static inline jay_def
|
||||
nj_def(nir_def *def)
|
||||
{
|
||||
|
|
@ -1335,6 +1356,39 @@ jay_emit_rt_trace_ray(struct nir_to_jay_state *nj, nir_intrinsic_instr *instr)
|
|||
jay_SCHEDULE_BARRIER(&nj->bld);
|
||||
}
|
||||
|
||||
/*
 * Emit a jay DPAS instruction for a NIR intrinsic (dispatched from
 * jay_emit_intrinsic for nir_intrinsic_dpas_intel).
 *
 * The systolic depth, repeat count, destination/source base types and the
 * saturate flag are read from the intrinsic's indices. The stage is asserted
 * to be one that uses workgroups, and cs.uses_systolic is set in the
 * program data once the instruction has been built.
 */
static void
|
||||
jay_emit_dpas(struct nir_to_jay_state *nj,
|
||||
nir_intrinsic_instr *intr)
|
||||
{
|
||||
assert(mesa_shader_stage_uses_workgroup(nj->nir->info.stage));
|
||||
|
||||
/* If every component of the accumulator source (NIR src[0]) resolves to
 * zero, the null register is passed in its place.
 */
|
||||
bool src0_use_null = true;
|
||||
for (unsigned c = 0; c < nir_src_num_components(intr->src[0]); c++) {
|
||||
nir_scalar val = nir_scalar_resolved(intr->src[0].ssa, c);
|
||||
src0_use_null &= nir_scalar_is_zero(val);
|
||||
}
|
||||
|
||||
jay_builder *b = &nj->bld;
|
||||
jay_def dst = nj_def(&intr->def);
|
||||
jay_def src[3] = {
|
||||
src0_use_null ? jay_null()
|
||||
: jay_as_gpr(b, nj_src(intr->src[0])),
|
||||
jay_as_gpr(b, nj_src(intr->src[1])),
|
||||
jay_as_gpr(b, nj_src(intr->src[2])),
|
||||
};
|
||||
|
||||
/* Jay follows HW source order, so NIR sources 1 and 2 are passed swapped.
 * The sbid is emitted as 0 here.
 * NOTE(review): presumably the real sbid is assigned later by scoreboard
 * lowering -- confirm.
 */
|
||||
jay_DPAS(b, dst, src[0], src[2], src[1],
|
||||
nir_intrinsic_systolic_depth(intr),
|
||||
nir_intrinsic_repeat_count(intr),
|
||||
jay_type_for_glsl_base_type(nir_intrinsic_dest_base_type(intr)),
|
||||
jay_type_for_glsl_base_type(nir_intrinsic_src_base_type(intr)),
|
||||
/* sbid */ 0)->saturate = nir_intrinsic_saturate(intr);
|
||||
|
||||
nj->s->prog_data->cs.uses_systolic = true;
|
||||
}
|
||||
|
||||
static void
|
||||
jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
|
||||
{
|
||||
|
|
@ -1836,6 +1890,10 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
|
|||
jay_emit_rt_trace_ray(nj, intr);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_dpas_intel:
|
||||
jay_emit_dpas(nj, intr);
|
||||
break;
|
||||
|
||||
default:
|
||||
#ifndef NDEBUG
|
||||
assert(intr->intrinsic < nir_num_intrinsics);
|
||||
|
|
|
|||
|
|
@ -981,7 +981,8 @@ jay_is_no_mask(const jay_inst *I)
|
|||
I->op == JAY_OPCODE_QUAD_SWIZZLE ||
|
||||
I->op == JAY_OPCODE_DESWIZZLE_EVEN ||
|
||||
I->op == JAY_OPCODE_DESWIZZLE_ODD ||
|
||||
I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS;
|
||||
I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS ||
|
||||
I->op == JAY_OPCODE_DPAS;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -54,25 +54,31 @@ sync_sbids(jay_builder *b, uint32_t mask, gen_sbid_mode mode)
|
|||
static inline bool
|
||||
jay_inst_is_unordered(const jay_inst *I)
|
||||
{
|
||||
return I->op == JAY_OPCODE_SEND;
|
||||
return I->op == JAY_OPCODE_SEND ||
|
||||
I->op == JAY_OPCODE_DPAS;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
jay_inst_has_sbid(const jay_inst *I)
|
||||
{
|
||||
return I->op == JAY_OPCODE_SEND && !jay_send_eot(I);
|
||||
return jay_inst_is_unordered(I) &&
|
||||
!(I->op == JAY_OPCODE_SEND && jay_send_eot(I));
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
jay_inst_sbid(const jay_inst *I)
|
||||
{
|
||||
return jay_send_sbid(I);
|
||||
return I->op == JAY_OPCODE_SEND ? jay_send_sbid(I)
|
||||
: jay_dpas_sbid(I);
|
||||
}
|
||||
|
||||
static inline void
|
||||
jay_inst_set_sbid(jay_inst *I, unsigned sbid)
|
||||
{
|
||||
jay_set_send_sbid(I, sbid);
|
||||
if (I->op == JAY_OPCODE_SEND)
|
||||
jay_set_send_sbid(I, sbid);
|
||||
else
|
||||
jay_set_dpas_sbid(I, sbid);
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -218,6 +224,9 @@ inferred_sync_pipe(const struct intel_device_info *devinfo, const jay_inst *I)
|
|||
|
||||
if (I->op == JAY_OPCODE_SEND) {
|
||||
return GEN_PIPE_NONE;
|
||||
} else if (I->op == JAY_OPCODE_DPAS) {
|
||||
return jay_type_is_any_float(jay_dpas_acc_type(I)) ? GEN_PIPE_FLOAT
|
||||
: GEN_PIPE_INT;
|
||||
} else if (devinfo->verx10 >= 125 && type == JAY_TYPE_F64) {
|
||||
/* Avoid emitting (RegDist, SWSB) annotations for long instructions on
|
||||
* platforms where they are unordered as they may not be allowed.
|
||||
|
|
@ -286,6 +295,10 @@ lower_regdist(jay_function *func, jay_inst *I, struct swsb_state *ctx)
|
|||
ctx->last_sync = I;
|
||||
uint32_t sbid_mask = 0;
|
||||
if (jay_sync_op(I) == TGL_SYNC_NOP) {
|
||||
/* The SYNC.nops added by this function that are RegDist-only, are
|
||||
* added *before* the instruction so are not seen here.
|
||||
*/
|
||||
assert(I->dep.mode != GEN_SBID_NULL);
|
||||
sbid_mask = BITFIELD_BIT(I->dep.sbid);
|
||||
} else if (jay_sync_op(I) == TGL_SYNC_ALLRD ||
|
||||
jay_sync_op(I) == TGL_SYNC_ALLWR) {
|
||||
|
|
@ -415,6 +428,24 @@ lower_regdist(jay_function *func, jay_inst *I, struct swsb_state *ctx)
|
|||
GEN_PIPE_NONE,
|
||||
};
|
||||
|
||||
/* DPAS can only represent in-order dependency for its inferred pipe,
|
||||
* so if it depends on something else, add an extra SYNC.nop for that.
|
||||
*/
|
||||
if (I->op == JAY_OPCODE_DPAS &&
|
||||
wait_pipes &&
|
||||
(!single_wait ||
|
||||
last_pipe != inferred_sync_pipe(func->shader->devinfo, I))) {
|
||||
assert(I->dep.regdist > 0);
|
||||
jay_builder b = jay_init_builder(func, jay_before_inst(I));
|
||||
|
||||
jay_inst *sync = jay_SYNC(&b, jay_null(), TGL_SYNC_NOP);
|
||||
sync->dep.regdist = I->dep.regdist;
|
||||
sync->dep.pipe = I->dep.pipe;
|
||||
|
||||
I->dep.regdist = 0;
|
||||
I->dep.pipe = GEN_PIPE_NONE;
|
||||
}
|
||||
|
||||
/* Fold the immediate preceding SYNC.nop into this instruction, allowing
|
||||
* us to wait on both ALU and a SBID in the same annotation. We cannot do
|
||||
* this safely in the presence of predication or SIMD splitting that could
|
||||
|
|
@ -475,7 +506,14 @@ jay_lower_scoreboard_trivial(jay_shader *shader)
|
|||
{
|
||||
jay_foreach_inst_in_shader_safe(shader, func, I) {
|
||||
if (jay_inst_has_sbid(I)) {
|
||||
I->dep = gen_swsb_dst_dep(gen_swsb_sbid(GEN_SBID_SET, 0), 1);
|
||||
if (I->op == JAY_OPCODE_DPAS) {
|
||||
/* DPAS can't have an A@1, so insert an extra SYNC.nop. */
|
||||
jay_builder before = jay_init_builder(func, jay_before_inst(I));
|
||||
jay_SYNC(&before, jay_null(), TGL_SYNC_NOP)->dep = gen_swsb_regdist(1);
|
||||
I->dep = gen_swsb_sbid(GEN_SBID_SET, 0);
|
||||
} else {
|
||||
I->dep = gen_swsb_dst_dep(gen_swsb_sbid(GEN_SBID_SET, 0), 1);
|
||||
}
|
||||
|
||||
jay_builder b = jay_init_builder(func, jay_after_inst(I));
|
||||
sync_sbids(&b, BITFIELD_BIT(0), GEN_SBID_DST);
|
||||
|
|
|
|||
|
|
@ -222,6 +222,17 @@ op('shuffle', 2, 'u1 u32')
|
|||
# Shuffle with a constant lane index.
|
||||
op('broadcast_imm', 1, 'u1 u32', 0, ['unsigned lane'])
|
||||
|
||||
# Follows hardware source order: C B A. Data is already packed into u32 slots
|
||||
# by NIR; types are used when making the gen_inst.
|
||||
op('dpas', 3, 'u32', 0, [
|
||||
'uint8_t sdepth',
|
||||
'uint8_t rcount',
|
||||
'enum jay_type acc_type',
|
||||
'enum jay_type src_type',
|
||||
'uint8_t sbid',
|
||||
'uint8_t pad[3]',
|
||||
])
|
||||
|
||||
OPCODES = _opcodes
|
||||
|
||||
ENUMS: 'Mapping[str, tuple[str, list[str]]]' = {
|
||||
|
|
|
|||
|
|
@ -25,8 +25,10 @@ predicate_block(jay_builder *b,
|
|||
*/
|
||||
jay_foreach_inst_in_block(block, I) {
|
||||
if (jay_uses_flag(I) ||
|
||||
(I->op == JAY_OPCODE_MIN || I->op == JAY_OPCODE_MAX) ||
|
||||
I->op == JAY_OPCODE_MIN ||
|
||||
I->op == JAY_OPCODE_MAX ||
|
||||
I->op == JAY_OPCODE_CSEL ||
|
||||
I->op == JAY_OPCODE_DPAS ||
|
||||
(condition.file != UFLAG && jay_is_no_mask(I)) ||
|
||||
(--limit) < 0)
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -259,6 +259,7 @@ static const struct {
|
|||
OP(DP4A_SS, DP4A, 3),
|
||||
OP(DP4A_SU, DP4A, 3),
|
||||
OP(DP4A_UU, DP4A, 3),
|
||||
OP(DPAS, DPAS, 3),
|
||||
OP(ELSE, ELSE, 0),
|
||||
OP(ENDIF, ENDIF, 0),
|
||||
OP(EXPAND_QUAD, MOV, 2),
|
||||
|
|
@ -592,6 +593,21 @@ emit(struct jay_codegen *jc,
|
|||
gen->opcode = GEN_OP_HALT;
|
||||
break;
|
||||
|
||||
case JAY_OPCODE_DPAS: {
|
||||
gen_reg_type acc_type = to_gen_reg_type(jay_dpas_acc_type(I));
|
||||
gen_reg_type src_type = to_gen_reg_type(jay_dpas_src_type(I));
|
||||
|
||||
gen->dst = gen_retype(gen->dst, acc_type);
|
||||
gen->src[0] = gen_retype(gen->src[0], acc_type);
|
||||
gen->src[1] = gen_retype(gen->src[1], src_type);
|
||||
gen->src[2] = gen_retype(gen->src[2], src_type);
|
||||
|
||||
gen->dpas.sdepth = jay_dpas_sdepth(I);
|
||||
gen->dpas.rcount = jay_dpas_rcount(I);
|
||||
gen->exec_size = jc->devinfo->ver >= 20 ? 16 : 8;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -87,13 +87,32 @@ adjust_width_for_type(unsigned width, enum jay_type type)
|
|||
static unsigned
|
||||
get_src_words(struct validate_state *validate, jay_inst *I, unsigned s)
|
||||
{
|
||||
jay_shader *shader = validate->func->shader;
|
||||
|
||||
/* TODO: I think this can be simplified */
|
||||
if (I->op == JAY_OPCODE_EXPAND_QUAD) {
|
||||
return 4;
|
||||
}
|
||||
|
||||
if (I->op == JAY_OPCODE_ZIP_UGPR16) {
|
||||
return jay_ugpr_per_grf(validate->func->shader);
|
||||
return jay_ugpr_per_grf(shader);
|
||||
}
|
||||
|
||||
if (I->op == JAY_OPCODE_DPAS) {
|
||||
const unsigned dpas_exec_size = 8 * reg_unit(shader->devinfo);
|
||||
const unsigned grf_size = shader->devinfo->grf_size;
|
||||
const unsigned acc_size_B = jay_type_size_bits(jay_dpas_acc_type(I)) / 8;
|
||||
|
||||
unsigned bytes;
|
||||
switch (s) {
|
||||
case 0: bytes = jay_dpas_rcount(I) * dpas_exec_size * acc_size_B; break;
|
||||
case 1: bytes = jay_dpas_sdepth(I) * grf_size; break;
|
||||
case 2: bytes = jay_dpas_rcount(I) * jay_dpas_sdepth(I) * 4; break;
|
||||
default:
|
||||
UNREACHABLE("invalid DPAS source");
|
||||
}
|
||||
|
||||
return bytes / (shader->dispatch_width * 4);
|
||||
}
|
||||
|
||||
unsigned simd_width = jay_simd_width_logical(validate->func->shader, I);
|
||||
|
|
@ -263,7 +282,9 @@ validate_inst(struct validate_state *validate, jay_inst *I)
|
|||
CHECK(!I->src[s].negate || jay_has_src_mods(I, s));
|
||||
}
|
||||
|
||||
if (I->op == JAY_OPCODE_SEL) {
|
||||
if (I->op == JAY_OPCODE_DPAS) {
|
||||
CHECK(jay_num_values(I->dst) == get_src_words(validate, I, 0));
|
||||
} else if (I->op == JAY_OPCODE_SEL) {
|
||||
CHECK(jay_is_flag(I->src[2]) && "SEL src[2] (selector) must be a flag");
|
||||
} else if (I->op == JAY_OPCODE_SYNC) {
|
||||
CHECK(validate->post_ra && "SYNC does not exist while scheduling");
|
||||
|
|
|
|||
|
|
@ -2900,9 +2900,7 @@ anv_physical_device_try_create(struct vk_instance *vk_instance,
|
|||
goto fail_base;
|
||||
|
||||
device->has_cooperative_matrix =
|
||||
(device->info.has_systolic ||
|
||||
debug_get_bool_option("INTEL_LOWER_DPAS", false)) &&
|
||||
!intel_use_jay_any_stage(&device->info);
|
||||
device->info.has_systolic || debug_get_bool_option("INTEL_LOWER_DPAS", false);
|
||||
|
||||
/* Because of Xe2 PAT selected compression and the Vulkan spec requirement
|
||||
* to always return the same memory types for Images with same properties
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue