diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c index 54ef7db976f..ed046307755 100644 --- a/src/panfrost/compiler/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost_compile.c @@ -3515,6 +3515,20 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false); break; } + case nir_op_udot_4x8_uadd_sat: + case nir_op_udot_4x8_uadd: { + assert(b->shader->arch >= 9); + bi_idpadd_v4u8_to(b, dst, s0, s1, s2, + instr->op == nir_op_udot_4x8_uadd_sat); + break; + } + case nir_op_sdot_4x8_iadd_sat: + case nir_op_sdot_4x8_iadd: { + assert(b->shader->arch >= 9); + bi_idpadd_v4s8_to(b, dst, s0, s1, s2, + instr->op == nir_op_sdot_4x8_iadd_sat); + break; + } default: fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name); diff --git a/src/panfrost/compiler/bifrost_compile.h b/src/panfrost/compiler/bifrost_compile.h index a7a28c31881..c5de9b893da 100644 --- a/src/panfrost/compiler/bifrost_compile.h +++ b/src/panfrost/compiler/bifrost_compile.h @@ -148,6 +148,10 @@ void bifrost_compile_shader_nir(nir_shader *nir, .support_indirect_inputs = (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES), \ .lower_hadd = arch >= 11, \ .discard_is_demote = true, \ + .has_udot_4x8 = arch >= 9, \ + .has_udot_4x8_sat = arch >= 9, \ + .has_sdot_4x8 = arch >= 9, \ + .has_sdot_4x8_sat = arch >= 9, \ }; DEFINE_OPTIONS(6); diff --git a/src/panfrost/compiler/valhall/ISA.xml b/src/panfrost/compiler/valhall/ISA.xml index 2e2c060986f..9708ea4caf9 100644 --- a/src/panfrost/compiler/valhall/ISA.xml +++ b/src/panfrost/compiler/valhall/ISA.xml @@ -2235,21 +2235,20 @@ Z coordinate as 32-bit floating point - + 8-bit integer dot product between 4 channel vectors, intended for machine learning. Available in both unsigned and signed variants, controlling sign-extension/zero-extension behaviour to the final 32-bit destination. Saturation is available. 
Corresponds to the `cl_arm_integer_dot_product_*` - family of OpenCL extensions. Not for actual use, just for completeness. - Instead, use your platform's neural accelerator. + family of OpenCL extensions. For $A, B \in \{ 0, \ldots, 255 \}^4$ (unsigned variant) or $A, B \in \{ -128, \ldots, 127 \}^4$ (signed variant) and $\text{Accumulator} \in \mathbb{Z}$, calculates $(A \cdot B) + \text{Accumulator}$ and optionally saturates. - - + + A B Accumulator diff --git a/src/panfrost/compiler/valhall/valhall.c.py b/src/panfrost/compiler/valhall/valhall.c.py index bc9c2d87740..da4f2fcd913 100644 --- a/src/panfrost/compiler/valhall/valhall.c.py +++ b/src/panfrost/compiler/valhall/valhall.c.py @@ -88,8 +88,6 @@ SKIP = set([ "NOT_OLD.i64", # TODO - "IDP.v4s8", - "IDP.v4u8", "FATAN_ASSIST.f32", "SEG_ADD.u64", "TEX_DUAL",