diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c
index 54ef7db976f..ed046307755 100644
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@@ -3515,6 +3515,20 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
break;
}
+ case nir_op_udot_4x8_uadd_sat: /* saturating and plain opcodes share one emit path */
+ case nir_op_udot_4x8_uadd: {
+ assert(b->shader->arch >= 9); /* same arch >= 9 threshold as has_udot_4x8* in bifrost_compile.h */
+ bi_idpadd_v4u8_to(b, dst, s0, s1, s2, /* NOTE(review): s0/s1/s2 presumably map to A, B, Accumulator in ISA.xml; confirm */
+ instr->op == nir_op_udot_4x8_uadd_sat); /* last argument is true only for the _sat opcode */
+ break;
+ }
+ case nir_op_sdot_4x8_iadd_sat: /* saturating and plain opcodes share one emit path */
+ case nir_op_sdot_4x8_iadd: {
+ assert(b->shader->arch >= 9); /* same arch >= 9 threshold as has_sdot_4x8* in bifrost_compile.h */
+ bi_idpadd_v4s8_to(b, dst, s0, s1, s2, /* NOTE(review): s0/s1/s2 presumably map to A, B, Accumulator in ISA.xml; confirm */
+ instr->op == nir_op_sdot_4x8_iadd_sat); /* last argument is true only for the _sat opcode */
+ break;
+ }
default:
fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
diff --git a/src/panfrost/compiler/bifrost_compile.h b/src/panfrost/compiler/bifrost_compile.h
index a7a28c31881..c5de9b893da 100644
--- a/src/panfrost/compiler/bifrost_compile.h
+++ b/src/panfrost/compiler/bifrost_compile.h
@@ -148,6 +148,10 @@ void bifrost_compile_shader_nir(nir_shader *nir,
.support_indirect_inputs = (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES), \
.lower_hadd = arch >= 11, \
.discard_is_demote = true, \
+ .has_udot_4x8 = arch >= 9, \
+ .has_udot_4x8_sat = arch >= 9, \
+ .has_sdot_4x8 = arch >= 9, \
+ .has_sdot_4x8_sat = arch >= 9, \
};
DEFINE_OPTIONS(6);
diff --git a/src/panfrost/compiler/valhall/ISA.xml b/src/panfrost/compiler/valhall/ISA.xml
index 2e2c060986f..9708ea4caf9 100644
--- a/src/panfrost/compiler/valhall/ISA.xml
+++ b/src/panfrost/compiler/valhall/ISA.xml
@@ -2235,21 +2235,20 @@
Z coordinate as 32-bit floating point
-
+
8-bit integer dot product between 4 channel vectors, intended for machine
learning. Available in both unsigned and signed variants, controlling
sign-extension/zero-extension behaviour to the final 32-bit destination.
Saturation is available. Corresponds to the `cl_arm_integer_dot_product_*`
- family of OpenCL extensions. Not for actual use, just for completeness.
- Instead, use your platform's neural accelerator.
+ family of OpenCL extensions.
For $A, B \in \{ 0, \ldots, 255 \}^4$ and $\text{Accumulator} \in
\mathbb{Z}$, calculates $(A \cdot B) + \text{Accumulator}$ and optionally
saturates.
-
-
+
+
A
B
Accumulator
diff --git a/src/panfrost/compiler/valhall/valhall.c.py b/src/panfrost/compiler/valhall/valhall.c.py
index bc9c2d87740..da4f2fcd913 100644
--- a/src/panfrost/compiler/valhall/valhall.c.py
+++ b/src/panfrost/compiler/valhall/valhall.c.py
@@ -88,8 +88,6 @@ SKIP = set([
"NOT_OLD.i64",
# TODO
- "IDP.v4s8",
- "IDP.v4u8",
"FATAN_ASSIST.f32",
"SEG_ADD.u64",
"TEX_DUAL",