panvk: add support for {s,u}dot_4x8_{sat}

Generate the IDPADD instruction to implement the 4x8-bit integer dot
product with accumulate. Both the signed and unsigned variants are
supported, each with and without saturation.
Support is limited to v9+ (arch >= 9).

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34812>
This commit is contained in:
Romaric Jodin 2025-05-05 13:59:22 +02:00 committed by Marge Bot
parent dc1c701489
commit ffdc08dfb6
4 changed files with 22 additions and 7 deletions

View file

@ -3515,6 +3515,20 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
break;
}
case nir_op_udot_4x8_uadd_sat:
case nir_op_udot_4x8_uadd: {
assert(b->shader->arch >= 9);
bi_idpadd_v4u8_to(b, dst, s0, s1, s2,
instr->op == nir_op_udot_4x8_uadd_sat);
break;
}
case nir_op_sdot_4x8_iadd_sat:
case nir_op_sdot_4x8_iadd: {
assert(b->shader->arch >= 9);
bi_idpadd_v4s8_to(b, dst, s0, s1, s2,
instr->op == nir_op_sdot_4x8_iadd_sat);
break;
}
default:
fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);

View file

@ -148,6 +148,10 @@ void bifrost_compile_shader_nir(nir_shader *nir,
.support_indirect_inputs = (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES), \
.lower_hadd = arch >= 11, \
.discard_is_demote = true, \
.has_udot_4x8 = arch >= 9, \
.has_udot_4x8_sat = arch >= 9, \
.has_sdot_4x8 = arch >= 9, \
.has_sdot_4x8_sat = arch >= 9, \
};
DEFINE_OPTIONS(6);

View file

@ -2235,21 +2235,20 @@
<src absneg="true">Z coordinate as 32-bit floating point</src>
</ins>
<group name="IDP" title="8-bit dot product" dests="1" opcode="0xC2" unused="true" unit="FMA">
<group name="IDPADD" title="8-bit dot product and accumulate" dests="1" opcode="0xC2" unit="FMA">
<desc>
8-bit integer dot product between 4 channel vectors, intended for machine
learning. Available in both unsigned and signed variants, controlling
sign-extension/zero-extension behaviour to the final 32-bit destination.
Saturation is available. Corresponds to the `cl_arm_integer_dot_product_*`
family of OpenCL extensions. Not for actual use, just for completeness.
Instead, use your platform's neural accelerator.
family of OpenCL extensions.
For $A, B \in \{ 0, \ldots, 255 \}^4$ and $\text{Accumulator} \in
\mathbb{Z}$, calculates $(A \cdot B) + \text{Accumulator}$ and optionally
saturates.
</desc>
<ins name="IDP.v4s8" opcode2="0"/>
<ins name="IDP.v4u8" opcode2="1"/>
<ins name="IDPADD.v4s8" opcode2="0"/>
<ins name="IDPADD.v4u8" opcode2="1"/>
<src>A</src>
<src>B</src>
<src>Accumulator</src>

View file

@ -88,8 +88,6 @@ SKIP = set([
"NOT_OLD.i64",
# TODO
"IDP.v4s8",
"IDP.v4u8",
"FATAN_ASSIST.f32",
"SEG_ADD.u64",
"TEX_DUAL",