From 58debf726c26375cac04c5ba27bacc7f6086b41d Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 8 Apr 2026 10:43:03 +0100 Subject: [PATCH] aco/gfx11.7: add opcode numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Reviewed-by: Marek Olšák Part-of: --- src/amd/compiler/aco_assembler.cpp | 2 + src/amd/compiler/aco_ir.h | 1 + src/amd/compiler/aco_opcodes.py | 106 ++++++------- src/amd/compiler/aco_opcodes_cpp.py | 5 + src/amd/compiler/tests/test_assembler.cpp | 175 ++++++++++++++++++++++ 5 files changed, 236 insertions(+), 53 deletions(-) diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 76df819d273..f7f915b3295 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -52,6 +52,8 @@ struct asm_context { opcode = &instr_info.opcode_gfx10[0]; else if (gfx_level <= GFX11_5) opcode = &instr_info.opcode_gfx11[0]; + else if (gfx_level <= GFX11_7) + opcode = &instr_info.opcode_gfx11_7[0]; else opcode = &instr_info.opcode_gfx12[0]; } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 05882b2c329..e9b60834873 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2626,6 +2626,7 @@ typedef struct { const int16_t opcode_gfx9[static_cast(aco_opcode::num_opcodes)]; const int16_t opcode_gfx10[static_cast(aco_opcode::num_opcodes)]; const int16_t opcode_gfx11[static_cast(aco_opcode::num_opcodes)]; + const int16_t opcode_gfx11_7[static_cast(aco_opcode::num_opcodes)]; const int16_t opcode_gfx12[static_cast(aco_opcode::num_opcodes)]; const std::bitset(aco_opcode::num_opcodes)> is_atomic; const char* name[static_cast(aco_opcode::num_opcodes)]; diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 1e44e2ae873..d6b69360d70 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -221,10 +221,10 @@ class 
Format(IntEnum): return res -Opcode = namedtuple('Opcode', ['gfx6', 'gfx7', 'gfx8', 'gfx9', 'gfx10', 'gfx11', 'gfx12']) +Opcode = namedtuple('Opcode', ['gfx6', 'gfx7', 'gfx8', 'gfx9', 'gfx10', 'gfx11', 'gfx11_7', 'gfx12']) # namedtuple 'defaults' keyword requires python 3.7+. Use an equivalent construct # to support older versions. -Opcode.__new__.__defaults__=(-1, -1, -1, -1, -1, -1, -1) +Opcode.__new__.__defaults__ = (-1,) * len(Opcode._fields) class AcoBaseType(IntEnum): aco_base_type_none = 0 @@ -1005,7 +1005,7 @@ VOP2 = { ("v_fmamk_f16", dst(noMods(F16)), noMods(src(F16, F16, IMM)), op(gfx10=0x37)), ("v_fmaak_f16", dst(noMods(F16)), noMods(src(F16, F16, IMM)), op(gfx10=0x38)), ("v_pk_fmac_f16", dst(noMods(PkF16)), noMods(src(PkF16, PkF16, PkF16)), op(gfx10=0x3c)), - ("v_dot2c_f32_f16", dst(noMods(F32)), noMods(src(PkF16, PkF16, F32)), op(gfx9=0x37, gfx10=0x02, gfx12=-1)), #v_dot2acc_f32_f16 in GFX11 + ("v_dot2c_f32_f16", dst(noMods(F32)), noMods(src(PkF16, PkF16, F32)), op(gfx9=0x37, gfx10=0x02, gfx11_7=-1)), #v_dot2acc_f32_f16 in GFX11 ("v_add_f64", dst(F64), src(F64, F64), op(gfx12=0x02), InstrClass.ValuDoubleAdd), ("v_mul_f64", dst(F64), src(F64, F64), op(gfx12=0x06), InstrClass.ValuDoubleAdd), ("v_lshlrev_b64", dst(U64), src(U32, U64), op(gfx12=0x1f), InstrClass.Valu64), @@ -1118,10 +1118,10 @@ VOP1 = { ("v_cvt_u32_u16", dst(U32), src(U16), op(gfx11=0x6b)), ("v_mov_b16", dst(U16), src(mods(U16)), op(gfx11=0x1c)), ("v_swap_b16", dst(U16, U16), src(U16, U16), op(gfx11=0x66)), - ("v_cvt_f32_fp8", dst(noMods(F32)), src(F8), op(gfx12=0x6c)), - ("v_cvt_f32_bf8", dst(noMods(F32)), src(BF8), op(gfx12=0x6d)), - ("v_cvt_pk_f32_fp8", dst(PkF32), src(PkF8), op(gfx12=0x6e)), - ("v_cvt_pk_f32_bf8", dst(PkF32), src(PkBF8), op(gfx12=0x6f)), + ("v_cvt_f32_fp8", dst(noMods(F32)), src(F8), op(gfx11_7=0x6c)), + ("v_cvt_f32_bf8", dst(noMods(F32)), src(BF8), op(gfx11_7=0x6d)), + ("v_cvt_pk_f32_fp8", dst(PkF32), src(PkF8), op(gfx11_7=0x6e)), + ("v_cvt_pk_f32_bf8", dst(PkF32), 
src(PkBF8), op(gfx11_7=0x6f)), } for (name, defs, ops, num, cls) in default_class(VOP1, InstrClass.Valu32): insn(name, num, Format.VOP1, cls, definitions = defs, operands = ops) @@ -1196,7 +1196,7 @@ for comp, dtype, cmps, cmpx in itertools.product(range(16), dtypes, range(1), ra elif dtype.type in [I64, U64]: cls = InstrClass.Valu64 - enc = Opcode(gfx6, gfx6, gfx8, gfx8, gfx10, gfx11, gfx12) + enc = Opcode(gfx6, gfx6, gfx8, gfx8, gfx10, gfx11, gfx11, gfx12) insn(name, enc, Format.VOPC, cls, definitions = dst(EXEC if cmpx else VCC), operands = src(dtype.type, dtype.type)) @@ -1221,10 +1221,10 @@ VOPP = { ("v_pk_fma_f16", dst(PkF16), src(PkF16, PkF16, PkF16), op(gfx9=0x0e)), ("v_pk_add_f16", dst(PkF16), src(PkF16, PkF16), op(gfx9=0x0f)), ("v_pk_mul_f16", dst(PkF16), src(PkF16, PkF16), op(gfx9=0x10)), - ("v_pk_min_f16", dst(PkF16), src(PkF16, PkF16), op(gfx9=0x11, gfx12=0x1b)), # called v_pk_min_num_f16 in GFX12 - ("v_pk_max_f16", dst(PkF16), src(PkF16, PkF16), op(gfx9=0x12, gfx12=0x1c)), # called v_pk_min_num_f16 in GFX12 - ("v_pk_minimum_f16", dst(PkF16), src(PkF16, PkF16), op(gfx12=0x1d)), - ("v_pk_maximum_f16", dst(PkF16), src(PkF16, PkF16), op(gfx12=0x1e)), + ("v_pk_min_f16", dst(PkF16), src(PkF16, PkF16), op(gfx9=0x11, gfx11_7=0x12, gfx12=0x1b)), # called v_pk_min_num_f16 in GFX12 + ("v_pk_max_f16", dst(PkF16), src(PkF16, PkF16), op(gfx9=0x12, gfx11_7=0x11, gfx12=0x1c)), # called v_pk_max_num_f16 in GFX12 + ("v_pk_minimum_f16", dst(PkF16), src(PkF16, PkF16), op(gfx11_7=0x1d)), + ("v_pk_maximum_f16", dst(PkF16), src(PkF16, PkF16), op(gfx11_7=0x1e)), ("v_fma_mix_f32", dst(F32), src(F32, F32, F32), op(gfx9=0x20)), # v_mad_mix_f32 in VEGA ISA, v_fma_mix_f32 in RDNA ISA ("v_fma_mixlo_f16", dst(F16), src(F32, F32, F32), op(gfx9=0x21)), # v_mad_mixlo_f16 in VEGA ISA, v_fma_mixlo_f16 in RDNA ISA ("v_fma_mixhi_f16", dst(F16), src(F32, F32, F32), op(gfx9=0x22)), # v_mad_mixhi_f16 in VEGA ISA, v_fma_mixhi_f16 in RDNA ISA @@ -1240,32 +1240,32 @@ VOPP = { ("v_dot8_u32_u4",
dst(U32), src(PkU16, PkU16, U32), op(gfx9=0x2b, gfx10=0x19)), ("v_dot2_f32_f16", dst(noMods(F32)), src(PkF16, PkF16, F32), op(gfx9=0x23, gfx10=0x13)), ("v_dot2_f32_bf16", dst(noMods(F32)), noMods(src(PkBF16, PkBF16, F32)), op(gfx11=0x1a)), - ("v_dot4_f32_fp8_bf8", dst(noMods(F32)), noMods(src(Pk4F8, Pk4BF8, F32)), op(gfx12=0x24)), - ("v_dot4_f32_bf8_fp8", dst(noMods(F32)), noMods(src(Pk4BF8, Pk4F8, F32)), op(gfx12=0x25)), - ("v_dot4_f32_fp8_fp8", dst(noMods(F32)), noMods(src(Pk4F8, Pk4F8, F32)), op(gfx12=0x26)), - ("v_dot4_f32_bf8_bf8", dst(noMods(F32)), noMods(src(Pk4BF8, Pk4BF8, F32)), op(gfx12=0x27)), + ("v_dot4_f32_fp8_bf8", dst(noMods(F32)), noMods(src(Pk4F8, Pk4BF8, F32)), op(gfx11_7=0x24)), + ("v_dot4_f32_bf8_fp8", dst(noMods(F32)), noMods(src(Pk4BF8, Pk4F8, F32)), op(gfx11_7=0x25)), + ("v_dot4_f32_fp8_fp8", dst(noMods(F32)), noMods(src(Pk4F8, Pk4F8, F32)), op(gfx11_7=0x26)), + ("v_dot4_f32_bf8_bf8", dst(noMods(F32)), noMods(src(Pk4BF8, Pk4BF8, F32)), op(gfx11_7=0x27)), ("v_wmma_f32_16x16x16_f16", dst(), src(), op(gfx11=0x40), InstrClass.WMMA), ("v_wmma_f32_16x16x16_bf16", dst(), src(), op(gfx11=0x41), InstrClass.WMMA), ("v_wmma_f16_16x16x16_f16", dst(), src(), op(gfx11=0x42), InstrClass.WMMA), ("v_wmma_bf16_16x16x16_bf16", dst(), src(), op(gfx11=0x43), InstrClass.WMMA), ("v_wmma_i32_16x16x16_iu8", dst(), src(), op(gfx11=0x44), InstrClass.WMMA), ("v_wmma_i32_16x16x16_iu4", dst(), src(), op(gfx11=0x45), InstrClass.WMMA), - ("v_wmma_f32_16x16x16_fp8_fp8", dst(), src(), op(gfx12=0x46), InstrClass.WMMA), - ("v_wmma_f32_16x16x16_fp8_bf8", dst(), src(), op(gfx12=0x47), InstrClass.WMMA), - ("v_wmma_f32_16x16x16_bf8_fp8", dst(), src(), op(gfx12=0x48), InstrClass.WMMA), - ("v_wmma_f32_16x16x16_bf8_bf8", dst(), src(), op(gfx12=0x49), InstrClass.WMMA), - ("v_wmma_i32_16x16x32_iu4", dst(), src(), op(gfx12=0x4a), InstrClass.WMMA), - ("v_swmmac_f32_16x16x32_f16", dst(), src(), op(gfx12=0x50), InstrClass.WMMA), - ("v_swmmac_f32_16x16x32_bf16", dst(), src(), op(gfx12=0x51), 
InstrClass.WMMA), - ("v_swmmac_f16_16x16x32_f16", dst(), src(), op(gfx12=0x52), InstrClass.WMMA), - ("v_swmmac_bf16_16x16x32_bf16", dst(), src(), op(gfx12=0x53), InstrClass.WMMA), - ("v_swmmac_i32_16x16x32_iu8", dst(), src(), op(gfx12=0x54), InstrClass.WMMA), - ("v_swmmac_i32_16x16x32_iu4", dst(), src(), op(gfx12=0x55), InstrClass.WMMA), - ("v_swmmac_i32_16x16x64_iu4", dst(), src(), op(gfx12=0x56), InstrClass.WMMA), - ("v_swmmac_f32_16x16x32_fp8_fp8", dst(), src(), op(gfx12=0x57), InstrClass.WMMA), - ("v_swmmac_f32_16x16x32_fp8_bf8", dst(), src(), op(gfx12=0x58), InstrClass.WMMA), - ("v_swmmac_f32_16x16x32_bf8_fp8", dst(), src(), op(gfx12=0x59), InstrClass.WMMA), - ("v_swmmac_f32_16x16x32_bf8_bf8", dst(), src(), op(gfx12=0x5a), InstrClass.WMMA), + ("v_wmma_f32_16x16x16_fp8_fp8", dst(), src(), op(gfx11_7=0x46), InstrClass.WMMA), + ("v_wmma_f32_16x16x16_fp8_bf8", dst(), src(), op(gfx11_7=0x47), InstrClass.WMMA), + ("v_wmma_f32_16x16x16_bf8_fp8", dst(), src(), op(gfx11_7=0x48), InstrClass.WMMA), + ("v_wmma_f32_16x16x16_bf8_bf8", dst(), src(), op(gfx11_7=0x49), InstrClass.WMMA), + ("v_wmma_i32_16x16x32_iu4", dst(), src(), op(gfx11_7=0x4a), InstrClass.WMMA), + ("v_swmmac_f32_16x16x32_f16", dst(), src(), op(gfx11_7=0x50), InstrClass.WMMA), + ("v_swmmac_f32_16x16x32_bf16", dst(), src(), op(gfx11_7=0x51), InstrClass.WMMA), + ("v_swmmac_f16_16x16x32_f16", dst(), src(), op(gfx11_7=0x52), InstrClass.WMMA), + ("v_swmmac_bf16_16x16x32_bf16", dst(), src(), op(gfx11_7=0x53), InstrClass.WMMA), + ("v_swmmac_i32_16x16x32_iu8", dst(), src(), op(gfx11_7=0x54), InstrClass.WMMA), + ("v_swmmac_i32_16x16x32_iu4", dst(), src(), op(gfx11_7=0x55), InstrClass.WMMA), + ("v_swmmac_i32_16x16x64_iu4", dst(), src(), op(gfx11_7=0x56), InstrClass.WMMA), + ("v_swmmac_f32_16x16x32_fp8_fp8", dst(), src(), op(gfx11_7=0x57), InstrClass.WMMA), + ("v_swmmac_f32_16x16x32_fp8_bf8", dst(), src(), op(gfx11_7=0x58), InstrClass.WMMA), + ("v_swmmac_f32_16x16x32_bf8_fp8", dst(), src(), op(gfx11_7=0x59), 
InstrClass.WMMA), + ("v_swmmac_f32_16x16x32_bf8_bf8", dst(), src(), op(gfx11_7=0x5a), InstrClass.WMMA), } for (name, defs, ops, num, cls) in default_class(VOPP, InstrClass.Valu32): insn(name, num, Format.VOP3P, cls, definitions = defs, operands = ops) @@ -1320,7 +1320,7 @@ VOP3 = { ("v_max3_f32", dst(F32), src(F32, F32, F32), op(0x154, gfx8=0x1d3, gfx10=0x154, gfx11=0x21c, gfx12=0x22a)), # called v_max3_num_f32 in GFX12 ("v_max3_i32", dst(U32), src(U32, U32, U32), op(0x155, gfx8=0x1d4, gfx10=0x155, gfx11=0x21d)), ("v_max3_u32", dst(U32), src(U32, U32, U32), op(0x156, gfx8=0x1d5, gfx10=0x156, gfx11=0x21e)), - ("v_med3_f32", dst(F32), src(F32, F32, F32), op(0x157, gfx8=0x1d6, gfx10=0x157, gfx11=0x21f, gfx12=0x231)), # called v_med3_num_f32 in GFX12 + ("v_med3_f32", dst(F32), src(F32, F32, F32), op(0x157, gfx8=0x1d6, gfx10=0x157, gfx11=0x21f, gfx11_7=0x231)), # called v_med3_num_f32 in GFX12 ("v_med3_i32", dst(U32), src(U32, U32, U32), op(0x158, gfx8=0x1d7, gfx10=0x158, gfx11=0x220)), ("v_med3_u32", dst(U32), src(U32, U32, U32), op(0x159, gfx8=0x1d8, gfx10=0x159, gfx11=0x221)), ("v_sad_u8", dst(U32), src(U32, U32, U32), op(0x15a, gfx8=0x1d9, gfx10=0x15a, gfx11=0x222)), @@ -1369,7 +1369,7 @@ VOP3 = { ("v_max3_f16", dst(F16), src(F16, F16, F16), op(gfx9=0x1f7, gfx10=0x354, gfx11=0x24c, gfx12=0x22c)), # called v_max3_num_f16 in GFX12 ("v_max3_i16", dst(U16), src(U16, U16, U16), op(gfx9=0x1f8, gfx10=0x355, gfx11=0x24d)), ("v_max3_u16", dst(U16), src(U16, U16, U16), op(gfx9=0x1f9, gfx10=0x356, gfx11=0x24e)), - ("v_med3_f16", dst(F16), src(F16, F16, F16), op(gfx9=0x1fa, gfx10=0x357, gfx11=0x24f, gfx12=0x232)), # called v_med3_num_f16 in GFX12 + ("v_med3_f16", dst(F16), src(F16, F16, F16), op(gfx9=0x1fa, gfx10=0x357, gfx11=0x24f, gfx11_7=0x232)), # called v_med3_num_f16 in GFX12 ("v_med3_i16", dst(U16), src(U16, U16, U16), op(gfx9=0x1fb, gfx10=0x358, gfx11=0x250)), ("v_med3_u16", dst(U16), src(U16, U16, U16), op(gfx9=0x1fc, gfx10=0x359, gfx11=0x251)), ("v_lshl_add_u32", 
dst(U32), src(U32, U32, U32), op(gfx9=0x1fd, gfx10=0x346, gfx11=0x246)), @@ -1443,14 +1443,14 @@ VOP3 = { ("v_or_b16", dst(U16), src(U16, U16), op(gfx11=0x363)), ("v_xor_b16", dst(U16), src(U16, U16), op(gfx11=0x364)), ("v_cndmask_b16", dst(U16), src(mods(U16), mods(U16), VCC), op(gfx11=0x25d)), - ("v_minimum3_f32", dst(F32), src(F32, F32, F32), op(gfx12=0x22d)), - ("v_maximum3_f32", dst(F32), src(F32, F32, F32), op(gfx12=0x22e)), - ("v_minimum3_f16", dst(F16), src(F16, F16, F16), op(gfx12=0x22f)), - ("v_maximum3_f16", dst(F16), src(F16, F16, F16), op(gfx12=0x230)), - ("v_minimummaximum_f32", dst(F32), src(F32, F32, F32), op(gfx12=0x26c)), - ("v_maximumminimum_f32", dst(F32), src(F32, F32, F32), op(gfx12=0x26d)), - ("v_minimummaximum_f16", dst(F16), src(F16, F16, F16), op(gfx12=0x26e)), - ("v_maximumminimum_f16", dst(F16), src(F16, F16, F16), op(gfx12=0x26f)), + ("v_minimum3_f32", dst(F32), src(F32, F32, F32), op(gfx11_7=0x22d)), + ("v_maximum3_f32", dst(F32), src(F32, F32, F32), op(gfx11_7=0x22e)), + ("v_minimum3_f16", dst(F16), src(F16, F16, F16), op(gfx11_7=0x22f)), + ("v_maximum3_f16", dst(F16), src(F16, F16, F16), op(gfx11_7=0x230)), + ("v_minimummaximum_f32", dst(F32), src(F32, F32, F32), op(gfx11_7=0x26c)), + ("v_maximumminimum_f32", dst(F32), src(F32, F32, F32), op(gfx11_7=0x26d)), + ("v_minimummaximum_f16", dst(F16), src(F16, F16, F16), op(gfx11_7=0x26e)), + ("v_maximumminimum_f16", dst(F16), src(F16, F16, F16), op(gfx11_7=0x26f)), ("v_s_exp_f32", dst(F32), src(F32), op(gfx12=0x280), InstrClass.ValuPseudoScalarTrans), ("v_s_exp_f16", dst(F16), src(F16), op(gfx12=0x281), InstrClass.ValuPseudoScalarTrans), ("v_s_log_f32", dst(F32), src(F32), op(gfx12=0x282), InstrClass.ValuPseudoScalarTrans), @@ -1461,19 +1461,19 @@ VOP3 = { ("v_s_rsq_f16", dst(F16), src(F16), op(gfx12=0x287), InstrClass.ValuPseudoScalarTrans), ("v_s_sqrt_f32", dst(F32), src(F32), op(gfx12=0x288), InstrClass.ValuPseudoScalarTrans), ("v_s_sqrt_f16", dst(F16), src(F16), op(gfx12=0x289), 
InstrClass.ValuPseudoScalarTrans), - ("v_minimum_f64", dst(F64), src(F64, F64), op(gfx12=0x341)), - ("v_maximum_f64", dst(F64), src(F64, F64), op(gfx12=0x342)), - ("v_minimum_f32", dst(F32), src(F32, F32), op(gfx12=0x365)), - ("v_maximum_f32", dst(F32), src(F32, F32), op(gfx12=0x366)), - ("v_minimum_f16", dst(F16), src(F16, F16), op(gfx12=0x367)), - ("v_maximum_f16", dst(F16), src(F16, F16), op(gfx12=0x368)), + ("v_minimum_f64", dst(F64), src(F64, F64), op(gfx11_7=0x341)), + ("v_maximum_f64", dst(F64), src(F64, F64), op(gfx11_7=0x342)), + ("v_minimum_f32", dst(F32), src(F32, F32), op(gfx11_7=0x365)), + ("v_maximum_f32", dst(F32), src(F32, F32), op(gfx11_7=0x366)), + ("v_minimum_f16", dst(F16), src(F16, F16), op(gfx11_7=0x367)), + ("v_maximum_f16", dst(F16), src(F16, F16), op(gfx11_7=0x368)), ("v_permlane16_var_b32", dst(U32), src(U32, U32), op(gfx12=0x30f)), ("v_permlanex16_var_b32", dst(U32), src(U32, U32), op(gfx12=0x310)), - ("v_cvt_pk_fp8_f32", dst(PkF8), src(F32, F32), op(gfx12=0x369)), + ("v_cvt_pk_fp8_f32", dst(PkF8), src(F32, F32), op(gfx11_7=0x369)), ("p_v_cvt_pk_fp8_f32_ovfl", dst(PkF8), src(F32, F32), op(-1)), - ("v_cvt_pk_bf8_f32", dst(PkBF8), src(F32, F32), op(gfx12=0x36a)), - ("v_cvt_sr_fp8_f32", dst(F8), src(F32, U32), op(gfx12=0x36b)), - ("v_cvt_sr_bf8_f32", dst(BF8), src(F32, U32), op(gfx12=0x36c)), + ("v_cvt_pk_bf8_f32", dst(PkBF8), src(F32, F32), op(gfx11_7=0x36a)), + ("v_cvt_sr_fp8_f32", dst(F8), src(F32, U32), op(gfx11_7=0x36b)), + ("v_cvt_sr_bf8_f32", dst(BF8), src(F32, U32), op(gfx11_7=0x36c)), } for (name, defs, ops, num, cls) in default_class(VOP3, InstrClass.Valu32): insn(name, num, Format.VOP3, cls, definitions = defs, operands = ops) @@ -2093,7 +2093,7 @@ for ver in Opcode._fields: if key in op_to_name: # exceptions names = set([op_to_name[key], inst.name]) - if ver in ['gfx8', 'gfx9', 'gfx11', 'gfx12'] and names == set(['v_mul_lo_i32', 'v_mul_lo_u32']): + if ver not in ['gfx6', 'gfx7', 'gfx10'] and names == set(['v_mul_lo_i32', 
'v_mul_lo_u32']): continue # v_mad_legacy_f32 is replaced with v_fma_legacy_f32 on GFX10.3 if ver == 'gfx10' and names == set(['v_mad_legacy_f32', 'v_fma_legacy_f32']): diff --git a/src/amd/compiler/aco_opcodes_cpp.py b/src/amd/compiler/aco_opcodes_cpp.py index 8c99fd5d950..d2e2036b512 100644 --- a/src/amd/compiler/aco_opcodes_cpp.py +++ b/src/amd/compiler/aco_opcodes_cpp.py @@ -46,6 +46,11 @@ extern const aco::Info instr_info = { ${instructions[name].op.gfx11}, % endfor }, + { + % for name in opcode_names: + ${instructions[name].op.gfx11_7}, + % endfor + }, { % for name in opcode_names: ${instructions[name].op.gfx12}, diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp index c1920f5a0be..cce1d21dbf9 100644 --- a/src/amd/compiler/tests/test_assembler.cpp +++ b/src/amd/compiler/tests/test_assembler.cpp @@ -1560,3 +1560,178 @@ BEGIN_TEST(assembler.vintrp_high_16bits) finish_assembler_test(); } END_TEST + +BEGIN_TEST(assembler.gfx11_7) + if (LLVM_VERSION_MAJOR < 23 || !setup_cs(NULL, GFX11_7)) + return; + + Definition dst_v0 = bld.def(v1); + dst_v0.setFixed(PhysReg(256)); + + Definition dst_v1 = bld.def(v1); + dst_v1.setFixed(PhysReg(256 + 1)); + + Operand op_v0(bld.tmp(v1)); + op_v0.setFixed(PhysReg(256 + 0)); + + Operand op_v1(bld.tmp(v1)); + op_v1.setFixed(PhysReg(256 + 1)); + + Operand op_v2(bld.tmp(v1)); + op_v2.setFixed(PhysReg(256 + 2)); + + Operand op_v4(bld.tmp(v1)); + op_v4.setFixed(PhysReg(256 + 4)); + + Operand op_v5(bld.tmp(v1)); + op_v5.setFixed(PhysReg(256 + 5)); + + Operand op_v6(bld.tmp(v1)); + op_v6.setFixed(PhysReg(256 + 6)); + + //>> BB0: + //! v_cvt_f32_fp8_e32 v0, v1 ; 7e00d901 + //! v_cvt_f32_bf8_e32 v0, v1 ; 7e00db01 + bld.vop1(aco_opcode::v_cvt_f32_fp8, dst_v0, op_v1); + bld.vop1(aco_opcode::v_cvt_f32_bf8, dst_v0, op_v1); + + //! v_cvt_pk_f32_fp8_e32 v[0:1], v1.l ; 7e00dd01 + //! v_cvt_pk_f32_bf8_e32 v[0:1], v1.l ; 7e00df01 + //! v_cvt_pk_f32_fp8_e32 v[0:1], v1.h ; 7e00dd81 + //! 
v_cvt_pk_f32_bf8_e32 v[0:1], v1.h ; 7e00df81 + bld.vop1(aco_opcode::v_cvt_pk_f32_fp8, dst_v0, op_v1); + bld.vop1(aco_opcode::v_cvt_pk_f32_bf8, dst_v0, op_v1); + bld.vop1(aco_opcode::v_cvt_pk_f32_fp8, dst_v0, op_v1).instr->valu().opsel[0] = true; + bld.vop1(aco_opcode::v_cvt_pk_f32_bf8, dst_v0, op_v1).instr->valu().opsel[0] = true; + + //! v_pk_minimum_f16 v0, v1, v2 ; cc1d0000 18020501 + //! v_pk_maximum_f16 v0, v1, v2 ; cc1e0000 18020501 + bld.vop3p(aco_opcode::v_pk_minimum_f16, dst_v0, op_v1, op_v2, 0x0, 0x3); + bld.vop3p(aco_opcode::v_pk_maximum_f16, dst_v0, op_v1, op_v2, 0x0, 0x3); + + //! v_pk_min_num_f16 v0, v1, v2 ; cc120000 18020501 + //! v_pk_max_num_f16 v0, v1, v2 ; cc110000 18020501 + bld.vop3p(aco_opcode::v_pk_min_f16, dst_v0, op_v1, op_v2, 0x0, 0x3); + bld.vop3p(aco_opcode::v_pk_max_f16, dst_v0, op_v1, op_v2, 0x0, 0x3); + + //! v_dot4_f32_fp8_fp8 v0, v0, v1, v2 ; cc264000 1c0a0300 + //! v_dot4_f32_bf8_bf8 v0, v0, v1, v2 ; cc274000 1c0a0300 + //! v_dot4_f32_fp8_bf8 v0, v0, v1, v2 ; cc244000 1c0a0300 + //! v_dot4_f32_bf8_fp8 v0, v0, v1, v2 ; cc254000 1c0a0300 + bld.vop3p(aco_opcode::v_dot4_f32_fp8_fp8, dst_v0, op_v0, op_v1, op_v2, 0x0, 0x7); + bld.vop3p(aco_opcode::v_dot4_f32_bf8_bf8, dst_v0, op_v0, op_v1, op_v2, 0x0, 0x7); + bld.vop3p(aco_opcode::v_dot4_f32_fp8_bf8, dst_v0, op_v0, op_v1, op_v2, 0x0, 0x7); + bld.vop3p(aco_opcode::v_dot4_f32_bf8_fp8, dst_v0, op_v0, op_v1, op_v2, 0x0, 0x7); + + //! v_wmma_f32_16x16x16_fp8_fp8 v[0:3], v4, v5, v[0:3] ; cc464000 1c020b04 + //! v_wmma_f32_16x16x16_fp8_bf8 v[0:3], v4, v5, v[0:3] ; cc474000 1c020b04 + //! v_wmma_f32_16x16x16_bf8_fp8 v[0:3], v4, v5, v[0:3] ; cc484000 1c020b04 + //! v_wmma_f32_16x16x16_bf8_bf8 v[0:3], v4, v5, v[0:3] ; cc494000 1c020b04 + //! 
v_wmma_i32_16x16x32_iu4 v[0:3], v4, v5, v[0:3] ; cc4a4000 1c020b04 + bld.vop3p(aco_opcode::v_wmma_f32_16x16x16_fp8_fp8, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_wmma_f32_16x16x16_fp8_bf8, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_wmma_f32_16x16x16_bf8_fp8, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_wmma_f32_16x16x16_bf8_bf8, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_wmma_i32_16x16x32_iu4, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + + //! v_swmmac_f32_16x16x32_f16 v[0:3], v[4:5], v[6:9], v0 ; cc504000 1c020d04 + //! v_swmmac_f32_16x16x32_bf16 v[0:3], v[4:5], v[6:9], v0 ; cc514000 1c020d04 + //! v_swmmac_f16_16x16x32_f16 v[0:1], v[4:5], v[6:9], v0 ; cc524000 1c020d04 + //! v_swmmac_bf16_16x16x32_bf16 v[0:1], v[4:5], v[6:9], v0 ; cc534000 1c020d04 + //! v_swmmac_i32_16x16x32_iu8 v[0:3], v4, v[5:6], v0 ; cc544000 1c020b04 + //! v_swmmac_i32_16x16x32_iu4 v[0:3], v4, v5, v0 ; cc554000 1c020b04 + //! v_swmmac_i32_16x16x64_iu4 v[0:3], v4, v[5:6], v0 ; cc564000 1c020b04 + //! v_swmmac_f32_16x16x32_fp8_fp8 v[0:3], v4, v[5:6], v0 ; cc574000 1c020b04 + //! v_swmmac_f32_16x16x32_fp8_bf8 v[0:3], v4, v[5:6], v0 ; cc584000 1c020b04 + //! v_swmmac_f32_16x16x32_bf8_fp8 v[0:3], v4, v[5:6], v0 ; cc594000 1c020b04 + //! 
v_swmmac_f32_16x16x32_bf8_bf8 v[0:3], v4, v[5:6], v0 ; cc5a4000 1c020b04 + bld.vop3p(aco_opcode::v_swmmac_f32_16x16x32_f16, dst_v0, op_v4, op_v6, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_f32_16x16x32_bf16, dst_v0, op_v4, op_v6, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_f16_16x16x32_f16, dst_v0, op_v4, op_v6, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_bf16_16x16x32_bf16, dst_v0, op_v4, op_v6, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_i32_16x16x32_iu8, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_i32_16x16x32_iu4, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_i32_16x16x64_iu4, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_f32_16x16x32_fp8_fp8, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_f32_16x16x32_fp8_bf8, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_f32_16x16x32_bf8_fp8, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + bld.vop3p(aco_opcode::v_swmmac_f32_16x16x32_bf8_bf8, dst_v0, op_v4, op_v5, op_v0, 0x0, 0x7); + + //! v_min3_num_f32 v0, v0, v1, v2 ; d6190000 040a0300 + //! v_max3_num_f32 v0, v0, v1, v2 ; d61c0000 040a0300 + //! v_med3_num_f32 v0, v0, v1, v2 ; d6310000 040a0300 + bld.vop3(aco_opcode::v_min3_f32, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_max3_f32, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_med3_f32, dst_v0, op_v0, op_v1, op_v2); + + //! v_min3_num_f16 v0.l, v0.l, v1.l, v2.l ; d6490000 040a0300 + //! v_max3_num_f16 v0.l, v0.l, v1.l, v2.l ; d64c0000 040a0300 + //! v_med3_num_f16 v0.l, v0.l, v1.l, v2.l ; d6320000 040a0300 + bld.vop3(aco_opcode::v_min3_f16, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_max3_f16, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_med3_f16, dst_v0, op_v0, op_v1, op_v2); + + //! v_minimum3_f32 v0, v0, v1, v2 ; d62d0000 040a0300 + //! v_maximum3_f32 v0, v0, v1, v2 ; d62e0000 040a0300 + //! 
v_minimum3_f16 v0.l, v0.l, v1.l, v2.l ; d62f0000 040a0300 + //! v_maximum3_f16 v0.l, v0.l, v1.l, v2.l ; d6300000 040a0300 + bld.vop3(aco_opcode::v_minimum3_f32, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_maximum3_f32, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_minimum3_f16, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_maximum3_f16, dst_v0, op_v0, op_v1, op_v2); + + //! v_minimummaximum_f32 v0, v0, v1, v2 ; d66c0000 040a0300 + //! v_maximumminimum_f32 v0, v0, v1, v2 ; d66d0000 040a0300 + //! v_minimummaximum_f16 v0.l, v0.l, v1.l, v2.l ; d66e0000 040a0300 + //! v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l ; d66f0000 040a0300 + bld.vop3(aco_opcode::v_minimummaximum_f32, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_maximumminimum_f32, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_minimummaximum_f16, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_maximumminimum_f16, dst_v0, op_v0, op_v1, op_v2); + + //! v_minmax_num_f32 v0, v0, v1, v2 ; d65f0000 040a0300 + //! v_maxmin_num_f32 v0, v0, v1, v2 ; d65e0000 040a0300 + //! v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l ; d6610000 040a0300 + //! v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l ; d6600000 040a0300 + bld.vop3(aco_opcode::v_minmax_f32, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_maxmin_f32, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_minmax_f16, dst_v0, op_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_maxmin_f16, dst_v0, op_v0, op_v1, op_v2); + + //! v_minimum_f64 v[0:1], v[2:3], v[4:5] ; d7410000 00020902 + //! v_maximum_f64 v[0:1], v[2:3], v[4:5] ; d7420000 00020902 + //! v_minimum_f32 v0, v1, v2 ; d7650000 00020501 + //! v_maximum_f32 v0, v1, v2 ; d7660000 00020501 + //! v_minimum_f16 v0.l, v1.l, v2.l ; d7670000 00020501 + //! 
v_maximum_f16 v0.l, v1.l, v2.l ; d7680000 00020501 + bld.vop3(aco_opcode::v_minimum_f64, dst_v0, op_v2, op_v4); + bld.vop3(aco_opcode::v_maximum_f64, dst_v0, op_v2, op_v4); + bld.vop3(aco_opcode::v_minimum_f32, dst_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_maximum_f32, dst_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_minimum_f16, dst_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_maximum_f16, dst_v0, op_v1, op_v2); + + //! v_max_num_f64 v[0:1], v[0:1], v[2:3] ; d72a0000 00020500 + //! v_min_num_f64 v[0:1], v[0:1], v[2:3] ; d7290000 00020500 + //! v_max_num_f32_e32 v0, v0, v1 ; 20000300 + //! v_min_num_f32_e32 v0, v0, v1 ; 1e000300 + //! v_max_num_f16_e32 v0.l, v0.l, v1.l ; 72000300 + //! v_min_num_f16_e32 v0.l, v0.l, v1.l ; 74000300 + bld.vop3(aco_opcode::v_max_f64_e64, dst_v0, op_v0, op_v2); + bld.vop3(aco_opcode::v_min_f64_e64, dst_v0, op_v0, op_v2); + bld.vop2(aco_opcode::v_max_f32, dst_v0, op_v0, op_v1); + bld.vop2(aco_opcode::v_min_f32, dst_v0, op_v0, op_v1); + bld.vop2(aco_opcode::v_max_f16, dst_v0, op_v0, op_v1); + bld.vop2(aco_opcode::v_min_f16, dst_v0, op_v0, op_v1); + + //! v_cvt_pk_fp8_f32 v0.l, v1, v2 ; d7690000 00020501 + //! v_cvt_pk_bf8_f32 v0.l, v1, v2 ; d76a0000 00020501 + //! v_cvt_sr_fp8_f32 v0, v1, v2 ; d76b0000 00020501 + //! v_cvt_sr_bf8_f32 v0, v1, v2 ; d76c0000 00020501 + bld.vop3(aco_opcode::v_cvt_pk_fp8_f32, dst_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_cvt_pk_bf8_f32, dst_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_cvt_sr_fp8_f32, dst_v0, op_v1, op_v2); + bld.vop3(aco_opcode::v_cvt_sr_bf8_f32, dst_v0, op_v1, op_v2); + + finish_assembler_test(); +END_TEST