From 62e50de5d0bfe9ba90795cbb9422273cf2636848 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 22 Apr 2025 10:50:42 +0100 Subject: [PATCH] aco: use v_perm_b32 for byte swaps within a VGPR on gfx10 Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 22 ++-- src/amd/compiler/tests/test_to_hw_instr.cpp | 125 +++++++++++--------- 2 files changed, 81 insertions(+), 66 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 125d51f0c06..d15687fec00 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1398,16 +1398,19 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres return did_copy; } +void +swap_bytes_bperm(Builder& bld, Definition def, Operand op) +{ + assert(def.physReg().reg() == op.physReg().reg()); + uint8_t swiz[] = {4, 5, 6, 7}; + std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]); + create_bperm(bld, swiz, def, Operand::zero()); +} + void swap_subdword_gfx11(Builder& bld, Definition def, Operand op) { - if (def.physReg().reg() == op.physReg().reg()) { - assert(def.bytes() != 2); /* handled by caller */ - uint8_t swiz[] = {4, 5, 6, 7}; - std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]); - create_bperm(bld, swiz, def, Operand::zero()); - return; - } + assert(def.physReg().reg() != op.physReg().reg()); /* handled by caller */ if (def.bytes() == 2) { Operand def_as_op = Operand(def.physReg(), def.regClass()); @@ -1446,7 +1449,7 @@ swap_subdword_gfx11(Builder& bld, Definition def, Operand op) * into the same VGPR. */ swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b)); - swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b)); + swap_bytes_bperm(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b)); swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b)); } } @@ -1532,6 +1535,9 @@ do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool prese } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) { bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op, Operand::c32(2u)); + } else if (def.bytes() == 1 && def.physReg().reg() == op.physReg().reg() && + ctx->program->gfx_level >= GFX10) { + swap_bytes_bperm(bld, def, op); } else { assert(def.regClass().is_subdword()); if (ctx->program->gfx_level >= GFX11) { diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index 010b3d7e571..c638016198b 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -29,34 +29,34 @@ BEGIN_TEST(to_hw_instr.swap_subdword) v128_hi.reg_b += 2; v129_hi.reg_b += 2; - for (amd_gfx_level lvl : {GFX8, GFX9, GFX11}) { + for (amd_gfx_level lvl : {GFX8, GFX9, GFX10, GFX11}) { if (!setup_cs(NULL, lvl)) continue; - //~gfx(8|9|11)>> p_unit_test 0 + //>> p_unit_test 0 //~gfx8! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 - //~gfx(9|11)! v1: %0:v[0] = v_pack_b32_f16 hi(%0:v[0][16:32]), %0:v[0][0:16] + //~gfx(9|10|11)! v1: %0:v[0] = v_pack_b32_f16 hi(%0:v[0][16:32]), %0:v[0][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), Operand(v0_hi, v2b), Operand(v0_lo, v2b)); - //~gfx(8|9|11)! p_unit_test 1 + //! p_unit_test 1 //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] - //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 + //~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] + //~gfx(8|9|10)! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b)); - //~gfx(8|9|11)! p_unit_test 2 - //~gfx[89]! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 - //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][0:16] dst_sel:uword1 dst_preserve src0_sel:uword0 - //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 - //~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 - //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 + //! p_unit_test 2 + //~gfx(8|9|10)! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 + //~gfx(8|9|10)! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][0:16] dst_sel:uword1 dst_preserve src0_sel:uword0 + //~gfx(8|9|10)! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 + //~gfx(8|9|10)! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 + //~gfx(8|9|10)! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi //~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][0:16] @@ -65,39 +65,39 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Definition(v1_hi, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b), Operand(v0_lo, v2b)); - //~gfx(8|9|11)! p_unit_test 3 + //! p_unit_test 3 //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] - //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 - //~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 + //~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] + //~gfx(8|9|10)! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 + //~gfx(8|9|10)! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 //~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 %0:v[0][0:16] //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_b3, v1b), Operand(v1_lo, v1), Operand(v0_b3, v1b)); - //~gfx(8|9|11)! p_unit_test 4 + //! p_unit_test 4 //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] - //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 - //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 + //~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] + //~gfx(8|9|10)! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 + //~gfx(8|9|10)! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104 //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b), Operand(v1_lo, v1), Operand(v0_lo, v1b)); - //~gfx(8|9|11)! p_unit_test 5 + //! p_unit_test 5 //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1] //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1] - //~gfx(9|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1] - //~gfx[89]! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 - //~gfx[89]! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 + //~gfx(9|10|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1] + //~gfx(8|9|10)! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 + //~gfx(8|9|10)! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060104 //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x3060504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u)); @@ -105,35 +105,35 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Definition(v1_lo, v1), Operand(v1_lo, v1b), Operand(v1_hi, v1b), Operand(v0_lo, v1)); - //~gfx(8|9|11)! p_unit_test 6 + //! p_unit_test 6 //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] - //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] + //~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), Definition(v1_lo, v1), Operand(v1_lo, v2b), Operand(v1_hi, v2b), Operand(v0_lo, v1)); - //~gfx(8|9|11)! p_unit_test 7 + //! p_unit_test 7 //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1] //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1] - //~gfx(9|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1] - //~gfx(8|9|11)! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 + //~gfx(9|10|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1] + //! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), Definition(v1_lo, v1), Operand(v1_hi, v2b), Operand(v1_lo, v2b), Operand(v0_lo, v1)); - //~gfx(8|9|11)! p_unit_test 8 + //! p_unit_test 8 //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] - //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 - //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 - //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 + //~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] + //~gfx(8|9|10)! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 + //~gfx(8|9|10)! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 + //~gfx(8|9|10)! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 //~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16] //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 //~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16] @@ -141,28 +141,28 @@ BEGIN_TEST(to_hw_instr.swap_subdword) bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b), Operand(v1_lo, v3b), Operand(v0_lo, v3b)); - //~gfx(8|9|11)! p_unit_test 9 + //! p_unit_test 9 //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] - //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 + //~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] + //~gfx(8|9|10)! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x3060504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b), Definition(v0_b3, v1b), Operand(v1_lo, v3b), Operand(v0_lo, v3b), Operand(v1_b3, v1b)); - //~gfx(8|9|11)! p_unit_test 10 - //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 - //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 - //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 + //! p_unit_test 10 + //~gfx(8|9|10)! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 + //~gfx(8|9|10)! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 + //~gfx(8|9|10)! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 //~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 //~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi - //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 - //~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 - //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 + //~gfx(8|9|10)! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 + //~gfx(8|9|10)! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 + //~gfx(8|9|10)! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 //~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16] //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506 //~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16] @@ -170,27 +170,27 @@ BEGIN_TEST(to_hw_instr.swap_subdword) bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Definition(v1_b1, v2b), Operand(v1_b1, v2b), Operand(v0_b1, v2b)); - //~gfx(8|9|11)! p_unit_test 11 - //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1 + //! p_unit_test 11 + //~gfx(8|9|10)! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1 //~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 hi(%0:v[0][16:32]) - //~gfx(8|9|11)! v1: %0:v[0] = v_mov_b32 42 + //! v1: %0:v[0] = v_mov_b32 42 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), Operand::c32(42u), Operand(v0_hi, v2b)); - //~gfx(8|9|11)! p_unit_test 12 + //! p_unit_test 12 //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1 //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1 //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1 - //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 + //~gfx(10|11)! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v1b), Definition(v0_b3, v1b), Operand(v0_b3, v1b), Operand(v0_b1, v1b)); - //~gfx(8|9|11)! p_unit_test 13 - //~gfx[89]! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0 - //~gfx[89]! v2b: %0:v[128][0:16] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword0 dst_preserve src0_sel:uword1 src1_sel:uword0 - //~gfx[89]! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0 + //! p_unit_test 13 + //~gfx(8|9|10)! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0 + //~gfx(8|9|10)! v2b: %0:v[128][0:16] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword0 dst_preserve src0_sel:uword1 src1_sel:uword0 + //~gfx(8|9|10)! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0 //~gfx11! v2b: %0:v[128][0:16] = v_xor_b16 hi(%0:v[129][16:32]), %0:v[128][0:16] //~gfx11! v2b: %0:v[129][16:32] = v_xor_b16 hi(%0:v[129][16:32]), %0:v[128][0:16] opsel_hi //~gfx11! v2b: %0:v[128][0:16] = v_xor_b16 hi(%0:v[129][16:32]), %0:v[128][0:16] @@ -198,10 +198,10 @@ BEGIN_TEST(to_hw_instr.swap_subdword) bld.pseudo(aco_opcode::p_parallelcopy, Definition(v128_lo, v2b), Definition(v129_hi, v2b), Operand(v129_hi, v2b), Operand(v128_lo, v2b)); - //~gfx(8|9|11)! p_unit_test 14 - //~gfx[89]! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1 - //~gfx[89]! v2b: %0:v[128][16:32] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword1 dst_preserve src0_sel:uword0 src1_sel:uword1 - //~gfx[89]! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1 + //! p_unit_test 14 + //~gfx(8|9|10)! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1 + //~gfx(8|9|10)! v2b: %0:v[128][16:32] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword1 dst_preserve src0_sel:uword0 src1_sel:uword1 + //~gfx(8|9|10)! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1 //~gfx11! v2b: %0:v[128][16:32] = v_xor_b16 %0:v[129][0:16], hi(%0:v[128][16:32]) opsel_hi //~gfx11! v2b: %0:v[129][0:16] = v_xor_b16 %0:v[129][0:16], hi(%0:v[128][16:32]) //~gfx11! v2b: %0:v[128][16:32] = v_xor_b16 %0:v[129][0:16], hi(%0:v[128][16:32]) opsel_hi @@ -209,8 +209,17 @@ BEGIN_TEST(to_hw_instr.swap_subdword) bld.pseudo(aco_opcode::p_parallelcopy, Definition(v128_hi, v2b), Definition(v129_lo, v2b), Operand(v129_lo, v2b), Operand(v128_hi, v2b)); + //! p_unit_test 15 + //~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[0][16:24], %0:v[0][0:8] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte0 + //~gfx[89]!v1b: %0:v[0][0:8] = v_xor_b32 %0:v[0][16:24], %0:v[0][0:8] dst_sel:ubyte0 dst_preserve src0_sel:ubyte2 src1_sel:ubyte0 + //~gfx[89]!v1b: %0:v[0][16:24] = v_xor_b32 %0:v[0][16:24], %0:v[0][0:8] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte0 + //~gfx(10|11)! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v0_hi, v1b), + Operand(v0_hi, v1b), Operand(v0_lo, v1b)); + //~gfx11! s_sendmsg sendmsg(dealloc_vgprs) - //~gfx(8|9|11)! s_endpgm + //! s_endpgm finish_to_hw_instr_test(); }