aco: use v_perm_b32 for byte swaps within a VGPR on gfx10

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34636>
This commit is contained in:
Rhys Perry 2025-04-22 10:50:42 +01:00 committed by Marge Bot
parent a43783fd76
commit 62e50de5d0
2 changed files with 81 additions and 66 deletions

View file

@ -1398,16 +1398,19 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres
return did_copy;
}
void
swap_bytes_bperm(Builder& bld, Definition def, Operand op)
{
assert(def.physReg().reg() == op.physReg().reg());
uint8_t swiz[] = {4, 5, 6, 7};
std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
create_bperm(bld, swiz, def, Operand::zero());
}
void
swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
{
if (def.physReg().reg() == op.physReg().reg()) {
assert(def.bytes() != 2); /* handled by caller */
uint8_t swiz[] = {4, 5, 6, 7};
std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
create_bperm(bld, swiz, def, Operand::zero());
return;
}
assert(def.physReg().reg() != op.physReg().reg()); /* handled by caller */
if (def.bytes() == 2) {
Operand def_as_op = Operand(def.physReg(), def.regClass());
@ -1446,7 +1449,7 @@ swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
* into the same VGPR.
*/
swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
swap_bytes_bperm(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
}
}
@ -1532,6 +1535,9 @@ do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool prese
} else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
Operand::c32(2u));
} else if (def.bytes() == 1 && def.physReg().reg() == op.physReg().reg() &&
ctx->program->gfx_level >= GFX10) {
swap_bytes_bperm(bld, def, op);
} else {
assert(def.regClass().is_subdword());
if (ctx->program->gfx_level >= GFX11) {

View file

@ -29,34 +29,34 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
v128_hi.reg_b += 2;
v129_hi.reg_b += 2;
for (amd_gfx_level lvl : {GFX8, GFX9, GFX11}) {
for (amd_gfx_level lvl : {GFX8, GFX9, GFX10, GFX11}) {
if (!setup_cs(NULL, lvl))
continue;
//~gfx(8|9|11)>> p_unit_test 0
//>> p_unit_test 0
//~gfx8! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
//~gfx(9|11)! v1: %0:v[0] = v_pack_b32_f16 hi(%0:v[0][16:32]), %0:v[0][0:16]
//~gfx(9|10|11)! v1: %0:v[0] = v_pack_b32_f16 hi(%0:v[0][16:32]), %0:v[0][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v0_hi, v2b), Operand(v0_lo, v2b));
//~gfx(8|9|11)! p_unit_test 1
//! p_unit_test 1
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
//~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx(8|9|10)! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
Operand(v1_lo, v1), Operand(v0_lo, v2b));
//~gfx(8|9|11)! p_unit_test 2
//~gfx[89]! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
//~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][0:16] dst_sel:uword1 dst_preserve src0_sel:uword0
//~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//! p_unit_test 2
//~gfx(8|9|10)! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
//~gfx(8|9|10)! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][0:16] dst_sel:uword1 dst_preserve src0_sel:uword0
//~gfx(8|9|10)! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//~gfx(8|9|10)! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//~gfx(8|9|10)! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][0:16]
@ -65,39 +65,39 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
Definition(v1_hi, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b),
Operand(v0_lo, v2b));
//~gfx(8|9|11)! p_unit_test 3
//! p_unit_test 3
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0
//~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2
//~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx(8|9|10)! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0
//~gfx(8|9|10)! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2
//~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 %0:v[0][0:16]
//~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_b3, v1b),
Operand(v1_lo, v1), Operand(v0_b3, v1b));
//~gfx(8|9|11)! p_unit_test 4
//! p_unit_test 4
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
//~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
//~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx(8|9|10)! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
//~gfx(8|9|10)! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
//~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b),
Operand(v1_lo, v1), Operand(v0_lo, v1b));
//~gfx(8|9|11)! p_unit_test 5
//! p_unit_test 5
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1]
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
//~gfx(9|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
//~gfx[89]! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
//~gfx[89]! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
//~gfx(9|10|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
//~gfx(8|9|10)! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
//~gfx(8|9|10)! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060104
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x3060504
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
@ -105,35 +105,35 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
Definition(v1_lo, v1), Operand(v1_lo, v1b), Operand(v1_hi, v1b),
Operand(v0_lo, v1));
//~gfx(8|9|11)! p_unit_test 6
//! p_unit_test 6
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Definition(v1_lo, v1), Operand(v1_lo, v2b), Operand(v1_hi, v2b),
Operand(v0_lo, v1));
//~gfx(8|9|11)! p_unit_test 7
//! p_unit_test 7
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1]
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
//~gfx(9|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
//~gfx(8|9|11)! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
//~gfx(9|10|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
//! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Definition(v1_lo, v1), Operand(v1_hi, v2b), Operand(v1_lo, v2b),
Operand(v0_lo, v1));
//~gfx(8|9|11)! p_unit_test 8
//! p_unit_test 8
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
//~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
//~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
//~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx(8|9|10)! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
//~gfx(8|9|10)! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
//~gfx(8|9|10)! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
@ -141,28 +141,28 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b),
Operand(v1_lo, v3b), Operand(v0_lo, v3b));
//~gfx(8|9|11)! p_unit_test 9
//! p_unit_test 9
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx[89]! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
//~gfx(9|10|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx(8|9|10)! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
//~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x3060504
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b),
Definition(v0_b3, v1b), Operand(v1_lo, v3b), Operand(v0_lo, v3b),
Operand(v1_b3, v1b));
//~gfx(8|9|11)! p_unit_test 10
//~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
//~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
//~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
//! p_unit_test 10
//~gfx(8|9|10)! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
//~gfx(8|9|10)! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
//~gfx(8|9|10)! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
//~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
//~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi
//~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
//~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
//~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
//~gfx(8|9|10)! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
//~gfx(8|9|10)! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
//~gfx(8|9|10)! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
@ -170,27 +170,27 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Definition(v1_b1, v2b),
Operand(v1_b1, v2b), Operand(v0_b1, v2b));
//~gfx(8|9|11)! p_unit_test 11
//~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1
//! p_unit_test 11
//~gfx(8|9|10)! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1
//~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 hi(%0:v[0][16:32])
//~gfx(8|9|11)! v1: %0:v[0] = v_mov_b32 42
//! v1: %0:v[0] = v_mov_b32 42
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
Operand::c32(42u), Operand(v0_hi, v2b));
//~gfx(8|9|11)! p_unit_test 12
//! p_unit_test 12
//~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
//~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
//~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
//~gfx(10|11)! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v1b), Definition(v0_b3, v1b),
Operand(v0_b3, v1b), Operand(v0_b1, v1b));
//~gfx(8|9|11)! p_unit_test 13
//~gfx[89]! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0
//~gfx[89]! v2b: %0:v[128][0:16] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword0 dst_preserve src0_sel:uword1 src1_sel:uword0
//~gfx[89]! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0
//! p_unit_test 13
//~gfx(8|9|10)! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0
//~gfx(8|9|10)! v2b: %0:v[128][0:16] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword0 dst_preserve src0_sel:uword1 src1_sel:uword0
//~gfx(8|9|10)! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0
//~gfx11! v2b: %0:v[128][0:16] = v_xor_b16 hi(%0:v[129][16:32]), %0:v[128][0:16]
//~gfx11! v2b: %0:v[129][16:32] = v_xor_b16 hi(%0:v[129][16:32]), %0:v[128][0:16] opsel_hi
//~gfx11! v2b: %0:v[128][0:16] = v_xor_b16 hi(%0:v[129][16:32]), %0:v[128][0:16]
@ -198,10 +198,10 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v128_lo, v2b), Definition(v129_hi, v2b),
Operand(v129_hi, v2b), Operand(v128_lo, v2b));
//~gfx(8|9|11)! p_unit_test 14
//~gfx[89]! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1
//~gfx[89]! v2b: %0:v[128][16:32] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword1 dst_preserve src0_sel:uword0 src1_sel:uword1
//~gfx[89]! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1
//! p_unit_test 14
//~gfx(8|9|10)! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1
//~gfx(8|9|10)! v2b: %0:v[128][16:32] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword1 dst_preserve src0_sel:uword0 src1_sel:uword1
//~gfx(8|9|10)! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1
//~gfx11! v2b: %0:v[128][16:32] = v_xor_b16 %0:v[129][0:16], hi(%0:v[128][16:32]) opsel_hi
//~gfx11! v2b: %0:v[129][0:16] = v_xor_b16 %0:v[129][0:16], hi(%0:v[128][16:32])
//~gfx11! v2b: %0:v[128][16:32] = v_xor_b16 %0:v[129][0:16], hi(%0:v[128][16:32]) opsel_hi
@ -209,8 +209,17 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v128_hi, v2b), Definition(v129_lo, v2b),
Operand(v129_lo, v2b), Operand(v128_hi, v2b));
//! p_unit_test 15
//~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[0][16:24], %0:v[0][0:8] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte0
//~gfx[89]!v1b: %0:v[0][0:8] = v_xor_b32 %0:v[0][16:24], %0:v[0][0:8] dst_sel:ubyte0 dst_preserve src0_sel:ubyte2 src1_sel:ubyte0
//~gfx[89]!v1b: %0:v[0][16:24] = v_xor_b32 %0:v[0][16:24], %0:v[0][0:8] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte0
//~gfx(10|11)! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v0_hi, v1b),
Operand(v0_hi, v1b), Operand(v0_lo, v1b));
//~gfx11! s_sendmsg sendmsg(dealloc_vgprs)
//~gfx(8|9|11)! s_endpgm
//! s_endpgm
finish_to_hw_instr_test();
}