From 40bfb088285a4ffecedbf22742c241900e477d73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Fri, 16 Oct 2020 15:12:28 +0200 Subject: [PATCH] aco: refactor GFX6_7 subdword copy lowering The new code uses alignbyte which leads to shorter code and preserves the operand's registers. Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 34 +++++----- src/amd/compiler/tests/test_to_hw_instr.cpp | 69 ++++++++------------- 2 files changed, 43 insertions(+), 60 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index ca8865b96ea..fd3f93110db 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1022,30 +1022,28 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool if (op.physReg().byte()) { assert(def.physReg().byte() == 0); bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand(op.physReg().byte() * 8), op); - } else if (def.physReg().byte() == 2) { + } else if (def.physReg().byte()) { assert(op.physReg().byte() == 0); /* preserve the target's lower half */ - def = Definition(def.physReg().advance(-2), v1); - bld.vop2(aco_opcode::v_and_b32, Definition(op.physReg(), v1), Operand(0xFFFFu), op); - if (def.physReg().reg() != op.physReg().reg()) - bld.vop2(aco_opcode::v_and_b32, def, Operand(0xFFFFu), Operand(def.physReg(), v2b)); - bld.vop2(aco_opcode::v_cvt_pk_u16_u32, def, Operand(def.physReg(), v2b), op); - } else if (def.physReg().byte()) { - unsigned bits = def.physReg().byte() * 8; - assert(op.physReg().byte() == 0); - def = Definition(def.physReg().advance(-def.physReg().byte()), v1); - bld.vop2(aco_opcode::v_and_b32, def, Operand((1 << bits) - 1u), Operand(def.physReg(), op.regClass())); + uint32_t bits = def.physReg().byte() * 8; + PhysReg lo_reg = PhysReg(def.physReg().reg()); + Definition lo_half = Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte())); + Definition dst = Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes())); + if (def.physReg().reg() == op.physReg().reg()) { - if (bits < 24) { - bld.vop2(aco_opcode::v_mul_u32_u24, def, Operand((1 << bits) + 1u), op); - } else { + bld.vop2(aco_opcode::v_and_b32, lo_half, Operand((1 << bits) - 1u), Operand(lo_reg, lo_half.regClass())); + if (def.physReg().byte() == 1) { + bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand((1 << bits) + 1u), op); + } else if (def.physReg().byte() == 2) { + bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op); + } else if (def.physReg().byte() == 3) { bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u)); - bld.vop3(aco_opcode::v_mul_lo_u32, def, Operand(scratch_sgpr, s1), op); + bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op); } } else { - bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op); - bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op); - bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op); + lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte())); + bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand(32 - bits), Operand(lo_reg, lo_half.regClass())); + bld.vop3(aco_opcode::v_alignbyte_b32, dst, op, Operand(lo_half.physReg(), lo_half.regClass()), Operand(4 - def.physReg().byte())); } } else { bld.vop1(aco_opcode::v_mov_b32, def, op); diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index 0fe8e168aee..ed5c999631b 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -57,9 +57,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Operand(v1_lo, v2b), Operand(v0_lo, v2b)); //~gfx[67]! p_unit_test 1 - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16] - //~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16] - //~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[0][0:16] + //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16] + //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2 //~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1] bld.pseudo(aco_opcode::p_unit_test, Operand(1u)); bld.pseudo(aco_opcode::p_create_vector, @@ -67,9 +66,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Operand(v1_lo, v2b), Operand(v0_lo, v2b)); //~gfx[67]! p_unit_test 2 - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16] - //~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16] - //~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[0][0:16] + //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16] + //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2 //~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1] //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand(2u)); @@ -78,9 +76,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Operand(v0_lo, v2b), Operand(v2_lo, v2b)); //~gfx[67]! p_unit_test 3 - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16] - //~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16] - //~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[0][0:16] + //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16] + //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2 //~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1] //~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:16] //~gfx[67]! v1: %0:v[3] = v_and_b32 0xffff, %0:v[3][0:16] @@ -92,12 +89,10 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Operand(v2_lo, v2b), Operand(v3_lo, v2b)); //~gfx[67]! p_unit_test 4 - //~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:16] - //~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16] - //~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[2][0:16] - //~gfx[67]! v1: %0:v[3] = v_and_b32 0xffff, %0:v[3][0:16] - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16] - //~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[3][0:16] + //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16] + //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[2][0:16], %0:v[1][16:32], 2 + //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16] + //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:16], %0:v[0][16:32], 2 //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] @@ -157,10 +152,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Operand(v1_lo, v1b), Operand(v0_lo, v1b)); //~gfx[67]! p_unit_test 10 - //~gfx[67]! v1: %0:v[1] = v_and_b32 0xff, %0:v[1][0:8] - //~gfx[67]! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0][0:8] - //~gfx[67]! v1: %0:v[1] = v_or_b32 %0:v[1][0:8], %0:v[0][0:8] - //~gfx[67]! v1: %0:v[0] = v_lshrrev_b32 8, %0:v[0][0:8] + //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8] + //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3 //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand(10u)); bld.pseudo(aco_opcode::p_create_vector, @@ -168,32 +161,24 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Operand(v1_lo, v1b), Operand(v0_lo, v1b)); //~gfx[67]! p_unit_test 11 - //~gfx[67]! v1: %0:v[1] = v_and_b32 0xff, %0:v[1][0:8] - //~gfx[67]! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0][0:8] - //~gfx[67]! v1: %0:v[1] = v_or_b32 %0:v[1][0:8], %0:v[0][0:8] - //~gfx[67]! v1: %0:v[0] = v_lshrrev_b32 8, %0:v[0][0:8] + //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8] + //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3 //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16] - //~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:8] - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16] - //~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[2][0:8] + //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16] + //~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2 bld.pseudo(aco_opcode::p_unit_test, Operand(11u)); bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v3b), Operand(v1_lo, v1b), Operand(v0_lo, v1b), Operand(v2_lo, v1b)); //~gfx[67]! p_unit_test 12 - //~gfx[67]! v1: %0:v[1] = v_and_b32 0xff, %0:v[1][0:8] - //~gfx[67]! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0][0:8] - //~gfx[67]! v1: %0:v[1] = v_or_b32 %0:v[1][0:8], %0:v[0][0:8] - //~gfx[67]! v1: %0:v[0] = v_lshrrev_b32 8, %0:v[0][0:8] + //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8] + //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3 //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16] - //~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:8] - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16] - //~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[2][0:8] - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xffffff, %0:v[0][0:8] - //~gfx[67]! v1: %0:v[3] = v_lshlrev_b32 24, %0:v[3][0:8] - //~gfx[67]! v1: %0:v[0] = v_or_b32 %0:v[0][0:8], %0:v[3][0:8] - //~gfx[67]! v1: %0:v[3] = v_lshrrev_b32 24, %0:v[3][0:8] + //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16] + //~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2 + //~gfx[67]! v3b: %0:v[0][8:32] = v_lshlrev_b32 8, %0:v[0][0:24] + //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:8], %0:v[0][8:32], 1 bld.pseudo(aco_opcode::p_unit_test, Operand(12u)); bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), @@ -201,11 +186,11 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Operand(v2_lo, v1b), Operand(v3_lo, v1b)); //~gfx[67]! p_unit_test 13 - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xff, %0:v[0][0:8] - //~gfx[67]! v1: %0:v[0] = v_mul_u32_u24 0x101, %0:v[0][0:8] - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:8] - //~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8] - //~gfx[67]! v1: %0:v[0] = v_and_b32 0xffffff, %0:v[0][0:8] + //~gfx[67]! v1b: %0:v[0][0:8] = v_and_b32 0xff, %0:v[0][0:8] + //~gfx[67]! v2b: %0:v[0][0:16] = v_mul_u32_u24 0x101, %0:v[0][0:8] + //~gfx[67]! v2b: %0:v[0][0:16] = v_and_b32 0xffff, %0:v[0][0:16] + //~gfx[67]! v3b: %0:v[0][0:24] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8] + //~gfx[67]! v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24] //~gfx[67]! s1: %0:m0 = s_mov_b32 0x1000001 //~gfx[67]! v1: %0:v[0] = v_mul_lo_u32 %0:m0, %0:v[0][0:8] bld.pseudo(aco_opcode::p_unit_test, Operand(13u));