From a43783fd7642356cf0e35fa2a7b42d718eb87196 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 18 Apr 2025 17:45:44 +0100 Subject: [PATCH] aco: use v_perm_b32 for do_pack_2x16 on gfx10+ fossil-db (gfx1201); Totals from 93 (0.12% of 79377) affected shaders: Instrs: 373212 -> 372761 (-0.12%) CodeSize: 2062752 -> 2063704 (+0.05%); split: -0.00%, +0.05% Latency: 4172059 -> 4171993 (-0.00%); split: -0.00%, +0.00% InvThroughput: 1299144 -> 1299093 (-0.00%) Copies: 51268 -> 50831 (-0.85%) Branches: 10980 -> 10979 (-0.01%) VALU: 220192 -> 219756 (-0.20%) VOPD: 48 -> 47 (-2.08%) Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 34 ++++++++++ src/amd/compiler/tests/test_to_hw_instr.cpp | 72 ++++++++++++++++++++- 2 files changed, 103 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 145d3577ef1..125d51f0c06 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1595,6 +1595,40 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera return; } + if (ctx->program->gfx_level >= GFX10 && !lo.constantEquals(0) && !hi.constantEquals(0)) { + uint8_t swiz[4]; + Operand ops[2] = {lo, hi}; + for (unsigned i = 0; i < 2; i++) { + ops[i] = + ops[i].isConstant() ? Operand::c32((int32_t)(int16_t)ops[i].constantValue()) : ops[i]; + + swiz[i * 2 + 0] = i * 4 + ops[i].physReg().byte(); + swiz[i * 2 + 1] = i * 4 + ops[i].physReg().byte() + 1; + + if (ops[i].isLiteral()) { + Operand b0 = Operand::c32((int32_t)(int8_t)ops[i].constantValue()); + Operand b1 = Operand::c32((int32_t)(int8_t)(ops[i].constantValue() >> 8)); + if (!b0.isLiteral() && + (b1.constantValue() == 0x00 || b1.constantValue() == 0xffffffff)) { + ops[i] = b0; + swiz[i * 2 + 1] = b1.constantValue() ? 13 : 12; + } else if (!b1.isLiteral() && + (b0.constantValue() == 0x00 || b0.constantValue() == 0xffffffff)) { + ops[i] = b1; + swiz[i * 2 + 0] = b0.constantValue() ? 13 : 12; + swiz[i * 2 + 1]--; + } else if (b0.constantValue() == b1.constantValue()) { + ops[i] = b0; + swiz[i * 2 + 1]--; + } + } + } + if (!ops[0].isLiteral() && !ops[1].isLiteral()) { + create_bperm(bld, swiz, def, ops[0], ops[1]); + return; + } + } + Definition def_lo = Definition(def.physReg(), v2b); Definition def_hi = Definition(def.physReg().advance(2), v2b); diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index eb26b97ed5f..010b3d7e571 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -725,7 +725,7 @@ BEGIN_TEST(to_hw_instr.pack2x16_constant) v0_hi.reg_b += 2; v1_hi.reg_b += 2; - for (amd_gfx_level lvl : {GFX10, GFX11}) { + for (amd_gfx_level lvl : {GFX9, GFX10, GFX11}) { if (!setup_cs(NULL, lvl)) continue; @@ -733,7 +733,9 @@ BEGIN_TEST(to_hw_instr.pack2x16_constant) program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; //>> p_unit_test 0 - //! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2 + //~gfx9! v2b: %_:v[0][0:16] = v_lshrrev_b32 16, %_:v[1][16:32] + //~gfx9! v1: %_:v[0] = v_or_b32 0x38000000, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2 bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), Operand(v1_hi, v2b), Operand::c16(0x3800)); @@ -745,7 +747,7 @@ BEGIN_TEST(to_hw_instr.pack2x16_constant) Operand(v1_hi, v2b), Operand::zero(2)); //! p_unit_test 2 - //~gfx10! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16] + //~gfx(9|10)! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16] //~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), @@ -763,6 +765,70 @@ BEGIN_TEST(to_hw_instr.pack2x16_constant) bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), Operand::zero(2), Operand(v1_lo, v2b)); + //! p_unit_test 5 + //~gfx9! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16] + //~gfx9! v1: %_:v[0] = v_or_b32 0xff10, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[1], 16, 0x5040d00 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand::c16(0xff10), Operand(v1_lo, v2b)); + + //! p_unit_test 6 + //~gfx9! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16] + //~gfx9! v1: %_:v[0] = v_or_b32 0x1000, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[1], 16, 0x504000c + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand::c16(0x1000), Operand(v1_lo, v2b)); + + //! p_unit_test 7 + //~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16] + //~gfx9! v1: %_:v[0] = v_or_b32 0xff100000, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 16, %_:v[1], 0xd040100 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand(v1_lo, v2b), Operand::c16(0xff10)); + + //! p_unit_test 8 + //~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16] + //~gfx9! v1: %_:v[0] = v_or_b32 0x10ff0000, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 16, %_:v[1], 0x40d0100 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand(v1_lo, v2b), Operand::c16(0x10ff)); + + //! p_unit_test 9 + //~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16] + //~gfx9! v1: %_:v[0] = v_or_b32 0x10100000, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 16, %_:v[1], 0x4040100 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand(v1_lo, v2b), Operand::c16(0x1010)); + + //! p_unit_test 10 + //~gfx(9|10)! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16] + //~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16] + //! v1: %_:v[0] = v_or_b32 0x10110000, %_:v[0] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand(v1_lo, v2b), Operand::c16(0x1011)); + + //! p_unit_test 11 + //~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16] + //~gfx9! v1: %_:v[0] = v_or_b32 0xfff00000, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 -16, %_:v[1], 0x5040100 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand(v1_lo, v2b), Operand::c16(0xfff0)); + + //! p_unit_test 12 + //~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16] + //~gfx9! v1: %_:v[0] = v_or_b32 0xf00000, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 -16, %_:v[1], 0xc040100 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand(v1_lo, v2b), Operand::c16(0x00f0)); + //~gfx11! s_sendmsg sendmsg(dealloc_vgprs) //! s_endpgm