aco: use v_perm_b32 for do_pack_2x16 on gfx10+

fossil-db (gfx1201):
Totals from 93 (0.12% of 79377) affected shaders:
Instrs: 373212 -> 372761 (-0.12%)
CodeSize: 2062752 -> 2063704 (+0.05%); split: -0.00%, +0.05%
Latency: 4172059 -> 4171993 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 1299144 -> 1299093 (-0.00%)
Copies: 51268 -> 50831 (-0.85%)
Branches: 10980 -> 10979 (-0.01%)
VALU: 220192 -> 219756 (-0.20%)
VOPD: 48 -> 47 (-2.08%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34636>
This commit is contained in:
Rhys Perry 2025-04-18 17:45:44 +01:00 committed by Marge Bot
parent 78a3b48db0
commit a43783fd76
2 changed files with 103 additions and 3 deletions

View file

@ -1595,6 +1595,40 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
return;
}
if (ctx->program->gfx_level >= GFX10 && !lo.constantEquals(0) && !hi.constantEquals(0)) {
uint8_t swiz[4];
Operand ops[2] = {lo, hi};
for (unsigned i = 0; i < 2; i++) {
ops[i] =
ops[i].isConstant() ? Operand::c32((int32_t)(int16_t)ops[i].constantValue()) : ops[i];
swiz[i * 2 + 0] = i * 4 + ops[i].physReg().byte();
swiz[i * 2 + 1] = i * 4 + ops[i].physReg().byte() + 1;
if (ops[i].isLiteral()) {
Operand b0 = Operand::c32((int32_t)(int8_t)ops[i].constantValue());
Operand b1 = Operand::c32((int32_t)(int8_t)(ops[i].constantValue() >> 8));
if (!b0.isLiteral() &&
(b1.constantValue() == 0x00 || b1.constantValue() == 0xffffffff)) {
ops[i] = b0;
swiz[i * 2 + 1] = b1.constantValue() ? 13 : 12;
} else if (!b1.isLiteral() &&
(b0.constantValue() == 0x00 || b0.constantValue() == 0xffffffff)) {
ops[i] = b1;
swiz[i * 2 + 0] = b0.constantValue() ? 13 : 12;
swiz[i * 2 + 1]--;
} else if (b0.constantValue() == b1.constantValue()) {
ops[i] = b0;
swiz[i * 2 + 1]--;
}
}
}
if (!ops[0].isLiteral() && !ops[1].isLiteral()) {
create_bperm(bld, swiz, def, ops[0], ops[1]);
return;
}
}
Definition def_lo = Definition(def.physReg(), v2b);
Definition def_hi = Definition(def.physReg().advance(2), v2b);

View file

@ -725,7 +725,7 @@ BEGIN_TEST(to_hw_instr.pack2x16_constant)
v0_hi.reg_b += 2;
v1_hi.reg_b += 2;
for (amd_gfx_level lvl : {GFX10, GFX11}) {
for (amd_gfx_level lvl : {GFX9, GFX10, GFX11}) {
if (!setup_cs(NULL, lvl))
continue;
@ -733,7 +733,9 @@ BEGIN_TEST(to_hw_instr.pack2x16_constant)
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
//>> p_unit_test 0
//! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
//~gfx9! v2b: %_:v[0][0:16] = v_lshrrev_b32 16, %_:v[1][16:32]
//~gfx9! v1: %_:v[0] = v_or_b32 0x38000000, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_hi, v2b), Operand::c16(0x3800));
@ -745,7 +747,7 @@ BEGIN_TEST(to_hw_instr.pack2x16_constant)
Operand(v1_hi, v2b), Operand::zero(2));
//! p_unit_test 2
//~gfx10! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
//~gfx(9|10)! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
//~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
@ -763,6 +765,70 @@ BEGIN_TEST(to_hw_instr.pack2x16_constant)
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand::zero(2), Operand(v1_lo, v2b));
//! p_unit_test 5
//~gfx9! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16]
//~gfx9! v1: %_:v[0] = v_or_b32 0xff10, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[1], 16, 0x5040d00
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand::c16(0xff10), Operand(v1_lo, v2b));
//! p_unit_test 6
//~gfx9! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16]
//~gfx9! v1: %_:v[0] = v_or_b32 0x1000, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[1], 16, 0x504000c
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand::c16(0x1000), Operand(v1_lo, v2b));
//! p_unit_test 7
//~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
//~gfx9! v1: %_:v[0] = v_or_b32 0xff100000, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 16, %_:v[1], 0xd040100
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_lo, v2b), Operand::c16(0xff10));
//! p_unit_test 8
//~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
//~gfx9! v1: %_:v[0] = v_or_b32 0x10ff0000, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 16, %_:v[1], 0x40d0100
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_lo, v2b), Operand::c16(0x10ff));
//! p_unit_test 9
//~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
//~gfx9! v1: %_:v[0] = v_or_b32 0x10100000, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 16, %_:v[1], 0x4040100
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_lo, v2b), Operand::c16(0x1010));
//! p_unit_test 10
//~gfx(9|10)! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
//~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16]
//! v1: %_:v[0] = v_or_b32 0x10110000, %_:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_lo, v2b), Operand::c16(0x1011));
//! p_unit_test 11
//~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
//~gfx9! v1: %_:v[0] = v_or_b32 0xfff00000, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 -16, %_:v[1], 0x5040100
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_lo, v2b), Operand::c16(0xfff0));
//! p_unit_test 12
//~gfx9! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
//~gfx9! v1: %_:v[0] = v_or_b32 0xf00000, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 -16, %_:v[1], 0xc040100
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_lo, v2b), Operand::c16(0x00f0));
//~gfx11! s_sendmsg sendmsg(dealloc_vgprs)
//! s_endpgm