aco: Use v_mov_b16 on GFX11.

Foz-DB GFX1100:
Totals from 4684 (3.47% of 134913) affected shaders:
CodeSize: 41086444 -> 41043476 (-0.10%)
Instrs: 8176019 -> 8175995 (-0.00%)
Latency: 83792071 -> 83792023 (-0.00%)
InvThroughput: 10311371 -> 10311369 (-0.00%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20369>
This commit is contained in:
Georg Lehmann 2022-12-17 11:55:29 +01:00 committed by Marge Bot
parent c2790fe537
commit 39b7502f04
3 changed files with 49 additions and 15 deletions

View file

@ -1101,6 +1101,33 @@ create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
}
/* Emit a 16-bit VGPR copy of `op` into `dst`.
 *
 * bld: builder used to emit the instruction.
 * dst: 16-bit destination; its byte offset inside the register decides whether
 *      the destination opsel bit (0x8) is set.
 * op:  source operand, either a register or a constant.
 *
 * Constants encoded at 240 and above that are not literals are materialized
 * with "v_add_f16 op, 0" instead of a move. Every other constant is
 * sign-extended from 16 to 32 bits before being handed to v_mov_b16.
 */
void
emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
{
   /* dst is never modified below, so its half can be sampled once up front. */
   const bool dst_is_hi = dst.physReg().byte() == 2;

   if (op.isConstant()) {
      const bool is_fp_inline = !op.isLiteral() && op.physReg() >= 240;
      if (is_fp_inline) {
         /* v_add_f16 is smaller because it can use 16bit fp inline constants. */
         Instruction* add = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
         if (dst_is_hi)
            add->vop3().opsel = 0x8;
         return;
      }

      /* v_mov_b16 uses 32bit inline constants, so widen the 16bit value with
       * sign extension (e.g. 0xff00 becomes 0xffffff00). */
      const int16_t narrow = static_cast<int16_t>(op.constantValue());
      op = Operand::c32(static_cast<int32_t>(narrow));
   }

   /* The source position has to be read after the constant rewrite above. */
   const bool both_at_byte0 = dst.physReg().byte() == 0 && op.physReg().byte() == 0;
   if (both_at_byte0) {
      /* Neither side needs opsel, so the plain VOP1 form is emitted. */
      bld.vop1(aco_opcode::v_mov_b16, dst, op);
      return;
   }

   // TODO: this can use VOP1 for vgpr0-127 with assembler support
   Instruction* mov = bld.vop1_e64(aco_opcode::v_mov_b16, dst, op);
   if (op.physReg().byte() == 2)
      mov->vop3().opsel |= 0x1; /* source is at byte offset 2 */
   if (dst_is_hi)
      mov->vop3().opsel |= 0x8; /* destination is at byte offset 2 */
}
void
copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
{
@ -1166,6 +1193,8 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
} else {
bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
}
} else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
emit_v_mov_b16(bld, dst, op);
} else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
/* use v_mov_b32 to avoid possible issues with denormal flushing or
@ -1340,7 +1369,7 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres
swiz[def.physReg().byte()] = op.physReg().byte();
create_bperm(bld, swiz, def, op);
} else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
addsub_subdword_gfx11(bld, def, op, Operand::zero(), false);
emit_v_mov_b16(bld, def, op);
} else if (def.regClass().is_subdword()) {
bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
} else {
@ -1577,9 +1606,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
/* either hi or lo are already placed correctly */
if (ctx->program->gfx_level >= GFX11) {
if (lo.physReg().reg() == def.physReg().reg())
addsub_subdword_gfx11(bld, def_hi, hi, Operand::zero(), false);
emit_v_mov_b16(bld, def_hi, hi);
else
addsub_subdword_gfx11(bld, def_lo, lo, Operand::zero(), false);
emit_v_mov_b16(bld, def_lo, lo);
return;
} else if (ctx->program->gfx_level >= GFX8) {
if (lo.physReg().reg() == def.physReg().reg())

View file

@ -895,6 +895,7 @@ VOP1 = {
( -1, -1, -1, -1, -1, 0x69, "v_not_b16", False, False),
( -1, -1, -1, -1, -1, 0x6a, "v_cvt_i32_i16", False, False),
( -1, -1, -1, -1, -1, 0x6b, "v_cvt_u32_u16", False, False),
( -1, -1, -1, -1, -1, 0x1c, "v_mov_b16", True, False),
}
for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, in_mod, out_mod, cls) in default_class(VOP1, InstrClass.Valu32):
opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOP1, cls, in_mod, out_mod)

View file

@ -241,7 +241,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
//~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 opsel_hi
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
bld.pseudo(aco_opcode::p_parallelcopy,
Definition(v0_lo, v1), Definition(v1_lo, v2b),
@ -253,8 +253,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
//~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[1][16:32]), 0 opsel_hi
//~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 %0:v[0][0:16], 0 opsel_hi
//~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
//~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
@ -270,7 +270,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
//~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0
//~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2
//~gfx11! v2b: %0:v[1][0:16] = v_add_u16_e64 %0:v[0][0:16], 0
//~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 %0:v[0][0:16]
//~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
bld.pseudo(aco_opcode::p_parallelcopy,
@ -285,7 +285,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
//~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
//~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
//~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104
//~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 opsel_hi
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
bld.pseudo(aco_opcode::p_parallelcopy,
Definition(v0_lo, v1), Definition(v1_lo, v1b),
@ -386,7 +386,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
//~gfx(8|9|11)! p_unit_test 11
//~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1
//~gfx11! v2b: %0:v[1][0:16] = v_add_u16_e64 hi(%0:v[0][16:32]), 0
//~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 hi(%0:v[0][16:32])
//~gfx(8|9|11)! v1: %0:v[0] = v_mov_b32 42
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
@ -466,21 +466,23 @@ BEGIN_TEST(to_hw_instr.subdword_constant)
/* 16-bit copy */
//! p_unit_test 6
//~gfx(9|10)! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:dword
//~gfx11! v2b: %_:v[0][0:16] = v_pack_b32_f16 0.5, hi(%_:v[0][16:32])
//~gfx11! v2b: %0:v[0][0:16] = v_add_f16 0.5, 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x3800));
//! p_unit_test 7
//~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
//~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
//~gfx(10|11)! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32])
//~gfx10! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32])
//~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0x4205
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x4205));
//! p_unit_test 8
//~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
//~gfx9! v1: %_:v[0] = v_or_b32 0x42050000, %_:v[0]
//~gfx(10|11)! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205
//~gfx10! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205
//~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0x4205 opsel_hi
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0x4205));
@ -530,7 +532,8 @@ BEGIN_TEST(to_hw_instr.subdword_constant)
//>> p_unit_test 13
//~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
//~gfx9! v1: %_:v[0] = v_or_b32 0xff, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d
//~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d
//~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0xff
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b),
Operand::c16(0x00ff));
@ -538,14 +541,15 @@ BEGIN_TEST(to_hw_instr.subdword_constant)
//! p_unit_test 14
//~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
//~gfx9! v1: %_:v[0] = v_or_b32 0xff000000, %_:v[0]
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504
//~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504
//~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0xffffff00 opsel_hi
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b),
Operand::c16(0xff00));
//! p_unit_test 15
//~gfx(9|10)! v2b: %_:v[0][0:16] = v_mov_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword
//~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0c
//~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b),
Operand::zero(2));