mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 11:18:08 +02:00
aco: Use v_mov_b16 on GFX11.
Foz-DB GFX1100:
Totals from 4684 (3.47% of 134913) affected shaders:
CodeSize: 41086444 -> 41043476 (-0.10%)
Instrs: 8176019 -> 8175995 (-0.00%)
Latency: 83792071 -> 83792023 (-0.00%)
InvThroughput: 10311371 -> 10311369 (-0.00%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20369>
This commit is contained in:
parent
c2790fe537
commit
39b7502f04
3 changed files with 49 additions and 15 deletions
|
|
@ -1101,6 +1101,33 @@ create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
|
|||
bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
|
||||
}
|
||||
|
||||
void
|
||||
emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
|
||||
{
|
||||
/* v_mov_b16 uses 32bit inline constants. */
|
||||
if (op.isConstant()) {
|
||||
if (!op.isLiteral() && op.physReg() >= 240) {
|
||||
/* v_add_f16 is smaller because it can use 16bit fp inline constants. */
|
||||
Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
|
||||
if (dst.physReg().byte() == 2)
|
||||
instr->vop3().opsel = 0x8;
|
||||
return;
|
||||
}
|
||||
op = Operand::c32((int32_t)(int16_t)op.constantValue());
|
||||
}
|
||||
|
||||
if (!dst.physReg().byte() && !op.physReg().byte()) {
|
||||
bld.vop1(aco_opcode::v_mov_b16, dst, op);
|
||||
} else {
|
||||
// TODO: this can use VOP1 for vgpr0-127 with assembler support
|
||||
Instruction* instr = bld.vop1_e64(aco_opcode::v_mov_b16, dst, op);
|
||||
if (op.physReg().byte() == 2)
|
||||
instr->vop3().opsel |= 0x1;
|
||||
if (dst.physReg().byte() == 2)
|
||||
instr->vop3().opsel |= 0x8;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
|
||||
{
|
||||
|
|
@ -1166,6 +1193,8 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
|
|||
} else {
|
||||
bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
|
||||
}
|
||||
} else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
|
||||
emit_v_mov_b16(bld, dst, op);
|
||||
} else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
|
||||
if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
|
||||
/* use v_mov_b32 to avoid possible issues with denormal flushing or
|
||||
|
|
@ -1340,7 +1369,7 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres
|
|||
swiz[def.physReg().byte()] = op.physReg().byte();
|
||||
create_bperm(bld, swiz, def, op);
|
||||
} else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
|
||||
addsub_subdword_gfx11(bld, def, op, Operand::zero(), false);
|
||||
emit_v_mov_b16(bld, def, op);
|
||||
} else if (def.regClass().is_subdword()) {
|
||||
bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
|
||||
} else {
|
||||
|
|
@ -1577,9 +1606,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
|
|||
/* either hi or lo are already placed correctly */
|
||||
if (ctx->program->gfx_level >= GFX11) {
|
||||
if (lo.physReg().reg() == def.physReg().reg())
|
||||
addsub_subdword_gfx11(bld, def_hi, hi, Operand::zero(), false);
|
||||
emit_v_mov_b16(bld, def_hi, hi);
|
||||
else
|
||||
addsub_subdword_gfx11(bld, def_lo, lo, Operand::zero(), false);
|
||||
emit_v_mov_b16(bld, def_lo, lo);
|
||||
return;
|
||||
} else if (ctx->program->gfx_level >= GFX8) {
|
||||
if (lo.physReg().reg() == def.physReg().reg())
|
||||
|
|
|
|||
|
|
@ -895,6 +895,7 @@ VOP1 = {
|
|||
( -1, -1, -1, -1, -1, 0x69, "v_not_b16", False, False),
|
||||
( -1, -1, -1, -1, -1, 0x6a, "v_cvt_i32_i16", False, False),
|
||||
( -1, -1, -1, -1, -1, 0x6b, "v_cvt_u32_u16", False, False),
|
||||
( -1, -1, -1, -1, -1, 0x1c, "v_mov_b16", True, False),
|
||||
}
|
||||
for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, in_mod, out_mod, cls) in default_class(VOP1, InstrClass.Valu32):
|
||||
opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOP1, cls, in_mod, out_mod)
|
||||
|
|
|
|||
|
|
@ -241,7 +241,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
//~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 opsel_hi
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy,
|
||||
Definition(v0_lo, v1), Definition(v1_lo, v2b),
|
||||
|
|
@ -253,8 +253,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
//~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
|
||||
//~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
|
||||
//~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
|
||||
//~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[1][16:32]), 0 opsel_hi
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 %0:v[0][0:16], 0 opsel_hi
|
||||
//~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
|
||||
//~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
|
||||
|
|
@ -270,7 +270,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
//~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0
|
||||
//~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2
|
||||
//~gfx11! v2b: %0:v[1][0:16] = v_add_u16_e64 %0:v[0][0:16], 0
|
||||
//~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 %0:v[0][0:16]
|
||||
//~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy,
|
||||
|
|
@ -285,7 +285,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
//~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
|
||||
//~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
|
||||
//~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 opsel_hi
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy,
|
||||
Definition(v0_lo, v1), Definition(v1_lo, v1b),
|
||||
|
|
@ -386,7 +386,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
|
||||
//~gfx(8|9|11)! p_unit_test 11
|
||||
//~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1
|
||||
//~gfx11! v2b: %0:v[1][0:16] = v_add_u16_e64 hi(%0:v[0][16:32]), 0
|
||||
//~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 hi(%0:v[0][16:32])
|
||||
//~gfx(8|9|11)! v1: %0:v[0] = v_mov_b32 42
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
|
||||
|
|
@ -466,21 +466,23 @@ BEGIN_TEST(to_hw_instr.subdword_constant)
|
|||
/* 16-bit copy */
|
||||
//! p_unit_test 6
|
||||
//~gfx(9|10)! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:dword
|
||||
//~gfx11! v2b: %_:v[0][0:16] = v_pack_b32_f16 0.5, hi(%_:v[0][16:32])
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_add_f16 0.5, 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x3800));
|
||||
|
||||
//! p_unit_test 7
|
||||
//~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
|
||||
//~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
|
||||
//~gfx(10|11)! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32])
|
||||
//~gfx10! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32])
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0x4205
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x4205));
|
||||
|
||||
//! p_unit_test 8
|
||||
//~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
|
||||
//~gfx9! v1: %_:v[0] = v_or_b32 0x42050000, %_:v[0]
|
||||
//~gfx(10|11)! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205
|
||||
//~gfx10! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205
|
||||
//~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0x4205 opsel_hi
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0x4205));
|
||||
|
||||
|
|
@ -530,7 +532,8 @@ BEGIN_TEST(to_hw_instr.subdword_constant)
|
|||
//>> p_unit_test 13
|
||||
//~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
|
||||
//~gfx9! v1: %_:v[0] = v_or_b32 0xff, %_:v[0]
|
||||
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d
|
||||
//~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0xff
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b),
|
||||
Operand::c16(0x00ff));
|
||||
|
|
@ -538,14 +541,15 @@ BEGIN_TEST(to_hw_instr.subdword_constant)
|
|||
//! p_unit_test 14
|
||||
//~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
|
||||
//~gfx9! v1: %_:v[0] = v_or_b32 0xff000000, %_:v[0]
|
||||
//~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504
|
||||
//~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504
|
||||
//~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0xffffff00 opsel_hi
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b),
|
||||
Operand::c16(0xff00));
|
||||
|
||||
//! p_unit_test 15
|
||||
//~gfx(9|10)! v2b: %_:v[0][0:16] = v_mov_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword
|
||||
//~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0c
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b),
|
||||
Operand::zero(2));
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue