aco: use s_pack_*_b32_b16 more in p_insert/p_extract lowering

These opcodes don't write SCC (unlike the s_lshr/s_lshl/s_bfe forms they
replace), which gives later passes more freedom to move instructions.

fossil-db (navi21):
Totals from 727 (0.92% of 79395) affected shaders:
Latency: 14943483 -> 14942704 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 3225790 -> 3225766 (-0.00%); split: -0.00%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29912>
This commit is contained in:
Rhys Perry 2024-06-06 15:32:38 +01:00 committed by Marge Bot
parent ca161a96d1
commit f842bd81ca
2 changed files with 19 additions and 8 deletions

View file

@ -2497,14 +2497,17 @@ lower_to_hw_instr(Program* program)
bool signext = !instr->operands[3].constantEquals(0);
if (dst.regClass() == s1) {
if (offset == (32 - bits)) {
bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
bld.def(s1, scc), op, Operand::c32(offset));
} else if (offset == 0 && signext && (bits == 8 || bits == 16)) {
if (offset == 0 && signext && (bits == 8 || bits == 16)) {
bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
dst, op);
} else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
} else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16 &&
!signext) {
bld.sop2(aco_opcode::s_pack_hh_b32_b16, dst, op, Operand::zero());
} else if (offset == (32 - bits)) {
bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
bld.def(s1, scc), op, Operand::c32(offset));
} else {
bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
@ -2574,7 +2577,11 @@ lower_to_hw_instr(Program* program)
bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
if (dst.regClass() == s1) {
if (offset == (32 - bits)) {
if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
} else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16) {
bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, Operand::zero(), op);
} else if (offset == (32 - bits)) {
bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
Operand::c32(offset));
} else if (offset == 0) {

View file

@ -425,7 +425,9 @@ BEGIN_TEST(to_hw_instr.extract)
//~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0
//~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i16 %_:s[1]
EXT(0, 16)
//! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16
//~gfx(7|8)_unsigned! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16
//~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_hh_b32_b16 %_:s[1], 0
//~gfx.*_signed! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16
EXT(1, 16)
#undef EXT
@ -523,9 +525,11 @@ BEGIN_TEST(to_hw_instr.insert)
INS(2, 8)
//! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 24
INS(3, 8)
//! s1: %_:s[0], s1: %_:scc = s_bfe_u32 %_:s[1], 0x100000
//~gfx(7|8)! s1: %_:s[0], s1: %_:scc = s_bfe_u32 %_:s[1], 0x100000
//~gfx(9|11)! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0
INS(0, 16)
//! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 16
//~gfx(7|8)! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 16
//~gfx(9|11)! s1: %_:s[0] = s_pack_ll_b32_b16 0, %_:s[1]
INS(1, 16)
#undef INS