From ca161a96d1d9a06bdfa4e32e9903ca1a3a34332a Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Thu, 6 Jun 2024 15:25:13 +0100
Subject: [PATCH] aco: combine extracts into s_pack_ll_b32_b16

fossil-db (navi21):
Totals from 3 (0.00% of 79395) affected shaders:
Instrs: 45941 -> 45924 (-0.04%)
CodeSize: 241768 -> 241756 (-0.00%)
Latency: 176501 -> 176491 (-0.01%)
Copies: 6884 -> 6882 (-0.03%)
SALU: 6101 -> 6088 (-0.21%)

Signed-off-by: Rhys Perry
Reviewed-by: Georg Lehmann
Part-of:
---
 src/amd/compiler/aco_optimizer.cpp        | 18 +++++++-
 src/amd/compiler/tests/test_optimizer.cpp | 50 +++++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index ef90bed4f2a..542c17750b4 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1050,6 +1050,13 @@ can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_i
    } else if (instr->isVALU() && sel.size() == 2 && !instr->valu().opsel[idx] &&
               can_use_opsel(ctx.program->gfx_level, instr->opcode, idx)) {
       return true;
+   } else if (instr->opcode == aco_opcode::s_pack_ll_b32_b16 && sel.size() == 2 &&
+              (idx == 1 || ctx.program->gfx_level >= GFX11 || !sel.offset())) {
+      return true;
+   } else if (sel.size() == 2 &&
+              ((instr->opcode == aco_opcode::s_pack_lh_b32_b16 && idx == 0) ||
+               (instr->opcode == aco_opcode::s_pack_hl_b32_b16 && idx == 1))) {
+      return true;
    } else if (instr->opcode == aco_opcode::p_extract) {
       SubdwordSel instrSel = parse_extract(instr.get());
 
@@ -1124,6 +1131,13 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
              !info.instr->operands[0].isOfType(RegType::vgpr))
             instr->format = asVOP3(instr->format);
       }
+   } else if (instr->opcode == aco_opcode::s_pack_ll_b32_b16) {
+      if (sel.offset())
+         instr->opcode = idx ? aco_opcode::s_pack_lh_b32_b16 : aco_opcode::s_pack_hl_b32_b16;
+   } else if (instr->opcode == aco_opcode::s_pack_lh_b32_b16 ||
+              instr->opcode == aco_opcode::s_pack_hl_b32_b16) {
+      if (sel.offset())
+         instr->opcode = aco_opcode::s_pack_hh_b32_b16;
    } else if (instr->opcode == aco_opcode::p_extract) {
       SubdwordSel instrSel = parse_extract(instr.get());
 
@@ -3784,7 +3798,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
       return;
 
-   if (instr->isVALU()) {
+   if (instr->isVALU() || instr->isSALU()) {
       /* Apply SDWA. Do this after label_instruction() so it can remove
        * label_extract if not all instructions can take SDWA. */
       for (unsigned i = 0; i < instr->operands.size(); i++) {
@@ -3811,7 +3825,9 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
             instr->operands[i].setTemp(info.instr->operands[0].getTemp());
          }
       }
+   }
 
+   if (instr->isVALU()) {
       if (can_apply_sgprs(ctx, instr))
          apply_sgprs(ctx, instr);
       combine_mad_mix(ctx, instr);
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index c5230075829..32cd0379132 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -1977,3 +1977,53 @@ BEGIN_TEST(optimize.vinterp_inreg_output_modifiers)
 
    finish_opt_test();
 END_TEST
+
+BEGIN_TEST(optimize.s_pack)
+   //>> s1: %a, s1: %b, s1: %c = p_startpgm
+   if (!setup_cs("s1 s1 s1", GFX11))
+      return;
+
+   Temp lo = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[1],
+                        Operand::c32(0), Operand::c32(16u), Operand::c32(false));
+   Temp hi = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[2],
+                        Operand::c32(1), Operand::c32(16u), Operand::c32(false));
+
+   //! s1: %res0 = s_pack_lh_b32_b16 %b, %c
+   //! p_unit_test 0, %res0
+   writeout(0, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), lo, hi));
+
+   //! s1: %res1 = s_pack_ll_b32_b16 %b, %b
+   //! p_unit_test 1, %res1
+   writeout(1, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), lo, lo));
+
+   //! s1: %res2 = s_pack_hl_b32_b16 %c, %b
+   //! p_unit_test 2, %res2
+   writeout(2, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), hi, lo));
+
+   //! s1: %res3 = s_pack_hh_b32_b16 %c, %c
+   //! p_unit_test 3, %res3
+   writeout(3, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), hi, hi));
+
+   lo = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[1], Operand::c32(0),
+                   Operand::c32(16u), Operand::c32(false));
+   hi = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[2], Operand::c32(1),
+                   Operand::c32(16u), Operand::c32(false));
+
+   //! s1: %res4 = s_pack_ll_b32_b16 %a, %b
+   //! p_unit_test 4, %res4
+   writeout(4, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), inputs[0], lo));
+
+   //! s1: %res5 = s_pack_lh_b32_b16 %a, %c
+   //! p_unit_test 5, %res5
+   writeout(5, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), inputs[0], hi));
+
+   //! s1: %res6 = s_pack_ll_b32_b16 %b, %a
+   //! p_unit_test 6, %res6
+   writeout(6, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), lo, inputs[0]));
+
+   //! s1: %res7 = s_pack_hl_b32_b16 %c, %a
+   //! p_unit_test 7, %res7
+   writeout(7, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), hi, inputs[0]));
+
+   finish_opt_test();
+END_TEST