From f1a932bc295ecb552cf6cec2a7d0593595bcebbb Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 17 Oct 2024 15:07:56 +0100 Subject: [PATCH] aco: apply extract to p_extract_vector fossil-db (navi21): Totals from 46 (0.06% of 79395) affected shaders: Instrs: 80126 -> 79944 (-0.23%); split: -0.27%, +0.04% CodeSize: 486860 -> 485668 (-0.24%); split: -0.31%, +0.06% Latency: 1615395 -> 1614218 (-0.07%); split: -0.07%, +0.00% InvThroughput: 705479 -> 705013 (-0.07%); split: -0.07%, +0.00% Copies: 18934 -> 18797 (-0.72%); split: -0.98%, +0.25% VALU: 52452 -> 52268 (-0.35%); split: -0.41%, +0.06% SALU: 17253 -> 17255 (+0.01%); split: -0.02%, +0.03% Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 35 ++++++-- src/amd/compiler/tests/test_sdwa.cpp | 130 +++++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 5 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 70f333cc049..2224c7f5e9e 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1084,14 +1084,15 @@ can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_i } else if (sel.size() == 2 && ((instr->opcode == aco_opcode::s_pack_lh_b32_b16 && idx == 0) || (instr->opcode == aco_opcode::s_pack_hl_b32_b16 && idx == 1))) { return true; - } else if (instr->opcode == aco_opcode::p_extract) { + } else if (instr->opcode == aco_opcode::p_extract || + instr->opcode == aco_opcode::p_extract_vector) { if (ctx.program->gfx_level < GFX9 && !info.instr->operands[0].isOfType(RegType::vgpr) && instr->definitions[0].regClass().is_subdword()) return false; SubdwordSel instrSel = parse_extract(instr.get()); - return apply_extract_twice(sel, instr->operands[idx].getTemp(), instrSel, - instr->definitions[0].getTemp()) != SubdwordSel(); + return instrSel && apply_extract_twice(sel, instr->operands[idx].getTemp(), instrSel, + instr->definitions[0].getTemp()); } return false; @@ 
-1175,6 +1176,29 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& instr->operands[2] = Operand::c32(new_sel.size() * 8u); instr->operands[3] = Operand::c32(new_sel.sign_extend()); return; + } else if (instr->opcode == aco_opcode::p_extract_vector) { + SubdwordSel instrSel = parse_extract(instr.get()); + SubdwordSel new_sel = apply_extract_twice(sel, instr->operands[idx].getTemp(), instrSel, + instr->definitions[0].getTemp()); + assert(new_sel.size() <= 2); + + if (new_sel.size() == instr->definitions[0].bytes()) { + instr->operands[1] = Operand::c32(new_sel.offset() / instr->definitions[0].bytes()); + return; + } else { + /* parse_extract() only succeeds with p_extract_vector for VGPR definitions because there + * are no sub-dword SGPR regclasses. */ + assert(instr->definitions[0].regClass().type() != RegType::sgpr); + + Instruction* ext = create_instruction(aco_opcode::p_extract, Format::PSEUDO, 4, 1); + ext->definitions[0] = instr->definitions[0]; + ext->operands[0] = instr->operands[0]; + ext->operands[1] = Operand::c32(new_sel.offset() / new_sel.size()); + ext->operands[2] = Operand::c32(new_sel.size() * 8u); + ext->operands[3] = Operand::c32(new_sel.sign_extend()); + ext->pass_flags = instr->pass_flags; + instr.reset(ext); + } } /* These are the only labels worth keeping at the moment. 
*/ @@ -3785,7 +3809,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) if (instr->isSDWA() || instr->isDPP()) return; - if (instr->opcode == aco_opcode::p_extract) { + if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_extract_vector) { ssa_info& info = ctx.info[instr->operands[0].tempId()]; if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) { apply_extract(ctx, instr, 0, info); @@ -3794,7 +3818,8 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) instr->operands[0].setTemp(info.instr->operands[0].getTemp()); } - apply_ds_extract(ctx, instr); + if (instr->opcode == aco_opcode::p_extract) + apply_ds_extract(ctx, instr); } /* TODO: There are still some peephole optimizations that could be done: diff --git a/src/amd/compiler/tests/test_sdwa.cpp b/src/amd/compiler/tests/test_sdwa.cpp index a7fe358dec0..708ca9e0601 100644 --- a/src/amd/compiler/tests/test_sdwa.cpp +++ b/src/amd/compiler/tests/test_sdwa.cpp @@ -699,3 +699,133 @@ BEGIN_TEST(optimize.sdwa.subdword_extract) finish_opt_test(); END_TEST + +BEGIN_TEST(optimize.sdwa.extract_vector) + //>> v1: %a = p_startpgm + if (!setup_cs("v1", GFX10_3)) + return; + + Temp a = inputs[0]; + + //! v1b: %res0 = p_extract_vector %a, 0 + //! p_unit_test 0, %res0 + writeout( + 0, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 0), Operand::c32(0))); + + //! v1b: %res1 = p_extract_vector %a, 1 + //! p_unit_test 1, %res1 + writeout( + 1, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 1), Operand::c32(0))); + + //! v1b: %res2 = p_extract_vector %a, 2 + //! p_unit_test 2, %res2 + writeout( + 2, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 2), Operand::c32(0))); + + //! v1b: %res3 = p_extract_vector %a, 3 + //! p_unit_test 3, %res3 + writeout( + 3, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 3), Operand::c32(0))); + + //! v1b: %res4 = p_extract_vector %a, 0 + //! 
p_unit_test 4, %res4 + writeout( + 4, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 0), Operand::c32(0))); + + //! v1b: %res5 = p_extract_vector %a, 2 + //! p_unit_test 5, %res5 + writeout( + 5, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 1), Operand::c32(0))); + + //! v1b: %res6 = p_extract_vector %a, 1 + //! p_unit_test 6, %res6 + writeout( + 6, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 0), Operand::c32(1))); + + //! v1b: %res7 = p_extract_vector %a, 3 + //! p_unit_test 7, %res7 + writeout( + 7, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 1), Operand::c32(1))); + + //! v1: %res8_tmp = p_extract %a, 0, 8, 0 + //! v1b: %res8 = p_extract_vector %res8_tmp, 1 + //! p_unit_test 8, %res8 + writeout( + 8, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 0), Operand::c32(1))); + + //! v1: %res9_tmp = p_extract %a, 0, 16, 0 + //! v1b: %res9 = p_extract_vector %res9_tmp, 2 + //! p_unit_test 9, %res9 + writeout( + 9, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 0), Operand::c32(2))); + + //! v1: %res10_tmp = p_extract %a, 1, 16, 0 + //! v1b: %res10 = p_extract_vector %res10_tmp, 2 + //! p_unit_test 10, %res10 + writeout(10, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 1), + Operand::c32(2))); + + //! v1: %res11_tmp = p_extract %a, 1, 8, 0 + //! v1b: %res11 = p_extract_vector %res11_tmp, 2 + //! p_unit_test 11, %res11 + writeout( + 11, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 1), Operand::c32(2))); + + //! v2b: %res12 = p_extract %a, 0, 8, 0 + //! p_unit_test 12, %res12 + writeout( + 12, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 0), Operand::c32(0))); + + //! v2b: %res13 = p_extract %a, 1, 8, 0 + //! p_unit_test 13, %res13 + writeout( + 13, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 1), Operand::c32(0))); + + //! 
v2b: %res14 = p_extract %a, 2, 8, 0 + //! p_unit_test 14, %res14 + writeout( + 14, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 2), Operand::c32(0))); + + //! v2b: %res15 = p_extract %a, 3, 8, 0 + //! p_unit_test 15, %res15 + writeout( + 15, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 3), Operand::c32(0))); + + //! v2b: %res16 = p_extract_vector %a, 0 + //! p_unit_test 16, %res16 + writeout(16, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ushort(a, 0), + Operand::c32(0))); + + //! v2b: %res17 = p_extract_vector %a, 1 + //! p_unit_test 17, %res17 + writeout(17, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ushort(a, 1), + Operand::c32(0))); + + //! v1: %res18_tmp = p_extract %a, 0, 8, 0 + //! v2b: %res18 = p_extract_vector %res18_tmp, 1 + //! p_unit_test 18, %res18 + writeout( + 18, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 0), Operand::c32(1))); + + //! v1: %res19_tmp = p_extract %a, 0, 16, 0 + //! v2b: %res19 = p_extract_vector %res19_tmp, 1 + //! p_unit_test 19, %res19 + writeout(19, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ushort(a, 0), + Operand::c32(1))); + + //! v1b: %res20 = p_extract_vector %a, 2 + //! p_unit_test 20, %res20 + writeout(20, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), + bld.pseudo(aco_opcode::p_extract, bld.def(v2b), a, Operand::c32(1), + Operand::c32(16), Operand::c32(false)), + Operand::c32(0))); + + //! v1b: %res21 = p_extract_vector %a, 3 + //! p_unit_test 21, %res21 + writeout(21, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), + bld.pseudo(aco_opcode::p_extract, bld.def(v2b), a, Operand::c32(1), + Operand::c32(16), Operand::c32(false)), + Operand::c32(1))); + + finish_opt_test(); +END_TEST