aco: apply extract to p_extract_vector

fossil-db (navi21):
Totals from 46 (0.06% of 79395) affected shaders:
Instrs: 80126 -> 79944 (-0.23%); split: -0.27%, +0.04%
CodeSize: 486860 -> 485668 (-0.24%); split: -0.31%, +0.06%
Latency: 1615395 -> 1614218 (-0.07%); split: -0.07%, +0.00%
InvThroughput: 705479 -> 705013 (-0.07%); split: -0.07%, +0.00%
Copies: 18934 -> 18797 (-0.72%); split: -0.98%, +0.25%
VALU: 52452 -> 52268 (-0.35%); split: -0.41%, +0.06%
SALU: 17253 -> 17255 (+0.01%); split: -0.02%, +0.03%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31762>
This commit is contained in:
Rhys Perry 2024-10-17 15:07:56 +01:00 committed by Marge Bot
parent 6cb9d39bc2
commit f1a932bc29
2 changed files with 160 additions and 5 deletions

View file

@ -1084,14 +1084,15 @@ can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_i
} else if (sel.size() == 2 && ((instr->opcode == aco_opcode::s_pack_lh_b32_b16 && idx == 0) ||
(instr->opcode == aco_opcode::s_pack_hl_b32_b16 && idx == 1))) {
return true;
} else if (instr->opcode == aco_opcode::p_extract) {
} else if (instr->opcode == aco_opcode::p_extract ||
instr->opcode == aco_opcode::p_extract_vector) {
if (ctx.program->gfx_level < GFX9 && !info.instr->operands[0].isOfType(RegType::vgpr) &&
instr->definitions[0].regClass().is_subdword())
return false;
SubdwordSel instrSel = parse_extract(instr.get());
return apply_extract_twice(sel, instr->operands[idx].getTemp(), instrSel,
instr->definitions[0].getTemp()) != SubdwordSel();
return instrSel && apply_extract_twice(sel, instr->operands[idx].getTemp(), instrSel,
instr->definitions[0].getTemp());
}
return false;
@ -1175,6 +1176,29 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
instr->operands[2] = Operand::c32(new_sel.size() * 8u);
instr->operands[3] = Operand::c32(new_sel.sign_extend());
return;
} else if (instr->opcode == aco_opcode::p_extract_vector) {
SubdwordSel instrSel = parse_extract(instr.get());
SubdwordSel new_sel = apply_extract_twice(sel, instr->operands[idx].getTemp(), instrSel,
instr->definitions[0].getTemp());
assert(new_sel.size() <= 2);
if (new_sel.size() == instr->definitions[0].bytes()) {
instr->operands[1] = Operand::c32(new_sel.offset() / instr->definitions[0].bytes());
return;
} else {
/* parse_extract() only succeeds with p_extract_vector for VGPR definitions because there
* are no sub-dword SGPR regclasses. */
assert(instr->definitions[0].regClass().type() != RegType::sgpr);
Instruction* ext = create_instruction(aco_opcode::p_extract, Format::PSEUDO, 4, 1);
ext->definitions[0] = instr->definitions[0];
ext->operands[0] = instr->operands[0];
ext->operands[1] = Operand::c32(new_sel.offset() / new_sel.size());
ext->operands[2] = Operand::c32(new_sel.size() * 8u);
ext->operands[3] = Operand::c32(new_sel.sign_extend());
ext->pass_flags = instr->pass_flags;
instr.reset(ext);
}
}
/* These are the only labels worth keeping at the moment. */
@ -3785,7 +3809,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (instr->isSDWA() || instr->isDPP())
return;
if (instr->opcode == aco_opcode::p_extract) {
if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_extract_vector) {
ssa_info& info = ctx.info[instr->operands[0].tempId()];
if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) {
apply_extract(ctx, instr, 0, info);
@ -3794,7 +3818,8 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
instr->operands[0].setTemp(info.instr->operands[0].getTemp());
}
apply_ds_extract(ctx, instr);
if (instr->opcode == aco_opcode::p_extract)
apply_ds_extract(ctx, instr);
}
/* TODO: There are still some peephole optimizations that could be done:

View file

@ -699,3 +699,133 @@ BEGIN_TEST(optimize.sdwa.subdword_extract)
finish_opt_test();
END_TEST
BEGIN_TEST(optimize.sdwa.extract_vector)
/* Optimizer test for p_extract_vector whose source operand is itself an extract of %a.
 *
 * Each case builds one p_extract_vector and passes it to writeout() with a case number. The
 * "//>>" and "//!" comments are the expected-output patterns for that case (presumably parsed
 * by the test checker, so keep their text and order exactly as they are).
 *
 * The non-folding cases below show how the helpers appear in the output: ext_ubyte(a, N) as
 * "p_extract %a, N, 8, 0" and ext_ushort(a, N) as "p_extract %a, N, 16, 0".
 */
//>> v1: %a = p_startpgm
if (!setup_cs("v1", GFX10_3))
return;
Temp a = inputs[0];
/* Cases 0-3: v1b result, element 0 of a byte extract. Expected to read byte N of %a directly. */
//! v1b: %res0 = p_extract_vector %a, 0
//! p_unit_test 0, %res0
writeout(
0, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 0), Operand::c32(0)));
//! v1b: %res1 = p_extract_vector %a, 1
//! p_unit_test 1, %res1
writeout(
1, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 1), Operand::c32(0)));
//! v1b: %res2 = p_extract_vector %a, 2
//! p_unit_test 2, %res2
writeout(
2, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 2), Operand::c32(0)));
//! v1b: %res3 = p_extract_vector %a, 3
//! p_unit_test 3, %res3
writeout(
3, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 3), Operand::c32(0)));
/* Cases 4-7: v1b result, element 0 or 1 of a 16-bit extract. The expected byte index into %a
 * is 2 * (16-bit half) + element. */
//! v1b: %res4 = p_extract_vector %a, 0
//! p_unit_test 4, %res4
writeout(
4, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 0), Operand::c32(0)));
//! v1b: %res5 = p_extract_vector %a, 2
//! p_unit_test 5, %res5
writeout(
5, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 1), Operand::c32(0)));
//! v1b: %res6 = p_extract_vector %a, 1
//! p_unit_test 6, %res6
writeout(
6, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 0), Operand::c32(1)));
//! v1b: %res7 = p_extract_vector %a, 3
//! p_unit_test 7, %res7
writeout(
7, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 1), Operand::c32(1)));
/* Cases 8-11: v1b result whose element index lies past the 8 or 16 bits selected by the inner
 * extract. Expected output keeps the inner p_extract as a separate temporary (no folding). */
//! v1: %res8_tmp = p_extract %a, 0, 8, 0
//! v1b: %res8 = p_extract_vector %res8_tmp, 1
//! p_unit_test 8, %res8
writeout(
8, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 0), Operand::c32(1)));
//! v1: %res9_tmp = p_extract %a, 0, 16, 0
//! v1b: %res9 = p_extract_vector %res9_tmp, 2
//! p_unit_test 9, %res9
writeout(
9, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 0), Operand::c32(2)));
//! v1: %res10_tmp = p_extract %a, 1, 16, 0
//! v1b: %res10 = p_extract_vector %res10_tmp, 2
//! p_unit_test 10, %res10
writeout(10, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ushort(a, 1),
Operand::c32(2)));
//! v1: %res11_tmp = p_extract %a, 1, 8, 0
//! v1b: %res11 = p_extract_vector %res11_tmp, 2
//! p_unit_test 11, %res11
writeout(
11, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), ext_ubyte(a, 1), Operand::c32(2)));
/* Cases 12-15: v2b result, element 0 of a byte extract. The result is wider than the selected
 * byte, so the expected instruction is a p_extract of byte N (8 bits, last operand 0) instead
 * of a p_extract_vector. */
//! v2b: %res12 = p_extract %a, 0, 8, 0
//! p_unit_test 12, %res12
writeout(
12, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 0), Operand::c32(0)));
//! v2b: %res13 = p_extract %a, 1, 8, 0
//! p_unit_test 13, %res13
writeout(
13, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 1), Operand::c32(0)));
//! v2b: %res14 = p_extract %a, 2, 8, 0
//! p_unit_test 14, %res14
writeout(
14, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 2), Operand::c32(0)));
//! v2b: %res15 = p_extract %a, 3, 8, 0
//! p_unit_test 15, %res15
writeout(
15, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 3), Operand::c32(0)));
/* Cases 16-17: v2b result, element 0 of a 16-bit extract. Expected to read that 16-bit half of
 * %a directly. */
//! v2b: %res16 = p_extract_vector %a, 0
//! p_unit_test 16, %res16
writeout(16, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ushort(a, 0),
Operand::c32(0)));
//! v2b: %res17 = p_extract_vector %a, 1
//! p_unit_test 17, %res17
writeout(17, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ushort(a, 1),
Operand::c32(0)));
/* Cases 18-19: v2b result, element 1 (the upper 16 bits of the inner extract's result).
 * Expected output keeps the inner p_extract as a separate temporary (no folding). */
//! v1: %res18_tmp = p_extract %a, 0, 8, 0
//! v2b: %res18 = p_extract_vector %res18_tmp, 1
//! p_unit_test 18, %res18
writeout(
18, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ubyte(a, 0), Operand::c32(1)));
//! v1: %res19_tmp = p_extract %a, 0, 16, 0
//! v2b: %res19 = p_extract_vector %res19_tmp, 1
//! p_unit_test 19, %res19
writeout(19, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), ext_ushort(a, 0),
Operand::c32(1)));
/* Cases 20-21: the source is an explicit p_extract with a v2b definition taking 16-bit half 1
 * of %a. Elements 0 and 1 of it are expected to become bytes 2 and 3 of %a. */
//! v1b: %res20 = p_extract_vector %a, 2
//! p_unit_test 20, %res20
writeout(20, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b),
bld.pseudo(aco_opcode::p_extract, bld.def(v2b), a, Operand::c32(1),
Operand::c32(16), Operand::c32(false)),
Operand::c32(0)));
//! v1b: %res21 = p_extract_vector %a, 3
//! p_unit_test 21, %res21
writeout(21, bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b),
bld.pseudo(aco_opcode::p_extract, bld.def(v2b), a, Operand::c32(1),
Operand::c32(16), Operand::c32(false)),
Operand::c32(1)));
finish_opt_test();
END_TEST