mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 09:38:07 +02:00
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c) if 'b' is a constant (less than or equal to 6, to avoid creating literals) and 'a' is known to be a 16-bit or a 24-bit value. On GFX9+, this is already optimized to v_lshl_add_u32.

fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
This commit is contained in:
parent
eaef1f2127
commit
d9e4504b0d
3 changed files with 84 additions and 2 deletions
|
|
@ -1475,7 +1475,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
} else if (dst.regClass() == v2) {
|
||||
emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
|
||||
} else if (dst.regClass() == s1) {
|
||||
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
|
||||
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
|
||||
} else if (dst.regClass() == s2) {
|
||||
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -2682,6 +2682,42 @@ bool combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
return false;
|
||||
}
|
||||
|
||||
/* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c) */
|
||||
bool combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (instr->usesModifiers())
|
||||
return false;
|
||||
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
Instruction *op_instr = follow_operand(ctx, instr->operands[i]);
|
||||
if (!op_instr)
|
||||
continue;
|
||||
|
||||
if (op_instr->opcode != aco_opcode::s_lshl_b32)
|
||||
continue;
|
||||
|
||||
if (op_instr->operands[1].isConstant() &&
|
||||
op_instr->operands[1].constantValue() <= 6 && /* no literals */
|
||||
(op_instr->operands[0].is24bit() ||
|
||||
op_instr->operands[0].is16bit())) {
|
||||
uint32_t multiplier = 1 << op_instr->operands[1].constantValue();
|
||||
|
||||
ctx.uses[instr->operands[i].tempId()]--;
|
||||
|
||||
aco_ptr<VOP3A_instruction> new_instr{create_instruction<VOP3A_instruction>(aco_opcode::v_mad_u32_u24, Format::VOP3A, 3, 1)};
|
||||
new_instr->operands[0] = op_instr->operands[0];
|
||||
new_instr->operands[1] = Operand(multiplier);
|
||||
new_instr->operands[2] = instr->operands[!i];
|
||||
new_instr->definitions[0] = instr->definitions[0];
|
||||
instr = std::move(new_instr);
|
||||
ctx.info[instr->definitions[0].tempId()].label = 0;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: we could possibly move the whole label_instruction pass to combine_instruction:
|
||||
// this would mean that we'd have to fix the instruction uses while value propagation
|
||||
|
||||
|
|
@ -2925,7 +2961,8 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
|
|||
bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0;
|
||||
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ;
|
||||
else if (!carry_out && combine_add_bcnt(ctx, instr)) ;
|
||||
else if (!carry_out) combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2);
|
||||
else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2)) ;
|
||||
else if (!carry_out) combine_add_lshl(ctx, instr);
|
||||
} else if (instr->opcode == aco_opcode::v_sub_u32 ||
|
||||
instr->opcode == aco_opcode::v_sub_co_u32 ||
|
||||
instr->opcode == aco_opcode::v_sub_co_u32_e64) {
|
||||
|
|
|
|||
|
|
@ -329,6 +329,51 @@ BEGIN_TEST(optimize.add_lshl)
|
|||
Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
|
||||
writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
|
||||
|
||||
//~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3
|
||||
//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
|
||||
//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
|
||||
//! p_unit_test 2, %res2
|
||||
Temp lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand(3u));
|
||||
writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
||||
|
||||
//~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7
|
||||
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
|
||||
//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
|
||||
//! p_unit_test 3, %res3
|
||||
Operand a_24bit = Operand(inputs[0]);
|
||||
a_24bit.set24bit(true);
|
||||
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand(7u));
|
||||
writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
||||
|
||||
//! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3
|
||||
//~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
|
||||
//~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
|
||||
//! p_unit_test 4, %carry
|
||||
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand(3u));
|
||||
Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
|
||||
writeout(4, carry);
|
||||
|
||||
//~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a
|
||||
//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
|
||||
//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
|
||||
//! p_unit_test 5, %res5
|
||||
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit);
|
||||
writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
||||
|
||||
//~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
|
||||
//~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
|
||||
//! p_unit_test 6, %res6
|
||||
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand(3u));
|
||||
writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
||||
|
||||
//~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
|
||||
//~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
|
||||
//! p_unit_test 7, %res7
|
||||
Operand a_16bit = Operand(inputs[0]);
|
||||
a_16bit.set16bit(true);
|
||||
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand(4u));
|
||||
writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
||||
|
||||
finish_opt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue