aco/optimizer: use new helpers for v_add_u32 opts

Foz-DB Navi48:
Totals from 1554 (1.89% of 82419) affected shaders:
Instrs: 5154325 -> 5151499 (-0.05%); split: -0.08%, +0.02%
CodeSize: 27310012 -> 27318708 (+0.03%); split: -0.01%, +0.05%
VGPRs: 97236 -> 97200 (-0.04%); split: -0.05%, +0.01%
Latency: 34121873 -> 34120894 (-0.00%); split: -0.02%, +0.01%
InvThroughput: 6735276 -> 6730418 (-0.07%); split: -0.08%, +0.01%
VClause: 130106 -> 130090 (-0.01%); split: -0.05%, +0.04%
SClause: 90439 -> 90449 (+0.01%); split: -0.00%, +0.01%
Copies: 382920 -> 382401 (-0.14%); split: -0.18%, +0.05%
Branches: 130089 -> 130091 (+0.00%)
PreSGPRs: 67745 -> 67743 (-0.00%); split: -0.01%, +0.00%
PreVGPRs: 72710 -> 72674 (-0.05%)
VALU: 2941866 -> 2938129 (-0.13%); split: -0.13%, +0.00%
SALU: 651032 -> 651779 (+0.11%); split: -0.02%, +0.14%
VOPD: 2446 -> 2393 (-2.17%); split: +0.70%, -2.86%

Foz-DB Navi21:
Totals from 1534 (1.86% of 82387) affected shaders:
MaxWaves: 32481 -> 32479 (-0.01%)
Instrs: 4732755 -> 4730039 (-0.06%); split: -0.06%, +0.00%
CodeSize: 25305728 -> 25313148 (+0.03%); split: -0.00%, +0.03%
VGPRs: 84424 -> 84448 (+0.03%)
SpillVGPRs: 2420 -> 2419 (-0.04%)
Scratch: 180224 -> 179200 (-0.57%)
Latency: 36843383 -> 36846269 (+0.01%); split: -0.01%, +0.02%
InvThroughput: 9252495 -> 9238142 (-0.16%); split: -0.17%, +0.02%
VClause: 146629 -> 146671 (+0.03%); split: -0.02%, +0.05%
SClause: 94502 -> 94512 (+0.01%); split: -0.00%, +0.01%
Copies: 403672 -> 403592 (-0.02%); split: -0.09%, +0.07%
Branches: 141145 -> 141137 (-0.01%)
PreSGPRs: 70003 -> 70001 (-0.00%); split: -0.01%, +0.00%
PreVGPRs: 70835 -> 70800 (-0.05%)
VALU: 3114513 -> 3111338 (-0.10%); split: -0.10%, +0.00%
SALU: 651177 -> 651925 (+0.11%); split: -0.02%, +0.13%
VMEM: 271263 -> 271261 (-0.00%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38530>
This commit is contained in:
Georg Lehmann 2024-12-14 18:56:57 +01:00 committed by Marge Bot
parent 715b9214da
commit 0359c8a901
2 changed files with 15 additions and 91 deletions

View file

@ -3585,75 +3585,6 @@ combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2,
return false;
}
/* Fuses a v_or_b32 / v_add_u32 with the instruction producing one of its
 * sources, creating v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32.
 *
 * Returns true when instr has been replaced by a fused instruction.
 */
bool
combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   const bool is_or = instr->opcode == aco_opcode::v_or_b32;
   const aco_opcode shift_op = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;

   /* The and+or fusion only makes sense for v_or_b32; try the scalar producer first. */
   if (is_or && (combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32,
                                       aco_opcode::v_and_or_b32, "120", 0x3) ||
                 combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32,
                                       aco_opcode::v_and_or_b32, "120", 0x3)))
      return true;

   /* Shift producers are tried for both the or and the add form. */
   if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, shift_op, "120", 0x3) ||
       combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, shift_op, "210", 0x3))
      return true;

   /* The p_extract/p_insert patterns below are not attempted on SDWA or DPP instructions. */
   if (instr->isSDWA() || instr->isDPP())
      return false;

   /* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
    * v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)
    * v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)
    * v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_u32(a, 24/16, b)
    */
   for (unsigned slot = 0; slot < 2; slot++) {
      Instruction* producer = follow_operand(ctx, instr->operands[slot]);
      if (producer == nullptr)
         continue;

      const bool from_insert = producer->opcode == aco_opcode::p_insert;
      const bool from_extract = producer->opcode == aco_opcode::p_extract;
      if (!from_insert && !from_extract)
         continue;

      aco_opcode fused_op;
      Operand fused_srcs[3];
      if (from_insert && (producer->operands[1].constantValue() + 1) *
                               producer->operands[2].constantValue() ==
                            32) {
         /* The inserted field ends at bit 32, so the insert is a plain left shift. */
         fused_op = shift_op;
         fused_srcs[1] = Operand::c32(producer->operands[1].constantValue() *
                                      producer->operands[2].constantValue());
      } else {
         /* Remaining candidates become a mask of the lowest field, which is only
          * combined with v_or_b32. */
         if (!is_or)
            continue;
         if (from_extract && !producer->operands[3].constantEquals(0))
            continue;
         if (!producer->operands[1].constantEquals(0))
            continue;

         fused_op = aco_opcode::v_and_or_b32;
         if (producer->operands[2].constantEquals(8))
            fused_srcs[1] = Operand::c32(0xffu);
         else
            fused_srcs[1] = Operand::c32(0xffffu);
      }

      fused_srcs[0] = producer->operands[0];
      fused_srcs[2] = instr->operands[!slot];
      if (!check_vop3_operands(ctx, 3, fused_srcs))
         continue;

      uint8_t neg = 0;
      uint8_t abs = 0;
      uint8_t opsel = 0;
      uint8_t omod = 0;
      /* Only the clamp modifier of a VOP3-encoded instr is carried over. */
      const bool clamp = instr->isVOP3() && instr->valu().clamp;

      /* instr no longer reads the producer's result through this operand. */
      ctx.uses[instr->operands[slot].tempId()]--;
      create_vop3_for_op3(ctx, fused_op, instr, fused_srcs, neg, abs, opsel, clamp, omod);
      return true;
   }

   return false;
}
/* v_not(v_xor(a, b)) -> v_xnor(a, b) */
bool
combine_not_xor(opt_ctx& ctx, aco_ptr<Instruction>& instr)
@ -4684,26 +4615,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
} else if (instr->opcode == aco_opcode::v_not_b32 && ctx.program->gfx_level >= GFX10) {
combine_not_xor(ctx, instr);
} else if (instr->opcode == aco_opcode::v_add_u32 && !instr->usesModifiers()) {
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
} else if (combine_add_bcnt(ctx, instr)) {
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_i32_i24,
aco_opcode::v_mad_i32_i24, "120", 1 | 2)) {
} else if (ctx.program->gfx_level >= GFX9) {
if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120",
1 | 2)) {
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32,
"120", 1 | 2)) {
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32,
"012", 1 | 2)) {
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32,
"012", 1 | 2)) {
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,
"012", 1 | 2)) {
} else if (combine_add_or_then_and_lshl(ctx, instr)) {
}
}
combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2);
} else if ((instr->opcode == aco_opcode::v_add_co_u32 ||
instr->opcode == aco_opcode::v_add_co_u32_e64) &&
!instr->usesModifiers()) {
@ -4934,6 +4846,18 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
add_opt(s_xor_b32, v_xor3_b32, 0x3, "012", nullptr, true);
add_opt(v_not_b32, v_xnor_b32, 0x3, "01", nullptr, true);
add_opt(s_not_b32, v_xnor_b32, 0x3, "01", nullptr, true);
} else if (info.opcode == aco_opcode::v_add_u32 && !info.clamp) {
assert(ctx.program->gfx_level >= GFX9);
add_opt(v_bcnt_u32_b32, v_bcnt_u32_b32, 0x3, "102", remove_const_cb<0>, true);
add_opt(v_mul_u32_u24, v_mad_u32_u24, 0x3, "120", nullptr, true);
add_opt(v_mul_i32_i24, v_mad_i32_i24, 0x3, "120", nullptr, true);
add_opt(v_xor_b32, v_xad_u32, 0x3, "120", nullptr, true);
add_opt(s_xor_b32, v_xad_u32, 0x3, "120", nullptr, true);
add_opt(v_add_u32, v_add3_u32, 0x3, "012", nullptr, true);
add_opt(s_add_u32, v_add3_u32, 0x3, "012", nullptr, true);
add_opt(s_add_i32, v_add3_u32, 0x3, "012", nullptr, true);
add_opt(v_lshlrev_b32, v_lshl_add_u32, 0x3, "210", nullptr, true);
add_opt(s_lshl_b32, v_lshl_add_u32, 0x3, "120", nullptr, true);
}
if (match_and_apply_patterns(ctx, info, patterns)) {

View file

@ -363,9 +363,9 @@ BEGIN_TEST(optimize.bcnt)
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
//! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
//~gfx8! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
//~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
//~gfx(9|10)! v1: %res3 = v_bcnt_u32_b32 %b, %a
//! p_unit_test 3, %res3
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));