mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-02-10 11:50:30 +01:00
aco/optimizer: use new helpers for v_add_u32 opts
Foz-DB Navi48: Totals from 1554 (1.89% of 82419) affected shaders: Instrs: 5154325 -> 5151499 (-0.05%); split: -0.08%, +0.02% CodeSize: 27310012 -> 27318708 (+0.03%); split: -0.01%, +0.05% VGPRs: 97236 -> 97200 (-0.04%); split: -0.05%, +0.01% Latency: 34121873 -> 34120894 (-0.00%); split: -0.02%, +0.01% InvThroughput: 6735276 -> 6730418 (-0.07%); split: -0.08%, +0.01% VClause: 130106 -> 130090 (-0.01%); split: -0.05%, +0.04% SClause: 90439 -> 90449 (+0.01%); split: -0.00%, +0.01% Copies: 382920 -> 382401 (-0.14%); split: -0.18%, +0.05% Branches: 130089 -> 130091 (+0.00%) PreSGPRs: 67745 -> 67743 (-0.00%); split: -0.01%, +0.00% PreVGPRs: 72710 -> 72674 (-0.05%) VALU: 2941866 -> 2938129 (-0.13%); split: -0.13%, +0.00% SALU: 651032 -> 651779 (+0.11%); split: -0.02%, +0.14% VOPD: 2446 -> 2393 (-2.17%); split: +0.70%, -2.86% Foz-DB Navi21: Totals from 1534 (1.86% of 82387) affected shaders: MaxWaves: 32481 -> 32479 (-0.01%) Instrs: 4732755 -> 4730039 (-0.06%); split: -0.06%, +0.00% CodeSize: 25305728 -> 25313148 (+0.03%); split: -0.00%, +0.03% VGPRs: 84424 -> 84448 (+0.03%) SpillVGPRs: 2420 -> 2419 (-0.04%) Scratch: 180224 -> 179200 (-0.57%) Latency: 36843383 -> 36846269 (+0.01%); split: -0.01%, +0.02% InvThroughput: 9252495 -> 9238142 (-0.16%); split: -0.17%, +0.02% VClause: 146629 -> 146671 (+0.03%); split: -0.02%, +0.05% SClause: 94502 -> 94512 (+0.01%); split: -0.00%, +0.01% Copies: 403672 -> 403592 (-0.02%); split: -0.09%, +0.07% Branches: 141145 -> 141137 (-0.01%) PreSGPRs: 70003 -> 70001 (-0.00%); split: -0.01%, +0.00% PreVGPRs: 70835 -> 70800 (-0.05%) VALU: 3114513 -> 3111338 (-0.10%); split: -0.10%, +0.00% SALU: 651177 -> 651925 (+0.11%); split: -0.02%, +0.13% VMEM: 271263 -> 271261 (-0.00%) Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38530>
This commit is contained in:
parent
715b9214da
commit
0359c8a901
2 changed files with 15 additions and 91 deletions
|
|
@ -3585,75 +3585,6 @@ combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2,
|
|||
return false;
|
||||
}
|
||||
|
||||
/* Creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 by folding the
 * instruction that produces one source of instr into a three-operand opcode.
 *
 * instr is treated as an OR when its opcode is v_or_b32 and as an add
 * otherwise. Returns true as soon as one of the combine attempts below
 * succeeds, and false when none of them applies.
 */
|
||||
bool
|
||||
combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
bool is_or = instr->opcode == aco_opcode::v_or_b32;
|
||||
/* Shift-combining opcode that matches the kind of instr (OR vs. add). */
aco_opcode new_op_lshl = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;
|
||||
|
||||
/* The two AND folds are only attempted for an OR; the two shift folds
 * further down are attempted for both kinds.
 */
if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32,
|
||||
"120", 1 | 2))
|
||||
return true;
|
||||
if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32,
|
||||
"120", 1 | 2))
|
||||
return true;
|
||||
if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2))
|
||||
return true;
|
||||
if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))
|
||||
return true;
|
||||
|
||||
/* The p_extract/p_insert folds below are not attempted on SDWA or DPP instructions. */
if (instr->isSDWA() || instr->isDPP())
|
||||
return false;
|
||||
|
||||
/* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
|
||||
* v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)
|
||||
* v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)
|
||||
* v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_u32(a, 24/16, b)
|
||||
*/
|
||||
/* Try each of the two sources of instr as the one to fold. */
for (unsigned i = 0; i < 2; i++) {
|
||||
Instruction* extins = follow_operand(ctx, instr->operands[i]);
|
||||
if (!extins)
|
||||
continue;
|
||||
|
||||
aco_opcode op;
|
||||
Operand operands[3];
|
||||
|
||||
/* p_insert with (operands[1] + 1) * operands[2] == 32 becomes a shift by
 * operands[1] * operands[2]; e.g. operands[1] = 3 with operands[2] = 8
 * gives 24, and operands[1] = 1 with operands[2] = 16 gives 16.
 */
if (extins->opcode == aco_opcode::p_insert &&
|
||||
(extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) {
|
||||
op = new_op_lshl;
|
||||
operands[1] =
|
||||
Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue());
|
||||
/* OR only: a p_insert, or a p_extract whose operands[3] is 0, with
 * operands[1] == 0 becomes an AND with 0xff when operands[2] is 8 and
 * with 0xffff otherwise.
 * NOTE(review): operands[3] is presumably the sign-extend flag - confirm.
 */
} else if (is_or &&
|
||||
(extins->opcode == aco_opcode::p_insert ||
|
||||
(extins->opcode == aco_opcode::p_extract &&
|
||||
extins->operands[3].constantEquals(0))) &&
|
||||
extins->operands[1].constantEquals(0)) {
|
||||
op = aco_opcode::v_and_or_b32;
|
||||
operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 0xffu : 0xffffu);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* First source comes from the folded instruction, last source is the
 * other operand of instr.
 */
operands[0] = extins->operands[0];
|
||||
operands[2] = instr->operands[!i];
|
||||
|
||||
if (!check_vop3_operands(ctx, 3, operands))
|
||||
continue;
|
||||
|
||||
/* Only clamp is carried over from instr (and only when it is VOP3);
 * neg, abs, opsel and omod are passed as zero.
 */
uint8_t neg = 0, abs = 0, opsel = 0, omod = 0;
|
||||
bool clamp = false;
|
||||
if (instr->isVOP3())
|
||||
clamp = instr->valu().clamp;
|
||||
|
||||
/* Drop one use of the temporary in operands[i] before building the
 * replacement from the new operand list.
 */
ctx.uses[instr->operands[i].tempId()]--;
|
||||
create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* v_not(v_xor(a, b)) -> v_xnor(a, b) */
|
||||
bool
|
||||
combine_not_xor(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
|
|
@ -4684,26 +4615,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
} else if (instr->opcode == aco_opcode::v_not_b32 && ctx.program->gfx_level >= GFX10) {
|
||||
combine_not_xor(ctx, instr);
|
||||
} else if (instr->opcode == aco_opcode::v_add_u32 && !instr->usesModifiers()) {
|
||||
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
|
||||
} else if (combine_add_bcnt(ctx, instr)) {
|
||||
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
|
||||
aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
|
||||
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_i32_i24,
|
||||
aco_opcode::v_mad_i32_i24, "120", 1 | 2)) {
|
||||
} else if (ctx.program->gfx_level >= GFX9) {
|
||||
if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120",
|
||||
1 | 2)) {
|
||||
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32,
|
||||
"120", 1 | 2)) {
|
||||
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32,
|
||||
"012", 1 | 2)) {
|
||||
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32,
|
||||
"012", 1 | 2)) {
|
||||
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,
|
||||
"012", 1 | 2)) {
|
||||
} else if (combine_add_or_then_and_lshl(ctx, instr)) {
|
||||
}
|
||||
}
|
||||
combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2);
|
||||
} else if ((instr->opcode == aco_opcode::v_add_co_u32 ||
|
||||
instr->opcode == aco_opcode::v_add_co_u32_e64) &&
|
||||
!instr->usesModifiers()) {
|
||||
|
|
@ -4934,6 +4846,18 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
add_opt(s_xor_b32, v_xor3_b32, 0x3, "012", nullptr, true);
|
||||
add_opt(v_not_b32, v_xnor_b32, 0x3, "01", nullptr, true);
|
||||
add_opt(s_not_b32, v_xnor_b32, 0x3, "01", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_add_u32 && !info.clamp) {
|
||||
assert(ctx.program->gfx_level >= GFX9);
|
||||
add_opt(v_bcnt_u32_b32, v_bcnt_u32_b32, 0x3, "102", remove_const_cb<0>, true);
|
||||
add_opt(v_mul_u32_u24, v_mad_u32_u24, 0x3, "120", nullptr, true);
|
||||
add_opt(v_mul_i32_i24, v_mad_i32_i24, 0x3, "120", nullptr, true);
|
||||
add_opt(v_xor_b32, v_xad_u32, 0x3, "120", nullptr, true);
|
||||
add_opt(s_xor_b32, v_xad_u32, 0x3, "120", nullptr, true);
|
||||
add_opt(v_add_u32, v_add3_u32, 0x3, "012", nullptr, true);
|
||||
add_opt(s_add_u32, v_add3_u32, 0x3, "012", nullptr, true);
|
||||
add_opt(s_add_i32, v_add3_u32, 0x3, "012", nullptr, true);
|
||||
add_opt(v_lshlrev_b32, v_lshl_add_u32, 0x3, "210", nullptr, true);
|
||||
add_opt(s_lshl_b32, v_lshl_add_u32, 0x3, "120", nullptr, true);
|
||||
}
|
||||
|
||||
if (match_and_apply_patterns(ctx, info, patterns)) {
|
||||
|
|
|
|||
|
|
@ -363,9 +363,9 @@ BEGIN_TEST(optimize.bcnt)
|
|||
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
|
||||
writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
|
||||
|
||||
//! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
|
||||
//~gfx8! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
|
||||
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
|
||||
//~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
|
||||
//~gfx(9|10)! v1: %res3 = v_bcnt_u32_b32 %b, %a
|
||||
//! p_unit_test 3, %res3
|
||||
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
|
||||
writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue