aco/optimizer: reassociate mul(mul(a, const), b) into mul_omod(a, b)

Foz-DB Navi48:
Totals from 14608 (14.96% of 97637) affected shaders:
MaxWaves: 364201 -> 364421 (+0.06%)
Instrs: 28051720 -> 28022503 (-0.10%); split: -0.13%, +0.03%
CodeSize: 148938740 -> 148943480 (+0.00%); split: -0.04%, +0.04%
VGPRs: 994520 -> 994004 (-0.05%); split: -0.05%, +0.00%
SpillSGPRs: 45182 -> 45179 (-0.01%)
Latency: 187734461 -> 187725301 (-0.00%); split: -0.07%, +0.06%
InvThroughput: 33967002 -> 33949881 (-0.05%); split: -0.11%, +0.06%
VClause: 495237 -> 495207 (-0.01%); split: -0.03%, +0.02%
Copies: 2048324 -> 2047937 (-0.02%); split: -0.12%, +0.10%
Branches: 598445 -> 598431 (-0.00%); split: -0.01%, +0.01%
PreSGPRs: 877715 -> 877684 (-0.00%)
PreVGPRs: 778146 -> 776383 (-0.23%); split: -0.23%, +0.00%
VALU: 16413380 -> 16391508 (-0.13%); split: -0.15%, +0.01%
SALU: 3685279 -> 3677655 (-0.21%); split: -0.23%, +0.02%
VOPD: 26219 -> 25926 (-1.12%); split: +0.43%, -1.55%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38730>
This commit is contained in:
Georg Lehmann 2025-11-30 14:34:05 +01:00 committed by Marge Bot
parent 125ac1626d
commit a8f5ced670

View file

@ -3991,6 +3991,53 @@ create_med3_cb(opt_ctx& ctx, alu_opt_info& info)
return false; return false;
} }
/**
 * Decides whether the multiplication described by `info` is eligible for the
 * omod reassociation patterns.
 *
 * All of the following must hold:
 *  - no output modifier is set on `info` yet,
 *  - the definition is not marked precise,
 *  - the opcode is v_mul_legacy_f32, or the definition does not have to
 *    preserve signed zeros,
 *  - the denormal mode selected by `bit_size` is "flush".
 *
 * @param ctx       Optimizer context; only its fp_mode is read.
 * @param info      Description of the candidate instruction.
 * @param bit_size  Floating point width of the instruction. 32 selects
 *                  fp_mode.denorm32, any other value selects
 *                  fp_mode.denorm16_64.
 * @return true when every condition above is satisfied.
 */
bool
can_reassoc_omod(opt_ctx& ctx, const alu_opt_info& info, unsigned bit_size)
{
   /* An existing output modifier or a precise result rules the transform out. */
   if (info.omod || info.defs[0].isPrecise())
      return false;

   /* Only v_mul_legacy_f32 is exempt from the signed zero requirement. */
   if (info.opcode != aco_opcode::v_mul_legacy_f32 && info.defs[0].isSZPreserve())
      return false;

   unsigned denorm_mode = ctx.fp_mode.denorm16_64;
   if (bit_size == 32)
      denorm_mode = ctx.fp_mode.denorm32;

   return denorm_mode == fp_denorm_flush;
}
/**
 * Pattern callback that folds a constant factor among the first two operands
 * of `info` into the output modifier field.
 *
 * The first of the two operands that is a constant (as reported by
 * op_info_get_constant for the opcode's definition type) decides the outcome:
 *
 *   |constant| == 2.0  (0.5  when is_rcp)  ->  info.omod = 1
 *   |constant| == 4.0  (0.25 when is_rcp)  ->  info.omod = 2
 *   |constant| == 0.5  (2.0  when is_rcp)  ->  info.omod = 3
 *
 * A negative constant toggles neg[0] of the other operand of the pair. On a
 * match the constant operand is erased from info.operands.
 *
 * Note that the negation toggle is applied before the magnitude is compared,
 * so it is also applied when the callback ends up returning false.
 *
 * @tparam is_rcp  Selects the reciprocal set of constants listed above.
 * @return true when a constant was folded into info.omod, false when the
 *         definition is precise, neither operand is a constant, or the first
 *         constant found is not one of the accepted factors.
 */
template <bool is_rcp>
bool
reassoc_omod_cb(opt_ctx& ctx, alu_opt_info& info)
{
   if (info.defs[0].isPrecise())
      return false;

   aco_type def_type = instr_info.alu_opcode_infos[(int)info.opcode].def_types[0];

   /* Magnitudes accepted for omod values 1, 2 and 3 respectively. */
   constexpr double factor_omod1 = is_rcp ? 0.5 : 2.0;
   constexpr double factor_omod2 = is_rcp ? 0.25 : 4.0;
   constexpr double factor_omod3 = is_rcp ? 2.0 : 0.5;

   unsigned idx = 0;
   while (idx < 2) {
      uint64_t raw_bits = 0;
      if (op_info_get_constant(ctx, info.operands[idx], def_type, &raw_bits)) {
         const double factor = extract_float(raw_bits, def_type.bit_size);

         /* Move the sign of the constant onto the remaining operand of the pair. */
         if (factor < 0.0)
            info.operands[!idx].neg[0] ^= true;

         const double magnitude = fabs(factor);
         int omod_value = 0;
         if (magnitude == factor_omod1)
            omod_value = 1;
         else if (magnitude == factor_omod2)
            omod_value = 2;
         else if (magnitude == factor_omod3)
            omod_value = 3;

         /* The first constant found is decisive; the other operand is not tried. */
         if (omod_value == 0)
            return false;

         info.omod = omod_value;
         info.operands.erase(std::next(info.operands.begin(), idx));
         return true;
      }
      idx++;
   }

   return false;
}
template <unsigned bits> template <unsigned bits>
bool bool
shift_to_mad_cb(opt_ctx& ctx, alu_opt_info& info) shift_to_mad_cb(opt_ctx& ctx, alu_opt_info& info)
@ -4293,16 +4340,38 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
} else if (info.opcode == aco_opcode::v_min_i16_e64) { } else if (info.opcode == aco_opcode::v_min_i16_e64) {
add_opt(v_min_i16_e64, v_min3_i16, 0x3, "120", nullptr, true); add_opt(v_min_i16_e64, v_min3_i16, 0x3, "120", nullptr, true);
add_opt(v_max_i16_e64, v_med3_i16, 0x3, "012", create_med3_cb<true>, true); add_opt(v_max_i16_e64, v_med3_i16, 0x3, "012", create_med3_cb<true>, true);
} else if (((info.opcode == aco_opcode::v_mul_f32 && !info.defs[0].isNaNPreserve() && } else if (info.opcode == aco_opcode::v_mul_f32 || info.opcode == aco_opcode::v_mul_legacy_f32) {
!info.defs[0].isInfPreserve()) || bool legacy = info.opcode == aco_opcode::v_mul_legacy_f32;
(info.opcode == aco_opcode::v_mul_legacy_f32 && !info.defs[0].isSZPreserve())) &&
!info.clamp && !info.omod && !ctx.fp_mode.must_flush_denorms32) { if ((legacy ? !info.defs[0].isSZPreserve()
/* v_mul_f32(a, v_cndmask_b32(0, 1.0, cond)) -> v_cndmask_b32(0, a, cond) */ : (!info.defs[0].isNaNPreserve() && !info.defs[0].isInfPreserve())) &&
add_opt(v_cndmask_b32, v_cndmask_b32, 0x3, "1032", !info.clamp && !info.omod && !ctx.fp_mode.must_flush_denorms32) {
and_cb<check_const_cb<0, 0>, remove_const_cb<0x3f800000>>, true); /* v_mul_f32(a, v_cndmask_b32(0, 1.0, cond)) -> v_cndmask_b32(0, a, cond) */
/* v_mul_f32(a, v_cndmask_b32(1.0, 0, cond)) -> v_cndmask_b32(a, 0, cond) */ add_opt(v_cndmask_b32, v_cndmask_b32, 0x3, "1032",
add_opt(v_cndmask_b32, v_cndmask_b32, 0x3, "0231", and_cb<check_const_cb<0, 0>, remove_const_cb<0x3f800000>>, true);
and_cb<check_const_cb<1, 0>, remove_const_cb<0x3f800000>>, true); /* v_mul_f32(a, v_cndmask_b32(1.0, 0, cond)) -> v_cndmask_b32(a, 0, cond) */
add_opt(v_cndmask_b32, v_cndmask_b32, 0x3, "0231",
and_cb<check_const_cb<1, 0>, remove_const_cb<0x3f800000>>, true);
}
if (can_reassoc_omod(ctx, info, 32)) {
if (legacy) {
add_opt(v_mul_f32, v_mul_legacy_f32, 0x3, "120", reassoc_omod_cb<false>, true);
add_opt(v_mul_legacy_f32, v_mul_legacy_f32, 0x3, "120", reassoc_omod_cb<false>, true);
add_opt(s_mul_f32, v_mul_legacy_f32, 0x3, "120", reassoc_omod_cb<false>, true);
} else {
add_opt(v_mul_f32, v_mul_f32, 0x3, "120", reassoc_omod_cb<false>, true);
add_opt(v_mul_legacy_f32, v_mul_f32, 0x3, "120", reassoc_omod_cb<false>, true);
add_opt(s_mul_f32, v_mul_f32, 0x3, "120", reassoc_omod_cb<false>, true);
}
}
} else if (info.opcode == aco_opcode::v_mul_f16 && can_reassoc_omod(ctx, info, 16)) {
add_opt(v_mul_f16, v_mul_f16, 0x3, "120", reassoc_omod_cb<false>, true);
add_opt(s_mul_f16, v_mul_f16, 0x3, "120", reassoc_omod_cb<false>, true);
} else if (info.opcode == aco_opcode::v_mul_f64 && can_reassoc_omod(ctx, info, 64)) {
add_opt(v_mul_f64, v_mul_f64, 0x3, "120", reassoc_omod_cb<false>, true);
} else if (info.opcode == aco_opcode::v_mul_f64_e64 && can_reassoc_omod(ctx, info, 64)) {
add_opt(v_mul_f64_e64, v_mul_f64_e64, 0x3, "120", reassoc_omod_cb<false>, true);
} else if (info.opcode == aco_opcode::v_add_u16 && !info.clamp) { } else if (info.opcode == aco_opcode::v_add_u16 && !info.clamp) {
if (ctx.program->gfx_level < GFX9) { if (ctx.program->gfx_level < GFX9) {
add_opt(v_mul_lo_u16, v_mad_legacy_u16, 0x3, "120"); add_opt(v_mul_lo_u16, v_mad_legacy_u16, 0x3, "120");