From a8f5ced6702d1b6f9a257619f2e3519140c5247c Mon Sep 17 00:00:00 2001
From: Georg Lehmann
Date: Sun, 30 Nov 2025 14:34:05 +0100
Subject: [PATCH] aco/optimizer: reassociate mul(mul(a, const), b) into mul_omod(a, b)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Foz-DB Navi48:
Totals from 14608 (14.96% of 97637) affected shaders:
MaxWaves: 364201 -> 364421 (+0.06%)
Instrs: 28051720 -> 28022503 (-0.10%); split: -0.13%, +0.03%
CodeSize: 148938740 -> 148943480 (+0.00%); split: -0.04%, +0.04%
VGPRs: 994520 -> 994004 (-0.05%); split: -0.05%, +0.00%
SpillSGPRs: 45182 -> 45179 (-0.01%)
Latency: 187734461 -> 187725301 (-0.00%); split: -0.07%, +0.06%
InvThroughput: 33967002 -> 33949881 (-0.05%); split: -0.11%, +0.06%
VClause: 495237 -> 495207 (-0.01%); split: -0.03%, +0.02%
Copies: 2048324 -> 2047937 (-0.02%); split: -0.12%, +0.10%
Branches: 598445 -> 598431 (-0.00%); split: -0.01%, +0.01%
PreSGPRs: 877715 -> 877684 (-0.00%)
PreVGPRs: 778146 -> 776383 (-0.23%); split: -0.23%, +0.00%
VALU: 16413380 -> 16391508 (-0.13%); split: -0.15%, +0.01%
SALU: 3685279 -> 3677655 (-0.21%); split: -0.23%, +0.02%
VOPD: 26219 -> 25926 (-1.12%); split: +0.43%, -1.55%

Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_optimizer.cpp | 89 ++++++++++++++++++++++++++----
 1 file changed, 79 insertions(+), 10 deletions(-)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index f4bdb249e5b..0488eff8bef 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -3991,6 +3991,53 @@ create_med3_cb(opt_ctx& ctx, alu_opt_info& info)
    return false;
 }
 
+bool
+can_reassoc_omod(opt_ctx& ctx, const alu_opt_info& info, unsigned bit_size)
+{
+   unsigned denorm = bit_size == 32 ? ctx.fp_mode.denorm32 : ctx.fp_mode.denorm16_64;
+   bool no_signed_zero =
+      info.opcode == aco_opcode::v_mul_legacy_f32 || !info.defs[0].isSZPreserve();
+
+   return no_signed_zero && !info.omod && !info.defs[0].isPrecise() && denorm == fp_denorm_flush;
+}
+
+template
+bool
+reassoc_omod_cb(opt_ctx& ctx, alu_opt_info& info)
+{
+   if (info.defs[0].isPrecise())
+      return false;
+
+   aco_type type = instr_info.alu_opcode_infos[(int)info.opcode].def_types[0];
+
+   for (unsigned op_idx = 0; op_idx < 2; op_idx++) {
+      uint64_t constant = 0;
+      if (!op_info_get_constant(ctx, info.operands[op_idx], type, &constant))
+         continue;
+
+      double val = extract_float(constant, type.bit_size);
+      if (val < 0.0) {
+         info.operands[!op_idx].neg[0] ^= true;
+         val = fabs(val);
+      }
+
+      if (val == (is_rcp ? 0.5 : 2.0))
+         info.omod = 1;
+      else if (val == (is_rcp ? 0.25 : 4.0))
+         info.omod = 2;
+      else if (val == (is_rcp ? 2.0 : 0.5))
+         info.omod = 3;
+      else
+         return false;
+
+      info.operands.erase(std::next(info.operands.begin(), op_idx));
+
+      return true;
+   }
+
+   return false;
+}
+
 template
 bool
 shift_to_mad_cb(opt_ctx& ctx, alu_opt_info& info)
@@ -4293,16 +4340,38 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr)
    } else if (info.opcode == aco_opcode::v_min_i16_e64) {
       add_opt(v_min_i16_e64, v_min3_i16, 0x3, "120", nullptr, true);
       add_opt(v_max_i16_e64, v_med3_i16, 0x3, "012", create_med3_cb, true);
-   } else if (((info.opcode == aco_opcode::v_mul_f32 && !info.defs[0].isNaNPreserve() &&
-                !info.defs[0].isInfPreserve()) ||
-               (info.opcode == aco_opcode::v_mul_legacy_f32 && !info.defs[0].isSZPreserve())) &&
-              !info.clamp && !info.omod && !ctx.fp_mode.must_flush_denorms32) {
-      /* v_mul_f32(a, v_cndmask_b32(0, 1.0, cond)) -> v_cndmask_b32(0, a, cond) */
-      add_opt(v_cndmask_b32, v_cndmask_b32, 0x3, "1032",
-              and_cb, remove_const_cb<0x3f800000>>, true);
-      /* v_mul_f32(a, v_cndmask_b32(1.0, 0, cond)) -> v_cndmask_b32(a, 0, cond) */
-      add_opt(v_cndmask_b32, v_cndmask_b32, 0x3, "0231",
-              and_cb, remove_const_cb<0x3f800000>>, true);
+   } else if (info.opcode == aco_opcode::v_mul_f32 || info.opcode == aco_opcode::v_mul_legacy_f32) {
+      bool legacy = info.opcode == aco_opcode::v_mul_legacy_f32;
+
+      if ((legacy ? !info.defs[0].isSZPreserve()
+                  : (!info.defs[0].isNaNPreserve() && !info.defs[0].isInfPreserve())) &&
+          !info.clamp && !info.omod && !ctx.fp_mode.must_flush_denorms32) {
+         /* v_mul_f32(a, v_cndmask_b32(0, 1.0, cond)) -> v_cndmask_b32(0, a, cond) */
+         add_opt(v_cndmask_b32, v_cndmask_b32, 0x3, "1032",
+                 and_cb, remove_const_cb<0x3f800000>>, true);
+         /* v_mul_f32(a, v_cndmask_b32(1.0, 0, cond)) -> v_cndmask_b32(a, 0, cond) */
+         add_opt(v_cndmask_b32, v_cndmask_b32, 0x3, "0231",
+                 and_cb, remove_const_cb<0x3f800000>>, true);
+      }
+
+      if (can_reassoc_omod(ctx, info, 32)) {
+         if (legacy) {
+            add_opt(v_mul_f32, v_mul_legacy_f32, 0x3, "120", reassoc_omod_cb, true);
+            add_opt(v_mul_legacy_f32, v_mul_legacy_f32, 0x3, "120", reassoc_omod_cb, true);
+            add_opt(s_mul_f32, v_mul_legacy_f32, 0x3, "120", reassoc_omod_cb, true);
+         } else {
+            add_opt(v_mul_f32, v_mul_f32, 0x3, "120", reassoc_omod_cb, true);
+            add_opt(v_mul_legacy_f32, v_mul_f32, 0x3, "120", reassoc_omod_cb, true);
+            add_opt(s_mul_f32, v_mul_f32, 0x3, "120", reassoc_omod_cb, true);
+         }
+      }
+   } else if (info.opcode == aco_opcode::v_mul_f16 && can_reassoc_omod(ctx, info, 16)) {
+      add_opt(v_mul_f16, v_mul_f16, 0x3, "120", reassoc_omod_cb, true);
+      add_opt(s_mul_f16, v_mul_f16, 0x3, "120", reassoc_omod_cb, true);
+   } else if (info.opcode == aco_opcode::v_mul_f64 && can_reassoc_omod(ctx, info, 64)) {
+      add_opt(v_mul_f64, v_mul_f64, 0x3, "120", reassoc_omod_cb, true);
+   } else if (info.opcode == aco_opcode::v_mul_f64_e64 && can_reassoc_omod(ctx, info, 64)) {
+      add_opt(v_mul_f64_e64, v_mul_f64_e64, 0x3, "120", reassoc_omod_cb, true);
    } else if (info.opcode == aco_opcode::v_add_u16 && !info.clamp) {
       if (ctx.program->gfx_level < GFX9) {
          add_opt(v_mul_lo_u16, v_mad_legacy_u16, 0x3, "120");