From 0e4d4aeef79a1e5ba53c7f086f5f824a40dba3ad Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Sat, 1 Mar 2025 11:26:02 +0100 Subject: [PATCH] aco/optimizer: add some bitop combining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foz-DB Navi48: Totals from 53 (0.06% of 82419) affected shaders: Instrs: 172843 -> 172769 (-0.04%); split: -0.06%, +0.01% CodeSize: 937308 -> 936924 (-0.04%); split: -0.04%, +0.00% Latency: 454652 -> 454823 (+0.04%); split: -0.01%, +0.05% InvThroughput: 89833 -> 89812 (-0.02%); split: -0.06%, +0.03% PreSGPRs: 2926 -> 2929 (+0.10%) PreVGPRs: 2920 -> 2919 (-0.03%); split: -0.07%, +0.03% VALU: 76638 -> 76556 (-0.11%) SALU: 37856 -> 37859 (+0.01%); split: -0.01%, +0.01% VOPD: 10943 -> 10936 (-0.06%) Foz-DB Navi21: Totals from 59 (0.07% of 82387) affected shaders: Instrs: 1047744 -> 1047578 (-0.02%) CodeSize: 5641948 -> 5640780 (-0.02%) Latency: 5116816 -> 5116957 (+0.00%); split: -0.00%, +0.01% InvThroughput: 1274035 -> 1274023 (-0.00%); split: -0.00%, +0.00% VClause: 30744 -> 30745 (+0.00%) PreSGPRs: 3329 -> 3333 (+0.12%) PreVGPRs: 4130 -> 4129 (-0.02%); split: -0.05%, +0.02% VALU: 689731 -> 689562 (-0.02%) SALU: 162830 -> 162833 (+0.00%); split: -0.00%, +0.00% Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 496692f244d..f1991b6e7fe 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -4428,6 +4428,9 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) } else if (info.opcode == aco_opcode::v_add_u32 && !info.clamp) { assert(ctx.program->gfx_level >= GFX9); add_opt(v_bcnt_u32_b32, v_bcnt_u32_b32, 0x3, "102", remove_const_cb<0>, true); + add_opt(s_bcnt1_i32_b32, v_bcnt_u32_b32, 0x3, "10", nullptr, true); + add_opt(v_mbcnt_lo_u32_b32, v_mbcnt_lo_u32_b32, 0x3, "102", remove_const_cb<0>, true); + 
add_opt(v_mbcnt_hi_u32_b32_e64, v_mbcnt_hi_u32_b32_e64, 0x3, "102", remove_const_cb<0>, true); add_opt(v_mad_u32_u16, v_mad_u32_u16, 0x3, "1203", remove_const_cb<0>, true); add_opt(v_mul_u32_u24, v_mad_u32_u24, 0x3, "120", nullptr, true); add_opt(v_mul_i32_i24, v_mad_i32_i24, 0x3, "120", nullptr, true); @@ -4450,6 +4453,13 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) if (ctx.uses[info.defs[1].tempId()] == 0) { add_opt(v_bcnt_u32_b32, v_bcnt_u32_b32, 0x3, "102", and_cb<remove_const_cb<0>, pop_def_cb>); + add_opt(s_bcnt1_i32_b32, v_bcnt_u32_b32, 0x3, "10", pop_def_cb); + add_opt(v_mbcnt_lo_u32_b32, v_mbcnt_lo_u32_b32, 0x3, "102", + and_cb<remove_const_cb<0>, pop_def_cb>); + add_opt(v_mbcnt_hi_u32_b32, v_mbcnt_hi_u32_b32, 0x3, "102", + and_cb<remove_const_cb<0>, pop_def_cb>); + add_opt(v_mbcnt_hi_u32_b32_e64, v_mbcnt_hi_u32_b32_e64, 0x3, "102", + and_cb<remove_const_cb<0>, pop_def_cb>); add_opt(v_mul_u32_u24, v_mad_u32_u24, 0x3, "120", pop_def_cb); add_opt(v_mul_i32_i24, v_mad_i32_i24, 0x3, "120", pop_def_cb); add_opt(v_lshlrev_b32, v_mad_u32_u24, 0x3, "210",