From faba30a8f37ab2cd5855e0fcc35d9f15c7ec1ee9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?=
Date: Sat, 6 Aug 2022 09:40:44 +0200
Subject: [PATCH] aco/optimizer: Optimize p_extract + v_mul_u32_u24 to v_mad_u32_u16.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This should perform the same but removes SDWA from the address
calculations in NGG culling shaders for example. This is done
because SDWA is no longer available on GFX11.

Fossil DB stats on GFX1100:

Totals from 36 (0.03% of 134913) affected shaders:
CodeSize: 300968 -> 300884 (-0.03%); split: -0.04%, +0.01%
Instrs: 60955 -> 60863 (-0.15%); split: -0.15%, +0.00%
Latency: 426809 -> 426819 (+0.00%); split: -0.06%, +0.06%
InvThroughput: 39076 -> 39025 (-0.13%); split: -0.14%, +0.01%
VClause: 1440 -> 1443 (+0.21%)
Copies: 5714 -> 5725 (+0.19%)

Fossil DB stats on GFX1100 with NGG culling enabled:

Totals from 60953 (45.18% of 134913) affected shaders:
VGPRs: 2273172 -> 2273160 (-0.00%)
CodeSize: 186401864 -> 186403036 (+0.00%); split: -0.00%, +0.00%
Instrs: 37038048 -> 36977353 (-0.16%); split: -0.16%, +0.00%
Latency: 146466770 -> 146350172 (-0.08%); split: -0.08%, +0.00%
InvThroughput: 15342790 -> 15228585 (-0.74%); split: -0.74%, +0.00%
VClause: 669662 -> 669665 (+0.00%)
Copies: 2972380 -> 2972482 (+0.00%); split: -0.01%, +0.01%

Signed-off-by: Timur Kristóf
Reviewed-by: Georg Lehmann
Part-of:
---
 src/amd/compiler/aco_optimizer.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index e73ea603346..62df72faf1c 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1106,6 +1106,11 @@ can_apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_i
               ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
                (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
       return true;
+   } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
+              !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
+              (instr->operands[!idx].is16bit() ||
+               instr->operands[!idx].constantValue() <= UINT16_MAX)) {
+      return true;
    } else if (idx < 2 && can_use_SDWA(ctx.program->gfx_level, instr, true) &&
               (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
       if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword)
@@ -1162,6 +1167,18 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info&
                (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
       /* The undesireable upper bits are already shifted out. */
       return;
+   } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
+              !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
+              (instr->operands[!idx].is16bit() ||
+               instr->operands[!idx].constantValue() <= UINT16_MAX)) {
+      Instruction* mad =
+         create_instruction(aco_opcode::v_mad_u32_u16, Format::VOP3, 3, 1);
+      mad->definitions[0] = instr->definitions[0];
+      mad->operands[0] = instr->operands[0];
+      mad->operands[1] = instr->operands[1];
+      mad->operands[2] = Operand::zero();
+      mad->vop3().opsel = (sel.offset() / 2) << idx;
+      instr.reset(mad);
    } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
               (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
       to_SDWA(ctx, instr);