From ca6ef505ff9b37e4ef8d947ed23fa4f2e44c02f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Thu, 30 Sep 2021 14:32:07 +0200 Subject: [PATCH] aco/optimizer: Skip SDWA on v_lshlrev when unnecessary in apply_extract. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the following cases: - lower 16 bits are extracted and the shift amount is 16 or more - lower 8 bits are extracted and the shift amount is 24 or more the undesirable upper bits are already shifted out, and therefore there is no need to add SDWA to the v_lshlrev instruction. Fossil DB stats on Sienna Cichlid with NGGC on: Totals from 58239 (45.27% of 128647) affected shaders: CodeSize: 153498624 -> 153265616 (-0.15%); split: -0.15%, +0.00% Instrs: 29636304 -> 29578064 (-0.20%); split: -0.20%, +0.00% Latency: 136931496 -> 136876379 (-0.04%); split: -0.04%, +0.00% InvThroughput: 21134367 -> 21078861 (-0.26%); split: -0.26%, +0.00% Copies: 2777550 -> 2777548 (-0.00%); split: -0.00%, +0.00% Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 6773b96eacc..ad70ca15503 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -854,6 +854,11 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& SubdwordSel sel = parse_extract(info.instr); assert(sel); + instr->operands[idx].set16bit(false); + instr->operands[idx].set24bit(false); + + ctx.info[tmp.id()].label &= ~label_insert; + if (sel.size() == 4) { /* full dword selection */ } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) { @@ -863,6 +868,12 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; case 3: 
instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; } + } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() && + sel.offset() == 0 && + ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) || + (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) { + /* The undesireable upper bits are already shifted out. */ + return; } else if (can_use_SDWA(ctx.program->chip_class, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) { to_SDWA(ctx, instr); @@ -872,10 +883,6 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& instr->vop3().opsel |= 1 << idx; } - instr->operands[idx].set16bit(false); - instr->operands[idx].set24bit(false); - - ctx.info[tmp.id()].label &= ~label_insert; /* label_vopc seems to be the only one worth keeping at the moment */ for (Definition& def : instr->definitions) ctx.info[def.tempId()].label &= label_vopc;