aco: move try_merge_break_with_continue() to lower_branches()

Totals from 3 (0.00% of 79395) affected shaders: (Navi31)

Instrs: 12888 -> 12882 (-0.05%)
Latency: 83253 -> 83246 (-0.01%)
InvThroughput: 9251 -> 9249 (-0.02%)
Branches: 483 -> 480 (-0.62%)
SALU: 1329 -> 1326 (-0.23%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32477>
This commit is contained in:
Daniel Schürmann 2024-11-21 11:40:55 +01:00 committed by Marge Bot
parent 13ad3db43f
commit 12656ea5f5
2 changed files with 109 additions and 0 deletions

View file

@ -196,6 +196,7 @@ public:
s_wqm = (unsigned) aco_opcode::s_wqm_b64,
s_and_saveexec = (unsigned) aco_opcode::s_and_saveexec_b64,
s_or_saveexec = (unsigned) aco_opcode::s_or_saveexec_b64,
s_andn2_wrexec = (unsigned) aco_opcode::s_andn2_wrexec_b64,
s_xnor = (unsigned) aco_opcode::s_xnor_b64,
s_xor = (unsigned) aco_opcode::s_xor_b64,
s_bcnt1_i32 = (unsigned) aco_opcode::s_bcnt1_i32_b64,
@ -339,6 +340,8 @@ public:
return aco_opcode::s_and_saveexec_b32;
case s_or_saveexec:
return aco_opcode::s_or_saveexec_b32;
case s_andn2_wrexec:
return aco_opcode::s_andn2_wrexec_b32;
case s_xnor:
return aco_opcode::s_xnor_b32;
case s_xor:

View file

@ -134,6 +134,109 @@ try_remove_simple_block(branch_ctx& ctx, Block& block)
block.instructions.clear();
}
void
try_merge_break_with_continue(branch_ctx& ctx, Block& block)
{
   /* Fold a loop-break test into the following loop-continue block.
    *
    * Pattern that is matched (`block` is BB2):
    *    BB1:
    *       ...
    *       p_cbranch_z exec BB3, BB2
    *    BB2:
    *       ...
    *       s[0:1], scc = s_andn2 s[0:1], exec
    *       s_cbranch_scc0 BB4
    *    BB3:
    *       exec = s_mov_b64 s[0:1]
    *       s_branch BB1
    *    BB4:
    *       ...
    *
    * Result after the rewrite:
    *    BB1:
    *       ...
    *       p_cbranch_z exec BB3, BB2
    *    BB2:
    *       ...
    *    BB3:
    *       s[0:1], scc, exec = s_andn2_wrexec s[0:1], exec
    *       s_cbranch_scc1 BB1, BB4
    *    BB4:
    *       ...
    *
    * The function leaves everything untouched unless every part of the
    * pattern is found.
    */

   /* The break block needs two linear successors and at least the mask
    * update followed by the conditional branch.
    */
   if (block.linear_succs.size() != 2)
      return;
   if (block.instructions.size() < 2)
      return;
   if (block.instructions.back()->opcode != aco_opcode::s_cbranch_scc0)
      return;

   auto& all_blocks = ctx.program->blocks;
   Block& continue_blk = all_blocks[block.linear_succs[0]];
   Block& exit_blk = all_blocks[block.linear_succs[1]];

   /* The continue block may only jump to a single target (the loop header in
    * the pattern above).
    */
   if (continue_blk.linear_succs.size() != 1)
      return;

   /* The exit block becomes the fallthrough of the continue block, so every
    * block placed between the two has to be empty.
    */
   for (unsigned idx = continue_blk.index + 1; idx < exit_blk.index; ++idx) {
      if (!all_blocks[idx].instructions.empty())
         return;
   }

   /* Every other predecessor of the continue block has to end in a branch on
    * exec being zero, otherwise the rewritten block would corrupt exec.
    */
   for (unsigned pred_idx : continue_blk.linear_preds) {
      if (pred_idx != block.index) {
         Instruction* pred_term = all_blocks[pred_idx].instructions.back().get();
         const bool branches_on_exec_zero = pred_term->opcode == aco_opcode::p_cbranch_z &&
                                            pred_term->operands[0].physReg() == exec;
         if (!branches_on_exec_zero)
            return;
      }
   }

   /* Continue block layout: exactly one copy to exec followed by s_branch. */
   if (continue_blk.instructions.size() != 2)
      return;
   if (continue_blk.instructions.back()->opcode != aco_opcode::s_branch)
      return;

   Builder bld(ctx.program);

   Instruction* exec_copy = continue_blk.instructions[0].get();
   if (exec_copy->opcode != bld.w64or32(Builder::s_mov))
      return;
   if (!exec_copy->writes_exec())
      return;

   /* The instruction right before the break branch must be
    * "mask = s_andn2 mask, exec", where mask is the register copied to exec.
    */
   Instruction* mask_update = block.instructions[block.instructions.size() - 2].get();
   if (mask_update->opcode != bld.w64or32(Builder::s_andn2))
      return;
   const PhysReg mask_reg = exec_copy->operands[0].physReg();
   if (mask_update->definitions[0].physReg() != mask_reg)
      return;
   if (mask_update->operands[0].physReg() != mask_reg)
      return;
   if (mask_update->operands[1].physReg() != exec)
      return;

   /* Pattern confirmed. Rewire the linear CFG first: the break block keeps
    * only the continue block as successor, the continue block gains the exit
    * block as its first successor, and the exit block's predecessor entry for
    * the break block is redirected to the continue block.
    */
   block.linear_succs.pop_back();
   block.linear_succs[0] = continue_blk.index;
   continue_blk.linear_succs.push_back(exit_blk.index);
   std::swap(continue_blk.linear_succs[0], continue_blk.linear_succs[1]);
   std::replace(exit_blk.linear_preds.begin(), exit_blk.linear_preds.end(), block.index,
                continue_blk.index);

   /* Drop the break branch and make the continue block's branch conditional. */
   block.instructions.pop_back();
   continue_blk.instructions.back()->opcode = aco_opcode::s_cbranch_scc1;

   /* mask_update is now the last instruction of the break block. */
   if (ctx.program->gfx_level >= GFX9) {
      /* Replace the copy to exec by a single s_andn2_wrexec that also performs
       * the mask update.
       */
      Instruction* fused = bld.sop1(Builder::s_andn2_wrexec, mask_update->definitions[0],
                                    mask_update->definitions[1], Definition(exec, bld.lm),
                                    mask_update->operands[0], mask_update->operands[1]);
      continue_blk.instructions[0].reset(fused);
   } else {
      /* Otherwise move the s_andn2 in front of the copy to exec. */
      continue_blk.instructions.emplace(continue_blk.instructions.begin(),
                                        std::move(block.instructions.back()));
   }

   /* Remove the s_andn2 (or its moved-from slot) from the break block. */
   block.instructions.pop_back();

   /* The continue block now reads exec in its first instruction. */
   ctx.blocks_incoming_exec_used[continue_blk.index] = true;
}
void
eliminate_useless_exec_writes_in_block(branch_ctx& ctx, Block& block)
{
@ -358,6 +461,9 @@ lower_branches(Program* program)
lower_branch_instruction(ctx, block);
eliminate_useless_exec_writes_in_block(ctx, block);
if (block.kind & block_kind_break)
try_merge_break_with_continue(ctx, block);
if (block.linear_succs.size() == 1)
try_remove_simple_block(ctx, block);
}