From 1bb39be75e2a912b050eaa56eabe4bb3b77b2df4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?=
Date: Thu, 13 Feb 2025 09:16:05 +0100
Subject: [PATCH] aco/insert_exec_mask: Don't immediately set exec to zero in
 break/continue blocks

Instead, only indicate that exec should be zero and do so in the
succeeding helper block. This allows inserting the parallelcopies from
logical phis directly before the branch in break and continue blocks.

Totals from 56 (0.07% of 79377) affected shaders: (Navi31)
Latency: 2472367 -> 2472422 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 253053 -> 253055 (+0.00%); split: -0.00%, +0.00%

Cc: mesa-stable
Part-of:

(cherry picked from commit 7f7c1d463a11459b7159b5455920ab0a00923ba7)
---
 .pick_status.json                         |  2 +-
 src/amd/compiler/aco_insert_exec_mask.cpp | 35 ++++++++++++-----------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 069f82608bc..f09c079bd1f 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -2254,7 +2254,7 @@
         "description": "aco/insert_exec_mask: Don't immediately set exec to zero in break/continue blocks",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 734385c5e47..7b3d2b26e08 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -300,6 +300,21 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
    } else if (preds.size() == 1) {
       ctx.info[idx].exec = ctx.info[preds[0]].exec;
+
+      /* After continue and break blocks, we implicitly set exec to zero.
+       * This is so that parallelcopies can be inserted before the branch
+       * without being affected by the changed exec mask.
+       */
+      if (ctx.info[idx].exec.back().op.constantEquals(0)) {
+         assert(block->logical_succs.empty());
+         /* Check whether the successor block already restores exec. */
+         uint16_t block_kind = ctx.program->blocks[block->linear_succs[0]].kind;
+         if (!(block_kind & (block_kind_loop_header | block_kind_loop_exit | block_kind_invert |
+                             block_kind_merge))) {
+            /* The successor does not restore exec. */
+            restore_exec = true;
+         }
+      }
    } else {
       assert(preds.size() == 2);
       assert(ctx.info[preds[0]].exec.size() == ctx.info[preds[1]].exec.size());
@@ -703,14 +718,8 @@ add_branch_code(exec_ctx& ctx, Block* block)
            break;
      }
 
-      /* check if the successor is the merge block, otherwise set exec to 0 */
-      // TODO: this could be done better by directly branching to the merge block
-      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
-      Block& succ = ctx.program->blocks[succ_idx];
-      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
-      }
-
+      /* Implicitly set exec to zero and branch. */
+      ctx.info[idx].exec.back().op = Operand::zero(bld.lm.bytes());
       bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1],
                  block->linear_succs[0]);
    } else if (block->kind & block_kind_continue) {
@@ -729,14 +738,8 @@ add_branch_code(exec_ctx& ctx, Block* block)
      }
      assert(cond != Temp());
 
-      /* check if the successor is the merge block, otherwise set exec to 0 */
-      // TODO: this could be done better by directly branching to the merge block
-      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
-      Block& succ = ctx.program->blocks[succ_idx];
-      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
-      }
-
+      /* Implicitly set exec to zero and branch. */
+      ctx.info[idx].exec.back().op = Operand::zero(bld.lm.bytes());
       bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1],
                  block->linear_succs[0]);
    } else {
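
Illustration (not part of the upstream patch): below is a minimal, self-contained C++ sketch of the scheme the commit message describes. Every name in it (ToyBlock, emit_break_old, emit_break_new, kind_merge, and the pseudo-instruction strings) is invented for illustration and does not correspond to ACO's real exec_ctx, Block, or Builder APIs. The old scheme writes exec = 0 in the break/continue block itself, right before the branch; the new scheme only records that exec should become zero and lets the linear successor materialize the write, unless that successor is a block kind that restores exec anyway. This leaves the spot directly before the branch free for the parallelcopies of logical phis, which must still run under the old exec mask.

// Toy model only: every type and helper below is invented for illustration
// and does not match ACO's real data structures.
#include <cstdint>
#include <initializer_list>
#include <iostream>
#include <string>
#include <vector>

enum BlockKind : uint16_t {
   kind_none        = 0,
   kind_loop_header = 1 << 0,
   kind_loop_exit   = 1 << 1,
   kind_invert      = 1 << 2,
   kind_merge       = 1 << 3,
};

struct ToyBlock {
   std::string name;
   uint16_t kind = kind_none;
   std::vector<std::string> instructions;
};

/* Old scheme: the break block zeroes exec itself, before the branch.  A
 * parallelcopy inserted right before the branch would then see exec == 0. */
void emit_break_old(ToyBlock& brk)
{
   brk.instructions.push_back("s_mov exec, 0          ; zeroed too early for phi copies");
   brk.instructions.push_back("p_cbranch_nz ...");
}

/* New scheme: the break block only records a pending zero and branches; the
 * linear successor materializes exec = 0 unless it restores exec anyway
 * (loop header/exit, invert or merge block). */
void emit_break_new(ToyBlock& brk, ToyBlock& succ)
{
   brk.instructions.push_back("p_parallelcopy ...     ; logical-phi copies under the old exec");
   brk.instructions.push_back("p_cbranch_nz ...");

   const bool pending_exec_zero = true; /* "indicate that exec should be zero" */
   const bool succ_restores_exec =
      (succ.kind & (kind_loop_header | kind_loop_exit | kind_invert | kind_merge)) != 0;
   if (pending_exec_zero && !succ_restores_exec)
      succ.instructions.insert(succ.instructions.begin(), "s_mov exec, 0");
}

int main()
{
   ToyBlock brk_old{"break block (old placement)"};
   ToyBlock brk_new{"break block (new placement)"};
   ToyBlock helper{"linear successor (plain helper block)"};

   emit_break_old(brk_old);
   emit_break_new(brk_new, helper);

   for (const ToyBlock* b : {&brk_old, &brk_new, &helper}) {
      std::cout << b->name << ":\n";
      for (const std::string& inst : b->instructions)
         std::cout << "   " << inst << "\n";
   }
   return 0;
}

Built as a plain C++14 program, the toy prints the two placements side by side; the point is only the ordering of the exec write relative to the copies and the branch, not the actual instruction selection.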