diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 94781ba8d71..428149ebc55 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2824,6 +2824,11 @@ lower_to_hw_instr(Program* program) branch->opcode == aco_opcode::p_cbranch_nz) && branch->operands[0].physReg() == exec); + if (branch->never_taken) { + assert(!uniform_branch); + continue; + } + /* Check if the branch instruction can be removed. * This is beneficial when executing the next block with an empty exec mask * is faster than the branch instruction itself. @@ -2880,18 +2885,14 @@ lower_to_hw_instr(Program* program) num_scalar++; } } - } else if (inst->isEXP()) { - /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs). */ + } else if (inst->isEXP() || inst->isSMEM() || inst->isBarrier()) { + /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs), + * SMEM might be an invalid access, and barriers are probably expensive. */ can_remove = false; } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() || inst->isLDSDIR()) { // TODO: GFX6-9 can use vskip can_remove = prefer_remove; - } else if (inst->isSMEM()) { - /* SMEM are at least as expensive as branches */ - can_remove = prefer_remove && branch->never_taken; - } else if (inst->isBarrier()) { - can_remove = prefer_remove && branch->never_taken; } else { can_remove = false; assert(false && "Pseudo instructions should be lowered by this point."); diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp index 081c965b642..0c922bf9ea9 100644 --- a/src/amd/compiler/aco_ssa_elimination.cpp +++ b/src/amd/compiler/aco_ssa_elimination.cpp @@ -215,6 +215,7 @@ try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) assert(branch.target[1] == block->index); branch.target[1] = succ.index; branch.opcode = aco_opcode::p_branch; + branch.rarely_taken = branch.never_taken = false; } else if (branch.target[1] == block->index) { /* check if there is a fall-through path from block to succ */ bool falls_through = block->index < succ.index; @@ -256,6 +257,7 @@ try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) branch.operands.pop_back(); branch.opcode = aco_opcode::p_branch; + branch.rarely_taken = branch.never_taken = false; } for (unsigned i = 0; i < pred.linear_succs.size(); i++)