aco: make all exec accesses non-temporaries

So that they are not counted towards the register demand.

Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
Author: Daniel Schürmann (2021-02-03 15:44:49 +01:00), committed by Marge Bot
Parent: 5d7b3bf1a7
Commit: a56ddca4e8
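The pattern applied throughout the diff below, shown here as a minimal editorial sketch (not part of the commit; the fragments are adapted from the hunks that follow). Previously, every access to the exec mask went through an SSA temporary that was merely fixed to the exec register, so the temporary stayed live and was counted by the register-demand and spilling calculations:

    Temp exec_mask = ctx.info[idx].exec.back().first;
    /* definition and operand are temporaries fixed to exec, so they count as live */
    exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc),
                         bld.exec(exec_mask));

After this commit the physical register is named directly, so no temporary is created and nothing is added to the register demand; a null Temp() on the exec-mask stack now stands for "this mask currently lives in exec":

    /* read and write exec itself, without an SSA temporary */
    bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
             Operand(exec, bld.lm));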


@@ -311,8 +311,9 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
if (ctx.info[idx].exec.back().second & mask_type_wqm)
return;
if (ctx.info[idx].exec.back().second & mask_type_global) {
Temp exec_mask = ctx.info[idx].exec.back().first;
exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
Temp exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), Operand(exec, bld.lm));
ctx.info[idx].exec.back().first = exec_mask;
exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), Operand(exec, bld.lm));
ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
return;
}
@@ -320,7 +321,8 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
ctx.info[idx].exec.pop_back();
assert(ctx.info[idx].exec.back().second & mask_type_wqm);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
assert(ctx.info[idx].exec.back().first.id());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first);
}
@@ -336,17 +338,24 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
ctx.info[idx].exec.pop_back();
assert(ctx.info[idx].exec.back().second & mask_type_exact);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
assert(ctx.info[idx].exec.back().first.id());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first);
return;
}
/* otherwise, we create an exact mask and push to the stack */
Temp wqm = ctx.info[idx].exec.back().first;
Temp exact = bld.tmp(bld.lm);
wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
Temp wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
Definition(exec, bld.lm), ctx.info[idx].exec[0].first, Operand(exec, bld.lm));
ctx.info[idx].exec.back().first = wqm;
ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
ctx.info[idx].exec.emplace_back(Temp(0, bld.lm), mask_type_exact);
}
Operand get_exec_op(Temp t)
{
if (t == Temp())
return Operand(exec, t.regClass());
else
return Operand(t);
}
unsigned add_coupling_code(exec_ctx& ctx, Block* block,
@@ -360,29 +369,27 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (idx == 0) {
aco_ptr<Instruction>& startpgm = block->instructions[0];
assert(startpgm->opcode == aco_opcode::p_startpgm);
Temp exec_mask = startpgm->definitions.back().getTemp();
bld.insert(std::move(startpgm));
/* exec seems to need to be manually initialized with combined shaders */
if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG) {
bld.copy(bld.exec(Definition(exec_mask)), Operand(UINT32_MAX, bld.lm == s2));
instructions[0]->definitions.pop_back();
bld.copy(Definition(exec, bld.lm), Operand(UINT32_MAX, bld.lm == s2));
}
if (ctx.handle_wqm) {
ctx.info[0].exec.emplace_back(exec_mask, mask_type_global | mask_type_exact | mask_type_initial);
ctx.info[0].exec.emplace_back(Temp(0, bld.lm), mask_type_global | mask_type_exact | mask_type_initial);
/* if this block only needs WQM, initialize already */
if (ctx.info[0].block_needs == WQM)
transition_to_WQM(ctx, bld, 0);
} else {
uint8_t mask = mask_type_global;
if (ctx.program->needs_wqm) {
exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), Operand(exec, bld.lm));
mask |= mask_type_wqm;
} else {
mask |= mask_type_exact;
}
ctx.info[0].exec.emplace_back(exec_mask, mask);
ctx.info[0].exec.emplace_back(Temp(0, bld.lm), mask);
}
return 1;
@@ -402,7 +409,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
for (int i = 0; i < info.num_exec_masks - 1; i++) {
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[i].first);
ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
}
}
@@ -412,7 +419,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
/* this phi might be trivial but ensures a parallelcopy on the loop header */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
}
@@ -421,8 +428,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (info.has_divergent_continue)
phi->definitions[0] = bld.def(bld.lm);
else
phi->definitions[0] = bld.def(bld.lm, exec);
phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
phi->definitions[0] = Definition(exec, bld.lm);
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec.back().first);
Temp loop_active = bld.insert(std::move(phi));
if (info.has_divergent_break) {
@@ -442,7 +449,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
}
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first), mask_type);
}
@@ -465,7 +472,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
aco_ptr<Instruction>& phi = header->instructions[instr_idx];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[instr_idx].first);
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[instr_idx].first);
instr_idx++;
}
}
@@ -474,14 +481,14 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
aco_ptr<Instruction>& phi = header->instructions[instr_idx++];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
}
if (info.has_divergent_break) {
aco_ptr<Instruction>& phi = header->instructions[instr_idx];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
}
assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
@@ -520,11 +527,11 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(bld.lm);
if (exec_idx == info.num_exec_masks - 1u) {
phi->definitions[0].setFixed(exec);
phi->definitions[0] = Definition(exec, bld.lm);
need_parallelcopy = false;
}
for (unsigned i = 0; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[preds[i]].exec[exec_idx].first);
phi->operands[i] = get_exec_op(ctx.info[preds[i]].exec[exec_idx].first);
ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
}
}
@@ -553,13 +560,13 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
}
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
if (need_parallelcopy) {
if (need_parallelcopy && get_exec_op(ctx.info[idx].exec.back().first).isTemp()) {
/* only create this parallelcopy if needed, since the operand isn't
* fixed to exec, which causes the spiller to miscalculate register demand */
/* TODO: Fix register_demand calculation for spilling on loop exits.
* The problem is only mitigated because the register demand could be
* higher if the exec phi doesn't get assigned to exec. */
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first);
}
@@ -582,16 +589,17 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
/* create phis for diverged exec masks */
for (unsigned i = 0; i < num_exec_masks; i++) {
bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
if (!in_exec && ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
/* skip trivial phis */
if (ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
assert(ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
ctx.info[idx].exec.emplace_back(ctx.info[preds[0]].exec[i]);
continue;
}
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
ctx.info[preds[0]].exec[i].first,
ctx.info[preds[1]].exec[i].first);
bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
get_exec_op(ctx.info[preds[0]].exec[i].first),
get_exec_op(ctx.info[preds[1]].exec[i].first));
uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
ctx.info[idx].exec.emplace_back(phi, mask_type);
}
@@ -620,10 +628,10 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
transition_to_Exact(ctx, bld, idx);
}
if (block->kind & block_kind_merge) {
if (block->kind & block_kind_merge && ctx.info[idx].exec.back().first != Temp()) {
Temp restore = ctx.info[idx].exec.back().first;
assert(restore.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), restore);
}
return i;
@@ -671,20 +679,22 @@ void process_instructions(exec_ctx& ctx, Block* block,
}
int num = ctx.info[block->index].exec.size();
assert(num);
Operand cond = instr->operands[0];
for (int i = num - 1; i >= 0; i--) {
/* discard from current exec */
const Operand cond = instr->operands[0];
Temp exit_cond = bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc),
Operand(exec, bld.lm), cond).def(1).getTemp();
/* discard from inner to outer exec mask on stack */
for (int i = num - 2; i >= 0; i--) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == num - 1) {
andn2->operands[0].setFixed(exec);
andn2->definitions[0].setFixed(exec);
}
if (i == 0) {
instr->opcode = aco_opcode::p_exit_early_if;
instr->operands[0] = bld.scc(andn2->definitions[1].getTemp());
}
ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
exit_cond = andn2->definitions[1].getTemp();
}
instr->opcode = aco_opcode::p_exit_early_if;
instr->operands[0] = bld.scc(exit_cond);
assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
} else if (needs == WQM && state != WQM) {
@@ -720,7 +730,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
assert(exact_mask.second & mask_type_exact);
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
instr->operands[0] = Operand(exec, bld.lm); /* current exec */
instr->operands[1] = Operand(exact_mask.first);
instr->definitions[0] = dst;
instr->definitions[1] = bld.def(s1, scc);
@@ -735,18 +745,14 @@ void process_instructions(exec_ctx& ctx, Block* block,
if (instr->operands[0].isConstant()) {
assert(instr->operands[0].constantValue() == -1u);
/* transition to exact and set exec to zero */
Temp old_exec = ctx.info[block->index].exec.back().first;
Temp new_exec = bld.tmp(bld.lm);
exit_cond = bld.tmp(s1);
cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
num = ctx.info[block->index].exec.size() - 2;
if (ctx.info[block->index].exec.back().second & mask_type_exact) {
ctx.info[block->index].exec.back().first = new_exec;
} else {
if (!(ctx.info[block->index].exec.back().second & mask_type_exact)) {
ctx.info[block->index].exec.back().first = cond;
ctx.info[block->index].exec.emplace_back(new_exec, mask_type_exact);
ctx.info[block->index].exec.emplace_back(Temp(0, bld.lm), mask_type_exact);
}
} else {
/* demote_if: transition to exact */
@@ -761,8 +767,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == (int)ctx.info[block->index].exec.size() - 1) {
andn2->operands[0].setFixed(exec);
andn2->definitions[0].setFixed(exec);
andn2->operands[0] = Operand(exec, bld.lm);
andn2->definitions[0] = Definition(exec, bld.lm);
}
ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
@@ -873,7 +879,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
/* For normal breaks, this is the exec mask. For discard+break, it's the
* old exec mask before it was zero'd.
*/
Operand break_cond = bld.exec(ctx.info[idx].exec.back().first);
Operand break_cond = Operand(exec, bld.lm);
if (block->kind & block_kind_discard) {
@@ -890,17 +896,14 @@ void add_branch_code(exec_ctx& ctx, Block* block)
num = ctx.info[idx].exec.size() - 1;
}
Temp old_exec = ctx.info[idx].exec.back().first;
Temp new_exec = bld.tmp(bld.lm);
Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
ctx.info[idx].exec.back().first = new_exec;
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
for (int i = num - 1; i >= 0; i--) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
get_exec_op(ctx.info[block->index].exec[i].first), cond);
if (i == (int)ctx.info[idx].exec.size() - 1)
andn2->definitions[0].setFixed(exec);
andn2->definitions[0] = Definition(exec, bld.lm);
if (i == 0)
bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp()));
ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
@@ -925,8 +928,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
}
if (need_parallelcopy)
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.exec(ctx.info[idx].exec.back().first), block->linear_succs[1], block->linear_succs[0]);
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
return;
}
@@ -961,19 +964,17 @@ void add_branch_code(exec_ctx& ctx, Block* block)
if (ctx.info[idx].block_needs & Exact_Branch)
transition_to_Exact(ctx, bld, idx);
Temp current_exec = ctx.info[idx].exec.back().first;
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
Temp then_mask = bld.tmp(bld.lm);
Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));
Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
ctx.info[idx].exec.back().first = old_exec;
/* add next current exec to the stack */
ctx.info[idx].exec.emplace_back(then_mask, mask_type);
ctx.info[idx].exec.emplace_back(Temp(0, bld.lm), mask_type);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), bld.exec(then_mask), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
return;
}
@@ -981,17 +982,11 @@ void add_branch_code(exec_ctx& ctx, Block* block)
// exec = s_andn2_b64 (original_exec, exec)
assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_nz);
block->instructions.pop_back();
Temp then_mask = ctx.info[idx].exec.back().first;
uint8_t mask_type = ctx.info[idx].exec.back().second;
ctx.info[idx].exec.pop_back();
Temp orig_exec = ctx.info[idx].exec.back().first;
Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec),
bld.def(s1, scc), orig_exec, bld.exec(then_mask));
assert(ctx.info[idx].exec.size() >= 2);
Temp orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first;
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec, Operand(exec, bld.lm));
/* add next current exec to the stack */
ctx.info[idx].exec.emplace_back(else_mask, mask_type);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), bld.exec(else_mask), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
return;
}
@@ -1016,7 +1011,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
@@ -1027,7 +1022,6 @@ void add_branch_code(exec_ctx& ctx, Block* block)
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
block->instructions.pop_back();
Temp current_exec = ctx.info[idx].exec.back().first;
Temp cond = Temp();
for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
@@ -1035,7 +1029,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
cond = bld.tmp(s1);
Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
exec_mask, bld.exec(current_exec));
exec_mask, Operand(exec, bld.lm));
ctx.info[idx].exec[exec_idx].first = exec_mask;
}
assert(cond != Temp());
@@ -1045,7 +1039,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
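A closing note on the Temp(0, bld.lm) entries pushed in several hunks above (an editorial explanation, not part of the commit): such a null temporary marks an exec-mask stack entry whose value only exists in the exec register itself. Whenever one of these entries has to feed a phi or a logical operation, it is converted back into an operand through the get_exec_op() helper added by this commit. A hedged usage sketch with made-up variable names:

    Temp mask = ctx.info[preds[0]].exec[i].first;  /* may be a null Temp() after this commit */
    Operand op = get_exec_op(mask);                /* Operand(exec, bld.lm) if null,         */
                                                   /* Operand(mask) otherwise                */
    phi->operands[i] = op;                         /* as done in add_coupling_code() above   */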