diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 96d1ccaa0b0..e8e11abf77b 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -311,8 +311,9 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
    if (ctx.info[idx].exec.back().second & mask_type_wqm)
       return;
    if (ctx.info[idx].exec.back().second & mask_type_global) {
-      Temp exec_mask = ctx.info[idx].exec.back().first;
-      exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
+      Temp exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), Operand(exec, bld.lm));
+      ctx.info[idx].exec.back().first = exec_mask;
+      exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), Operand(exec, bld.lm));
       ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
       return;
    }
@@ -320,7 +321,8 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
    ctx.info[idx].exec.pop_back();
    assert(ctx.info[idx].exec.back().second & mask_type_wqm);
    assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
-   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
+   assert(ctx.info[idx].exec.back().first.id());
+   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
                                                 ctx.info[idx].exec.back().first);
 }
 
@@ -336,17 +338,24 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
       ctx.info[idx].exec.pop_back();
       assert(ctx.info[idx].exec.back().second & mask_type_exact);
       assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
+      assert(ctx.info[idx].exec.back().first.id());
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
                                                    ctx.info[idx].exec.back().first);
       return;
    }
    /* otherwise, we create an exact mask and push to the stack */
-   Temp wqm = ctx.info[idx].exec.back().first;
-   Temp exact = bld.tmp(bld.lm);
-   wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
-                  bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
+   Temp wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
+                       Definition(exec, bld.lm), ctx.info[idx].exec[0].first, Operand(exec, bld.lm));
    ctx.info[idx].exec.back().first = wqm;
-   ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
+   ctx.info[idx].exec.emplace_back(Temp(0, bld.lm), mask_type_exact);
+}
+
+Operand get_exec_op(Temp t)
+{
+   if (t == Temp())
+      return Operand(exec, t.regClass());
+   else
+      return Operand(t);
 }
 
 unsigned add_coupling_code(exec_ctx& ctx, Block* block,
@@ -360,29 +369,27 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
    if (idx == 0) {
       aco_ptr<Instruction>& startpgm = block->instructions[0];
       assert(startpgm->opcode == aco_opcode::p_startpgm);
-      Temp exec_mask = startpgm->definitions.back().getTemp();
       bld.insert(std::move(startpgm));
 
       /* exec seems to need to be manually initialized with combined shaders */
       if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG) {
-         bld.copy(bld.exec(Definition(exec_mask)), Operand(UINT32_MAX, bld.lm == s2));
-         instructions[0]->definitions.pop_back();
+         bld.copy(Definition(exec, bld.lm), Operand(UINT32_MAX, bld.lm == s2));
       }
 
       if (ctx.handle_wqm) {
-         ctx.info[0].exec.emplace_back(exec_mask, mask_type_global | mask_type_exact | mask_type_initial);
+         ctx.info[0].exec.emplace_back(Temp(0, bld.lm), mask_type_global | mask_type_exact | mask_type_initial);
          /* if this block only needs WQM, initialize already */
         if (ctx.info[0].block_needs == WQM)
            transition_to_WQM(ctx, bld, 0);
      } else {
         uint8_t mask = mask_type_global;
         if (ctx.program->needs_wqm) {
-            exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
+            bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), Operand(exec, bld.lm));
            mask |= mask_type_wqm;
         } else {
            mask |= mask_type_exact;
         }
-         ctx.info[0].exec.emplace_back(exec_mask, mask);
+         ctx.info[0].exec.emplace_back(Temp(0, bld.lm), mask);
      }
 
      return 1;
@@ -402,7 +409,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
          for (int i = 0; i < info.num_exec_masks - 1; i++) {
             phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
             phi->definitions[0] = bld.def(bld.lm);
-            phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
+            phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[i].first);
             ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
          }
       }
@@ -412,7 +419,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
          /* this phi might be trivial but ensures a parallelcopy on the loop header */
          aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
          phi->definitions[0] = bld.def(bld.lm);
-         phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
+         phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
          ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
       }
 
@@ -421,8 +428,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       if (info.has_divergent_continue)
          phi->definitions[0] = bld.def(bld.lm);
       else
-         phi->definitions[0] = bld.def(bld.lm, exec);
-      phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
+         phi->definitions[0] = Definition(exec, bld.lm);
+      phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec.back().first);
       Temp loop_active = bld.insert(std::move(phi));
 
       if (info.has_divergent_break) {
@@ -442,7 +449,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
          }
          uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
         assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
-         ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
+         ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
                                                     ctx.info[idx].exec.back().first), mask_type);
       }
 
@@ -465,7 +472,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
             aco_ptr<Instruction>& phi = header->instructions[instr_idx];
             assert(phi->opcode == aco_opcode::p_linear_phi);
             for (unsigned i = 1; i < phi->operands.size(); i++)
-               phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[instr_idx].first);
+               phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[instr_idx].first);
             instr_idx++;
          }
       }
@@ -474,14 +481,14 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
          aco_ptr<Instruction>& phi = header->instructions[instr_idx++];
          assert(phi->opcode == aco_opcode::p_linear_phi);
         for (unsigned i = 1; i < phi->operands.size(); i++)
-            phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
+            phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
       }
 
       if (info.has_divergent_break) {
          aco_ptr<Instruction>& phi = header->instructions[instr_idx];
          assert(phi->opcode == aco_opcode::p_linear_phi);
         for (unsigned i = 1; i < phi->operands.size(); i++)
-            phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
+            phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
       }
 
       assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
@@ -520,11 +527,11 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
             aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
             phi->definitions[0] = bld.def(bld.lm);
             if (exec_idx == info.num_exec_masks - 1u) {
-               phi->definitions[0].setFixed(exec);
+               phi->definitions[0] = Definition(exec, bld.lm);
                need_parallelcopy = false;
             }
             for (unsigned i = 0; i < phi->operands.size(); i++)
-               phi->operands[i] = Operand(ctx.info[preds[i]].exec[exec_idx].first);
+               phi->operands[i] = get_exec_op(ctx.info[preds[i]].exec[exec_idx].first);
             ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
          }
       }
@@ -553,13 +560,13 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       }
 
       assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
-      if (need_parallelcopy) {
+      if (need_parallelcopy && get_exec_op(ctx.info[idx].exec.back().first).isTemp()) {
         /* only create this parallelcopy is needed, since the operand isn't
          * fixed to exec which causes the spiller to miscalculate register demand */
        /* TODO: Fix register_demand calculation for spilling on loop exits.
         * The problem is only mitigated because the register demand could be
         * higher if the exec phi doesn't get assigned to exec. */
-         ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
+         ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
                                                       ctx.info[idx].exec.back().first);
       }
 
@@ -582,16 +589,17 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
 
       /* create phis for diverged exec masks */
       for (unsigned i = 0; i < num_exec_masks; i++) {
-         bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
-         if (!in_exec && ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
+         /* skip trivial phis */
+         if (ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
            assert(ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
            ctx.info[idx].exec.emplace_back(ctx.info[preds[0]].exec[i]);
            continue;
         }
 
-         Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
-                               ctx.info[preds[0]].exec[i].first,
-                               ctx.info[preds[1]].exec[i].first);
+         bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
+         Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
+                               get_exec_op(ctx.info[preds[0]].exec[i].first),
+                               get_exec_op(ctx.info[preds[1]].exec[i].first));
         uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
         ctx.info[idx].exec.emplace_back(phi, mask_type);
       }
@@ -620,10 +628,10 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
          transition_to_Exact(ctx, bld, idx);
    }
 
-   if (block->kind & block_kind_merge) {
+   if (block->kind & block_kind_merge && ctx.info[idx].exec.back().first != Temp()) {
       Temp restore = ctx.info[idx].exec.back().first;
       assert(restore.size() == bld.lm.size());
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), restore);
    }
 
    return i;
@@ -671,20 +679,22 @@ void process_instructions(exec_ctx& ctx, Block* block,
          }
         int num = ctx.info[block->index].exec.size();
         assert(num);
-         Operand cond = instr->operands[0];
-         for (int i = num - 1; i >= 0; i--) {
+
+         /* discard from current exec */
+         const Operand cond = instr->operands[0];
+         Temp exit_cond = bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc),
+                                   Operand(exec, bld.lm), cond).def(1).getTemp();
+
+         /* discard from inner to outer exec mask on stack */
+         for (int i = num - 2; i >= 0; i--) {
            Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                          ctx.info[block->index].exec[i].first, cond);
-            if (i == num - 1) {
-               andn2->operands[0].setFixed(exec);
-               andn2->definitions[0].setFixed(exec);
-            }
-            if (i == 0) {
-               instr->opcode = aco_opcode::p_exit_early_if;
-               instr->operands[0] = bld.scc(andn2->definitions[1].getTemp());
-            }
            ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
+            exit_cond = andn2->definitions[1].getTemp();
         }
+
+         instr->opcode = aco_opcode::p_exit_early_if;
+         instr->operands[0] = bld.scc(exit_cond);
         assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
 
      } else if (needs == WQM && state != WQM) {
@@ -720,7 +730,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
            assert(exact_mask.second & mask_type_exact);
 
            instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
-            instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
+            instr->operands[0] = Operand(exec, bld.lm); /* current exec */
            instr->operands[1] = Operand(exact_mask.first);
            instr->definitions[0] = dst;
            instr->definitions[1] = bld.def(s1, scc);
@@ -735,18 +745,14 @@ void process_instructions(exec_ctx& ctx, Block* block,
         if (instr->operands[0].isConstant()) {
            assert(instr->operands[0].constantValue() == -1u);
            /* transition to exact and set exec to zero */
-            Temp old_exec = ctx.info[block->index].exec.back().first;
-            Temp new_exec = bld.tmp(bld.lm);
            exit_cond = bld.tmp(s1);
            cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
-                            bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
+                            Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
 
            num = ctx.info[block->index].exec.size() - 2;
-            if (ctx.info[block->index].exec.back().second & mask_type_exact) {
-               ctx.info[block->index].exec.back().first = new_exec;
-            } else {
+            if (!(ctx.info[block->index].exec.back().second & mask_type_exact)) {
               ctx.info[block->index].exec.back().first = cond;
-               ctx.info[block->index].exec.emplace_back(new_exec, mask_type_exact);
+               ctx.info[block->index].exec.emplace_back(Temp(0, bld.lm), mask_type_exact);
            }
         } else {
            /* demote_if: transition to exact */
@@ -761,8 +767,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
            Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                          ctx.info[block->index].exec[i].first, cond);
            if (i == (int)ctx.info[block->index].exec.size() - 1) {
-               andn2->operands[0].setFixed(exec);
-               andn2->definitions[0].setFixed(exec);
+               andn2->operands[0] = Operand(exec, bld.lm);
+               andn2->definitions[0] = Definition(exec, bld.lm);
            }
 
            ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
@@ -873,7 +879,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
 
    /* For normal breaks, this is the exec mask. For discard+break, it's the
     * old exec mask before it was zero'd. */
-   Operand break_cond = bld.exec(ctx.info[idx].exec.back().first);
+   Operand break_cond = Operand(exec, bld.lm);
 
    if (block->kind & block_kind_discard) {
 
@@ -890,17 +896,14 @@ void add_branch_code(exec_ctx& ctx, Block* block)
         num = ctx.info[idx].exec.size() - 1;
      }
 
-      Temp old_exec = ctx.info[idx].exec.back().first;
-      Temp new_exec = bld.tmp(bld.lm);
      Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
-                           bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
-      ctx.info[idx].exec.back().first = new_exec;
+                           Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
 
      for (int i = num - 1; i >= 0; i--) {
         Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
-                                       ctx.info[block->index].exec[i].first, cond);
+                                       get_exec_op(ctx.info[block->index].exec[i].first), cond);
         if (i == (int)ctx.info[idx].exec.size() - 1)
-            andn2->definitions[0].setFixed(exec);
+            andn2->definitions[0] = Definition(exec, bld.lm);
         if (i == 0)
            bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp()));
         ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
@@ -925,8 +928,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      }
 
      if (need_parallelcopy)
-         ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first);
-      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.exec(ctx.info[idx].exec.back().first), block->linear_succs[1], block->linear_succs[0]);
+         ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
+      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
      return;
   }
 
@@ -961,19 +964,17 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      if (ctx.info[idx].block_needs & Exact_Branch)
         transition_to_Exact(ctx, bld, idx);
 
-      Temp current_exec = ctx.info[idx].exec.back().first;
      uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
 
-      Temp then_mask = bld.tmp(bld.lm);
      Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
-                               bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));
+                               Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
 
      ctx.info[idx].exec.back().first = old_exec;
 
      /* add next current exec to the stack */
-      ctx.info[idx].exec.emplace_back(then_mask, mask_type);
+      ctx.info[idx].exec.emplace_back(Temp(0, bld.lm), mask_type);
 
-      bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), bld.exec(then_mask), block->linear_succs[1], block->linear_succs[0]);
+      bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
      return;
   }
 
@@ -981,17 +982,11 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      // exec = s_andn2_b64 (original_exec, exec)
      assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_nz);
      block->instructions.pop_back();
-      Temp then_mask = ctx.info[idx].exec.back().first;
-      uint8_t mask_type = ctx.info[idx].exec.back().second;
-      ctx.info[idx].exec.pop_back();
-      Temp orig_exec = ctx.info[idx].exec.back().first;
-      Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec),
-                                bld.def(s1, scc), orig_exec, bld.exec(then_mask));
+      assert(ctx.info[idx].exec.size() >= 2);
+      Temp orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first;
+      bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec, Operand(exec, bld.lm));
 
-      /* add next current exec to the stack */
-      ctx.info[idx].exec.emplace_back(else_mask, mask_type);
-
-      bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), bld.exec(else_mask), block->linear_succs[1], block->linear_succs[0]);
+      bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
      return;
   }
 
@@ -1016,7 +1011,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
      Block& succ = ctx.program->blocks[succ_idx];
      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
+         bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
      }
 
      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
@@ -1027,7 +1022,6 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();
 
-      Temp current_exec = ctx.info[idx].exec.back().first;
      Temp cond = Temp();
      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
         if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
@@ -1035,7 +1029,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
         cond = bld.tmp(s1);
         Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
-                              exec_mask, bld.exec(current_exec));
+                              exec_mask, Operand(exec, bld.lm));
         ctx.info[idx].exec[exec_idx].first = exec_mask;
      }
      assert(cond != Temp());
@@ -1045,7 +1039,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
      Block& succ = ctx.program->blocks[succ_idx];
      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
+         bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
      }
 
      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);