diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 0199373ba99..6ce47fe3a37 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -29,6 +29,14 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx void add_subdword_definition(Program* program, aco_ptr& instr, PhysReg reg, bool allow_16bit_write); +struct parallelcopy { + constexpr parallelcopy(Operand op_, Definition def_) : op(op_), def(def_) + {} + + Operand op; + Definition def; +}; + struct assignment { PhysReg reg; RegClass rc; @@ -809,22 +817,21 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) } void -update_renames(ra_ctx& ctx, RegisterFile& reg_file, - std::vector>& parallelcopies, +update_renames(ra_ctx& ctx, RegisterFile& reg_file, std::vector& parallelcopies, aco_ptr& instr) { /* clear operands */ - for (std::pair& copy : parallelcopies) { + for (parallelcopy& copy : parallelcopies) { /* the definitions with id are not from this function and already handled */ - if (copy.second.isTemp()) + if (copy.def.isTemp()) continue; - reg_file.clear(copy.first); + reg_file.clear(copy.op); } /* allocate id's and rename operands: this is done transparently here */ auto it = parallelcopies.begin(); while (it != parallelcopies.end()) { - if (it->second.isTemp()) { + if (it->def.isTemp()) { ++it; continue; } @@ -832,9 +839,9 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, /* check if we moved a definition: change the register and remove copy */ bool is_def = false; for (Definition& def : instr->definitions) { - if (def.isTemp() && def.getTemp() == it->first.getTemp()) { + if (def.isTemp() && def.getTemp() == it->op.getTemp()) { // FIXME: ensure that the definition can use this reg - def.setFixed(it->second.physReg()); + def.setFixed(it->def.physReg()); reg_file.fill(def); ctx.assignments[def.tempId()].reg = def.physReg(); it = parallelcopies.erase(it); @@ -846,34 +853,34 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, continue; /* check if we moved another parallelcopy definition */ - for (std::pair& other : parallelcopies) { - if (!other.second.isTemp()) + for (parallelcopy& other : parallelcopies) { + if (!other.def.isTemp()) continue; - if (it->first.getTemp() == other.second.getTemp()) { - other.second.setFixed(it->second.physReg()); - ctx.assignments[other.second.tempId()].reg = other.second.physReg(); + if (it->op.getTemp() == other.def.getTemp()) { + other.def.setFixed(it->def.physReg()); + ctx.assignments[other.def.tempId()].reg = other.def.physReg(); it = parallelcopies.erase(it); is_def = true; /* check if we moved an operand, again */ bool fill = true; for (Operand& op : instr->operands) { - if (op.isTemp() && op.tempId() == other.second.tempId()) { + if (op.isTemp() && op.tempId() == other.def.tempId()) { // FIXME: ensure that the operand can use this reg - op.setFixed(other.second.physReg()); + op.setFixed(other.def.physReg()); fill = !op.isKillBeforeDef(); } } if (fill) - reg_file.fill(other.second); + reg_file.fill(other.def); break; } } if (is_def) continue; - std::pair& copy = *it; - copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass())); - ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass()); + parallelcopy& copy = *it; + copy.def.setTemp(ctx.program->allocateTmp(copy.def.regClass())); + ctx.assignments.emplace_back(copy.def.physReg(), copy.def.regClass()); assert(ctx.assignments.size() == ctx.program->peekAllocationId()); /* check if we moved an operand */ @@ -883,9 +890,9 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, Operand& op = instr->operands[i]; if (!op.isTemp()) continue; - if (op.tempId() == copy.first.tempId()) { + if (op.tempId() == copy.op.tempId()) { /* only rename precolored operands if the copy-location matches */ - bool omit_renaming = op.isPrecolored() && op.physReg() != copy.second.physReg(); + bool omit_renaming = op.isPrecolored() && op.physReg() != copy.def.physReg(); /* Fix the kill flags */ if (first[omit_renaming]) @@ -897,8 +904,8 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, if (omit_renaming) continue; - op.setTemp(copy.second.getTemp()); - op.setFixed(copy.second.physReg()); + op.setTemp(copy.def.getTemp()); + op.setFixed(copy.def.physReg()); fill = !op.isKillBeforeDef() || op.isPrecolored(); } @@ -906,7 +913,7 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, /* Apply changes to register file. */ if (fill) - reg_file.fill(copy.second); + reg_file.fill(copy.def); ++it; } @@ -1042,7 +1049,7 @@ collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_inte std::optional get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file, - std::vector>& parallelcopies, + std::vector& parallelcopies, aco_ptr& instr, const PhysRegInterval def_reg, DefInfo info, unsigned id) { @@ -1094,8 +1101,7 @@ get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file, } bool -get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, - std::vector>& parallelcopies, +get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, std::vector& parallelcopies, const std::vector& vars, aco_ptr& instr, const PhysRegInterval def_reg) { @@ -1245,9 +1251,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, } std::optional -get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, - std::vector>& parallelcopies, const DefInfo& info, - aco_ptr& instr) +get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, std::vector& parallelcopies, + const DefInfo& info, aco_ptr& instr) { const PhysRegInterval& bounds = info.bounds; uint32_t size = info.size; @@ -1373,7 +1378,7 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) tmp_file.fill_killed_operands(instr.get()); - std::vector> pc; + std::vector pc; if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win)) return {}; @@ -1465,7 +1470,7 @@ add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val) */ PhysReg compact_relocate_vars(ra_ctx& ctx, const std::vector& vars, - std::vector>& parallelcopies, PhysReg start) + std::vector& parallelcopies, PhysReg start) { /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword * temporary sizes to dwords. @@ -1616,7 +1621,7 @@ get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr>& parallelcopies) + std::vector& parallelcopies) { PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true); int zeros = reg_file.count_zero(linear_vgpr_bounds); @@ -1642,7 +1647,7 @@ compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file, */ PhysReg alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr& instr, - std::vector>& parallelcopies) + std::vector& parallelcopies) { assert(instr->opcode == aco_opcode::p_start_linear_vgpr); assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0); @@ -1675,7 +1680,7 @@ alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr> pc; + std::vector pc; if (!ctx.policy.skip_optimistic_path && get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) { parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); @@ -1726,7 +1731,7 @@ should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file) PhysReg get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, - std::vector>& parallelcopies, aco_ptr& instr, + std::vector& parallelcopies, aco_ptr& instr, int operand_index = -1) { auto split_vec = ctx.split_vectors.find(temp.id()); @@ -1800,7 +1805,7 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, return *res; /* try compacting the linear vgprs to make more space */ - std::vector> pc; + std::vector pc; if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) && compact_linear_vgprs(ctx, reg_file, pc)) { parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); @@ -1808,8 +1813,8 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, /* We don't need to fill the copy definitions in because we don't care about the linear VGPR * space here. */ RegisterFile tmp_file(reg_file); - for (std::pair& copy : pc) - tmp_file.clear(copy.first); + for (parallelcopy& copy : pc) + tmp_file.clear(copy.op); return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index); } @@ -1867,8 +1872,7 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, PhysReg get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, - std::vector>& parallelcopies, - aco_ptr& instr) + std::vector& parallelcopies, aco_ptr& instr) { RegClass rc = temp.regClass(); /* create_vector instructions have different costs w.r.t. register coalescing */ @@ -1985,7 +1989,7 @@ get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, std::vector vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); bool success = false; - std::vector> pc; + std::vector pc; success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size}); if (!success) { @@ -2088,8 +2092,7 @@ operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr& instr, unsign void handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, - std::vector>& parallelcopy, - aco_ptr& instr) + std::vector& parallelcopy, aco_ptr& instr) { assert(instr->operands.size() <= 128); assert(parallelcopy.empty()); @@ -2115,7 +2118,7 @@ handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, /* An instruction can have at most one operand precolored to the same register. */ assert(std::none_of(parallelcopy.begin(), parallelcopy.end(), - [&](auto copy) { return copy.second.physReg() == op.physReg(); })); + [&](auto copy) { return copy.def.physReg() == op.physReg(); })); /* clear from register_file so fixed operands are not collected be collect_vars() */ tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src @@ -2149,8 +2152,8 @@ handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, - std::vector>& parallelcopy, - aco_ptr& instr, Operand& operand, unsigned operand_index) + std::vector& parallelcopy, aco_ptr& instr, + Operand& operand, unsigned operand_index) { /* clear the operand in case it's only a stride mismatch */ PhysReg src = ctx.assignments[operand.tempId()].reg; @@ -2170,45 +2173,44 @@ get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file, std::vector>& instructions, Block& block, aco_ptr& phi, Temp tmp) { - std::vector> parallelcopy; + std::vector parallelcopy; PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi); update_renames(ctx, register_file, parallelcopy, phi); /* process parallelcopy */ - for (std::pair pc : parallelcopy) { + for (struct parallelcopy pc : parallelcopy) { /* see if it's a copy from a different phi */ // TODO: prefer moving some previous phis over live-ins // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a // problem in practice since they can only be fixed to exec) Instruction* prev_phi = NULL; for (auto phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) { - if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) + if ((*phi_it)->definitions[0].tempId() == pc.op.tempId()) prev_phi = phi_it->get(); } if (prev_phi) { /* if so, just update that phi's register */ - prev_phi->definitions[0].setFixed(pc.second.physReg()); + prev_phi->definitions[0].setFixed(pc.def.physReg()); register_file.fill(prev_phi->definitions[0]); - ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), - pc.second.regClass()}; + ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.def.physReg(), pc.def.regClass()}; continue; } /* rename */ - auto orig_it = ctx.orig_names.find(pc.first.tempId()); - Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp(); - add_rename(ctx, orig, pc.second.getTemp()); + auto orig_it = ctx.orig_names.find(pc.op.tempId()); + Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.op.getTemp(); + add_rename(ctx, orig, pc.def.getTemp()); /* otherwise, this is a live-in and we need to create a new phi * to move it in this block's predecessors */ aco_opcode opcode = - pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + pc.op.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; Block::edge_vec& preds = - pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; + pc.op.getTemp().is_linear() ? block.linear_preds : block.logical_preds; aco_ptr new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; - new_phi->definitions[0] = pc.second; + new_phi->definitions[0] = pc.def; for (unsigned i = 0; i < preds.size(); i++) - new_phi->operands[i] = Operand(pc.first); + new_phi->operands[i] = Operand(pc.op); instructions.emplace_back(std::move(new_phi)); /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is @@ -2945,30 +2947,30 @@ optimize_encoding(ra_ctx& ctx, RegisterFile& register_file, aco_ptr } void -undo_renames(ra_ctx& ctx, std::vector>& parallelcopies, +undo_renames(ra_ctx& ctx, std::vector& parallelcopies, aco_ptr& instr) { /* Undo renaming if possible in order to reduce latency. * * This can also remove a use of a SCC->SGPR copy, which can then be removed completely if the * post-RA optimizer eliminates the copy by duplicating the instruction that produced the SCC */ - for (std::pair copy : parallelcopies) { + for (parallelcopy copy : parallelcopies) { bool first[2] = {true, true}; for (unsigned i = 0; i < instr->operands.size(); i++) { Operand& op = instr->operands[i]; - if (!op.isTemp() || op.getTemp() != copy.second.getTemp()) { - first[1] &= !op.isTemp() || op.getTemp() != copy.first.getTemp(); + if (!op.isTemp() || op.getTemp() != copy.def.getTemp()) { + first[1] &= !op.isTemp() || op.getTemp() != copy.op.getTemp(); continue; } bool use_original = !op.isPrecolored() && !op.isLateKill(); - use_original &= operand_can_use_reg(ctx.program->gfx_level, instr, i, copy.first.physReg(), - copy.first.regClass()); + use_original &= operand_can_use_reg(ctx.program->gfx_level, instr, i, copy.op.physReg(), + copy.op.regClass()); if (use_original) { - const PhysRegInterval copy_reg = {copy.first.physReg(), copy.first.size()}; - for (std::pair& pc : parallelcopies) { - const PhysRegInterval def_reg = {pc.second.physReg(), pc.second.size()}; + const PhysRegInterval copy_reg = {copy.op.physReg(), copy.op.size()}; + for (parallelcopy& pc : parallelcopies) { + const PhysRegInterval def_reg = {pc.def.physReg(), pc.def.size()}; use_original &= !intersects(def_reg, copy_reg); } } @@ -2984,15 +2986,15 @@ undo_renames(ra_ctx& ctx, std::vector>& parallelc first[use_original] = false; if (use_original) { - op.setTemp(copy.first.getTemp()); - op.setFixed(copy.first.physReg()); + op.setTemp(copy.op.getTemp()); + op.setFixed(copy.op.physReg()); } } } } void -emit_parallel_copy_internal(ra_ctx& ctx, std::vector>& parallelcopy, +emit_parallel_copy_internal(ra_ctx& ctx, std::vector& parallelcopy, aco_ptr& instr, std::vector>& instructions, bool temp_in_scc, RegisterFile& register_file) @@ -3007,21 +3009,21 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector sgpr_operands; for (unsigned i = 0; i < parallelcopy.size(); i++) { - linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr(); + linear_vgpr |= parallelcopy[i].op.regClass().is_linear_vgpr(); - if (!may_swap_sgprs && parallelcopy[i].first.isTemp() && - parallelcopy[i].first.getTemp().type() == RegType::sgpr) { - unsigned op_reg = parallelcopy[i].first.physReg().reg(); - unsigned def_reg = parallelcopy[i].second.physReg().reg(); - for (unsigned j = 0; j < parallelcopy[i].first.size(); j++) { + if (!may_swap_sgprs && parallelcopy[i].op.isTemp() && + parallelcopy[i].op.getTemp().type() == RegType::sgpr) { + unsigned op_reg = parallelcopy[i].op.physReg().reg(); + unsigned def_reg = parallelcopy[i].def.physReg().reg(); + for (unsigned j = 0; j < parallelcopy[i].op.size(); j++) { sgpr_operands.set(op_reg + j); if (sgpr_operands.test(def_reg + j)) may_swap_sgprs = true; } } - pc->operands[i] = parallelcopy[i].first; - pc->definitions[i] = parallelcopy[i].second; + pc->operands[i] = parallelcopy[i].op; + pc->definitions[i] = parallelcopy[i].def; assert(pc->operands[i].size() == pc->definitions[i].size()); /* it might happen that the operand is already renamed. we have to restore the @@ -3055,18 +3057,18 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector>& parallelcopy, +emit_parallel_copy(ra_ctx& ctx, std::vector& copies, aco_ptr& instr, std::vector>& instructions, bool temp_in_scc, RegisterFile& register_file) { - if (parallelcopy.empty()) + if (copies.empty()) return; - std::vector> linear_vgpr; + std::vector linear_vgpr; if (ctx.num_linear_vgprs) { - auto next = parallelcopy.begin(); - for (auto it = parallelcopy.begin(); it != parallelcopy.end(); ++it) { - if (it->first.regClass().is_linear_vgpr()) { + auto next = copies.begin(); + for (auto it = copies.begin(); it != copies.end(); ++it) { + if (it->op.regClass().is_linear_vgpr()) { linear_vgpr.push_back(*it); continue; } @@ -3075,14 +3077,14 @@ emit_parallel_copy(ra_ctx& ctx, std::vector>& par *next = *it; ++next; } - parallelcopy.erase(next, parallelcopy.end()); + copies.erase(next, copies.end()); } /* Because of how linear VGPRs are allocated, we should never have to move a linear VGPR into the * space of a normal one. This means the copy can be done entirely before normal VGPR copies. */ emit_parallel_copy_internal(ctx, linear_vgpr, instr, instructions, temp_in_scc, register_file); - emit_parallel_copy_internal(ctx, parallelcopy, instr, instructions, temp_in_scc, + emit_parallel_copy_internal(ctx, copies, instr, instructions, temp_in_scc, register_file); } @@ -3116,7 +3118,7 @@ register_allocation(Program* program, ra_test_policy policy) auto instr_it = std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi); for (; instr_it != block.instructions.end(); ++instr_it) { aco_ptr& instr = *instr_it; - std::vector> parallelcopy; + std::vector parallelcopy; assert(!is_phi(instr)); /* handle operands */ @@ -3396,7 +3398,7 @@ register_allocation(Program* program, ra_test_policy policy) bool temp_in_scc = register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc); - std::vector> parallelcopy; + std::vector parallelcopy; compact_linear_vgprs(ctx, register_file, parallelcopy); update_renames(ctx, register_file, parallelcopy, br); emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc, register_file);