mesa/src/amd/compiler/aco_spill.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1957 lines
77 KiB
C++
Raw Normal View History

/*
* Copyright © 2018 Valve Corporation
* Copyright © 2018 Google
*
* SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
#include "aco_ir.h"
#include "aco_util.h"
#include "common/ac_descriptors.h"
#include "common/sid.h"
#include <algorithm>
#include <cstring>
#include <map>
#include <optional>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace std {
template <> struct hash<aco::Temp> {
size_t operator()(aco::Temp temp) const noexcept
{
uint32_t v;
std::memcpy(&v, &temp, sizeof(temp));
return std::hash<uint32_t>{}(v);
}
};
} // namespace std
/*
* Implements the spilling algorithm on SSA-form based on
* "Register Spilling and Live-Range Splitting for SSA-Form Programs"
* by Matthias Braun and Sebastian Hack.
*
* Key difference between this algorithm and the min-algorithm from the paper
* is the use of average use distances rather than next-use distances per
* instruction.
* As we decrement the number of remaining uses, the average use distances
* give an approximation of the next-use distances while being computationally
* and memory-wise less expensive.
*/
namespace aco {
namespace {
struct remat_info {
Instruction* instr;
};
struct loop_info {
uint32_t index;
aco::unordered_map<Temp, uint32_t> spills;
IDSet live_in;
};
struct use_info {
uint32_t num_uses = 0;
uint32_t last_use = 0;
float score() { return static_cast<float>(last_use) / static_cast<float>(num_uses); }
};
struct spill_ctx {
RegisterDemand target_pressure;
Program* program;
aco::monotonic_buffer_resource memory;
std::vector<aco::map<Temp, Temp>> renames;
std::vector<aco::unordered_map<Temp, uint32_t>> spills_entry;
std::vector<aco::unordered_map<Temp, uint32_t>> spills_exit;
std::vector<bool> processed;
std::vector<loop_info> loop;
std::vector<use_info> ssa_infos;
std::vector<std::pair<RegClass, std::unordered_set<uint32_t>>> interferences;
std::vector<std::vector<uint32_t>> affinities;
std::vector<bool> is_reloaded;
aco::unordered_map<Temp, remat_info> remat;
std::set<Instruction*> unused_remats;
unsigned wave_size;
unsigned sgpr_spill_slots;
unsigned vgpr_spill_slots;
Temp scratch_rsrc;
unsigned scratch_rsrc_block = -1u;
unsigned extra_vgprs;
unsigned resume_idx;
unsigned max_lds_spill_slots;
Temp lds_m0_zero;
spill_ctx(const RegisterDemand target_pressure_, Program* program_, unsigned extra_vgprs_,
unsigned max_lds_spill_slots_)
: target_pressure(target_pressure_), program(program_), memory(),
renames(program->blocks.size(), aco::map<Temp, Temp>(memory)),
spills_entry(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
spills_exit(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()),
remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0),
extra_vgprs(extra_vgprs_), resume_idx(0), max_lds_spill_slots(max_lds_spill_slots_)
{}
void add_affinity(uint32_t first, uint32_t second)
{
unsigned found_first = affinities.size();
unsigned found_second = affinities.size();
for (unsigned i = 0; i < affinities.size(); i++) {
std::vector<uint32_t>& vec = affinities[i];
for (uint32_t entry : vec) {
if (entry == first)
found_first = i;
else if (entry == second)
found_second = i;
}
}
if (found_first == affinities.size() && found_second == affinities.size()) {
affinities.emplace_back(std::vector<uint32_t>({first, second}));
} else if (found_first < affinities.size() && found_second == affinities.size()) {
affinities[found_first].push_back(second);
} else if (found_second < affinities.size() && found_first == affinities.size()) {
affinities[found_second].push_back(first);
} else if (found_first != found_second) {
/* merge second into first */
affinities[found_first].insert(affinities[found_first].end(),
affinities[found_second].begin(),
affinities[found_second].end());
affinities.erase(std::next(affinities.begin(), found_second));
} else {
assert(found_first == found_second);
}
}
uint32_t add_to_spills(Temp to_spill, aco::unordered_map<Temp, uint32_t>& spills)
{
const uint32_t spill_id = allocate_spill_id(to_spill.regClass());
for (auto pair : spills)
add_interference(spill_id, pair.second);
if (!loop.empty()) {
for (auto pair : loop.back().spills)
add_interference(spill_id, pair.second);
}
spills[to_spill] = spill_id;
return spill_id;
}
void add_interference(uint32_t first, uint32_t second)
{
if (interferences[first].first.type() != interferences[second].first.type())
return;
bool inserted = interferences[first].second.insert(second).second;
if (inserted)
interferences[second].second.insert(first);
}
uint32_t allocate_spill_id(RegClass rc)
{
interferences.emplace_back(rc, std::unordered_set<uint32_t>());
is_reloaded.push_back(false);
return next_spill_id++;
}
uint32_t next_spill_id = 0;
};
/**
* Gathers information about the number of uses and point of last use
* per SSA value.
*
* Phi definitions are added to live-ins.
*/
void
gather_ssa_use_info(spill_ctx& ctx)
{
unsigned instruction_idx = 0;
for (Block& block : ctx.program->blocks) {
for (int i = block.instructions.size() - 1; i >= 0; i--) {
aco_ptr<Instruction>& instr = block.instructions[i];
for (const Operand& op : instr->operands) {
if (op.isTemp()) {
use_info& info = ctx.ssa_infos[op.tempId()];
info.num_uses++;
info.last_use = std::max(info.last_use, instruction_idx + i);
}
}
}
/* All live-in variables at loop headers get an additional artificial use.
* As we decrement the number of uses while processing the blocks, this
* ensures that the number of uses won't becomes zero before the loop
* (and the variables' live-ranges) end.
*/
if (block.kind & block_kind_loop_header) {
for (unsigned t : ctx.program->live.live_in[block.index])
ctx.ssa_infos[t].num_uses++;
}
instruction_idx += block.instructions.size();
}
}
bool
should_rematerialize(aco_ptr<Instruction>& instr)
{
/* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */
if (instr->format != Format::VOP1 && instr->format != Format::SOP1 &&
instr->format != Format::PSEUDO && instr->format != Format::SOPK)
return false;
/* TODO: pseudo-instruction rematerialization is only supported for
* p_create_vector/p_parallelcopy */
if (instr->isPseudo() && instr->opcode != aco_opcode::p_create_vector &&
instr->opcode != aco_opcode::p_parallelcopy)
return false;
if (instr->isSOPK() && instr->opcode != aco_opcode::s_movk_i32)
return false;
for (const Operand& op : instr->operands) {
/* TODO: rematerialization using temporaries isn't yet supported */
if (!op.isConstant())
return false;
}
/* TODO: rematerialization with multiple definitions isn't yet supported */
if (instr->definitions.size() > 1)
return false;
return true;
}
aco_ptr<Instruction>
do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id)
{
std::unordered_map<Temp, remat_info>::iterator remat = ctx.remat.find(tmp);
if (remat != ctx.remat.end()) {
Instruction* instr = remat->second.instr;
assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) &&
"unsupported");
assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector ||
instr->opcode == aco_opcode::p_parallelcopy) &&
"unsupported");
assert(instr->definitions.size() == 1 && "unsupported");
aco_ptr<Instruction> res;
res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(),
instr->definitions.size()));
if (instr->isSOPK())
res->salu().imm = instr->salu().imm;
for (unsigned i = 0; i < instr->operands.size(); i++) {
res->operands[i] = instr->operands[i];
if (instr->operands[i].isTemp()) {
assert(false && "unsupported");
if (ctx.remat.count(instr->operands[i].getTemp()))
ctx.unused_remats.erase(ctx.remat[instr->operands[i].getTemp()].instr);
}
}
res->definitions[0] = Definition(new_name);
return res;
} else {
aco_ptr<Instruction> reload{create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};
reload->operands[0] = Operand::c32(spill_id);
reload->definitions[0] = Definition(new_name);
ctx.is_reloaded[spill_id] = true;
return reload;
}
}
void
get_rematerialize_info(spill_ctx& ctx)
{
for (Block& block : ctx.program->blocks) {
bool logical = false;
for (aco_ptr<Instruction>& instr : block.instructions) {
if (instr->opcode == aco_opcode::p_logical_start)
logical = true;
else if (instr->opcode == aco_opcode::p_logical_end)
logical = false;
if (logical && should_rematerialize(instr)) {
for (const Definition& def : instr->definitions) {
if (def.isTemp()) {
ctx.remat[def.getTemp()] = remat_info{instr.get()};
ctx.unused_remats.insert(instr.get());
}
}
}
}
}
}
bool
is_spillable(spill_ctx& ctx, Temp var)
{
if (var.regClass().is_linear_vgpr())
return false;
auto is_current_var = [var](const Temp& test) { return var == test; };
return var != ctx.program->stack_ptr &&
std::none_of(ctx.program->scratch_offsets.begin(), ctx.program->scratch_offsets.end(),
is_current_var) &&
std::none_of(ctx.program->private_segment_buffers.begin(),
ctx.program->private_segment_buffers.end(), is_current_var);
}
RegisterDemand
init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)
{
RegisterDemand spilled_registers;
/* first block, nothing was spilled before */
if (block->linear_preds.empty())
return {0, 0};
/* live-in variables at the beginning of the current block */
const IDSet& live_in = ctx.program->live.live_in[block_idx];
/* loop header block */
if (block->kind & block_kind_loop_header) {
assert(block->linear_preds[0] == block_idx - 1);
assert(block->logical_preds[0] == block_idx - 1);
/* check how many live-through variables should be spilled */
RegisterDemand reg_pressure = block->live_in_demand;
RegisterDemand loop_demand = reg_pressure;
RegisterDemand loop_call_spills = RegisterDemand();
unsigned i = block_idx;
while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth) {
loop_demand.update(ctx.program->blocks[i].register_demand);
loop_call_spills.update(ctx.program->blocks[i].call_spills);
++i;
}
for (auto spilled : ctx.spills_exit[block_idx - 1]) {
/* variable is not live at loop entry: probably a phi operand */
if (!live_in.count(spilled.first.id()))
continue;
/* keep live-through variables spilled */
ctx.spills_entry[block_idx][spilled.first] = spilled.second;
spilled_registers += spilled.first;
loop_demand -= spilled.first;
}
if (!ctx.loop.empty()) {
/* If this is a nested loop, keep variables from the outer loop spilled. */
for (auto spilled : ctx.loop.back().spills) {
/* If the inner loop comes after the last continue statement of the outer loop,
* the loop-carried variables might not be live-in for the inner loop.
*/
if (live_in.count(spilled.first.id()) &&
ctx.spills_entry[block_idx].insert(spilled).second) {
spilled_registers += spilled.first;
loop_demand -= spilled.first;
}
}
}
/* select more live-through variables and constants */
RegType type = RegType::vgpr;
while (loop_demand.exceeds(ctx.target_pressure) ||
loop_call_spills.exceeds(spilled_registers)) {
/* if VGPR demand is low enough, select SGPRs */
if (type == RegType::vgpr && loop_demand.vgpr <= ctx.target_pressure.vgpr &&
loop_call_spills.vgpr <= spilled_registers.vgpr)
type = RegType::sgpr;
/* if SGPR demand is low enough, break */
if (type == RegType::sgpr && loop_demand.sgpr <= ctx.target_pressure.sgpr &&
loop_call_spills.sgpr <= spilled_registers.sgpr)
break;
float score = 0.0;
unsigned remat = 0;
Temp to_spill;
for (unsigned t : live_in) {
Temp var = Temp(t, ctx.program->temp_rc[t]);
if (var.type() != type || ctx.spills_entry[block_idx].count(var) ||
!is_spillable(ctx, var))
continue;
unsigned can_remat = ctx.remat.count(var);
if (can_remat > remat || (can_remat == remat && ctx.ssa_infos[t].score() > score)) {
to_spill = var;
score = ctx.ssa_infos[t].score();
remat = can_remat;
}
}
/* select SGPRs or break */
if (score == 0.0) {
if (type == RegType::sgpr)
break;
type = RegType::sgpr;
continue;
}
ctx.add_to_spills(to_spill, ctx.spills_entry[block_idx]);
spilled_registers += to_spill;
loop_demand -= to_spill;
}
/* create new loop_info */
loop_info info = {block_idx, ctx.spills_entry[block_idx], live_in};
ctx.loop.emplace_back(std::move(info));
/* shortcut */
if (!loop_demand.exceeds(ctx.target_pressure))
return spilled_registers;
/* if reg pressure is too high at beginning of loop, add variables with furthest use */
reg_pressure -= spilled_registers;
while (reg_pressure.exceeds(ctx.target_pressure)) {
float score = 0;
Temp to_spill = Temp();
type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;
for (aco_ptr<Instruction>& phi : block->instructions) {
if (!is_phi(phi))
break;
if (!phi->definitions[0].isTemp() || phi->definitions[0].isKill())
continue;
Temp var = phi->definitions[0].getTemp();
if (var.type() == type && !ctx.spills_entry[block_idx].count(var) &&
ctx.ssa_infos[var.id()].score() > score && is_spillable(ctx, var)) {
to_spill = var;
score = ctx.ssa_infos[var.id()].score();
}
}
assert(to_spill != Temp());
ctx.add_to_spills(to_spill, ctx.spills_entry[block_idx]);
spilled_registers += to_spill;
reg_pressure -= to_spill;
}
return spilled_registers;
}
/* branch block */
if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) {
/* keep variables spilled */
unsigned pred_idx = block->linear_preds[0];
for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
if (pair.first.type() != RegType::sgpr)
continue;
if (live_in.count(pair.first.id())) {
spilled_registers += pair.first;
ctx.spills_entry[block_idx].emplace(pair);
}
}
if (block->logical_preds.empty())
return spilled_registers;
pred_idx = block->logical_preds[0];
for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
if (pair.first.type() != RegType::vgpr)
continue;
if (live_in.count(pair.first.id())) {
spilled_registers += pair.first;
ctx.spills_entry[block_idx].emplace(pair);
}
}
return spilled_registers;
}
/* else: merge block */
std::map<Temp, bool> partial_spills;
/* keep variables spilled on all incoming paths */
for (unsigned t : live_in) {
const RegClass rc = ctx.program->temp_rc[t];
Temp var = Temp(t, rc);
Block::edge_vec& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
/* If it can be rematerialized, keep the variable spilled if all predecessors do not reload
* it. Otherwise, if any predecessor reloads it, ensure it's reloaded on all other
* predecessors. The idea is that it's better in practice to rematerialize redundantly than to
* create lots of phis. */
const bool remat = ctx.remat.count(var);
/* If the variable is spilled at the current loop-header, spilling is essentially for free
* while reloading is not. Thus, keep them spilled if they are at least partially spilled.
*/
const bool avoid_respill = block->loop_nest_depth && ctx.loop.back().spills.count(var);
bool spill = true;
bool partial_spill = false;
uint32_t spill_id = 0;
for (unsigned pred_idx : preds) {
if (!ctx.spills_exit[pred_idx].count(var)) {
spill = false;
} else {
partial_spill = true;
/* it might be that on one incoming path, the variable has a different spill_id, but
* add_couple_code() will take care of that. */
spill_id = ctx.spills_exit[pred_idx][var];
}
}
spill |= (remat && partial_spill);
spill |= (avoid_respill && partial_spill);
if (spill) {
ctx.spills_entry[block_idx][var] = spill_id;
partial_spills.erase(var);
spilled_registers += var;
} else {
partial_spills[var] = partial_spill;
}
}
/* same for phis */
for (aco_ptr<Instruction>& phi : block->instructions) {
if (!is_phi(phi))
break;
if (!phi->definitions[0].isTemp() || phi->definitions[0].isKill())
continue;
Block::edge_vec& preds =
phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
bool is_all_undef = true;
bool is_all_spilled = true;
bool is_partial_spill = false;
for (unsigned i = 0; i < phi->operands.size(); i++) {
if (phi->operands[i].isUndefined())
continue;
bool spilled = phi->operands[i].isTemp() &&
ctx.spills_exit[preds[i]].count(phi->operands[i].getTemp());
is_all_spilled &= spilled;
is_partial_spill |= spilled;
is_all_undef = false;
}
if (is_all_spilled && !is_all_undef) {
/* The phi is spilled at all predecessors. Keep it spilled. */
ctx.add_to_spills(phi->definitions[0].getTemp(), ctx.spills_entry[block_idx]);
spilled_registers += phi->definitions[0].getTemp();
partial_spills.erase(phi->definitions[0].getTemp());
} else {
/* Phis might increase the register pressure. */
partial_spills[phi->definitions[0].getTemp()] = is_partial_spill;
}
}
/* if reg pressure at first instruction is still too high, add partially spilled variables */
RegisterDemand reg_pressure = block->live_in_demand;
reg_pressure -= spilled_registers;
while (reg_pressure.exceeds(ctx.target_pressure)) {
assert(!partial_spills.empty());
std::map<Temp, bool>::iterator it = partial_spills.begin();
Temp to_spill = Temp();
bool is_partial_spill = false;
float score = 0.0;
RegType type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;
while (it != partial_spills.end()) {
assert(!ctx.spills_entry[block_idx].count(it->first));
if (it->first.type() == type && is_spillable(ctx, it->first) &&
((it->second && !is_partial_spill) ||
(it->second == is_partial_spill && ctx.ssa_infos[it->first.id()].score() > score))) {
score = ctx.ssa_infos[it->first.id()].score();
to_spill = it->first;
is_partial_spill = it->second;
}
++it;
}
assert(to_spill != Temp());
ctx.add_to_spills(to_spill, ctx.spills_entry[block_idx]);
partial_spills.erase(to_spill);
spilled_registers += to_spill;
reg_pressure -= to_spill;
}
return spilled_registers;
}
void
add_coupling_code(spill_ctx& ctx, Block* block, IDSet& live_in)
{
const unsigned block_idx = block->index;
/* No coupling code necessary */
if (block->linear_preds.size() == 0)
return;
/* Branch block: update renames */
if (block->linear_preds.size() == 1 &&
!(block->kind & (block_kind_loop_exit | block_kind_loop_header))) {
assert(ctx.processed[block->linear_preds[0]]);
ctx.renames[block_idx] = ctx.renames[block->linear_preds[0]];
if (!block->logical_preds.empty() && block->logical_preds[0] != block->linear_preds[0]) {
for (auto it : ctx.renames[block->logical_preds[0]]) {
if (it.first.type() == RegType::vgpr)
ctx.renames[block_idx].insert_or_assign(it.first, it.second);
}
}
return;
}
std::vector<aco_ptr<Instruction>> instructions;
/* loop header and merge blocks: check if all (linear) predecessors have been processed */
for (ASSERTED unsigned pred : block->linear_preds)
assert(ctx.processed[pred]);
/* iterate the phi nodes for which operands to spill at the predecessor */
for (aco_ptr<Instruction>& phi : block->instructions) {
if (!is_phi(phi))
break;
for (const Operand& op : phi->operands) {
if (op.isTemp())
ctx.ssa_infos[op.tempId()].num_uses--;
}
/* The phi is not spilled */
if (!phi->definitions[0].isTemp() ||
!ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp()))
continue;
Block::edge_vec& preds =
phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
phi->definitions[0].setKill(true);
for (unsigned i = 0; i < phi->operands.size(); i++) {
if (phi->operands[i].isUndefined())
continue;
unsigned pred_idx = preds[i];
Operand spill_op = phi->operands[i];
phi->operands[i] = Operand(phi->definitions[0].regClass());
if (spill_op.isTemp()) {
assert(spill_op.isKill());
Temp var = spill_op.getTemp();
std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
/* prevent the defining instruction from being DCE'd if it could be rematerialized */
if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var))
ctx.unused_remats.erase(ctx.remat[var].instr);
/* check if variable is already spilled at predecessor */
auto spilled = ctx.spills_exit[pred_idx].find(var);
if (spilled != ctx.spills_exit[pred_idx].end()) {
if (spilled->second != def_spill_id)
ctx.add_affinity(def_spill_id, spilled->second);
continue;
}
/* If the phi operand has the same name as the definition,
* add to predecessor's spilled variables, so that it gets
* skipped in the loop below.
*/
if (var == phi->definitions[0].getTemp())
ctx.spills_exit[pred_idx][var] = def_spill_id;
/* rename if necessary */
if (rename_it != ctx.renames[pred_idx].end()) {
spill_op.setTemp(rename_it->second);
ctx.renames[pred_idx].erase(rename_it);
}
}
/* add interferences */
for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx])
ctx.add_interference(def_spill_id, pair.second);
aco_ptr<Instruction> spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
spill->operands[0] = spill_op;
spill->operands[1] = Operand::c32(def_spill_id);
Block& pred = ctx.program->blocks[pred_idx];
unsigned idx = pred.instructions.size();
do {
assert(idx != 0);
idx--;
} while (phi->opcode == aco_opcode::p_phi &&
pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
pred.instructions.insert(it, std::move(spill));
}
}
/* iterate all (other) spilled variables for which to spill at the predecessor */
// TODO: would be better to have them sorted: first vgprs and first with longest distance
for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) {
/* if variable is not live-in, it must be from a phi: this works because of CSSA form */
if (!live_in.count(pair.first.id()))
continue;
Block::edge_vec& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
for (unsigned pred_idx : preds) {
/* variable is already spilled at predecessor */
auto spilled = ctx.spills_exit[pred_idx].find(pair.first);
if (spilled != ctx.spills_exit[pred_idx].end()) {
if (spilled->second != pair.second)
ctx.add_affinity(pair.second, spilled->second);
continue;
}
/* If this variable is spilled through the entire loop, no need to re-spill.
* It can be reloaded from the same spill-slot it got at the loop-preheader.
* No need to add interferences since every spilled variable in the loop already
* interferes with the spilled loop-variables. Make sure that the spill_ids match.
*/
const uint32_t loop_nest_depth = std::min(ctx.program->blocks[pred_idx].loop_nest_depth,
ctx.program->blocks[block_idx].loop_nest_depth);
if (loop_nest_depth) {
auto spill = ctx.loop[loop_nest_depth - 1].spills.find(pair.first);
if (spill != ctx.loop[loop_nest_depth - 1].spills.end() && spill->second == pair.second)
continue;
}
/* add interferences between spilled variable and predecessors exit spills */
for (std::pair<Temp, uint32_t> exit_spill : ctx.spills_exit[pred_idx])
ctx.add_interference(exit_spill.second, pair.second);
/* variable is in register at predecessor and has to be spilled */
/* rename if necessary */
Temp var = pair.first;
std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
if (rename_it != ctx.renames[pred_idx].end()) {
var = rename_it->second;
ctx.renames[pred_idx].erase(rename_it);
}
aco_ptr<Instruction> spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
spill->operands[0] = Operand(var);
spill->operands[1] = Operand::c32(pair.second);
Block& pred = ctx.program->blocks[pred_idx];
unsigned idx = pred.instructions.size();
do {
assert(idx != 0);
idx--;
} while (pair.first.type() == RegType::vgpr &&
pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
pred.instructions.insert(it, std::move(spill));
}
}
/* iterate phis for which operands to reload */
for (aco_ptr<Instruction>& phi : block->instructions) {
if (!is_phi(phi))
break;
if (phi->definitions[0].isKill())
continue;
assert(!phi->definitions[0].isTemp() ||
!ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp()));
Block::edge_vec& preds =
phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
for (unsigned i = 0; i < phi->operands.size(); i++) {
if (!phi->operands[i].isTemp())
continue;
unsigned pred_idx = preds[i];
/* if the operand was reloaded, rename */
if (!ctx.spills_exit[pred_idx].count(phi->operands[i].getTemp())) {
std::map<Temp, Temp>::iterator it =
ctx.renames[pred_idx].find(phi->operands[i].getTemp());
if (it != ctx.renames[pred_idx].end()) {
phi->operands[i].setTemp(it->second);
/* prevent the defining instruction from being DCE'd if it could be rematerialized */
} else {
auto remat_it = ctx.remat.find(phi->operands[i].getTemp());
if (remat_it != ctx.remat.end()) {
ctx.unused_remats.erase(remat_it->second.instr);
}
}
continue;
}
Temp tmp = phi->operands[i].getTemp();
/* reload phi operand at end of predecessor block */
Temp new_name = ctx.program->allocateTmp(tmp.regClass());
Block& pred = ctx.program->blocks[pred_idx];
unsigned idx = pred.instructions.size();
do {
assert(idx != 0);
idx--;
} while (phi->opcode == aco_opcode::p_phi &&
pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
aco_ptr<Instruction> reload =
do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]);
/* reload spilled exec mask directly to exec */
if (!phi->definitions[0].isTemp()) {
assert(phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec);
reload->definitions[0] = phi->definitions[0];
phi->operands[i] = Operand(exec, ctx.program->lane_mask);
} else {
ctx.spills_exit[pred_idx].erase(tmp);
ctx.renames[pred_idx][tmp] = new_name;
phi->operands[i].setTemp(new_name);
}
pred.instructions.insert(it, std::move(reload));
}
}
/* iterate live variables for which to reload */
for (unsigned t : live_in) {
const RegClass rc = ctx.program->temp_rc[t];
Temp var = Temp(t, rc);
/* skip spilled variables */
if (ctx.spills_entry[block_idx].count(var))
continue;
Block::edge_vec& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
for (unsigned pred_idx : preds) {
/* skip if the variable is not spilled at the predecessor */
if (!ctx.spills_exit[pred_idx].count(var))
continue;
/* variable is spilled at predecessor and has to be reloaded */
Temp new_name = ctx.program->allocateTmp(rc);
Block& pred = ctx.program->blocks[pred_idx];
unsigned idx = pred.instructions.size();
do {
assert(idx != 0);
idx--;
} while (rc.type() == RegType::vgpr &&
pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
aco_ptr<Instruction> reload =
do_reload(ctx, var, new_name, ctx.spills_exit[pred.index][var]);
pred.instructions.insert(it, std::move(reload));
ctx.spills_exit[pred.index].erase(var);
ctx.renames[pred.index][var] = new_name;
}
/* check if we have to create a new phi for this variable */
Temp rename = Temp();
bool is_same = true;
for (unsigned pred_idx : preds) {
if (!ctx.renames[pred_idx].count(var)) {
if (rename == Temp())
rename = var;
else
is_same = rename == var;
} else {
if (rename == Temp())
rename = ctx.renames[pred_idx][var];
else
is_same = rename == ctx.renames[pred_idx][var];
}
if (!is_same)
break;
}
if (!is_same) {
/* the variable was renamed differently in the predecessors: we have to create a phi */
aco_opcode opcode = rc.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
rename = ctx.program->allocateTmp(rc);
for (unsigned i = 0; i < phi->operands.size(); i++) {
Temp tmp;
if (ctx.renames[preds[i]].count(var)) {
tmp = ctx.renames[preds[i]][var];
} else if (preds[i] >= block_idx) {
tmp = rename;
} else {
tmp = var;
/* prevent the defining instruction from being DCE'd if it could be rematerialized */
if (ctx.remat.count(tmp))
ctx.unused_remats.erase(ctx.remat[tmp].instr);
}
phi->operands[i] = Operand(tmp);
}
phi->definitions[0] = Definition(rename);
phi->register_demand = block->live_in_demand;
block->instructions.insert(block->instructions.begin(), std::move(phi));
}
/* the variable was renamed: add new name to renames */
if (!(rename == Temp() || rename == var))
ctx.renames[block_idx][var] = rename;
}
}
void
process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand spilled_registers)
{
assert(!ctx.processed[block_idx]);
std::vector<aco_ptr<Instruction>> instructions;
unsigned idx = 0;
/* phis are handled separately */
while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
const Definition def = block->instructions[idx]->definitions[0];
if (def.isTemp() && !def.isKill() && def.tempId() < ctx.ssa_infos.size())
ctx.program->live.live_in[block_idx].insert(def.tempId());
instructions.emplace_back(std::move(block->instructions[idx++]));
}
auto& current_spills = ctx.spills_exit[block_idx];
std::vector<Temp> call_spills;
while (idx < block->instructions.size()) {
aco_ptr<Instruction>& instr = block->instructions[idx];
/* Spilling is handled as part of phis (they should always have the same or higher register
* demand). If we try to spill here, we might not be able to reduce the register demand enough
* because there is no path to spill constant/undef phi operands. */
if (instr->opcode == aco_opcode::p_branch) {
instructions.emplace_back(std::move(instr));
idx++;
continue;
}
std::map<Temp, std::pair<Temp, uint32_t>> reloads;
if (!call_spills.empty()) {
RegisterDemand demand = instr->register_demand;
while (!call_spills.empty() &&
!(demand - spilled_registers + call_spills.back()).exceeds(ctx.target_pressure)) {
Temp old_tmp = call_spills.back();
call_spills.pop_back();
Temp new_tmp = ctx.program->allocateTmp(old_tmp.regClass());
ctx.renames[block_idx][old_tmp] = new_tmp;
reloads[old_tmp] = std::make_pair(new_tmp, current_spills[old_tmp]);
current_spills.erase(old_tmp);
spilled_registers -= new_tmp;
}
call_spills.clear();
}
/* rename and reload operands */
for (Operand& op : instr->operands) {
if (!op.isTemp())
continue;
if (op.isFirstKill())
ctx.program->live.live_in[block_idx].erase(op.tempId());
ctx.ssa_infos[op.tempId()].num_uses--;
if (!current_spills.count(op.getTemp()))
continue;
/* the Operand is spilled: add it to reloads */
Temp new_tmp = ctx.program->allocateTmp(op.regClass());
ctx.renames[block_idx][op.getTemp()] = new_tmp;
reloads[op.getTemp()] = std::make_pair(new_tmp, current_spills[op.getTemp()]);
current_spills.erase(op.getTemp());
spilled_registers -= new_tmp;
}
/* check if register demand is low enough during and after the current instruction */
if (block->register_demand.exceeds(ctx.target_pressure) || instr->isCall()) {
RegisterDemand new_demand = instr->register_demand;
std::optional<RegisterDemand> live_changes;
/* if reg pressure is too high, spill variable with furthest next use */
while (true) {
bool needs_spill = (new_demand - spilled_registers).exceeds(ctx.target_pressure);
if (instr->isCall()) {
RegisterDemand call_preserved_limit = instr->call().callee_preserved_limit;
/* Exclude the linear VGPRs created for spilling SGPRs from the limit,
* if they are placed in clobbered register ranges (i.e. the preserved limit
* can't fit all of them). The preserved spiller will take care of those. */
call_preserved_limit.vgpr =
MAX2(call_preserved_limit.vgpr - (int16_t)ctx.extra_vgprs, 0);
needs_spill |= (instr->call().caller_preserved_demand - spilled_registers)
.exceeds(call_preserved_limit);
}
if (!needs_spill)
break;
float score = 0.0;
Temp to_spill = Temp();
bool spill_is_operand = false;
bool spill_is_clobbered = false;
unsigned respill_slot = -1u;
unsigned do_rematerialize = 0;
unsigned avoid_respill = 0;
RegType type = RegType::sgpr;
bool spill_vgpr = new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr;
if (instr->isCall()) {
RegisterDemand call_preserved_limit = instr->call().callee_preserved_limit;
call_preserved_limit.vgpr =
MAX2(call_preserved_limit.vgpr - (int16_t)ctx.extra_vgprs, 0);
spill_vgpr |= instr->call().caller_preserved_demand.vgpr - spilled_registers.vgpr >
call_preserved_limit.vgpr;
}
if (spill_vgpr)
type = RegType::vgpr;
for (unsigned t : ctx.program->live.live_in[block_idx]) {
RegClass rc = ctx.program->temp_rc[t];
Temp var = Temp(t, rc);
if (rc.type() != type || current_spills.count(var) || rc.is_linear_vgpr() ||
!is_spillable(ctx, var))
continue;
unsigned can_rematerialize = ctx.remat.count(var);
unsigned loop_variable = block->loop_nest_depth && ctx.loop.back().spills.count(var);
if (avoid_respill > loop_variable || do_rematerialize > can_rematerialize)
continue;
if (can_rematerialize > do_rematerialize || loop_variable > avoid_respill ||
ctx.ssa_infos[t].score() > score) {
bool is_operand = false;
bool is_clobbered = false;
bool can_spill = true;
for (auto& op : instr->operands) {
if (!op.isTemp() || op.getTemp() != var)
continue;
if (op.isLateKill() || op.isKill()) {
can_spill = false;
break;
}
/* Spilling vector operands causes us to emit a split_vector, increasing live
* state temporarily. This is ok when we have enough register headroom (when
* spilling calls), but causes spilling to fail otherwise.
*/
if (op.size() > 1) {
RegisterDemand before = new_demand - spilled_registers;
before -= get_temp_registers(instr.get()) - get_live_changes(instr.get());
if ((before + op.getTemp()).exceeds(ctx.target_pressure)) {
can_spill = false;
break;
}
}
if (!live_changes)
live_changes = get_temp_reg_changes(instr.get());
/* Don't spill operands if killing operands won't help with register pressure */
if (!instr->isCall() && !op.isClobbered() && RegisterDemand(op.getTemp()).exceeds(*live_changes)) {
can_spill = false;
break;
}
if (instr->isCall() && !op.isClobbered()) {
can_spill = false;
break;
}
is_operand = true;
is_clobbered = op.isClobbered();
break;
}
if (!can_spill)
continue;
bool is_spilled_operand = is_operand && reloads.count(var);
to_spill = var;
score = ctx.ssa_infos[t].score();
do_rematerialize = can_rematerialize;
avoid_respill = loop_variable || is_spilled_operand;
spill_is_operand = is_operand;
spill_is_clobbered = is_clobbered;
/* This variable is spilled at the loop-header of the current loop.
* Re-use the spill-slot in order to avoid an extra store.
*/
if (loop_variable)
respill_slot = ctx.loop.back().spills[var];
else if (is_spilled_operand)
respill_slot = reloads[var].second;
}
}
assert(to_spill != Temp());
if (spill_is_operand) {
/* We might not be able to spill all operands. Keep live_changes up-to-date so we
* stop when we spilled every operand we can.
*/
if (!spill_is_clobbered)
*live_changes -= to_spill;
}
if (avoid_respill) {
current_spills[to_spill] = respill_slot;
spilled_registers += to_spill;
continue;
}
uint32_t spill_id = ctx.add_to_spills(to_spill, current_spills);
if (instr->isCall())
call_spills.emplace_back(to_spill);
/* add interferences with reloads */
for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads)
ctx.add_interference(spill_id, pair.second.second);
spilled_registers += to_spill;
/* rename if necessary */
if (ctx.renames[block_idx].count(to_spill)) {
to_spill = ctx.renames[block_idx][to_spill];
}
/* add spill to new instructions */
aco_ptr<Instruction> spill{
create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
spill->operands[0] = Operand(to_spill);
spill->operands[1] = Operand::c32(spill_id);
instructions.emplace_back(std::move(spill));
}
}
for (const Definition& def : instr->definitions) {
if (def.isTemp() && !def.isKill())
ctx.program->live.live_in[block_idx].insert(def.tempId());
}
/* rename operands */
for (Operand& op : instr->operands) {
if (op.isTemp()) {
auto rename_it = ctx.renames[block_idx].find(op.getTemp());
if (rename_it != ctx.renames[block_idx].end()) {
op.setTemp(rename_it->second);
} else {
/* prevent its defining instruction from being DCE'd if it could be rematerialized */
auto remat_it = ctx.remat.find(op.getTemp());
if (remat_it != ctx.remat.end()) {
ctx.unused_remats.erase(remat_it->second.instr);
}
}
}
}
/* add reloads and instruction to new instructions */
for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads) {
aco_ptr<Instruction> reload =
do_reload(ctx, pair.first, pair.second.first, pair.second.second);
instructions.emplace_back(std::move(reload));
}
instructions.emplace_back(std::move(instr));
idx++;
}
block->instructions = std::move(instructions);
}
void
spill_block(spill_ctx& ctx, unsigned block_idx)
{
Block* block = &ctx.program->blocks[block_idx];
/* determine set of variables which are spilled at the beginning of the block */
RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx);
if (!(block->kind & block_kind_loop_header)) {
/* add spill/reload code on incoming control flow edges */
add_coupling_code(ctx, block, ctx.program->live.live_in[block_idx]);
}
assert(ctx.spills_exit[block_idx].empty());
ctx.spills_exit[block_idx] = ctx.spills_entry[block_idx];
process_block(ctx, block_idx, block, spilled_registers);
ctx.processed[block_idx] = true;
/* check if the next block leaves the current loop */
if (block->loop_nest_depth == 0 ||
ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth)
return;
uint32_t loop_header_idx = ctx.loop.back().index;
/* preserve original renames at end of loop header block */
aco::map<Temp, Temp> renames = std::move(ctx.renames[loop_header_idx]);
/* add coupling code to all loop header predecessors */
for (unsigned t : ctx.loop.back().live_in)
ctx.ssa_infos[t].num_uses--;
add_coupling_code(ctx, &ctx.program->blocks[loop_header_idx], ctx.loop.back().live_in);
renames.swap(ctx.renames[loop_header_idx]);
/* remove loop header info from stack */
ctx.loop.pop_back();
if (renames.empty())
return;
/* Add the new renames to each block */
for (std::pair<Temp, Temp> rename : renames) {
/* If there is already a rename, don't overwrite it. */
for (unsigned idx = loop_header_idx; idx <= block_idx; idx++)
ctx.renames[idx].insert(rename);
}
/* propagate new renames through loop: i.e. repair the SSA */
for (unsigned idx = loop_header_idx; idx <= block_idx; idx++) {
Block& current = ctx.program->blocks[idx];
/* rename all uses in this block */
for (aco_ptr<Instruction>& instr : current.instructions) {
/* no need to rename the loop header phis once again. */
if (idx == loop_header_idx && is_phi(instr))
continue;
for (Operand& op : instr->operands) {
if (!op.isTemp())
continue;
auto rename = renames.find(op.getTemp());
if (rename != renames.end())
op.setTemp(rename->second);
}
}
}
}
void
setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
std::vector<aco_ptr<Instruction>>& instructions, uint32_t spill_slot,
unsigned temp_size, Operand* scratch_offset, unsigned* offset)
{
uint32_t scratch_size = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
uint32_t offset_range;
if (ctx.program->gfx_level >= GFX9) {
offset_range =
ctx.program->dev.scratch_global_offset_max - ctx.program->dev.scratch_global_offset_min;
} else {
if (scratch_size < ctx.program->dev.buf_offset_max)
offset_range = ctx.program->dev.buf_offset_max - scratch_size;
else
offset_range = 0;
}
assert(spill_slot >= ctx.max_lds_spill_slots);
spill_slot -= ctx.max_lds_spill_slots;
bool overflow = (ctx.vgpr_spill_slots - ctx.max_lds_spill_slots - 1) * 4 > offset_range;
Builder rsrc_bld(ctx.program);
unsigned bld_block = block.index;
if (block.kind & block_kind_top_level) {
rsrc_bld.reset(&instructions);
} else if (ctx.scratch_rsrc == Temp() && (!overflow || ctx.program->gfx_level < GFX9)) {
Block* tl_block = nullptr;
for (int i = block.index; i >= 0; i--) {
if (ctx.program->blocks[i].kind & block_kind_contains_call)
break;
/* If we're in a loop, there may be calls later in the loop where a scratch_rsrc may
* not be present, so we can't create any outside of the loop and use them here.
*/
if (ctx.program->has_call && block.loop_nest_depth > 0 &&
(ctx.program->blocks[i].kind & block_kind_loop_header))
break;
if (ctx.program->blocks[i].kind & block_kind_top_level) {
tl_block = &ctx.program->blocks[i];
break;
}
}
if (tl_block) {
/* find p_logical_end */
std::vector<aco_ptr<Instruction>>& prev_instructions = tl_block->instructions;
unsigned idx = prev_instructions.size() - 1;
while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end)
idx--;
rsrc_bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx));
bld_block = tl_block->index;
} else {
rsrc_bld.reset(&instructions);
}
}
/* If spilling overflows the constant offset range at any point, we need to emit the soffset
* before every spill/reload to avoid increasing register demand.
*/
Builder offset_bld = rsrc_bld;
if (overflow)
offset_bld.reset(&instructions);
*offset = spill_slot * 4;
if (ctx.program->gfx_level >= GFX9) {
*offset += ctx.program->dev.scratch_global_offset_min;
if (ctx.scratch_rsrc == Temp() || overflow) {
int32_t saddr = scratch_size - ctx.program->dev.scratch_global_offset_min;
int32_t last_offset = (int32_t)*offset + (temp_size - 1) * 4;
if (last_offset > (int32_t)ctx.program->dev.scratch_global_offset_max) {
saddr += (int32_t)*offset;
*offset = 0;
}
/* GFX9+ uses scratch_* instructions, which don't use a resource. */
if (ctx.program->stack_ptr.id())
ctx.scratch_rsrc =
offset_bld.sop2(aco_opcode::s_add_u32, offset_bld.def(s1), Definition(scc, s1),
Operand(ctx.program->stack_ptr), Operand::c32(saddr));
else
ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr));
ctx.scratch_rsrc_block = bld_block;
}
} else {
if (ctx.scratch_rsrc == Temp()) {
ctx.scratch_rsrc = load_scratch_resource(ctx.program, rsrc_bld, ctx.resume_idx, overflow);
ctx.scratch_rsrc_block = bld_block;
}
if (overflow) {
uint32_t soffset =
ctx.program->config->scratch_bytes_per_wave + *offset * ctx.program->wave_size;
*offset = 0;
*scratch_offset = offset_bld.copy(offset_bld.def(s1), Operand::c32(soffset));
} else {
if (scratch_offset->isUndefined())
*scratch_offset = Operand::zero();
*offset += scratch_size;
}
}
}
uint32_t
setup_vgpr_spill_reload_lds(spill_ctx& ctx, Block& block,
std::vector<aco_ptr<Instruction>>& instructions, uint32_t spill_slot)
{
uint32_t offset = align(ctx.program->config->lds_size, 4);
offset += spill_slot * ctx.program->workgroup_size * 4;
assert(offset < UINT16_MAX);
if (!ctx.lds_m0_zero.id()) {
Builder bld(ctx.program);
if (block.kind & block_kind_top_level) {
bld.reset(&instructions);
} else {
Block* tl_block = &block;
while (!(tl_block->kind & block_kind_top_level))
tl_block = &ctx.program->blocks[tl_block->linear_idom];
/* find p_logical_end */
std::vector<aco_ptr<Instruction>>& prev_instructions = tl_block->instructions;
unsigned idx = prev_instructions.size() - 1;
while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end)
idx--;
bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx));
}
ctx.lds_m0_zero = bld.copy(bld.def(s1, m0), Operand::c32(0));
}
return offset;
}
void
spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& instructions,
aco_ptr<Instruction>& spill, std::vector<uint32_t>& slots)
{
ctx.program->config->spilled_vgprs += spill->operands[0].size();
uint32_t spill_id = spill->operands[1].constantValue();
uint32_t spill_slot = slots[spill_id];
uint32_t slot_count = spill->operands[0].size();
uint32_t lds_slots = spill_slot < ctx.max_lds_spill_slots
? MIN2(ctx.max_lds_spill_slots - spill_slot, slot_count)
: 0;
uint32_t scratch_slots = slot_count - lds_slots;
Operand scratch_offset;
if (!ctx.program->scratch_offsets.empty())
scratch_offset = Operand(ctx.program->scratch_offsets[ctx.resume_idx]);
unsigned offset;
if (scratch_slots)
setup_vgpr_spill_reload(ctx, block, instructions, spill_slot + lds_slots, scratch_slots,
&scratch_offset, &offset);
unsigned lds_offset = 0;
if (lds_slots)
lds_offset = setup_vgpr_spill_reload_lds(ctx, block, instructions, spill_slot);
assert(spill->operands[0].isTemp());
Temp temp = spill->operands[0].getTemp();
assert(temp.type() == RegType::vgpr && !temp.is_linear());
Builder bld(ctx.program, &instructions);
if (temp.size() > 1) {
const unsigned workgroup_size = ctx.program->workgroup_size;
Instruction* split{
create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
split->operands[0] = Operand(temp);
for (unsigned i = 0; i < temp.size(); i++)
split->definitions[i] = bld.def(v1);
bld.insert(split);
for (unsigned i = 0; i < temp.size(); i++, offset += 4, lds_offset += workgroup_size * 4) {
Temp elem = split->definitions[i].getTemp();
if (i < lds_slots) {
bld.ds(aco_opcode::ds_write_addtid_b32, bld.m0(ctx.lds_m0_zero), elem, lds_offset);
} else if (ctx.program->gfx_level >= GFX9) {
bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, elem,
offset, memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc,
Operand(v1), scratch_offset, elem, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
} else if (lds_slots) {
bld.ds(aco_opcode::ds_write_addtid_b32, bld.m0(ctx.lds_m0_zero), temp, lds_offset);
} else if (ctx.program->gfx_level >= GFX9) {
bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, temp, offset,
memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
scratch_offset, temp, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
void
reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& instructions,
aco_ptr<Instruction>& reload, std::vector<uint32_t>& slots)
{
uint32_t spill_id = reload->operands[0].constantValue();
uint32_t spill_slot = slots[spill_id];
uint32_t slot_count = reload->definitions[0].size();
uint32_t lds_slots = spill_slot < ctx.max_lds_spill_slots
? MIN2(ctx.max_lds_spill_slots - spill_slot, slot_count)
: 0;
uint32_t scratch_slots = slot_count - lds_slots;
Operand scratch_offset;
if (!ctx.program->scratch_offsets.empty())
scratch_offset = Operand(ctx.program->scratch_offsets[ctx.resume_idx]);
unsigned offset;
if (scratch_slots)
setup_vgpr_spill_reload(ctx, block, instructions, spill_slot + lds_slots, scratch_slots,
&scratch_offset, &offset);
unsigned lds_offset = 0;
if (lds_slots)
lds_offset = setup_vgpr_spill_reload_lds(ctx, block, instructions, spill_slot);
Definition def = reload->definitions[0];
Builder bld(ctx.program, &instructions);
if (def.size() > 1) {
const unsigned workgroup_size = ctx.program->workgroup_size;
Instruction* vec{
create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
vec->definitions[0] = def;
for (unsigned i = 0; i < def.size(); i++, offset += 4, lds_offset += workgroup_size * 4) {
Temp tmp = bld.tmp(v1);
vec->operands[i] = Operand(tmp);
if (i < lds_slots) {
bld.ds(aco_opcode::ds_read_addtid_b32, Definition(tmp), bld.m0(ctx.lds_m0_zero),
lds_offset);
} else if (ctx.program->gfx_level >= GFX9) {
bld.scratch(aco_opcode::scratch_load_dword, Definition(tmp), Operand(v1),
ctx.scratch_rsrc, offset,
memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr =
bld.mubuf(aco_opcode::buffer_load_dword, Definition(tmp), ctx.scratch_rsrc,
Operand(v1), scratch_offset, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
bld.insert(vec);
} else if (lds_slots) {
bld.ds(aco_opcode::ds_read_addtid_b32, def, bld.m0(ctx.lds_m0_zero), lds_offset);
} else if (ctx.program->gfx_level >= GFX9) {
bld.scratch(aco_opcode::scratch_load_dword, def, Operand(v1), ctx.scratch_rsrc, offset,
memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc,
Operand(v1), scratch_offset, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
void
add_interferences(spill_ctx& ctx, std::vector<bool>& is_assigned, std::vector<uint32_t>& slots,
std::vector<bool>& slots_used, unsigned id)
{
for (unsigned other : ctx.interferences[id].second) {
if (!is_assigned[other])
continue;
RegClass other_rc = ctx.interferences[other].first;
unsigned slot = slots[other];
std::fill(slots_used.begin() + slot, slots_used.begin() + slot + other_rc.size(), true);
}
}
unsigned
find_available_slot(std::vector<bool>& used, unsigned wave_size, unsigned size, bool is_sgpr)
{
unsigned wave_size_minus_one = wave_size - 1;
unsigned slot = 0;
while (true) {
bool available = true;
for (unsigned i = 0; i < size; i++) {
if (slot + i < used.size() && used[slot + i]) {
available = false;
break;
}
}
if (!available) {
slot++;
continue;
}
if (is_sgpr && ((slot & wave_size_minus_one) > wave_size - size)) {
slot = align(slot, wave_size);
continue;
}
std::fill(used.begin(), used.end(), false);
if (slot + size > used.size())
used.resize(slot + size);
return slot;
}
}
void
assign_spill_slots_helper(spill_ctx& ctx, RegType type, std::vector<bool>& is_assigned,
std::vector<uint32_t>& slots, unsigned* num_slots)
{
std::vector<bool> slots_used;
/* assign slots for ids with affinities first */
for (std::vector<uint32_t>& vec : ctx.affinities) {
if (ctx.interferences[vec[0]].first.type() != type)
continue;
for (unsigned id : vec) {
if (!ctx.is_reloaded[id])
continue;
add_interferences(ctx, is_assigned, slots, slots_used, id);
}
unsigned slot = find_available_slot(
slots_used, ctx.wave_size, ctx.interferences[vec[0]].first.size(), type == RegType::sgpr);
for (unsigned id : vec) {
assert(!is_assigned[id]);
if (ctx.is_reloaded[id]) {
slots[id] = slot;
is_assigned[id] = true;
}
}
}
/* assign slots for ids without affinities */
for (unsigned id = 0; id < ctx.interferences.size(); id++) {
if (is_assigned[id] || !ctx.is_reloaded[id] || ctx.interferences[id].first.type() != type)
continue;
add_interferences(ctx, is_assigned, slots, slots_used, id);
unsigned slot = find_available_slot(
slots_used, ctx.wave_size, ctx.interferences[id].first.size(), type == RegType::sgpr);
slots[id] = slot;
is_assigned[id] = true;
}
*num_slots = slots_used.size();
}
void
end_unused_spill_vgprs(spill_ctx& ctx, Block& block, std::vector<Temp>& vgpr_spill_temps,
const std::vector<uint32_t>& slots,
const aco::unordered_map<Temp, uint32_t>& spills)
{
std::vector<bool> is_used(vgpr_spill_temps.size());
for (std::pair<Temp, uint32_t> pair : spills) {
if (pair.first.type() == RegType::sgpr && ctx.is_reloaded[pair.second])
is_used[slots[pair.second] / ctx.wave_size] = true;
}
std::vector<Temp> temps;
for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
if (vgpr_spill_temps[i].id() && !is_used[i]) {
temps.push_back(vgpr_spill_temps[i]);
vgpr_spill_temps[i] = Temp();
}
}
if (temps.empty() || block.linear_preds.empty())
return;
aco_ptr<Instruction> destr{
create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, temps.size(), 0)};
for (unsigned i = 0; i < temps.size(); i++)
destr->operands[i] = Operand(temps[i]);
std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
while (is_phi(*it))
++it;
block.instructions.insert(it, std::move(destr));
}
void
assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
{
std::vector<uint32_t> slots(ctx.interferences.size());
std::vector<bool> is_assigned(ctx.interferences.size());
/* first, handle affinities: just merge all interferences into both spill ids */
for (std::vector<uint32_t>& vec : ctx.affinities) {
for (unsigned i = 0; i < vec.size(); i++) {
for (unsigned j = i + 1; j < vec.size(); j++) {
assert(vec[i] != vec[j]);
bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]];
ctx.is_reloaded[vec[i]] = reloaded;
ctx.is_reloaded[vec[j]] = reloaded;
}
}
}
for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++)
for (ASSERTED uint32_t id : ctx.interferences[i].second)
assert(i != id);
/* for each spill slot, assign as many spill ids as possible */
assign_spill_slots_helper(ctx, RegType::sgpr, is_assigned, slots, &ctx.sgpr_spill_slots);
assign_spill_slots_helper(ctx, RegType::vgpr, is_assigned, slots, &ctx.vgpr_spill_slots);
for (unsigned id = 0; id < is_assigned.size(); id++)
assert(is_assigned[id] || !ctx.is_reloaded[id]);
for (std::vector<uint32_t>& vec : ctx.affinities) {
for (unsigned i = 0; i < vec.size(); i++) {
for (unsigned j = i + 1; j < vec.size(); j++) {
assert(is_assigned[vec[i]] == is_assigned[vec[j]]);
if (!is_assigned[vec[i]])
continue;
assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]);
assert(ctx.interferences[vec[i]].first.type() ==
ctx.interferences[vec[j]].first.type());
assert(slots[vec[i]] == slots[vec[j]]);
}
}
}
/* hope, we didn't mess up */
std::vector<Temp> vgpr_spill_temps((ctx.sgpr_spill_slots + ctx.wave_size - 1) / ctx.wave_size);
assert(vgpr_spill_temps.size() <= spills_to_vgpr);
/* replace pseudo instructions with actual hardware instructions */
unsigned last_top_level_block_idx = 0;
for (Block& block : ctx.program->blocks) {
if (ctx.scratch_rsrc_block < ctx.program->blocks.size() &&
!(ctx.program->blocks[ctx.scratch_rsrc_block].kind & block_kind_top_level))
ctx.scratch_rsrc = Temp();
if (block.kind & block_kind_loop_header) {
for (unsigned index = block.index;
index < ctx.program->blocks.size() &&
ctx.program->blocks[index].loop_nest_depth >= block.loop_nest_depth;
++index) {
if (ctx.program->blocks[index].kind & block_kind_contains_call) {
ctx.scratch_rsrc = Temp();
break;
}
}
}
if (block.kind & block_kind_top_level) {
last_top_level_block_idx = block.index;
end_unused_spill_vgprs(ctx, block, vgpr_spill_temps, slots, ctx.spills_entry[block.index]);
/* If the block has no predecessors (for example in RT resume shaders),
* we cannot reuse the current scratch_rsrc temp because its definition is unreachable */
if (block.linear_preds.empty())
ctx.scratch_rsrc = Temp();
if (block.kind & block_kind_resume)
++ctx.resume_idx;
}
std::vector<aco_ptr<Instruction>>::iterator it;
std::vector<aco_ptr<Instruction>> instructions;
instructions.reserve(block.instructions.size());
Builder bld(ctx.program, &instructions);
for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
/* Recreate the scratch_rsrc after calls, since we don't spill it. */
if ((*it)->isCall())
ctx.scratch_rsrc = Temp();
if ((*it)->opcode == aco_opcode::p_spill) {
uint32_t spill_id = (*it)->operands[1].constantValue();
if (!ctx.is_reloaded[spill_id]) {
/* never reloaded, so don't spill */
} else if (!is_assigned[spill_id]) {
build: avoid redefining unreachable() which is standard in C23 In the C23 standard unreachable() is now a predefined function-like macro in <stddef.h> See https://android.googlesource.com/platform/bionic/+/HEAD/docs/c23.md#is-now-a-predefined-function_like-macro-in And this causes build errors when building for C23: ----------------------------------------------------------------------- In file included from ../src/util/log.h:30, from ../src/util/log.c:30: ../src/util/macros.h:123:9: warning: "unreachable" redefined 123 | #define unreachable(str) \ | ^~~~~~~~~~~ In file included from ../src/util/macros.h:31: /usr/lib/gcc/x86_64-linux-gnu/14/include/stddef.h:456:9: note: this is the location of the previous definition 456 | #define unreachable() (__builtin_unreachable ()) | ^~~~~~~~~~~ ----------------------------------------------------------------------- So don't redefine it with the same name, but use the name UNREACHABLE() to also signify it's a macro. Using a different name also makes sense because the behavior of the macro was extending the one of __builtin_unreachable() anyway, and it also had a different signature, accepting one argument, compared to the standard unreachable() with no arguments. This change improves the chances of building mesa with the C23 standard, which for instance is the default in recent AOSP versions. All the instances of the macro, including the definition, were updated with the following command line: git grep -l '[^_]unreachable(' -- "src/**" | sort | uniq | \ while read file; \ do \ sed -e 's/\([^_]\)unreachable(/\1UNREACHABLE(/g' -i "$file"; \ done && \ sed -e 's/#undef unreachable/#undef UNREACHABLE/g' -i src/intel/isl/isl_aux_info.c Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36437>
2025-07-23 09:17:35 +02:00
UNREACHABLE("No spill slot assigned for spill id");
} else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
spill_vgpr(ctx, block, instructions, *it, slots);
} else {
ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
uint32_t spill_slot = slots[spill_id];
/* check if the linear vgpr already exists */
if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
Temp linear_vgpr = ctx.program->allocateTmp(lv1);
vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
aco_ptr<Instruction> create{
create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
create->definitions[0] = Definition(linear_vgpr);
/* find the right place to insert this definition */
if (last_top_level_block_idx == block.index) {
/* insert right before the current instruction */
instructions.emplace_back(std::move(create));
} else {
assert(last_top_level_block_idx < block.index);
/* insert after p_logical_end of the last top-level block */
std::vector<aco_ptr<Instruction>>& block_instrs =
ctx.program->blocks[last_top_level_block_idx].instructions;
auto insert_point =
std::find_if(block_instrs.rbegin(), block_instrs.rend(),
[](const auto& iter) {
return iter->opcode == aco_opcode::p_logical_end;
})
.base();
block_instrs.insert(insert_point, std::move(create));
}
}
/* spill sgpr: just add the vgpr temp to operands */
Instruction* spill = create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
spill->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
spill->operands[2] = (*it)->operands[0];
instructions.emplace_back(aco_ptr<Instruction>(spill));
}
} else if ((*it)->opcode == aco_opcode::p_reload) {
uint32_t spill_id = (*it)->operands[0].constantValue();
assert(ctx.is_reloaded[spill_id]);
if (!is_assigned[spill_id]) {
build: avoid redefining unreachable() which is standard in C23 In the C23 standard unreachable() is now a predefined function-like macro in <stddef.h> See https://android.googlesource.com/platform/bionic/+/HEAD/docs/c23.md#is-now-a-predefined-function_like-macro-in And this causes build errors when building for C23: ----------------------------------------------------------------------- In file included from ../src/util/log.h:30, from ../src/util/log.c:30: ../src/util/macros.h:123:9: warning: "unreachable" redefined 123 | #define unreachable(str) \ | ^~~~~~~~~~~ In file included from ../src/util/macros.h:31: /usr/lib/gcc/x86_64-linux-gnu/14/include/stddef.h:456:9: note: this is the location of the previous definition 456 | #define unreachable() (__builtin_unreachable ()) | ^~~~~~~~~~~ ----------------------------------------------------------------------- So don't redefine it with the same name, but use the name UNREACHABLE() to also signify it's a macro. Using a different name also makes sense because the behavior of the macro was extending the one of __builtin_unreachable() anyway, and it also had a different signature, accepting one argument, compared to the standard unreachable() with no arguments. This change improves the chances of building mesa with the C23 standard, which for instance is the default in recent AOSP versions. All the instances of the macro, including the definition, were updated with the following command line: git grep -l '[^_]unreachable(' -- "src/**" | sort | uniq | \ while read file; \ do \ sed -e 's/\([^_]\)unreachable(/\1UNREACHABLE(/g' -i "$file"; \ done && \ sed -e 's/#undef unreachable/#undef UNREACHABLE/g' -i src/intel/isl/isl_aux_info.c Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36437>
2025-07-23 09:17:35 +02:00
UNREACHABLE("No spill slot assigned for spill id");
} else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
reload_vgpr(ctx, block, instructions, *it, slots);
} else {
uint32_t spill_slot = slots[spill_id];
/* check if the linear vgpr already exists */
if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
Temp linear_vgpr = ctx.program->allocateTmp(lv1);
vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
aco_ptr<Instruction> create{
create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
create->definitions[0] = Definition(linear_vgpr);
/* find the right place to insert this definition */
if (last_top_level_block_idx == block.index) {
/* insert right before the current instruction */
instructions.emplace_back(std::move(create));
} else {
assert(last_top_level_block_idx < block.index);
/* insert after p_logical_end of the last top-level block */
std::vector<aco_ptr<Instruction>>& block_instrs =
ctx.program->blocks[last_top_level_block_idx].instructions;
auto insert_point =
std::find_if(block_instrs.rbegin(), block_instrs.rend(),
[](const auto& iter) {
return iter->opcode == aco_opcode::p_logical_end;
})
.base();
block_instrs.insert(insert_point, std::move(create));
}
}
/* reload sgpr: just add the vgpr temp to operands */
Instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1);
reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
reload->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
reload->definitions[0] = (*it)->definitions[0];
instructions.emplace_back(aco_ptr<Instruction>(reload));
}
} else if (!ctx.unused_remats.count(it->get())) {
instructions.emplace_back(std::move(*it));
}
}
block.instructions = std::move(instructions);
}
/* update required scratch/LDS memory */
unsigned lds_slots_used = MIN2(ctx.max_lds_spill_slots, ctx.vgpr_spill_slots);
unsigned lds_bytes_used = lds_slots_used * 4 * ctx.program->workgroup_size;
if (lds_bytes_used)
ctx.program->config->lds_size = align(ctx.program->config->lds_size, 4) + lds_bytes_used;
unsigned scratch_slots_used = ctx.vgpr_spill_slots - lds_slots_used;
ctx.program->config->scratch_bytes_per_wave += scratch_slots_used * 4 * ctx.program->wave_size;
}
} /* end namespace */
void
spill(Program* program)
{
program->config->spilled_vgprs = 0;
program->config->spilled_sgprs = 0;
program->progress = CompilationProgress::after_spilling;
const RegisterDemand limit = get_addr_regs_from_waves(program, program->min_waves);
if (program->is_callee) {
RegisterDemand callee_limit = program->callee_abi.numClobbered(limit);
auto return_it = std::find_if(
program->blocks.back().instructions.rbegin(), program->blocks.back().instructions.rend(),
[](const auto& instruction) { return instruction->opcode == aco_opcode::p_return; });
if (limit.sgpr > callee_limit.sgpr && return_it != program->blocks.back().instructions.rend()) {
aco_ptr<Instruction>& old_startpgm = program->blocks.front().instructions.front();
Instruction* new_startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0,
old_startpgm->definitions.size() + 1);
for (unsigned i = 0; i < old_startpgm->definitions.size(); ++i)
new_startpgm->definitions[i] = old_startpgm->definitions[i];
unsigned abi_sgpr_spills = limit.sgpr - callee_limit.sgpr;
Temp abi_sgpr_spill_space =
program->allocateTmp(lv1.resize(DIV_ROUND_UP(abi_sgpr_spills, program->wave_size) * 4));
new_startpgm->definitions.back() = Definition(abi_sgpr_spill_space);
old_startpgm = aco_ptr<Instruction>(new_startpgm);
for (auto& block : program->blocks) {
auto reload = std::find_if(block.instructions.rbegin(), block.instructions.rend(),
[](const auto& instr)
{
return instr->opcode == aco_opcode::p_return ||
instr->opcode == aco_opcode::p_reload_preserved ||
instr->opcode == aco_opcode::p_logical_end;
});
if (reload == block.instructions.rend())
continue;
/* p_reload_preserved is always inserted just before p_logical_end - if we hit
* p_logical_end, check the previous instruction
*/
if ((*reload)->opcode == aco_opcode::p_logical_end)
++reload;
if ((*reload)->opcode != aco_opcode::p_return &&
(*reload)->opcode != aco_opcode::p_reload_preserved)
continue;
(*reload)->operands[0] = Operand(abi_sgpr_spill_space);
}
/* TODO: update live var analysis in-place */
live_var_analysis(program);
}
}
/* no spilling when register pressure is low enough */
if (program->num_waves > 0)
return;
/* lower to CSSA before spilling to ensure correctness w.r.t. phis */
lower_to_cssa(program);
/* calculate target register demand */
const RegisterDemand demand = program->max_reg_demand; /* current max */
uint16_t extra_vgprs = 0;
uint16_t extra_sgprs = 0;
uint32_t max_lds_spill_slots = 0;
/* Only use LDS for VGPR spilling if we can use addtid access, to avoid having a VGPR addr.
* Limit to single wave workgroups, to avoid needing the wave_id for the offset.
* If we have a scratch stack pointer, don't use LDS at all.
*/
if ((program->stage == compute_cs || program->stage == raytracing_cs) &&
program->workgroup_size <= program->wave_size && !program->stack_ptr.id() &&
program->gfx_level >= GFX9) {
int used_lds = program->config->lds_size;
unsigned allocated_vgprs = ALIGN_NPOT(limit.vgpr, program->dev.vgpr_alloc_granule);
unsigned num_waves = program->dev.physical_vgprs / allocated_vgprs;
num_waves = MIN2(num_waves, program->dev.max_waves_per_simd);
assert(num_waves >= 1);
int max_lds = ROUND_DOWN_TO(program->dev.lds_limit / (num_waves * program->dev.simd_per_cu),
ac_shader_get_lds_alloc_granularity(program->gfx_level));
/* Limit LDS spilling to not reduce occupancy further.
* Note that in theory, this can still limit occupancy of other shaders running
* on the CU at the same time, but that's unlikely and impossible to know at this point.
*/
max_lds_spill_slots = MAX2(max_lds - used_lds, 0) / (program->workgroup_size * 4);
}
/* calculate extra VGPRs required for spilling SGPRs */
unsigned sgpr_spills = demand.sgpr - std::min((uint16_t)demand.sgpr, (uint16_t)limit.sgpr);
sgpr_spills += program->max_call_spills.sgpr;
if (sgpr_spills)
extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1;
/* add extra SGPRs required for spilling VGPRs */
if (demand.vgpr + extra_vgprs > limit.vgpr || program->max_call_spills.vgpr) {
if (program->gfx_level >= GFX9)
extra_sgprs =
program->stack_ptr.id() ? 2 : 1; /* SADDR + scc for stack pointer additions */
else
extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */
if (max_lds_spill_slots)
extra_sgprs += 1; /* addtid m0 */
if (demand.sgpr + extra_sgprs > limit.sgpr || program->max_call_spills.sgpr) {
/* re-calculate in case something has changed */
sgpr_spills = program->max_call_spills.sgpr;
if (demand.sgpr + extra_sgprs > limit.sgpr)
sgpr_spills += demand.sgpr + extra_sgprs - limit.sgpr;
extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1;
}
}
/* the spiller has to target the following register demand */
const RegisterDemand target(limit.vgpr - extra_vgprs, limit.sgpr - extra_sgprs);
/* initialize ctx */
spill_ctx ctx(target, program, extra_vgprs, max_lds_spill_slots);
gather_ssa_use_info(ctx);
get_rematerialize_info(ctx);
/* create spills and reloads */
for (unsigned i = 0; i < program->blocks.size(); i++)
spill_block(ctx, i);
/* assign spill slots and DCE rematerialized code */
assign_spill_slots(ctx, extra_vgprs);
/* update live variable information */
live_var_analysis(program);
assert(program->num_waves > 0);
}
} // namespace aco