2019-09-17 13:22:17 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2018 Valve Corporation
|
|
|
|
|
* Copyright © 2018 Google
|
|
|
|
|
*
|
2024-04-08 09:02:30 +02:00
|
|
|
* SPDX-License-Identifier: MIT
|
2019-09-17 13:22:17 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "aco_ir.h"
|
2021-06-09 10:14:54 +02:00
|
|
|
|
2025-10-03 11:56:38 +02:00
|
|
|
#include "ac_shader_util.h"
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
namespace aco {
|
2024-07-01 15:55:09 +02:00
|
|
|
|
2020-02-21 20:14:03 +00:00
|
|
|
RegisterDemand
|
2024-07-05 11:54:16 +02:00
|
|
|
get_live_changes(Instruction* instr)
|
2020-02-21 20:14:03 +00:00
|
|
|
{
|
|
|
|
|
RegisterDemand changes;
|
|
|
|
|
for (const Definition& def : instr->definitions) {
|
|
|
|
|
if (!def.isTemp() || def.isKill())
|
|
|
|
|
continue;
|
|
|
|
|
changes += def.getTemp();
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-02-21 20:14:03 +00:00
|
|
|
for (const Operand& op : instr->operands) {
|
|
|
|
|
if (!op.isTemp() || !op.isFirstKill())
|
|
|
|
|
continue;
|
|
|
|
|
changes -= op.getTemp();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return changes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
RegisterDemand
|
2024-07-05 11:54:16 +02:00
|
|
|
get_temp_registers(Instruction* instr)
|
2020-02-21 20:14:03 +00:00
|
|
|
{
|
aco: calculate register demand per instruction as maximum necessary to execute the instruction
Previously, the register demand per instruction was calculated as the number of
live variables in the register file after executing an instruction plus additional
temporary registers, necessary during the execution of the instruction.
With this change, now it also includes all variables which are live right before
executing an instruction, i.e. killed Operands.
Care has been taken so that the invariant
register_demand[idx] = register_demand[idx - 1] - get_temp_registers(prev_instr)
+ get_live_changes(instr) + get_temp_registers(instr)
still holds.
Slight changes in scheduling:
Totals from 316 (0.40% of 79395) affected shaders: (GFX11)
Instrs: 301329 -> 300777 (-0.18%); split: -0.31%, +0.12%
CodeSize: 1577976 -> 1576204 (-0.11%); split: -0.21%, +0.10%
SpillSGPRs: 448 -> 447 (-0.22%)
Latency: 1736349 -> 1726182 (-0.59%); split: -2.01%, +1.42%
InvThroughput: 243894 -> 243883 (-0.00%); split: -0.03%, +0.03%
VClause: 6134 -> 6280 (+2.38%); split: -1.04%, +3.42%
SClause: 6142 -> 6137 (-0.08%); split: -0.13%, +0.05%
Copies: 14037 -> 14032 (-0.04%); split: -0.56%, +0.52%
Branches: 3284 -> 3283 (-0.03%)
VALU: 182750 -> 182718 (-0.02%); split: -0.04%, +0.03%
SALU: 18522 -> 18538 (+0.09%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29804>
2024-06-20 10:03:11 +02:00
|
|
|
RegisterDemand demand_before;
|
|
|
|
|
RegisterDemand demand_after;
|
2020-02-21 15:46:39 +00:00
|
|
|
|
2020-02-21 20:14:03 +00:00
|
|
|
for (Definition def : instr->definitions) {
|
|
|
|
|
if (def.isKill())
|
aco: calculate register demand per instruction as maximum necessary to execute the instruction
Previously, the register demand per instruction was calculated as the number of
live variables in the register file after executing an instruction plus additional
temporary registers, necessary during the execution of the instruction.
With this change, now it also includes all variables which are live right before
executing an instruction, i.e. killed Operands.
Care has been taken so that the invariant
register_demand[idx] = register_demand[idx - 1] - get_temp_registers(prev_instr)
+ get_live_changes(instr) + get_temp_registers(instr)
still holds.
Slight changes in scheduling:
Totals from 316 (0.40% of 79395) affected shaders: (GFX11)
Instrs: 301329 -> 300777 (-0.18%); split: -0.31%, +0.12%
CodeSize: 1577976 -> 1576204 (-0.11%); split: -0.21%, +0.10%
SpillSGPRs: 448 -> 447 (-0.22%)
Latency: 1736349 -> 1726182 (-0.59%); split: -2.01%, +1.42%
InvThroughput: 243894 -> 243883 (-0.00%); split: -0.03%, +0.03%
VClause: 6134 -> 6280 (+2.38%); split: -1.04%, +3.42%
SClause: 6142 -> 6137 (-0.08%); split: -0.13%, +0.05%
Copies: 14037 -> 14032 (-0.04%); split: -0.56%, +0.52%
Branches: 3284 -> 3283 (-0.03%)
VALU: 182750 -> 182718 (-0.02%); split: -0.04%, +0.03%
SALU: 18522 -> 18538 (+0.09%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29804>
2024-06-20 10:03:11 +02:00
|
|
|
demand_after += def.getTemp();
|
|
|
|
|
else if (def.isTemp())
|
|
|
|
|
demand_before -= def.getTemp();
|
2020-02-21 20:14:03 +00:00
|
|
|
}
|
2020-02-21 15:46:39 +00:00
|
|
|
|
|
|
|
|
for (Operand op : instr->operands) {
|
2024-07-04 11:03:43 +02:00
|
|
|
if (op.isFirstKill() || op.isCopyKill()) {
|
aco: calculate register demand per instruction as maximum necessary to execute the instruction
Previously, the register demand per instruction was calculated as the number of
live variables in the register file after executing an instruction plus additional
temporary registers, necessary during the execution of the instruction.
With this change, now it also includes all variables which are live right before
executing an instruction, i.e. killed Operands.
Care has been taken so that the invariant
register_demand[idx] = register_demand[idx - 1] - get_temp_registers(prev_instr)
+ get_live_changes(instr) + get_temp_registers(instr)
still holds.
Slight changes in scheduling:
Totals from 316 (0.40% of 79395) affected shaders: (GFX11)
Instrs: 301329 -> 300777 (-0.18%); split: -0.31%, +0.12%
CodeSize: 1577976 -> 1576204 (-0.11%); split: -0.21%, +0.10%
SpillSGPRs: 448 -> 447 (-0.22%)
Latency: 1736349 -> 1726182 (-0.59%); split: -2.01%, +1.42%
InvThroughput: 243894 -> 243883 (-0.00%); split: -0.03%, +0.03%
VClause: 6134 -> 6280 (+2.38%); split: -1.04%, +3.42%
SClause: 6142 -> 6137 (-0.08%); split: -0.13%, +0.05%
Copies: 14037 -> 14032 (-0.04%); split: -0.56%, +0.52%
Branches: 3284 -> 3283 (-0.03%)
VALU: 182750 -> 182718 (-0.02%); split: -0.04%, +0.03%
SALU: 18522 -> 18538 (+0.09%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29804>
2024-06-20 10:03:11 +02:00
|
|
|
demand_before += op.getTemp();
|
|
|
|
|
if (op.isLateKill())
|
|
|
|
|
demand_after += op.getTemp();
|
2024-07-04 10:00:27 +02:00
|
|
|
} else if (op.isClobbered() && !op.isKill()) {
|
|
|
|
|
demand_before += op.getTemp();
|
aco: calculate register demand per instruction as maximum necessary to execute the instruction
Previously, the register demand per instruction was calculated as the number of
live variables in the register file after executing an instruction plus additional
temporary registers, necessary during the execution of the instruction.
With this change, now it also includes all variables which are live right before
executing an instruction, i.e. killed Operands.
Care has been taken so that the invariant
register_demand[idx] = register_demand[idx - 1] - get_temp_registers(prev_instr)
+ get_live_changes(instr) + get_temp_registers(instr)
still holds.
Slight changes in scheduling:
Totals from 316 (0.40% of 79395) affected shaders: (GFX11)
Instrs: 301329 -> 300777 (-0.18%); split: -0.31%, +0.12%
CodeSize: 1577976 -> 1576204 (-0.11%); split: -0.21%, +0.10%
SpillSGPRs: 448 -> 447 (-0.22%)
Latency: 1736349 -> 1726182 (-0.59%); split: -2.01%, +1.42%
InvThroughput: 243894 -> 243883 (-0.00%); split: -0.03%, +0.03%
VClause: 6134 -> 6280 (+2.38%); split: -1.04%, +3.42%
SClause: 6142 -> 6137 (-0.08%); split: -0.13%, +0.05%
Copies: 14037 -> 14032 (-0.04%); split: -0.56%, +0.52%
Branches: 3284 -> 3283 (-0.03%)
VALU: 182750 -> 182718 (-0.02%); split: -0.04%, +0.03%
SALU: 18522 -> 18538 (+0.09%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29804>
2024-06-20 10:03:11 +02:00
|
|
|
}
|
2020-02-21 15:46:39 +00:00
|
|
|
}
|
|
|
|
|
|
aco: calculate register demand per instruction as maximum necessary to execute the instruction
Previously, the register demand per instruction was calculated as the number of
live variables in the register file after executing an instruction plus additional
temporary registers, necessary during the execution of the instruction.
With this change, now it also includes all variables which are live right before
executing an instruction, i.e. killed Operands.
Care has been taken so that the invariant
register_demand[idx] = register_demand[idx - 1] - get_temp_registers(prev_instr)
+ get_live_changes(instr) + get_temp_registers(instr)
still holds.
Slight changes in scheduling:
Totals from 316 (0.40% of 79395) affected shaders: (GFX11)
Instrs: 301329 -> 300777 (-0.18%); split: -0.31%, +0.12%
CodeSize: 1577976 -> 1576204 (-0.11%); split: -0.21%, +0.10%
SpillSGPRs: 448 -> 447 (-0.22%)
Latency: 1736349 -> 1726182 (-0.59%); split: -2.01%, +1.42%
InvThroughput: 243894 -> 243883 (-0.00%); split: -0.03%, +0.03%
VClause: 6134 -> 6280 (+2.38%); split: -1.04%, +3.42%
SClause: 6142 -> 6137 (-0.08%); split: -0.13%, +0.05%
Copies: 14037 -> 14032 (-0.04%); split: -0.56%, +0.52%
Branches: 3284 -> 3283 (-0.03%)
VALU: 182750 -> 182718 (-0.02%); split: -0.04%, +0.03%
SALU: 18522 -> 18538 (+0.09%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29804>
2024-06-20 10:03:11 +02:00
|
|
|
demand_after.update(demand_before);
|
|
|
|
|
return demand_after;
|
2020-02-21 20:14:03 +00:00
|
|
|
}
|
|
|
|
|
|
2024-06-24 16:48:43 +02:00
|
|
|
RegisterDemand get_temp_reg_changes(Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
RegisterDemand available_def_space;
|
|
|
|
|
|
|
|
|
|
for (Definition def : instr->definitions) {
|
|
|
|
|
if (def.isTemp())
|
|
|
|
|
available_def_space += def.getTemp();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (Operand op : instr->operands) {
|
|
|
|
|
if (op.isFirstKillBeforeDef() || op.isCopyKill())
|
|
|
|
|
available_def_space -= op.getTemp();
|
2025-04-24 14:01:39 +01:00
|
|
|
else if (op.isClobbered() && !op.isKill())
|
|
|
|
|
available_def_space -= op.getTemp();
|
2024-06-24 16:48:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return available_def_space;
|
|
|
|
|
}
|
|
|
|
|
|
2020-02-21 20:14:03 +00:00
|
|
|
namespace {
|
2021-07-26 17:55:48 +01:00
|
|
|
|
2023-11-10 11:35:43 +01:00
|
|
|
struct live_ctx {
|
2024-07-02 16:20:44 +02:00
|
|
|
monotonic_buffer_resource m;
|
2023-11-10 11:35:43 +01:00
|
|
|
Program* program;
|
2024-07-09 15:45:01 +02:00
|
|
|
int32_t worklist;
|
|
|
|
|
uint32_t handled_once;
|
2023-11-10 11:35:43 +01:00
|
|
|
};
|
|
|
|
|
|
2022-03-16 10:56:26 +01:00
|
|
|
bool
|
|
|
|
|
instr_needs_vcc(Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (instr->isVOPC())
|
|
|
|
|
return true;
|
|
|
|
|
if (instr->isVOP2() && !instr->isVOP3()) {
|
|
|
|
|
if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
|
|
|
|
|
instr->operands[2].regClass().type() == RegType::sgpr)
|
|
|
|
|
return true;
|
|
|
|
|
if (instr->definitions.size() == 2)
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-09 15:45:01 +02:00
|
|
|
IDSet
|
|
|
|
|
compute_live_out(live_ctx& ctx, Block* block)
|
|
|
|
|
{
|
|
|
|
|
IDSet live(ctx.m);
|
|
|
|
|
|
|
|
|
|
if (block->logical_succs.empty()) {
|
|
|
|
|
/* Linear blocks:
|
|
|
|
|
* Directly insert the successor if it is a linear block as well.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned succ : block->linear_succs) {
|
|
|
|
|
if (ctx.program->blocks[succ].logical_preds.empty()) {
|
|
|
|
|
live.insert(ctx.program->live.live_in[succ]);
|
|
|
|
|
} else {
|
|
|
|
|
for (unsigned t : ctx.program->live.live_in[succ]) {
|
|
|
|
|
if (ctx.program->temp_rc[t].is_linear())
|
|
|
|
|
live.insert(t);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
/* Logical blocks:
|
|
|
|
|
* Linear successors are either linear blocks or logical targets.
|
|
|
|
|
*/
|
|
|
|
|
live = IDSet(ctx.program->live.live_in[block->linear_succs[0]], ctx.m);
|
|
|
|
|
if (block->linear_succs.size() == 2)
|
|
|
|
|
live.insert(ctx.program->live.live_in[block->linear_succs[1]]);
|
|
|
|
|
|
|
|
|
|
/* At most one logical target needs a separate insertion. */
|
|
|
|
|
if (block->logical_succs.back() != block->linear_succs.back()) {
|
|
|
|
|
for (unsigned t : ctx.program->live.live_in[block->logical_succs.back()]) {
|
|
|
|
|
if (!ctx.program->temp_rc[t].is_linear())
|
|
|
|
|
live.insert(t);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
assert(block->logical_succs[0] == block->linear_succs[0]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Handle phi operands */
|
|
|
|
|
if (block->linear_succs.size() == 1 && block->linear_succs[0] >= ctx.handled_once) {
|
|
|
|
|
Block& succ = ctx.program->blocks[block->linear_succs[0]];
|
|
|
|
|
auto it = std::find(succ.linear_preds.begin(), succ.linear_preds.end(), block->index);
|
|
|
|
|
unsigned op_idx = std::distance(succ.linear_preds.begin(), it);
|
|
|
|
|
for (aco_ptr<Instruction>& phi : succ.instructions) {
|
|
|
|
|
if (!is_phi(phi))
|
|
|
|
|
break;
|
|
|
|
|
if (phi->opcode == aco_opcode::p_phi || phi->definitions[0].isKill())
|
|
|
|
|
continue;
|
|
|
|
|
if (phi->operands[op_idx].isTemp())
|
|
|
|
|
live.insert(phi->operands[op_idx].tempId());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (block->logical_succs.size() == 1 && block->logical_succs[0] >= ctx.handled_once) {
|
|
|
|
|
Block& succ = ctx.program->blocks[block->logical_succs[0]];
|
|
|
|
|
auto it = std::find(succ.logical_preds.begin(), succ.logical_preds.end(), block->index);
|
|
|
|
|
unsigned op_idx = std::distance(succ.logical_preds.begin(), it);
|
|
|
|
|
for (aco_ptr<Instruction>& phi : succ.instructions) {
|
|
|
|
|
if (!is_phi(phi))
|
|
|
|
|
break;
|
|
|
|
|
if (phi->opcode == aco_opcode::p_linear_phi || phi->definitions[0].isKill())
|
|
|
|
|
continue;
|
|
|
|
|
if (phi->operands[op_idx].isTemp())
|
|
|
|
|
live.insert(phi->operands[op_idx].tempId());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return live;
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-18 19:26:09 +01:00
|
|
|
template <typename T>
|
|
|
|
|
RegisterDemand
|
|
|
|
|
get_demand_for_reg(live_ctx& ctx, T op_or_def)
|
|
|
|
|
{
|
|
|
|
|
if (!op_or_def.isPrecolored())
|
|
|
|
|
return RegisterDemand();
|
|
|
|
|
|
|
|
|
|
PhysReg reg = op_or_def.physReg();
|
|
|
|
|
RegType type = op_or_def.regClass().type();
|
|
|
|
|
|
|
|
|
|
if (type == RegType::sgpr && reg >= ctx.program->dev.sgpr_limit)
|
|
|
|
|
return RegisterDemand();
|
|
|
|
|
|
|
|
|
|
PhysReg max_reg = reg.advance(op_or_def.regClass().bytes());
|
|
|
|
|
|
|
|
|
|
if (type == RegType::sgpr)
|
|
|
|
|
return RegisterDemand(0, max_reg);
|
|
|
|
|
else
|
|
|
|
|
return RegisterDemand(max_reg - 256, 0);
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
void
|
2023-11-10 11:35:43 +01:00
|
|
|
process_live_temps_per_block(live_ctx& ctx, Block* block)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
RegisterDemand new_demand;
|
2025-10-11 22:39:41 +02:00
|
|
|
unsigned num_linear_vgprs = 0;
|
2024-06-26 12:07:12 +02:00
|
|
|
block->register_demand = RegisterDemand();
|
2025-11-12 21:29:56 +01:00
|
|
|
block->call_spills = RegisterDemand();
|
2024-07-09 15:45:01 +02:00
|
|
|
IDSet live = compute_live_out(ctx, block);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-03-10 13:39:42 +01:00
|
|
|
/* initialize register demand */
|
2025-10-11 22:39:41 +02:00
|
|
|
for (unsigned t : live) {
|
2023-11-10 11:35:43 +01:00
|
|
|
new_demand += Temp(t, ctx.program->temp_rc[t]);
|
2025-10-11 22:39:41 +02:00
|
|
|
if (ctx.program->temp_rc[t].is_linear_vgpr())
|
|
|
|
|
num_linear_vgprs += ctx.program->temp_rc[t].size();
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* traverse the instructions backwards */
|
2019-10-29 11:56:09 +01:00
|
|
|
int idx;
|
|
|
|
|
for (idx = block->instructions.size() - 1; idx >= 0; idx--) {
|
|
|
|
|
Instruction* insn = block->instructions[idx].get();
|
|
|
|
|
if (is_phi(insn))
|
|
|
|
|
break;
|
|
|
|
|
|
2025-11-18 19:26:09 +01:00
|
|
|
/* Precolored operands may be fixed to a register higher than the current demand.
|
|
|
|
|
* Record the demand of precolored registers here.
|
|
|
|
|
*/
|
|
|
|
|
if (insn->hasPrecoloredGPRs()) {
|
|
|
|
|
RegisterDemand precolored_demand = RegisterDemand();
|
|
|
|
|
for (Operand op : insn->operands)
|
|
|
|
|
precolored_demand.update(get_demand_for_reg(ctx, op));
|
|
|
|
|
for (Definition def : insn->definitions)
|
|
|
|
|
precolored_demand.update(get_demand_for_reg(ctx, def));
|
|
|
|
|
ctx.program->fixed_reg_demand.update(precolored_demand);
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-10 11:35:43 +01:00
|
|
|
ctx.program->needs_vcc |= instr_needs_vcc(insn);
|
2025-02-17 18:42:49 +01:00
|
|
|
RegisterDemand demand_after_instr = RegisterDemand(new_demand.vgpr, new_demand.sgpr);
|
|
|
|
|
insn->register_demand = demand_after_instr;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-07-25 17:15:15 +02:00
|
|
|
bool has_vgpr_def = false;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* KILL */
|
|
|
|
|
for (Definition& definition : insn->definitions) {
|
2024-07-25 17:15:15 +02:00
|
|
|
has_vgpr_def |= definition.regClass().type() == RegType::vgpr &&
|
|
|
|
|
!definition.regClass().is_linear_vgpr();
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!definition.isTemp()) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2024-10-22 19:03:13 +02:00
|
|
|
if (definition.isFixed() && definition.physReg() == vcc)
|
2023-11-10 11:35:43 +01:00
|
|
|
ctx.program->needs_vcc = true;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
const Temp temp = definition.getTemp();
|
2020-09-14 16:45:55 +01:00
|
|
|
const size_t n = live.erase(temp.id());
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (n) {
|
|
|
|
|
new_demand -= temp;
|
2025-10-11 22:39:41 +02:00
|
|
|
if (temp.regClass().is_linear_vgpr())
|
|
|
|
|
num_linear_vgprs -= temp.size();
|
2019-09-17 13:22:17 +02:00
|
|
|
definition.setKill(false);
|
|
|
|
|
} else {
|
2024-06-21 15:45:22 +02:00
|
|
|
insn->register_demand += temp;
|
2019-09-17 13:22:17 +02:00
|
|
|
definition.setKill(true);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-16 13:19:25 +02:00
|
|
|
/* we need to do this in a separate loop because the next one can
|
|
|
|
|
* setKill() for several operands at once and we don't want to
|
|
|
|
|
* overwrite that in a later iteration */
|
2025-04-02 10:59:32 +02:00
|
|
|
bool is_vector_op = false;
|
2025-05-16 13:19:25 +02:00
|
|
|
for (Operand& op : insn->operands) {
|
|
|
|
|
op.setKill(false);
|
|
|
|
|
/* Linear vgprs must be late kill: this is to ensure linear VGPR operands and
|
|
|
|
|
* normal VGPR definitions don't try to use the same register, which is problematic
|
|
|
|
|
* because of assignment restrictions.
|
|
|
|
|
*/
|
2025-04-02 10:59:32 +02:00
|
|
|
bool lateKill =
|
|
|
|
|
op.hasRegClass() && op.regClass().is_linear_vgpr() && !op.isUndefined() && has_vgpr_def;
|
|
|
|
|
|
|
|
|
|
/* If this Operand is part of a vector which is only partially killed by the instruction,
|
|
|
|
|
* a definition might not fit into the gaps that get created. Mitigate by using lateKill.
|
|
|
|
|
*/
|
|
|
|
|
// TODO: is it beneficial to skip that if the vector is fully killed?
|
|
|
|
|
lateKill |= is_vector_op || op.isVectorAligned();
|
|
|
|
|
op.setLateKill(lateKill);
|
|
|
|
|
is_vector_op = op.isVectorAligned();
|
2025-05-16 13:19:25 +02:00
|
|
|
}
|
|
|
|
|
|
2024-07-25 17:01:37 +02:00
|
|
|
if (ctx.program->gfx_level >= GFX10 && insn->isVALU() &&
|
|
|
|
|
insn->definitions.back().regClass() == s2) {
|
|
|
|
|
/* RDNA2 ISA doc, 6.2.4. Wave64 Destination Restrictions:
|
|
|
|
|
* The first pass of a wave64 VALU instruction may not overwrite a scalar value used by
|
|
|
|
|
* the second half.
|
|
|
|
|
*/
|
|
|
|
|
bool carry_in = insn->opcode == aco_opcode::v_addc_co_u32 ||
|
|
|
|
|
insn->opcode == aco_opcode::v_subb_co_u32 ||
|
|
|
|
|
insn->opcode == aco_opcode::v_subbrev_co_u32;
|
|
|
|
|
for (unsigned op_idx = 0; op_idx < (carry_in ? 2 : insn->operands.size()); op_idx++) {
|
|
|
|
|
if (insn->operands[op_idx].isOfType(RegType::sgpr))
|
|
|
|
|
insn->operands[op_idx].setLateKill(true);
|
|
|
|
|
}
|
2024-07-25 17:15:15 +02:00
|
|
|
} else if (insn->opcode == aco_opcode::p_bpermute_readlane ||
|
|
|
|
|
insn->opcode == aco_opcode::p_bpermute_permlane ||
|
|
|
|
|
insn->opcode == aco_opcode::p_bpermute_shared_vgpr ||
|
|
|
|
|
insn->opcode == aco_opcode::p_dual_src_export_gfx11 ||
|
|
|
|
|
insn->opcode == aco_opcode::v_mqsad_u32_u8) {
|
|
|
|
|
for (Operand& op : insn->operands)
|
|
|
|
|
op.setLateKill(true);
|
2024-09-11 10:28:51 +02:00
|
|
|
} else if (insn->opcode == aco_opcode::p_interp_gfx11 && insn->operands.size() == 7) {
|
|
|
|
|
insn->operands[5].setLateKill(true); /* we re-use the destination reg in the middle */
|
2024-07-25 17:15:15 +02:00
|
|
|
} else if (insn->opcode == aco_opcode::v_interp_p1_f32 && ctx.program->dev.has_16bank_lds) {
|
|
|
|
|
insn->operands[0].setLateKill(true);
|
2025-02-17 18:42:49 +01:00
|
|
|
} else if (insn->opcode == aco_opcode::p_init_scratch ||
|
|
|
|
|
insn->opcode == aco_opcode::p_reload_preserved) {
|
2024-07-25 17:15:15 +02:00
|
|
|
insn->operands.back().setLateKill(true);
|
|
|
|
|
} else if (instr_info.classes[(int)insn->opcode] == instr_class::wmma) {
|
|
|
|
|
insn->operands[0].setLateKill(true);
|
|
|
|
|
insn->operands[1].setLateKill(true);
|
2024-07-25 17:01:37 +02:00
|
|
|
}
|
|
|
|
|
|
2025-05-02 11:25:46 +01:00
|
|
|
/* Check if a definition clobbers some operand */
|
2025-05-16 13:19:25 +02:00
|
|
|
RegisterDemand operand_demand;
|
2025-05-02 11:25:46 +01:00
|
|
|
auto tied_defs = get_tied_defs(insn);
|
|
|
|
|
for (auto op_idx : tied_defs) {
|
|
|
|
|
Temp tmp = insn->operands[op_idx].getTemp();
|
|
|
|
|
if (std::any_of(tied_defs.begin(), tied_defs.end(), [&](uint32_t i)
|
|
|
|
|
{ return i < op_idx && insn->operands[i].getTemp() == tmp; })) {
|
|
|
|
|
operand_demand += tmp;
|
|
|
|
|
insn->operands[op_idx].setCopyKill(true);
|
|
|
|
|
}
|
|
|
|
|
insn->operands[op_idx].setClobbered(true);
|
2025-04-15 16:34:46 +02:00
|
|
|
|
|
|
|
|
/* We use lateKill as a mitigation for RA issues when allocating definitions with
|
|
|
|
|
* partially-killed vectors. In case of a vector-aligned operand tied to a definition,
|
|
|
|
|
* this is irrelevant because the tied definition and the vector occupy the same
|
|
|
|
|
* register space, and all other definitions are allocated elsewhere.
|
|
|
|
|
* lateKill operands can't be tied to a definition because their live ranges would
|
|
|
|
|
* intersect, so remove the lateKill flag again.
|
|
|
|
|
*/
|
|
|
|
|
if (insn->operands[op_idx].isVectorAligned())
|
|
|
|
|
insn->operands[op_idx].setLateKill(false);
|
|
|
|
|
while (insn->operands[op_idx].isVectorAligned()) {
|
|
|
|
|
++op_idx;
|
|
|
|
|
insn->operands[op_idx].setClobbered(true);
|
|
|
|
|
insn->operands[op_idx].setLateKill(false);
|
|
|
|
|
}
|
2025-05-02 11:25:46 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* GEN */
|
2024-06-26 11:37:00 +02:00
|
|
|
for (unsigned i = 0; i < insn->operands.size(); ++i) {
|
|
|
|
|
Operand& operand = insn->operands[i];
|
|
|
|
|
if (!operand.isTemp())
|
|
|
|
|
continue;
|
2024-07-04 11:03:43 +02:00
|
|
|
|
|
|
|
|
const Temp temp = operand.getTemp();
|
2024-09-11 15:16:46 +02:00
|
|
|
if (operand.isPrecolored()) {
|
2024-07-04 11:03:43 +02:00
|
|
|
assert(!operand.isLateKill());
|
2024-07-04 09:42:16 +02:00
|
|
|
ctx.program->needs_vcc |= operand.physReg() == vcc;
|
2024-07-04 11:03:43 +02:00
|
|
|
|
2024-07-04 09:42:16 +02:00
|
|
|
/* Check if this operand gets overwritten by a precolored definition. */
|
|
|
|
|
if (std::any_of(insn->definitions.begin(), insn->definitions.end(),
|
|
|
|
|
[=](Definition def)
|
|
|
|
|
{
|
|
|
|
|
return def.isFixed() &&
|
|
|
|
|
def.physReg() + def.size() > operand.physReg() &&
|
|
|
|
|
operand.physReg() + operand.size() > def.physReg();
|
|
|
|
|
}))
|
|
|
|
|
operand.setClobbered(true);
|
2024-07-04 11:03:43 +02:00
|
|
|
|
2024-09-11 15:16:46 +02:00
|
|
|
/* Check if another precolored operand uses the same temporary.
|
2024-07-04 11:03:43 +02:00
|
|
|
* This assumes that operands of one instruction are not precolored twice to
|
|
|
|
|
* the same register. In this case, register pressure might be overestimated.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned j = i + 1; !operand.isCopyKill() && j < insn->operands.size(); ++j) {
|
2024-09-11 15:16:46 +02:00
|
|
|
if (insn->operands[j].isPrecolored() && insn->operands[j].getTemp() == temp) {
|
2024-07-04 11:03:43 +02:00
|
|
|
operand_demand += temp;
|
|
|
|
|
insn->operands[j].setCopyKill(true);
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-07-04 09:42:16 +02:00
|
|
|
}
|
2025-04-02 10:59:32 +02:00
|
|
|
/* If this operand is part of a vector, check if the temporary needs to be duplicated. */
|
|
|
|
|
if (is_vector_op || operand.isVectorAligned()) {
|
|
|
|
|
/* Set copyKill if any other vector-operand uses the same temporary. If a scalar operand
|
|
|
|
|
* uses the same temporary, assume that it can share the register. This ignores other
|
|
|
|
|
* register constraints like tied definitions or precolored registers.
|
|
|
|
|
*/
|
|
|
|
|
bool other_is_vector_op = false;
|
|
|
|
|
for (unsigned j = 0; j < i; j++) {
|
|
|
|
|
if ((other_is_vector_op || insn->operands[j].isVectorAligned()) &&
|
|
|
|
|
insn->operands[j].getTemp() == temp) {
|
|
|
|
|
operand_demand += temp;
|
|
|
|
|
insn->register_demand += temp; /* Because of lateKill */
|
|
|
|
|
operand.setCopyKill(true);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
other_is_vector_op = insn->operands[j].isVectorAligned();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
is_vector_op = operand.isVectorAligned();
|
2024-08-22 11:09:46 +02:00
|
|
|
|
2025-05-16 13:19:25 +02:00
|
|
|
if (operand.isLateKill()) {
|
|
|
|
|
/* Make sure that same temporaries have same lateKill flags. */
|
|
|
|
|
for (Operand& other : insn->operands) {
|
|
|
|
|
if (other.isTemp() && other.getTemp() == operand.getTemp())
|
|
|
|
|
other.setLateKill(true);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-08-22 11:09:46 +02:00
|
|
|
if (operand.isKill())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (live.insert(temp.id()).second) {
|
2024-06-26 11:37:00 +02:00
|
|
|
operand.setFirstKill(true);
|
|
|
|
|
for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
|
2024-08-22 11:09:46 +02:00
|
|
|
if (insn->operands[j].isTemp() && insn->operands[j].getTemp() == temp)
|
2024-06-26 11:37:00 +02:00
|
|
|
insn->operands[j].setKill(true);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2024-06-26 11:37:00 +02:00
|
|
|
if (operand.isLateKill())
|
|
|
|
|
insn->register_demand += temp;
|
|
|
|
|
new_demand += temp;
|
2025-10-11 22:39:41 +02:00
|
|
|
if (temp.regClass().is_linear_vgpr())
|
|
|
|
|
num_linear_vgprs += temp.size();
|
2024-07-04 10:00:27 +02:00
|
|
|
} else if (operand.isClobbered()) {
|
|
|
|
|
operand_demand += temp;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-17 18:42:49 +01:00
|
|
|
if (insn->isCall()) {
|
|
|
|
|
/* For call instructions, definitions are live at the time s_setpc finishes,
|
|
|
|
|
* which continues execution in the callee. This means that all definitions are
|
|
|
|
|
* live concurrently with operands.
|
|
|
|
|
*/
|
|
|
|
|
operand_demand += insn->definitions[0].getTemp();
|
|
|
|
|
if (insn->definitions[1].physReg() == vcc)
|
|
|
|
|
operand_demand += insn->definitions[1].getTemp();
|
|
|
|
|
|
|
|
|
|
RegisterDemand limit = get_addr_regs_from_waves(ctx.program, ctx.program->min_waves);
|
|
|
|
|
insn->call().callee_preserved_limit = RegisterDemand();
|
|
|
|
|
|
|
|
|
|
BITSET_DECLARE(preserved_regs, 512);
|
|
|
|
|
insn->call().abi.preservedRegisters(preserved_regs, limit);
|
|
|
|
|
|
|
|
|
|
RegisterDemand preserved_reg_demand;
|
|
|
|
|
preserved_reg_demand.sgpr =
|
|
|
|
|
__bitset_prefix_sum(preserved_regs, limit.sgpr, 256 / BITSET_WORDBITS);
|
|
|
|
|
preserved_reg_demand.vgpr = __bitset_prefix_sum(preserved_regs + 256 / BITSET_WORDBITS,
|
|
|
|
|
limit.vgpr, 256 / BITSET_WORDBITS);
|
|
|
|
|
insn->call().callee_preserved_limit += preserved_reg_demand;
|
|
|
|
|
|
2025-10-11 22:39:41 +02:00
|
|
|
/* Killed operands effectively make a preserved register unusable for temporaries which we
|
|
|
|
|
* want to preserve (those included in caller_preserved_demand).
|
|
|
|
|
*/
|
|
|
|
|
for (auto& op : insn->operands) {
|
|
|
|
|
if (!op.isTemp() || !op.isPrecolored() || !op.isKill())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < op.size(); ++i) {
|
|
|
|
|
if (BITSET_TEST(preserved_regs, op.physReg().reg() + i))
|
|
|
|
|
insn->call().callee_preserved_limit -= Temp(0, RegClass(op.regClass().type(), 1));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* TODO: the spiller can't handle linear VGPRs. For now, the post-RA preserved register
|
|
|
|
|
* spilling pass makes sure that all live linear VGPRs are preserved across calls.
|
|
|
|
|
* Therefore, ignore linear VGPRs in the demand calculation here.
|
|
|
|
|
*/
|
|
|
|
|
insn->call().callee_preserved_limit.vgpr =
|
|
|
|
|
MAX2(insn->call().callee_preserved_limit.vgpr - (int16_t)num_linear_vgprs, 0);
|
|
|
|
|
|
2025-02-17 18:42:49 +01:00
|
|
|
insn->call().caller_preserved_demand = demand_after_instr;
|
2025-10-11 22:39:41 +02:00
|
|
|
insn->call().caller_preserved_demand.vgpr -= num_linear_vgprs;
|
|
|
|
|
|
|
|
|
|
/* Non-clobbered (neither discardable nor return) parameters are preserved by the callee
|
|
|
|
|
* if they are placed in clobbered registers.
|
|
|
|
|
*/
|
|
|
|
|
for (auto& op : insn->operands) {
|
|
|
|
|
if (!op.isTemp() || !op.isPrecolored() || op.isClobbered() || op.isKill())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < op.size(); ++i) {
|
|
|
|
|
if (!BITSET_TEST(preserved_regs, op.physReg().reg() + i))
|
|
|
|
|
insn->call().caller_preserved_demand -=
|
|
|
|
|
Temp(0, RegClass(op.regClass().type(), 1));
|
|
|
|
|
}
|
|
|
|
|
}
|
2025-02-17 18:42:49 +01:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < insn->definitions.size(); ++i) {
|
|
|
|
|
if (!insn->definitions[i].isKill())
|
|
|
|
|
insn->call().caller_preserved_demand -= insn->definitions[i].getTemp();
|
|
|
|
|
}
|
2025-11-12 21:29:56 +01:00
|
|
|
|
|
|
|
|
block->call_spills.update(insn->call().caller_preserved_demand -
|
|
|
|
|
insn->call().callee_preserved_limit);
|
2025-02-17 18:42:49 +01:00
|
|
|
}
|
|
|
|
|
|
2024-07-04 10:00:27 +02:00
|
|
|
operand_demand += new_demand;
|
|
|
|
|
insn->register_demand.update(operand_demand);
|
2024-06-26 12:07:12 +02:00
|
|
|
block->register_demand.update(insn->register_demand);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2019-10-29 11:56:09 +01:00
|
|
|
/* handle phi definitions */
|
2024-06-18 17:09:05 +02:00
|
|
|
for (int phi_idx = 0; phi_idx <= idx; phi_idx++) {
|
2019-10-29 11:56:09 +01:00
|
|
|
Instruction* insn = block->instructions[phi_idx].get();
|
2024-06-21 15:45:22 +02:00
|
|
|
insn->register_demand = new_demand;
|
2019-10-29 11:56:09 +01:00
|
|
|
|
2021-02-04 15:55:23 +01:00
|
|
|
assert(is_phi(insn) && insn->definitions.size() == 1);
|
|
|
|
|
if (!insn->definitions[0].isTemp()) {
|
|
|
|
|
assert(insn->definitions[0].isFixed() && insn->definitions[0].physReg() == exec);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2019-10-29 11:56:09 +01:00
|
|
|
Definition& definition = insn->definitions[0];
|
2024-10-22 19:03:13 +02:00
|
|
|
ctx.program->needs_vcc |= definition.isFixed() && definition.physReg() == vcc;
|
2024-07-09 15:45:01 +02:00
|
|
|
const size_t n = live.erase(definition.tempId());
|
|
|
|
|
if (n && (definition.isKill() || ctx.handled_once > block->index)) {
|
|
|
|
|
Block::edge_vec& preds =
|
|
|
|
|
insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
|
|
|
|
|
for (unsigned i = 0; i < preds.size(); i++) {
|
|
|
|
|
if (insn->operands[i].isTemp())
|
|
|
|
|
ctx.worklist = std::max<int>(ctx.worklist, preds[i]);
|
2022-08-17 00:23:36 +02:00
|
|
|
}
|
|
|
|
|
}
|
2024-07-09 15:45:01 +02:00
|
|
|
definition.setKill(!n);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2019-10-29 11:56:09 +01:00
|
|
|
/* handle phi operands */
|
2024-06-18 17:09:05 +02:00
|
|
|
for (int phi_idx = 0; phi_idx <= idx; phi_idx++) {
|
2019-10-29 11:56:09 +01:00
|
|
|
Instruction* insn = block->instructions[phi_idx].get();
|
|
|
|
|
assert(is_phi(insn));
|
2024-06-18 17:09:05 +02:00
|
|
|
/* Ignore dead phis. */
|
|
|
|
|
if (insn->definitions[0].isKill())
|
|
|
|
|
continue;
|
2024-07-09 15:45:01 +02:00
|
|
|
for (Operand& operand : insn->operands) {
|
2020-02-21 12:23:28 +00:00
|
|
|
if (!operand.isTemp())
|
2019-10-29 11:56:09 +01:00
|
|
|
continue;
|
2021-03-04 16:31:22 +01:00
|
|
|
|
|
|
|
|
/* set if the operand is killed by this (or another) phi instruction */
|
|
|
|
|
operand.setKill(!live.count(operand.tempId()));
|
2019-10-29 11:56:09 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-08 18:45:11 +02:00
|
|
|
if (ctx.program->live.live_in[block->index].insert(live)) {
|
2024-07-09 15:45:01 +02:00
|
|
|
if (block->linear_preds.size()) {
|
|
|
|
|
assert(block->logical_preds.empty() ||
|
|
|
|
|
block->logical_preds.back() <= block->linear_preds.back());
|
2024-07-08 18:45:11 +02:00
|
|
|
ctx.worklist = std::max<int>(ctx.worklist, block->linear_preds.back());
|
2024-07-09 15:45:01 +02:00
|
|
|
} else {
|
2024-07-30 17:08:19 +01:00
|
|
|
ASSERTED bool is_valid = validate_ir(ctx.program);
|
|
|
|
|
assert(!is_valid);
|
2024-07-09 15:45:01 +02:00
|
|
|
}
|
2024-07-08 18:45:11 +02:00
|
|
|
}
|
|
|
|
|
|
2024-06-19 14:11:51 +02:00
|
|
|
block->live_in_demand = new_demand;
|
2024-06-26 12:07:12 +02:00
|
|
|
block->register_demand.update(block->live_in_demand);
|
|
|
|
|
ctx.program->max_reg_demand.update(block->register_demand);
|
2025-11-12 21:29:56 +01:00
|
|
|
ctx.program->max_call_spills.update(block->call_spills);
|
2024-07-09 15:45:01 +02:00
|
|
|
ctx.handled_once = std::min(ctx.handled_once, block->index);
|
2024-06-19 14:11:51 +02:00
|
|
|
|
2023-06-12 13:28:11 +02:00
|
|
|
assert(!block->linear_preds.empty() || (new_demand == RegisterDemand() && live.empty()));
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-12-18 16:18:35 +00:00
|
|
|
|
|
|
|
|
unsigned
|
|
|
|
|
calc_waves_per_workgroup(Program* program)
|
|
|
|
|
{
|
2020-03-12 16:28:48 +01:00
|
|
|
/* When workgroup size is not known, just go with wave_size */
|
|
|
|
|
unsigned workgroup_size =
|
|
|
|
|
program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;
|
|
|
|
|
|
2019-12-18 16:18:35 +00:00
|
|
|
return align(workgroup_size, program->wave_size) / program->wave_size;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
} /* end namespace */
|
|
|
|
|
|
2023-08-11 20:58:32 +01:00
|
|
|
bool
|
|
|
|
|
uses_scratch(Program* program)
|
|
|
|
|
{
|
|
|
|
|
/* RT uses scratch but we don't yet know how much. */
|
|
|
|
|
return program->config->scratch_bytes_per_wave || program->stage == raytracing_cs;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-13 16:41:00 +01:00
|
|
|
uint16_t
|
|
|
|
|
get_extra_sgprs(Program* program)
|
|
|
|
|
{
|
2023-08-11 20:58:32 +01:00
|
|
|
/* We don't use this register on GFX6-8 and it's removed on GFX10+. */
|
|
|
|
|
bool needs_flat_scr = uses_scratch(program) && program->gfx_level == GFX9;
|
2022-05-19 14:12:08 +01:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (program->gfx_level >= GFX10) {
|
2021-01-28 13:07:11 +00:00
|
|
|
assert(!program->dev.xnack_enabled);
|
aco: use VCC as regular SGPR pair on GFX10
There is no need to reserve it for special purposes, only.
Totals from 139391 (100.00% of 139391) affected shaders (Navi10):
VGPRs: 4738296 -> 4738156 (-0.00%); split: -0.01%, +0.00%
SpillSGPRs: 16188 -> 14968 (-7.54%); split: -7.60%, +0.06%
CodeSize: 294204472 -> 294118048 (-0.03%); split: -0.04%, +0.01%
MaxWaves: 2119584 -> 2119619 (+0.00%); split: +0.00%, -0.00%
Instrs: 56075079 -> 56056235 (-0.03%); split: -0.05%, +0.01%
Cycles: 1757781564 -> 1755354032 (-0.14%); split: -0.16%, +0.02%
VMEM: 52995887 -> 52996319 (+0.00%); split: +0.07%, -0.07%
SMEM: 9005338 -> 9004858 (-0.01%); split: +0.16%, -0.17%
VClause: 1178436 -> 1178331 (-0.01%); split: -0.02%, +0.01%
SClause: 2403649 -> 2404542 (+0.04%); split: -0.14%, +0.18%
Copies: 3447073 -> 3432417 (-0.43%); split: -0.66%, +0.23%
Branches: 1166542 -> 1166422 (-0.01%); split: -0.11%, +0.10%
PreSGPRs: 4229322 -> 4235538 (+0.15%)
PreVGPRs: 3817111 -> 3817040 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8921>
2021-02-02 17:46:35 +01:00
|
|
|
return 0;
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (program->gfx_level >= GFX8) {
|
2022-05-19 14:12:08 +01:00
|
|
|
if (needs_flat_scr)
|
2019-09-13 16:41:00 +01:00
|
|
|
return 6;
|
2021-01-28 13:07:11 +00:00
|
|
|
else if (program->dev.xnack_enabled)
|
2019-09-13 16:41:00 +01:00
|
|
|
return 4;
|
|
|
|
|
else if (program->needs_vcc)
|
|
|
|
|
return 2;
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
} else {
|
2021-01-28 13:07:11 +00:00
|
|
|
assert(!program->dev.xnack_enabled);
|
2022-05-19 14:12:08 +01:00
|
|
|
if (needs_flat_scr)
|
2019-09-13 16:41:00 +01:00
|
|
|
return 4;
|
|
|
|
|
else if (program->needs_vcc)
|
|
|
|
|
return 2;
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint16_t
|
|
|
|
|
get_sgpr_alloc(Program* program, uint16_t addressable_sgprs)
|
|
|
|
|
{
|
|
|
|
|
uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
|
2021-01-28 13:07:11 +00:00
|
|
|
uint16_t granule = program->dev.sgpr_alloc_granule;
|
2021-02-02 17:33:09 +01:00
|
|
|
return ALIGN_NPOT(std::max(sgprs, granule), granule);
|
2019-09-13 16:41:00 +01:00
|
|
|
}
|
|
|
|
|
|
2019-12-03 14:21:16 +00:00
|
|
|
uint16_t
|
|
|
|
|
get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
|
|
|
|
|
{
|
2021-01-28 13:07:11 +00:00
|
|
|
assert(addressable_vgprs <= program->dev.vgpr_limit);
|
|
|
|
|
uint16_t granule = program->dev.vgpr_alloc_granule;
|
2022-09-26 17:18:48 +01:00
|
|
|
return ALIGN_NPOT(std::max(addressable_vgprs, granule), granule);
|
2019-12-03 14:21:16 +00:00
|
|
|
}
|
|
|
|
|
|
2021-02-05 14:36:39 +01:00
|
|
|
unsigned
|
|
|
|
|
round_down(unsigned a, unsigned b)
|
2019-09-13 16:41:00 +01:00
|
|
|
{
|
2021-02-05 14:36:39 +01:00
|
|
|
return a - (a % b);
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-24 13:12:28 +02:00
|
|
|
RegisterDemand
|
|
|
|
|
get_addr_regs_from_waves(Program* program, uint16_t waves)
|
2021-02-05 14:36:39 +01:00
|
|
|
{
|
|
|
|
|
/* it's not possible to allocate more than 128 SGPRs */
|
2021-01-28 13:07:11 +00:00
|
|
|
uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
|
2024-07-24 13:12:28 +02:00
|
|
|
sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule) - get_extra_sgprs(program);
|
|
|
|
|
sgprs = std::min(sgprs, program->dev.sgpr_limit);
|
2019-09-13 16:41:00 +01:00
|
|
|
|
2022-09-26 17:18:48 +01:00
|
|
|
uint16_t vgprs = program->dev.physical_vgprs / waves;
|
|
|
|
|
vgprs = vgprs / program->dev.vgpr_alloc_granule * program->dev.vgpr_alloc_granule;
|
2021-02-05 14:38:08 +01:00
|
|
|
vgprs -= program->config->num_shared_vgprs / 2;
|
2024-07-24 13:12:28 +02:00
|
|
|
vgprs = std::min(vgprs, program->dev.vgpr_limit);
|
|
|
|
|
return RegisterDemand(vgprs, sgprs);
|
2019-12-03 14:21:16 +00:00
|
|
|
}
|
|
|
|
|
|
2019-12-18 16:18:35 +00:00
|
|
|
void
|
|
|
|
|
calc_min_waves(Program* program)
|
|
|
|
|
{
|
|
|
|
|
unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
|
2021-01-28 13:07:11 +00:00
|
|
|
unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
|
2019-12-18 16:18:35 +00:00
|
|
|
program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-19 11:32:56 +02:00
|
|
|
uint16_t
|
|
|
|
|
max_suitable_waves(Program* program, uint16_t waves)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2022-04-19 11:32:56 +02:00
|
|
|
unsigned num_simd = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
|
|
|
|
|
unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
|
|
|
|
|
unsigned num_workgroups = waves * num_simd / waves_per_workgroup;
|
|
|
|
|
|
|
|
|
|
/* Adjust #workgroups for LDS */
|
2025-10-03 11:56:38 +02:00
|
|
|
unsigned lds_increment = ac_shader_get_lds_alloc_granularity(program->gfx_level);
|
|
|
|
|
unsigned lds_per_workgroup = align(program->config->lds_size, lds_increment);
|
2022-04-19 11:32:56 +02:00
|
|
|
|
|
|
|
|
if (program->stage == fragment_fs) {
|
|
|
|
|
/* PS inputs are moved from PC (parameter cache) to LDS before PS waves are launched.
|
|
|
|
|
* Each PS input occupies 3x vec4 of LDS space. See Figure 10.3 in GCN3 ISA manual.
|
|
|
|
|
* These limit occupancy the same way as other stages' LDS usage does.
|
|
|
|
|
*/
|
|
|
|
|
unsigned lds_bytes_per_interp = 3 * 16;
|
2024-11-26 15:56:02 +01:00
|
|
|
unsigned lds_param_bytes = lds_bytes_per_interp * program->info.ps.num_inputs;
|
2025-10-03 11:56:38 +02:00
|
|
|
lds_per_workgroup += align(lds_param_bytes, lds_increment);
|
2022-04-19 11:32:56 +02:00
|
|
|
}
|
2021-01-28 13:07:11 +00:00
|
|
|
unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;
|
2022-04-19 11:32:56 +02:00
|
|
|
if (lds_per_workgroup)
|
|
|
|
|
num_workgroups = std::min(num_workgroups, lds_limit / lds_per_workgroup);
|
|
|
|
|
|
|
|
|
|
/* Hardware limitation */
|
|
|
|
|
if (waves_per_workgroup > 1)
|
|
|
|
|
num_workgroups = std::min(num_workgroups, program->wgp_mode ? 32u : 16u);
|
|
|
|
|
|
|
|
|
|
/* Adjust #waves for workgroup multiples:
|
|
|
|
|
* In cases like waves_per_workgroup=3 or lds=65536 and
|
|
|
|
|
* waves_per_workgroup=1, we want the maximum possible number of waves per
|
|
|
|
|
* SIMD and not the minimum. so DIV_ROUND_UP is used
|
|
|
|
|
*/
|
|
|
|
|
unsigned workgroup_waves = num_workgroups * waves_per_workgroup;
|
|
|
|
|
return DIV_ROUND_UP(workgroup_waves, num_simd);
|
|
|
|
|
}
|
2019-10-18 19:06:10 +01:00
|
|
|
|
2022-04-19 11:32:56 +02:00
|
|
|
void
|
|
|
|
|
update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
|
|
|
|
{
|
2021-02-05 14:36:39 +01:00
|
|
|
assert(program->min_waves >= 1);
|
2024-07-24 13:12:28 +02:00
|
|
|
RegisterDemand limit = get_addr_regs_from_waves(program, program->min_waves);
|
2021-02-05 14:36:39 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* this won't compile, register pressure reduction necessary */
|
2025-11-12 21:29:56 +01:00
|
|
|
if (new_demand.exceeds(limit) || program->max_call_spills != RegisterDemand()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
program->num_waves = 0;
|
|
|
|
|
program->max_reg_demand = new_demand;
|
|
|
|
|
} else {
|
2021-01-28 13:07:11 +00:00
|
|
|
program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
|
2021-02-05 14:38:08 +01:00
|
|
|
uint16_t vgpr_demand =
|
|
|
|
|
get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
|
2021-01-28 13:07:11 +00:00
|
|
|
program->num_waves =
|
|
|
|
|
std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
|
2023-12-05 16:58:13 +01:00
|
|
|
program->num_waves = std::min(program->num_waves, program->dev.max_waves_per_simd);
|
2019-10-18 19:06:10 +01:00
|
|
|
|
2022-04-19 11:32:56 +02:00
|
|
|
/* Adjust for LDS and workgroup multiples and calculate max_reg_demand */
|
|
|
|
|
program->num_waves = max_suitable_waves(program, program->num_waves);
|
2024-07-24 13:12:28 +02:00
|
|
|
program->max_reg_demand = get_addr_regs_from_waves(program, program->num_waves);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-13 11:55:27 +02:00
|
|
|
void
|
2021-04-20 17:35:41 +01:00
|
|
|
live_var_analysis(Program* program)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-07-08 18:45:11 +02:00
|
|
|
program->live.live_in.clear();
|
2023-02-10 14:03:11 +01:00
|
|
|
program->live.memory.release();
|
2024-07-08 18:45:11 +02:00
|
|
|
program->live.live_in.resize(program->blocks.size(), IDSet(program->live.memory));
|
2023-11-10 11:35:43 +01:00
|
|
|
program->max_reg_demand = RegisterDemand();
|
2025-11-12 21:29:56 +01:00
|
|
|
program->max_call_spills = RegisterDemand();
|
2025-11-18 19:26:09 +01:00
|
|
|
program->fixed_reg_demand = RegisterDemand();
|
2022-05-12 02:50:17 -04:00
|
|
|
program->needs_vcc = program->gfx_level >= GFX10;
|
2020-02-21 12:23:28 +00:00
|
|
|
|
2023-11-10 11:35:43 +01:00
|
|
|
live_ctx ctx;
|
|
|
|
|
ctx.program = program;
|
|
|
|
|
ctx.worklist = program->blocks.size() - 1;
|
2024-07-09 15:45:01 +02:00
|
|
|
ctx.handled_once = program->blocks.size();
|
2023-11-10 11:35:43 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* this implementation assumes that the block idx corresponds to the block's position in
|
|
|
|
|
* program->blocks vector */
|
2023-11-10 11:35:43 +01:00
|
|
|
while (ctx.worklist >= 0) {
|
|
|
|
|
process_live_temps_per_block(ctx, &program->blocks[ctx.worklist--]);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2025-11-18 19:26:09 +01:00
|
|
|
program->max_reg_demand.update(program->fixed_reg_demand);
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* calculate the program's register demand and number of waves */
|
2021-04-20 17:35:41 +01:00
|
|
|
if (program->progress < CompilationProgress::after_ra)
|
2023-11-10 11:35:43 +01:00
|
|
|
update_vgpr_sgpr_demand(program, program->max_reg_demand);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace aco
|