2019-09-17 13:22:17 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2018 Valve Corporation
|
|
|
|
|
* Copyright © 2018 Google
|
|
|
|
|
*
|
2024-04-08 09:02:30 +02:00
|
|
|
* SPDX-License-Identifier: MIT
|
2019-09-17 13:22:17 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "aco_ir.h"
|
2021-06-09 10:14:54 +02:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
namespace aco {
|
2024-07-01 15:55:09 +02:00
|
|
|
|
2020-02-21 20:14:03 +00:00
|
|
|
/* Returns the net change in register demand across this instruction:
 * temps created by non-killed definitions minus temps whose last use
 * (first-kill) is one of this instruction's operands.
 */
RegisterDemand
get_live_changes(Instruction* instr)
{
   RegisterDemand delta;

   for (const Definition& def : instr->definitions) {
      if (def.isTemp() && !def.isKill())
         delta += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (op.isTemp() && op.isFirstKill())
         delta -= op.getTemp();
   }

   return delta;
}
|
|
|
|
|
|
2024-06-20 11:37:20 +02:00
|
|
|
/* Extra demand caused by an operand that is fixed to the same register as a
 * definition: if that operand is not killed, the definition cannot reuse its
 * register, so the definition's temp adds to the demand.
 */
RegisterDemand
get_additional_operand_demand(Instruction* instr)
{
   RegisterDemand extra;

   const int fixed_op = get_op_fixed_to_def(instr);
   if (fixed_op >= 0 && !instr->operands[fixed_op].isKill())
      extra += instr->definitions[0].getTemp();

   return extra;
}
|
|
|
|
|
|
2020-02-21 20:14:03 +00:00
|
|
|
/* Computes the maximum of the demand immediately before and immediately after
 * the instruction, relative to the demand of the surviving values. Late-kill
 * operands still occupy registers while the definitions are written.
 */
RegisterDemand
get_temp_registers(Instruction* instr)
{
   RegisterDemand before;
   RegisterDemand after;

   for (Operand op : instr->operands) {
      if (!op.isFirstKill())
         continue;
      /* First-kill operands are live before the instruction... */
      before += op.getTemp();
      /* ...and late-kill operands additionally overlap the definitions. */
      if (op.isLateKill())
         after += op.getTemp();
   }

   for (Definition def : instr->definitions) {
      if (def.isKill())
         after += def.getTemp(); /* dead result still needs a register */
      else if (def.isTemp())
         before -= def.getTemp(); /* surviving result doesn't exist yet before */
   }

   before += get_additional_operand_demand(instr);
   after.update(before);
   return after;
}
|
|
|
|
|
|
|
|
|
|
namespace {
|
2021-07-26 17:55:48 +01:00
|
|
|
|
2023-11-10 11:35:43 +01:00
|
|
|
/* State shared across the worklist-driven liveness computation. */
struct live_ctx {
   /* Arena backing the IDSets built while processing blocks. */
   monotonic_buffer_resource m;
   Program* program;
   /* Highest block index still requiring (re)processing; negative terminates
    * the fixpoint loop. */
   int32_t worklist;
   /* Lowest block index processed so far; successors with an index >= this
    * may not have been visited yet. Initialized to the block count. */
   uint32_t handled_once;
};
|
|
|
|
|
|
2022-03-16 10:56:26 +01:00
|
|
|
bool
|
|
|
|
|
instr_needs_vcc(Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (instr->isVOPC())
|
|
|
|
|
return true;
|
|
|
|
|
if (instr->isVOP2() && !instr->isVOP3()) {
|
|
|
|
|
if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
|
|
|
|
|
instr->operands[2].regClass().type() == RegType::sgpr)
|
|
|
|
|
return true;
|
|
|
|
|
if (instr->definitions.size() == 2)
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-09 15:45:01 +02:00
|
|
|
/* Builds the live-out set of a block by merging the live-in sets of its
 * successors. Linear temps propagate along linear edges, logical temps along
 * logical edges. Phi operands in not-yet-processed successors are added
 * explicitly since they are live-out of this block but not live-in of the
 * successor. */
IDSet
compute_live_out(live_ctx& ctx, Block* block)
{
   IDSet live(ctx.m);

   if (block->logical_succs.empty()) {
      /* Linear blocks:
       * Directly insert the successor if it is a linear block as well.
       */
      for (unsigned succ : block->linear_succs) {
         if (ctx.program->blocks[succ].logical_preds.empty()) {
            live.insert(ctx.program->live.live_in[succ]);
         } else {
            /* Successor is a logical block: only linear temps flow along this
             * linear edge. */
            for (unsigned t : ctx.program->live.live_in[succ]) {
               if (ctx.program->temp_rc[t].is_linear())
                  live.insert(t);
            }
         }
      }
   } else {
      /* Logical blocks:
       * Linear successors are either linear blocks or logical targets.
       */
      live = IDSet(ctx.program->live.live_in[block->linear_succs[0]], ctx.m);
      if (block->linear_succs.size() == 2)
         live.insert(ctx.program->live.live_in[block->linear_succs[1]]);

      /* At most one logical target needs a separate insertion. */
      if (block->logical_succs.back() != block->linear_succs.back()) {
         /* Only non-linear temps flow along the purely-logical edge. */
         for (unsigned t : ctx.program->live.live_in[block->logical_succs.back()]) {
            if (!ctx.program->temp_rc[t].is_linear())
               live.insert(t);
         }
      } else {
         assert(block->logical_succs[0] == block->linear_succs[0]);
      }
   }

   /* Handle phi operands */
   if (block->linear_succs.size() == 1 && block->linear_succs[0] >= ctx.handled_once) {
      Block& succ = ctx.program->blocks[block->linear_succs[0]];
      /* Index of this block among the successor's predecessors selects the
       * phi operand contributed by this block. */
      auto it = std::find(succ.linear_preds.begin(), succ.linear_preds.end(), block->index);
      unsigned op_idx = std::distance(succ.linear_preds.begin(), it);
      for (aco_ptr<Instruction>& phi : succ.instructions) {
         if (!is_phi(phi))
            break;
         /* Skip logical phis here (handled below) and dead phis. */
         if (phi->opcode == aco_opcode::p_phi || phi->definitions[0].isKill())
            continue;
         if (phi->operands[op_idx].isTemp())
            live.insert(phi->operands[op_idx].tempId());
      }
   }
   if (block->logical_succs.size() == 1 && block->logical_succs[0] >= ctx.handled_once) {
      Block& succ = ctx.program->blocks[block->logical_succs[0]];
      auto it = std::find(succ.logical_preds.begin(), succ.logical_preds.end(), block->index);
      unsigned op_idx = std::distance(succ.logical_preds.begin(), it);
      for (aco_ptr<Instruction>& phi : succ.instructions) {
         if (!is_phi(phi))
            break;
         /* Skip linear phis (handled above) and dead phis. */
         if (phi->opcode == aco_opcode::p_linear_phi || phi->definitions[0].isKill())
            continue;
         if (phi->operands[op_idx].isTemp())
            live.insert(phi->operands[op_idx].tempId());
      }
   }

   return live;
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* Performs one backward liveness pass over a single block: computes kill
 * flags on operands/definitions, per-instruction and per-block register
 * demand, and this block's live-in set. If the live-in set grew, the
 * predecessors are re-queued on the worklist. */
void
process_live_temps_per_block(live_ctx& ctx, Block* block)
{
   RegisterDemand new_demand;
   block->register_demand = RegisterDemand();
   IDSet live = compute_live_out(ctx, block);

   /* initialize register demand */
   for (unsigned t : live)
      new_demand += Temp(t, ctx.program->temp_rc[t]);

   /* traverse the instructions backwards */
   int idx;
   for (idx = block->instructions.size() - 1; idx >= 0; idx--) {
      Instruction* insn = block->instructions[idx].get();
      /* Phis are handled separately after this loop. */
      if (is_phi(insn))
         break;

      ctx.program->needs_vcc |= instr_needs_vcc(insn);
      insn->register_demand = RegisterDemand(new_demand.vgpr, new_demand.sgpr);

      bool has_vgpr_def = false;

      /* KILL */
      for (Definition& definition : insn->definitions) {
         has_vgpr_def |= definition.regClass().type() == RegType::vgpr &&
                         !definition.regClass().is_linear_vgpr();

         if (!definition.isTemp()) {
            continue;
         }
         if (definition.isFixed() && definition.physReg() == vcc)
            ctx.program->needs_vcc = true;

         const Temp temp = definition.getTemp();
         const size_t n = live.erase(temp.id());

         if (n) {
            /* The result is used later: it stops being live above this
             * instruction. */
            new_demand -= temp;
            definition.setKill(false);
         } else {
            /* Dead definition: it still occupies a register here. */
            insn->register_demand += temp;
            definition.setKill(true);
         }
      }

      if (ctx.program->gfx_level >= GFX10 && insn->isVALU() &&
          insn->definitions.back().regClass() == s2) {
         /* RDNA2 ISA doc, 6.2.4. Wave64 Destination Restrictions:
          * The first pass of a wave64 VALU instruction may not overwrite a scalar value used by
          * the second half.
          */
         bool carry_in = insn->opcode == aco_opcode::v_addc_co_u32 ||
                         insn->opcode == aco_opcode::v_subb_co_u32 ||
                         insn->opcode == aco_opcode::v_subbrev_co_u32;
         for (unsigned op_idx = 0; op_idx < (carry_in ? 2 : insn->operands.size()); op_idx++) {
            if (insn->operands[op_idx].isOfType(RegType::sgpr))
               insn->operands[op_idx].setLateKill(true);
         }
      } else if (insn->opcode == aco_opcode::p_bpermute_readlane ||
                 insn->opcode == aco_opcode::p_bpermute_permlane ||
                 insn->opcode == aco_opcode::p_bpermute_shared_vgpr ||
                 insn->opcode == aco_opcode::p_dual_src_export_gfx11 ||
                 insn->opcode == aco_opcode::v_mqsad_u32_u8) {
         /* These pseudo/real instructions read their sources while writing
          * results, so no operand may share a register with a definition. */
         for (Operand& op : insn->operands)
            op.setLateKill(true);
      } else if (insn->opcode == aco_opcode::p_interp_gfx11) {
         insn->operands.back().setLateKill(true); /* we don't want the bld.lm def to use m0 */
         if (insn->operands.size() == 7)
            insn->operands[5].setLateKill(true); /* we re-use the destination reg in the middle */
      } else if (insn->opcode == aco_opcode::v_interp_p1_f32 && ctx.program->dev.has_16bank_lds) {
         insn->operands[0].setLateKill(true);
      } else if (insn->opcode == aco_opcode::p_init_scratch) {
         insn->operands.back().setLateKill(true);
      } else if (instr_info.classes[(int)insn->opcode] == instr_class::wmma) {
         insn->operands[0].setLateKill(true);
         insn->operands[1].setLateKill(true);
      }

      /* Check if a definition clobbers some operand */
      int op_idx = get_op_fixed_to_def(insn);
      if (op_idx != -1)
         insn->operands[op_idx].setClobbered(true);

      /* we need to do this in a separate loop because the next one can
       * setKill() for several operands at once and we don't want to
       * overwrite that in a later iteration */
      for (Operand& op : insn->operands) {
         op.setKill(false);
         /* Linear vgprs must be late kill: this is to ensure linear VGPR operands and
          * normal VGPR definitions don't try to use the same register, which is problematic
          * because of assignment restrictions.
          */
         if (op.hasRegClass() && op.regClass().is_linear_vgpr() && !op.isUndefined() &&
             has_vgpr_def)
            op.setLateKill(true);
      }

      /* GEN */
      for (unsigned i = 0; i < insn->operands.size(); ++i) {
         Operand& operand = insn->operands[i];
         if (!operand.isTemp())
            continue;
         if (operand.isFixed() && ctx.program->progress < CompilationProgress::after_ra) {
            ctx.program->needs_vcc |= operand.physReg() == vcc;
            /* Check if this operand gets overwritten by a precolored definition. */
            if (std::any_of(insn->definitions.begin(), insn->definitions.end(),
                            [=](Definition def)
                            {
                               /* Overlap test on the fixed register ranges. */
                               return def.isFixed() &&
                                      def.physReg() + def.size() > operand.physReg() &&
                                      operand.physReg() + operand.size() > def.physReg();
                            }))
               operand.setClobbered(true);
         }
         const Temp temp = operand.getTemp();

         /* Already marked as killed by an earlier (duplicate) operand. */
         if (operand.isKill())
            continue;

         if (live.insert(temp.id()).second) {
            /* Last use of this temp (walking backwards == first insertion):
             * mark it first-kill and flag later duplicates as plain kills. */
            operand.setFirstKill(true);
            for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
               if (insn->operands[j].isTemp() && insn->operands[j].getTemp() == temp)
                  insn->operands[j].setKill(true);
            }
            if (operand.isLateKill())
               insn->register_demand += temp;
            new_demand += temp;
         }
      }

      RegisterDemand before_instr = new_demand + get_additional_operand_demand(insn);
      insn->register_demand.update(before_instr);
      block->register_demand.update(insn->register_demand);
   }

   /* handle phi definitions */
   for (int phi_idx = 0; phi_idx <= idx; phi_idx++) {
      Instruction* insn = block->instructions[phi_idx].get();
      insn->register_demand = new_demand;

      assert(is_phi(insn) && insn->definitions.size() == 1);
      if (!insn->definitions[0].isTemp()) {
         assert(insn->definitions[0].isFixed() && insn->definitions[0].physReg() == exec);
         continue;
      }
      Definition& definition = insn->definitions[0];
      if (definition.isFixed() && definition.physReg() == vcc)
         ctx.program->needs_vcc = true;
      const size_t n = live.erase(definition.tempId());
      if (n && (definition.isKill() || ctx.handled_once > block->index)) {
         /* A previously-dead phi became live: predecessors must be revisited
          * so its operands get marked live there. */
         Block::edge_vec& preds =
            insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
         for (unsigned i = 0; i < preds.size(); i++) {
            if (insn->operands[i].isTemp())
               ctx.worklist = std::max<int>(ctx.worklist, preds[i]);
         }
      }
      definition.setKill(!n);
   }

   /* handle phi operands */
   for (int phi_idx = 0; phi_idx <= idx; phi_idx++) {
      Instruction* insn = block->instructions[phi_idx].get();
      assert(is_phi(insn));
      /* Ignore dead phis. */
      if (insn->definitions[0].isKill())
         continue;
      for (Operand& operand : insn->operands) {
         if (!operand.isTemp())
            continue;
         if (operand.isFixed() && operand.physReg() == vcc)
            ctx.program->needs_vcc = true;

         /* set if the operand is killed by this (or another) phi instruction */
         operand.setKill(!live.count(operand.tempId()));
      }
   }

   /* If the live-in set grew, re-queue the predecessors. */
   if (ctx.program->live.live_in[block->index].insert(live)) {
      if (block->linear_preds.size()) {
         assert(block->logical_preds.empty() ||
                block->logical_preds.back() <= block->linear_preds.back());
         ctx.worklist = std::max<int>(ctx.worklist, block->linear_preds.back());
      } else {
         /* Temps live-in at the entry block indicate invalid IR. */
         ASSERTED bool is_valid = validate_ir(ctx.program);
         assert(!is_valid);
      }
   }

   block->live_in_demand = new_demand;
   block->live_in_demand.sgpr += 2; /* Add 2 SGPRs for potential long-jumps. */
   block->register_demand.update(block->live_in_demand);
   ctx.program->max_reg_demand.update(block->register_demand);
   ctx.handled_once = std::min(ctx.handled_once, block->index);

   assert(!block->linear_preds.empty() || (new_demand == RegisterDemand() && live.empty()));
}
|
2019-12-18 16:18:35 +00:00
|
|
|
|
|
|
|
|
unsigned
|
|
|
|
|
calc_waves_per_workgroup(Program* program)
|
|
|
|
|
{
|
2020-03-12 16:28:48 +01:00
|
|
|
/* When workgroup size is not known, just go with wave_size */
|
|
|
|
|
unsigned workgroup_size =
|
|
|
|
|
program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;
|
|
|
|
|
|
2019-12-18 16:18:35 +00:00
|
|
|
return align(workgroup_size, program->wave_size) / program->wave_size;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
} /* end namespace */
|
|
|
|
|
|
2023-08-11 20:58:32 +01:00
|
|
|
bool
|
|
|
|
|
uses_scratch(Program* program)
|
|
|
|
|
{
|
|
|
|
|
/* RT uses scratch but we don't yet know how much. */
|
|
|
|
|
return program->config->scratch_bytes_per_wave || program->stage == raytracing_cs;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-13 16:41:00 +01:00
|
|
|
uint16_t
|
|
|
|
|
get_extra_sgprs(Program* program)
|
|
|
|
|
{
|
2023-08-11 20:58:32 +01:00
|
|
|
/* We don't use this register on GFX6-8 and it's removed on GFX10+. */
|
|
|
|
|
bool needs_flat_scr = uses_scratch(program) && program->gfx_level == GFX9;
|
2022-05-19 14:12:08 +01:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (program->gfx_level >= GFX10) {
|
2021-01-28 13:07:11 +00:00
|
|
|
assert(!program->dev.xnack_enabled);
|
aco: use VCC as regular SGPR pair on GFX10
There is no need to reserve it for special purposes, only.
Totals from 139391 (100.00% of 139391) affected shaders (Navi10):
VGPRs: 4738296 -> 4738156 (-0.00%); split: -0.01%, +0.00%
SpillSGPRs: 16188 -> 14968 (-7.54%); split: -7.60%, +0.06%
CodeSize: 294204472 -> 294118048 (-0.03%); split: -0.04%, +0.01%
MaxWaves: 2119584 -> 2119619 (+0.00%); split: +0.00%, -0.00%
Instrs: 56075079 -> 56056235 (-0.03%); split: -0.05%, +0.01%
Cycles: 1757781564 -> 1755354032 (-0.14%); split: -0.16%, +0.02%
VMEM: 52995887 -> 52996319 (+0.00%); split: +0.07%, -0.07%
SMEM: 9005338 -> 9004858 (-0.01%); split: +0.16%, -0.17%
VClause: 1178436 -> 1178331 (-0.01%); split: -0.02%, +0.01%
SClause: 2403649 -> 2404542 (+0.04%); split: -0.14%, +0.18%
Copies: 3447073 -> 3432417 (-0.43%); split: -0.66%, +0.23%
Branches: 1166542 -> 1166422 (-0.01%); split: -0.11%, +0.10%
PreSGPRs: 4229322 -> 4235538 (+0.15%)
PreVGPRs: 3817111 -> 3817040 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8921>
2021-02-02 17:46:35 +01:00
|
|
|
return 0;
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (program->gfx_level >= GFX8) {
|
2022-05-19 14:12:08 +01:00
|
|
|
if (needs_flat_scr)
|
2019-09-13 16:41:00 +01:00
|
|
|
return 6;
|
2021-01-28 13:07:11 +00:00
|
|
|
else if (program->dev.xnack_enabled)
|
2019-09-13 16:41:00 +01:00
|
|
|
return 4;
|
|
|
|
|
else if (program->needs_vcc)
|
|
|
|
|
return 2;
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
} else {
|
2021-01-28 13:07:11 +00:00
|
|
|
assert(!program->dev.xnack_enabled);
|
2022-05-19 14:12:08 +01:00
|
|
|
if (needs_flat_scr)
|
2019-09-13 16:41:00 +01:00
|
|
|
return 4;
|
|
|
|
|
else if (program->needs_vcc)
|
|
|
|
|
return 2;
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint16_t
|
|
|
|
|
get_sgpr_alloc(Program* program, uint16_t addressable_sgprs)
|
|
|
|
|
{
|
|
|
|
|
uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
|
2021-01-28 13:07:11 +00:00
|
|
|
uint16_t granule = program->dev.sgpr_alloc_granule;
|
2021-02-02 17:33:09 +01:00
|
|
|
return ALIGN_NPOT(std::max(sgprs, granule), granule);
|
2019-09-13 16:41:00 +01:00
|
|
|
}
|
|
|
|
|
|
2019-12-03 14:21:16 +00:00
|
|
|
uint16_t
|
|
|
|
|
get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
|
|
|
|
|
{
|
2021-01-28 13:07:11 +00:00
|
|
|
assert(addressable_vgprs <= program->dev.vgpr_limit);
|
|
|
|
|
uint16_t granule = program->dev.vgpr_alloc_granule;
|
2022-09-26 17:18:48 +01:00
|
|
|
return ALIGN_NPOT(std::max(addressable_vgprs, granule), granule);
|
2019-12-03 14:21:16 +00:00
|
|
|
}
|
|
|
|
|
|
2021-02-05 14:36:39 +01:00
|
|
|
/* Rounds a down to the nearest multiple of b. */
unsigned
round_down(unsigned a, unsigned b)
{
   return (a / b) * b;
}
|
|
|
|
|
|
|
|
|
|
uint16_t
|
|
|
|
|
get_addr_sgpr_from_waves(Program* program, uint16_t waves)
|
|
|
|
|
{
|
|
|
|
|
/* it's not possible to allocate more than 128 SGPRs */
|
2021-01-28 13:07:11 +00:00
|
|
|
uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
|
|
|
|
|
sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule);
|
2021-02-02 17:33:09 +01:00
|
|
|
sgprs -= get_extra_sgprs(program);
|
2021-01-28 13:07:11 +00:00
|
|
|
return std::min(sgprs, program->dev.sgpr_limit);
|
2019-09-13 16:41:00 +01:00
|
|
|
}
|
|
|
|
|
|
2021-02-05 14:36:39 +01:00
|
|
|
uint16_t
|
|
|
|
|
get_addr_vgpr_from_waves(Program* program, uint16_t waves)
|
2019-12-03 14:21:16 +00:00
|
|
|
{
|
2022-09-26 17:18:48 +01:00
|
|
|
uint16_t vgprs = program->dev.physical_vgprs / waves;
|
|
|
|
|
vgprs = vgprs / program->dev.vgpr_alloc_granule * program->dev.vgpr_alloc_granule;
|
2021-02-05 14:38:08 +01:00
|
|
|
vgprs -= program->config->num_shared_vgprs / 2;
|
2021-01-28 13:07:11 +00:00
|
|
|
return std::min(vgprs, program->dev.vgpr_limit);
|
2019-12-03 14:21:16 +00:00
|
|
|
}
|
|
|
|
|
|
2019-12-18 16:18:35 +00:00
|
|
|
void
|
|
|
|
|
calc_min_waves(Program* program)
|
|
|
|
|
{
|
|
|
|
|
unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
|
2021-01-28 13:07:11 +00:00
|
|
|
unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
|
2019-12-18 16:18:35 +00:00
|
|
|
program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-19 11:32:56 +02:00
|
|
|
/* Clamps a wave count to what LDS usage, workgroup granularity and hardware
 * workgroup limits actually allow, returning the adjusted waves per SIMD. */
uint16_t
max_suitable_waves(Program* program, uint16_t waves)
{
   unsigned num_simd = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
   unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
   /* How many whole workgroups the requested wave count corresponds to. */
   unsigned num_workgroups = waves * num_simd / waves_per_workgroup;

   /* Adjust #workgroups for LDS */
   unsigned lds_per_workgroup = align(program->config->lds_size * program->dev.lds_encoding_granule,
                                      program->dev.lds_alloc_granule);

   if (program->stage == fragment_fs) {
      /* PS inputs are moved from PC (parameter cache) to LDS before PS waves are launched.
       * Each PS input occupies 3x vec4 of LDS space. See Figure 10.3 in GCN3 ISA manual.
       * These limit occupancy the same way as other stages' LDS usage does.
       */
      unsigned lds_bytes_per_interp = 3 * 16;
      unsigned lds_param_bytes = lds_bytes_per_interp * program->info.ps.num_interp;
      lds_per_workgroup += align(lds_param_bytes, program->dev.lds_alloc_granule);
   }
   /* In WGP mode two CUs share their LDS capacity. */
   unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;
   if (lds_per_workgroup)
      num_workgroups = std::min(num_workgroups, lds_limit / lds_per_workgroup);

   /* Hardware limitation */
   if (waves_per_workgroup > 1)
      num_workgroups = std::min(num_workgroups, program->wgp_mode ? 32u : 16u);

   /* Adjust #waves for workgroup multiples:
    * In cases like waves_per_workgroup=3 or lds=65536 and
    * waves_per_workgroup=1, we want the maximum possible number of waves per
    * SIMD and not the minimum. so DIV_ROUND_UP is used
    */
   unsigned workgroup_waves = num_workgroups * waves_per_workgroup;
   return DIV_ROUND_UP(workgroup_waves, num_simd);
}
|
2019-10-18 19:06:10 +01:00
|
|
|
|
2022-04-19 11:32:56 +02:00
|
|
|
/* Derives program->num_waves and program->max_reg_demand from the computed
 * register demand: the achievable occupancy, or zero waves if the demand
 * exceeds what even the minimum wave count allows. */
void
update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
   assert(program->min_waves >= 1);
   uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
   uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);

   /* this won't compile, register pressure reduction necessary */
   if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {
      program->num_waves = 0;
      program->max_reg_demand = new_demand;
   } else {
      /* Waves allowed by the SGPR demand... */
      program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
      /* ...further limited by the VGPR demand (incl. shared VGPRs)... */
      uint16_t vgpr_demand =
         get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
      program->num_waves =
         std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
      /* ...and by the hardware's per-SIMD wave cap. */
      program->num_waves = std::min(program->num_waves, program->dev.max_waves_per_simd);

      /* Adjust for LDS and workgroup multiples and calculate max_reg_demand */
      program->num_waves = max_suitable_waves(program, program->num_waves);
      program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
      program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
   }
}
|
|
|
|
|
|
2024-06-13 11:55:27 +02:00
|
|
|
/* Entry point: recomputes live-in sets, kill flags and register demand for
 * the whole program via a backward worklist fixpoint over the blocks. */
void
live_var_analysis(Program* program)
{
   /* Reset previous results and their backing memory. */
   program->live.live_in.clear();
   program->live.memory.release();
   program->live.live_in.resize(program->blocks.size(), IDSet(program->live.memory));
   program->max_reg_demand = RegisterDemand();
   program->needs_vcc = program->gfx_level >= GFX10;

   live_ctx ctx;
   ctx.program = program;
   /* Start from the last block; blocks re-queue predecessors as needed. */
   ctx.worklist = program->blocks.size() - 1;
   ctx.handled_once = program->blocks.size();

   /* this implementation assumes that the block idx corresponds to the block's position in
    * program->blocks vector */
   while (ctx.worklist >= 0) {
      process_live_temps_per_block(ctx, &program->blocks[ctx.worklist--]);
   }

   /* calculate the program's register demand and number of waves */
   if (program->progress < CompilationProgress::after_ra)
      update_vgpr_sgpr_demand(program, program->max_reg_demand);
}
|
|
|
|
|
|
|
|
|
|
} // namespace aco
|