2019-09-17 13:22:17 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2019 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "aco_ir.h"
|
|
|
|
|
|
|
|
|
|
namespace aco {
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
|
|
struct NOP_ctx {
|
2019-09-12 17:42:17 +02:00
|
|
|
enum chip_class chip_class;
|
|
|
|
|
unsigned vcc_physical;
|
|
|
|
|
|
|
|
|
|
/* pre-GFX10 */
|
2019-09-17 13:22:17 +02:00
|
|
|
/* just initialize these with something less than max NOPs */
|
|
|
|
|
int VALU_wrexec = -10;
|
|
|
|
|
int VALU_wrvcc = -10;
|
|
|
|
|
int VALU_wrsgpr = -10;
|
2019-09-12 17:42:17 +02:00
|
|
|
|
|
|
|
|
/* GFX10 */
|
|
|
|
|
int last_VMEM_since_scalar_write = -1;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
NOP_ctx(Program* program) : chip_class(program->chip_class) {
|
|
|
|
|
vcc_physical = program->config->num_sgprs - 2;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Returns true if the given VALU instruction is treated as writing an SGPR:
 *  - any instruction with the VOPC format bit set,
 *  - a VOP3 instruction with exactly two definitions,
 *  - v_readfirstlane_b32 / v_readlane_b32.
 */
bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
   return (static_cast<uint32_t>(instr->format) & static_cast<uint32_t>(Format::VOPC)) != 0 ||
          (instr->isVOP3() && instr->definitions.size() == 2) ||
          instr->opcode == aco_opcode::v_readfirstlane_b32 ||
          instr->opcode == aco_opcode::v_readlane_b32;
}
|
|
|
|
|
|
|
|
|
|
/* Returns true if the register ranges [a_reg, a_reg + a_size) and
 * [b_reg, b_reg + b_size) share at least one register. */
bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
   /* the range starting higher overlaps iff it starts inside the lower one */
   if (a_reg > b_reg)
      return a_reg - b_reg < b_size;
   return b_reg - a_reg < a_size;
}
|
|
|
|
|
|
2019-09-12 17:42:17 +02:00
|
|
|
/* Decides whether the SMEM instruction `instr` has to be separated from the
 * run of SMEM instructions that directly precedes index `new_idx` in
 * `new_instructions`.
 *
 * Returns 1 if a NOP is needed to break the clause, 0 otherwise.
 */
unsigned handle_SMEM_clause(aco_ptr<Instruction>& instr, int new_idx,
                            std::vector<aco_ptr<Instruction>>& new_instructions)
{
   //TODO: s_dcache_inv needs to be in its own group on GFX10 (and previous versions?)
   const bool is_store = instr->definitions.empty();

   /* true if a fixed, non-constant operand of `insn` overlaps `def` */
   auto reads_def = [](const aco_ptr<Instruction>& insn, const Definition& def) {
      for (const Operand& op : insn->operands) {
         if (!op.isConstant() && op.isFixed() &&
             regs_intersect(def.physReg(), def.size(), op.physReg(), op.size()))
            return true;
      }
      return false;
   };

   /* walk backwards over the immediately preceding SMEM instructions */
   int pred_idx = new_idx;
   while (--pred_idx >= 0) {
      aco_ptr<Instruction>& pred = new_instructions[pred_idx];
      if (pred->format != Format::SMEM)
         break;

      /* Stores are never allowed inside a clause because the clause's
       * instructions may use the same address. */
      if (is_store || pred->definitions.empty())
         return 1;

      Definition& instr_def = instr->definitions[0];
      Definition& pred_def = pred->definitions[0];

      /* ISA reference doesn't say anything about this, but best to be safe */
      const bool defs_overlap = regs_intersect(instr_def.physReg(), instr_def.size(),
                                               pred_def.physReg(), pred_def.size());
      if (defs_overlap)
         return 1;

      /* instr overwrites something pred reads */
      if (reads_def(pred, instr_def))
         return 1;

      /* instr reads something pred writes */
      if (reads_def(instr, pred_def))
         return 1;
   }

   return 0;
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* Pre-GFX10 hazard handling for a single instruction.
 *
 * Returns the number of wait states that have to be inserted in front of
 * `instr`; handle_block() emits them as one s_nop. `new_instructions` holds
 * the instructions already emitted for the current block, so its size is the
 * index `instr` would get if no NOP is needed. `old_instructions` is unused.
 *
 * For VALU instructions that write VCC / EXEC / an SGPR, the index at which
 * the instruction will end up is recorded in `ctx` so that later instructions
 * can compute their distance to the write.
 */
int handle_instruction(NOP_ctx& ctx, aco_ptr<Instruction>& instr,
                       std::vector<aco_ptr<Instruction>>& old_instructions,
                       std::vector<aco_ptr<Instruction>>& new_instructions)
{
   int new_idx = new_instructions.size();

   // TODO: setreg / getreg / m0 writes
   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles

   /* break off from previous SMEM clause if needed */
   if (instr->format == Format::SMEM && ctx.chip_class >= GFX8) {
      return handle_SMEM_clause(instr, new_idx, new_instructions);
   } else if (instr->isVALU() || instr->format == Format::VINTRP) {
      int NOPs = 0;

      if (instr->isDPP()) {
         /* VALU does not forward EXEC to DPP. */
         if (ctx.VALU_wrexec + 5 >= new_idx)
            NOPs = 5 + ctx.VALU_wrexec - new_idx + 1;

         /* VALU DPP reads VGPR written by VALU */
         for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 2; pred_idx--) {
            aco_ptr<Instruction>& pred = new_instructions[pred_idx];
            if ((pred->isVALU() || pred->format == Format::VINTRP) &&
                !pred->definitions.empty() &&
                pred->definitions[0].physReg() == instr->operands[0].physReg()) {
               NOPs = std::max(NOPs, 2 + pred_idx - new_idx + 1);
               break;
            }
         }
      }

      /* SALU writes M0 */
      if (instr->format == Format::VINTRP && new_idx > 0 && ctx.chip_class >= GFX9) {
         aco_ptr<Instruction>& pred = new_instructions.back();
         if (pred->isSALU() &&
             !pred->definitions.empty() &&
             pred->definitions[0].physReg() == m0)
            NOPs = std::max(NOPs, 1);
      }

      for (const Operand& op : instr->operands) {
         /* VALU which uses VCCZ */
         if (op.physReg() == PhysReg{251} &&
             ctx.VALU_wrvcc + 5 >= new_idx)
            NOPs = std::max(NOPs, 5 + ctx.VALU_wrvcc - new_idx + 1);

         /* VALU which uses EXECZ */
         if (op.physReg() == PhysReg{252} &&
             ctx.VALU_wrexec + 5 >= new_idx)
            NOPs = std::max(NOPs, 5 + ctx.VALU_wrexec - new_idx + 1);

         /* VALU which reads VCC as a constant */
         if (ctx.VALU_wrvcc + 1 >= new_idx) {
            for (unsigned k = 0; k < op.size(); k++) {
               unsigned reg = op.physReg() + k;
               if (reg == ctx.vcc_physical || reg == ctx.vcc_physical + 1)
                  NOPs = std::max(NOPs, 1);
            }
         }
      }

      switch (instr->opcode) {
         case aco_opcode::v_readlane_b32:
         case aco_opcode::v_writelane_b32: {
            /* operand 1 must not have been written by a VALU within the
             * last 4 instructions */
            if (ctx.VALU_wrsgpr + 4 < new_idx)
               break;
            PhysReg reg = instr->operands[1].physReg();
            for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 4; pred_idx--) {
               aco_ptr<Instruction>& pred = new_instructions[pred_idx];
               if (!pred->isVALU() || !VALU_writes_sgpr(pred))
                  continue;
               /* NOTE(review): only the first register of each definition is
                * compared; a multi-dword definition that covers `reg` in its
                * upper part would not be detected - confirm this is intended. */
               for (const Definition& def : pred->definitions) {
                  if (def.physReg() == reg)
                     NOPs = std::max(NOPs, 4 + pred_idx - new_idx + 1);
               }
            }
            break;
         }
         case aco_opcode::v_div_fmas_f32:
         case aco_opcode::v_div_fmas_f64: {
            if (ctx.VALU_wrvcc + 4 >= new_idx)
               NOPs = std::max(NOPs, 4 + ctx.VALU_wrvcc - new_idx + 1);
            break;
         }
         default:
            break;
      }

      /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */
      // FIXME: handle case if the last instruction of a block without branch is such store
      // TODO: confirm that DS instructions cannot cause WAR hazards here
      if (new_idx > 0) {
         aco_ptr<Instruction>& pred = new_instructions.back();
         if (pred->isVMEM() &&
             pred->operands.size() == 4 &&
             pred->operands[3].size() > 2 &&
             pred->operands[1].size() != 8 &&
             (pred->format != Format::MUBUF || pred->operands[2].physReg() >= 102)) {
            /* Ops that use a 256-bit T# do not need a wait state.
             * BUFFER_STORE_* operations that use an SGPR for "offset"
             * do not require any wait states. */
            PhysReg wrdata = pred->operands[3].physReg();
            unsigned size = pred->operands[3].size();
            assert(wrdata >= 256);
            for (const Definition& def : instr->definitions) {
               if (regs_intersect(def.physReg(), def.size(), wrdata, size))
                  NOPs = std::max(NOPs, 1);
            }
         }
      }

      if (VALU_writes_sgpr(instr)) {
         /* handle_block() emits a single s_nop in front of instr when NOPs is
          * non-zero, so instr ends up at new_idx + 1 in that case and at
          * new_idx otherwise. Recording the real index keeps the distance
          * computations above consistent with the pred_idx based scans. */
         const int instr_idx = NOPs ? new_idx + 1 : new_idx;
         for (const Definition& def : instr->definitions) {
            if (def.physReg() == vcc)
               ctx.VALU_wrvcc = instr_idx;
            else if (def.physReg() == exec)
               ctx.VALU_wrexec = instr_idx;
            else if (def.physReg() <= 102)
               ctx.VALU_wrsgpr = instr_idx;
         }
      }
      return NOPs;
   } else if (instr->isVMEM() && ctx.VALU_wrsgpr + 5 >= new_idx) {
      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
      for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) {
         aco_ptr<Instruction>& pred = new_instructions[pred_idx];
         if (!(pred->isVALU() && VALU_writes_sgpr(pred)))
            continue;

         for (const Definition& def : pred->definitions) {
            /* only definitions with a register index <= 102 are considered */
            if (def.physReg() > 102)
               continue;

            if (instr->operands.size() > 1 &&
                regs_intersect(instr->operands[1].physReg(), instr->operands[1].size(),
                               def.physReg(), def.size())) {
               return 5 + pred_idx - new_idx + 1;
            }

            if (instr->operands.size() > 2 &&
                regs_intersect(instr->operands[2].physReg(), instr->operands[2].size(),
                               def.physReg(), def.size())) {
               return 5 + pred_idx - new_idx + 1;
            }
         }
      }
   }

   return 0;
}
|
|
|
|
|
|
2019-09-12 17:42:17 +02:00
|
|
|
std::pair<int, int> handle_instruction_gfx10(NOP_ctx& ctx, aco_ptr<Instruction>& instr,
|
|
|
|
|
std::vector<aco_ptr<Instruction>>& old_instructions,
|
|
|
|
|
std::vector<aco_ptr<Instruction>>& new_instructions)
|
|
|
|
|
{
|
|
|
|
|
int new_idx = new_instructions.size();
|
|
|
|
|
unsigned vNOPs = 0;
|
|
|
|
|
unsigned sNOPs = 0;
|
|
|
|
|
|
|
|
|
|
/* break off from prevous SMEM group ("clause" seems to mean something different in RDNA) if needed */
|
|
|
|
|
if (instr->format == Format::SMEM)
|
|
|
|
|
sNOPs = std::max(sNOPs, handle_SMEM_clause(instr, new_idx, new_instructions));
|
|
|
|
|
|
|
|
|
|
/* handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between */
|
|
|
|
|
if (instr->isSALU() || instr->format == Format::SMEM) {
|
|
|
|
|
if (!instr->definitions.empty() && ctx.last_VMEM_since_scalar_write != -1) {
|
|
|
|
|
ctx.last_VMEM_since_scalar_write = -1;
|
|
|
|
|
vNOPs = 1;
|
|
|
|
|
}
|
|
|
|
|
} else if (instr->isVMEM() || instr->isFlatOrGlobal()) {
|
|
|
|
|
ctx.last_VMEM_since_scalar_write = new_idx;
|
|
|
|
|
} else if (instr->opcode == aco_opcode::s_waitcnt) {
|
|
|
|
|
uint16_t imm = static_cast<SOPP_instruction*>(instr.get())->imm;
|
|
|
|
|
unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
|
|
|
|
|
if (vmcnt == 0)
|
|
|
|
|
ctx.last_VMEM_since_scalar_write = -1;
|
|
|
|
|
} else if (instr->isVALU()) {
|
|
|
|
|
ctx.last_VMEM_since_scalar_write = -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return std::make_pair(sNOPs, vNOPs);
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* Rebuilds the instruction list of `block` for pre-GFX10 hardware, placing an
 * s_nop in front of every instruction for which handle_instruction() reports
 * required wait states. */
void handle_block(NOP_ctx& ctx, Block& block)
{
   std::vector<aco_ptr<Instruction>> emitted;
   emitted.reserve(block.instructions.size());

   for (aco_ptr<Instruction>& instr : block.instructions) {
      const unsigned wait_states = handle_instruction(ctx, instr, block.instructions, emitted);
      if (wait_states != 0) {
         // TODO: try to move the instruction down
         /* a single s_nop covers all wait states: its immediate is count - 1 */
         aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
         nop->imm = wait_states - 1;
         nop->block = -1;
         emitted.emplace_back(std::move(nop));
      }

      emitted.emplace_back(std::move(instr));
   }

   /* make the recorded write positions relative to the start of the next block */
   const auto emitted_count = emitted.size();
   ctx.VALU_wrvcc -= emitted_count;
   ctx.VALU_wrexec -= emitted_count;
   ctx.VALU_wrsgpr -= emitted_count;

   block.instructions = std::move(emitted);
}
|
|
|
|
|
|
2019-09-12 17:42:17 +02:00
|
|
|
/* Rebuilds the instruction list of `block` for GFX10, placing the v_nop and
 * s_nop instructions requested by handle_instruction_gfx10() in front of each
 * instruction (v_nops first, then the s_nop). */
void handle_block_gfx10(NOP_ctx& ctx, Block& block)
{
   std::vector<aco_ptr<Instruction>> emitted;
   emitted.reserve(block.instructions.size());

   for (aco_ptr<Instruction>& instr : block.instructions) {
      const std::pair<int, int> NOPs = handle_instruction_gfx10(ctx, instr, block.instructions, emitted);
      const int sNOPs = NOPs.first;
      const int vNOPs = NOPs.second;

      // TODO: try to move the instruction down
      /* one v_nop instruction per requested vector NOP */
      for (int n = 0; n < vNOPs; n++) {
         aco_ptr<VOP1_instruction> nop{create_instruction<VOP1_instruction>(aco_opcode::v_nop, Format::VOP1, 0, 0)};
         emitted.emplace_back(std::move(nop));
      }

      if (sNOPs) {
         // TODO: try to move the instruction down
         /* a single s_nop covers all scalar wait states: its immediate is count - 1 */
         aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
         nop->imm = sNOPs - 1;
         nop->block = -1;
         emitted.emplace_back(std::move(nop));
      }

      emitted.emplace_back(std::move(instr));
   }

   block.instructions = std::move(emitted);
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
} /* end namespace */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Entry point of the pass: walks all blocks of `program` and inserts the NOPs
 * needed to avoid hazards, using the GFX10 handler on GFX10 and newer chips
 * and the pre-GFX10 handler otherwise. Empty blocks are skipped. */
void insert_NOPs(Program* program)
{
   NOP_ctx ctx(program);
   const bool use_gfx10_path = ctx.chip_class >= GFX10;

   for (Block& block : program->blocks) {
      if (block.instructions.empty())
         continue;

      if (use_gfx10_path) {
         handle_block_gfx10(ctx, block);
         continue;
      }
      handle_block(ctx, block);
   }
}
|
|
|
|
|
|
|
|
|
|
}
|