2019-09-17 13:22:17 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2018 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
2023-05-24 16:24:35 +01:00
|
|
|
#include "aco_builder.h"
|
2019-09-17 13:22:17 +02:00
|
|
|
#include "aco_ir.h"
|
2021-06-09 10:14:54 +02:00
|
|
|
|
2021-06-10 11:33:15 +02:00
|
|
|
#include "common/sid.h"
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 15:40:03 +02:00
|
|
|
#include <map>
|
|
|
|
|
#include <stack>
|
|
|
|
|
#include <vector>
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
namespace aco {
|
|
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* The general idea of this pass is:
|
2019-10-21 21:36:41 +01:00
|
|
|
* The CFG is traversed in reverse postorder (forward) and loops are processed
|
|
|
|
|
* several times until no progress is made.
|
|
|
|
|
* Per BB two wait_ctx is maintained: an in-context and out-context.
|
2019-09-17 13:22:17 +02:00
|
|
|
* The in-context is the joined out-contexts of the predecessors.
|
|
|
|
|
* The context contains a map: gpr -> wait_entry
|
|
|
|
|
* consisting of the information about the cnt values to be waited for.
|
|
|
|
|
* Note: After merge-nodes, it might occur that for the same register
|
|
|
|
|
* multiple cnt values are to be waited for.
|
|
|
|
|
*
|
|
|
|
|
* The values are updated according to the encountered instructions:
|
|
|
|
|
* - additional events increment the counter of waits of the same type
|
|
|
|
|
* - or erase gprs with counters higher than to be waited for.
|
|
|
|
|
*/
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
|
|
|
|
|
// when there is a load followed by a use of a previous load
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* Instructions of the same event will finish in-order except for smem
|
|
|
|
|
* and maybe flat. Instructions of different events may not finish in-order. */
|
|
|
|
|
/* One bit per kind of outstanding-hardware-event this pass tracks. Several
 * events can map to the same hardware counter (see get_counters_for_event()). */
enum wait_event : uint16_t {
   event_smem = 1 << 0,          /* scalar memory load/store */
   event_lds = 1 << 1,           /* LDS access */
   event_gds = 1 << 2,           /* GDS access */
   event_vmem = 1 << 3,          /* vector memory load (and stores pre-GFX10) */
   event_vmem_store = 1 << 4,    /* GFX10+ */
   event_flat = 1 << 5,          /* FLAT access: counted on both vm and lgkm */
   event_exp_pos = 1 << 6,       /* position export */
   event_exp_param = 1 << 7,     /* parameter export */
   event_exp_mrt_null = 1 << 8,  /* null MRT export */
   event_gds_gpr_lock = 1 << 9,  /* GPRs locked by a GDS instruction */
   event_vmem_gpr_lock = 1 << 10, /* GPRs locked by a VMEM instruction */
   event_sendmsg = 1 << 11,      /* s_sendmsg */
   event_ldsdir = 1 << 12,       /* LDS direct load */
   event_valu = 1 << 13,         /* VALU write (GFX11+ s_delay_alu tracking) */
   event_trans = 1 << 14,        /* transcendental VALU write (GFX11+) */
   event_salu = 1 << 15,         /* SALU write (GFX11+) */
   num_events = 16,
};
|
|
|
|
|
|
|
|
|
|
/* One bit per hardware wait counter. A wait_entry stores a mask of these. */
enum counter_type : uint8_t {
   counter_exp = 1 << 0,  /* expcnt: exports and GPR locks */
   counter_lgkm = 1 << 1, /* lgkmcnt: LDS/GDS/Konstant(SMEM)/Message */
   counter_vm = 1 << 2,   /* vmcnt: vector memory loads */
   counter_vs = 1 << 3,   /* vscnt: vector memory stores (GFX10+) */
   counter_alu = 1 << 4,  /* s_delay_alu tracking (GFX11+), not a s_waitcnt counter */
   num_counters = 5,
};
|
|
|
|
|
|
2022-05-25 17:21:10 +01:00
|
|
|
/* Classification of VMEM instructions. Different vmem types may complete
 * out-of-order with respect to each other, so in-order assumptions are only
 * valid when all outstanding events have the same single type. */
enum vmem_type : uint8_t {
   vmem_nosampler = 1 << 0, /* plain buffer/global/scratch or MIMG without sampler */
   vmem_sampler = 1 << 1,   /* MIMG with a sampler descriptor */
   vmem_bvh = 1 << 2,       /* ray-tracing BVH intersection */
};
|
|
|
|
|
|
2022-06-17 13:53:08 +01:00
|
|
|
/* Masks grouping the wait events by the hardware counter they increment. */
static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null |
                                   event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
/* event_flat appears in both vm_events and lgkm_events: FLAT accesses bump both counters. */
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
|
|
|
|
|
* wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
|
|
|
|
|
* that we should switch to a different wave and contains info on dependencies as to
|
|
|
|
|
* when we can switch back.
|
|
|
|
|
*
|
|
|
|
|
* This seems to apply only for ALU->ALU dependencies as other instructions have better
|
|
|
|
|
* integration with the frontend.
|
|
|
|
|
*
|
|
|
|
|
* Note that if we do not emit s_delay_alu things will still be correct, but the wave
|
|
|
|
|
* will stall in the ALU (and the ALU will be doing nothing else). We'll use this as
|
|
|
|
|
* I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU
|
|
|
|
|
* instructions can take a different number of cycles based on the exec mask)
|
|
|
|
|
*/
|
|
|
|
|
/* Per-register record of the most recent ALU write, used to build GFX11+
 * s_delay_alu instructions. Tracks both the instruction distance (how many
 * VALU/transcendental instructions ago the write happened) and the remaining
 * cycle estimate until the write completes. */
struct alu_delay_info {
   /* First value past the maximum representable wait: waiting for anything at
    * least this far back is a no-op and the entry can be dropped. */
   static constexpr int8_t valu_nop = 5;
   static constexpr int8_t trans_nop = 4;

   /* How many VALU instructions ago this value was written */
   int8_t valu_instrs = valu_nop;
   /* Cycles until the writing VALU instruction is finished */
   int8_t valu_cycles = 0;

   /* How many transcendental instructions ago this value was written */
   int8_t trans_instrs = trans_nop;
   /* Cycles until the writing transcendental instruction is finished */
   int8_t trans_cycles = 0;

   /* Cycles until the writing SALU instruction is finished */
   int8_t salu_cycles = 0;

   /* Merge another delay record into this one, keeping the tighter of each
    * constraint. Returns true iff any field was tightened. */
   bool combine(const alu_delay_info& other)
   {
      bool tightened = false;
      if (other.valu_instrs < valu_instrs) {
         valu_instrs = other.valu_instrs;
         tightened = true;
      }
      if (other.trans_instrs < trans_instrs) {
         trans_instrs = other.trans_instrs;
         tightened = true;
      }
      if (other.salu_cycles > salu_cycles) {
         salu_cycles = other.salu_cycles;
         tightened = true;
      }
      if (other.valu_cycles > valu_cycles) {
         valu_cycles = other.valu_cycles;
         tightened = true;
      }
      if (other.trans_cycles > trans_cycles) {
         trans_cycles = other.trans_cycles;
         tightened = true;
      }
      return tightened;
   }

   /* Needs to be called after any change to keep the data consistent:
    * expired or non-positive constraints collapse back to their no-op state. */
   void fixup()
   {
      const bool valu_expired = valu_instrs >= valu_nop || valu_cycles <= 0;
      if (valu_expired) {
         valu_instrs = valu_nop;
         valu_cycles = 0;
      }

      const bool trans_expired = trans_instrs >= trans_nop || trans_cycles <= 0;
      if (trans_expired) {
         trans_instrs = trans_nop;
         trans_cycles = 0;
      }

      if (salu_cycles < 0)
         salu_cycles = 0;
   }

   /* Returns true if a wait would be a no-op */
   bool empty() const
   {
      return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
   }
};
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
uint8_t
|
|
|
|
|
get_counters_for_event(wait_event ev)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
switch (ev) {
|
|
|
|
|
case event_smem:
|
|
|
|
|
case event_lds:
|
|
|
|
|
case event_gds:
|
2021-06-09 10:14:54 +02:00
|
|
|
case event_sendmsg: return counter_lgkm;
|
|
|
|
|
case event_vmem: return counter_vm;
|
|
|
|
|
case event_vmem_store: return counter_vs;
|
|
|
|
|
case event_flat: return counter_vm | counter_lgkm;
|
2019-09-17 13:22:17 +02:00
|
|
|
case event_exp_pos:
|
|
|
|
|
case event_exp_param:
|
|
|
|
|
case event_exp_mrt_null:
|
|
|
|
|
case event_gds_gpr_lock:
|
2022-06-17 13:53:08 +01:00
|
|
|
case event_vmem_gpr_lock:
|
|
|
|
|
case event_ldsdir: return counter_exp;
|
2022-11-13 18:15:28 +00:00
|
|
|
case event_valu:
|
|
|
|
|
case event_trans:
|
|
|
|
|
case event_salu: return counter_alu;
|
2021-06-09 10:14:54 +02:00
|
|
|
default: return 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct wait_entry {
|
|
|
|
|
wait_imm imm;
|
2022-11-13 18:15:28 +00:00
|
|
|
alu_delay_info delay;
|
2021-06-09 10:14:54 +02:00
|
|
|
uint16_t events; /* use wait_event notion */
|
2019-09-17 13:22:17 +02:00
|
|
|
uint8_t counters; /* use counter_type notion */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool wait_on_read : 1;
|
|
|
|
|
bool logical : 1;
|
2022-05-25 17:21:10 +01:00
|
|
|
uint8_t vmem_types : 4;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, bool logical_,
|
|
|
|
|
bool wait_on_read_)
|
|
|
|
|
: imm(imm_), delay(delay_), events(event_), counters(get_counters_for_event(event_)),
|
2022-05-25 17:21:10 +01:00
|
|
|
wait_on_read(wait_on_read_), logical(logical_), vmem_types(0)
|
2021-06-09 10:14:54 +02:00
|
|
|
{}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
bool join(const wait_entry& other)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2021-06-09 10:14:54 +02:00
|
|
|
bool changed = (other.events & ~events) || (other.counters & ~counters) ||
|
2023-05-11 17:08:39 +01:00
|
|
|
(other.wait_on_read && !wait_on_read) || (other.vmem_types & !vmem_types) ||
|
|
|
|
|
(!other.logical && logical);
|
2019-09-17 13:22:17 +02:00
|
|
|
events |= other.events;
|
|
|
|
|
counters |= other.counters;
|
2019-10-21 21:36:41 +01:00
|
|
|
changed |= imm.combine(other.imm);
|
2022-11-13 18:15:28 +00:00
|
|
|
changed |= delay.combine(other.delay);
|
2020-05-07 14:27:42 +01:00
|
|
|
wait_on_read |= other.wait_on_read;
|
2022-05-25 17:21:10 +01:00
|
|
|
vmem_types |= other.vmem_types;
|
2023-05-11 17:08:39 +01:00
|
|
|
logical &= other.logical;
|
2019-10-21 21:36:41 +01:00
|
|
|
return changed;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void remove_counter(counter_type counter)
|
|
|
|
|
{
|
|
|
|
|
counters &= ~counter;
|
|
|
|
|
|
|
|
|
|
if (counter == counter_lgkm) {
|
|
|
|
|
imm.lgkm = wait_imm::unset_counter;
|
2019-10-14 17:21:04 +01:00
|
|
|
events &= ~(event_smem | event_lds | event_gds | event_sendmsg);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (counter == counter_vm) {
|
|
|
|
|
imm.vm = wait_imm::unset_counter;
|
|
|
|
|
events &= ~event_vmem;
|
2022-05-25 17:21:10 +01:00
|
|
|
vmem_types = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (counter == counter_exp) {
|
|
|
|
|
imm.exp = wait_imm::unset_counter;
|
2022-06-17 13:53:08 +01:00
|
|
|
events &= ~exp_events;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (counter == counter_vs) {
|
|
|
|
|
imm.vs = wait_imm::unset_counter;
|
|
|
|
|
events &= ~event_vmem_store;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!(counters & counter_lgkm) && !(counters & counter_vm))
|
|
|
|
|
events &= ~event_flat;
|
2022-11-13 18:15:28 +00:00
|
|
|
|
|
|
|
|
if (counter == counter_alu) {
|
|
|
|
|
delay = alu_delay_info();
|
|
|
|
|
events &= ~(event_valu | event_trans | event_salu);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Dataflow state of the pass for one program point: per-GPR wait entries,
 * conservative upper bounds of the outstanding-event counters, and per-storage
 * barrier wait requirements. One in- and one out-context is kept per block. */
struct wait_ctx {
   Program* program;
   enum amd_gfx_level gfx_level;
   /* Hardware maxima for each counter; entries saturate at these values. */
   uint16_t max_vm_cnt;
   uint16_t max_exp_cnt;
   uint16_t max_lgkm_cnt;
   uint16_t max_vs_cnt;
   /* Events whose instructions may complete out-of-order relative to each other. */
   uint16_t unordered_events = event_smem | event_flat;

   /* Upper bounds of how many events of each class are outstanding. */
   uint8_t vm_cnt = 0;
   uint8_t exp_cnt = 0;
   uint8_t lgkm_cnt = 0;
   uint8_t vs_cnt = 0;
   /* Whether a FLAT access is outstanding (FLAT counts on both vm and lgkm). */
   bool pending_flat_lgkm = false;
   bool pending_flat_vm = false;
   bool pending_s_buffer_store = false; /* GFX10 workaround */

   /* Per-storage-class wait requirements for barriers, and the events that
    * contributed to them. */
   wait_imm barrier_imm[storage_count];
   uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */

   std::map<PhysReg, wait_entry> gpr_map;

   wait_ctx() {}
   wait_ctx(Program* program_)
       : program(program_), gfx_level(program_->gfx_level),
         max_vm_cnt(program_->gfx_level >= GFX9 ? 62 : 14), max_exp_cnt(6),
         max_lgkm_cnt(program_->gfx_level >= GFX10 ? 62 : 14),
         max_vs_cnt(program_->gfx_level >= GFX10 ? 62 : 0),
         unordered_events(event_smem | (program_->gfx_level < GFX10 ? event_flat : 0))
   {}

   /* Merge a predecessor's out-context into this block's in-context. `logical`
    * selects which CFG's entries to take. Returns true iff anything changed,
    * so loops can be reprocessed until a fixpoint is reached. */
   bool join(const wait_ctx* other, bool logical)
   {
      bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
                     other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
                     (other->pending_flat_lgkm && !pending_flat_lgkm) ||
                     (other->pending_flat_vm && !pending_flat_vm);

      /* counters join with max: keep the most conservative bound */
      exp_cnt = std::max(exp_cnt, other->exp_cnt);
      vm_cnt = std::max(vm_cnt, other->vm_cnt);
      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
      vs_cnt = std::max(vs_cnt, other->vs_cnt);
      pending_flat_lgkm |= other->pending_flat_lgkm;
      pending_flat_vm |= other->pending_flat_vm;
      pending_s_buffer_store |= other->pending_s_buffer_store;

      for (const auto& entry : other->gpr_map) {
         if (entry.second.logical != logical)
            continue;

         using iterator = std::map<PhysReg, wait_entry>::iterator;
         /* insert() is a no-op if the register already has an entry; in that
          * case merge the two entries instead. */
         const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
         if (insert_pair.second) {
            changed = true;
         } else {
            changed |= insert_pair.first->second.join(entry.second);
         }
      }

      for (unsigned i = 0; i < storage_count; i++) {
         changed |= barrier_imm[i].combine(other->barrier_imm[i]);
         changed |= (other->barrier_events[i] & ~barrier_events[i]) != 0;
         barrier_events[i] |= other->barrier_events[i];
      }

      return changed;
   }

   /* Drop `counter` from one register's entry after the wait was emitted. */
   void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
   {
      entry.remove_counter(counter);
   }
};
|
|
|
|
|
|
2022-05-25 17:21:10 +01:00
|
|
|
uint8_t
|
|
|
|
|
get_vmem_type(Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
|
|
|
|
|
return vmem_bvh;
|
|
|
|
|
else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
|
|
|
|
|
instr->operands[1].regClass() == s4)
|
|
|
|
|
return vmem_sampler;
|
2022-05-25 17:21:50 +01:00
|
|
|
else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
|
2022-05-25 17:21:10 +01:00
|
|
|
return vmem_nosampler;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
/* Accumulate into `wait`/`delay` everything that must be waited on before
 * `instr` may execute: reads of registers with pending wait_on_read entries,
 * and writes to registers with pending entries (unless completion order makes
 * the wait unnecessary). */
void
check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* instr)
{
   for (const Operand op : instr->operands) {
      if (op.isConstant() || op.isUndefined())
         continue;

      /* check consecutively read gprs */
      for (unsigned j = 0; j < op.size(); j++) {
         PhysReg reg{op.physReg() + j};
         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
            continue;

         wait.combine(it->second.imm);
         /* s_delay_alu only matters for ALU consumers of ALU results */
         if (instr->isVALU() || instr->isSALU())
            delay.combine(it->second.delay);
      }
   }

   for (const Definition& def : instr->definitions) {
      /* check consecutively written gprs */
      for (unsigned j = 0; j < def.getTemp().size(); j++) {
         PhysReg reg{def.physReg() + j};

         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end())
            continue;

         /* Vector Memory reads and writes return in the order they were issued,
          * so a WaW on a register only touched by same-typed VMEM (and nothing
          * FLAT) needs no wait. */
         uint8_t vmem_type = get_vmem_type(instr);
         if (vmem_type && ((it->second.events & vm_events) == event_vmem) &&
             it->second.vmem_types == vmem_type)
            continue;

         /* LDS reads and writes return in the order they were issued. same for GDS */
         if (instr->isDS() &&
             (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
            continue;

         wait.combine(it->second.imm);
      }
   }
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
bool
|
|
|
|
|
parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
|
2019-10-21 21:36:41 +01:00
|
|
|
{
|
|
|
|
|
if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
|
|
|
|
|
instr->definitions[0].physReg() == sgpr_null) {
|
2021-01-21 16:13:34 +00:00
|
|
|
imm.vs = std::min<uint8_t>(imm.vs, instr->sopk().imm);
|
2021-07-14 13:49:20 +02:00
|
|
|
return true;
|
2019-10-21 21:36:41 +01:00
|
|
|
} else if (instr->opcode == aco_opcode::s_waitcnt) {
|
2022-05-12 02:50:17 -04:00
|
|
|
imm.combine(wait_imm(ctx.gfx_level, instr->sopp().imm));
|
2021-07-14 13:49:20 +02:00
|
|
|
return true;
|
2019-10-21 21:36:41 +01:00
|
|
|
}
|
2021-07-14 13:49:20 +02:00
|
|
|
return false;
|
2019-10-21 21:36:41 +01:00
|
|
|
}
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
bool
|
|
|
|
|
parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (instr->opcode != aco_opcode::s_delay_alu)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
unsigned imm[2] = {instr->sopp().imm & 0xf, (instr->sopp().imm >> 7) & 0xf};
|
|
|
|
|
for (unsigned i = 0; i < 2; ++i) {
|
|
|
|
|
alu_delay_wait wait = (alu_delay_wait)imm[i];
|
|
|
|
|
if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4)
|
|
|
|
|
delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1;
|
|
|
|
|
else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3)
|
|
|
|
|
delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1;
|
|
|
|
|
else if (wait >= alu_delay_wait::SALU_CYCLE_1)
|
|
|
|
|
delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1;
|
|
|
|
|
}
|
2023-01-03 18:14:16 +00:00
|
|
|
|
2023-05-22 17:44:32 +01:00
|
|
|
delay.valu_cycles = instr->pass_flags & 0xffff;
|
|
|
|
|
delay.trans_cycles = instr->pass_flags >> 16;
|
2023-01-03 18:14:16 +00:00
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
/* Fold into `imm` the waits required to satisfy a barrier with the given sync
 * info, but only for the storage classes/scopes where hardware ordering does
 * not already make the wait redundant. */
void
perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
{
   /* if the workgroup fits in one wave, a workgroup-scope barrier is already
    * satisfied within the subgroup */
   sync_scope subgroup_scope =
      ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
   if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
      unsigned storage = sync.storage;
      while (storage) {
         unsigned idx = u_bit_scan(&storage);

         /* LDS is private to the workgroup */
         sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);

         uint16_t events = ctx.barrier_events[idx];
         if (bar_scope_lds <= subgroup_scope)
            events &= ~event_lds;

         /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
          * in-order for the same workgroup */
         if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
            events &= ~(event_vmem | event_vmem_store | event_smem);

         /* only wait if any event that isn't already ordered remains */
         if (events)
            imm.combine(ctx.barrier_imm[idx]);
      }
   }
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
force_waitcnt(wait_ctx& ctx, wait_imm& imm)
|
2020-08-19 10:40:35 +02:00
|
|
|
{
|
|
|
|
|
if (ctx.vm_cnt)
|
|
|
|
|
imm.vm = 0;
|
|
|
|
|
if (ctx.exp_cnt)
|
|
|
|
|
imm.exp = 0;
|
|
|
|
|
if (ctx.lgkm_cnt)
|
|
|
|
|
imm.lgkm = 0;
|
|
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.gfx_level >= GFX10) {
|
2020-08-19 10:40:35 +02:00
|
|
|
if (ctx.vs_cnt)
|
|
|
|
|
imm.vs = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
/* Advance (or clear) the s_delay_alu bookkeeping of every register entry after
 * an instruction was processed: `clear` drops all ALU tracking (entries that
 * no longer need a wait), otherwise instruction distances are incremented and
 * cycle estimates decremented by `cycles`. Entries left without any counter
 * are erased from the map. */
void
update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
{
   std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
   while (it != ctx.gpr_map.end()) {
      wait_entry& entry = it->second;

      if (clear) {
         entry.remove_counter(counter_alu);
      } else {
         /* one more ALU instruction of the given kind has been issued */
         entry.delay.valu_instrs += is_valu ? 1 : 0;
         entry.delay.trans_instrs += is_trans ? 1 : 0;
         entry.delay.salu_cycles -= cycles;
         entry.delay.valu_cycles -= cycles;
         entry.delay.trans_cycles -= cycles;

         /* collapse expired constraints back to the no-op state */
         entry.delay.fixup();
         if (it->second.delay.empty())
            entry.remove_counter(counter_alu);
      }

      /* erase entries that no longer track anything */
      if (!entry.counters)
         it = ctx.gpr_map.erase(it);
      else
         it++;
   }
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
|
|
|
|
|
memory_sync_info sync_info)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2022-06-24 14:17:29 +02:00
|
|
|
if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
|
2020-08-19 10:40:35 +02:00
|
|
|
/* Force emitting waitcnt states right after the instruction if there is
|
2022-06-24 14:17:29 +02:00
|
|
|
* something to wait for. This is also applied for s_setpc_b64 to ensure
|
|
|
|
|
* waitcnt states are inserted before jumping to the PS epilog.
|
2020-08-19 10:40:35 +02:00
|
|
|
*/
|
2022-06-23 16:30:17 +01:00
|
|
|
force_waitcnt(ctx, imm);
|
2020-08-19 10:40:35 +02:00
|
|
|
}
|
|
|
|
|
|
2023-04-06 23:09:35 +03:00
|
|
|
/* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
|
|
|
|
|
* overlapping waves proceed into the ordered section.
|
|
|
|
|
*/
|
|
|
|
|
if (ctx.program->has_pops_overlapped_waves_wait &&
|
|
|
|
|
(ctx.gfx_level >= GFX11 ? instr->isEXP() && instr->exp().done
|
|
|
|
|
: (instr->opcode == aco_opcode::s_sendmsg &&
|
|
|
|
|
instr->sopp().imm == sendmsg_ordered_ps_done))) {
|
|
|
|
|
if (ctx.vm_cnt)
|
|
|
|
|
imm.vm = 0;
|
|
|
|
|
if (ctx.gfx_level >= GFX10 && ctx.vs_cnt)
|
|
|
|
|
imm.vs = 0;
|
|
|
|
|
/* Await SMEM loads too, as it's possible for an application to create them, like using a
|
|
|
|
|
* scalarization loop - pointless and unoptimal for an inherently divergent address of
|
|
|
|
|
* per-pixel data, but still can be done at least synthetically and must be handled correctly.
|
|
|
|
|
*/
|
|
|
|
|
if (ctx.program->has_smem_buffer_or_global_loads && ctx.lgkm_cnt)
|
|
|
|
|
imm.lgkm = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
check_instr(ctx, imm, delay, instr);
|
2019-10-21 21:36:41 +01:00
|
|
|
|
2020-02-07 16:33:35 +01:00
|
|
|
/* It's required to wait for scalar stores before "writing back" data.
|
|
|
|
|
* It shouldn't cost anything anyways since we're about to do s_endpgm.
|
|
|
|
|
*/
|
|
|
|
|
if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) {
|
2022-05-12 02:50:17 -04:00
|
|
|
assert(ctx.gfx_level >= GFX8);
|
2020-02-07 16:33:35 +01:00
|
|
|
imm.lgkm = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.gfx_level >= GFX10 && instr->isSMEM()) {
|
2019-10-14 15:18:31 +02:00
|
|
|
/* GFX10: A store followed by a load at the same address causes a problem because
|
|
|
|
|
* the load doesn't load the correct values unless we wait for the store first.
|
|
|
|
|
* This is NOT mitigated by an s_nop.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Refine this when we have proper alias analysis.
|
|
|
|
|
*/
|
2021-06-09 10:14:54 +02:00
|
|
|
if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() &&
|
2021-01-21 16:13:34 +00:00
|
|
|
!instr->smem().sync.can_reorder()) {
|
2019-10-14 15:18:31 +02:00
|
|
|
imm.lgkm = 0;
|
|
|
|
|
}
|
2019-09-17 19:59:17 +02:00
|
|
|
}
|
|
|
|
|
|
2022-10-27 12:49:09 +01:00
|
|
|
if (instr->opcode == aco_opcode::ds_ordered_count &&
|
|
|
|
|
((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
|
|
|
|
|
imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
if (instr->opcode == aco_opcode::p_barrier)
|
2021-07-14 13:49:20 +02:00
|
|
|
perform_barrier(ctx, imm, instr->barrier().sync, semantic_acqrel);
|
2020-06-26 15:54:22 +01:00
|
|
|
else
|
2021-07-14 13:49:20 +02:00
|
|
|
perform_barrier(ctx, imm, sync_info, semantic_release);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
if (!imm.empty() || !delay.empty()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
|
|
|
|
|
imm.vm = 0;
|
|
|
|
|
if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
|
|
|
|
|
imm.lgkm = 0;
|
|
|
|
|
|
|
|
|
|
/* reset counters */
|
|
|
|
|
ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
|
|
|
|
|
ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
|
|
|
|
|
ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
|
|
|
|
|
ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
|
|
|
|
|
|
|
|
|
|
/* update barrier wait imms */
|
2020-06-26 15:54:22 +01:00
|
|
|
for (unsigned i = 0; i < storage_count; i++) {
|
2019-09-17 13:22:17 +02:00
|
|
|
wait_imm& bar = ctx.barrier_imm[i];
|
2020-02-11 16:52:20 +00:00
|
|
|
uint16_t& bar_ev = ctx.barrier_events[i];
|
|
|
|
|
if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
|
2019-09-17 13:22:17 +02:00
|
|
|
bar.exp = wait_imm::unset_counter;
|
2020-02-11 16:52:20 +00:00
|
|
|
bar_ev &= ~exp_events;
|
|
|
|
|
}
|
|
|
|
|
if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) {
|
2019-09-17 13:22:17 +02:00
|
|
|
bar.vm = wait_imm::unset_counter;
|
2020-02-11 16:52:20 +00:00
|
|
|
bar_ev &= ~(vm_events & ~event_flat);
|
|
|
|
|
}
|
|
|
|
|
if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) {
|
2019-09-17 13:22:17 +02:00
|
|
|
bar.lgkm = wait_imm::unset_counter;
|
2020-02-11 16:52:20 +00:00
|
|
|
bar_ev &= ~(lgkm_events & ~event_flat);
|
|
|
|
|
}
|
|
|
|
|
if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) {
|
2019-09-17 13:22:17 +02:00
|
|
|
bar.vs = wait_imm::unset_counter;
|
2020-02-11 16:52:20 +00:00
|
|
|
bar_ev &= ~vs_events;
|
|
|
|
|
}
|
|
|
|
|
if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
|
|
|
|
|
bar_ev &= ~event_flat;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
if (ctx.program->gfx_level >= GFX11) {
|
|
|
|
|
update_alu(ctx, false, false, false,
|
|
|
|
|
MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-12 15:28:49 +01:00
|
|
|
/* remove all gprs with higher counter from map */
|
2021-06-09 10:14:54 +02:00
|
|
|
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
|
|
|
|
|
while (it != ctx.gpr_map.end()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
|
2019-12-04 14:41:18 +00:00
|
|
|
ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
|
2019-12-04 14:41:18 +00:00
|
|
|
ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
|
2019-12-04 14:41:18 +00:00
|
|
|
ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
|
2020-05-20 18:15:36 +01:00
|
|
|
if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
|
2019-12-04 14:41:18 +00:00
|
|
|
ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
|
2022-11-13 18:15:28 +00:00
|
|
|
if (delay.valu_instrs <= it->second.delay.valu_instrs)
|
|
|
|
|
it->second.delay.valu_instrs = alu_delay_info::valu_nop;
|
|
|
|
|
if (delay.trans_instrs <= it->second.delay.trans_instrs)
|
|
|
|
|
it->second.delay.trans_instrs = alu_delay_info::trans_nop;
|
|
|
|
|
it->second.delay.fixup();
|
|
|
|
|
if (it->second.delay.empty())
|
|
|
|
|
ctx.wait_and_remove_from_entry(it->first, it->second, counter_alu);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!it->second.counters)
|
|
|
|
|
it = ctx.gpr_map.erase(it);
|
|
|
|
|
else
|
|
|
|
|
it++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (imm.vm == 0)
|
|
|
|
|
ctx.pending_flat_vm = false;
|
2019-10-14 15:18:31 +02:00
|
|
|
if (imm.lgkm == 0) {
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.pending_flat_lgkm = false;
|
2019-10-14 15:18:31 +02:00
|
|
|
ctx.pending_s_buffer_store = false;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
update_barrier_counter(uint8_t* ctr, unsigned max)
|
2020-02-11 16:52:20 +00:00
|
|
|
{
|
|
|
|
|
if (*ctr != wait_imm::unset_counter && *ctr < max)
|
|
|
|
|
(*ctr)++;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Update the per-storage-class wait state used by barriers.
 *
 * For every storage class, ctx.barrier_imm[i] holds the counter values a
 * barrier touching that storage would have to wait for, and
 * ctx.barrier_events[i] records which wait events contributed to it.
 *
 * @param counters  counters incremented by the current instruction
 * @param event     the wait event the current instruction generates
 * @param sync      memory synchronization info of the current instruction
 */
void
update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
{
   for (unsigned i = 0; i < storage_count; i++) {
      wait_imm& bar = ctx.barrier_imm[i];
      uint16_t& bar_ev = ctx.barrier_events[i];
      if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
         /* The instruction accesses this storage class: a barrier must now
          * wait until the affected counters drain to 0. */
         bar_ev |= event;
         if (counters & counter_lgkm)
            bar.lgkm = 0;
         if (counters & counter_vm)
            bar.vm = 0;
         if (counters & counter_exp)
            bar.exp = 0;
         if (counters & counter_vs)
            bar.vs = 0;
      } else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) {
         /* The instruction doesn't touch this storage class but still bumps
          * hardware counters. If all events recorded for this storage use the
          * same (in-order) counter as this event, the barrier can get away
          * with waiting for a larger counter value, so bump it (saturating at
          * the per-counter hardware maximum). */
         if (counters & counter_lgkm && (bar_ev & lgkm_events) == event)
            update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt);
         if (counters & counter_vm && (bar_ev & vm_events) == event)
            update_barrier_counter(&bar.vm, ctx.max_vm_cnt);
         if (counters & counter_exp && (bar_ev & exp_events) == event)
            update_barrier_counter(&bar.exp, ctx.max_exp_cnt);
         if (counters & counter_vs && (bar_ev & vs_events) == event)
            update_barrier_counter(&bar.vs, ctx.max_vs_cnt);
      }
   }
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Account for an instruction that increments hardware wait counters.
 *
 * Increments the context-wide counters (saturating one past the hardware
 * maximum), updates the barrier wait state, and bumps the per-register wait
 * imms of earlier outstanding operations that use the same in-order counter.
 *
 * @param event  the wait event the instruction generates
 * @param sync   memory synchronization info (defaults to "no sync")
 */
void
update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
{
   uint8_t counters = get_counters_for_event(event);

   /* <= (not <) so the counters can saturate one past max; this distinguishes
    * "exactly max outstanding" from "more than we can represent". */
   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;
   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
      ctx.exp_cnt++;
   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
      ctx.vs_cnt++;

   update_barrier_imm(ctx, counters, event, sync);

   /* Unordered events complete in arbitrary order, so older entries for the
    * same counter can't be waited for with a larger imm — leave them at 0. */
   if (ctx.unordered_events & event)
      return;

   /* While a FLAT operation is outstanding, these counters are effectively
    * unordered too (FLAT can complete either half out of order). */
   if (ctx.pending_flat_lgkm)
      counters &= ~counter_lgkm;
   if (ctx.pending_flat_vm)
      counters &= ~counter_vm;

   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
      wait_entry& entry = e.second;

      if (entry.events & ctx.unordered_events)
         continue;

      assert(entry.events);

      /* An entry's imm may only be bumped if every event recorded for it uses
       * the same counter as this event, otherwise ordering is unknown. */
      if ((counters & counter_exp) && (entry.events & exp_events) == event &&
          entry.imm.exp < ctx.max_exp_cnt)
         entry.imm.exp++;
      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
          entry.imm.lgkm < ctx.max_lgkm_cnt)
         entry.imm.lgkm++;
      if ((counters & counter_vm) && (entry.events & vm_events) == event &&
          entry.imm.vm < ctx.max_vm_cnt)
         entry.imm.vm++;
      if ((counters & counter_vs) && (entry.events & vs_events) == event &&
          entry.imm.vs < ctx.max_vs_cnt)
         entry.imm.vs++;
   }
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2022-05-12 02:50:17 -04:00
|
|
|
assert(ctx.gfx_level < GFX10);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
|
|
|
|
|
ctx.lgkm_cnt++;
|
2019-10-22 15:16:37 +01:00
|
|
|
if (ctx.vm_cnt <= ctx.max_vm_cnt)
|
|
|
|
|
ctx.vm_cnt++;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
for (std::pair<PhysReg, wait_entry> e : ctx.gpr_map) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (e.second.counters & counter_vm)
|
|
|
|
|
e.second.imm.vm = 0;
|
|
|
|
|
if (e.second.counters & counter_lgkm)
|
|
|
|
|
e.second.imm.lgkm = 0;
|
|
|
|
|
}
|
|
|
|
|
ctx.pending_flat_lgkm = true;
|
|
|
|
|
ctx.pending_flat_vm = true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Create (or merge) wait entries for each register of a read/written range.
 *
 * Builds a wait_entry with imm values of 0 for every counter this event
 * increments (i.e. "must wait for the counter to drain before reuse") plus
 * ALU delay info for VALU/trans/SALU events, then records it for each
 * register in [reg, reg + rc.size()). If a register already has an entry,
 * the two are merged via wait_entry::join().
 *
 * @param wait_on_read   whether later reads of the register must also wait
 *                       (true for definitions, false for read-locked operands)
 * @param vmem_types     VMEM subtype mask ORed into the entry
 * @param cycles         issue latency used for ALU delay tracking
 * @param force_linear   track the entry across linear control flow even for
 *                       non-linear register classes
 */
void
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
                  uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false)
{
   uint16_t counters = get_counters_for_event(event);
   wait_imm imm;
   if (counters & counter_lgkm)
      imm.lgkm = 0;
   if (counters & counter_vm)
      imm.vm = 0;
   if (counters & counter_exp)
      imm.exp = 0;
   if (counters & counter_vs)
      imm.vs = 0;

   alu_delay_info delay;
   if (event == event_valu) {
      delay.valu_instrs = 0;
      delay.valu_cycles = cycles;
   } else if (event == event_trans) {
      delay.trans_instrs = 0;
      delay.trans_cycles = cycles;
   } else if (event == event_salu) {
      delay.salu_cycles = cycles;
   }

   wait_entry new_entry(event, imm, delay, !rc.is_linear() && !force_linear, wait_on_read);
   new_entry.vmem_types |= vmem_types;

   /* One map entry per 32-bit register of the range. */
   for (unsigned i = 0; i < rc.size(); i++) {
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
      if (!it.second)
         it.first->second.join(new_entry);
   }
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
2022-05-25 17:21:10 +01:00
|
|
|
insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (!op.isConstant() && !op.isUndefined())
|
2022-11-13 18:15:28 +00:00
|
|
|
insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types, 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0,
|
|
|
|
|
unsigned cycles = 0)
|
|
|
|
|
{
|
2023-05-23 14:04:41 +01:00
|
|
|
/* We can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11 without
|
|
|
|
|
* waiting for the load to finish.
|
|
|
|
|
* Also, follow linear control flow for ALU because it's unlikely that the hardware does per-lane
|
|
|
|
|
* dependency checks.
|
2023-05-11 17:08:39 +01:00
|
|
|
*/
|
2023-05-23 14:04:41 +01:00
|
|
|
uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat;
|
|
|
|
|
uint32_t alu_events = event_trans | event_valu | event_salu;
|
|
|
|
|
bool force_linear = ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events));
|
2023-05-11 17:08:39 +01:00
|
|
|
|
|
|
|
|
insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles,
|
|
|
|
|
force_linear);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Track ALU dependencies for GFX11+ s_delay_alu insertion.
 *
 * Classifies the instruction as a trans/VALU/SALU event, records wait
 * entries (with the instruction's latency) for its definitions, and advances
 * the per-context ALU delay state.
 */
void
gen_alu(Instruction* instr, wait_ctx& ctx)
{
   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
   bool is_valu = instr->isVALU();
   bool is_trans = instr->isTrans();
   /* Memory/export instructions reset ALU delay tracking entirely. */
   bool clear = instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() ||
                instr->isMUBUF() || instr->isMTBUF();

   /* Trans takes precedence over plain VALU (trans instructions are VALU). */
   wait_event event = (wait_event)0;
   if (is_trans)
      event = event_trans;
   else if (is_valu)
      event = event_valu;
   else if (instr->isSALU())
      event = event_salu;

   if (event != (wait_event)0) {
      for (const Definition& def : instr->definitions)
         insert_wait_entry(ctx, def, event, 0, cycle_info.latency);
   }
   update_alu(ctx, is_valu, is_trans, clear, cycle_info.issue_cycles);
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Record the wait-counter effects of a single instruction.
 *
 * Dispatches on the instruction format, bumps the relevant hardware counters
 * via update_counters(), and inserts wait entries for the registers that
 * later instructions must not reuse until the operation completes.
 */
void
gen(Instruction* instr, wait_ctx& ctx)
{
   switch (instr->format) {
   case Format::EXP: {
      Export_instruction& exp_instr = instr->exp();

      /* Classify by export target: low targets (MRTs/Z/NULL), then position
       * exports, everything else is a parameter export. */
      wait_event ev;
      if (exp_instr.dest <= 9)
         ev = event_exp_mrt_null;
      else if (exp_instr.dest <= 15)
         ev = event_exp_pos;
      else
         ev = event_exp_param;
      update_counters(ctx, ev);

      /* insert new entries for exported vgprs */
      for (unsigned i = 0; i < 4; i++) {
         if (exp_instr.enabled_mask & (1 << i)) {
            /* With compression, two channels share one operand. */
            unsigned idx = exp_instr.compressed ? i >> 1 : i;
            assert(idx < exp_instr.operands.size());
            insert_wait_entry(ctx, exp_instr.operands[idx], ev);
         }
      }
      /* exec is read by the export, so it is locked as well. */
      insert_wait_entry(ctx, exec, s2, ev, false);
      break;
   }
   case Format::FLAT: {
      FLAT_instruction& flat = instr->flat();
      /* Pre-GFX10, FLAT loads bump both VMcnt and LGKMcnt and may complete
       * out of order — handled by the dedicated helper. */
      if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
         update_counters_for_flat_load(ctx, flat.sync);
      else
         update_counters(ctx, event_flat, flat.sync);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_flat);
      break;
   }
   case Format::SMEM: {
      SMEM_instruction& smem = instr->smem();
      update_counters(ctx, event_smem, smem.sync);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_smem);
      else if (ctx.gfx_level >= GFX10 && !smem.sync.can_reorder())
         /* An s_buffer_store-style SMEM without a definition on GFX10+ must
          * be tracked so ordering against it can be enforced. */
         ctx.pending_s_buffer_store = true;

      break;
   }
   case Format::DS: {
      DS_instruction& ds = instr->ds();
      update_counters(ctx, ds.gds ? event_gds : event_lds, ds.sync);
      /* GDS accesses additionally lock the address/data GPRs. */
      if (ds.gds)
         update_counters(ctx, event_gds_gpr_lock);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ds.gds ? event_gds : event_lds);

      if (ds.gds) {
         for (const Operand& op : instr->operands)
            insert_wait_entry(ctx, op, event_gds_gpr_lock);
         insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
      }
      break;
   }
   case Format::LDSDIR: {
      LDSDIR_instruction& ldsdir = instr->ldsdir();
      update_counters(ctx, event_ldsdir, ldsdir.sync);
      insert_wait_entry(ctx, instr->definitions[0], event_ldsdir);
      break;
   }
   case Format::MUBUF:
   case Format::MTBUF:
   case Format::MIMG:
   case Format::GLOBAL:
   case Format::SCRATCH: {
      /* GFX10+ tracks VMEM stores with the separate vscnt counter. */
      wait_event ev =
         !instr->definitions.empty() || ctx.gfx_level < GFX10 ? event_vmem : event_vmem_store;
      update_counters(ctx, ev, get_sync_info(instr));

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ev, get_vmem_type(instr));

      /* GFX6 hardware quirk: VMEM reads the store-data/coordinate GPRs
       * asynchronously, so they must be locked until the access completes. */
      if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
      } else if (ctx.gfx_level == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
      }

      break;
   }
   case Format::SOPP: {
      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
         update_counters(ctx, event_sendmsg);
      break;
   }
   case Format::SOP1: {
      /* s_sendmsg_rtn_* also writes a destination that must be waited on. */
      if (instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
          instr->opcode == aco_opcode::s_sendmsg_rtn_b64) {
         update_counters(ctx, event_sendmsg);
         insert_wait_entry(ctx, instr->definitions[0], event_sendmsg);
      }
      break;
   }
   default: break;
   }
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Materialize a queued wait_imm as actual wait instructions.
 *
 * Emits a separate s_waitcnt_vscnt (GFX10+ only) for the vs counter, since
 * it cannot be encoded in s_waitcnt, then a single s_waitcnt for the rest.
 * The imm is reset afterwards.
 */
void
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
{
   if (imm.vs != wait_imm::unset_counter) {
      assert(ctx.gfx_level >= GFX10);
      SOPK_instruction* waitcnt_vs =
         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
      /* s_waitcnt_vscnt encodes its SGPR as a definition; use the null SGPR. */
      waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
      waitcnt_vs->imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
   }
   if (!imm.empty()) {
      SOPP_instruction* waitcnt =
         create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
      waitcnt->imm = imm.pack(ctx.gfx_level);
      waitcnt->block = -1;
      instructions.emplace_back(waitcnt);
   }
   imm = wait_imm();
}
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
/* Materialize queued ALU delay info as an s_delay_alu instruction (GFX11+).
 *
 * Packs up to two dependency conditions into the immediate: the first in
 * bits [3:0] and the second shifted to bits [10:7] (bits [6:4] are the skip
 * field, filled in later when instructions are combined). The delay state is
 * reset afterwards.
 */
void
emit_delay_alu(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
               alu_delay_info& delay)
{
   uint32_t imm = 0;
   if (delay.trans_instrs != delay.trans_nop) {
      imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
   }

   if (delay.valu_instrs != delay.valu_nop) {
      /* Second condition goes into the upper field if the first is used. */
      imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
   }

   /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
    * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
    * getting this wrong isn't an issue. */
   if (delay.salu_cycles && imm <= 0xf) {
      /* SALU_CYCLE_1..3 only encode up to 3 cycles. */
      unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
      imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
   }

   SOPP_instruction* inst =
      create_instruction<SOPP_instruction>(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
   inst->imm = imm;
   inst->block = -1;
   /* Stash the cycle counts in pass_flags for later passes/combining. */
   inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16));
   instructions.emplace_back(inst);
   delay = alu_delay_info();
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Insert the required wait instructions into one basic block.
 *
 * Walks the block's instructions, accumulating needed waits in queued_imm /
 * queued_delay (kill() adds requirements, existing waits are parsed and
 * folded in), and emits s_waitcnt / s_delay_alu immediately before the
 * instruction that needs them. The block's instruction list is replaced.
 */
void
handle_block(Program* program, Block& block, wait_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> new_instructions;

   wait_imm queued_imm;
   alu_delay_info queued_delay;

   for (aco_ptr<Instruction>& instr : block.instructions) {
      /* Fold pre-existing wait/delay instructions into the queued state
       * instead of keeping them. */
      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());
      bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());

      memory_sync_info sync_info = get_sync_info(instr.get());
      kill(queued_imm, queued_delay, instr.get(), ctx, sync_info);

      /* gen_alu() must run before gen(): it performs the ALU-entry clear
       * while gen() creates WaR entries, and the two must not be merged
       * (e.g. a v_mov followed by an export of the same VGPR). */
      if (program->gfx_level >= GFX11)
         gen_alu(instr.get(), ctx);
      gen(instr.get(), ctx);

      if (instr->format != Format::PSEUDO_BARRIER && !is_wait && !is_delay_alu) {
         /* VINTERP can encode its own exp wait; prefer that over s_waitcnt. */
         if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {
            instr->vinterp_inreg().wait_exp = MIN2(instr->vinterp_inreg().wait_exp, queued_imm.exp);
            queued_imm.exp = wait_imm::unset_counter;
         }

         if (!queued_imm.empty())
            emit_waitcnt(ctx, new_instructions, queued_imm);
         if (!queued_delay.empty())
            emit_delay_alu(ctx, new_instructions, queued_delay);

         /* ds_ordered_count with the "wave release" bit clear acts as an
          * acquire on GDS storage. */
         bool is_ordered_count_acquire =
            instr->opcode == aco_opcode::ds_ordered_count &&
            !((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1);

         new_instructions.emplace_back(std::move(instr));
         perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);

         if (is_ordered_count_acquire)
            queued_imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
      }
   }

   /* Flush anything still pending at the end of the block. */
   if (!queued_imm.empty())
      emit_waitcnt(ctx, new_instructions, queued_imm);
   if (!queued_delay.empty())
      emit_delay_alu(ctx, new_instructions, queued_delay);

   block.instructions.swap(new_instructions);
}
|
|
|
|
|
|
|
|
|
|
} /* end namespace */
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Entry point: insert all required s_waitcnt / s_delay_alu instructions.
 *
 * Performs a forward dataflow pass over the CFG: each block's input wait
 * context is the join of its predecessors' outputs; loops are re-processed
 * until the contexts stop changing. Afterwards (GFX11+), adjacent
 * s_delay_alu instructions are combined via the skip field.
 */
void
insert_wait_states(Program* program)
{
   /* per BB ctx */
   std::vector<bool> done(program->blocks.size());
   std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program));
   std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program));

   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
   unsigned loop_progress = 0;

   /* VMEM loads of shader arguments may still be in flight at entry. */
   for (Definition def : program->args_pending_vmem) {
      update_counters(in_ctx[0], event_vmem);
      insert_wait_entry(in_ctx[0], def, event_vmem);
   }

   for (unsigned i = 0; i < program->blocks.size();) {
      Block& current = program->blocks[i++];

      if (current.kind & block_kind_discard_early_exit) {
         /* Because the jump to the discard early exit block may happen anywhere in a block, it's
          * not possible to join it with its predecessors this way.
          * We emit all required waits when emitting the discard block.
          */
         continue;
      }

      wait_ctx ctx = in_ctx[current.index];

      if (current.kind & block_kind_loop_header) {
         loop_header_indices.push(current.index);
      } else if (current.kind & block_kind_loop_exit) {
         /* If the loop was reached but never re-entered (no progress since
          * the header), process it a second time from the header so the
          * back-edge contexts are taken into account. */
         bool repeat = false;
         if (loop_progress == loop_header_indices.size()) {
            i = loop_header_indices.top();
            repeat = true;
         }
         loop_header_indices.pop();
         loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
         if (repeat)
            continue;
      }

      /* Join predecessor contexts; track whether anything changed. */
      bool changed = false;
      for (unsigned b : current.linear_preds)
         changed |= ctx.join(&out_ctx[b], false);
      for (unsigned b : current.logical_preds)
         changed |= ctx.join(&out_ctx[b], true);

      if (done[current.index] && !changed) {
         /* Fixed point reached for this block — skip re-processing. */
         in_ctx[current.index] = std::move(ctx);
         continue;
      } else {
         in_ctx[current.index] = ctx;
      }

      if (current.instructions.empty()) {
         out_ctx[current.index] = std::move(ctx);
         continue;
      }

      loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
      done[current.index] = true;

      handle_block(program, current, ctx);

      out_ctx[current.index] = std::move(ctx);
   }

   /* Combine s_delay_alu using the skip field. */
   if (program->gfx_level >= GFX11) {
      for (Block& block : program->blocks) {
         int i = 0;
         int prev_delay_alu = -1;
         for (aco_ptr<Instruction>& instr : block.instructions) {
            if (instr->opcode != aco_opcode::s_delay_alu) {
               block.instructions[i++] = std::move(instr);
               continue;
            }

            uint16_t imm = instr->sopp().imm;
            int skip = i - prev_delay_alu - 1;
            /* A candidate for merging must use only the first condition field
             * (imm >> 7 == 0), have a previous s_delay_alu to merge into, and
             * be within the 3-bit skip field's range. */
            if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
               if (imm >> 7 == 0)
                  prev_delay_alu = i;
               block.instructions[i++] = std::move(instr);
               continue;
            }

            /* Merge this instruction's condition into the previous
             * s_delay_alu's second field and drop it. */
            block.instructions[prev_delay_alu]->sopp().imm |= (skip << 4) | (imm << 7);
            prev_delay_alu = -1;
         }
         block.instructions.resize(i);
      }
   }
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
} // namespace aco
|