2019-09-17 13:22:17 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2018 Valve Corporation
|
|
|
|
|
*
|
2024-04-08 09:02:30 +02:00
|
|
|
* SPDX-License-Identifier: MIT
|
2019-09-17 13:22:17 +02:00
|
|
|
*/
|
|
|
|
|
|
2023-05-24 16:24:35 +01:00
|
|
|
#include "aco_builder.h"
|
2019-09-17 13:22:17 +02:00
|
|
|
#include "aco_ir.h"
|
2021-06-09 10:14:54 +02:00
|
|
|
|
2021-06-10 11:33:15 +02:00
|
|
|
#include "common/sid.h"
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 15:40:03 +02:00
|
|
|
#include <map>
|
|
|
|
|
#include <stack>
|
|
|
|
|
#include <vector>
|
2024-03-27 16:38:25 +00:00
|
|
|
#include <optional>
|
2021-06-09 15:40:03 +02:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
namespace aco {
|
|
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* The general idea of this pass is:
|
2019-10-21 21:36:41 +01:00
|
|
|
* The CFG is traversed in reverse postorder (forward) and loops are processed
|
|
|
|
|
* several times until no progress is made.
|
|
|
|
|
* Per BB two wait_ctx is maintained: an in-context and out-context.
|
2019-09-17 13:22:17 +02:00
|
|
|
* The in-context is the joined out-contexts of the predecessors.
|
|
|
|
|
* The context contains a map: gpr -> wait_entry
|
|
|
|
|
* consisting of the information about the cnt values to be waited for.
|
|
|
|
|
* Note: After merge-nodes, it might occur that for the same register
|
|
|
|
|
* multiple cnt values are to be waited for.
|
|
|
|
|
*
|
|
|
|
|
* The values are updated according to the encountered instructions:
|
|
|
|
|
* - additional events increment the counter of waits of the same type
|
|
|
|
|
* - or erase gprs with counters higher than to be waited for.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
|
|
|
|
|
// when there is a load followed by a use of a previous load
|
|
|
|
|
|
|
|
|
|
/* Instructions of the same event will finish in-order except for smem
|
|
|
|
|
* and maybe flat. Instructions of different events may not finish in-order. */
|
2024-05-03 12:04:58 +01:00
|
|
|
enum wait_event : uint32_t {
|
2019-09-17 13:22:17 +02:00
|
|
|
event_smem = 1 << 0,
|
|
|
|
|
event_lds = 1 << 1,
|
|
|
|
|
event_gds = 1 << 2,
|
|
|
|
|
event_vmem = 1 << 3,
|
|
|
|
|
event_vmem_store = 1 << 4, /* GFX10+ */
|
|
|
|
|
event_flat = 1 << 5,
|
|
|
|
|
event_exp_pos = 1 << 6,
|
|
|
|
|
event_exp_param = 1 << 7,
|
|
|
|
|
event_exp_mrt_null = 1 << 8,
|
|
|
|
|
event_gds_gpr_lock = 1 << 9,
|
|
|
|
|
event_vmem_gpr_lock = 1 << 10,
|
2019-10-14 17:21:04 +01:00
|
|
|
event_sendmsg = 1 << 11,
|
2022-06-17 13:53:08 +01:00
|
|
|
event_ldsdir = 1 << 12,
|
2024-05-03 12:04:58 +01:00
|
|
|
event_vmem_sample = 1 << 13, /* GFX12+ */
|
|
|
|
|
event_vmem_bvh = 1 << 14, /* GFX12+ */
|
|
|
|
|
event_valu = 1 << 15,
|
|
|
|
|
event_trans = 1 << 16,
|
|
|
|
|
event_salu = 1 << 17,
|
|
|
|
|
num_events = 18,
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
enum counter_type : uint8_t {
|
2024-05-03 11:19:55 +01:00
|
|
|
counter_exp = 1 << wait_type_exp,
|
|
|
|
|
counter_lgkm = 1 << wait_type_lgkm,
|
|
|
|
|
counter_vm = 1 << wait_type_vm,
|
|
|
|
|
counter_vs = 1 << wait_type_vs,
|
2024-05-03 12:04:58 +01:00
|
|
|
counter_sample = 1 << wait_type_sample,
|
|
|
|
|
counter_bvh = 1 << wait_type_bvh,
|
|
|
|
|
counter_km = 1 << wait_type_km,
|
2024-05-03 11:19:55 +01:00
|
|
|
counter_alu = 1 << wait_type_num,
|
|
|
|
|
num_counters = wait_type_num + 1,
|
2024-05-03 11:19:55 +01:00
|
|
|
wait_counters = BITFIELD_MASK(wait_type_num),
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
|
|
|
|
|
* wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
|
|
|
|
|
* that we should switch to a different wave and contains info on dependencies as to
|
|
|
|
|
* when we can switch back.
|
|
|
|
|
*
|
|
|
|
|
* This seems to apply only for ALU->ALU dependencies as other instructions have better
|
|
|
|
|
* integration with the frontend.
|
|
|
|
|
*
|
|
|
|
|
* Note that if we do not emit s_delay_alu things will still be correct, but the wave
|
|
|
|
|
* will stall in the ALU (and the ALU will be doing nothing else). We'll use this as
|
|
|
|
|
* I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU
|
|
|
|
|
* instructions can take a different number of cycles based on the exec mask)
|
|
|
|
|
*/
|
|
|
|
|
struct alu_delay_info {
|
|
|
|
|
/* These are the values directly above the max representable value, i.e. the wait
|
|
|
|
|
* would turn into a no-op when we try to wait for something further back than
|
|
|
|
|
* this.
|
|
|
|
|
*/
|
|
|
|
|
static constexpr int8_t valu_nop = 5;
|
|
|
|
|
static constexpr int8_t trans_nop = 4;
|
|
|
|
|
|
|
|
|
|
/* How many VALU instructions ago this value was written */
|
|
|
|
|
int8_t valu_instrs = valu_nop;
|
|
|
|
|
/* Cycles until the writing VALU instruction is finished */
|
|
|
|
|
int8_t valu_cycles = 0;
|
|
|
|
|
|
|
|
|
|
/* How many Transcedent instructions ago this value was written */
|
|
|
|
|
int8_t trans_instrs = trans_nop;
|
|
|
|
|
/* Cycles until the writing Transcendent instruction is finished */
|
|
|
|
|
int8_t trans_cycles = 0;
|
|
|
|
|
|
|
|
|
|
/* Cycles until the writing SALU instruction is finished*/
|
|
|
|
|
int8_t salu_cycles = 0;
|
|
|
|
|
|
|
|
|
|
bool combine(const alu_delay_info& other)
|
|
|
|
|
{
|
|
|
|
|
bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
|
|
|
|
|
other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
|
|
|
|
|
other.trans_cycles > trans_cycles;
|
|
|
|
|
valu_instrs = std::min(valu_instrs, other.valu_instrs);
|
|
|
|
|
trans_instrs = std::min(trans_instrs, other.trans_instrs);
|
|
|
|
|
salu_cycles = std::max(salu_cycles, other.salu_cycles);
|
|
|
|
|
valu_cycles = std::max(valu_cycles, other.valu_cycles);
|
|
|
|
|
trans_cycles = std::max(trans_cycles, other.trans_cycles);
|
|
|
|
|
return changed;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Needs to be called after any change to keep the data consistent. */
|
|
|
|
|
void fixup()
|
|
|
|
|
{
|
|
|
|
|
if (valu_instrs >= valu_nop || valu_cycles <= 0) {
|
|
|
|
|
valu_instrs = valu_nop;
|
|
|
|
|
valu_cycles = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (trans_instrs >= trans_nop || trans_cycles <= 0) {
|
|
|
|
|
trans_instrs = trans_nop;
|
|
|
|
|
trans_cycles = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
salu_cycles = std::max<int8_t>(salu_cycles, 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Returns true if a wait would be a no-op */
|
|
|
|
|
bool empty() const
|
|
|
|
|
{
|
|
|
|
|
return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
|
|
|
|
|
}
|
2023-09-25 12:29:26 +01:00
|
|
|
|
|
|
|
|
UNUSED void print(FILE* output) const
|
|
|
|
|
{
|
|
|
|
|
if (valu_instrs != valu_nop)
|
|
|
|
|
fprintf(output, "valu_instrs: %u\n", valu_instrs);
|
|
|
|
|
if (valu_cycles)
|
|
|
|
|
fprintf(output, "valu_cycles: %u\n", valu_cycles);
|
|
|
|
|
if (trans_instrs != trans_nop)
|
|
|
|
|
fprintf(output, "trans_instrs: %u\n", trans_instrs);
|
|
|
|
|
if (trans_cycles)
|
|
|
|
|
fprintf(output, "trans_cycles: %u\n", trans_cycles);
|
|
|
|
|
if (salu_cycles)
|
|
|
|
|
fprintf(output, "salu_cycles: %u\n", salu_cycles);
|
|
|
|
|
}
|
2022-11-13 18:15:28 +00:00
|
|
|
};
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
struct wait_entry {
|
|
|
|
|
wait_imm imm;
|
2022-11-13 18:15:28 +00:00
|
|
|
alu_delay_info delay;
|
2024-05-03 12:04:58 +01:00
|
|
|
uint32_t events; /* use wait_event notion */
|
2019-09-17 13:22:17 +02:00
|
|
|
uint8_t counters; /* use counter_type notion */
|
|
|
|
|
bool wait_on_read : 1;
|
|
|
|
|
bool logical : 1;
|
2024-05-03 12:04:58 +01:00
|
|
|
uint8_t vmem_types : 4; /* use vmem_type notion. for counter_vm. */
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, uint8_t counters_,
|
|
|
|
|
bool logical_, bool wait_on_read_)
|
|
|
|
|
: imm(imm_), delay(delay_), events(event_), counters(counters_), wait_on_read(wait_on_read_),
|
|
|
|
|
logical(logical_), vmem_types(0)
|
2020-05-07 14:27:42 +01:00
|
|
|
{}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
bool join(const wait_entry& other)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-10-21 21:36:41 +01:00
|
|
|
bool changed = (other.events & ~events) || (other.counters & ~counters) ||
|
2023-05-11 17:08:39 +01:00
|
|
|
(other.wait_on_read && !wait_on_read) || (other.vmem_types & !vmem_types) ||
|
|
|
|
|
(!other.logical && logical);
|
2019-09-17 13:22:17 +02:00
|
|
|
events |= other.events;
|
|
|
|
|
counters |= other.counters;
|
2019-10-21 21:36:41 +01:00
|
|
|
changed |= imm.combine(other.imm);
|
2022-11-13 18:15:28 +00:00
|
|
|
changed |= delay.combine(other.delay);
|
2020-05-07 14:27:42 +01:00
|
|
|
wait_on_read |= other.wait_on_read;
|
2022-05-25 17:21:10 +01:00
|
|
|
vmem_types |= other.vmem_types;
|
2023-05-11 17:08:39 +01:00
|
|
|
logical &= other.logical;
|
2019-10-21 21:36:41 +01:00
|
|
|
return changed;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
void remove_alu_counter()
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
counters &= ~counter_alu;
|
|
|
|
|
delay = alu_delay_info();
|
|
|
|
|
events &= ~(event_valu | event_trans | event_salu);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
void remove_wait(wait_type type, uint32_t type_events)
|
|
|
|
|
{
|
|
|
|
|
counters &= ~(1 << type);
|
|
|
|
|
imm[type] = wait_imm::unset_counter;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
events &= ~type_events | event_flat;
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!(counters & counter_lgkm) && !(counters & counter_vm))
|
2024-05-03 11:19:55 +01:00
|
|
|
events &= ~(type_events & event_flat);
|
2022-11-13 18:15:28 +00:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
if (type == wait_type_vm)
|
|
|
|
|
vmem_types = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2023-09-25 12:29:26 +01:00
|
|
|
|
|
|
|
|
UNUSED void print(FILE* output) const
|
|
|
|
|
{
|
|
|
|
|
fprintf(output, "logical: %u\n", logical);
|
|
|
|
|
imm.print(output);
|
|
|
|
|
delay.print(output);
|
|
|
|
|
if (events)
|
|
|
|
|
fprintf(output, "events: %u\n", events);
|
|
|
|
|
if (counters)
|
|
|
|
|
fprintf(output, "counters: %u\n", counters);
|
|
|
|
|
if (!wait_on_read)
|
|
|
|
|
fprintf(output, "wait_on_read: %u\n", wait_on_read);
|
|
|
|
|
if (!logical)
|
|
|
|
|
fprintf(output, "logical: %u\n", logical);
|
|
|
|
|
if (vmem_types)
|
|
|
|
|
fprintf(output, "vmem_types: %u\n", vmem_types);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
struct target_info {
|
2024-05-03 11:19:57 +01:00
|
|
|
wait_imm max_cnt;
|
2024-05-03 11:19:55 +01:00
|
|
|
uint32_t events[wait_type_num] = {};
|
|
|
|
|
uint16_t unordered_events;
|
|
|
|
|
|
|
|
|
|
target_info(enum amd_gfx_level gfx_level)
|
|
|
|
|
{
|
2024-05-03 11:19:57 +01:00
|
|
|
max_cnt = wait_imm::max(gfx_level);
|
|
|
|
|
for (unsigned i = 0; i < wait_type_num; i++)
|
|
|
|
|
max_cnt[i] = max_cnt[i] ? max_cnt[i] - 1 : 0;
|
2024-05-03 11:19:55 +01:00
|
|
|
|
|
|
|
|
events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null |
|
|
|
|
|
event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
|
|
|
|
|
events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
|
|
|
|
|
events[wait_type_vm] = event_vmem | event_flat;
|
|
|
|
|
events[wait_type_vs] = event_vmem_store;
|
2024-05-03 12:04:58 +01:00
|
|
|
if (gfx_level >= GFX12) {
|
|
|
|
|
events[wait_type_sample] = event_vmem_sample;
|
|
|
|
|
events[wait_type_bvh] = event_vmem_bvh;
|
|
|
|
|
events[wait_type_km] = event_smem | event_sendmsg;
|
|
|
|
|
events[wait_type_lgkm] &= ~events[wait_type_km];
|
|
|
|
|
}
|
2024-05-03 11:19:55 +01:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
u_foreach_bit (j, events[i])
|
|
|
|
|
counters[j] |= (1 << i);
|
|
|
|
|
}
|
|
|
|
|
counters[ffs(event_valu) - 1] |= counter_alu;
|
|
|
|
|
counters[ffs(event_trans) - 1] |= counter_alu;
|
|
|
|
|
counters[ffs(event_salu) - 1] |= counter_alu;
|
|
|
|
|
|
|
|
|
|
unordered_events = event_smem | (gfx_level < GFX10 ? event_flat : 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint8_t get_counters_for_event(uint16_t event) const { return counters[ffs(event) - 1]; }
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
/* Bitfields of counters affected by each event */
|
|
|
|
|
uint8_t counters[num_events] = {};
|
|
|
|
|
};
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
struct wait_ctx {
|
|
|
|
|
Program* program;
|
2022-05-12 02:50:17 -04:00
|
|
|
enum amd_gfx_level gfx_level;
|
2024-05-03 11:19:55 +01:00
|
|
|
const target_info* info;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
uint32_t nonzero = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
bool pending_flat_lgkm = false;
|
|
|
|
|
bool pending_flat_vm = false;
|
2019-10-14 15:18:31 +02:00
|
|
|
bool pending_s_buffer_store = false; /* GFX10 workaround */
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
wait_imm barrier_imm[storage_count];
|
|
|
|
|
uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
std::map<PhysReg, wait_entry> gpr_map;
|
|
|
|
|
|
|
|
|
|
wait_ctx() {}
|
2024-05-03 11:19:55 +01:00
|
|
|
wait_ctx(Program* program_, const target_info* info_)
|
|
|
|
|
: program(program_), gfx_level(program_->gfx_level), info(info_)
|
2021-02-03 14:34:09 +00:00
|
|
|
{}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
bool join(const wait_ctx* other, bool logical)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
bool changed = (other->pending_flat_lgkm && !pending_flat_lgkm) ||
|
|
|
|
|
(other->pending_flat_vm && !pending_flat_vm) || (~nonzero & other->nonzero);
|
|
|
|
|
|
|
|
|
|
nonzero |= other->nonzero;
|
2019-09-17 13:22:17 +02:00
|
|
|
pending_flat_lgkm |= other->pending_flat_lgkm;
|
|
|
|
|
pending_flat_vm |= other->pending_flat_vm;
|
2019-10-14 15:18:31 +02:00
|
|
|
pending_s_buffer_store |= other->pending_s_buffer_store;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-10-29 11:03:07 -07:00
|
|
|
for (const auto& entry : other->gpr_map) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (entry.second.logical != logical)
|
|
|
|
|
continue;
|
|
|
|
|
|
2020-10-22 20:40:04 -07:00
|
|
|
using iterator = std::map<PhysReg, wait_entry>::iterator;
|
|
|
|
|
const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
|
|
|
|
|
if (insert_pair.second) {
|
2019-10-21 21:36:41 +01:00
|
|
|
changed = true;
|
2020-10-22 20:40:04 -07:00
|
|
|
} else {
|
|
|
|
|
changed |= insert_pair.first->second.join(entry.second);
|
2019-10-21 21:36:41 +01:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
for (unsigned i = 0; i < storage_count; i++) {
|
2019-10-21 21:36:41 +01:00
|
|
|
changed |= barrier_imm[i].combine(other->barrier_imm[i]);
|
2020-11-26 21:31:30 -08:00
|
|
|
changed |= (other->barrier_events[i] & ~barrier_events[i]) != 0;
|
2020-02-11 16:52:20 +00:00
|
|
|
barrier_events[i] |= other->barrier_events[i];
|
|
|
|
|
}
|
2019-10-21 21:36:41 +01:00
|
|
|
|
|
|
|
|
return changed;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-12-04 14:41:18 +00:00
|
|
|
|
2023-09-25 12:29:26 +01:00
|
|
|
UNUSED void print(FILE* output) const
|
|
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
for (unsigned i = 0; i < wait_type_num; i++)
|
|
|
|
|
fprintf(output, "nonzero[%u]: %u\n", i, nonzero & (1 << i) ? 1 : 0);
|
2023-09-25 12:29:26 +01:00
|
|
|
fprintf(output, "pending_flat_lgkm: %u\n", pending_flat_lgkm);
|
|
|
|
|
fprintf(output, "pending_flat_vm: %u\n", pending_flat_vm);
|
|
|
|
|
for (const auto& entry : gpr_map) {
|
|
|
|
|
fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
|
|
|
|
|
entry.first.reg() & 0xff);
|
|
|
|
|
entry.second.print(output);
|
|
|
|
|
fprintf(output, "}\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < storage_count; i++) {
|
|
|
|
|
if (!barrier_imm[i].empty() || barrier_events[i]) {
|
|
|
|
|
fprintf(output, "barriers[%u] = {\n", i);
|
|
|
|
|
barrier_imm[i].print(output);
|
|
|
|
|
fprintf(output, "events: %u\n", barrier_events[i]);
|
|
|
|
|
fprintf(output, "}\n");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
2024-05-03 12:04:58 +01:00
|
|
|
wait_event
|
|
|
|
|
get_vmem_event(wait_ctx& ctx, Instruction* instr, uint8_t type)
|
|
|
|
|
{
|
|
|
|
|
if (instr->definitions.empty() && ctx.gfx_level >= GFX10)
|
|
|
|
|
return event_vmem_store;
|
|
|
|
|
wait_event ev = event_vmem;
|
|
|
|
|
if (ctx.gfx_level >= GFX12 && type != vmem_nosampler)
|
|
|
|
|
ev = type == vmem_bvh ? event_vmem_bvh : event_vmem_sample;
|
|
|
|
|
return ev;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
void
|
2022-11-13 18:15:28 +00:00
|
|
|
check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
for (const Operand op : instr->operands) {
|
|
|
|
|
if (op.isConstant() || op.isUndefined())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* check consecutively read gprs */
|
|
|
|
|
for (unsigned j = 0; j < op.size(); j++) {
|
|
|
|
|
PhysReg reg{op.physReg() + j};
|
|
|
|
|
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
|
|
|
|
|
if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
wait.combine(it->second.imm);
|
2023-02-03 13:08:14 +01:00
|
|
|
if (instr->isVALU() || instr->isSALU())
|
2022-11-13 18:15:28 +00:00
|
|
|
delay.combine(it->second.delay);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (const Definition& def : instr->definitions) {
|
|
|
|
|
/* check consecutively written gprs */
|
|
|
|
|
for (unsigned j = 0; j < def.getTemp().size(); j++) {
|
|
|
|
|
PhysReg reg{def.physReg() + j};
|
|
|
|
|
|
|
|
|
|
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
|
|
|
|
|
if (it == ctx.gpr_map.end())
|
|
|
|
|
continue;
|
|
|
|
|
|
2024-04-24 16:57:10 +01:00
|
|
|
wait_imm reg_imm = it->second.imm;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* Vector Memory reads and writes return in the order they were issued */
|
2024-05-03 12:04:58 +01:00
|
|
|
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
|
2024-05-03 11:19:55 +01:00
|
|
|
if (vmem_type) {
|
2024-05-03 12:04:58 +01:00
|
|
|
uint32_t event = get_vmem_event(ctx, instr, vmem_type);
|
|
|
|
|
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
|
|
|
|
|
if ((it->second.events & ctx.info->events[type]) == event &&
|
2024-05-03 11:19:55 +01:00
|
|
|
(type != wait_type_vm || it->second.vmem_types == vmem_type))
|
|
|
|
|
reg_imm[type] = wait_imm::unset_counter;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* LDS reads and writes return in the order they were issued. same for GDS */
|
2024-05-03 11:19:55 +01:00
|
|
|
if (instr->isDS() && (it->second.events & ctx.info->events[wait_type_lgkm]) ==
|
|
|
|
|
(instr->ds().gds ? event_gds : event_lds))
|
2024-04-24 16:57:10 +01:00
|
|
|
reg_imm.lgkm = wait_imm::unset_counter;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-04-24 16:57:10 +01:00
|
|
|
wait.combine(reg_imm);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
bool
|
|
|
|
|
parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (instr->opcode != aco_opcode::s_delay_alu)
|
|
|
|
|
return false;
|
|
|
|
|
|
2024-03-19 15:46:56 +01:00
|
|
|
unsigned imm[2] = {instr->salu().imm & 0xf, (instr->salu().imm >> 7) & 0xf};
|
2022-11-13 18:15:28 +00:00
|
|
|
for (unsigned i = 0; i < 2; ++i) {
|
|
|
|
|
alu_delay_wait wait = (alu_delay_wait)imm[i];
|
|
|
|
|
if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4)
|
|
|
|
|
delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1;
|
|
|
|
|
else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3)
|
|
|
|
|
delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1;
|
|
|
|
|
else if (wait >= alu_delay_wait::SALU_CYCLE_1)
|
|
|
|
|
delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1;
|
|
|
|
|
}
|
2023-01-03 18:14:16 +00:00
|
|
|
|
2023-05-22 17:44:32 +01:00
|
|
|
delay.valu_cycles = instr->pass_flags & 0xffff;
|
|
|
|
|
delay.trans_cycles = instr->pass_flags >> 16;
|
2023-01-03 18:14:16 +00:00
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
void
|
|
|
|
|
perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
|
2020-06-26 15:54:22 +01:00
|
|
|
{
|
|
|
|
|
sync_scope subgroup_scope =
|
|
|
|
|
ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
|
2020-05-13 16:05:46 +01:00
|
|
|
if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
|
2020-06-26 15:54:22 +01:00
|
|
|
unsigned storage = sync.storage;
|
|
|
|
|
while (storage) {
|
|
|
|
|
unsigned idx = u_bit_scan(&storage);
|
|
|
|
|
|
|
|
|
|
/* LDS is private to the workgroup */
|
|
|
|
|
sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);
|
|
|
|
|
|
|
|
|
|
uint16_t events = ctx.barrier_events[idx];
|
|
|
|
|
if (bar_scope_lds <= subgroup_scope)
|
|
|
|
|
events &= ~event_lds;
|
|
|
|
|
|
2021-01-28 11:07:26 +00:00
|
|
|
/* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
|
|
|
|
|
* in-order for the same workgroup */
|
|
|
|
|
if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
|
2020-05-13 16:05:46 +01:00
|
|
|
events &= ~(event_vmem | event_vmem_store | event_smem);
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
if (events)
|
|
|
|
|
imm.combine(ctx.barrier_imm[idx]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-19 10:40:35 +02:00
|
|
|
void
|
|
|
|
|
force_waitcnt(wait_ctx& ctx, wait_imm& imm)
|
|
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
u_foreach_bit (i, ctx.nonzero)
|
|
|
|
|
imm[i] = 0;
|
2020-08-19 10:40:35 +02:00
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
void
|
2022-11-13 18:15:28 +00:00
|
|
|
update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
|
|
|
|
|
{
|
aco: fix update_alu(clear=true) for exports
For:
v_mov_b32_e32 v0, 1.0
exp mrtz v0, off, off, off
we should completely remove the ALU entry before creating the EXP's WaR entry for v0.
Otherwise, the two will be combined into an entry which will wait for
expcnt(0) for later uses of v0.
gen_alu() should also be before gen(), since gen_alu() performs the clear
while gen() creates the WaR entry.
fossil-db (gfx1100):
Totals from 3589 (2.69% of 133428) affected shaders:
Instrs: 5591041 -> 5589047 (-0.04%); split: -0.04%, +0.00%
CodeSize: 28580840 -> 28572864 (-0.03%); split: -0.03%, +0.00%
Latency: 65427923 -> 65427543 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 11109079 -> 11109065 (-0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23213>
2023-05-22 16:32:00 +01:00
|
|
|
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
|
|
|
|
|
while (it != ctx.gpr_map.end()) {
|
|
|
|
|
wait_entry& entry = it->second;
|
2022-11-13 18:15:28 +00:00
|
|
|
|
|
|
|
|
if (clear) {
|
2024-05-03 11:19:55 +01:00
|
|
|
entry.remove_alu_counter();
|
2022-11-13 18:15:28 +00:00
|
|
|
} else {
|
|
|
|
|
entry.delay.valu_instrs += is_valu ? 1 : 0;
|
|
|
|
|
entry.delay.trans_instrs += is_trans ? 1 : 0;
|
|
|
|
|
entry.delay.salu_cycles -= cycles;
|
|
|
|
|
entry.delay.valu_cycles -= cycles;
|
|
|
|
|
entry.delay.trans_cycles -= cycles;
|
|
|
|
|
|
|
|
|
|
entry.delay.fixup();
|
aco: fix update_alu(clear=true) for exports
For:
v_mov_b32_e32 v0, 1.0
exp mrtz v0, off, off, off
we should completely remove the ALU entry before creating the EXP's WaR entry for v0.
Otherwise, the two will be combined into an entry which will wait for
expcnt(0) for later uses of v0.
gen_alu() should also be before gen(), since gen_alu() performs the clear
while gen() creates the WaR entry.
fossil-db (gfx1100):
Totals from 3589 (2.69% of 133428) affected shaders:
Instrs: 5591041 -> 5589047 (-0.04%); split: -0.04%, +0.00%
CodeSize: 28580840 -> 28572864 (-0.03%); split: -0.03%, +0.00%
Latency: 65427923 -> 65427543 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 11109079 -> 11109065 (-0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23213>
2023-05-22 16:32:00 +01:00
|
|
|
if (it->second.delay.empty())
|
2024-05-03 11:19:55 +01:00
|
|
|
entry.remove_alu_counter();
|
2022-11-13 18:15:28 +00:00
|
|
|
}
|
aco: fix update_alu(clear=true) for exports
For:
v_mov_b32_e32 v0, 1.0
exp mrtz v0, off, off, off
we should completely remove the ALU entry before creating the EXP's WaR entry for v0.
Otherwise, the two will be combined into an entry which will wait for
expcnt(0) for later uses of v0.
gen_alu() should also be before gen(), since gen_alu() performs the clear
while gen() creates the WaR entry.
fossil-db (gfx1100):
Totals from 3589 (2.69% of 133428) affected shaders:
Instrs: 5591041 -> 5589047 (-0.04%); split: -0.04%, +0.00%
CodeSize: 28580840 -> 28572864 (-0.03%); split: -0.03%, +0.00%
Latency: 65427923 -> 65427543 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 11109079 -> 11109065 (-0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23213>
2023-05-22 16:32:00 +01:00
|
|
|
|
|
|
|
|
if (!entry.counters)
|
|
|
|
|
it = ctx.gpr_map.erase(it);
|
|
|
|
|
else
|
|
|
|
|
it++;
|
2022-11-13 18:15:28 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
|
|
|
|
|
memory_sync_info sync_info)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2022-06-24 14:17:29 +02:00
|
|
|
if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
|
2020-08-19 10:40:35 +02:00
|
|
|
/* Force emitting waitcnt states right after the instruction if there is
|
2022-06-24 14:17:29 +02:00
|
|
|
* something to wait for. This is also applied for s_setpc_b64 to ensure
|
|
|
|
|
* waitcnt states are inserted before jumping to the PS epilog.
|
2020-08-19 10:40:35 +02:00
|
|
|
*/
|
2022-06-23 16:30:17 +01:00
|
|
|
force_waitcnt(ctx, imm);
|
2020-08-19 10:40:35 +02:00
|
|
|
}
|
|
|
|
|
|
2023-04-06 23:09:35 +03:00
|
|
|
/* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
|
|
|
|
|
* overlapping waves proceed into the ordered section.
|
|
|
|
|
*/
|
|
|
|
|
if (ctx.program->has_pops_overlapped_waves_wait &&
|
|
|
|
|
(ctx.gfx_level >= GFX11 ? instr->isEXP() && instr->exp().done
|
|
|
|
|
: (instr->opcode == aco_opcode::s_sendmsg &&
|
2024-03-19 15:46:56 +01:00
|
|
|
instr->salu().imm == sendmsg_ordered_ps_done))) {
|
2024-05-03 11:19:55 +01:00
|
|
|
uint8_t c = counter_vm | counter_vs;
|
2023-04-06 23:09:35 +03:00
|
|
|
/* Await SMEM loads too, as it's possible for an application to create them, like using a
|
|
|
|
|
* scalarization loop - pointless and unoptimal for an inherently divergent address of
|
|
|
|
|
* per-pixel data, but still can be done at least synthetically and must be handled correctly.
|
|
|
|
|
*/
|
2024-05-03 11:19:55 +01:00
|
|
|
if (ctx.program->has_smem_buffer_or_global_loads)
|
|
|
|
|
c |= counter_lgkm;
|
|
|
|
|
|
|
|
|
|
u_foreach_bit (i, c & ctx.nonzero)
|
|
|
|
|
imm[i] = 0;
|
2023-04-06 23:09:35 +03:00
|
|
|
}
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
check_instr(ctx, imm, delay, instr);
|
2019-10-21 21:36:41 +01:00
|
|
|
|
2020-02-07 16:33:35 +01:00
|
|
|
/* It's required to wait for scalar stores before "writing back" data.
|
|
|
|
|
* It shouldn't cost anything anyways since we're about to do s_endpgm.
|
|
|
|
|
*/
|
2024-05-03 11:19:55 +01:00
|
|
|
if ((ctx.nonzero & BITFIELD_BIT(wait_type_lgkm)) && instr->opcode == aco_opcode::s_dcache_wb) {
|
2022-05-12 02:50:17 -04:00
|
|
|
assert(ctx.gfx_level >= GFX8);
|
2020-02-07 16:33:35 +01:00
|
|
|
imm.lgkm = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.gfx_level >= GFX10 && instr->isSMEM()) {
|
2019-10-14 15:18:31 +02:00
|
|
|
/* GFX10: A store followed by a load at the same address causes a problem because
|
|
|
|
|
* the load doesn't load the correct values unless we wait for the store first.
|
|
|
|
|
* This is NOT mitigated by an s_nop.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Refine this when we have proper alias analysis.
|
|
|
|
|
*/
|
2021-01-21 16:13:34 +00:00
|
|
|
if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() &&
|
|
|
|
|
!instr->smem().sync.can_reorder()) {
|
2019-10-14 15:18:31 +02:00
|
|
|
imm.lgkm = 0;
|
|
|
|
|
}
|
2019-09-17 19:59:17 +02:00
|
|
|
}
|
|
|
|
|
|
2022-10-27 12:49:09 +01:00
|
|
|
if (instr->opcode == aco_opcode::ds_ordered_count &&
|
|
|
|
|
((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
|
|
|
|
|
imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
if (instr->opcode == aco_opcode::p_barrier)
|
2021-07-14 13:49:20 +02:00
|
|
|
perform_barrier(ctx, imm, instr->barrier().sync, semantic_acqrel);
|
2020-06-26 15:54:22 +01:00
|
|
|
else
|
2021-07-14 13:49:20 +02:00
|
|
|
perform_barrier(ctx, imm, sync_info, semantic_release);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
if (!imm.empty() || !delay.empty()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
|
|
|
|
|
imm.vm = 0;
|
|
|
|
|
if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
|
|
|
|
|
imm.lgkm = 0;
|
|
|
|
|
|
|
|
|
|
/* reset counters */
|
2024-05-03 11:19:55 +01:00
|
|
|
for (unsigned i = 0; i < wait_type_num; i++)
|
|
|
|
|
ctx.nonzero &= imm[i] == 0 ? ~BITFIELD_BIT(i) : UINT32_MAX;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* update barrier wait imms */
|
2020-06-26 15:54:22 +01:00
|
|
|
for (unsigned i = 0; i < storage_count; i++) {
|
2019-09-17 13:22:17 +02:00
|
|
|
wait_imm& bar = ctx.barrier_imm[i];
|
2020-02-11 16:52:20 +00:00
|
|
|
uint16_t& bar_ev = ctx.barrier_events[i];
|
2024-05-03 11:19:55 +01:00
|
|
|
for (unsigned j = 0; j < wait_type_num; j++) {
|
|
|
|
|
if (bar[j] != wait_imm::unset_counter && imm[j] <= bar[j]) {
|
|
|
|
|
bar[j] = wait_imm::unset_counter;
|
|
|
|
|
bar_ev &= ~ctx.info->events[j] | event_flat;
|
|
|
|
|
}
|
2020-02-11 16:52:20 +00:00
|
|
|
}
|
|
|
|
|
if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
|
|
|
|
|
bar_ev &= ~event_flat;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
if (ctx.program->gfx_level >= GFX11) {
|
|
|
|
|
update_alu(ctx, false, false, false,
|
|
|
|
|
MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-12 15:28:49 +01:00
|
|
|
/* remove all gprs with higher counter from map */
|
2019-09-17 13:22:17 +02:00
|
|
|
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
|
|
|
|
|
while (it != ctx.gpr_map.end()) {
|
2024-05-03 11:19:55 +01:00
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
if (imm[i] != wait_imm::unset_counter && imm[i] <= it->second.imm[i])
|
|
|
|
|
it->second.remove_wait((wait_type)i, ctx.info->events[i]);
|
|
|
|
|
}
|
2022-11-13 18:15:28 +00:00
|
|
|
if (delay.valu_instrs <= it->second.delay.valu_instrs)
|
|
|
|
|
it->second.delay.valu_instrs = alu_delay_info::valu_nop;
|
|
|
|
|
if (delay.trans_instrs <= it->second.delay.trans_instrs)
|
|
|
|
|
it->second.delay.trans_instrs = alu_delay_info::trans_nop;
|
|
|
|
|
it->second.delay.fixup();
|
|
|
|
|
if (it->second.delay.empty())
|
2024-05-03 11:19:55 +01:00
|
|
|
it->second.remove_alu_counter();
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!it->second.counters)
|
|
|
|
|
it = ctx.gpr_map.erase(it);
|
|
|
|
|
else
|
|
|
|
|
it++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (imm.vm == 0)
|
|
|
|
|
ctx.pending_flat_vm = false;
|
2019-10-14 15:18:31 +02:00
|
|
|
if (imm.lgkm == 0) {
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.pending_flat_lgkm = false;
|
2019-10-14 15:18:31 +02:00
|
|
|
ctx.pending_s_buffer_store = false;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
void
|
|
|
|
|
update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2020-06-26 15:54:22 +01:00
|
|
|
for (unsigned i = 0; i < storage_count; i++) {
|
2019-09-17 13:22:17 +02:00
|
|
|
wait_imm& bar = ctx.barrier_imm[i];
|
2020-02-11 16:52:20 +00:00
|
|
|
uint16_t& bar_ev = ctx.barrier_events[i];
|
2020-06-26 15:54:22 +01:00
|
|
|
if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
|
2020-02-11 16:52:20 +00:00
|
|
|
bar_ev |= event;
|
2024-05-03 11:19:55 +01:00
|
|
|
u_foreach_bit (j, counters)
|
|
|
|
|
bar[j] = 0;
|
2024-05-03 11:19:55 +01:00
|
|
|
} else if (!(bar_ev & ctx.info->unordered_events) && !(ctx.info->unordered_events & event)) {
|
2024-05-03 11:19:55 +01:00
|
|
|
u_foreach_bit (j, counters) {
|
|
|
|
|
if (bar[j] != wait_imm::unset_counter && (bar_ev & ctx.info->events[j]) == event)
|
|
|
|
|
bar[j] = std::min<uint16_t>(bar[j] + 1, ctx.info->max_cnt[j]);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
void
|
|
|
|
|
update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
uint8_t counters = ctx.info->get_counters_for_event(event);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
ctx.nonzero |= counters;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
update_barrier_imm(ctx, counters, event, sync);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
if (ctx.info->unordered_events & event)
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (ctx.pending_flat_lgkm)
|
|
|
|
|
counters &= ~counter_lgkm;
|
|
|
|
|
if (ctx.pending_flat_vm)
|
|
|
|
|
counters &= ~counter_vm;
|
|
|
|
|
|
|
|
|
|
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
|
|
|
|
|
wait_entry& entry = e.second;
|
|
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
if (entry.events & ctx.info->unordered_events)
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
assert(entry.events);
|
|
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
u_foreach_bit (i, counters) {
|
|
|
|
|
if ((entry.events & ctx.info->events[i]) == event)
|
|
|
|
|
entry.imm[i] = std::min<uint16_t>(entry.imm[i] + 1, ctx.info->max_cnt[i]);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
void
|
|
|
|
|
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2022-05-12 02:50:17 -04:00
|
|
|
assert(ctx.gfx_level < GFX10);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
ctx.nonzero |= BITFIELD_BIT(wait_type_lgkm) | BITFIELD_BIT(wait_type_vm);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
for (std::pair<PhysReg, wait_entry> e : ctx.gpr_map) {
|
|
|
|
|
if (e.second.counters & counter_vm)
|
|
|
|
|
e.second.imm.vm = 0;
|
|
|
|
|
if (e.second.counters & counter_lgkm)
|
|
|
|
|
e.second.imm.lgkm = 0;
|
|
|
|
|
}
|
|
|
|
|
ctx.pending_flat_lgkm = true;
|
|
|
|
|
ctx.pending_flat_vm = true;
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-07 14:27:42 +01:00
|
|
|
void
|
|
|
|
|
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
|
2023-05-11 17:08:39 +01:00
|
|
|
uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
uint16_t counters = ctx.info->get_counters_for_event(event);
|
2019-09-17 13:22:17 +02:00
|
|
|
wait_imm imm;
|
2024-05-03 11:19:55 +01:00
|
|
|
u_foreach_bit (i, counters & wait_counters)
|
|
|
|
|
imm[i] = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
alu_delay_info delay;
|
|
|
|
|
if (event == event_valu) {
|
|
|
|
|
delay.valu_instrs = 0;
|
|
|
|
|
delay.valu_cycles = cycles;
|
|
|
|
|
} else if (event == event_trans) {
|
|
|
|
|
delay.trans_instrs = 0;
|
|
|
|
|
delay.trans_cycles = cycles;
|
|
|
|
|
} else if (event == event_salu) {
|
|
|
|
|
delay.salu_cycles = cycles;
|
|
|
|
|
}
|
|
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
wait_entry new_entry(event, imm, delay, counters, !rc.is_linear() && !force_linear,
|
|
|
|
|
wait_on_read);
|
2024-05-03 12:04:58 +01:00
|
|
|
if (counters & counter_vm)
|
|
|
|
|
new_entry.vmem_types |= vmem_types;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < rc.size(); i++) {
|
2020-02-07 11:55:43 +00:00
|
|
|
auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!it.second)
|
|
|
|
|
it.first->second.join(new_entry);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-07 14:27:42 +01:00
|
|
|
void
|
2022-05-25 17:21:10 +01:00
|
|
|
insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (!op.isConstant() && !op.isUndefined())
|
2022-11-13 18:15:28 +00:00
|
|
|
insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types, 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0,
|
|
|
|
|
unsigned cycles = 0)
|
|
|
|
|
{
|
2023-05-23 14:04:41 +01:00
|
|
|
/* We can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11 without
|
|
|
|
|
* waiting for the load to finish.
|
|
|
|
|
* Also, follow linear control flow for ALU because it's unlikely that the hardware does per-lane
|
|
|
|
|
* dependency checks.
|
2023-05-11 17:08:39 +01:00
|
|
|
*/
|
2023-05-23 14:04:41 +01:00
|
|
|
uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat;
|
|
|
|
|
uint32_t alu_events = event_trans | event_valu | event_salu;
|
|
|
|
|
bool force_linear = ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events));
|
2023-05-11 17:08:39 +01:00
|
|
|
|
|
|
|
|
insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles,
|
|
|
|
|
force_linear);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-05-07 14:27:42 +01:00
|
|
|
void
|
2022-11-13 18:15:28 +00:00
|
|
|
gen_alu(Instruction* instr, wait_ctx& ctx)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2022-11-13 18:15:28 +00:00
|
|
|
Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
|
2023-02-03 13:08:14 +01:00
|
|
|
bool is_valu = instr->isVALU();
|
2022-11-13 18:15:28 +00:00
|
|
|
bool is_trans = instr->isTrans();
|
|
|
|
|
bool clear = instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() ||
|
|
|
|
|
instr->isMUBUF() || instr->isMTBUF();
|
|
|
|
|
|
|
|
|
|
wait_event event = (wait_event)0;
|
|
|
|
|
if (is_trans)
|
|
|
|
|
event = event_trans;
|
|
|
|
|
else if (is_valu)
|
|
|
|
|
event = event_valu;
|
|
|
|
|
else if (instr->isSALU())
|
|
|
|
|
event = event_salu;
|
|
|
|
|
|
|
|
|
|
if (event != (wait_event)0) {
|
|
|
|
|
for (const Definition& def : instr->definitions)
|
|
|
|
|
insert_wait_entry(ctx, def, event, 0, cycle_info.latency);
|
|
|
|
|
}
|
2023-07-15 19:49:49 +02:00
|
|
|
update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans,
|
|
|
|
|
clear, cycle_info.issue_cycles);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
gen(Instruction* instr, wait_ctx& ctx)
|
|
|
|
|
{
|
|
|
|
|
switch (instr->format) {
|
|
|
|
|
case Format::EXP: {
|
2021-01-21 16:13:34 +00:00
|
|
|
Export_instruction& exp_instr = instr->exp();
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
wait_event ev;
|
2021-01-21 16:13:34 +00:00
|
|
|
if (exp_instr.dest <= 9)
|
2019-09-17 13:22:17 +02:00
|
|
|
ev = event_exp_mrt_null;
|
2021-01-21 16:13:34 +00:00
|
|
|
else if (exp_instr.dest <= 15)
|
2019-09-17 13:22:17 +02:00
|
|
|
ev = event_exp_pos;
|
|
|
|
|
else
|
|
|
|
|
ev = event_exp_param;
|
|
|
|
|
update_counters(ctx, ev);
|
|
|
|
|
|
|
|
|
|
/* insert new entries for exported vgprs */
|
|
|
|
|
for (unsigned i = 0; i < 4; i++) {
|
2021-01-21 16:13:34 +00:00
|
|
|
if (exp_instr.enabled_mask & (1 << i)) {
|
|
|
|
|
unsigned idx = exp_instr.compressed ? i >> 1 : i;
|
|
|
|
|
assert(idx < exp_instr.operands.size());
|
|
|
|
|
insert_wait_entry(ctx, exp_instr.operands[idx], ev);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
insert_wait_entry(ctx, exec, s2, ev, false);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case Format::FLAT: {
|
2021-01-21 16:13:34 +00:00
|
|
|
FLAT_instruction& flat = instr->flat();
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
|
2021-01-21 16:13:34 +00:00
|
|
|
update_counters_for_flat_load(ctx, flat.sync);
|
2019-09-17 13:22:17 +02:00
|
|
|
else
|
2021-01-21 16:13:34 +00:00
|
|
|
update_counters(ctx, event_flat, flat.sync);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (!instr->definitions.empty())
|
|
|
|
|
insert_wait_entry(ctx, instr->definitions[0], event_flat);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case Format::SMEM: {
|
2021-01-21 16:13:34 +00:00
|
|
|
SMEM_instruction& smem = instr->smem();
|
|
|
|
|
update_counters(ctx, event_smem, smem.sync);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (!instr->definitions.empty())
|
|
|
|
|
insert_wait_entry(ctx, instr->definitions[0], event_smem);
|
2022-05-12 02:50:17 -04:00
|
|
|
else if (ctx.gfx_level >= GFX10 && !smem.sync.can_reorder())
|
2019-10-14 15:18:31 +02:00
|
|
|
ctx.pending_s_buffer_store = true;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case Format::DS: {
|
2021-01-21 16:13:34 +00:00
|
|
|
DS_instruction& ds = instr->ds();
|
|
|
|
|
update_counters(ctx, ds.gds ? event_gds : event_lds, ds.sync);
|
|
|
|
|
if (ds.gds)
|
2019-09-17 13:22:17 +02:00
|
|
|
update_counters(ctx, event_gds_gpr_lock);
|
|
|
|
|
|
|
|
|
|
if (!instr->definitions.empty())
|
2021-01-21 16:13:34 +00:00
|
|
|
insert_wait_entry(ctx, instr->definitions[0], ds.gds ? event_gds : event_lds);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-01-21 16:13:34 +00:00
|
|
|
if (ds.gds) {
|
2019-09-17 13:22:17 +02:00
|
|
|
for (const Operand& op : instr->operands)
|
|
|
|
|
insert_wait_entry(ctx, op, event_gds_gpr_lock);
|
|
|
|
|
insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2022-06-17 13:53:08 +01:00
|
|
|
case Format::LDSDIR: {
|
|
|
|
|
LDSDIR_instruction& ldsdir = instr->ldsdir();
|
|
|
|
|
update_counters(ctx, event_ldsdir, ldsdir.sync);
|
|
|
|
|
insert_wait_entry(ctx, instr->definitions[0], event_ldsdir);
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
case Format::MUBUF:
|
|
|
|
|
case Format::MTBUF:
|
|
|
|
|
case Format::MIMG:
|
2022-05-19 15:55:53 +01:00
|
|
|
case Format::GLOBAL:
|
|
|
|
|
case Format::SCRATCH: {
|
2024-05-03 12:04:58 +01:00
|
|
|
uint8_t type = get_vmem_type(ctx.gfx_level, instr);
|
|
|
|
|
wait_event ev = get_vmem_event(ctx, instr, type);
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
update_counters(ctx, ev, get_sync_info(instr));
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (!instr->definitions.empty())
|
2024-05-03 12:04:58 +01:00
|
|
|
insert_wait_entry(ctx, instr->definitions[0], ev, type);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
|
2019-09-17 13:22:17 +02:00
|
|
|
update_counters(ctx, event_vmem_gpr_lock);
|
|
|
|
|
insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (ctx.gfx_level == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
|
2020-01-16 16:54:35 +01:00
|
|
|
update_counters(ctx, event_vmem_gpr_lock);
|
2021-01-14 17:46:50 +00:00
|
|
|
insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2020-01-16 16:54:35 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
}
|
2019-10-14 17:21:04 +01:00
|
|
|
case Format::SOPP: {
|
|
|
|
|
if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
|
2020-06-26 15:54:22 +01:00
|
|
|
update_counters(ctx, event_sendmsg);
|
2021-04-13 17:21:56 +02:00
|
|
|
break;
|
2019-10-14 17:21:04 +01:00
|
|
|
}
|
2022-10-24 02:14:24 +00:00
|
|
|
case Format::SOP1: {
|
|
|
|
|
if (instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_sendmsg_rtn_b64) {
|
|
|
|
|
update_counters(ctx, event_sendmsg);
|
|
|
|
|
insert_wait_entry(ctx, instr->definitions[0], event_sendmsg);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
default: break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2021-07-14 13:49:20 +02:00
|
|
|
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 12:04:58 +01:00
|
|
|
Builder bld(ctx.program, &instructions);
|
|
|
|
|
|
|
|
|
|
if (ctx.gfx_level >= GFX12) {
|
|
|
|
|
if (imm.vm != wait_imm::unset_counter && imm.lgkm != wait_imm::unset_counter) {
|
|
|
|
|
bld.sopp(aco_opcode::s_wait_loadcnt_dscnt, (imm.vm << 8) | imm.lgkm);
|
|
|
|
|
imm.vm = wait_imm::unset_counter;
|
|
|
|
|
imm.lgkm = wait_imm::unset_counter;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (imm.vs != wait_imm::unset_counter && imm.lgkm != wait_imm::unset_counter) {
|
|
|
|
|
bld.sopp(aco_opcode::s_wait_storecnt_dscnt, (imm.vs << 8) | imm.lgkm);
|
|
|
|
|
imm.vs = wait_imm::unset_counter;
|
|
|
|
|
imm.lgkm = wait_imm::unset_counter;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
aco_opcode op[wait_type_num];
|
|
|
|
|
op[wait_type_exp] = aco_opcode::s_wait_expcnt;
|
|
|
|
|
op[wait_type_lgkm] = aco_opcode::s_wait_dscnt;
|
|
|
|
|
op[wait_type_vm] = aco_opcode::s_wait_loadcnt;
|
|
|
|
|
op[wait_type_vs] = aco_opcode::s_wait_storecnt;
|
|
|
|
|
op[wait_type_sample] = aco_opcode::s_wait_samplecnt;
|
|
|
|
|
op[wait_type_bvh] = aco_opcode::s_wait_bvhcnt;
|
|
|
|
|
op[wait_type_km] = aco_opcode::s_wait_kmcnt;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
if (imm[i] != wait_imm::unset_counter)
|
|
|
|
|
bld.sopp(op[i], imm[i]);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
if (imm.vs != wait_imm::unset_counter) {
|
|
|
|
|
assert(ctx.gfx_level >= GFX10);
|
|
|
|
|
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), imm.vs);
|
|
|
|
|
imm.vs = wait_imm::unset_counter;
|
|
|
|
|
}
|
|
|
|
|
if (!imm.empty())
|
|
|
|
|
bld.sopp(aco_opcode::s_waitcnt, imm.pack(ctx.gfx_level));
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2021-07-14 13:49:20 +02:00
|
|
|
imm = wait_imm();
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
void
|
|
|
|
|
emit_delay_alu(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
|
|
|
|
|
alu_delay_info& delay)
|
|
|
|
|
{
|
|
|
|
|
uint32_t imm = 0;
|
|
|
|
|
if (delay.trans_instrs != delay.trans_nop) {
|
|
|
|
|
imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (delay.valu_instrs != delay.valu_nop) {
|
|
|
|
|
imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
|
|
|
|
|
* drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
|
|
|
|
|
* getting this wrong isn't an issue. */
|
|
|
|
|
if (delay.salu_cycles && imm <= 0xf) {
|
|
|
|
|
unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
|
|
|
|
|
imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-25 15:55:27 +01:00
|
|
|
Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
|
2024-03-25 12:05:50 +01:00
|
|
|
inst->salu().imm = imm;
|
2023-05-22 17:44:32 +01:00
|
|
|
inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16));
|
2022-11-13 18:15:28 +00:00
|
|
|
instructions.emplace_back(inst);
|
|
|
|
|
delay = alu_delay_info();
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-27 16:38:25 +00:00
|
|
|
bool
|
|
|
|
|
check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
for (Operand op : instr->operands) {
|
|
|
|
|
if (op.isConstant())
|
|
|
|
|
continue;
|
|
|
|
|
for (unsigned i = 0; i < op.size(); i++) {
|
|
|
|
|
if (regs_written[op.physReg().reg() + i])
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (Definition def : instr->definitions) {
|
|
|
|
|
for (unsigned i = 0; i < def.size(); i++)
|
|
|
|
|
regs_written[def.physReg().reg() + i] = 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
void
|
|
|
|
|
handle_block(Program* program, Block& block, wait_ctx& ctx)
|
|
|
|
|
{
|
|
|
|
|
std::vector<aco_ptr<Instruction>> new_instructions;
|
|
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
wait_imm queued_imm;
|
2022-11-13 18:15:28 +00:00
|
|
|
alu_delay_info queued_delay;
|
2019-12-04 14:41:18 +00:00
|
|
|
|
2024-03-27 16:38:25 +00:00
|
|
|
size_t clause_end = 0;
|
|
|
|
|
for (size_t i = 0; i < block.instructions.size(); i++) {
|
|
|
|
|
aco_ptr<Instruction>& instr = block.instructions[i];
|
|
|
|
|
|
2024-05-03 11:19:57 +01:00
|
|
|
bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get());
|
2022-11-13 18:15:28 +00:00
|
|
|
bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
memory_sync_info sync_info = get_sync_info(instr.get());
|
2022-11-13 18:15:28 +00:00
|
|
|
kill(queued_imm, queued_delay, instr.get(), ctx, sync_info);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-03-27 16:38:25 +00:00
|
|
|
/* At the start of a possible clause, also emit waitcnts for each instruction to avoid
|
|
|
|
|
* splitting the clause.
|
|
|
|
|
*/
|
|
|
|
|
if (i >= clause_end || !queued_imm.empty()) {
|
|
|
|
|
std::optional<std::bitset<512>> regs_written;
|
|
|
|
|
for (clause_end = i + 1; clause_end < block.instructions.size(); clause_end++) {
|
|
|
|
|
Instruction* next = block.instructions[clause_end].get();
|
|
|
|
|
if (!should_form_clause(instr.get(), next))
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
if (!regs_written) {
|
|
|
|
|
regs_written.emplace();
|
|
|
|
|
check_clause_raw(*regs_written, instr.get());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!check_clause_raw(*regs_written, next))
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
kill(queued_imm, queued_delay, next, ctx, get_sync_info(next));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
if (program->gfx_level >= GFX11)
|
|
|
|
|
gen_alu(instr.get(), ctx);
|
aco: fix update_alu(clear=true) for exports
For:
v_mov_b32_e32 v0, 1.0
exp mrtz v0, off, off, off
we should completely remove the ALU entry before creating the EXP's WaR entry for v0.
Otherwise, the two will be combined into an entry which will wait for
expcnt(0) for later uses of v0.
gen_alu() should also be before gen(), since gen_alu() performs the clear
while gen() creates the WaR entry.
fossil-db (gfx1100):
Totals from 3589 (2.69% of 133428) affected shaders:
Instrs: 5591041 -> 5589047 (-0.04%); split: -0.04%, +0.00%
CodeSize: 28580840 -> 28572864 (-0.03%); split: -0.03%, +0.00%
Latency: 65427923 -> 65427543 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 11109079 -> 11109065 (-0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23213>
2023-05-22 16:32:00 +01:00
|
|
|
gen(instr.get(), ctx);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-11-13 18:15:28 +00:00
|
|
|
if (instr->format != Format::PSEUDO_BARRIER && !is_wait && !is_delay_alu) {
|
2022-06-17 13:53:08 +01:00
|
|
|
if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {
|
|
|
|
|
instr->vinterp_inreg().wait_exp = MIN2(instr->vinterp_inreg().wait_exp, queued_imm.exp);
|
|
|
|
|
queued_imm.exp = wait_imm::unset_counter;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
if (!queued_imm.empty())
|
2019-10-21 21:36:41 +01:00
|
|
|
emit_waitcnt(ctx, new_instructions, queued_imm);
|
2022-11-13 18:15:28 +00:00
|
|
|
if (!queued_delay.empty())
|
|
|
|
|
emit_delay_alu(ctx, new_instructions, queued_delay);
|
2019-12-04 14:41:18 +00:00
|
|
|
|
2022-10-27 12:49:09 +01:00
|
|
|
bool is_ordered_count_acquire =
|
|
|
|
|
instr->opcode == aco_opcode::ds_ordered_count &&
|
|
|
|
|
!((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1);
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
new_instructions.emplace_back(std::move(instr));
|
|
|
|
|
perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);
|
2022-10-27 12:49:09 +01:00
|
|
|
|
|
|
|
|
if (is_ordered_count_acquire)
|
|
|
|
|
queued_imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
2019-10-21 21:36:41 +01:00
|
|
|
|
2023-08-19 15:36:00 +08:00
|
|
|
/* For last block of a program which has succeed shader part, wait all memory ops done
|
|
|
|
|
* before go to next shader part.
|
|
|
|
|
*/
|
|
|
|
|
if (block.kind & block_kind_end_with_regs)
|
|
|
|
|
force_waitcnt(ctx, queued_imm);
|
|
|
|
|
|
2019-11-22 19:38:51 +00:00
|
|
|
if (!queued_imm.empty())
|
|
|
|
|
emit_waitcnt(ctx, new_instructions, queued_imm);
|
2022-11-13 18:15:28 +00:00
|
|
|
if (!queued_delay.empty())
|
|
|
|
|
emit_delay_alu(ctx, new_instructions, queued_delay);
|
2019-11-22 19:38:51 +00:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
block.instructions.swap(new_instructions);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} /* end namespace */
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
insert_wait_states(Program* program)
|
|
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
target_info info(program->gfx_level);
|
|
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
/* per BB ctx */
|
|
|
|
|
std::vector<bool> done(program->blocks.size());
|
2024-05-03 11:19:55 +01:00
|
|
|
std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program, &info));
|
|
|
|
|
std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program, &info));
|
2019-12-04 14:41:18 +00:00
|
|
|
|
2021-07-10 12:20:56 +02:00
|
|
|
std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
|
2019-10-21 21:36:41 +01:00
|
|
|
unsigned loop_progress = 0;
|
|
|
|
|
|
2023-08-15 15:24:09 +08:00
|
|
|
if (program->pending_lds_access) {
|
2024-05-03 11:19:55 +01:00
|
|
|
update_barrier_imm(in_ctx[0], info.get_counters_for_event(event_lds), event_lds,
|
2023-08-15 15:24:09 +08:00
|
|
|
memory_sync_info(storage_shared));
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-02 17:30:49 -08:00
|
|
|
for (Definition def : program->args_pending_vmem) {
|
|
|
|
|
update_counters(in_ctx[0], event_vmem);
|
|
|
|
|
insert_wait_entry(in_ctx[0], def, event_vmem);
|
2021-05-17 17:56:28 +01:00
|
|
|
}
|
|
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
for (unsigned i = 0; i < program->blocks.size();) {
|
|
|
|
|
Block& current = program->blocks[i++];
|
2023-04-15 21:45:12 +03:00
|
|
|
|
|
|
|
|
if (current.kind & block_kind_discard_early_exit) {
|
|
|
|
|
/* Because the jump to the discard early exit block may happen anywhere in a block, it's
|
|
|
|
|
* not possible to join it with its predecessors this way.
|
|
|
|
|
* We emit all required waits when emitting the discard block.
|
|
|
|
|
*/
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
wait_ctx ctx = in_ctx[current.index];
|
|
|
|
|
|
|
|
|
|
if (current.kind & block_kind_loop_header) {
|
|
|
|
|
loop_header_indices.push(current.index);
|
|
|
|
|
} else if (current.kind & block_kind_loop_exit) {
|
|
|
|
|
bool repeat = false;
|
|
|
|
|
if (loop_progress == loop_header_indices.size()) {
|
|
|
|
|
i = loop_header_indices.top();
|
|
|
|
|
repeat = true;
|
|
|
|
|
}
|
|
|
|
|
loop_header_indices.pop();
|
|
|
|
|
loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
|
|
|
|
|
if (repeat)
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
bool changed = false;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned b : current.linear_preds)
|
2019-10-21 21:36:41 +01:00
|
|
|
changed |= ctx.join(&out_ctx[b], false);
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned b : current.logical_preds)
|
2019-10-21 21:36:41 +01:00
|
|
|
changed |= ctx.join(&out_ctx[b], true);
|
|
|
|
|
|
2019-12-04 14:41:18 +00:00
|
|
|
if (done[current.index] && !changed) {
|
|
|
|
|
in_ctx[current.index] = std::move(ctx);
|
2019-10-21 21:36:41 +01:00
|
|
|
continue;
|
2019-12-04 14:41:18 +00:00
|
|
|
} else {
|
|
|
|
|
in_ctx[current.index] = ctx;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
|
|
|
|
|
done[current.index] = true;
|
|
|
|
|
|
|
|
|
|
handle_block(program, current, ctx);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-12-04 14:41:18 +00:00
|
|
|
out_ctx[current.index] = std::move(ctx);
|
|
|
|
|
}
|
2023-01-03 16:35:17 +00:00
|
|
|
|
|
|
|
|
/* Combine s_delay_alu using the skip field. */
|
|
|
|
|
if (program->gfx_level >= GFX11) {
|
|
|
|
|
for (Block& block : program->blocks) {
|
|
|
|
|
int i = 0;
|
|
|
|
|
int prev_delay_alu = -1;
|
|
|
|
|
for (aco_ptr<Instruction>& instr : block.instructions) {
|
|
|
|
|
if (instr->opcode != aco_opcode::s_delay_alu) {
|
|
|
|
|
block.instructions[i++] = std::move(instr);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-19 15:46:56 +01:00
|
|
|
uint16_t imm = instr->salu().imm;
|
2023-01-03 16:35:17 +00:00
|
|
|
int skip = i - prev_delay_alu - 1;
|
|
|
|
|
if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
|
|
|
|
|
if (imm >> 7 == 0)
|
|
|
|
|
prev_delay_alu = i;
|
|
|
|
|
block.instructions[i++] = std::move(instr);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-19 15:46:56 +01:00
|
|
|
block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7);
|
2023-01-03 16:35:17 +00:00
|
|
|
prev_delay_alu = -1;
|
|
|
|
|
}
|
|
|
|
|
block.instructions.resize(i);
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace aco
|