2019-09-17 13:22:17 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2018 Valve Corporation
|
|
|
|
|
*
|
2024-04-08 09:02:30 +02:00
|
|
|
* SPDX-License-Identifier: MIT
|
2019-09-17 13:22:17 +02:00
|
|
|
*/
|
|
|
|
|
|
2023-05-24 16:24:35 +01:00
|
|
|
#include "aco_builder.h"
|
2019-09-17 13:22:17 +02:00
|
|
|
#include "aco_ir.h"
|
2021-06-09 10:14:54 +02:00
|
|
|
|
2021-06-10 11:33:15 +02:00
|
|
|
#include "common/sid.h"
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 15:40:03 +02:00
|
|
|
#include <map>
|
|
|
|
|
#include <stack>
|
|
|
|
|
#include <vector>
|
2024-03-27 16:38:25 +00:00
|
|
|
#include <optional>
|
2021-06-09 15:40:03 +02:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
namespace aco {
|
|
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* The general idea of this pass is:
|
2019-10-21 21:36:41 +01:00
|
|
|
* The CFG is traversed in reverse postorder (forward) and loops are processed
|
|
|
|
|
* several times until no progress is made.
|
|
|
|
|
* Per BB two wait_ctx is maintained: an in-context and out-context.
|
2019-09-17 13:22:17 +02:00
|
|
|
* The in-context is the joined out-contexts of the predecessors.
|
|
|
|
|
* The context contains a map: gpr -> wait_entry
|
|
|
|
|
* consisting of the information about the cnt values to be waited for.
|
|
|
|
|
* Note: After merge-nodes, it might occur that for the same register
|
|
|
|
|
* multiple cnt values are to be waited for.
|
|
|
|
|
*
|
|
|
|
|
* The values are updated according to the encountered instructions:
|
|
|
|
|
* - additional events increment the counter of waits of the same type
|
|
|
|
|
* - or erase gprs with counters higher than to be waited for.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
|
|
|
|
|
// when there is a load followed by a use of a previous load
|
|
|
|
|
|
|
|
|
|
/* Instructions of the same event will finish in-order except for smem
|
|
|
|
|
* and maybe flat. Instructions of different events may not finish in-order. */
|
2024-05-03 12:04:58 +01:00
|
|
|
enum wait_event : uint32_t {
|
2019-09-17 13:22:17 +02:00
|
|
|
event_smem = 1 << 0,
|
|
|
|
|
event_lds = 1 << 1,
|
|
|
|
|
event_gds = 1 << 2,
|
|
|
|
|
event_vmem = 1 << 3,
|
|
|
|
|
event_vmem_store = 1 << 4, /* GFX10+ */
|
2025-07-22 16:58:06 +01:00
|
|
|
event_exp_pos = 1 << 5,
|
|
|
|
|
event_exp_param = 1 << 6,
|
|
|
|
|
event_exp_mrt_null = 1 << 7,
|
|
|
|
|
event_exp_prim = 1 << 8,
|
|
|
|
|
event_exp_dual_src_blend = 1 << 9,
|
|
|
|
|
event_gds_gpr_lock = 1 << 10,
|
|
|
|
|
event_vmem_gpr_lock = 1 << 11,
|
|
|
|
|
event_sendmsg = 1 << 12,
|
2025-08-01 15:38:31 +01:00
|
|
|
event_sendmsg_rtn = 1 << 13,
|
|
|
|
|
event_ldsdir = 1 << 14,
|
|
|
|
|
event_vmem_sample = 1 << 15, /* GFX12+ */
|
|
|
|
|
event_vmem_bvh = 1 << 16, /* GFX12+ */
|
|
|
|
|
num_events = 17,
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
enum counter_type : uint8_t {
|
2024-05-03 11:19:55 +01:00
|
|
|
counter_exp = 1 << wait_type_exp,
|
|
|
|
|
counter_lgkm = 1 << wait_type_lgkm,
|
|
|
|
|
counter_vm = 1 << wait_type_vm,
|
|
|
|
|
counter_vs = 1 << wait_type_vs,
|
2024-05-03 12:04:58 +01:00
|
|
|
counter_sample = 1 << wait_type_sample,
|
|
|
|
|
counter_bvh = 1 << wait_type_bvh,
|
|
|
|
|
counter_km = 1 << wait_type_km,
|
2024-08-07 15:41:42 +01:00
|
|
|
num_counters = wait_type_num,
|
2022-11-13 18:15:28 +00:00
|
|
|
};
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
struct wait_entry {
|
|
|
|
|
wait_imm imm;
|
2024-05-03 12:04:58 +01:00
|
|
|
uint32_t events; /* use wait_event notion */
|
2025-05-02 11:05:21 +01:00
|
|
|
uint32_t logical_events; /* use wait_event notion */
|
2019-09-17 13:22:17 +02:00
|
|
|
uint8_t counters; /* use counter_type notion */
|
|
|
|
|
bool wait_on_read : 1;
|
2024-05-03 12:04:58 +01:00
|
|
|
uint8_t vmem_types : 4; /* use vmem_type notion. for counter_vm. */
|
2025-05-12 16:41:31 +01:00
|
|
|
uint8_t vm_mask : 2; /* which halves of the VGPR event_vmem uses */
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-05-13 14:25:51 +01:00
|
|
|
wait_entry(wait_event event_, wait_imm imm_, uint8_t counters_, bool wait_on_read_)
|
2025-05-02 11:05:21 +01:00
|
|
|
: imm(imm_), events(event_), logical_events(event_), counters(counters_),
|
2025-05-13 14:25:51 +01:00
|
|
|
wait_on_read(wait_on_read_), vmem_types(0), vm_mask(0)
|
2020-05-07 14:27:42 +01:00
|
|
|
{}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-05-13 14:25:51 +01:00
|
|
|
bool join(const wait_entry& other)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-10-21 21:36:41 +01:00
|
|
|
bool changed = (other.events & ~events) || (other.counters & ~counters) ||
|
2025-05-01 17:04:38 +01:00
|
|
|
(other.wait_on_read && !wait_on_read) || (other.vmem_types & ~vmem_types) ||
|
2025-05-13 14:25:51 +01:00
|
|
|
(other.vm_mask & ~vm_mask);
|
2019-09-17 13:22:17 +02:00
|
|
|
events |= other.events;
|
|
|
|
|
counters |= other.counters;
|
2019-10-21 21:36:41 +01:00
|
|
|
changed |= imm.combine(other.imm);
|
2020-05-07 14:27:42 +01:00
|
|
|
wait_on_read |= other.wait_on_read;
|
2022-05-25 17:21:10 +01:00
|
|
|
vmem_types |= other.vmem_types;
|
2025-05-12 16:41:31 +01:00
|
|
|
vm_mask |= other.vm_mask;
|
2019-10-21 21:36:41 +01:00
|
|
|
return changed;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
void remove_wait(wait_type type, uint32_t type_events)
|
|
|
|
|
{
|
|
|
|
|
counters &= ~(1 << type);
|
|
|
|
|
imm[type] = wait_imm::unset_counter;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-06-09 14:54:46 +01:00
|
|
|
events &= ~type_events;
|
2022-11-13 18:15:28 +00:00
|
|
|
|
2025-05-02 11:05:21 +01:00
|
|
|
logical_events &= events;
|
2024-05-03 11:19:55 +01:00
|
|
|
if (type == wait_type_vm)
|
|
|
|
|
vmem_types = 0;
|
2025-05-12 16:41:31 +01:00
|
|
|
if (type_events & event_vmem)
|
|
|
|
|
vm_mask = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2023-09-25 12:29:26 +01:00
|
|
|
|
|
|
|
|
UNUSED void print(FILE* output) const
|
|
|
|
|
{
|
|
|
|
|
imm.print(output);
|
|
|
|
|
if (events)
|
|
|
|
|
fprintf(output, "events: %u\n", events);
|
2025-05-02 11:05:21 +01:00
|
|
|
if (logical_events)
|
|
|
|
|
fprintf(output, "logical_events: %u\n", logical_events);
|
2023-09-25 12:29:26 +01:00
|
|
|
if (counters)
|
|
|
|
|
fprintf(output, "counters: %u\n", counters);
|
|
|
|
|
if (!wait_on_read)
|
|
|
|
|
fprintf(output, "wait_on_read: %u\n", wait_on_read);
|
|
|
|
|
if (vmem_types)
|
|
|
|
|
fprintf(output, "vmem_types: %u\n", vmem_types);
|
2025-05-12 16:41:31 +01:00
|
|
|
if (vm_mask)
|
|
|
|
|
fprintf(output, "vm_mask: %u\n", vm_mask);
|
2023-09-25 12:29:26 +01:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
struct target_info {
|
2024-05-03 11:19:57 +01:00
|
|
|
wait_imm max_cnt;
|
2024-05-03 11:19:55 +01:00
|
|
|
uint32_t events[wait_type_num] = {};
|
|
|
|
|
uint16_t unordered_events;
|
|
|
|
|
|
|
|
|
|
target_info(enum amd_gfx_level gfx_level)
|
|
|
|
|
{
|
2024-05-03 11:19:57 +01:00
|
|
|
max_cnt = wait_imm::max(gfx_level);
|
|
|
|
|
for (unsigned i = 0; i < wait_type_num; i++)
|
|
|
|
|
max_cnt[i] = max_cnt[i] ? max_cnt[i] - 1 : 0;
|
2024-05-03 11:19:55 +01:00
|
|
|
|
|
|
|
|
events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null |
|
2025-07-22 16:58:06 +01:00
|
|
|
event_exp_prim | event_exp_dual_src_blend | event_gds_gpr_lock |
|
|
|
|
|
event_vmem_gpr_lock | event_ldsdir;
|
2025-08-01 15:38:31 +01:00
|
|
|
events[wait_type_lgkm] =
|
|
|
|
|
event_smem | event_lds | event_gds | event_sendmsg | event_sendmsg_rtn;
|
2025-06-09 14:54:46 +01:00
|
|
|
events[wait_type_vm] = event_vmem;
|
2024-05-03 11:19:55 +01:00
|
|
|
events[wait_type_vs] = event_vmem_store;
|
2024-05-03 12:04:58 +01:00
|
|
|
if (gfx_level >= GFX12) {
|
|
|
|
|
events[wait_type_sample] = event_vmem_sample;
|
|
|
|
|
events[wait_type_bvh] = event_vmem_bvh;
|
2025-08-01 15:38:31 +01:00
|
|
|
events[wait_type_km] = event_smem | event_sendmsg | event_sendmsg_rtn;
|
2024-05-03 12:04:58 +01:00
|
|
|
events[wait_type_lgkm] &= ~events[wait_type_km];
|
|
|
|
|
}
|
2024-05-03 11:19:55 +01:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
u_foreach_bit (j, events[i])
|
|
|
|
|
counters[j] |= (1 << i);
|
|
|
|
|
}
|
|
|
|
|
|
2025-06-09 14:54:46 +01:00
|
|
|
unordered_events = event_smem;
|
2024-05-03 11:19:55 +01:00
|
|
|
}
|
|
|
|
|
|
2024-07-13 00:15:26 +02:00
|
|
|
uint8_t get_counters_for_event(wait_event event) const { return counters[ffs(event) - 1]; }
|
2024-05-03 11:19:55 +01:00
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
/* Bitfields of counters affected by each event */
|
|
|
|
|
uint8_t counters[num_events] = {};
|
|
|
|
|
};
|
|
|
|
|
|
2025-07-17 12:29:41 +01:00
|
|
|
enum barrier_info_kind {
|
|
|
|
|
/* Waits for all non-private accesses and all scratch/vgpr-spill accesses */
|
2025-07-22 16:36:22 +01:00
|
|
|
barrier_info_release_dep,
|
|
|
|
|
/* Waits for all atomics */
|
|
|
|
|
barrier_info_acquire_dep,
|
2025-09-03 11:22:40 +01:00
|
|
|
/* A wait that is to be emitted when an
|
|
|
|
|
* atomics/control_barriers/sendmsg_gs_done/position-primitive-export is encountered.
|
|
|
|
|
*/
|
|
|
|
|
barrier_info_release,
|
|
|
|
|
/* A wait that is to be emitted when any non-private access is encountered. */
|
|
|
|
|
barrier_info_acquire,
|
2025-07-17 12:29:41 +01:00
|
|
|
num_barrier_infos,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Used to keep track of wait imms that are yet to be emitted. */
|
|
|
|
|
struct barrier_info {
|
|
|
|
|
wait_imm imm[storage_count];
|
|
|
|
|
uint16_t events[storage_count] = {}; /* use wait_event notion */
|
2025-08-26 18:07:07 +01:00
|
|
|
sync_scope scope[storage_count] = {};
|
2025-07-17 12:29:41 +01:00
|
|
|
uint8_t storage = 0;
|
|
|
|
|
|
|
|
|
|
bool join(const barrier_info& other)
|
|
|
|
|
{
|
|
|
|
|
bool changed = false;
|
|
|
|
|
for (unsigned i = 0; i < storage_count; i++) {
|
|
|
|
|
changed |= imm[i].combine(other.imm[i]);
|
|
|
|
|
changed |= (other.events[i] & ~events[i]) != 0;
|
|
|
|
|
events[i] |= other.events[i];
|
2025-08-26 18:07:07 +01:00
|
|
|
changed |= other.scope[i] > scope[i];
|
|
|
|
|
scope[i] = MAX2(scope[i], other.scope[i]);
|
2025-07-17 12:29:41 +01:00
|
|
|
}
|
|
|
|
|
storage |= other.storage;
|
|
|
|
|
return changed;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
UNUSED void print(FILE* output) const
|
|
|
|
|
{
|
|
|
|
|
u_foreach_bit (i, storage) {
|
|
|
|
|
fprintf(output, "storage[%u] = {\n", i);
|
|
|
|
|
imm[i].print(output);
|
|
|
|
|
fprintf(output, "events: %u\n", events[i]);
|
2025-08-26 18:07:07 +01:00
|
|
|
fprintf(output, "scope: %u\n", scope[i]);
|
2025-07-17 12:29:41 +01:00
|
|
|
fprintf(output, "}\n");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
struct wait_ctx {
|
|
|
|
|
Program* program;
|
2022-05-12 02:50:17 -04:00
|
|
|
enum amd_gfx_level gfx_level;
|
2024-05-03 11:19:55 +01:00
|
|
|
const target_info* info;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
uint32_t nonzero = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
bool pending_flat_lgkm = false;
|
|
|
|
|
bool pending_flat_vm = false;
|
|
|
|
|
|
2025-07-17 12:29:41 +01:00
|
|
|
barrier_info bar[num_barrier_infos];
|
|
|
|
|
uint8_t bar_nonempty = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
std::map<PhysReg, wait_entry> gpr_map;
|
|
|
|
|
|
|
|
|
|
wait_ctx() {}
|
2024-05-03 11:19:55 +01:00
|
|
|
wait_ctx(Program* program_, const target_info* info_)
|
|
|
|
|
: program(program_), gfx_level(program_->gfx_level), info(info_)
|
2021-02-03 14:34:09 +00:00
|
|
|
{}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
aco: sometimes join linear wait entries on logical edges
fossil-db (gfx1201):
Totals from 1303 (1.64% of 79653) affected shaders:
Instrs: 6920949 -> 6917692 (-0.05%); split: -0.06%, +0.01%
CodeSize: 37112404 -> 37095728 (-0.04%); split: -0.05%, +0.01%
Latency: 70471343 -> 70365986 (-0.15%); split: -0.15%, +0.00%
InvThroughput: 11515673 -> 11504666 (-0.10%); split: -0.10%, +0.01%
fossil-db (navi31):
Totals from 1293 (1.62% of 79653) affected shaders:
Instrs: 6500186 -> 6496761 (-0.05%); split: -0.06%, +0.01%
CodeSize: 34562712 -> 34549236 (-0.04%); split: -0.04%, +0.01%
Latency: 68604746 -> 68666532 (+0.09%); split: -0.15%, +0.24%
InvThroughput: 11276591 -> 11284914 (+0.07%); split: -0.10%, +0.17%
fossil-db (navi21):
Totals from 811 (1.02% of 79653) affected shaders:
Instrs: 4110953 -> 4108788 (-0.05%); split: -0.05%, +0.00%
CodeSize: 22955984 -> 22948064 (-0.03%); split: -0.03%, +0.00%
Latency: 35070231 -> 35064448 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 6945610 -> 6945053 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
2025-05-13 12:54:00 +01:00
|
|
|
bool join(const wait_ctx* other, bool logical, bool logical_merge)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
bool changed = (other->pending_flat_lgkm && !pending_flat_lgkm) ||
|
|
|
|
|
(other->pending_flat_vm && !pending_flat_vm) || (~nonzero & other->nonzero);
|
|
|
|
|
|
|
|
|
|
nonzero |= other->nonzero;
|
2019-09-17 13:22:17 +02:00
|
|
|
pending_flat_lgkm |= other->pending_flat_lgkm;
|
|
|
|
|
pending_flat_vm |= other->pending_flat_vm;
|
|
|
|
|
|
2025-05-02 11:05:21 +01:00
|
|
|
using iterator = std::map<PhysReg, wait_entry>::iterator;
|
|
|
|
|
|
2025-05-13 14:25:51 +01:00
|
|
|
if (logical == logical_merge) {
|
|
|
|
|
for (const auto& entry : other->gpr_map) {
|
|
|
|
|
const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
|
|
|
|
|
if (insert_pair.second) {
|
2025-05-02 11:05:21 +01:00
|
|
|
insert_pair.first->second.logical_events = 0;
|
2025-05-13 14:25:51 +01:00
|
|
|
changed = true;
|
|
|
|
|
} else {
|
|
|
|
|
changed |= insert_pair.first->second.join(entry.second);
|
|
|
|
|
}
|
2019-10-21 21:36:41 +01:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2025-05-02 16:01:05 +01:00
|
|
|
if (logical) {
|
2025-05-13 14:25:51 +01:00
|
|
|
for (const auto& entry : other->gpr_map) {
|
|
|
|
|
iterator it = gpr_map.find(entry.first);
|
|
|
|
|
if (it != gpr_map.end()) {
|
|
|
|
|
changed |= (entry.second.logical_events & ~it->second.logical_events) != 0;
|
|
|
|
|
it->second.logical_events |= entry.second.logical_events;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-17 12:29:41 +01:00
|
|
|
u_foreach_bit (i, other->bar_nonempty)
|
|
|
|
|
changed |= bar[i].join(other->bar[i]);
|
|
|
|
|
bar_nonempty |= other->bar_nonempty;
|
2020-02-11 16:52:20 +00:00
|
|
|
}
|
2019-10-21 21:36:41 +01:00
|
|
|
|
|
|
|
|
return changed;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-12-04 14:41:18 +00:00
|
|
|
|
2023-09-25 12:29:26 +01:00
|
|
|
UNUSED void print(FILE* output) const
|
|
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
for (unsigned i = 0; i < wait_type_num; i++)
|
|
|
|
|
fprintf(output, "nonzero[%u]: %u\n", i, nonzero & (1 << i) ? 1 : 0);
|
2023-09-25 12:29:26 +01:00
|
|
|
fprintf(output, "pending_flat_lgkm: %u\n", pending_flat_lgkm);
|
|
|
|
|
fprintf(output, "pending_flat_vm: %u\n", pending_flat_vm);
|
|
|
|
|
for (const auto& entry : gpr_map) {
|
|
|
|
|
fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
|
|
|
|
|
entry.first.reg() & 0xff);
|
|
|
|
|
entry.second.print(output);
|
|
|
|
|
fprintf(output, "}\n");
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-17 12:29:41 +01:00
|
|
|
u_foreach_bit (i, bar_nonempty) {
|
|
|
|
|
fprintf(output, "barriers[%u] = {\n", i);
|
|
|
|
|
bar[i].print(output);
|
|
|
|
|
fprintf(output, "}\n");
|
2023-09-25 12:29:26 +01:00
|
|
|
}
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
2024-05-03 12:04:58 +01:00
|
|
|
wait_event
|
|
|
|
|
get_vmem_event(wait_ctx& ctx, Instruction* instr, uint8_t type)
|
|
|
|
|
{
|
|
|
|
|
if (instr->definitions.empty() && ctx.gfx_level >= GFX10)
|
|
|
|
|
return event_vmem_store;
|
|
|
|
|
wait_event ev = event_vmem;
|
|
|
|
|
if (ctx.gfx_level >= GFX12 && type != vmem_nosampler)
|
|
|
|
|
ev = type == vmem_bvh ? event_vmem_bvh : event_vmem_sample;
|
|
|
|
|
return ev;
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-12 16:41:31 +01:00
|
|
|
uint32_t
|
|
|
|
|
get_vmem_mask(wait_ctx& ctx, Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (ctx.program->dev.sram_ecc_enabled)
|
|
|
|
|
return 0xffffffff;
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::buffer_load_format_d16_x:
|
|
|
|
|
case aco_opcode::buffer_load_ubyte_d16:
|
|
|
|
|
case aco_opcode::buffer_load_sbyte_d16:
|
|
|
|
|
case aco_opcode::buffer_load_short_d16:
|
|
|
|
|
case aco_opcode::tbuffer_load_format_d16_x:
|
|
|
|
|
case aco_opcode::flat_load_ubyte_d16:
|
|
|
|
|
case aco_opcode::flat_load_sbyte_d16:
|
|
|
|
|
case aco_opcode::flat_load_short_d16:
|
|
|
|
|
case aco_opcode::global_load_ubyte_d16:
|
|
|
|
|
case aco_opcode::global_load_sbyte_d16:
|
|
|
|
|
case aco_opcode::global_load_short_d16:
|
|
|
|
|
case aco_opcode::scratch_load_ubyte_d16:
|
|
|
|
|
case aco_opcode::scratch_load_sbyte_d16:
|
|
|
|
|
case aco_opcode::scratch_load_short_d16: return 0x1;
|
|
|
|
|
case aco_opcode::buffer_load_ubyte_d16_hi:
|
|
|
|
|
case aco_opcode::buffer_load_sbyte_d16_hi:
|
|
|
|
|
case aco_opcode::buffer_load_short_d16_hi:
|
|
|
|
|
case aco_opcode::buffer_load_format_d16_hi_x:
|
|
|
|
|
case aco_opcode::flat_load_ubyte_d16_hi:
|
|
|
|
|
case aco_opcode::flat_load_sbyte_d16_hi:
|
|
|
|
|
case aco_opcode::flat_load_short_d16_hi:
|
|
|
|
|
case aco_opcode::global_load_ubyte_d16_hi:
|
|
|
|
|
case aco_opcode::global_load_sbyte_d16_hi:
|
|
|
|
|
case aco_opcode::global_load_short_d16_hi:
|
|
|
|
|
case aco_opcode::scratch_load_ubyte_d16_hi:
|
|
|
|
|
case aco_opcode::scratch_load_sbyte_d16_hi:
|
|
|
|
|
case aco_opcode::scratch_load_short_d16_hi: return 0x2;
|
|
|
|
|
case aco_opcode::buffer_load_format_d16_xyz:
|
|
|
|
|
case aco_opcode::tbuffer_load_format_d16_xyz: return 0x7;
|
|
|
|
|
default: return 0xffffffff;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-13 14:25:51 +01:00
|
|
|
wait_imm
|
|
|
|
|
get_imm(wait_ctx& ctx, PhysReg reg, wait_entry& entry)
|
|
|
|
|
{
|
|
|
|
|
if (reg.reg() >= 256) {
|
|
|
|
|
uint32_t events = entry.logical_events;
|
|
|
|
|
|
|
|
|
|
/* ALU can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11+ without
|
|
|
|
|
* waiting for the load to finish, even if none of the lanes are involved in the load.
|
|
|
|
|
*/
|
|
|
|
|
if (ctx.gfx_level >= GFX11) {
|
|
|
|
|
uint32_t ds_vmem_events =
|
2025-06-09 14:54:46 +01:00
|
|
|
event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh;
|
2025-05-13 14:25:51 +01:00
|
|
|
events |= ds_vmem_events;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint32_t counters = 0;
|
|
|
|
|
u_foreach_bit (i, entry.events & events)
|
|
|
|
|
counters |= ctx.info->get_counters_for_event((wait_event)(1 << i));
|
|
|
|
|
|
|
|
|
|
wait_imm imm;
|
|
|
|
|
u_foreach_bit (i, entry.counters & counters)
|
|
|
|
|
imm[i] = entry.imm[i];
|
|
|
|
|
|
|
|
|
|
return imm;
|
|
|
|
|
} else {
|
|
|
|
|
return entry.imm;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
void
|
2024-08-07 15:41:42 +01:00
|
|
|
check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
for (const Operand op : instr->operands) {
|
|
|
|
|
if (op.isConstant() || op.isUndefined())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* check consecutively read gprs */
|
|
|
|
|
for (unsigned j = 0; j < op.size(); j++) {
|
2024-08-07 15:41:42 +01:00
|
|
|
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(PhysReg{op.physReg() + j});
|
|
|
|
|
if (it != ctx.gpr_map.end() && it->second.wait_on_read)
|
2025-05-13 14:25:51 +01:00
|
|
|
wait.combine(get_imm(ctx, PhysReg{op.physReg() + j}, it->second));
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (const Definition& def : instr->definitions) {
|
|
|
|
|
/* check consecutively written gprs */
|
|
|
|
|
for (unsigned j = 0; j < def.getTemp().size(); j++) {
|
|
|
|
|
PhysReg reg{def.physReg() + j};
|
|
|
|
|
|
|
|
|
|
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
|
|
|
|
|
if (it == ctx.gpr_map.end())
|
|
|
|
|
continue;
|
|
|
|
|
|
2025-05-13 14:25:51 +01:00
|
|
|
wait_imm reg_imm = get_imm(ctx, reg, it->second);
|
2024-04-24 16:57:10 +01:00
|
|
|
|
2024-11-26 12:00:35 +00:00
|
|
|
/* Vector Memory reads and writes decrease the counter in the order they were issued.
|
|
|
|
|
* Before GFX12, they also write VGPRs in order if they're of the same type.
|
2025-05-12 16:35:21 +01:00
|
|
|
* We can do this for GFX12 and different types for GFX11 if we know that the two
|
|
|
|
|
* VMEM loads do not write the same register half or the same lanes.
|
|
|
|
|
*/
|
2025-11-27 13:27:00 +01:00
|
|
|
uint8_t vmem_type = get_vmem_type(instr, ctx.program->dev.has_point_sample_accel);
|
2025-04-29 17:37:59 +01:00
|
|
|
if (vmem_type) {
|
2024-07-13 00:15:26 +02:00
|
|
|
wait_event event = get_vmem_event(ctx, instr, vmem_type);
|
2024-05-03 12:04:58 +01:00
|
|
|
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
|
2025-05-08 18:17:41 +01:00
|
|
|
|
|
|
|
|
bool event_matches = (it->second.events & ctx.info->events[type]) == event;
|
|
|
|
|
/* wait_type_vm/counter_vm can have several different vmem_types */
|
|
|
|
|
bool type_matches = type != wait_type_vm || (it->second.vmem_types == vmem_type &&
|
|
|
|
|
util_bitcount(vmem_type) == 1);
|
|
|
|
|
|
2025-04-29 17:37:59 +01:00
|
|
|
bool different_halves = false;
|
|
|
|
|
if (event == event_vmem && event_matches) {
|
|
|
|
|
uint32_t mask = (get_vmem_mask(ctx, instr) >> (j * 2)) & 0x3;
|
|
|
|
|
different_halves = !(mask & it->second.vm_mask);
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-12 16:35:21 +01:00
|
|
|
bool different_lanes = (it->second.logical_events & ctx.info->events[type]) == 0;
|
|
|
|
|
|
|
|
|
|
if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves ||
|
|
|
|
|
different_lanes)
|
2024-05-03 11:19:55 +01:00
|
|
|
reg_imm[type] = wait_imm::unset_counter;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* LDS reads and writes return in the order they were issued. same for GDS */
|
2024-05-03 11:19:55 +01:00
|
|
|
if (instr->isDS() && (it->second.events & ctx.info->events[wait_type_lgkm]) ==
|
|
|
|
|
(instr->ds().gds ? event_gds : event_lds))
|
2024-04-24 16:57:10 +01:00
|
|
|
reg_imm.lgkm = wait_imm::unset_counter;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-04-24 16:57:10 +01:00
|
|
|
wait.combine(reg_imm);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-03 11:22:40 +01:00
|
|
|
/* We delay the waitcnt for a barrier until it's needed. This can help hide the cost or let it be
|
|
|
|
|
* eliminated. */
|
2021-07-14 13:49:20 +02:00
|
|
|
void
|
2025-09-03 11:22:40 +01:00
|
|
|
setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acquire)
|
2020-06-26 15:54:22 +01:00
|
|
|
{
|
|
|
|
|
sync_scope subgroup_scope =
|
|
|
|
|
ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
|
2025-07-22 16:36:22 +01:00
|
|
|
if (sync.scope <= subgroup_scope)
|
|
|
|
|
return;
|
2020-06-26 15:54:22 +01:00
|
|
|
|
2025-09-03 11:22:40 +01:00
|
|
|
barrier_info& src = ctx.bar[is_acquire ? barrier_info_acquire_dep : barrier_info_release_dep];
|
2020-06-26 15:54:22 +01:00
|
|
|
|
2025-09-03 11:22:40 +01:00
|
|
|
wait_imm dst_imm;
|
|
|
|
|
uint16_t dst_events = 0;
|
|
|
|
|
u_foreach_bit (i, sync.storage & src.storage) {
|
2025-08-26 18:07:07 +01:00
|
|
|
/* LDS is private to the workgroup, so reduce the scope in that case. */
|
|
|
|
|
if (src.events[i] == event_lds && MIN2(sync.scope, scope_workgroup) <= subgroup_scope)
|
|
|
|
|
continue;
|
2025-07-22 16:36:22 +01:00
|
|
|
|
2025-08-26 18:07:07 +01:00
|
|
|
dst_imm.combine(src.imm[i]);
|
|
|
|
|
dst_events |= src.events[i];
|
2025-09-03 11:22:40 +01:00
|
|
|
}
|
|
|
|
|
if (!dst_events)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* Copy over wait into barrier_info_acquire/barrier_info_release */
|
|
|
|
|
unsigned dst_index = is_acquire ? barrier_info_acquire : barrier_info_release;
|
|
|
|
|
barrier_info& dst = ctx.bar[dst_index];
|
|
|
|
|
u_foreach_bit (i, sync.storage) {
|
|
|
|
|
dst.imm[i].combine(dst_imm);
|
|
|
|
|
dst.events[i] |= dst_events;
|
2025-08-26 18:07:07 +01:00
|
|
|
dst.scope[i] = MAX2(dst.scope[i], sync.scope);
|
2025-09-03 11:22:40 +01:00
|
|
|
}
|
|
|
|
|
dst.storage |= sync.storage;
|
|
|
|
|
ctx.bar_nonempty |= 1 << dst_index;
|
|
|
|
|
}
|
|
|
|
|
|
2025-08-26 18:07:07 +01:00
|
|
|
void
|
2025-09-02 15:39:19 +01:00
|
|
|
finish_barrier_internal(wait_ctx& ctx, wait_imm& imm, depctr_wait& depctr, Instruction* instr,
|
|
|
|
|
struct barrier_info* info, unsigned storage_idx)
|
2025-08-26 18:07:07 +01:00
|
|
|
{
|
|
|
|
|
uint16_t events = info->events[storage_idx];
|
2025-09-02 15:39:19 +01:00
|
|
|
bool vm_vsrc = false;
|
|
|
|
|
|
2025-08-26 18:07:07 +01:00
|
|
|
if (info->scope[storage_idx] <= scope_workgroup) {
|
|
|
|
|
bool is_vmem = instr->isVMEM() || (instr->isFlatLike() && !instr->flatlike().may_use_lds);
|
2025-09-02 15:40:03 +01:00
|
|
|
bool is_lds = instr->isDS() && !instr->ds().gds;
|
2025-09-02 15:39:19 +01:00
|
|
|
bool is_barrier = instr->isBarrier(); /* This is only called for control barriers. */
|
2025-08-26 18:07:07 +01:00
|
|
|
|
2025-09-02 15:39:19 +01:00
|
|
|
/* In non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations in-order for the same
|
|
|
|
|
* workgroup */
|
|
|
|
|
bool has_vmem_events = events & (event_vmem | event_vmem_store);
|
|
|
|
|
if (has_vmem_events && (is_vmem || is_barrier) && !ctx.program->wgp_mode) {
|
2025-08-26 18:07:07 +01:00
|
|
|
events &= ~(event_vmem | event_vmem_store);
|
2025-09-02 15:39:19 +01:00
|
|
|
vm_vsrc |= is_barrier && ctx.gfx_level >= GFX10;
|
|
|
|
|
}
|
2025-09-02 15:40:03 +01:00
|
|
|
|
|
|
|
|
/* Similar for LDS. */
|
|
|
|
|
if ((events & event_lds) &&
|
|
|
|
|
(is_lds || (is_barrier && ctx.gfx_level >= GFX10 && !ctx.program->wgp_mode))) {
|
|
|
|
|
events &= ~event_lds;
|
|
|
|
|
vm_vsrc |= is_barrier;
|
|
|
|
|
}
|
2025-08-26 18:07:07 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (events)
|
|
|
|
|
imm.combine(info->imm[storage_idx]);
|
2025-09-02 15:39:19 +01:00
|
|
|
if (vm_vsrc)
|
|
|
|
|
depctr.vm_vsrc = 0;
|
2025-08-26 18:07:07 +01:00
|
|
|
}
|
|
|
|
|
|
2025-09-03 11:22:40 +01:00
|
|
|
void
|
2025-09-02 15:39:19 +01:00
|
|
|
finish_barriers(wait_ctx& ctx, wait_imm& imm, depctr_wait& depctr, Instruction* instr,
|
|
|
|
|
memory_sync_info sync)
|
2025-09-03 11:22:40 +01:00
|
|
|
{
|
|
|
|
|
if (ctx.bar_nonempty & (1 << barrier_info_release)) {
|
|
|
|
|
uint16_t storage_release =
|
|
|
|
|
is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release);
|
|
|
|
|
u_foreach_bit (i, storage_release & ctx.bar[barrier_info_release].storage)
|
2025-09-02 15:39:19 +01:00
|
|
|
finish_barrier_internal(ctx, imm, depctr, instr, &ctx.bar[barrier_info_release], i);
|
2025-09-03 11:22:40 +01:00
|
|
|
}
|
|
|
|
|
if (ctx.bar_nonempty & (1 << barrier_info_acquire)) {
|
|
|
|
|
uint16_t storage_acquire = (sync.semantics & semantic_private) ? 0 : sync.storage;
|
|
|
|
|
u_foreach_bit (i, storage_acquire & ctx.bar[barrier_info_acquire].storage)
|
2025-09-02 15:39:19 +01:00
|
|
|
finish_barrier_internal(ctx, imm, depctr, instr, &ctx.bar[barrier_info_acquire], i);
|
2020-06-26 15:54:22 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-19 10:40:35 +02:00
|
|
|
void
|
|
|
|
|
force_waitcnt(wait_ctx& ctx, wait_imm& imm)
|
|
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
u_foreach_bit (i, ctx.nonzero)
|
|
|
|
|
imm[i] = 0;
|
2020-08-19 10:40:35 +02:00
|
|
|
}
|
|
|
|
|
|
2025-07-17 12:29:41 +01:00
|
|
|
void
|
|
|
|
|
update_barrier_info_for_wait(wait_ctx& ctx, unsigned idx, wait_imm imm)
|
|
|
|
|
{
|
|
|
|
|
barrier_info& info = ctx.bar[idx];
|
|
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
if (imm[i] == wait_imm::unset_counter)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
u_foreach_bit (j, info.storage) {
|
|
|
|
|
wait_imm& bar = info.imm[j];
|
|
|
|
|
if (bar[i] != wait_imm::unset_counter && imm[i] <= bar[i]) {
|
|
|
|
|
/* Clear this counter */
|
|
|
|
|
bar[i] = wait_imm::unset_counter;
|
|
|
|
|
info.events[j] &= ~ctx.info->events[i];
|
|
|
|
|
|
|
|
|
|
if (!info.events[j]) {
|
2025-08-26 18:07:07 +01:00
|
|
|
assert(info.imm[j].empty());
|
|
|
|
|
info.scope[j] = scope_invocation;
|
2025-07-17 12:29:41 +01:00
|
|
|
info.storage &= ~(1 << j);
|
|
|
|
|
if (!info.storage)
|
|
|
|
|
ctx.bar_nonempty &= ~(1 << idx);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
void
|
2025-09-02 15:39:19 +01:00
|
|
|
kill(wait_imm& imm, depctr_wait& depctr, Instruction* instr, wait_ctx& ctx,
|
|
|
|
|
memory_sync_info sync_info)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2022-06-24 14:17:29 +02:00
|
|
|
if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
|
2020-08-19 10:40:35 +02:00
|
|
|
/* Force emitting waitcnt states right after the instruction if there is
|
2022-06-24 14:17:29 +02:00
|
|
|
* something to wait for. This is also applied for s_setpc_b64 to ensure
|
|
|
|
|
* waitcnt states are inserted before jumping to the PS epilog.
|
2020-08-19 10:40:35 +02:00
|
|
|
*/
|
2022-06-23 16:30:17 +01:00
|
|
|
force_waitcnt(ctx, imm);
|
2020-08-19 10:40:35 +02:00
|
|
|
}
|
|
|
|
|
|
2024-08-07 15:41:42 +01:00
|
|
|
check_instr(ctx, imm, instr);
|
2019-10-21 21:36:41 +01:00
|
|
|
|
2025-09-22 12:59:55 +02:00
|
|
|
/* Only inserted by this pass, and outside loops. */
|
|
|
|
|
assert(ctx.gfx_level < GFX11 || instr->opcode != aco_opcode::s_sendmsg ||
|
|
|
|
|
instr->salu().imm != sendmsg_dealloc_vgprs);
|
|
|
|
|
|
2022-10-27 12:49:09 +01:00
|
|
|
if (instr->opcode == aco_opcode::ds_ordered_count &&
|
|
|
|
|
((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
|
2025-07-22 16:36:22 +01:00
|
|
|
barrier_info& bar = ctx.bar[barrier_info_release_dep];
|
2025-07-17 12:29:41 +01:00
|
|
|
imm.combine(bar.imm[ffs(storage_gds) - 1]);
|
2022-10-27 12:49:09 +01:00
|
|
|
}
|
|
|
|
|
|
2025-07-22 16:36:22 +01:00
|
|
|
if (instr->opcode == aco_opcode::p_barrier) {
|
|
|
|
|
if (instr->barrier().sync.semantics & semantic_release)
|
2025-09-03 11:22:40 +01:00
|
|
|
setup_barrier(ctx, imm, instr->barrier().sync, false);
|
2025-07-22 16:36:22 +01:00
|
|
|
if (instr->barrier().sync.semantics & semantic_acquire)
|
2025-09-03 11:22:40 +01:00
|
|
|
setup_barrier(ctx, imm, instr->barrier().sync, true);
|
2025-07-22 16:36:22 +01:00
|
|
|
} else if (sync_info.semantics & semantic_release) {
|
2025-09-03 11:22:40 +01:00
|
|
|
setup_barrier(ctx, imm, sync_info, false);
|
2025-07-22 16:36:22 +01:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-09-02 15:39:19 +01:00
|
|
|
finish_barriers(ctx, imm, depctr, instr, sync_info);
|
2025-09-03 11:22:40 +01:00
|
|
|
|
2024-08-07 15:41:42 +01:00
|
|
|
if (!imm.empty()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
|
|
|
|
|
imm.vm = 0;
|
|
|
|
|
if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
|
|
|
|
|
imm.lgkm = 0;
|
|
|
|
|
|
|
|
|
|
/* reset counters */
|
2024-05-03 11:19:55 +01:00
|
|
|
for (unsigned i = 0; i < wait_type_num; i++)
|
|
|
|
|
ctx.nonzero &= imm[i] == 0 ? ~BITFIELD_BIT(i) : UINT32_MAX;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-07-17 12:29:41 +01:00
|
|
|
u_foreach_bit (i, ctx.bar_nonempty)
|
|
|
|
|
update_barrier_info_for_wait(ctx, i, imm);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-09-12 15:28:49 +01:00
|
|
|
/* remove all gprs with higher counter from map */
|
2019-09-17 13:22:17 +02:00
|
|
|
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
|
|
|
|
|
while (it != ctx.gpr_map.end()) {
|
2024-05-03 11:19:55 +01:00
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
if (imm[i] != wait_imm::unset_counter && imm[i] <= it->second.imm[i])
|
|
|
|
|
it->second.remove_wait((wait_type)i, ctx.info->events[i]);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!it->second.counters)
|
|
|
|
|
it = ctx.gpr_map.erase(it);
|
|
|
|
|
else
|
|
|
|
|
it++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (imm.vm == 0)
|
|
|
|
|
ctx.pending_flat_vm = false;
|
2025-07-22 16:05:13 +01:00
|
|
|
if (imm.lgkm == 0)
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.pending_flat_lgkm = false;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
void
|
2025-07-17 12:29:41 +01:00
|
|
|
update_barrier_info_for_event(wait_ctx& ctx, uint8_t counters, wait_event event,
|
|
|
|
|
barrier_info_kind idx, uint16_t storage)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2025-07-17 12:29:41 +01:00
|
|
|
barrier_info& info = ctx.bar[idx];
|
|
|
|
|
if (storage) {
|
|
|
|
|
info.storage |= storage;
|
|
|
|
|
ctx.bar_nonempty |= 1 << idx;
|
|
|
|
|
}
|
2024-09-03 12:13:37 +01:00
|
|
|
|
2025-07-17 12:29:41 +01:00
|
|
|
unsigned storage_tmp = info.storage;
|
|
|
|
|
while (storage_tmp) {
|
|
|
|
|
unsigned i = u_bit_scan(&storage_tmp);
|
|
|
|
|
wait_imm& bar = info.imm[i];
|
|
|
|
|
uint16_t& bar_ev = info.events[i];
|
2024-09-03 12:13:37 +01:00
|
|
|
|
2025-07-17 12:29:41 +01:00
|
|
|
if (storage & (1 << i)) {
|
|
|
|
|
/* Reset counters to zero so that this instruction is waited on. */
|
2020-02-11 16:52:20 +00:00
|
|
|
bar_ev |= event;
|
2024-05-03 11:19:55 +01:00
|
|
|
u_foreach_bit (j, counters)
|
|
|
|
|
bar[j] = 0;
|
2024-05-03 11:19:55 +01:00
|
|
|
} else if (!(bar_ev & ctx.info->unordered_events) && !(ctx.info->unordered_events & event)) {
|
2025-07-17 12:29:41 +01:00
|
|
|
/* Increase counters so that this instruction is ignored when waiting. */
|
2024-05-03 11:19:55 +01:00
|
|
|
u_foreach_bit (j, counters) {
|
|
|
|
|
if (bar[j] != wait_imm::unset_counter && (bar_ev & ctx.info->events[j]) == event)
|
|
|
|
|
bar[j] = std::min<uint16_t>(bar[j] + 1, ctx.info->max_cnt[j]);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-17 12:29:41 +01:00
|
|
|
/* This resets or increases the counters for the barrier infos in response to an instruction. */
|
|
|
|
|
void
|
2025-07-22 16:36:22 +01:00
|
|
|
update_barriers(wait_ctx& ctx, uint8_t counters, wait_event event, Instruction* instr,
|
|
|
|
|
memory_sync_info sync)
|
2025-07-17 12:29:41 +01:00
|
|
|
{
|
2025-07-22 16:36:22 +01:00
|
|
|
uint16_t storage_rel = sync.storage;
|
|
|
|
|
/* We re-use barrier_info_release_dep to wait for all scratch stores to finish, so track those
|
|
|
|
|
* even if they are private. */
|
2025-07-17 12:29:41 +01:00
|
|
|
if (sync.semantics & semantic_private)
|
2025-07-22 16:36:22 +01:00
|
|
|
storage_rel &= storage_scratch | storage_vgpr_spill;
|
|
|
|
|
update_barrier_info_for_event(ctx, counters, event, barrier_info_release_dep, storage_rel);
|
|
|
|
|
|
|
|
|
|
if (instr) {
|
|
|
|
|
uint16_t storage_acq = is_atomic_or_control_instr(ctx.program, instr, sync, semantic_acquire);
|
|
|
|
|
update_barrier_info_for_event(ctx, counters, event, barrier_info_acquire_dep, storage_acq);
|
|
|
|
|
}
|
2025-09-03 11:22:40 +01:00
|
|
|
|
|
|
|
|
update_barrier_info_for_event(ctx, counters, event, barrier_info_release, 0);
|
|
|
|
|
update_barrier_info_for_event(ctx, counters, event, barrier_info_acquire, 0);
|
2025-07-17 12:29:41 +01:00
|
|
|
}
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
void
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(wait_ctx& ctx, wait_event event, Instruction* instr,
|
|
|
|
|
memory_sync_info sync = memory_sync_info())
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
uint8_t counters = ctx.info->get_counters_for_event(event);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
ctx.nonzero |= counters;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-07-22 16:36:22 +01:00
|
|
|
update_barriers(ctx, counters, event, instr, sync);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
if (ctx.info->unordered_events & event)
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
|
|
|
|
|
wait_entry& entry = e.second;
|
|
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
if (entry.events & ctx.info->unordered_events)
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
assert(entry.events);
|
|
|
|
|
|
2024-05-03 11:19:55 +01:00
|
|
|
u_foreach_bit (i, counters) {
|
|
|
|
|
if ((entry.events & ctx.info->events[i]) == event)
|
|
|
|
|
entry.imm[i] = std::min<uint16_t>(entry.imm[i] + 1, ctx.info->max_cnt[i]);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-07 14:27:42 +01:00
|
|
|
void
|
|
|
|
|
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
|
2025-05-13 14:25:51 +01:00
|
|
|
uint8_t vmem_types = 0, uint32_t vm_mask = 0)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
uint16_t counters = ctx.info->get_counters_for_event(event);
|
2019-09-17 13:22:17 +02:00
|
|
|
wait_imm imm;
|
2024-08-07 15:41:42 +01:00
|
|
|
u_foreach_bit (i, counters)
|
2024-05-03 11:19:55 +01:00
|
|
|
imm[i] = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-05-13 14:25:51 +01:00
|
|
|
wait_entry new_entry(event, imm, counters, wait_on_read);
|
2024-05-03 12:04:58 +01:00
|
|
|
if (counters & counter_vm)
|
|
|
|
|
new_entry.vmem_types |= vmem_types;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-05-12 16:41:31 +01:00
|
|
|
for (unsigned i = 0; i < rc.size(); i++, vm_mask >>= 2) {
|
|
|
|
|
new_entry.vm_mask = vm_mask & 0x3;
|
2020-02-07 11:55:43 +00:00
|
|
|
auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
|
2025-05-13 14:25:51 +01:00
|
|
|
if (!it.second) {
|
|
|
|
|
it.first->second.join(new_entry);
|
|
|
|
|
it.first->second.logical_events |= event;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-07 14:27:42 +01:00
|
|
|
void
|
2025-05-12 16:41:31 +01:00
|
|
|
insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0,
|
|
|
|
|
uint32_t vm_mask = 0)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (!op.isConstant() && !op.isUndefined())
|
2025-05-12 16:41:31 +01:00
|
|
|
insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types, vm_mask);
|
2022-11-13 18:15:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2025-05-12 16:41:31 +01:00
|
|
|
insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0,
|
|
|
|
|
uint32_t vm_mask = 0)
|
2022-11-13 18:15:28 +00:00
|
|
|
{
|
2025-05-13 14:25:51 +01:00
|
|
|
insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, vm_mask);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
gen(Instruction* instr, wait_ctx& ctx)
|
|
|
|
|
{
|
|
|
|
|
switch (instr->format) {
|
|
|
|
|
case Format::EXP: {
|
2021-01-21 16:13:34 +00:00
|
|
|
Export_instruction& exp_instr = instr->exp();
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
wait_event ev;
|
2025-07-22 16:58:06 +01:00
|
|
|
if (exp_instr.dest <= V_008DFC_SQ_EXP_NULL)
|
2019-09-17 13:22:17 +02:00
|
|
|
ev = event_exp_mrt_null;
|
2025-07-22 16:58:06 +01:00
|
|
|
else if (exp_instr.dest <= (V_008DFC_SQ_EXP_POS + 4))
|
2019-09-17 13:22:17 +02:00
|
|
|
ev = event_exp_pos;
|
2025-07-22 16:58:06 +01:00
|
|
|
else if (exp_instr.dest == V_008DFC_SQ_EXP_PRIM)
|
|
|
|
|
ev = event_exp_prim;
|
|
|
|
|
else if (exp_instr.dest == 21 || exp_instr.dest == 22)
|
|
|
|
|
ev = event_exp_dual_src_blend;
|
|
|
|
|
else if (exp_instr.dest >= V_008DFC_SQ_EXP_PARAM)
|
2019-09-17 13:22:17 +02:00
|
|
|
ev = event_exp_param;
|
2025-07-22 16:58:06 +01:00
|
|
|
else
|
|
|
|
|
UNREACHABLE("Invalid export destination");
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, ev, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* insert new entries for exported vgprs */
|
|
|
|
|
for (unsigned i = 0; i < 4; i++) {
|
2021-01-21 16:13:34 +00:00
|
|
|
if (exp_instr.enabled_mask & (1 << i)) {
|
|
|
|
|
unsigned idx = exp_instr.compressed ? i >> 1 : i;
|
|
|
|
|
assert(idx < exp_instr.operands.size());
|
|
|
|
|
insert_wait_entry(ctx, exp_instr.operands[idx], ev);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
insert_wait_entry(ctx, exec, s2, ev, false);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case Format::FLAT: {
|
2021-01-21 16:13:34 +00:00
|
|
|
FLAT_instruction& flat = instr->flat();
|
2025-06-09 14:54:46 +01:00
|
|
|
wait_event vmem_ev = get_vmem_event(ctx, instr, vmem_nosampler);
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, vmem_ev, instr, flat.sync);
|
|
|
|
|
update_counters(ctx, event_lds, instr, flat.sync);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-06-26 14:15:50 +01:00
|
|
|
if (!instr->definitions.empty())
|
2025-06-09 14:54:46 +01:00
|
|
|
insert_wait_entry(ctx, instr->definitions[0], vmem_ev, 0, get_vmem_mask(ctx, instr));
|
2025-06-26 14:15:50 +01:00
|
|
|
if (!instr->definitions.empty() && flat.may_use_lds)
|
2025-06-09 14:54:46 +01:00
|
|
|
insert_wait_entry(ctx, instr->definitions[0], event_lds);
|
|
|
|
|
|
2025-06-26 14:15:50 +01:00
|
|
|
if (ctx.gfx_level < GFX10 && !instr->definitions.empty() && flat.may_use_lds) {
|
2025-06-09 14:54:46 +01:00
|
|
|
ctx.pending_flat_lgkm = true;
|
|
|
|
|
ctx.pending_flat_vm = true;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case Format::SMEM: {
|
2021-01-21 16:13:34 +00:00
|
|
|
SMEM_instruction& smem = instr->smem();
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, event_smem, instr, smem.sync);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (!instr->definitions.empty())
|
|
|
|
|
insert_wait_entry(ctx, instr->definitions[0], event_smem);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case Format::DS: {
|
2021-01-21 16:13:34 +00:00
|
|
|
DS_instruction& ds = instr->ds();
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, ds.gds ? event_gds : event_lds, instr, ds.sync);
|
2021-01-21 16:13:34 +00:00
|
|
|
if (ds.gds)
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, event_gds_gpr_lock, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2023-12-19 21:10:41 +01:00
|
|
|
for (auto& definition : instr->definitions)
|
|
|
|
|
insert_wait_entry(ctx, definition, ds.gds ? event_gds : event_lds);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-01-21 16:13:34 +00:00
|
|
|
if (ds.gds) {
|
2019-09-17 13:22:17 +02:00
|
|
|
for (const Operand& op : instr->operands)
|
|
|
|
|
insert_wait_entry(ctx, op, event_gds_gpr_lock);
|
|
|
|
|
insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2022-06-17 13:53:08 +01:00
|
|
|
case Format::LDSDIR: {
|
|
|
|
|
LDSDIR_instruction& ldsdir = instr->ldsdir();
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, event_ldsdir, instr, ldsdir.sync);
|
2022-06-17 13:53:08 +01:00
|
|
|
insert_wait_entry(ctx, instr->definitions[0], event_ldsdir);
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
case Format::MUBUF:
|
|
|
|
|
case Format::MTBUF:
|
|
|
|
|
case Format::MIMG:
|
2022-05-19 15:55:53 +01:00
|
|
|
case Format::GLOBAL:
|
|
|
|
|
case Format::SCRATCH: {
|
2025-11-27 13:27:00 +01:00
|
|
|
uint8_t type = get_vmem_type(instr, ctx.program->dev.has_point_sample_accel);
|
2024-05-03 12:04:58 +01:00
|
|
|
wait_event ev = get_vmem_event(ctx, instr, type);
|
2025-05-12 16:41:31 +01:00
|
|
|
uint32_t mask = ev == event_vmem ? get_vmem_mask(ctx, instr) : 0;
|
2024-05-03 12:04:58 +01:00
|
|
|
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, ev, instr, get_sync_info(instr));
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-03-12 22:43:57 +01:00
|
|
|
for (auto& definition : instr->definitions)
|
2025-05-12 16:41:31 +01:00
|
|
|
insert_wait_entry(ctx, definition, ev, type, mask);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, event_vmem_gpr_lock, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (ctx.gfx_level == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, event_vmem_gpr_lock, instr);
|
2021-01-14 17:46:50 +00:00
|
|
|
insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2020-01-16 16:54:35 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
}
|
2019-10-14 17:21:04 +01:00
|
|
|
case Format::SOPP: {
|
|
|
|
|
if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(ctx, event_sendmsg, instr);
|
2021-04-13 17:21:56 +02:00
|
|
|
break;
|
2019-10-14 17:21:04 +01:00
|
|
|
}
|
2022-10-24 02:14:24 +00:00
|
|
|
case Format::SOP1: {
|
|
|
|
|
if (instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_sendmsg_rtn_b64) {
|
2025-08-01 15:38:31 +01:00
|
|
|
update_counters(ctx, event_sendmsg_rtn, instr);
|
|
|
|
|
insert_wait_entry(ctx, instr->definitions[0], event_sendmsg_rtn);
|
2022-10-24 02:14:24 +00:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
default: break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2021-07-14 13:49:20 +02:00
|
|
|
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 12:04:58 +01:00
|
|
|
Builder bld(ctx.program, &instructions);
|
2024-09-19 12:24:39 +01:00
|
|
|
imm.build_waitcnt(bld);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2025-09-02 15:39:19 +01:00
|
|
|
void
|
|
|
|
|
emit_depctr(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, depctr_wait& depctr)
|
|
|
|
|
{
|
|
|
|
|
Builder bld(ctx.program, &instructions);
|
|
|
|
|
bld.sopp(aco_opcode::s_waitcnt_depctr, depctr.pack());
|
|
|
|
|
depctr = depctr_wait();
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-22 12:59:55 +02:00
|
|
|
void
|
|
|
|
|
deallocate_vgprs(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions)
|
|
|
|
|
{
|
|
|
|
|
if (ctx.gfx_level < GFX11)
|
|
|
|
|
return;
|
|
|
|
|
|
2025-09-22 16:20:26 +02:00
|
|
|
/* New waves are likely not vgpr limited. */
|
|
|
|
|
unsigned max_waves_limit = ctx.program->dev.physical_vgprs / ctx.program->dev.max_waves_per_simd;
|
|
|
|
|
if (ctx.program->config->num_vgprs <= max_waves_limit)
|
|
|
|
|
return;
|
|
|
|
|
|
2025-09-22 12:59:55 +02:00
|
|
|
/* s_sendmsg dealloc_vgprs waits for all counters except stores. */
|
|
|
|
|
if (!(ctx.nonzero & counter_vs))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
const uint32_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null |
|
|
|
|
|
event_exp_prim | event_exp_dual_src_blend;
|
|
|
|
|
|
|
|
|
|
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
|
|
|
|
|
wait_entry& entry = e.second;
|
|
|
|
|
|
|
|
|
|
/* Exports are high latency operations too, and we would wait for them.
|
|
|
|
|
* Assume any potential stores don't take much longer, and avoid
|
|
|
|
|
* the message bus traffic.
|
|
|
|
|
*/
|
|
|
|
|
if (entry.events & exp_events)
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Scratch is deallocated early too. To avoid write after free,
|
|
|
|
|
* we have to wait for scratch stores.
|
|
|
|
|
*/
|
|
|
|
|
barrier_info& bar = ctx.bar[barrier_info_release_dep];
|
|
|
|
|
wait_imm imm;
|
|
|
|
|
imm.combine(bar.imm[ffs(storage_scratch) - 1]);
|
|
|
|
|
imm.combine(bar.imm[ffs(storage_vgpr_spill) - 1]);
|
|
|
|
|
|
|
|
|
|
/* Waiting for all stores is pointless */
|
|
|
|
|
if (imm.vs == 0)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Builder bld(ctx.program, &instructions);
|
|
|
|
|
|
|
|
|
|
if (!imm.empty())
|
|
|
|
|
imm.build_waitcnt(bld);
|
|
|
|
|
bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-27 16:38:25 +00:00
|
|
|
bool
|
|
|
|
|
check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
for (Operand op : instr->operands) {
|
|
|
|
|
if (op.isConstant())
|
|
|
|
|
continue;
|
|
|
|
|
for (unsigned i = 0; i < op.size(); i++) {
|
|
|
|
|
if (regs_written[op.physReg().reg() + i])
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (Definition def : instr->definitions) {
|
|
|
|
|
for (unsigned i = 0; i < def.size(); i++)
|
|
|
|
|
regs_written[def.physReg().reg() + i] = 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
void
|
|
|
|
|
handle_block(Program* program, Block& block, wait_ctx& ctx)
|
|
|
|
|
{
|
|
|
|
|
std::vector<aco_ptr<Instruction>> new_instructions;
|
|
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
wait_imm queued_imm;
|
2025-09-02 15:39:19 +01:00
|
|
|
depctr_wait queued_depctr;
|
2019-12-04 14:41:18 +00:00
|
|
|
|
2024-03-27 16:38:25 +00:00
|
|
|
size_t clause_end = 0;
|
|
|
|
|
for (size_t i = 0; i < block.instructions.size(); i++) {
|
|
|
|
|
aco_ptr<Instruction>& instr = block.instructions[i];
|
|
|
|
|
|
2025-09-02 15:39:19 +01:00
|
|
|
bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get()) ||
|
|
|
|
|
instr->opcode == aco_opcode::s_waitcnt_depctr;
|
|
|
|
|
if (instr->opcode == aco_opcode::s_waitcnt_depctr)
|
|
|
|
|
queued_depctr = parse_depctr_wait(instr.get());
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
memory_sync_info sync_info = get_sync_info(instr.get());
|
2025-09-02 15:39:19 +01:00
|
|
|
kill(queued_imm, queued_depctr, instr.get(), ctx, sync_info);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-03-27 16:38:25 +00:00
|
|
|
/* At the start of a possible clause, also emit waitcnts for each instruction to avoid
|
2025-10-04 13:07:40 +02:00
|
|
|
* splitting the clause. For LDS, clauses don't have a cache benefit, so only do this for
|
|
|
|
|
* memory instructions.
|
2024-03-27 16:38:25 +00:00
|
|
|
*/
|
2025-10-04 13:07:40 +02:00
|
|
|
if ((i >= clause_end || !queued_imm.empty()) && !instr->isDS()) {
|
2024-03-27 16:38:25 +00:00
|
|
|
std::optional<std::bitset<512>> regs_written;
|
|
|
|
|
for (clause_end = i + 1; clause_end < block.instructions.size(); clause_end++) {
|
|
|
|
|
Instruction* next = block.instructions[clause_end].get();
|
|
|
|
|
if (!should_form_clause(instr.get(), next))
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
if (!regs_written) {
|
|
|
|
|
regs_written.emplace();
|
|
|
|
|
check_clause_raw(*regs_written, instr.get());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!check_clause_raw(*regs_written, next))
|
|
|
|
|
break;
|
|
|
|
|
|
2025-09-02 15:39:19 +01:00
|
|
|
kill(queued_imm, queued_depctr, next, ctx, get_sync_info(next));
|
2024-03-27 16:38:25 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-22 12:59:55 +02:00
|
|
|
if (instr->opcode == aco_opcode::s_endpgm)
|
|
|
|
|
deallocate_vgprs(ctx, new_instructions);
|
|
|
|
|
|
aco: fix update_alu(clear=true) for exports
For:
v_mov_b32_e32 v0, 1.0
exp mrtz v0, off, off, off
we should completely remove the ALU entry before creating the EXP's WaR entry for v0.
Otherwise, the two will be combined into an entry which will wait for
expcnt(0) for later uses of v0.
gen_alu() should also be before gen(), since gen_alu() performs the clear
while gen() creates the WaR entry.
fossil-db (gfx1100):
Totals from 3589 (2.69% of 133428) affected shaders:
Instrs: 5591041 -> 5589047 (-0.04%); split: -0.04%, +0.00%
CodeSize: 28580840 -> 28572864 (-0.03%); split: -0.03%, +0.00%
Latency: 65427923 -> 65427543 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 11109079 -> 11109065 (-0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23213>
2023-05-22 16:32:00 +01:00
|
|
|
gen(instr.get(), ctx);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-08-07 15:41:42 +01:00
|
|
|
if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
|
2022-06-17 13:53:08 +01:00
|
|
|
if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {
|
|
|
|
|
instr->vinterp_inreg().wait_exp = MIN2(instr->vinterp_inreg().wait_exp, queued_imm.exp);
|
|
|
|
|
queued_imm.exp = wait_imm::unset_counter;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
if (!queued_imm.empty())
|
2019-10-21 21:36:41 +01:00
|
|
|
emit_waitcnt(ctx, new_instructions, queued_imm);
|
2025-09-02 15:39:19 +01:00
|
|
|
if (!queued_depctr.empty())
|
|
|
|
|
emit_depctr(ctx, new_instructions, queued_depctr);
|
2019-12-04 14:41:18 +00:00
|
|
|
|
2022-10-27 12:49:09 +01:00
|
|
|
bool is_ordered_count_acquire =
|
|
|
|
|
instr->opcode == aco_opcode::ds_ordered_count &&
|
|
|
|
|
!((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1);
|
|
|
|
|
|
2021-07-14 13:49:20 +02:00
|
|
|
new_instructions.emplace_back(std::move(instr));
|
2025-07-22 16:36:22 +01:00
|
|
|
if (sync_info.semantics & semantic_acquire)
|
2025-09-03 11:22:40 +01:00
|
|
|
setup_barrier(ctx, queued_imm, sync_info, true);
|
2022-10-27 12:49:09 +01:00
|
|
|
|
|
|
|
|
if (is_ordered_count_acquire)
|
2025-07-22 16:36:22 +01:00
|
|
|
queued_imm.combine(ctx.bar[barrier_info_release_dep].imm[ffs(storage_gds) - 1]);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
2019-10-21 21:36:41 +01:00
|
|
|
|
2023-08-19 15:36:00 +08:00
|
|
|
/* For last block of a program which has succeed shader part, wait all memory ops done
|
|
|
|
|
* before go to next shader part.
|
|
|
|
|
*/
|
|
|
|
|
if (block.kind & block_kind_end_with_regs)
|
|
|
|
|
force_waitcnt(ctx, queued_imm);
|
|
|
|
|
|
2019-11-22 19:38:51 +00:00
|
|
|
if (!queued_imm.empty())
|
|
|
|
|
emit_waitcnt(ctx, new_instructions, queued_imm);
|
2025-09-02 15:39:19 +01:00
|
|
|
if (!queued_depctr.empty())
|
|
|
|
|
emit_depctr(ctx, new_instructions, queued_depctr);
|
2019-11-22 19:38:51 +00:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
block.instructions.swap(new_instructions);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} /* end namespace */
|
|
|
|
|
|
|
|
|
|
void
|
2024-08-07 15:41:42 +01:00
|
|
|
insert_waitcnt(Program* program)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-05-03 11:19:55 +01:00
|
|
|
target_info info(program->gfx_level);
|
|
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
/* per BB ctx */
|
|
|
|
|
std::vector<bool> done(program->blocks.size());
|
2024-05-03 11:19:55 +01:00
|
|
|
std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program, &info));
|
|
|
|
|
std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program, &info));
|
2019-12-04 14:41:18 +00:00
|
|
|
|
2021-07-10 12:20:56 +02:00
|
|
|
std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
|
2019-10-21 21:36:41 +01:00
|
|
|
unsigned loop_progress = 0;
|
|
|
|
|
|
2023-08-15 15:24:09 +08:00
|
|
|
if (program->pending_lds_access) {
|
2025-07-22 16:36:22 +01:00
|
|
|
update_barriers(in_ctx[0], info.get_counters_for_event(event_lds), event_lds, NULL,
|
2025-07-17 12:29:41 +01:00
|
|
|
memory_sync_info(storage_shared));
|
2023-08-15 15:24:09 +08:00
|
|
|
}
|
|
|
|
|
|
2023-03-02 17:30:49 -08:00
|
|
|
for (Definition def : program->args_pending_vmem) {
|
2025-07-22 16:36:22 +01:00
|
|
|
update_counters(in_ctx[0], event_vmem, NULL);
|
2025-05-12 16:41:31 +01:00
|
|
|
insert_wait_entry(in_ctx[0], def, event_vmem, vmem_nosampler, 0xffffffff);
|
2021-05-17 17:56:28 +01:00
|
|
|
}
|
|
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
for (unsigned i = 0; i < program->blocks.size();) {
|
|
|
|
|
Block& current = program->blocks[i++];
|
2023-04-15 21:45:12 +03:00
|
|
|
|
|
|
|
|
if (current.kind & block_kind_discard_early_exit) {
|
|
|
|
|
/* Because the jump to the discard early exit block may happen anywhere in a block, it's
|
|
|
|
|
* not possible to join it with its predecessors this way.
|
|
|
|
|
* We emit all required waits when emitting the discard block.
|
|
|
|
|
*/
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
wait_ctx ctx = in_ctx[current.index];
|
|
|
|
|
|
|
|
|
|
if (current.kind & block_kind_loop_header) {
|
|
|
|
|
loop_header_indices.push(current.index);
|
|
|
|
|
} else if (current.kind & block_kind_loop_exit) {
|
|
|
|
|
bool repeat = false;
|
|
|
|
|
if (loop_progress == loop_header_indices.size()) {
|
|
|
|
|
i = loop_header_indices.top();
|
|
|
|
|
repeat = true;
|
|
|
|
|
}
|
|
|
|
|
loop_header_indices.pop();
|
|
|
|
|
loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
|
|
|
|
|
if (repeat)
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
aco: sometimes join linear wait entries on logical edges
fossil-db (gfx1201):
Totals from 1303 (1.64% of 79653) affected shaders:
Instrs: 6920949 -> 6917692 (-0.05%); split: -0.06%, +0.01%
CodeSize: 37112404 -> 37095728 (-0.04%); split: -0.05%, +0.01%
Latency: 70471343 -> 70365986 (-0.15%); split: -0.15%, +0.00%
InvThroughput: 11515673 -> 11504666 (-0.10%); split: -0.10%, +0.01%
fossil-db (navi31):
Totals from 1293 (1.62% of 79653) affected shaders:
Instrs: 6500186 -> 6496761 (-0.05%); split: -0.06%, +0.01%
CodeSize: 34562712 -> 34549236 (-0.04%); split: -0.04%, +0.01%
Latency: 68604746 -> 68666532 (+0.09%); split: -0.15%, +0.24%
InvThroughput: 11276591 -> 11284914 (+0.07%); split: -0.10%, +0.17%
fossil-db (navi21):
Totals from 811 (1.02% of 79653) affected shaders:
Instrs: 4110953 -> 4108788 (-0.05%); split: -0.05%, +0.00%
CodeSize: 22955984 -> 22948064 (-0.03%); split: -0.03%, +0.00%
Latency: 35070231 -> 35064448 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 6945610 -> 6945053 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
2025-05-13 12:54:00 +01:00
|
|
|
/* Sometimes the counter for an entry is incremented or removed on all logical predecessors,
|
|
|
|
|
* so it might be better to join entries using the logical predecessors instead of the linear
|
|
|
|
|
* ones.
|
|
|
|
|
*/
|
|
|
|
|
bool logical_merge =
|
|
|
|
|
current.logical_preds.size() > 1 &&
|
|
|
|
|
std::any_of(current.linear_preds.begin(), current.linear_preds.end(),
|
|
|
|
|
[&](unsigned pred)
|
|
|
|
|
{
|
|
|
|
|
return std::find(current.logical_preds.begin(), current.logical_preds.end(),
|
|
|
|
|
pred) == current.logical_preds.end();
|
|
|
|
|
});
|
|
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
bool changed = false;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned b : current.linear_preds)
|
aco: sometimes join linear wait entries on logical edges
fossil-db (gfx1201):
Totals from 1303 (1.64% of 79653) affected shaders:
Instrs: 6920949 -> 6917692 (-0.05%); split: -0.06%, +0.01%
CodeSize: 37112404 -> 37095728 (-0.04%); split: -0.05%, +0.01%
Latency: 70471343 -> 70365986 (-0.15%); split: -0.15%, +0.00%
InvThroughput: 11515673 -> 11504666 (-0.10%); split: -0.10%, +0.01%
fossil-db (navi31):
Totals from 1293 (1.62% of 79653) affected shaders:
Instrs: 6500186 -> 6496761 (-0.05%); split: -0.06%, +0.01%
CodeSize: 34562712 -> 34549236 (-0.04%); split: -0.04%, +0.01%
Latency: 68604746 -> 68666532 (+0.09%); split: -0.15%, +0.24%
InvThroughput: 11276591 -> 11284914 (+0.07%); split: -0.10%, +0.17%
fossil-db (navi21):
Totals from 811 (1.02% of 79653) affected shaders:
Instrs: 4110953 -> 4108788 (-0.05%); split: -0.05%, +0.00%
CodeSize: 22955984 -> 22948064 (-0.03%); split: -0.03%, +0.00%
Latency: 35070231 -> 35064448 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 6945610 -> 6945053 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
2025-05-13 12:54:00 +01:00
|
|
|
changed |= ctx.join(&out_ctx[b], false, logical_merge);
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned b : current.logical_preds)
|
aco: sometimes join linear wait entries on logical edges
fossil-db (gfx1201):
Totals from 1303 (1.64% of 79653) affected shaders:
Instrs: 6920949 -> 6917692 (-0.05%); split: -0.06%, +0.01%
CodeSize: 37112404 -> 37095728 (-0.04%); split: -0.05%, +0.01%
Latency: 70471343 -> 70365986 (-0.15%); split: -0.15%, +0.00%
InvThroughput: 11515673 -> 11504666 (-0.10%); split: -0.10%, +0.01%
fossil-db (navi31):
Totals from 1293 (1.62% of 79653) affected shaders:
Instrs: 6500186 -> 6496761 (-0.05%); split: -0.06%, +0.01%
CodeSize: 34562712 -> 34549236 (-0.04%); split: -0.04%, +0.01%
Latency: 68604746 -> 68666532 (+0.09%); split: -0.15%, +0.24%
InvThroughput: 11276591 -> 11284914 (+0.07%); split: -0.10%, +0.17%
fossil-db (navi21):
Totals from 811 (1.02% of 79653) affected shaders:
Instrs: 4110953 -> 4108788 (-0.05%); split: -0.05%, +0.00%
CodeSize: 22955984 -> 22948064 (-0.03%); split: -0.03%, +0.00%
Latency: 35070231 -> 35064448 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 6945610 -> 6945053 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
2025-05-13 12:54:00 +01:00
|
|
|
changed |= ctx.join(&out_ctx[b], true, logical_merge);
|
2019-10-21 21:36:41 +01:00
|
|
|
|
2019-12-04 14:41:18 +00:00
|
|
|
if (done[current.index] && !changed) {
|
|
|
|
|
in_ctx[current.index] = std::move(ctx);
|
2019-10-21 21:36:41 +01:00
|
|
|
continue;
|
2019-12-04 14:41:18 +00:00
|
|
|
} else {
|
|
|
|
|
in_ctx[current.index] = ctx;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-10-21 21:36:41 +01:00
|
|
|
loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
|
|
|
|
|
done[current.index] = true;
|
|
|
|
|
|
|
|
|
|
handle_block(program, current, ctx);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-12-04 14:41:18 +00:00
|
|
|
out_ctx[current.index] = std::move(ctx);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace aco
|