aco: rework barriers and replace can_reorder
fossil-db (Navi): Totals from 273 (0.21% of 132058) affected shaders:
CodeSize: 937472 -> 936556 (-0.10%)
Instrs: 158874 -> 158648 (-0.14%)
Cycles: 13563516 -> 13562612 (-0.01%)
VMEM: 85246 -> 85244 (-0.00%)
SMEM: 21407 -> 21310 (-0.45%); split: +0.05%, -0.50%
VClause: 9321 -> 9317 (-0.04%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4905>
parent 1bbb64f300
commit d1f992f3c2
12 changed files with 521 additions and 341 deletions
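At its core, the rework replaces the per-instruction can_reorder flag and barrier_interaction bitfield with a single memory_sync_info value describing the storage class, ordering semantics and scope of a memory access. A minimal before/after sketch, abridged from the visit_load_global() hunk further down (illustration only, not complete code):

/* Before: reordering information was split across two fields. */
// info.barrier = barrier_buffer;
// info.can_reorder = false;

/* After: one descriptor carries storage class, semantics and scope. */
// info.sync = get_memory_sync_info(instr, storage_buffer, 0);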
@@ -84,7 +84,7 @@ bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr)
    if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
           [&uses] (const Definition& def) { return uses[def.tempId()];}))
       return false;
-   return !instr_info.is_atomic[(int)instr->opcode];
+   return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
 }
 
 std::vector<uint16_t> dead_code_analysis(Program *program) {
|
|
|
|||
|
|
@ -273,8 +273,8 @@ struct wait_ctx {
|
|||
bool pending_flat_vm = false;
|
||||
bool pending_s_buffer_store = false; /* GFX10 workaround */
|
||||
|
||||
wait_imm barrier_imm[barrier_count];
|
||||
uint16_t barrier_events[barrier_count] = {}; /* use wait_event notion */
|
||||
wait_imm barrier_imm[storage_count];
|
||||
uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
|
||||
|
||||
std::map<PhysReg,wait_entry> gpr_map;
|
||||
|
||||
|
|
@ -327,7 +327,7 @@ struct wait_ctx {
|
|||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < barrier_count; i++) {
|
||||
for (unsigned i = 0; i < storage_count; i++) {
|
||||
changed |= barrier_imm[i].combine(other->barrier_imm[i]);
|
||||
changed |= other->barrier_events[i] & ~barrier_events[i];
|
||||
barrier_events[i] |= other->barrier_events[i];
|
||||
|
|
@ -444,7 +444,31 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
|
|||
return wait_imm();
|
||||
}
|
||||
|
||||
wait_imm kill(Instruction* instr, wait_ctx& ctx)
|
||||
wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
|
||||
{
|
||||
wait_imm imm;
|
||||
sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
|
||||
if (sync.semantics & semantics) {
|
||||
unsigned storage = sync.storage;
|
||||
while (storage) {
|
||||
unsigned idx = u_bit_scan(&storage);
|
||||
|
||||
/* LDS is private to the workgroup */
|
||||
sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);
|
||||
|
||||
uint16_t events = ctx.barrier_events[idx];
|
||||
if (bar_scope_lds <= subgroup_scope)
|
||||
events &= ~event_lds;
|
||||
|
||||
if (events)
|
||||
imm.combine(ctx.barrier_imm[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
return imm;
|
||||
}
|
||||
|
||||
wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
|
||||
{
|
||||
wait_imm imm;
|
||||
if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
|
||||
|
|
@ -471,44 +495,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
|
|||
SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr);
|
||||
if (ctx.pending_s_buffer_store &&
|
||||
!smem->definitions.empty() &&
|
||||
!smem->can_reorder && smem->barrier == barrier_buffer) {
|
||||
!smem->sync.can_reorder()) {
|
||||
imm.lgkm = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (instr->format == Format::PSEUDO_BARRIER) {
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::p_memory_barrier_common:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
|
||||
if (ctx.program->workgroup_size > ctx.program->wave_size)
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
|
||||
break;
|
||||
case aco_opcode::p_memory_barrier_atomic:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
|
||||
break;
|
||||
/* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */
|
||||
case aco_opcode::p_memory_barrier_buffer:
|
||||
case aco_opcode::p_memory_barrier_image:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
|
||||
break;
|
||||
case aco_opcode::p_memory_barrier_shared:
|
||||
if (ctx.program->workgroup_size > ctx.program->wave_size)
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
|
||||
break;
|
||||
case aco_opcode::p_memory_barrier_gs_data:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_gs_data) - 1]);
|
||||
break;
|
||||
case aco_opcode::p_memory_barrier_gs_sendmsg:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_gs_sendmsg) - 1]);
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (instr->opcode == aco_opcode::p_barrier)
|
||||
imm.combine(perform_barrier(ctx, static_cast<Pseudo_barrier_instruction *>(instr)->sync, semantic_acqrel));
|
||||
else
|
||||
imm.combine(perform_barrier(ctx, sync_info, semantic_release));
|
||||
|
||||
if (!imm.empty()) {
|
||||
if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
|
||||
|
|
@ -523,7 +518,7 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
|
|||
ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
|
||||
|
||||
/* update barrier wait imms */
|
||||
for (unsigned i = 0; i < barrier_count; i++) {
|
||||
for (unsigned i = 0; i < storage_count; i++) {
|
||||
wait_imm& bar = ctx.barrier_imm[i];
|
||||
uint16_t& bar_ev = ctx.barrier_events[i];
|
||||
if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
|
||||
|
|
@ -581,12 +576,12 @@ void update_barrier_counter(uint8_t *ctr, unsigned max)
|
|||
(*ctr)++;
|
||||
}
|
||||
|
||||
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barrier_interaction barrier)
|
||||
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
|
||||
{
|
||||
for (unsigned i = 0; i < barrier_count; i++) {
|
||||
for (unsigned i = 0; i < storage_count; i++) {
|
||||
wait_imm& bar = ctx.barrier_imm[i];
|
||||
uint16_t& bar_ev = ctx.barrier_events[i];
|
||||
if (barrier & (1 << i)) {
|
||||
if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
|
||||
bar_ev |= event;
|
||||
if (counters & counter_lgkm)
|
||||
bar.lgkm = 0;
|
||||
|
|
@ -609,7 +604,7 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barri
|
|||
}
|
||||
}
|
||||
|
||||
void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none)
|
||||
void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info())
|
||||
{
|
||||
uint8_t counters = get_counters_for_event(event);
|
||||
|
||||
|
|
@ -622,7 +617,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie
|
|||
if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
|
||||
ctx.vs_cnt++;
|
||||
|
||||
update_barrier_imm(ctx, counters, event, barrier);
|
||||
update_barrier_imm(ctx, counters, event, sync);
|
||||
|
||||
if (ctx.unordered_events & event)
|
||||
return;
|
||||
|
|
@ -651,7 +646,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie
|
|||
}
|
||||
}
|
||||
|
||||
void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none)
|
||||
void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info())
|
||||
{
|
||||
assert(ctx.chip_class < GFX10);
|
||||
|
||||
|
|
@ -660,7 +655,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=ba
|
|||
if (ctx.vm_cnt <= ctx.max_vm_cnt)
|
||||
ctx.vm_cnt++;
|
||||
|
||||
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, barrier);
|
||||
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
|
||||
|
||||
for (std::pair<PhysReg,wait_entry> e : ctx.gpr_map)
|
||||
{
|
||||
|
|
@ -748,10 +743,11 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
break;
|
||||
}
|
||||
case Format::FLAT: {
|
||||
FLAT_instruction *flat = static_cast<FLAT_instruction*>(instr);
|
||||
if (ctx.chip_class < GFX10 && !instr->definitions.empty())
|
||||
update_counters_for_flat_load(ctx, barrier_buffer);
|
||||
update_counters_for_flat_load(ctx, flat->sync);
|
||||
else
|
||||
update_counters(ctx, event_flat, barrier_buffer);
|
||||
update_counters(ctx, event_flat, flat->sync);
|
||||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], event_flat);
|
||||
|
|
@ -759,27 +755,26 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
}
|
||||
case Format::SMEM: {
|
||||
SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
|
||||
update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
|
||||
update_counters(ctx, event_smem, smem->sync);
|
||||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], event_smem);
|
||||
else if (ctx.chip_class >= GFX10 &&
|
||||
!smem->can_reorder &&
|
||||
smem->barrier == barrier_buffer)
|
||||
!smem->sync.can_reorder())
|
||||
ctx.pending_s_buffer_store = true;
|
||||
|
||||
break;
|
||||
}
|
||||
case Format::DS: {
|
||||
bool gds = static_cast<DS_instruction*>(instr)->gds;
|
||||
update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared);
|
||||
if (gds)
|
||||
DS_instruction *ds = static_cast<DS_instruction*>(instr);
|
||||
update_counters(ctx, ds->gds ? event_gds : event_lds, ds->sync);
|
||||
if (ds->gds)
|
||||
update_counters(ctx, event_gds_gpr_lock);
|
||||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds);
|
||||
insert_wait_entry(ctx, instr->definitions[0], ds->gds ? event_gds : event_lds);
|
||||
|
||||
if (gds) {
|
||||
if (ds->gds) {
|
||||
for (const Operand& op : instr->operands)
|
||||
insert_wait_entry(ctx, op, event_gds_gpr_lock);
|
||||
insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
|
||||
|
|
@ -791,7 +786,7 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
case Format::MIMG:
|
||||
case Format::GLOBAL: {
|
||||
wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
|
||||
update_counters(ctx, ev, get_barrier_interaction(instr));
|
||||
update_counters(ctx, ev, get_sync_info(instr));
|
||||
|
||||
bool has_sampler = instr->format == Format::MIMG && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
|
||||
|
||||
|
|
@ -817,7 +812,7 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
case Format::SOPP: {
|
||||
if (instr->opcode == aco_opcode::s_sendmsg ||
|
||||
instr->opcode == aco_opcode::s_sendmsghalt)
|
||||
update_counters(ctx, event_sendmsg, get_barrier_interaction(instr));
|
||||
update_counters(ctx, event_sendmsg);
|
||||
}
|
||||
default:
|
||||
break;
|
||||
|
|
@ -851,7 +846,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
|
|||
for (aco_ptr<Instruction>& instr : block.instructions) {
|
||||
bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
|
||||
|
||||
queued_imm.combine(kill(instr.get(), ctx));
|
||||
memory_sync_info sync_info = get_sync_info(instr.get());
|
||||
queued_imm.combine(kill(instr.get(), ctx, sync_info));
|
||||
|
||||
ctx.gen_instr = instr.get();
|
||||
gen(instr.get(), ctx);
|
||||
|
|
@ -863,6 +859,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
|
|||
}
|
||||
new_instructions.emplace_back(std::move(instr));
|
||||
|
||||
queued_imm.combine(perform_barrier(ctx, sync_info, semantic_acquire));
|
||||
|
||||
if (ctx.collect_statistics)
|
||||
ctx.advance_unwaited_instrs();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3137,8 +3137,7 @@ struct LoadEmitInfo {
|
|||
|
||||
bool glc = false;
|
||||
unsigned swizzle_component_size = 0;
|
||||
barrier_interaction barrier = barrier_none;
|
||||
bool can_reorder = true;
|
||||
memory_sync_info sync;
|
||||
Temp soffset = Temp(0, s1);
|
||||
};
|
||||
|
||||
|
|
@ -3441,10 +3440,12 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
|
||||
RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
|
||||
Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
|
||||
Instruction *instr;
|
||||
if (read2)
|
||||
bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
|
||||
instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
|
||||
else
|
||||
bld.ds(op, Definition(val), offset, m, const_offset);
|
||||
instr = bld.ds(op, Definition(val), offset, m, const_offset);
|
||||
static_cast<DS_instruction *>(instr)->sync = info->sync;
|
||||
|
||||
if (size < 4)
|
||||
val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
|
||||
|
|
@ -3490,8 +3491,7 @@ Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
load->definitions[0] = Definition(val);
|
||||
load->glc = info->glc;
|
||||
load->dlc = info->glc && bld.program->chip_class >= GFX10;
|
||||
load->barrier = info->barrier;
|
||||
load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
|
||||
load->sync = info->sync;
|
||||
bld.insert(std::move(load));
|
||||
return val;
|
||||
}
|
||||
|
|
@ -3540,8 +3540,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
mubuf->offen = (offset.type() == RegType::vgpr);
|
||||
mubuf->glc = info->glc;
|
||||
mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
|
||||
mubuf->barrier = info->barrier;
|
||||
mubuf->can_reorder = info->can_reorder;
|
||||
mubuf->sync = info->sync;
|
||||
mubuf->offset = const_offset;
|
||||
mubuf->swizzled = info->swizzle_component_size != 0;
|
||||
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
|
||||
|
|
@ -3605,7 +3604,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
mubuf->offset = 0;
|
||||
mubuf->addr64 = offset.type() == RegType::vgpr;
|
||||
mubuf->disable_wqm = false;
|
||||
mubuf->barrier = info->barrier;
|
||||
mubuf->sync = info->sync;
|
||||
mubuf->definitions[0] = Definition(val);
|
||||
bld.insert(std::move(mubuf));
|
||||
} else {
|
||||
|
|
@ -3616,7 +3615,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
flat->operands[1] = Operand(s1);
|
||||
flat->glc = info->glc;
|
||||
flat->dlc = info->glc && bld.program->chip_class >= GFX10;
|
||||
flat->barrier = info->barrier;
|
||||
flat->sync = info->sync;
|
||||
flat->offset = 0u;
|
||||
flat->definitions[0] = Definition(val);
|
||||
bld.insert(std::move(flat));
|
||||
|
|
@ -3638,8 +3637,7 @@ Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
|
|||
LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
|
||||
info.align_mul = align;
|
||||
info.align_offset = 0;
|
||||
info.barrier = barrier_shared;
|
||||
info.can_reorder = false;
|
||||
info.sync = memory_sync_info(storage_shared);
|
||||
info.const_offset = base_offset;
|
||||
emit_lds_load(ctx, bld, &info);
|
||||
|
||||
|
|
@ -3848,13 +3846,16 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t
|
|||
}
|
||||
assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
|
||||
|
||||
Instruction *instr;
|
||||
if (write2) {
|
||||
Temp second_data = write_datas[second];
|
||||
inline_offset /= data.bytes();
|
||||
bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
|
||||
instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
|
||||
} else {
|
||||
bld.ds(op, address_offset, data, m, inline_offset);
|
||||
instr = bld.ds(op, address_offset, data, m, inline_offset);
|
||||
}
|
||||
static_cast<DS_instruction *>(instr)->sync =
|
||||
memory_sync_info(storage_shared);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -4017,7 +4018,8 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
|
|||
/* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
|
||||
/* dlc*/ false, /* slc */ slc);
|
||||
|
||||
static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
|
||||
if (!allow_reorder)
|
||||
static_cast<MUBUF_instruction *>(r.instr)->sync = memory_sync_info(storage_buffer, semantic_private);
|
||||
}
|
||||
|
||||
void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
|
||||
|
|
@ -4847,15 +4849,13 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
}
|
||||
|
||||
if (use_mubuf) {
|
||||
Instruction *mubuf = bld.mubuf(opcode,
|
||||
bld.mubuf(opcode,
|
||||
Definition(fetch_dst), list, fetch_index, soffset,
|
||||
fetch_offset, false, false, true).instr;
|
||||
static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
|
||||
} else {
|
||||
Instruction *mtbuf = bld.mtbuf(opcode,
|
||||
bld.mtbuf(opcode,
|
||||
Definition(fetch_dst), list, fetch_index, soffset,
|
||||
fetch_dfmt, nfmt, fetch_offset, false, true).instr;
|
||||
static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
|
||||
}
|
||||
|
||||
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
|
||||
|
|
@ -5208,7 +5208,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
|
||||
void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
|
||||
Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
|
||||
bool glc=false, bool readonly=true, bool allow_smem=true)
|
||||
bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info())
|
||||
{
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
|
||||
|
|
@ -5218,8 +5218,7 @@ void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_
|
|||
|
||||
LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
|
||||
info.glc = glc;
|
||||
info.barrier = readonly ? barrier_none : barrier_buffer;
|
||||
info.can_reorder = readonly;
|
||||
info.sync = sync;
|
||||
info.align_mul = align_mul;
|
||||
info.align_offset = align_offset;
|
||||
if (use_smem)
|
||||
|
|
@ -5737,7 +5736,6 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec
|
|||
load->unrm = true;
|
||||
load->da = da;
|
||||
load->dim = dim;
|
||||
load->can_reorder = true; /* fmask images shouldn't be modified */
|
||||
ctx->block->instructions.emplace_back(std::move(load));
|
||||
|
||||
Operand sample_index4;
|
||||
|
|
@ -5837,6 +5835,22 @@ static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr
|
|||
}
|
||||
|
||||
|
||||
memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics)
|
||||
{
|
||||
/* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
|
||||
if (semantics & semantic_atomicrmw)
|
||||
return memory_sync_info(storage, semantics);
|
||||
|
||||
unsigned access = nir_intrinsic_access(instr);
|
||||
|
||||
if (access & ACCESS_VOLATILE)
|
||||
semantics |= semantic_volatile;
|
||||
if (access & ACCESS_CAN_REORDER)
|
||||
semantics |= semantic_can_reorder | semantic_private;
|
||||
|
||||
return memory_sync_info(storage, semantics);
|
||||
}
|
||||
|
||||
void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
|
||||
{
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
|
|
@ -5846,6 +5860,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
bool is_array = glsl_sampler_type_is_array(type);
|
||||
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
|
||||
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
|
||||
|
||||
if (dim == GLSL_SAMPLER_DIM_BUF) {
|
||||
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
|
||||
unsigned num_channels = util_last_bit(mask);
|
||||
|
|
@ -5882,7 +5898,7 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
load->idxen = true;
|
||||
load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
|
||||
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
|
||||
load->barrier = barrier_image;
|
||||
load->sync = sync;
|
||||
ctx->block->instructions.emplace_back(std::move(load));
|
||||
|
||||
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
|
||||
|
|
@ -5914,7 +5930,7 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
load->dmask = dmask;
|
||||
load->unrm = true;
|
||||
load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
|
||||
load->barrier = barrier_image;
|
||||
load->sync = sync;
|
||||
ctx->block->instructions.emplace_back(std::move(load));
|
||||
|
||||
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
|
||||
|
|
@ -5929,6 +5945,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
bool is_array = glsl_sampler_type_is_array(type);
|
||||
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
|
||||
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
|
||||
bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
|
||||
|
||||
if (dim == GLSL_SAMPLER_DIM_BUF) {
|
||||
|
|
@ -5960,7 +5977,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
store->glc = glc;
|
||||
store->dlc = false;
|
||||
store->disable_wqm = true;
|
||||
store->barrier = barrier_image;
|
||||
store->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
return;
|
||||
|
|
@ -5984,7 +6001,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
store->unrm = true;
|
||||
store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
|
||||
store->disable_wqm = true;
|
||||
store->barrier = barrier_image;
|
||||
store->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
return;
|
||||
|
|
@ -6062,6 +6079,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
}
|
||||
|
||||
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
|
||||
|
||||
if (dim == GLSL_SAMPLER_DIM_BUF) {
|
||||
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
|
||||
|
|
@ -6079,7 +6097,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mubuf->glc = return_previous;
|
||||
mubuf->dlc = false; /* Not needed for atomics */
|
||||
mubuf->disable_wqm = true;
|
||||
mubuf->barrier = barrier_image;
|
||||
mubuf->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
return;
|
||||
|
|
@ -6100,7 +6118,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mimg->unrm = true;
|
||||
mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
|
||||
mimg->disable_wqm = true;
|
||||
mimg->barrier = barrier_image;
|
||||
mimg->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mimg));
|
||||
return;
|
||||
|
|
@ -6164,7 +6182,6 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
|
||||
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
|
||||
mimg->da = glsl_sampler_type_is_array(type);
|
||||
mimg->can_reorder = true;
|
||||
Definition& def = mimg->definitions[0];
|
||||
ctx->block->instructions.emplace_back(std::move(mimg));
|
||||
|
||||
|
|
@ -6219,7 +6236,8 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
|
||||
|
||||
load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
|
||||
nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
|
||||
nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
|
||||
get_memory_sync_info(instr, storage_buffer, 0));
|
||||
}
|
||||
|
||||
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
||||
|
|
@ -6233,6 +6251,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
|
||||
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
|
||||
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
|
||||
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
|
||||
uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
|
||||
/* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
|
||||
|
|
@ -6275,7 +6294,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
store->glc = glc;
|
||||
store->dlc = false;
|
||||
store->disable_wqm = true;
|
||||
store->barrier = barrier_buffer;
|
||||
store->sync = sync;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
ctx->program->wb_smem_l1_on_end = true;
|
||||
if (op == aco_opcode::p_fs_buffer_store_smem) {
|
||||
|
|
@ -6293,7 +6312,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
store->glc = glc;
|
||||
store->dlc = false;
|
||||
store->disable_wqm = true;
|
||||
store->barrier = barrier_buffer;
|
||||
store->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
}
|
||||
|
|
@ -6384,7 +6403,7 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mubuf->glc = return_previous;
|
||||
mubuf->dlc = false; /* Not needed for atomics */
|
||||
mubuf->disable_wqm = true;
|
||||
mubuf->barrier = barrier_buffer;
|
||||
mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
}
|
||||
|
|
@ -6409,8 +6428,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
|
||||
info.align_mul = nir_intrinsic_align_mul(instr);
|
||||
info.align_offset = nir_intrinsic_align_offset(instr);
|
||||
info.barrier = barrier_buffer;
|
||||
info.can_reorder = false;
|
||||
info.sync = get_memory_sync_info(instr, storage_buffer, 0);
|
||||
/* VMEM stores don't update the SMEM cache and it's difficult to prove that
|
||||
* it's safe to use SMEM */
|
||||
bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
|
||||
|
|
@ -6430,6 +6448,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
|
||||
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
|
||||
Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
|
||||
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
|
||||
|
||||
if (ctx->options->chip_class >= GFX7)
|
||||
|
|
@ -6495,7 +6514,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
flat->dlc = false;
|
||||
flat->offset = offset;
|
||||
flat->disable_wqm = true;
|
||||
flat->barrier = barrier_buffer;
|
||||
flat->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(flat));
|
||||
} else {
|
||||
|
|
@ -6515,7 +6534,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mubuf->offset = offsets[i];
|
||||
mubuf->addr64 = addr.type() == RegType::vgpr;
|
||||
mubuf->disable_wqm = true;
|
||||
mubuf->barrier = barrier_buffer;
|
||||
mubuf->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
}
|
||||
|
|
@ -6608,7 +6627,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
flat->dlc = false; /* Not needed for atomics */
|
||||
flat->offset = 0;
|
||||
flat->disable_wqm = true;
|
||||
flat->barrier = barrier_buffer;
|
||||
flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(flat));
|
||||
} else {
|
||||
|
|
@ -6675,7 +6694,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mubuf->offset = 0;
|
||||
mubuf->addr64 = addr.type() == RegType::vgpr;
|
||||
mubuf->disable_wqm = true;
|
||||
mubuf->barrier = barrier_buffer;
|
||||
mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
}
|
||||
|
|
@ -6683,20 +6702,30 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
|
||||
void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
storage_class all_mem = (storage_class)(storage_buffer | storage_image | storage_atomic_counter | storage_shared);
|
||||
switch(instr->intrinsic) {
|
||||
case nir_intrinsic_group_memory_barrier:
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(all_mem, semantic_acqrel, scope_workgroup));
|
||||
break;
|
||||
case nir_intrinsic_memory_barrier:
|
||||
bld.barrier(aco_opcode::p_memory_barrier_common);
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(all_mem, semantic_acqrel, scope_device));
|
||||
break;
|
||||
case nir_intrinsic_memory_barrier_buffer:
|
||||
bld.barrier(aco_opcode::p_memory_barrier_buffer);
|
||||
break;
|
||||
case nir_intrinsic_memory_barrier_image:
|
||||
bld.barrier(aco_opcode::p_memory_barrier_image);
|
||||
/* since NIR splits barriers, we have to unify buffer and image barriers
|
||||
* for now so dEQP-VK.memory_model.message_passing.core11.u32.coherent.
|
||||
* fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
|
||||
* passes
|
||||
*/
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info((storage_class)(storage_buffer | storage_image), semantic_acqrel, scope_device));
|
||||
break;
|
||||
case nir_intrinsic_memory_barrier_tcs_patch:
|
||||
case nir_intrinsic_memory_barrier_shared:
|
||||
bld.barrier(aco_opcode::p_memory_barrier_shared);
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup));
|
||||
break;
|
||||
default:
|
||||
unreachable("Unimplemented memory barrier intrinsic");
|
||||
|
|
@ -6844,6 +6873,7 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
ds->offset0 = offset;
|
||||
if (return_previous)
|
||||
ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
|
||||
ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
|
||||
ctx->block->instructions.emplace_back(std::move(ds));
|
||||
}
|
||||
|
||||
|
|
@ -6884,7 +6914,7 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
|
|||
info.align_mul = nir_intrinsic_align_mul(instr);
|
||||
info.align_offset = nir_intrinsic_align_offset(instr);
|
||||
info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
|
||||
info.can_reorder = false;
|
||||
info.sync = memory_sync_info(storage_buffer, semantic_private);
|
||||
info.soffset = ctx->program->scratch_offset;
|
||||
emit_scratch_load(ctx, bld, &info);
|
||||
}
|
||||
|
|
@ -6907,7 +6937,8 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
|
|||
|
||||
for (unsigned i = 0; i < write_count; i++) {
|
||||
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
|
||||
bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
|
||||
Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
|
||||
static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_buffer, semantic_private);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -7021,8 +7052,7 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst
|
|||
mtbuf->offset = const_offset;
|
||||
mtbuf->glc = true;
|
||||
mtbuf->slc = true;
|
||||
mtbuf->barrier = barrier_gs_data;
|
||||
mtbuf->can_reorder = true;
|
||||
mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
|
||||
bld.insert(std::move(mtbuf));
|
||||
}
|
||||
|
||||
|
|
@ -7347,8 +7377,6 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
load->glc = false;
|
||||
load->dlc = false;
|
||||
load->disable_wqm = false;
|
||||
load->barrier = barrier_none;
|
||||
load->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(load));
|
||||
}
|
||||
|
||||
|
|
@ -7531,17 +7559,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
visit_get_buffer_size(ctx, instr);
|
||||
break;
|
||||
case nir_intrinsic_control_barrier: {
|
||||
if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||
/* GFX6 only (thanks to a hw bug workaround):
|
||||
* The real barrier instruction isn’t needed, because an entire patch
|
||||
* always fits into a single wave.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
|
||||
if (ctx->program->workgroup_size > ctx->program->wave_size)
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
|
||||
bld.barrier(aco_opcode::p_barrier, memory_sync_info(0, 0, scope_invocation), scope_workgroup);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_memory_barrier_tcs_patch:
|
||||
|
|
@ -8093,7 +8111,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
aco_opcode opcode =
|
||||
nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
|
||||
aco_opcode::s_memrealtime : aco_opcode::s_memtime;
|
||||
bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
|
||||
bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), memory_sync_info(0, semantic_volatile));
|
||||
emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
|
||||
break;
|
||||
}
|
||||
|
|
@ -8674,7 +8692,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
tex->da = da;
|
||||
tex->definitions[0] = Definition(tmp_dst);
|
||||
tex->dim = dim;
|
||||
tex->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(tex));
|
||||
|
||||
if (div_by_6) {
|
||||
|
|
@ -8707,7 +8724,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
tex->da = da;
|
||||
Temp size = bld.tmp(v2);
|
||||
tex->definitions[0] = Definition(size);
|
||||
tex->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(tex));
|
||||
emit_split_vector(ctx, size, size.size());
|
||||
|
||||
|
|
@ -8809,7 +8825,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
mubuf->operands[2] = Operand((uint32_t) 0);
|
||||
mubuf->definitions[0] = Definition(tmp_dst);
|
||||
mubuf->idxen = true;
|
||||
mubuf->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
|
||||
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
|
||||
|
|
@ -8858,7 +8873,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
tex->unrm = true;
|
||||
tex->da = da;
|
||||
tex->definitions[0] = Definition(tmp_dst);
|
||||
tex->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(tex));
|
||||
|
||||
if (instr->op == nir_texop_samples_identical) {
|
||||
|
|
@ -9002,7 +9016,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
tex->dmask = dmask;
|
||||
tex->da = da;
|
||||
tex->definitions[0] = Definition(tmp_dst);
|
||||
tex->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(tex));
|
||||
|
||||
if (tg4_integer_cube_workaround) {
|
||||
|
|
@ -10285,6 +10298,13 @@ static void create_fs_exports(isel_context *ctx)
|
|||
create_null_export(ctx);
|
||||
}
|
||||
|
||||
static void create_workgroup_barrier(Builder& bld)
|
||||
{
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup),
|
||||
scope_workgroup);
|
||||
}
|
||||
|
||||
static void write_tcs_tess_factors(isel_context *ctx)
|
||||
{
|
||||
unsigned outer_comps;
|
||||
|
|
@ -10309,9 +10329,7 @@ static void write_tcs_tess_factors(isel_context *ctx)
|
|||
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
|
||||
bld.barrier(aco_opcode::p_memory_barrier_shared);
|
||||
if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
create_workgroup_barrier(bld);
|
||||
|
||||
Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
|
||||
Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
|
||||
|
|
@ -10470,7 +10488,6 @@ static void emit_stream_output(isel_context *ctx,
|
|||
store->glc = true;
|
||||
store->dlc = false;
|
||||
store->slc = true;
|
||||
store->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
}
|
||||
}
|
||||
|
|
@ -10890,8 +10907,7 @@ void ngg_emit_nogs_output(isel_context *ctx)
|
|||
|
||||
if (ctx->stage == ngg_vertex_gs) {
|
||||
/* Wait for GS threads to store primitive ID in LDS. */
|
||||
bld.barrier(aco_opcode::p_memory_barrier_shared);
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
create_workgroup_barrier(bld);
|
||||
|
||||
/* Calculate LDS address where the GS threads stored the primitive ID. */
|
||||
Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
|
||||
|
|
@ -10975,8 +10991,7 @@ void select_program(Program *program,
|
|||
if (i) {
|
||||
Builder bld(ctx.program, ctx.block);
|
||||
|
||||
bld.barrier(aco_opcode::p_memory_barrier_shared);
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
create_workgroup_barrier(bld);
|
||||
|
||||
if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
|
||||
ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
|
||||
|
|
@ -10999,7 +11014,8 @@ void select_program(Program *program,
|
|||
ngg_emit_nogs_output(&ctx);
|
||||
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
|
||||
Builder bld(ctx.program, ctx.block);
|
||||
bld.barrier(aco_opcode::p_memory_barrier_gs_data);
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_vmem_output, semantic_release, scope_device));
|
||||
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
|
||||
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||
write_tcs_tess_factors(&ctx);
|
||||
|
|
@ -11031,7 +11047,7 @@ void select_program(Program *program,
|
|||
ctx.block->kind |= block_kind_uniform;
|
||||
Builder bld(ctx.program, ctx.block);
|
||||
if (ctx.program->wb_smem_l1_on_end)
|
||||
bld.smem(aco_opcode::s_dcache_wb, false);
|
||||
bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
|
||||
bld.sopp(aco_opcode::s_endpgm);
|
||||
|
||||
cleanup_cfg(program);
|
||||
|
|
@ -11116,8 +11132,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
|
|||
mubuf->glc = true;
|
||||
mubuf->slc = true;
|
||||
mubuf->dlc = args->options->chip_class >= GFX10;
|
||||
mubuf->barrier = barrier_none;
|
||||
mubuf->can_reorder = true;
|
||||
|
||||
ctx.outputs.mask[i] |= 1 << j;
|
||||
ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
|
||||
|
|
|
|||
|
|
@ -127,6 +127,28 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||
program->next_fp_mode.round32 = fp_round_ne;
|
||||
}
|
||||
|
||||
memory_sync_info get_sync_info(const Instruction* instr)
|
||||
{
|
||||
switch (instr->format) {
|
||||
case Format::SMEM:
|
||||
return static_cast<const SMEM_instruction*>(instr)->sync;
|
||||
case Format::MUBUF:
|
||||
return static_cast<const MUBUF_instruction*>(instr)->sync;
|
||||
case Format::MIMG:
|
||||
return static_cast<const MIMG_instruction*>(instr)->sync;
|
||||
case Format::MTBUF:
|
||||
return static_cast<const MTBUF_instruction*>(instr)->sync;
|
||||
case Format::FLAT:
|
||||
case Format::GLOBAL:
|
||||
case Format::SCRATCH:
|
||||
return static_cast<const FLAT_instruction*>(instr)->sync;
|
||||
case Format::DS:
|
||||
return static_cast<const DS_instruction*>(instr)->sync;
|
||||
default:
|
||||
return memory_sync_info();
|
||||
}
|
||||
}
|
||||
|
||||
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (!instr->isVALU())
|
||||
|
|
|
|||
|
|
@@ -103,22 +103,79 @@ enum class Format : std::uint16_t {
    SDWA = 1 << 14,
 };
 
-enum barrier_interaction : uint8_t {
-   barrier_none = 0,
-   barrier_buffer = 0x1,
-   barrier_image = 0x2,
-   barrier_atomic = 0x4,
-   barrier_shared = 0x8,
-   /* used for geometry shaders to ensure vertex data writes are before the
-    * GS_DONE s_sendmsg. */
-   barrier_gs_data = 0x10,
-   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
-   barrier_gs_sendmsg = 0x20,
-   /* used by barriers. created by s_barrier */
-   barrier_barrier = 0x40,
-   barrier_count = 7,
+enum storage_class : uint8_t {
+   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
+   storage_buffer = 0x1, /* SSBOs and global memory */
+   storage_atomic_counter = 0x2, /* not used for Vulkan */
+   storage_image = 0x4,
+   storage_shared = 0x8, /* or TCS output */
+   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
+   storage_scratch = 0x20,
+   storage_vgpr_spill = 0x40,
+   storage_count = 8,
 };
 
+enum memory_semantics : uint8_t {
+   semantic_none = 0x0,
+   /* for loads: don't move any access after this load to before this load (even other loads)
+    * for barriers: don't move any access after the barrier to before any
+    * atomics/control_barriers/sendmsg_gs_done before the barrier */
+   semantic_acquire = 0x1,
+   /* for stores: don't move any access before this store to after this store
+    * for barriers: don't move any access before the barrier to after any
+    * atomics/control_barriers/sendmsg_gs_done after the barrier */
+   semantic_release = 0x2,
+
+   /* the rest are for loads/stores/atomics only */
+   /* cannot be DCE'd or CSE'd */
+   semantic_volatile = 0x4,
+   /* does not interact with barriers and assumes this lane is the only lane
+    * accessing this memory */
+   semantic_private = 0x8,
+   /* this operation can be reordered around operations of the same storage. says nothing about barriers */
+   semantic_can_reorder = 0x10,
+   /* this is an atomic instruction (may only read or write memory) */
+   semantic_atomic = 0x20,
+   /* this instruction both reads and writes memory */
+   semantic_rmw = 0x40,
+
+   semantic_acqrel = semantic_acquire | semantic_release,
+   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
+};
+
+enum sync_scope : uint8_t {
+   scope_invocation = 0,
+   scope_subgroup = 1,
+   scope_workgroup = 2,
+   scope_queuefamily = 3,
+   scope_device = 4,
+};
+
+struct memory_sync_info {
+   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
+   memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
+      : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
+
+   storage_class storage:8;
+   memory_semantics semantics:8;
+   sync_scope scope:8;
+
+   bool operator == (const memory_sync_info& rhs) const {
+      return storage == rhs.storage &&
+             semantics == rhs.semantics &&
+             scope == rhs.scope;
+   }
+
+   bool can_reorder() const {
+      if (semantics & semantic_acqrel)
+         return false;
+      /* Also check storage so that zero-initialized memory_sync_info can be
+       * reordered. */
+      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
+   }
+};
+static_assert(sizeof(memory_sync_info) == 3);
+
 enum fp_round {
    fp_round_ne = 0,
    fp_round_pi = 1,
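To make the can_reorder() rule above concrete, here is a standalone sketch: it re-declares a trimmed copy of the enums and struct added in this hunk (only what is needed to compile) and asserts the behaviour they imply. The assertions are illustrative and are not part of the commit.

#include <cassert>
#include <cstdint>

/* Trimmed, standalone copy of the definitions added in this hunk, kept only
 * for this example (the real ones live in the ACO IR header). */
enum storage_class : uint8_t { storage_none = 0x0, storage_buffer = 0x1 };
enum memory_semantics : uint8_t {
   semantic_none = 0x0, semantic_acquire = 0x1, semantic_release = 0x2,
   semantic_volatile = 0x4, semantic_can_reorder = 0x10,
   semantic_acqrel = semantic_acquire | semantic_release,
};
enum sync_scope : uint8_t { scope_invocation = 0 };

struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
      : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}

   storage_class storage:8;
   memory_semantics semantics:8;
   sync_scope scope:8;

   bool can_reorder() const {
      if (semantics & semantic_acqrel)
         return false;
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};

int main()
{
   /* A zero-initialized sync info (no storage) stays reorderable. */
   assert(memory_sync_info().can_reorder());
   /* A plain buffer access is not reorderable by default... */
   assert(!memory_sync_info(storage_buffer).can_reorder());
   /* ...unless it carries semantic_can_reorder... */
   assert(memory_sync_info(storage_buffer, semantic_can_reorder).can_reorder());
   /* ...and volatile or acquire/release semantics always pin it in place. */
   assert(!memory_sync_info(storage_buffer, semantic_can_reorder | semantic_volatile).can_reorder());
   assert(!memory_sync_info(storage_buffer, semantic_acquire).can_reorder());
   return 0;
}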
@ -931,14 +988,13 @@ static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected p
|
|||
*
|
||||
*/
|
||||
struct SMEM_instruction : public Instruction {
|
||||
barrier_interaction barrier;
|
||||
memory_sync_info sync;
|
||||
bool glc : 1; /* VI+: globally coherent */
|
||||
bool dlc : 1; /* NAVI: device level coherent */
|
||||
bool nv : 1; /* VEGA only: Non-volatile */
|
||||
bool can_reorder : 1;
|
||||
bool disable_wqm : 1;
|
||||
bool prevent_overflow : 1; /* avoid overflow when combining additions */
|
||||
uint32_t padding: 18;
|
||||
uint32_t padding: 3;
|
||||
};
|
||||
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
|
||||
|
|
@ -1066,11 +1122,13 @@ static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected
|
|||
*
|
||||
*/
|
||||
struct DS_instruction : public Instruction {
|
||||
memory_sync_info sync;
|
||||
bool gds;
|
||||
int16_t offset0;
|
||||
int8_t offset1;
|
||||
bool gds;
|
||||
uint8_t padding;
|
||||
};
|
||||
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
/**
|
||||
* Vector Memory Untyped-buffer Instructions
|
||||
|
|
@ -1081,7 +1139,7 @@ static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected pad
|
|||
*
|
||||
*/
|
||||
struct MUBUF_instruction : public Instruction {
|
||||
uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
|
||||
memory_sync_info sync;
|
||||
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
|
||||
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
|
||||
bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
|
||||
|
|
@ -1091,12 +1149,11 @@ struct MUBUF_instruction : public Instruction {
|
|||
bool tfe : 1; /* texture fail enable */
|
||||
bool lds : 1; /* Return read-data to LDS instead of VGPRs */
|
||||
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
|
||||
bool can_reorder : 1;
|
||||
bool swizzled:1;
|
||||
uint8_t padding : 1;
|
||||
barrier_interaction barrier;
|
||||
uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
|
||||
bool swizzled : 1;
|
||||
uint32_t padding1 : 18;
|
||||
};
|
||||
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
/**
|
||||
* Vector Memory Typed-buffer Instructions
|
||||
|
|
@ -1107,8 +1164,7 @@ static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected
|
|||
*
|
||||
*/
|
||||
struct MTBUF_instruction : public Instruction {
|
||||
uint16_t offset; /* Unsigned byte offset - 12 bit */
|
||||
barrier_interaction barrier;
|
||||
memory_sync_info sync;
|
||||
uint8_t dfmt : 4; /* Data Format of data in memory buffer */
|
||||
uint8_t nfmt : 3; /* Numeric format of data in memory */
|
||||
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
|
||||
|
|
@ -1118,8 +1174,8 @@ struct MTBUF_instruction : public Instruction {
|
|||
bool slc : 1; /* system level coherent */
|
||||
bool tfe : 1; /* texture fail enable */
|
||||
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
|
||||
bool can_reorder : 1;
|
||||
uint32_t padding : 25;
|
||||
uint32_t padding : 10;
|
||||
uint16_t offset; /* Unsigned byte offset - 12 bit */
|
||||
};
|
||||
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
|
|
@ -1133,6 +1189,7 @@ static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected
|
|||
*
|
||||
*/
|
||||
struct MIMG_instruction : public Instruction {
|
||||
memory_sync_info sync;
|
||||
uint8_t dmask; /* Data VGPR enable mask */
|
||||
uint8_t dim : 3; /* NAVI: dimensionality */
|
||||
bool unrm : 1; /* Force address to be un-normalized */
|
||||
|
|
@ -1146,11 +1203,9 @@ struct MIMG_instruction : public Instruction {
|
|||
bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
|
||||
bool d16 : 1; /* Convert 32-bit data to 16-bit data */
|
||||
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
|
||||
bool can_reorder : 1;
|
||||
uint8_t padding : 1;
|
||||
barrier_interaction barrier;
|
||||
uint32_t padding : 18;
|
||||
};
|
||||
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
/**
|
||||
* Flat/Scratch/Global Instructions
|
||||
|
|
@ -1160,18 +1215,18 @@ static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected p
|
|||
*
|
||||
*/
|
||||
struct FLAT_instruction : public Instruction {
|
||||
uint16_t offset; /* Vega/Navi only */
|
||||
memory_sync_info sync;
|
||||
bool slc : 1; /* system level coherent */
|
||||
bool glc : 1; /* globally coherent */
|
||||
bool dlc : 1; /* NAVI: device level coherent */
|
||||
bool lds : 1;
|
||||
bool nv : 1;
|
||||
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
|
||||
bool can_reorder : 1;
|
||||
uint8_t padding : 1;
|
||||
barrier_interaction barrier;
|
||||
uint32_t padding0 : 2;
|
||||
uint16_t offset; /* Vega/Navi only */
|
||||
uint16_t padding1;
|
||||
};
|
||||
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
struct Export_instruction : public Instruction {
|
||||
uint8_t enabled_mask;
|
||||
|
|
@ -1200,8 +1255,10 @@ struct Pseudo_branch_instruction : public Instruction {
|
|||
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
struct Pseudo_barrier_instruction : public Instruction {
|
||||
memory_sync_info sync;
|
||||
sync_scope exec_scope;
|
||||
};
|
||||
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
|
||||
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
|
||||
enum ReduceOp : uint16_t {
|
||||
iadd8, iadd16, iadd32, iadd64,
|
||||
|
|
@ -1298,7 +1355,8 @@ static inline bool is_phi(aco_ptr<Instruction>& instr)
|
|||
return is_phi(instr.get());
|
||||
}
|
||||
|
||||
barrier_interaction get_barrier_interaction(const Instruction* instr);
|
||||
memory_sync_info get_sync_info(const Instruction* instr);
|
||||
|
||||
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
|
||||
|
||||
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
|
||||
|
|
|
|||
|
|
@ -1854,6 +1854,7 @@ void lower_to_hw_instr(Program* program)
|
|||
emit_gfx10_wave64_bpermute(program, instr, bld);
|
||||
else
|
||||
unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
|
|
@ -1908,6 +1909,18 @@ void lower_to_hw_instr(Program* program)
|
|||
reduce->operands[2].physReg(), // vtmp
|
||||
reduce->definitions[2].physReg(), // sitmp
|
||||
reduce->operands[0], reduce->definitions[0]);
|
||||
} else if (instr->format == Format::PSEUDO_BARRIER) {
|
||||
Pseudo_barrier_instruction* barrier = static_cast<Pseudo_barrier_instruction*>(instr.get());
|
||||
|
||||
/* Anything larger than a workgroup isn't possible. Anything
|
||||
* smaller requires no instructions and this pseudo instruction
|
||||
* would only be included to control optimizations. */
|
||||
bool emit_s_barrier = barrier->exec_scope == scope_workgroup &&
|
||||
program->workgroup_size > program->wave_size;
|
||||
|
||||
bld.insert(std::move(instr));
|
||||
if (emit_s_barrier)
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
} else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
|
||||
float_mode new_mode = block->fp_mode;
|
||||
new_mode.round16_64 = fp_round_ne;
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ class Format(Enum):
|
|||
return [('uint32_t', 'block', '-1'),
|
||||
('uint32_t', 'imm', '0')]
|
||||
elif self == Format.SMEM:
|
||||
return [('bool', 'can_reorder', 'true'),
|
||||
return [('memory_sync_info', 'sync', 'memory_sync_info()'),
|
||||
('bool', 'glc', 'false'),
|
||||
('bool', 'dlc', 'false'),
|
||||
('bool', 'nv', 'false')]
|
||||
|
|
@ -123,6 +123,9 @@ class Format(Enum):
|
|||
elif self == Format.PSEUDO_REDUCTION:
|
||||
return [('ReduceOp', 'op', None, 'reduce_op'),
|
||||
('unsigned', 'cluster_size', '0')]
|
||||
elif self == Format.PSEUDO_BARRIER:
|
||||
return [('memory_sync_info', 'sync', None),
|
||||
('sync_scope', 'exec_scope', 'scope_invocation')]
|
||||
elif self == Format.VINTRP:
|
||||
return [('unsigned', 'attribute', None),
|
||||
('unsigned', 'component', None)]
|
||||
|
|
@ -133,7 +136,7 @@ class Format(Enum):
|
|||
('bool', 'bound_ctrl', 'true')]
|
||||
elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
|
||||
return [('uint16_t', 'offset', 0),
|
||||
('bool', 'can_reorder', 'true'),
|
||||
('memory_sync_info', 'sync', 'memory_sync_info()'),
|
||||
('bool', 'glc', 'false'),
|
||||
('bool', 'slc', 'false'),
|
||||
('bool', 'lds', 'false'),
|
||||
|
|
@ -265,13 +268,7 @@ opcode("p_cbranch", format=Format.PSEUDO_BRANCH)
|
|||
opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
|
||||
opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)
|
||||
|
||||
opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared
|
||||
opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_barrier", format=Format.PSEUDO_BARRIER)
|
||||
|
||||
opcode("p_spill")
|
||||
opcode("p_reload")
|
||||
|
|
|
|||
|
|
@ -224,12 +224,15 @@ struct InstrPred {
|
|||
return aK->imm == bK->imm;
|
||||
}
|
||||
case Format::SMEM: {
|
||||
if (!a->operands.empty() && a->operands[0].bytes() == 16)
|
||||
return false;
|
||||
SMEM_instruction* aS = static_cast<SMEM_instruction*>(a);
|
||||
SMEM_instruction* bS = static_cast<SMEM_instruction*>(b);
|
||||
/* isel shouldn't be creating situations where this assertion fails */
|
||||
assert(aS->prevent_overflow == bS->prevent_overflow);
|
||||
return aS->can_reorder && bS->can_reorder &&
|
||||
aS->glc == bS->glc && aS->nv == bS->nv &&
|
||||
return aS->sync.can_reorder() && bS->sync.can_reorder() &&
|
||||
aS->sync == bS->sync && aS->glc == bS->glc && aS->dlc == bS->dlc &&
|
||||
aS->nv == bS->nv && aS->disable_wqm == bS->disable_wqm &&
|
||||
aS->prevent_overflow == bS->prevent_overflow;
|
||||
}
|
||||
case Format::VINTRP: {
|
||||
|
|

@ -251,8 +254,8 @@ struct InstrPred {
         case Format::MTBUF: {
            MTBUF_instruction* aM = static_cast<MTBUF_instruction *>(a);
            MTBUF_instruction* bM = static_cast<MTBUF_instruction *>(b);
            return aM->can_reorder && bM->can_reorder &&
                   aM->barrier == bM->barrier &&
            return aM->sync.can_reorder() && bM->sync.can_reorder() &&
                   aM->sync == bM->sync &&
                   aM->dfmt == bM->dfmt &&
                   aM->nfmt == bM->nfmt &&
                   aM->offset == bM->offset &&

@ -267,8 +270,8 @@ struct InstrPred {
         case Format::MUBUF: {
            MUBUF_instruction* aM = static_cast<MUBUF_instruction *>(a);
            MUBUF_instruction* bM = static_cast<MUBUF_instruction *>(b);
            return aM->can_reorder && bM->can_reorder &&
                   aM->barrier == bM->barrier &&
            return aM->sync.can_reorder() && bM->sync.can_reorder() &&
                   aM->sync == bM->sync &&
                   aM->offset == bM->offset &&
                   aM->offen == bM->offen &&
                   aM->idxen == bM->idxen &&

@ -295,7 +298,9 @@ struct InstrPred {
               return false;
            DS_instruction* aD = static_cast<DS_instruction *>(a);
            DS_instruction* bD = static_cast<DS_instruction *>(b);
            return aD->pass_flags == bD->pass_flags &&
            return aD->sync.can_reorder() && bD->sync.can_reorder() &&
                   aD->sync == bD->sync &&
                   aD->pass_flags == bD->pass_flags &&
                   aD->gds == bD->gds &&
                   aD->offset0 == bD->offset0 &&
                   aD->offset1 == bD->offset1;

@ -303,8 +308,8 @@ struct InstrPred {
         case Format::MIMG: {
            MIMG_instruction* aM = static_cast<MIMG_instruction*>(a);
            MIMG_instruction* bM = static_cast<MIMG_instruction*>(b);
            return aM->can_reorder && bM->can_reorder &&
                   aM->barrier == bM->barrier &&
            return aM->sync.can_reorder() && bM->sync.can_reorder() &&
                   aM->sync == bM->sync &&
                   aM->dmask == bM->dmask &&
                   aM->unrm == bM->unrm &&
                   aM->glc == bM->glc &&

@ -1026,8 +1026,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
         new_instr->operands.back() = Operand(base);
         if (!smem->definitions.empty())
            new_instr->definitions[0] = smem->definitions[0];
         new_instr->can_reorder = smem->can_reorder;
         new_instr->barrier = smem->barrier;
         new_instr->sync = smem->sync;
         new_instr->glc = smem->glc;
         new_instr->dlc = smem->dlc;
         new_instr->nv = smem->nv;

@ -189,23 +189,73 @@ static void print_definition(const Definition *definition, FILE *output)
   print_physReg(definition->physReg(), definition->bytes(), output);
}

static void print_barrier_reorder(bool can_reorder, barrier_interaction barrier, FILE *output)
static void print_storage(storage_class storage, FILE *output)
{
   if (can_reorder)
      fprintf(output, " reorder");
   fprintf(output, " storage:");
   int printed = 0;
   if (storage & storage_buffer)
      printed += fprintf(output, "%sbuffer", printed ? "," : "");
   if (storage & storage_atomic_counter)
      printed += fprintf(output, "%satomic_counter", printed ? "," : "");
   if (storage & storage_image)
      printed += fprintf(output, "%simage", printed ? "," : "");
   if (storage & storage_shared)
      printed += fprintf(output, "%sshared", printed ? "," : "");
   if (storage & storage_vmem_output)
      printed += fprintf(output, "%svmem_output", printed ? "," : "");
   if (storage & storage_scratch)
      printed += fprintf(output, "%sscratch", printed ? "," : "");
   if (storage & storage_vgpr_spill)
      printed += fprintf(output, "%svgpr_spill", printed ? "," : "");
}

   if (barrier & barrier_buffer)
      fprintf(output, " buffer");
   if (barrier & barrier_image)
      fprintf(output, " image");
   if (barrier & barrier_atomic)
      fprintf(output, " atomic");
   if (barrier & barrier_shared)
      fprintf(output, " shared");
   if (barrier & barrier_gs_data)
      fprintf(output, " gs_data");
   if (barrier & barrier_gs_sendmsg)
      fprintf(output, " gs_sendmsg");
static void print_semantics(memory_semantics sem, FILE *output)
{
   fprintf(output, " semantics:");
   int printed = 0;
   if (sem & semantic_acquire)
      printed += fprintf(output, "%sacquire", printed ? "," : "");
   if (sem & semantic_release)
      printed += fprintf(output, "%srelease", printed ? "," : "");
   if (sem & semantic_volatile)
      printed += fprintf(output, "%svolatile", printed ? "," : "");
   if (sem & semantic_private)
      printed += fprintf(output, "%sprivate", printed ? "," : "");
   if (sem & semantic_can_reorder)
      printed += fprintf(output, "%sreorder", printed ? "," : "");
   if (sem & semantic_atomic)
      printed += fprintf(output, "%satomic", printed ? "," : "");
   if (sem & semantic_rmw)
      printed += fprintf(output, "%srmw", printed ? "," : "");
}

static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope")
{
   fprintf(output, " %s:", prefix);
   switch (scope) {
   case scope_invocation:
      fprintf(output, "invocation");
      break;
   case scope_subgroup:
      fprintf(output, "subgroup");
      break;
   case scope_workgroup:
      fprintf(output, "workgroup");
      break;
   case scope_queuefamily:
      fprintf(output, "queuefamily");
      break;
   case scope_device:
      fprintf(output, "device");
      break;
   }
}

static void print_sync(memory_sync_info sync, FILE *output)
{
   print_storage(sync.storage, output);
   print_semantics(sync.semantics, output);
   print_scope(sync.scope, output);
}

static void print_instr_format_specific(const Instruction *instr, FILE *output)
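
The printers above build a single comma-separated list by reusing fprintf's return value (the number of characters written) to decide whether a separator is needed. A stand-alone sketch of the same idiom, with hypothetical flag names:

   #include <cstdio>

   enum { flag_buffer = 1, flag_image = 2, flag_shared = 4 }; /* hypothetical */

   static void print_flags(unsigned flags, FILE *output)
   {
      fprintf(output, " storage:");
      int printed = 0;
      if (flags & flag_buffer)
         printed += fprintf(output, "%sbuffer", printed ? "," : "");
      if (flags & flag_image)
         printed += fprintf(output, "%simage", printed ? "," : "");
      if (flags & flag_shared)
         printed += fprintf(output, "%sshared", printed ? "," : "");
   }

   int main()
   {
      print_flags(flag_buffer | flag_shared, stdout); /* prints " storage:buffer,shared" */
      fputc('\n', stdout);
      return 0;
   }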

@ -292,7 +342,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " dlc");
      if (smem->nv)
         fprintf(output, " nv");
      print_barrier_reorder(smem->can_reorder, smem->barrier, output);
      print_sync(smem->sync, output);
      break;
   }
   case Format::VINTRP: {

@ -308,6 +358,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " offset1:%u", ds->offset1);
      if (ds->gds)
         fprintf(output, " gds");
      print_sync(ds->sync, output);
      break;
   }
   case Format::MUBUF: {

@ -332,7 +383,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " lds");
      if (mubuf->disable_wqm)
         fprintf(output, " disable_wqm");
      print_barrier_reorder(mubuf->can_reorder, mubuf->barrier, output);
      print_sync(mubuf->sync, output);
      break;
   }
   case Format::MIMG: {

@ -392,7 +443,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " d16");
      if (mimg->disable_wqm)
         fprintf(output, " disable_wqm");
      print_barrier_reorder(mimg->can_reorder, mimg->barrier, output);
      print_sync(mimg->sync, output);
      break;
   }
   case Format::EXP: {

@ -439,6 +490,12 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
      fprintf(output, " cluster_size:%u", reduce->cluster_size);
      break;
   }
   case Format::PSEUDO_BARRIER: {
      const Pseudo_barrier_instruction* barrier = static_cast<const Pseudo_barrier_instruction*>(instr);
      print_sync(barrier->sync, output);
      print_scope(barrier->exec_scope, output, "exec_scope");
      break;
   }
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: {

@ -457,7 +514,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " nv");
      if (flat->disable_wqm)
         fprintf(output, " disable_wqm");
      print_barrier_reorder(flat->can_reorder, flat->barrier, output);
      print_sync(flat->sync, output);
      break;
   }
   case Format::MTBUF: {

@ -507,7 +564,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " tfe");
      if (mtbuf->disable_wqm)
         fprintf(output, " disable_wqm");
      print_barrier_reorder(mtbuf->can_reorder, mtbuf->barrier, output);
      print_sync(mtbuf->sync, output);
      break;
   }
   case Format::VOP3P: {

@ -318,26 +318,6 @@ void MoveState::upwards_skip()
   source_idx++;
}

bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}

bool is_gs_or_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {

@ -357,96 +337,96 @@ bool is_done_sendmsg(const Instruction *instr)
   return false;
}

barrier_interaction get_barrier_interaction(const Instruction* instr)
memory_sync_info get_sync_info_with_hack(const Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<const SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<const MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<const MIMG_instruction*>(instr)->barrier;
   case Format::MTBUF:
      return static_cast<const MTBUF_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<const FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   case Format::SOPP:
      if (is_done_sendmsg(instr))
         return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
      else if (is_gs_or_done_sendmsg(instr))
         return barrier_gs_sendmsg;
      else
         return barrier_none;
   case Format::PSEUDO_BARRIER:
      return barrier_barrier;
   default:
      return barrier_none;
   memory_sync_info sync = get_sync_info(instr);
   if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
      // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
      sync.storage = (storage_class)(sync.storage | storage_buffer);
      sync.semantics = (memory_semantics)(sync.semantics | semantic_private);
   }
   return sync;
}

barrier_interaction parse_barrier(Instruction *instr)
{
   if (instr->format == Format::PSEUDO_BARRIER) {
      switch (instr->opcode) {
      case aco_opcode::p_memory_barrier_atomic:
         return barrier_atomic;
      /* For now, buffer and image barriers are treated the same. this is because of
       * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
       * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
       * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
       * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
       * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
       * Either way, this solution should work. */
      case aco_opcode::p_memory_barrier_buffer:
      case aco_opcode::p_memory_barrier_image:
         return (barrier_interaction)(barrier_image | barrier_buffer);
      case aco_opcode::p_memory_barrier_shared:
         return barrier_shared;
      case aco_opcode::p_memory_barrier_common:
         return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
      case aco_opcode::p_memory_barrier_gs_data:
         return barrier_gs_data;
      case aco_opcode::p_memory_barrier_gs_sendmsg:
         return barrier_gs_sendmsg;
      default:
         break;
      }
   } else if (instr->opcode == aco_opcode::s_barrier) {
      return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
   }
   return barrier_none;
}

struct memory_event_set {
   bool has_control_barrier;

   unsigned bar_acquire;
   unsigned bar_release;
   unsigned bar_classes;

   unsigned access_acquire;
   unsigned access_release;
   unsigned access_relaxed;
   unsigned access_atomic;
};

struct hazard_query {
   bool contains_spill;
   int barriers;
   int barrier_interaction;
   bool can_reorder_vmem;
   bool can_reorder_smem;
   bool contains_sendmsg;
   memory_event_set mem_events;
   unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
   unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};

void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->barriers = 0;
   query->barrier_interaction = 0;
   query->can_reorder_vmem = true;
   query->can_reorder_smem = true;
   query->contains_sendmsg = false;
   memset(&query->mem_events, 0, sizeof(query->mem_events));
   query->aliasing_storage = 0;
   query->aliasing_storage_smem = 0;
}

void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
{
   set->has_control_barrier |= is_done_sendmsg(instr);
   if (instr->opcode == aco_opcode::p_barrier) {
      Pseudo_barrier_instruction *bar = static_cast<Pseudo_barrier_instruction*>(instr);
      if (bar->sync.semantics & semantic_acquire)
         set->bar_acquire |= bar->sync.storage;
      if (bar->sync.semantics & semantic_release)
         set->bar_release |= bar->sync.storage;
      set->bar_classes |= bar->sync.storage;

      set->has_control_barrier |= bar->exec_scope > scope_invocation;
   }

   if (!sync->storage)
      return;

   if (sync->semantics & semantic_acquire)
      set->access_acquire |= sync->storage;
   if (sync->semantics & semantic_release)
      set->access_release |= sync->storage;

   if (!(sync->semantics & semantic_private)) {
      if (sync->semantics & semantic_atomic)
         set->access_atomic |= sync->storage;
      else
         set->access_relaxed |= sync->storage;
   }
}
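
As a worked example of what add_memory_event collects, consider two instructions: a coherent buffer load with acquire semantics (not private, not atomic) and a p_barrier with acquire+release semantics over shared memory at workgroup exec scope. Using the illustrative bit values from the earlier sketch, the resulting set would be roughly:

   /* assumed bit values, matching the earlier sketch */
   constexpr unsigned storage_buffer = 1 << 0;
   constexpr unsigned storage_shared = 1 << 3;

   struct event_set_sketch {
      bool has_control_barrier = false;
      unsigned bar_acquire = 0, bar_release = 0, bar_classes = 0;
      unsigned access_acquire = 0, access_release = 0, access_relaxed = 0, access_atomic = 0;
   };

   event_set_sketch example_events()
   {
      event_set_sketch set;
      /* buffer load with acquire semantics: recorded as an access */
      set.access_acquire |= storage_buffer;
      set.access_relaxed |= storage_buffer;   /* not private, not atomic */
      /* p_barrier acq+rel over shared memory: recorded as a barrier */
      set.bar_acquire |= storage_shared;
      set.bar_release |= storage_shared;
      set.bar_classes |= storage_shared;
      set.has_control_barrier = true;         /* exec_scope wider than an invocation */
      return set;
   }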

void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   query->barriers |= parse_barrier(instr);
   query->barrier_interaction |= get_barrier_interaction(instr);
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;
   query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;

   bool can_reorder_instr = can_reorder(instr);
   query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr;
   query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr;
   memory_sync_info sync = get_sync_info_with_hack(instr);

   add_memory_event(&query->mem_events, instr, &sync);

   if (!(sync.semantics & semantic_can_reorder)) {
      unsigned storage = sync.storage;
      /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      if (instr->format == Format::SMEM)
         query->aliasing_storage_smem |= storage;
      else
         query->aliasing_storage |= storage;
   }
}

enum HazardResult {
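
The aliasing masks built here are deliberately coarse: any buffer or image access is treated as touching both classes (since buffer images and buffer/global memory can alias), and SMEM accesses are tracked separately from the rest so the hazard check below can test a candidate against the matching set. A small illustration of the widening, with the assumed bit values from the earlier sketches:

   #include <cstdio>

   constexpr unsigned storage_buffer = 1 << 0; /* assumed values */
   constexpr unsigned storage_image  = 1 << 2;
   constexpr unsigned storage_shared = 1 << 3;

   unsigned widen_aliasing(unsigned storage)
   {
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      return storage;
   }

   int main()
   {
      printf("%u\n", widen_aliasing(storage_image));  /* buffer|image -> 5 */
      printf("%u\n", widen_aliasing(storage_shared)); /* unchanged    -> 8 */
      return 0;
   }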

@ -463,10 +443,8 @@ enum HazardResult
   hazard_fail_unreorderable,
};

HazardResult perform_hazard_query(hazard_query *query, Instruction *instr)
HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
{
   bool can_reorder_candidate = can_reorder(instr);

   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {

@ -484,27 +462,61 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr)
       instr->opcode == aco_opcode::s_setprio)
      return hazard_fail_unreorderable;

   barrier_interaction bar = parse_barrier(instr);
   if (query->barrier_interaction && (query->barrier_interaction & bar))
   memory_event_set instr_set;
   memset(&instr_set, 0, sizeof(instr_set));
   memory_sync_info sync = get_sync_info_with_hack(instr);
   add_memory_event(&instr_set, instr, &sync);

   memory_event_set *first = &instr_set;
   memory_event_set *second = &query->mem_events;
   if (upwards)
      std::swap(first, second);

   /* everything after barrier(acquire) happens after the atomics/control_barriers before
    * everything after load(acquire) happens after the load
    */
   if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
      return hazard_fail_barrier;
   if (bar && query->barriers && (query->barriers & ~bar))
      return hazard_fail_barrier;
   if (query->barriers && (query->barriers & get_barrier_interaction(instr)))
   if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
       ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
      return hazard_fail_barrier;

   if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS)
   /* everything before barrier(release) happens before the atomics/control_barriers after *
    * everything before store(release) happens before the store
    */
   if (first->bar_release && (second->has_control_barrier || second->access_atomic))
      return hazard_fail_barrier;
   if ((first->bar_classes && (second->bar_release || second->access_release)) ||
       ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
      return hazard_fail_barrier;

   /* don't move memory barriers around other memory barriers */
   if (first->bar_classes && second->bar_classes)
      return hazard_fail_barrier;

   /* Don't move memory loads/stores to before control barriers. This is to make
    * memory barriers followed by control barriers work. */
   if (first->has_control_barrier && (second->access_atomic | second->access_relaxed))
      return hazard_fail_barrier;

   /* don't move memory loads/stores past potentially aliasing loads/stores */
   unsigned aliasing_storage = instr->format == Format::SMEM ?
                               query->aliasing_storage_smem :
                               query->aliasing_storage;
   if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
      unsigned intersect = sync.storage & aliasing_storage;
      if (intersect & storage_shared)
         return hazard_fail_reorder_ds;
      if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr)))
         return hazard_fail_reorder_sendmsg;
      return hazard_fail_reorder_vmem_smem;
   }

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
      return hazard_fail_reorder_sendmsg;

   return hazard_success;
}
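
The upwards flag decides which event set plays the role of 'first', i.e. the side that is earlier in the original program order: when a candidate is hoisted upwards it crosses instructions that precede it, so the query's accumulated events become 'first' and the candidate's become 'second'. As a worked example, a relaxed buffer load cannot be hoisted above an acquire barrier on buffer storage; the second acquire rule above catches it. A condensed sketch of just that rule, with assumed bit values:

   #include <cassert>

   constexpr unsigned storage_buffer = 1 << 0; /* assumed value */

   struct event_set_sketch {
      unsigned bar_acquire = 0, access_acquire = 0;
      unsigned access_relaxed = 0, access_atomic = 0;
   };

   /* anything that was after an acquire (barrier or access) must stay after it */
   bool acquire_hazard(const event_set_sketch &first, const event_set_sketch &second)
   {
      return ((first.access_acquire | first.bar_acquire) &
              (second.access_relaxed | second.access_atomic)) != 0;
   }

   int main()
   {
      event_set_sketch window;    /* earlier in program order: contains the acquire barrier */
      window.bar_acquire = storage_buffer;
      event_set_sketch candidate; /* later: the relaxed buffer load we try to hoist */
      candidate.access_relaxed = storage_buffer;
      assert(acquire_hazard(window, candidate)); /* the hoist is rejected */
      return 0;
   }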

@ -546,7 +558,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)

@ -594,7 +606,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get());
         HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)

@ -686,7 +698,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get());
      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)

@ -735,7 +747,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get());
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)

@ -802,7 +814,7 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

@ -1565,10 +1565,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
            for (unsigned i = 0; i < temp.size(); i++)
               split->definitions[i] = bld.def(v1);
            bld.insert(split);
            for (unsigned i = 0; i < temp.size(); i++)
               bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
            for (unsigned i = 0; i < temp.size(); i++) {
               Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
               static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
            }
         } else {
            bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
            Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
            static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
         }
      } else {
         ctx.program->config->spilled_sgprs += (*it)->operands[0].size();

@ -1632,11 +1635,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
            for (unsigned i = 0; i < def.size(); i++) {
               Temp tmp = bld.tmp(v1);
               vec->operands[i] = Operand(tmp);
               bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
               Instruction *instr = bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
               static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
            }
            bld.insert(vec);
         } else {
            bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
            Instruction *instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
            static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
         }
      } else {
         uint32_t spill_slot = slots[spill_id];
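
Tagging the scratch spill loads/stores with storage_vgpr_spill and semantic_private gives them a storage class nothing else uses and keeps them out of the relaxed/atomic access sets, so the scheduler's barrier and aliasing checks do not order them against application memory operations. A minimal illustration under the assumed bit values from the earlier sketches:

   constexpr unsigned storage_buffer     = 1 << 0; /* assumed values */
   constexpr unsigned storage_vgpr_spill = 1 << 6;

   /* a spill access and a buffer access share no storage bits, so the aliasing
    * test (sync.storage & aliasing_storage) stays zero for this pair */
   static_assert((storage_vgpr_spill & storage_buffer) == 0,
                 "spill accesses do not alias application buffers");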