mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-17 00:08:31 +02:00
aco: don't emit workgroup-scope p_barrier for single-wave workgroups
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41364>
This commit is contained in:
parent
e6703f8e68
commit
49fb361c0a
4 changed files with 34 additions and 25 deletions
|
|
@ -447,9 +447,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
|
|||
void
|
||||
setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acquire)
|
||||
{
|
||||
sync_scope subgroup_scope =
|
||||
ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
|
||||
if (sync.scope <= subgroup_scope)
|
||||
if (sync.scope <= scope_subgroup)
|
||||
return;
|
||||
|
||||
barrier_info& src = ctx.bar[is_acquire ? barrier_info_acquire_dep : barrier_info_release_dep];
|
||||
|
|
@ -457,8 +455,8 @@ setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acqui
|
|||
wait_imm dst_imm;
|
||||
uint16_t dst_events = 0;
|
||||
u_foreach_bit (i, sync.storage & src.storage) {
|
||||
/* LDS is private to the workgroup, so reduce the scope in that case. */
|
||||
if (src.events[i] == event_lds && MIN2(sync.scope, scope_workgroup) <= subgroup_scope)
|
||||
/* LDS is private to the workgroup, but sync.scope might be device scope. */
|
||||
if (src.events[i] == event_lds && ctx.program->workgroup_size <= ctx.program->wave_size)
|
||||
continue;
|
||||
|
||||
dst_imm.combine(src.imm[i]);
|
||||
|
|
|
|||
|
|
@ -2955,8 +2955,7 @@ lower_to_hw_instr(Program* program)
|
|||
/* Anything larger than a workgroup isn't possible. Anything
|
||||
* smaller requires no instructions and this pseudo instruction
|
||||
* would only be included to control optimizations. */
|
||||
bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
|
||||
program->workgroup_size > program->wave_size;
|
||||
bool emit_s_barrier = barrier.exec_scope == scope_workgroup;
|
||||
|
||||
bld.insert(std::move(instr));
|
||||
if (emit_s_barrier && ctx.program->gfx_level >= GFX12) {
|
||||
|
|
|
|||
|
|
@ -2948,6 +2948,12 @@ emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
|
|||
assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
|
||||
assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
|
||||
|
||||
if (ctx->program->workgroup_size <= ctx->program->wave_size) {
|
||||
exec_scope = scope_subgroup;
|
||||
if (mem_scope == scope_workgroup)
|
||||
mem_scope = scope_subgroup;
|
||||
}
|
||||
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
|
||||
exec_scope);
|
||||
|
|
|
|||
|
|
@ -1215,6 +1215,9 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
|
|||
if (!setup_cs(NULL, GFX10, CHIP_UNKNOWN, var.name))
|
||||
continue;
|
||||
|
||||
sync_scope workgroup_scope =
|
||||
var.workgroup_size > program->wave_size ? scope_workgroup : scope_subgroup;
|
||||
|
||||
program->workgroup_size = var.workgroup_size;
|
||||
program->wgp_mode = var.wgp;
|
||||
|
||||
|
|
@ -1252,7 +1255,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
store_global();
|
||||
barrier(storage_buffer, semantic_release, scope_workgroup);
|
||||
barrier(storage_buffer, semantic_release, workgroup_scope);
|
||||
store_global(semantic_atomic);
|
||||
|
||||
//>> p_unit_test 3
|
||||
|
|
@ -1262,7 +1265,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
load_global();
|
||||
barrier(storage_buffer, semantic_release, scope_workgroup);
|
||||
barrier(storage_buffer, semantic_release, workgroup_scope);
|
||||
store_global(semantic_atomic);
|
||||
|
||||
/* shared->shared */
|
||||
|
|
@ -1272,7 +1275,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
store_shared();
|
||||
barrier(storage_shared, semantic_release, scope_workgroup);
|
||||
barrier(storage_shared, semantic_release, workgroup_scope);
|
||||
store_shared(semantic_atomic);
|
||||
|
||||
//>> p_unit_test 5
|
||||
|
|
@ -1281,7 +1284,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
|
||||
load_shared();
|
||||
barrier(storage_shared, semantic_release, scope_workgroup);
|
||||
barrier(storage_shared, semantic_release, workgroup_scope);
|
||||
store_shared(semantic_atomic);
|
||||
|
||||
/* shared->global */
|
||||
|
|
@ -1292,7 +1295,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
|
||||
store_shared();
|
||||
barrier(storage_buffer | storage_shared, semantic_release, scope_workgroup);
|
||||
barrier(storage_buffer | storage_shared, semantic_release, workgroup_scope);
|
||||
store_global(semantic_atomic);
|
||||
|
||||
/* global->shared */
|
||||
|
|
@ -1303,7 +1306,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
|
||||
store_global();
|
||||
barrier(storage_buffer | storage_shared, semantic_release, scope_workgroup);
|
||||
barrier(storage_buffer | storage_shared, semantic_release, workgroup_scope);
|
||||
store_shared(semantic_atomic);
|
||||
|
||||
/* global->global, device scope, release in the atomic */
|
||||
|
|
@ -1325,8 +1328,8 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
|
||||
store_global();
|
||||
barrier(storage_buffer, semantic_release, scope_workgroup);
|
||||
barrier(0, 0, scope_invocation, scope_workgroup);
|
||||
barrier(storage_buffer, semantic_release, workgroup_scope);
|
||||
barrier(0, 0, scope_invocation, workgroup_scope);
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
|
||||
/* shared->shared, workgroup scope, control barrier */
|
||||
|
|
@ -1338,8 +1341,8 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
|
||||
store_shared();
|
||||
barrier(storage_shared, semantic_release, scope_workgroup);
|
||||
barrier(0, 0, scope_invocation, scope_workgroup);
|
||||
barrier(storage_shared, semantic_release, workgroup_scope);
|
||||
barrier(0, 0, scope_invocation, workgroup_scope);
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
|
||||
/* global->global, device scope, delayed waitcnt */
|
||||
|
|
@ -1395,6 +1398,9 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
|
|||
if (!setup_cs(NULL, GFX10, CHIP_UNKNOWN, var.name))
|
||||
continue;
|
||||
|
||||
sync_scope workgroup_scope =
|
||||
var.workgroup_size > program->wave_size ? scope_workgroup : scope_subgroup;
|
||||
|
||||
program->workgroup_size = var.workgroup_size;
|
||||
program->wgp_mode = var.wgp;
|
||||
|
||||
|
|
@ -1433,7 +1439,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
load_global(semantic_atomic);
|
||||
barrier(storage_buffer, semantic_acquire, scope_workgroup);
|
||||
barrier(storage_buffer, semantic_acquire, workgroup_scope);
|
||||
load_global() = dest1;
|
||||
|
||||
//>> p_unit_test 3
|
||||
|
|
@ -1443,7 +1449,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
load_global(semantic_atomic);
|
||||
barrier(storage_buffer, semantic_acquire, scope_workgroup);
|
||||
barrier(storage_buffer, semantic_acquire, workgroup_scope);
|
||||
store_global();
|
||||
|
||||
/* shared->shared */
|
||||
|
|
@ -1453,7 +1459,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
load_shared(semantic_atomic);
|
||||
barrier(storage_shared, semantic_acquire, scope_workgroup);
|
||||
barrier(storage_shared, semantic_acquire, workgroup_scope);
|
||||
load_shared() = dest1;
|
||||
|
||||
//>> p_unit_test 5
|
||||
|
|
@ -1462,7 +1468,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
|
||||
load_shared(semantic_atomic);
|
||||
barrier(storage_shared, semantic_acquire, scope_workgroup);
|
||||
barrier(storage_shared, semantic_acquire, workgroup_scope);
|
||||
store_shared();
|
||||
|
||||
/* shared->global */
|
||||
|
|
@ -1473,7 +1479,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
|
||||
load_shared(semantic_atomic);
|
||||
barrier(storage_buffer | storage_shared, semantic_acquire, scope_workgroup);
|
||||
barrier(storage_buffer | storage_shared, semantic_acquire, workgroup_scope);
|
||||
load_global() = dest1;
|
||||
|
||||
/* global->shared */
|
||||
|
|
@ -1484,7 +1490,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
|
|||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
|
||||
load_global(semantic_atomic);
|
||||
barrier(storage_buffer | storage_shared, semantic_acquire, scope_workgroup);
|
||||
barrier(storage_buffer | storage_shared, semantic_acquire, workgroup_scope);
|
||||
load_shared() = dest1;
|
||||
|
||||
/* global->global, device scope, acquire in the atomic */
|
||||
|
|
@ -1503,9 +1509,9 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
|
|||
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
|
||||
barrier(0, 0, scope_invocation, scope_workgroup);
|
||||
barrier(0, 0, scope_invocation, workgroup_scope);
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
barrier(storage_buffer, semantic_acquire, scope_workgroup);
|
||||
barrier(storage_buffer, semantic_acquire, workgroup_scope);
|
||||
load_global();
|
||||
|
||||
/* global->global, device scope, delayed waitcnt */
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue