diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index 2d6cba63c09..cf7c099b718 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -171,6 +171,12 @@ enum barrier_info_kind {
    barrier_info_release_dep,
    /* Waits for all atomics */
    barrier_info_acquire_dep,
+   /* A wait that is to be emitted when an
+    * atomics/control_barriers/sendmsg_gs_done/position-primitive-export is encountered.
+    */
+   barrier_info_release,
+   /* A wait that is to be emitted when any non-private access is encountered. */
+   barrier_info_acquire,
    num_barrier_infos,
 };
 
@@ -428,18 +434,22 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
    }
 }
 
+/* We delay the waitcnt for a barrier until it's needed. This can help hide the cost or let it be
+ * eliminated. */
 void
-perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acquire)
+setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acquire)
 {
    sync_scope subgroup_scope =
       ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
    if (sync.scope <= subgroup_scope)
       return;
 
-   barrier_info& bar = ctx.bar[is_acquire ? barrier_info_acquire_dep : barrier_info_release_dep];
+   barrier_info& src = ctx.bar[is_acquire ? barrier_info_acquire_dep : barrier_info_release_dep];
 
-   u_foreach_bit (i, sync.storage & bar.storage) {
-      uint16_t events = bar.events[i];
+   wait_imm dst_imm;
+   uint16_t dst_events = 0;
+   u_foreach_bit (i, sync.storage & src.storage) {
+      uint16_t events = src.events[i];
 
       /* LDS is private to the workgroup */
       if (MIN2(sync.scope, scope_workgroup) <= subgroup_scope)
@@ -448,10 +458,40 @@ perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acq
       /* Until GFX11, in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
        * in-order for the same workgroup */
       if (ctx.gfx_level < GFX11 && !ctx.program->wgp_mode && sync.scope <= scope_workgroup)
-         events &= ~(event_vmem | event_vmem_store | event_smem);
+         events &= ~(event_vmem | event_vmem_store);
 
-      if (events)
-         imm.combine(bar.imm[i]);
+      if (events) {
+         dst_imm.combine(src.imm[i]);
+         dst_events |= src.events[i];
+      }
+   }
+   if (!dst_events)
+      return;
+
+   /* Copy over wait into barrier_info_acquire/barrier_info_release */
+   unsigned dst_index = is_acquire ? barrier_info_acquire : barrier_info_release;
+   barrier_info& dst = ctx.bar[dst_index];
+   u_foreach_bit (i, sync.storage) {
+      dst.imm[i].combine(dst_imm);
+      dst.events[i] |= dst_events;
+   }
+   dst.storage |= sync.storage;
+   ctx.bar_nonempty |= 1 << dst_index;
+}
+
+void
+finish_barriers(wait_ctx& ctx, wait_imm& imm, Instruction* instr, memory_sync_info sync)
+{
+   if (ctx.bar_nonempty & (1 << barrier_info_release)) {
+      uint16_t storage_release =
+         is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release);
+      u_foreach_bit (i, storage_release & ctx.bar[barrier_info_release].storage)
+         imm.combine(ctx.bar[barrier_info_release].imm[i]);
+   }
+   if (ctx.bar_nonempty & (1 << barrier_info_acquire)) {
+      uint16_t storage_acquire = (sync.semantics & semantic_private) ? 0 : sync.storage;
+      u_foreach_bit (i, storage_acquire & ctx.bar[barrier_info_acquire].storage)
+         imm.combine(ctx.bar[barrier_info_acquire].imm[i]);
    }
 }
 
@@ -558,13 +598,15 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
 
    if (instr->opcode == aco_opcode::p_barrier) {
       if (instr->barrier().sync.semantics & semantic_release)
-         perform_barrier(ctx, imm, instr->barrier().sync, false);
+         setup_barrier(ctx, imm, instr->barrier().sync, false);
       if (instr->barrier().sync.semantics & semantic_acquire)
-         perform_barrier(ctx, imm, instr->barrier().sync, true);
+         setup_barrier(ctx, imm, instr->barrier().sync, true);
    } else if (sync_info.semantics & semantic_release) {
-      perform_barrier(ctx, imm, sync_info, false);
+      setup_barrier(ctx, imm, sync_info, false);
    }
 
+   finish_barriers(ctx, imm, instr, sync_info);
+
    if (!imm.empty()) {
       if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
          imm.vm = 0;
@@ -647,6 +689,9 @@ update_barriers(wait_ctx& ctx, uint8_t counters, wait_event event, Instruction*
       uint16_t storage_acq = is_atomic_or_control_instr(ctx.program, instr, sync, semantic_acquire);
       update_barrier_info_for_event(ctx, counters, event, barrier_info_acquire_dep, storage_acq);
    }
+
+   update_barrier_info_for_event(ctx, counters, event, barrier_info_release, 0);
+   update_barrier_info_for_event(ctx, counters, event, barrier_info_acquire, 0);
 }
 
 void
@@ -915,7 +960,7 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
       new_instructions.emplace_back(std::move(instr));
 
       if (sync_info.semantics & semantic_acquire)
-         perform_barrier(ctx, queued_imm, sync_info, true);
+         setup_barrier(ctx, queued_imm, sync_info, true);
 
       if (is_ordered_count_acquire)
         queued_imm.combine(ctx.bar[barrier_info_release_dep].imm[ffs(storage_gds) - 1]);
diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp
index 2049583f954..4bfdc97309f 100644
--- a/src/amd/compiler/tests/test_insert_waitcnt.cpp
+++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp
@@ -610,7 +610,8 @@ BEGIN_TEST(insert_waitcnt.vmem_ds)
 
    //! s_wait_storecnt_dscnt dscnt(0) storecnt(0)
    bld.barrier(aco_opcode::p_barrier,
-               memory_sync_info(storage_buffer | storage_shared, semantic_acqrel, scope_workgroup));
+               memory_sync_info(storage_buffer | storage_shared, semantic_acqrel, scope_workgroup),
+               scope_workgroup);
 
    finish_waitcnt_test();
 END_TEST
@@ -1119,7 +1120,7 @@ BEGIN_TEST(insert_waitcnt.flat.barrier)
             memory_sync_info(storage_buffer));
    bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)).instr->flat().may_use_lds = true;
    bld.barrier(aco_opcode::p_barrier,
-               memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
+               memory_sync_info(storage_buffer, semantic_acqrel, scope_device), scope_workgroup);
 
    //>> p_unit_test 1
    //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef may_use_lds storage:buffer
@@ -1132,7 +1133,7 @@ BEGIN_TEST(insert_waitcnt.flat.barrier)
             memory_sync_info(storage_buffer)).instr->flat().may_use_lds = true;
    bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0);
    bld.barrier(aco_opcode::p_barrier,
-               memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
+               memory_sync_info(storage_buffer, semantic_acqrel, scope_device), scope_workgroup);
 
    //>> p_unit_test 2
    //! flat_store_dword %0:v[0-1], s1: undef, %0:v[0] may_use_lds storage:buffer
@@ -1144,7 +1145,7 @@ BEGIN_TEST(insert_waitcnt.flat.barrier)
    bld.flat(aco_opcode::flat_store_dword, addr, Operand(s1), data, 0,
             memory_sync_info(storage_buffer)).instr->flat().may_use_lds = true;
    bld.barrier(aco_opcode::p_barrier,
-               memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
+               memory_sync_info(storage_buffer, semantic_acqrel, scope_device), scope_workgroup);
 
    finish_waitcnt_test();
 }
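
Reviewer note, not part of the patch: the change splits perform_barrier() into setup_barrier(), which only records the wait a barrier would require, and finish_barriers(), which emits that wait at the first later instruction that actually needs it (release waits at atomics/control barriers/etc., acquire waits at the first overlapping non-private access). The stand-alone C++ sketch below models only the acquire side of that idea; the types PendingWait and Emitter and the "s_waitcnt 0" placeholder are hypothetical simplifications for illustration, not ACO's real API.

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

/* Hypothetical model: a barrier records the storage it guards instead of emitting a wait. */
struct PendingWait {
   uint16_t storage; /* storage classes ordered by the barrier */
};

struct Emitter {
   std::optional<PendingWait> pending; /* wait recorded at the barrier, not yet emitted */
   std::vector<std::string> code;

   /* A barrier only records what it guards; no wait is emitted here. */
   void barrier(uint16_t storage) { pending = PendingWait{storage}; }

   /* The recorded wait is emitted right before the first later instruction that
    * touches the guarded storage and is not private to the invocation. */
   void access(const std::string& insn, uint16_t storage, bool is_private)
   {
      if (pending && !is_private && (storage & pending->storage)) {
         code.push_back("s_waitcnt 0"); /* placeholder for the real counter wait */
         pending.reset();
      }
      code.push_back(insn);
   }
};

int main()
{
   constexpr uint16_t storage_shared = 1u << 0;
   constexpr uint16_t storage_buffer = 1u << 1;

   Emitter e;
   e.barrier(storage_shared);
   e.access("v_add_f32 ...", 0, true);                       /* independent ALU work hides the wait */
   e.access("buffer_load_dword ...", storage_buffer, false); /* different storage, still no wait */
   e.access("ds_read_b32 ...", storage_shared, false);       /* first shared access, wait emitted here */

   for (const std::string& s : e.code)
      std::cout << s << '\n';
   return 0;
}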