aco: rework barriers and replace can_reorder
fossil-db (Navi): Totals from 273 (0.21% of 132058) affected shaders:
CodeSize: 937472 -> 936556 (-0.10%)
Instrs: 158874 -> 158648 (-0.14%)
Cycles: 13563516 -> 13562612 (-0.01%)
VMEM: 85246 -> 85244 (-0.00%)
SMEM: 21407 -> 21310 (-0.45%); split: +0.05%, -0.50%
VClause: 9321 -> 9317 (-0.04%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4905>
parent 1bbb64f300
commit d1f992f3c2
12 changed files with 521 additions and 341 deletions
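At its core, the rework replaces the per-instruction can_reorder flag and barrier_interaction bitfield with a single memory_sync_info value describing the storage class, ordering semantics and scope of a memory access. A minimal before/after sketch, abridged from the visit_load_global() hunk further down (illustration only, not complete code):

/* Before: reordering information was split across two fields. */
// info.barrier = barrier_buffer;
// info.can_reorder = false;

/* After: one descriptor carries storage class, semantics and scope. */
// info.sync = get_memory_sync_info(instr, storage_buffer, 0);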
@@ -84,7 +84,7 @@ bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr)
    if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
           [&uses] (const Definition& def) { return uses[def.tempId()];}))
       return false;
-   return !instr_info.is_atomic[(int)instr->opcode];
+   return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
 }
 
 std::vector<uint16_t> dead_code_analysis(Program *program) {
|
|
|
|||
|
|
@ -273,8 +273,8 @@ struct wait_ctx {
|
|||
bool pending_flat_vm = false;
|
||||
bool pending_s_buffer_store = false; /* GFX10 workaround */
|
||||
|
||||
wait_imm barrier_imm[barrier_count];
|
||||
uint16_t barrier_events[barrier_count] = {}; /* use wait_event notion */
|
||||
wait_imm barrier_imm[storage_count];
|
||||
uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
|
||||
|
||||
std::map<PhysReg,wait_entry> gpr_map;
|
||||
|
||||
|
|
@ -327,7 +327,7 @@ struct wait_ctx {
|
|||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < barrier_count; i++) {
|
||||
for (unsigned i = 0; i < storage_count; i++) {
|
||||
changed |= barrier_imm[i].combine(other->barrier_imm[i]);
|
||||
changed |= other->barrier_events[i] & ~barrier_events[i];
|
||||
barrier_events[i] |= other->barrier_events[i];
|
||||
|
|
@ -444,7 +444,31 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
|
|||
return wait_imm();
|
||||
}
|
||||
|
||||
wait_imm kill(Instruction* instr, wait_ctx& ctx)
|
||||
wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
|
||||
{
|
||||
wait_imm imm;
|
||||
sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
|
||||
if (sync.semantics & semantics) {
|
||||
unsigned storage = sync.storage;
|
||||
while (storage) {
|
||||
unsigned idx = u_bit_scan(&storage);
|
||||
|
||||
/* LDS is private to the workgroup */
|
||||
sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);
|
||||
|
||||
uint16_t events = ctx.barrier_events[idx];
|
||||
if (bar_scope_lds <= subgroup_scope)
|
||||
events &= ~event_lds;
|
||||
|
||||
if (events)
|
||||
imm.combine(ctx.barrier_imm[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
return imm;
|
||||
}
|
||||
|
||||
wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
|
||||
{
|
||||
wait_imm imm;
|
||||
if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
|
||||
|
|
@ -471,44 +495,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
|
|||
SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr);
|
||||
if (ctx.pending_s_buffer_store &&
|
||||
!smem->definitions.empty() &&
|
||||
!smem->can_reorder && smem->barrier == barrier_buffer) {
|
||||
!smem->sync.can_reorder()) {
|
||||
imm.lgkm = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (instr->format == Format::PSEUDO_BARRIER) {
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::p_memory_barrier_common:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
|
||||
if (ctx.program->workgroup_size > ctx.program->wave_size)
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
|
||||
break;
|
||||
case aco_opcode::p_memory_barrier_atomic:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
|
||||
break;
|
||||
/* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */
|
||||
case aco_opcode::p_memory_barrier_buffer:
|
||||
case aco_opcode::p_memory_barrier_image:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
|
||||
break;
|
||||
case aco_opcode::p_memory_barrier_shared:
|
||||
if (ctx.program->workgroup_size > ctx.program->wave_size)
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
|
||||
break;
|
||||
case aco_opcode::p_memory_barrier_gs_data:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_gs_data) - 1]);
|
||||
break;
|
||||
case aco_opcode::p_memory_barrier_gs_sendmsg:
|
||||
imm.combine(ctx.barrier_imm[ffs(barrier_gs_sendmsg) - 1]);
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (instr->opcode == aco_opcode::p_barrier)
|
||||
imm.combine(perform_barrier(ctx, static_cast<Pseudo_barrier_instruction *>(instr)->sync, semantic_acqrel));
|
||||
else
|
||||
imm.combine(perform_barrier(ctx, sync_info, semantic_release));
|
||||
|
||||
if (!imm.empty()) {
|
||||
if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
|
||||
|
|
@ -523,7 +518,7 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
|
|||
ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
|
||||
|
||||
/* update barrier wait imms */
|
||||
for (unsigned i = 0; i < barrier_count; i++) {
|
||||
for (unsigned i = 0; i < storage_count; i++) {
|
||||
wait_imm& bar = ctx.barrier_imm[i];
|
||||
uint16_t& bar_ev = ctx.barrier_events[i];
|
||||
if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
|
||||
|
|
@ -581,12 +576,12 @@ void update_barrier_counter(uint8_t *ctr, unsigned max)
|
|||
(*ctr)++;
|
||||
}
|
||||
|
||||
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barrier_interaction barrier)
|
||||
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
|
||||
{
|
||||
for (unsigned i = 0; i < barrier_count; i++) {
|
||||
for (unsigned i = 0; i < storage_count; i++) {
|
||||
wait_imm& bar = ctx.barrier_imm[i];
|
||||
uint16_t& bar_ev = ctx.barrier_events[i];
|
||||
if (barrier & (1 << i)) {
|
||||
if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
|
||||
bar_ev |= event;
|
||||
if (counters & counter_lgkm)
|
||||
bar.lgkm = 0;
|
||||
|
|
@ -609,7 +604,7 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barri
|
|||
}
|
||||
}
|
||||
|
||||
void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none)
|
||||
void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info())
|
||||
{
|
||||
uint8_t counters = get_counters_for_event(event);
|
||||
|
||||
|
|
@ -622,7 +617,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie
|
|||
if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
|
||||
ctx.vs_cnt++;
|
||||
|
||||
update_barrier_imm(ctx, counters, event, barrier);
|
||||
update_barrier_imm(ctx, counters, event, sync);
|
||||
|
||||
if (ctx.unordered_events & event)
|
||||
return;
|
||||
|
|
@ -651,7 +646,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie
|
|||
}
|
||||
}
|
||||
|
||||
void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none)
|
||||
void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info())
|
||||
{
|
||||
assert(ctx.chip_class < GFX10);
|
||||
|
||||
|
|
@ -660,7 +655,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=ba
|
|||
if (ctx.vm_cnt <= ctx.max_vm_cnt)
|
||||
ctx.vm_cnt++;
|
||||
|
||||
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, barrier);
|
||||
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
|
||||
|
||||
for (std::pair<PhysReg,wait_entry> e : ctx.gpr_map)
|
||||
{
|
||||
|
|
@ -748,10 +743,11 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
break;
|
||||
}
|
||||
case Format::FLAT: {
|
||||
FLAT_instruction *flat = static_cast<FLAT_instruction*>(instr);
|
||||
if (ctx.chip_class < GFX10 && !instr->definitions.empty())
|
||||
update_counters_for_flat_load(ctx, barrier_buffer);
|
||||
update_counters_for_flat_load(ctx, flat->sync);
|
||||
else
|
||||
update_counters(ctx, event_flat, barrier_buffer);
|
||||
update_counters(ctx, event_flat, flat->sync);
|
||||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], event_flat);
|
||||
|
|
@ -759,27 +755,26 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
}
|
||||
case Format::SMEM: {
|
||||
SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
|
||||
update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
|
||||
update_counters(ctx, event_smem, smem->sync);
|
||||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], event_smem);
|
||||
else if (ctx.chip_class >= GFX10 &&
|
||||
!smem->can_reorder &&
|
||||
smem->barrier == barrier_buffer)
|
||||
!smem->sync.can_reorder())
|
||||
ctx.pending_s_buffer_store = true;
|
||||
|
||||
break;
|
||||
}
|
||||
case Format::DS: {
|
||||
bool gds = static_cast<DS_instruction*>(instr)->gds;
|
||||
update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared);
|
||||
if (gds)
|
||||
DS_instruction *ds = static_cast<DS_instruction*>(instr);
|
||||
update_counters(ctx, ds->gds ? event_gds : event_lds, ds->sync);
|
||||
if (ds->gds)
|
||||
update_counters(ctx, event_gds_gpr_lock);
|
||||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds);
|
||||
insert_wait_entry(ctx, instr->definitions[0], ds->gds ? event_gds : event_lds);
|
||||
|
||||
if (gds) {
|
||||
if (ds->gds) {
|
||||
for (const Operand& op : instr->operands)
|
||||
insert_wait_entry(ctx, op, event_gds_gpr_lock);
|
||||
insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
|
||||
|
|
@ -791,7 +786,7 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
case Format::MIMG:
|
||||
case Format::GLOBAL: {
|
||||
wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
|
||||
update_counters(ctx, ev, get_barrier_interaction(instr));
|
||||
update_counters(ctx, ev, get_sync_info(instr));
|
||||
|
||||
bool has_sampler = instr->format == Format::MIMG && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
|
||||
|
||||
|
|
@ -817,7 +812,7 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
case Format::SOPP: {
|
||||
if (instr->opcode == aco_opcode::s_sendmsg ||
|
||||
instr->opcode == aco_opcode::s_sendmsghalt)
|
||||
update_counters(ctx, event_sendmsg, get_barrier_interaction(instr));
|
||||
update_counters(ctx, event_sendmsg);
|
||||
}
|
||||
default:
|
||||
break;
|
||||
|
|
@ -851,7 +846,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
|
|||
for (aco_ptr<Instruction>& instr : block.instructions) {
|
||||
bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
|
||||
|
||||
queued_imm.combine(kill(instr.get(), ctx));
|
||||
memory_sync_info sync_info = get_sync_info(instr.get());
|
||||
queued_imm.combine(kill(instr.get(), ctx, sync_info));
|
||||
|
||||
ctx.gen_instr = instr.get();
|
||||
gen(instr.get(), ctx);
|
||||
|
|
@ -863,6 +859,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
|
|||
}
|
||||
new_instructions.emplace_back(std::move(instr));
|
||||
|
||||
queued_imm.combine(perform_barrier(ctx, sync_info, semantic_acquire));
|
||||
|
||||
if (ctx.collect_statistics)
|
||||
ctx.advance_unwaited_instrs();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3137,8 +3137,7 @@ struct LoadEmitInfo {
|
|||
|
||||
bool glc = false;
|
||||
unsigned swizzle_component_size = 0;
|
||||
barrier_interaction barrier = barrier_none;
|
||||
bool can_reorder = true;
|
||||
memory_sync_info sync;
|
||||
Temp soffset = Temp(0, s1);
|
||||
};
|
||||
|
||||
|
|
@ -3441,10 +3440,12 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
|
||||
RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
|
||||
Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
|
||||
Instruction *instr;
|
||||
if (read2)
|
||||
bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
|
||||
instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
|
||||
else
|
||||
bld.ds(op, Definition(val), offset, m, const_offset);
|
||||
instr = bld.ds(op, Definition(val), offset, m, const_offset);
|
||||
static_cast<DS_instruction *>(instr)->sync = info->sync;
|
||||
|
||||
if (size < 4)
|
||||
val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
|
||||
|
|
@ -3490,8 +3491,7 @@ Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
load->definitions[0] = Definition(val);
|
||||
load->glc = info->glc;
|
||||
load->dlc = info->glc && bld.program->chip_class >= GFX10;
|
||||
load->barrier = info->barrier;
|
||||
load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
|
||||
load->sync = info->sync;
|
||||
bld.insert(std::move(load));
|
||||
return val;
|
||||
}
|
||||
|
|
@ -3540,8 +3540,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
mubuf->offen = (offset.type() == RegType::vgpr);
|
||||
mubuf->glc = info->glc;
|
||||
mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
|
||||
mubuf->barrier = info->barrier;
|
||||
mubuf->can_reorder = info->can_reorder;
|
||||
mubuf->sync = info->sync;
|
||||
mubuf->offset = const_offset;
|
||||
mubuf->swizzled = info->swizzle_component_size != 0;
|
||||
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
|
||||
|
|
@ -3605,7 +3604,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
mubuf->offset = 0;
|
||||
mubuf->addr64 = offset.type() == RegType::vgpr;
|
||||
mubuf->disable_wqm = false;
|
||||
mubuf->barrier = info->barrier;
|
||||
mubuf->sync = info->sync;
|
||||
mubuf->definitions[0] = Definition(val);
|
||||
bld.insert(std::move(mubuf));
|
||||
} else {
|
||||
|
|
@ -3616,7 +3615,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
|
|||
flat->operands[1] = Operand(s1);
|
||||
flat->glc = info->glc;
|
||||
flat->dlc = info->glc && bld.program->chip_class >= GFX10;
|
||||
flat->barrier = info->barrier;
|
||||
flat->sync = info->sync;
|
||||
flat->offset = 0u;
|
||||
flat->definitions[0] = Definition(val);
|
||||
bld.insert(std::move(flat));
|
||||
|
|
@ -3638,8 +3637,7 @@ Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
|
|||
LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
|
||||
info.align_mul = align;
|
||||
info.align_offset = 0;
|
||||
info.barrier = barrier_shared;
|
||||
info.can_reorder = false;
|
||||
info.sync = memory_sync_info(storage_shared);
|
||||
info.const_offset = base_offset;
|
||||
emit_lds_load(ctx, bld, &info);
|
||||
|
||||
|
|
@ -3848,13 +3846,16 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t
|
|||
}
|
||||
assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
|
||||
|
||||
Instruction *instr;
|
||||
if (write2) {
|
||||
Temp second_data = write_datas[second];
|
||||
inline_offset /= data.bytes();
|
||||
bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
|
||||
instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
|
||||
} else {
|
||||
bld.ds(op, address_offset, data, m, inline_offset);
|
||||
instr = bld.ds(op, address_offset, data, m, inline_offset);
|
||||
}
|
||||
static_cast<DS_instruction *>(instr)->sync =
|
||||
memory_sync_info(storage_shared);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -4017,7 +4018,8 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
|
|||
/* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
|
||||
/* dlc*/ false, /* slc */ slc);
|
||||
|
||||
static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
|
||||
if (!allow_reorder)
|
||||
static_cast<MUBUF_instruction *>(r.instr)->sync = memory_sync_info(storage_buffer, semantic_private);
|
||||
}
|
||||
|
||||
void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
|
||||
|
|
@ -4847,15 +4849,13 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
}
|
||||
|
||||
if (use_mubuf) {
|
||||
Instruction *mubuf = bld.mubuf(opcode,
|
||||
bld.mubuf(opcode,
|
||||
Definition(fetch_dst), list, fetch_index, soffset,
|
||||
fetch_offset, false, false, true).instr;
|
||||
static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
|
||||
} else {
|
||||
Instruction *mtbuf = bld.mtbuf(opcode,
|
||||
bld.mtbuf(opcode,
|
||||
Definition(fetch_dst), list, fetch_index, soffset,
|
||||
fetch_dfmt, nfmt, fetch_offset, false, true).instr;
|
||||
static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
|
||||
}
|
||||
|
||||
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
|
||||
|
|
@ -5208,7 +5208,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
|
||||
void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
|
||||
Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
|
||||
bool glc=false, bool readonly=true, bool allow_smem=true)
|
||||
bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info())
|
||||
{
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
|
||||
|
|
@ -5218,8 +5218,7 @@ void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_
|
|||
|
||||
LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
|
||||
info.glc = glc;
|
||||
info.barrier = readonly ? barrier_none : barrier_buffer;
|
||||
info.can_reorder = readonly;
|
||||
info.sync = sync;
|
||||
info.align_mul = align_mul;
|
||||
info.align_offset = align_offset;
|
||||
if (use_smem)
|
||||
|
|
@ -5737,7 +5736,6 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec
|
|||
load->unrm = true;
|
||||
load->da = da;
|
||||
load->dim = dim;
|
||||
load->can_reorder = true; /* fmask images shouldn't be modified */
|
||||
ctx->block->instructions.emplace_back(std::move(load));
|
||||
|
||||
Operand sample_index4;
|
||||
|
|
@ -5837,6 +5835,22 @@ static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr
|
|||
}
|
||||
|
||||
|
||||
memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics)
|
||||
{
|
||||
/* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
|
||||
if (semantics & semantic_atomicrmw)
|
||||
return memory_sync_info(storage, semantics);
|
||||
|
||||
unsigned access = nir_intrinsic_access(instr);
|
||||
|
||||
if (access & ACCESS_VOLATILE)
|
||||
semantics |= semantic_volatile;
|
||||
if (access & ACCESS_CAN_REORDER)
|
||||
semantics |= semantic_can_reorder | semantic_private;
|
||||
|
||||
return memory_sync_info(storage, semantics);
|
||||
}
|
||||
|
||||
void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
|
||||
{
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
|
|
@ -5846,6 +5860,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
bool is_array = glsl_sampler_type_is_array(type);
|
||||
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
|
||||
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
|
||||
|
||||
if (dim == GLSL_SAMPLER_DIM_BUF) {
|
||||
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
|
||||
unsigned num_channels = util_last_bit(mask);
|
||||
|
|
@ -5882,7 +5898,7 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
load->idxen = true;
|
||||
load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
|
||||
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
|
||||
load->barrier = barrier_image;
|
||||
load->sync = sync;
|
||||
ctx->block->instructions.emplace_back(std::move(load));
|
||||
|
||||
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
|
||||
|
|
@ -5914,7 +5930,7 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
load->dmask = dmask;
|
||||
load->unrm = true;
|
||||
load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
|
||||
load->barrier = barrier_image;
|
||||
load->sync = sync;
|
||||
ctx->block->instructions.emplace_back(std::move(load));
|
||||
|
||||
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
|
||||
|
|
@ -5929,6 +5945,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
bool is_array = glsl_sampler_type_is_array(type);
|
||||
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
|
||||
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
|
||||
bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
|
||||
|
||||
if (dim == GLSL_SAMPLER_DIM_BUF) {
|
||||
|
|
@ -5960,7 +5977,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
store->glc = glc;
|
||||
store->dlc = false;
|
||||
store->disable_wqm = true;
|
||||
store->barrier = barrier_image;
|
||||
store->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
return;
|
||||
|
|
@ -5984,7 +6001,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
store->unrm = true;
|
||||
store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
|
||||
store->disable_wqm = true;
|
||||
store->barrier = barrier_image;
|
||||
store->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
return;
|
||||
|
|
@ -6062,6 +6079,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
}
|
||||
|
||||
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
|
||||
|
||||
if (dim == GLSL_SAMPLER_DIM_BUF) {
|
||||
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
|
||||
|
|
@ -6079,7 +6097,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mubuf->glc = return_previous;
|
||||
mubuf->dlc = false; /* Not needed for atomics */
|
||||
mubuf->disable_wqm = true;
|
||||
mubuf->barrier = barrier_image;
|
||||
mubuf->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
return;
|
||||
|
|
@ -6100,7 +6118,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mimg->unrm = true;
|
||||
mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
|
||||
mimg->disable_wqm = true;
|
||||
mimg->barrier = barrier_image;
|
||||
mimg->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mimg));
|
||||
return;
|
||||
|
|
@ -6164,7 +6182,6 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
|
||||
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
|
||||
mimg->da = glsl_sampler_type_is_array(type);
|
||||
mimg->can_reorder = true;
|
||||
Definition& def = mimg->definitions[0];
|
||||
ctx->block->instructions.emplace_back(std::move(mimg));
|
||||
|
||||
|
|
@ -6219,7 +6236,8 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
|
||||
|
||||
load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
|
||||
nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
|
||||
nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
|
||||
get_memory_sync_info(instr, storage_buffer, 0));
|
||||
}
|
||||
|
||||
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
||||
|
|
@ -6233,6 +6251,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
|
||||
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
|
||||
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
|
||||
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
|
||||
uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
|
||||
/* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
|
||||
|
|
@ -6275,7 +6294,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
store->glc = glc;
|
||||
store->dlc = false;
|
||||
store->disable_wqm = true;
|
||||
store->barrier = barrier_buffer;
|
||||
store->sync = sync;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
ctx->program->wb_smem_l1_on_end = true;
|
||||
if (op == aco_opcode::p_fs_buffer_store_smem) {
|
||||
|
|
@ -6293,7 +6312,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
store->glc = glc;
|
||||
store->dlc = false;
|
||||
store->disable_wqm = true;
|
||||
store->barrier = barrier_buffer;
|
||||
store->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
}
|
||||
|
|
@ -6384,7 +6403,7 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mubuf->glc = return_previous;
|
||||
mubuf->dlc = false; /* Not needed for atomics */
|
||||
mubuf->disable_wqm = true;
|
||||
mubuf->barrier = barrier_buffer;
|
||||
mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
}
|
||||
|
|
@ -6409,8 +6428,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
|
||||
info.align_mul = nir_intrinsic_align_mul(instr);
|
||||
info.align_offset = nir_intrinsic_align_offset(instr);
|
||||
info.barrier = barrier_buffer;
|
||||
info.can_reorder = false;
|
||||
info.sync = get_memory_sync_info(instr, storage_buffer, 0);
|
||||
/* VMEM stores don't update the SMEM cache and it's difficult to prove that
|
||||
* it's safe to use SMEM */
|
||||
bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
|
||||
|
|
@ -6430,6 +6448,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
|
||||
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
|
||||
Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
|
||||
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
|
||||
|
||||
if (ctx->options->chip_class >= GFX7)
|
||||
|
|
@ -6495,7 +6514,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
flat->dlc = false;
|
||||
flat->offset = offset;
|
||||
flat->disable_wqm = true;
|
||||
flat->barrier = barrier_buffer;
|
||||
flat->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(flat));
|
||||
} else {
|
||||
|
|
@ -6515,7 +6534,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mubuf->offset = offsets[i];
|
||||
mubuf->addr64 = addr.type() == RegType::vgpr;
|
||||
mubuf->disable_wqm = true;
|
||||
mubuf->barrier = barrier_buffer;
|
||||
mubuf->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
}
|
||||
|
|
@ -6608,7 +6627,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
flat->dlc = false; /* Not needed for atomics */
|
||||
flat->offset = 0;
|
||||
flat->disable_wqm = true;
|
||||
flat->barrier = barrier_buffer;
|
||||
flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(flat));
|
||||
} else {
|
||||
|
|
@ -6675,7 +6694,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
mubuf->offset = 0;
|
||||
mubuf->addr64 = addr.type() == RegType::vgpr;
|
||||
mubuf->disable_wqm = true;
|
||||
mubuf->barrier = barrier_buffer;
|
||||
mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
}
|
||||
|
|
@ -6683,20 +6702,30 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
|
||||
void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
storage_class all_mem = (storage_class)(storage_buffer | storage_image | storage_atomic_counter | storage_shared);
|
||||
switch(instr->intrinsic) {
|
||||
case nir_intrinsic_group_memory_barrier:
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(all_mem, semantic_acqrel, scope_workgroup));
|
||||
break;
|
||||
case nir_intrinsic_memory_barrier:
|
||||
bld.barrier(aco_opcode::p_memory_barrier_common);
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(all_mem, semantic_acqrel, scope_device));
|
||||
break;
|
||||
case nir_intrinsic_memory_barrier_buffer:
|
||||
bld.barrier(aco_opcode::p_memory_barrier_buffer);
|
||||
break;
|
||||
case nir_intrinsic_memory_barrier_image:
|
||||
bld.barrier(aco_opcode::p_memory_barrier_image);
|
||||
/* since NIR splits barriers, we have to unify buffer and image barriers
|
||||
* for now so dEQP-VK.memory_model.message_passing.core11.u32.coherent.
|
||||
* fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
|
||||
* passes
|
||||
*/
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info((storage_class)(storage_buffer | storage_image), semantic_acqrel, scope_device));
|
||||
break;
|
||||
case nir_intrinsic_memory_barrier_tcs_patch:
|
||||
case nir_intrinsic_memory_barrier_shared:
|
||||
bld.barrier(aco_opcode::p_memory_barrier_shared);
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup));
|
||||
break;
|
||||
default:
|
||||
unreachable("Unimplemented memory barrier intrinsic");
|
||||
|
|
@ -6844,6 +6873,7 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
ds->offset0 = offset;
|
||||
if (return_previous)
|
||||
ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
|
||||
ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
|
||||
ctx->block->instructions.emplace_back(std::move(ds));
|
||||
}
|
||||
|
||||
|
|
@ -6884,7 +6914,7 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
|
|||
info.align_mul = nir_intrinsic_align_mul(instr);
|
||||
info.align_offset = nir_intrinsic_align_offset(instr);
|
||||
info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
|
||||
info.can_reorder = false;
|
||||
info.sync = memory_sync_info(storage_buffer, semantic_private);
|
||||
info.soffset = ctx->program->scratch_offset;
|
||||
emit_scratch_load(ctx, bld, &info);
|
||||
}
|
||||
|
|
@ -6907,7 +6937,8 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
|
|||
|
||||
for (unsigned i = 0; i < write_count; i++) {
|
||||
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
|
||||
bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
|
||||
Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
|
||||
static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_buffer, semantic_private);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -7021,8 +7052,7 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst
|
|||
mtbuf->offset = const_offset;
|
||||
mtbuf->glc = true;
|
||||
mtbuf->slc = true;
|
||||
mtbuf->barrier = barrier_gs_data;
|
||||
mtbuf->can_reorder = true;
|
||||
mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
|
||||
bld.insert(std::move(mtbuf));
|
||||
}
|
||||
|
||||
|
|
@ -7347,8 +7377,6 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
load->glc = false;
|
||||
load->dlc = false;
|
||||
load->disable_wqm = false;
|
||||
load->barrier = barrier_none;
|
||||
load->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(load));
|
||||
}
|
||||
|
||||
|
|
@ -7531,17 +7559,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
visit_get_buffer_size(ctx, instr);
|
||||
break;
|
||||
case nir_intrinsic_control_barrier: {
|
||||
if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||
/* GFX6 only (thanks to a hw bug workaround):
|
||||
* The real barrier instruction isn’t needed, because an entire patch
|
||||
* always fits into a single wave.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
|
||||
if (ctx->program->workgroup_size > ctx->program->wave_size)
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
|
||||
bld.barrier(aco_opcode::p_barrier, memory_sync_info(0, 0, scope_invocation), scope_workgroup);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_memory_barrier_tcs_patch:
|
||||
|
|
@ -8093,7 +8111,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
aco_opcode opcode =
|
||||
nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
|
||||
aco_opcode::s_memrealtime : aco_opcode::s_memtime;
|
||||
bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
|
||||
bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), memory_sync_info(0, semantic_volatile));
|
||||
emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
|
||||
break;
|
||||
}
|
||||
|
|
@ -8674,7 +8692,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
tex->da = da;
|
||||
tex->definitions[0] = Definition(tmp_dst);
|
||||
tex->dim = dim;
|
||||
tex->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(tex));
|
||||
|
||||
if (div_by_6) {
|
||||
|
|
@ -8707,7 +8724,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
tex->da = da;
|
||||
Temp size = bld.tmp(v2);
|
||||
tex->definitions[0] = Definition(size);
|
||||
tex->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(tex));
|
||||
emit_split_vector(ctx, size, size.size());
|
||||
|
||||
|
|
@ -8809,7 +8825,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
mubuf->operands[2] = Operand((uint32_t) 0);
|
||||
mubuf->definitions[0] = Definition(tmp_dst);
|
||||
mubuf->idxen = true;
|
||||
mubuf->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
||||
|
||||
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
|
||||
|
|
@ -8858,7 +8873,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
tex->unrm = true;
|
||||
tex->da = da;
|
||||
tex->definitions[0] = Definition(tmp_dst);
|
||||
tex->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(tex));
|
||||
|
||||
if (instr->op == nir_texop_samples_identical) {
|
||||
|
|
@ -9002,7 +9016,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
|
|||
tex->dmask = dmask;
|
||||
tex->da = da;
|
||||
tex->definitions[0] = Definition(tmp_dst);
|
||||
tex->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(tex));
|
||||
|
||||
if (tg4_integer_cube_workaround) {
|
||||
|
|
@ -10285,6 +10298,13 @@ static void create_fs_exports(isel_context *ctx)
|
|||
create_null_export(ctx);
|
||||
}
|
||||
|
||||
static void create_workgroup_barrier(Builder& bld)
|
||||
{
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup),
|
||||
scope_workgroup);
|
||||
}
|
||||
|
||||
static void write_tcs_tess_factors(isel_context *ctx)
|
||||
{
|
||||
unsigned outer_comps;
|
||||
|
|
@ -10309,9 +10329,7 @@ static void write_tcs_tess_factors(isel_context *ctx)
|
|||
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
|
||||
bld.barrier(aco_opcode::p_memory_barrier_shared);
|
||||
if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
create_workgroup_barrier(bld);
|
||||
|
||||
Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
|
||||
Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
|
||||
|
|
@ -10470,7 +10488,6 @@ static void emit_stream_output(isel_context *ctx,
|
|||
store->glc = true;
|
||||
store->dlc = false;
|
||||
store->slc = true;
|
||||
store->can_reorder = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
}
|
||||
}
|
||||
|
|
@ -10890,8 +10907,7 @@ void ngg_emit_nogs_output(isel_context *ctx)
|
|||
|
||||
if (ctx->stage == ngg_vertex_gs) {
|
||||
/* Wait for GS threads to store primitive ID in LDS. */
|
||||
bld.barrier(aco_opcode::p_memory_barrier_shared);
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
create_workgroup_barrier(bld);
|
||||
|
||||
/* Calculate LDS address where the GS threads stored the primitive ID. */
|
||||
Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
|
||||
|
|
@ -10975,8 +10991,7 @@ void select_program(Program *program,
|
|||
if (i) {
|
||||
Builder bld(ctx.program, ctx.block);
|
||||
|
||||
bld.barrier(aco_opcode::p_memory_barrier_shared);
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
create_workgroup_barrier(bld);
|
||||
|
||||
if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
|
||||
ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
|
||||
|
|
@ -10999,7 +11014,8 @@ void select_program(Program *program,
|
|||
ngg_emit_nogs_output(&ctx);
|
||||
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
|
||||
Builder bld(ctx.program, ctx.block);
|
||||
bld.barrier(aco_opcode::p_memory_barrier_gs_data);
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_vmem_output, semantic_release, scope_device));
|
||||
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
|
||||
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||
write_tcs_tess_factors(&ctx);
|
||||
|
|
@ -11031,7 +11047,7 @@ void select_program(Program *program,
|
|||
ctx.block->kind |= block_kind_uniform;
|
||||
Builder bld(ctx.program, ctx.block);
|
||||
if (ctx.program->wb_smem_l1_on_end)
|
||||
bld.smem(aco_opcode::s_dcache_wb, false);
|
||||
bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
|
||||
bld.sopp(aco_opcode::s_endpgm);
|
||||
|
||||
cleanup_cfg(program);
|
||||
|
|
@ -11116,8 +11132,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
|
|||
mubuf->glc = true;
|
||||
mubuf->slc = true;
|
||||
mubuf->dlc = args->options->chip_class >= GFX10;
|
||||
mubuf->barrier = barrier_none;
|
||||
mubuf->can_reorder = true;
|
||||
|
||||
ctx.outputs.mask[i] |= 1 << j;
|
||||
ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
|
||||
|
|
|
|||
|
|
@ -127,6 +127,28 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||
program->next_fp_mode.round32 = fp_round_ne;
|
||||
}
|
||||
|
||||
memory_sync_info get_sync_info(const Instruction* instr)
|
||||
{
|
||||
switch (instr->format) {
|
||||
case Format::SMEM:
|
||||
return static_cast<const SMEM_instruction*>(instr)->sync;
|
||||
case Format::MUBUF:
|
||||
return static_cast<const MUBUF_instruction*>(instr)->sync;
|
||||
case Format::MIMG:
|
||||
return static_cast<const MIMG_instruction*>(instr)->sync;
|
||||
case Format::MTBUF:
|
||||
return static_cast<const MTBUF_instruction*>(instr)->sync;
|
||||
case Format::FLAT:
|
||||
case Format::GLOBAL:
|
||||
case Format::SCRATCH:
|
||||
return static_cast<const FLAT_instruction*>(instr)->sync;
|
||||
case Format::DS:
|
||||
return static_cast<const DS_instruction*>(instr)->sync;
|
||||
default:
|
||||
return memory_sync_info();
|
||||
}
|
||||
}
|
||||
|
||||
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (!instr->isVALU())
|
||||
|
|
|
|||
|
|
@@ -103,22 +103,79 @@ enum class Format : std::uint16_t {
    SDWA = 1 << 14,
 };
 
-enum barrier_interaction : uint8_t {
-   barrier_none = 0,
-   barrier_buffer = 0x1,
-   barrier_image = 0x2,
-   barrier_atomic = 0x4,
-   barrier_shared = 0x8,
-   /* used for geometry shaders to ensure vertex data writes are before the
-    * GS_DONE s_sendmsg. */
-   barrier_gs_data = 0x10,
-   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
-   barrier_gs_sendmsg = 0x20,
-   /* used by barriers. created by s_barrier */
-   barrier_barrier = 0x40,
-   barrier_count = 7,
+enum storage_class : uint8_t {
+   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
+   storage_buffer = 0x1, /* SSBOs and global memory */
+   storage_atomic_counter = 0x2, /* not used for Vulkan */
+   storage_image = 0x4,
+   storage_shared = 0x8, /* or TCS output */
+   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
+   storage_scratch = 0x20,
+   storage_vgpr_spill = 0x40,
+   storage_count = 8,
 };
 
+enum memory_semantics : uint8_t {
+   semantic_none = 0x0,
+   /* for loads: don't move any access after this load to before this load (even other loads)
+    * for barriers: don't move any access after the barrier to before any
+    * atomics/control_barriers/sendmsg_gs_done before the barrier */
+   semantic_acquire = 0x1,
+   /* for stores: don't move any access before this store to after this store
+    * for barriers: don't move any access before the barrier to after any
+    * atomics/control_barriers/sendmsg_gs_done after the barrier */
+   semantic_release = 0x2,
+
+   /* the rest are for loads/stores/atomics only */
+   /* cannot be DCE'd or CSE'd */
+   semantic_volatile = 0x4,
+   /* does not interact with barriers and assumes this lane is the only lane
+    * accessing this memory */
+   semantic_private = 0x8,
+   /* this operation can be reordered around operations of the same storage. says nothing about barriers */
+   semantic_can_reorder = 0x10,
+   /* this is an atomic instruction (may only read or write memory) */
+   semantic_atomic = 0x20,
+   /* this instruction both reads and writes memory */
+   semantic_rmw = 0x40,
+
+   semantic_acqrel = semantic_acquire | semantic_release,
+   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
+};
+
+enum sync_scope : uint8_t {
+   scope_invocation = 0,
+   scope_subgroup = 1,
+   scope_workgroup = 2,
+   scope_queuefamily = 3,
+   scope_device = 4,
+};
+
+struct memory_sync_info {
+   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
+   memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
+      : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
+
+   storage_class storage:8;
+   memory_semantics semantics:8;
+   sync_scope scope:8;
+
+   bool operator == (const memory_sync_info& rhs) const {
+      return storage == rhs.storage &&
+             semantics == rhs.semantics &&
+             scope == rhs.scope;
+   }
+
+   bool can_reorder() const {
+      if (semantics & semantic_acqrel)
+         return false;
+      /* Also check storage so that zero-initialized memory_sync_info can be
+       * reordered. */
+      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
+   }
+};
+static_assert(sizeof(memory_sync_info) == 3);
+
 enum fp_round {
    fp_round_ne = 0,
    fp_round_pi = 1,
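To make the can_reorder() rule above concrete, here is a standalone sketch: it re-declares a trimmed copy of the enums and struct added in this hunk (only what is needed to compile) and asserts the behaviour they imply. The assertions are illustrative and are not part of the commit.

#include <cassert>
#include <cstdint>

/* Trimmed, standalone copy of the definitions added in this hunk, kept only
 * for this example (the real ones live in the ACO IR header). */
enum storage_class : uint8_t { storage_none = 0x0, storage_buffer = 0x1 };
enum memory_semantics : uint8_t {
   semantic_none = 0x0, semantic_acquire = 0x1, semantic_release = 0x2,
   semantic_volatile = 0x4, semantic_can_reorder = 0x10,
   semantic_acqrel = semantic_acquire | semantic_release,
};
enum sync_scope : uint8_t { scope_invocation = 0 };

struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
      : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}

   storage_class storage:8;
   memory_semantics semantics:8;
   sync_scope scope:8;

   bool can_reorder() const {
      if (semantics & semantic_acqrel)
         return false;
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};

int main()
{
   /* A zero-initialized sync info (no storage) stays reorderable. */
   assert(memory_sync_info().can_reorder());
   /* A plain buffer access is not reorderable by default... */
   assert(!memory_sync_info(storage_buffer).can_reorder());
   /* ...unless it carries semantic_can_reorder... */
   assert(memory_sync_info(storage_buffer, semantic_can_reorder).can_reorder());
   /* ...and volatile or acquire/release semantics always pin it in place. */
   assert(!memory_sync_info(storage_buffer, semantic_can_reorder | semantic_volatile).can_reorder());
   assert(!memory_sync_info(storage_buffer, semantic_acquire).can_reorder());
   return 0;
}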
@ -931,14 +988,13 @@ static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected p
|
|||
*
|
||||
*/
|
||||
struct SMEM_instruction : public Instruction {
|
||||
barrier_interaction barrier;
|
||||
memory_sync_info sync;
|
||||
bool glc : 1; /* VI+: globally coherent */
|
||||
bool dlc : 1; /* NAVI: device level coherent */
|
||||
bool nv : 1; /* VEGA only: Non-volatile */
|
||||
bool can_reorder : 1;
|
||||
bool disable_wqm : 1;
|
||||
bool prevent_overflow : 1; /* avoid overflow when combining additions */
|
||||
uint32_t padding: 18;
|
||||
uint32_t padding: 3;
|
||||
};
|
||||
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
|
||||
|
|
@ -1066,11 +1122,13 @@ static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected
|
|||
*
|
||||
*/
|
||||
struct DS_instruction : public Instruction {
|
||||
memory_sync_info sync;
|
||||
bool gds;
|
||||
int16_t offset0;
|
||||
int8_t offset1;
|
||||
bool gds;
|
||||
uint8_t padding;
|
||||
};
|
||||
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
/**
|
||||
* Vector Memory Untyped-buffer Instructions
|
||||
|
|
@ -1081,7 +1139,7 @@ static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected pad
|
|||
*
|
||||
*/
|
||||
struct MUBUF_instruction : public Instruction {
|
||||
uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
|
||||
memory_sync_info sync;
|
||||
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
|
||||
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
|
||||
bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
|
||||
|
|
@ -1091,12 +1149,11 @@ struct MUBUF_instruction : public Instruction {
|
|||
bool tfe : 1; /* texture fail enable */
|
||||
bool lds : 1; /* Return read-data to LDS instead of VGPRs */
|
||||
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
|
||||
bool can_reorder : 1;
|
||||
bool swizzled:1;
|
||||
uint8_t padding : 1;
|
||||
barrier_interaction barrier;
|
||||
uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
|
||||
bool swizzled : 1;
|
||||
uint32_t padding1 : 18;
|
||||
};
|
||||
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
/**
|
||||
* Vector Memory Typed-buffer Instructions
|
||||
|
|
@ -1107,8 +1164,7 @@ static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected
|
|||
*
|
||||
*/
|
||||
struct MTBUF_instruction : public Instruction {
|
||||
uint16_t offset; /* Unsigned byte offset - 12 bit */
|
||||
barrier_interaction barrier;
|
||||
memory_sync_info sync;
|
||||
uint8_t dfmt : 4; /* Data Format of data in memory buffer */
|
||||
uint8_t nfmt : 3; /* Numeric format of data in memory */
|
||||
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
|
||||
|
|
@ -1118,8 +1174,8 @@ struct MTBUF_instruction : public Instruction {
|
|||
bool slc : 1; /* system level coherent */
|
||||
bool tfe : 1; /* texture fail enable */
|
||||
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
|
||||
bool can_reorder : 1;
|
||||
uint32_t padding : 25;
|
||||
uint32_t padding : 10;
|
||||
uint16_t offset; /* Unsigned byte offset - 12 bit */
|
||||
};
|
||||
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
|
|
@ -1133,6 +1189,7 @@ static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected
|
|||
*
|
||||
*/
|
||||
struct MIMG_instruction : public Instruction {
|
||||
memory_sync_info sync;
|
||||
uint8_t dmask; /* Data VGPR enable mask */
|
||||
uint8_t dim : 3; /* NAVI: dimensionality */
|
||||
bool unrm : 1; /* Force address to be un-normalized */
|
||||
|
|
@ -1146,11 +1203,9 @@ struct MIMG_instruction : public Instruction {
|
|||
bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
|
||||
bool d16 : 1; /* Convert 32-bit data to 16-bit data */
|
||||
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
|
||||
bool can_reorder : 1;
|
||||
uint8_t padding : 1;
|
||||
barrier_interaction barrier;
|
||||
uint32_t padding : 18;
|
||||
};
|
||||
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
/**
|
||||
* Flat/Scratch/Global Instructions
|
||||
|
|
@ -1160,18 +1215,18 @@ static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected p
|
|||
*
|
||||
*/
|
||||
struct FLAT_instruction : public Instruction {
|
||||
uint16_t offset; /* Vega/Navi only */
|
||||
memory_sync_info sync;
|
||||
bool slc : 1; /* system level coherent */
|
||||
bool glc : 1; /* globally coherent */
|
||||
bool dlc : 1; /* NAVI: device level coherent */
|
||||
bool lds : 1;
|
||||
bool nv : 1;
|
||||
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
|
||||
bool can_reorder : 1;
|
||||
uint8_t padding : 1;
|
||||
barrier_interaction barrier;
|
||||
uint32_t padding0 : 2;
|
||||
uint16_t offset; /* Vega/Navi only */
|
||||
uint16_t padding1;
|
||||
};
|
||||
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
struct Export_instruction : public Instruction {
|
||||
uint8_t enabled_mask;
|
||||
|
|
@ -1200,8 +1255,10 @@ struct Pseudo_branch_instruction : public Instruction {
|
|||
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
|
||||
struct Pseudo_barrier_instruction : public Instruction {
|
||||
memory_sync_info sync;
|
||||
sync_scope exec_scope;
|
||||
};
|
||||
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
|
||||
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
|
||||
|
||||
enum ReduceOp : uint16_t {
|
||||
iadd8, iadd16, iadd32, iadd64,
|
||||
|
|
@ -1298,7 +1355,8 @@ static inline bool is_phi(aco_ptr<Instruction>& instr)
|
|||
return is_phi(instr.get());
|
||||
}
|
||||
|
||||
barrier_interaction get_barrier_interaction(const Instruction* instr);
|
||||
memory_sync_info get_sync_info(const Instruction* instr);
|
||||
|
||||
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
|
||||
|
||||
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
|
||||
|
|
|
|||
|
|
@ -1854,6 +1854,7 @@ void lower_to_hw_instr(Program* program)
|
|||
emit_gfx10_wave64_bpermute(program, instr, bld);
|
||||
else
|
||||
unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
|
|
@ -1908,6 +1909,18 @@ void lower_to_hw_instr(Program* program)
|
|||
reduce->operands[2].physReg(), // vtmp
|
||||
reduce->definitions[2].physReg(), // sitmp
|
||||
reduce->operands[0], reduce->definitions[0]);
|
||||
} else if (instr->format == Format::PSEUDO_BARRIER) {
|
||||
Pseudo_barrier_instruction* barrier = static_cast<Pseudo_barrier_instruction*>(instr.get());
|
||||
|
||||
/* Anything larger than a workgroup isn't possible. Anything
|
||||
* smaller requires no instructions and this pseudo instruction
|
||||
* would only be included to control optimizations. */
|
||||
bool emit_s_barrier = barrier->exec_scope == scope_workgroup &&
|
||||
program->workgroup_size > program->wave_size;
|
||||
|
||||
bld.insert(std::move(instr));
|
||||
if (emit_s_barrier)
|
||||
bld.sopp(aco_opcode::s_barrier);
|
||||
} else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
|
||||
float_mode new_mode = block->fp_mode;
|
||||
new_mode.round16_64 = fp_round_ne;
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ class Format(Enum):
|
|||
return [('uint32_t', 'block', '-1'),
|
||||
('uint32_t', 'imm', '0')]
|
||||
elif self == Format.SMEM:
|
||||
return [('bool', 'can_reorder', 'true'),
|
||||
return [('memory_sync_info', 'sync', 'memory_sync_info()'),
|
||||
('bool', 'glc', 'false'),
|
||||
('bool', 'dlc', 'false'),
|
||||
('bool', 'nv', 'false')]
|
||||
|
|
@ -123,6 +123,9 @@ class Format(Enum):
|
|||
elif self == Format.PSEUDO_REDUCTION:
|
||||
return [('ReduceOp', 'op', None, 'reduce_op'),
|
||||
('unsigned', 'cluster_size', '0')]
|
||||
elif self == Format.PSEUDO_BARRIER:
|
||||
return [('memory_sync_info', 'sync', None),
|
||||
('sync_scope', 'exec_scope', 'scope_invocation')]
|
||||
elif self == Format.VINTRP:
|
||||
return [('unsigned', 'attribute', None),
|
||||
('unsigned', 'component', None)]
|
||||
|
|
@ -133,7 +136,7 @@ class Format(Enum):
|
|||
('bool', 'bound_ctrl', 'true')]
|
||||
elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
|
||||
return [('uint16_t', 'offset', 0),
|
||||
('bool', 'can_reorder', 'true'),
|
||||
('memory_sync_info', 'sync', 'memory_sync_info()'),
|
||||
('bool', 'glc', 'false'),
|
||||
('bool', 'slc', 'false'),
|
||||
('bool', 'lds', 'false'),
|
||||
|
|
@ -265,13 +268,7 @@ opcode("p_cbranch", format=Format.PSEUDO_BRANCH)
|
|||
opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
|
||||
opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)
|
||||
|
||||
opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared
|
||||
opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER)
|
||||
opcode("p_barrier", format=Format.PSEUDO_BARRIER)
|
||||
|
||||
opcode("p_spill")
|
||||
opcode("p_reload")
|
||||
|
|
|
|||
|
|
@ -224,12 +224,15 @@ struct InstrPred {
|
|||
return aK->imm == bK->imm;
|
||||
}
|
||||
case Format::SMEM: {
|
||||
if (!a->operands.empty() && a->operands[0].bytes() == 16)
|
||||
return false;
|
||||
SMEM_instruction* aS = static_cast<SMEM_instruction*>(a);
|
||||
SMEM_instruction* bS = static_cast<SMEM_instruction*>(b);
|
||||
/* isel shouldn't be creating situations where this assertion fails */
|
||||
assert(aS->prevent_overflow == bS->prevent_overflow);
|
||||
return aS->can_reorder && bS->can_reorder &&
|
||||
aS->glc == bS->glc && aS->nv == bS->nv &&
|
||||
return aS->sync.can_reorder() && bS->sync.can_reorder() &&
|
||||
aS->sync == bS->sync && aS->glc == bS->glc && aS->dlc == bS->dlc &&
|
||||
aS->nv == bS->nv && aS->disable_wqm == bS->disable_wqm &&
|
||||
aS->prevent_overflow == bS->prevent_overflow;
|
||||
}
|
||||
case Format::VINTRP: {
|
||||
|
|

@ -251,8 +254,8 @@ struct InstrPred {
         case Format::MTBUF: {
            MTBUF_instruction* aM = static_cast<MTBUF_instruction *>(a);
            MTBUF_instruction* bM = static_cast<MTBUF_instruction *>(b);
            return aM->can_reorder && bM->can_reorder &&
                   aM->barrier == bM->barrier &&
            return aM->sync.can_reorder() && bM->sync.can_reorder() &&
                   aM->sync == bM->sync &&
                   aM->dfmt == bM->dfmt &&
                   aM->nfmt == bM->nfmt &&
                   aM->offset == bM->offset &&

@ -267,8 +270,8 @@ struct InstrPred {
         case Format::MUBUF: {
            MUBUF_instruction* aM = static_cast<MUBUF_instruction *>(a);
            MUBUF_instruction* bM = static_cast<MUBUF_instruction *>(b);
            return aM->can_reorder && bM->can_reorder &&
                   aM->barrier == bM->barrier &&
            return aM->sync.can_reorder() && bM->sync.can_reorder() &&
                   aM->sync == bM->sync &&
                   aM->offset == bM->offset &&
                   aM->offen == bM->offen &&
                   aM->idxen == bM->idxen &&

@ -295,7 +298,9 @@ struct InstrPred {
               return false;
            DS_instruction* aD = static_cast<DS_instruction *>(a);
            DS_instruction* bD = static_cast<DS_instruction *>(b);
            return aD->pass_flags == bD->pass_flags &&
            return aD->sync.can_reorder() && bD->sync.can_reorder() &&
                   aD->sync == bD->sync &&
                   aD->pass_flags == bD->pass_flags &&
                   aD->gds == bD->gds &&
                   aD->offset0 == bD->offset0 &&
                   aD->offset1 == bD->offset1;

@ -303,8 +308,8 @@ struct InstrPred {
         case Format::MIMG: {
            MIMG_instruction* aM = static_cast<MIMG_instruction*>(a);
            MIMG_instruction* bM = static_cast<MIMG_instruction*>(b);
            return aM->can_reorder && bM->can_reorder &&
                   aM->barrier == bM->barrier &&
            return aM->sync.can_reorder() && bM->sync.can_reorder() &&
                   aM->sync == bM->sync &&
                   aM->dmask == bM->dmask &&
                   aM->unrm == bM->unrm &&
                   aM->glc == bM->glc &&

@ -1026,8 +1026,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
         new_instr->operands.back() = Operand(base);
         if (!smem->definitions.empty())
            new_instr->definitions[0] = smem->definitions[0];
         new_instr->can_reorder = smem->can_reorder;
         new_instr->barrier = smem->barrier;
         new_instr->sync = smem->sync;
         new_instr->glc = smem->glc;
         new_instr->dlc = smem->dlc;
         new_instr->nv = smem->nv;

@ -189,23 +189,73 @@ static void print_definition(const Definition *definition, FILE *output)
   print_physReg(definition->physReg(), definition->bytes(), output);
}

static void print_barrier_reorder(bool can_reorder, barrier_interaction barrier, FILE *output)
static void print_storage(storage_class storage, FILE *output)
{
   if (can_reorder)
      fprintf(output, " reorder");
   fprintf(output, " storage:");
   int printed = 0;
   if (storage & storage_buffer)
      printed += fprintf(output, "%sbuffer", printed ? "," : "");
   if (storage & storage_atomic_counter)
      printed += fprintf(output, "%satomic_counter", printed ? "," : "");
   if (storage & storage_image)
      printed += fprintf(output, "%simage", printed ? "," : "");
   if (storage & storage_shared)
      printed += fprintf(output, "%sshared", printed ? "," : "");
   if (storage & storage_vmem_output)
      printed += fprintf(output, "%svmem_output", printed ? "," : "");
   if (storage & storage_scratch)
      printed += fprintf(output, "%sscratch", printed ? "," : "");
   if (storage & storage_vgpr_spill)
      printed += fprintf(output, "%svgpr_spill", printed ? "," : "");
}

   if (barrier & barrier_buffer)
      fprintf(output, " buffer");
   if (barrier & barrier_image)
      fprintf(output, " image");
   if (barrier & barrier_atomic)
      fprintf(output, " atomic");
   if (barrier & barrier_shared)
      fprintf(output, " shared");
   if (barrier & barrier_gs_data)
      fprintf(output, " gs_data");
   if (barrier & barrier_gs_sendmsg)
      fprintf(output, " gs_sendmsg");
static void print_semantics(memory_semantics sem, FILE *output)
{
   fprintf(output, " semantics:");
   int printed = 0;
   if (sem & semantic_acquire)
      printed += fprintf(output, "%sacquire", printed ? "," : "");
   if (sem & semantic_release)
      printed += fprintf(output, "%srelease", printed ? "," : "");
   if (sem & semantic_volatile)
      printed += fprintf(output, "%svolatile", printed ? "," : "");
   if (sem & semantic_private)
      printed += fprintf(output, "%sprivate", printed ? "," : "");
   if (sem & semantic_can_reorder)
      printed += fprintf(output, "%sreorder", printed ? "," : "");
   if (sem & semantic_atomic)
      printed += fprintf(output, "%satomic", printed ? "," : "");
   if (sem & semantic_rmw)
      printed += fprintf(output, "%srmw", printed ? "," : "");
}

static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope")
{
   fprintf(output, " %s:", prefix);
   switch (scope) {
   case scope_invocation:
      fprintf(output, "invocation");
      break;
   case scope_subgroup:
      fprintf(output, "subgroup");
      break;
   case scope_workgroup:
      fprintf(output, "workgroup");
      break;
   case scope_queuefamily:
      fprintf(output, "queuefamily");
      break;
   case scope_device:
      fprintf(output, "device");
      break;
   }
}

static void print_sync(memory_sync_info sync, FILE *output)
{
   print_storage(sync.storage, output);
   print_semantics(sync.semantics, output);
   print_scope(sync.scope, output);
}

static void print_instr_format_specific(const Instruction *instr, FILE *output)
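
The printers above build a single comma-separated list by reusing fprintf's return value (the number of characters written) to decide whether a separator is needed. A stand-alone sketch of the same idiom, with hypothetical flag names:

   #include <cstdio>

   enum { flag_buffer = 1, flag_image = 2, flag_shared = 4 }; /* hypothetical */

   static void print_flags(unsigned flags, FILE *output)
   {
      fprintf(output, " storage:");
      int printed = 0;
      if (flags & flag_buffer)
         printed += fprintf(output, "%sbuffer", printed ? "," : "");
      if (flags & flag_image)
         printed += fprintf(output, "%simage", printed ? "," : "");
      if (flags & flag_shared)
         printed += fprintf(output, "%sshared", printed ? "," : "");
   }

   int main()
   {
      print_flags(flag_buffer | flag_shared, stdout); /* prints " storage:buffer,shared" */
      fputc('\n', stdout);
      return 0;
   }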

@ -292,7 +342,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " dlc");
      if (smem->nv)
         fprintf(output, " nv");
      print_barrier_reorder(smem->can_reorder, smem->barrier, output);
      print_sync(smem->sync, output);
      break;
   }
   case Format::VINTRP: {

@ -308,6 +358,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " offset1:%u", ds->offset1);
      if (ds->gds)
         fprintf(output, " gds");
      print_sync(ds->sync, output);
      break;
   }
   case Format::MUBUF: {

@ -332,7 +383,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " lds");
      if (mubuf->disable_wqm)
         fprintf(output, " disable_wqm");
      print_barrier_reorder(mubuf->can_reorder, mubuf->barrier, output);
      print_sync(mubuf->sync, output);
      break;
   }
   case Format::MIMG: {

@ -392,7 +443,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " d16");
      if (mimg->disable_wqm)
         fprintf(output, " disable_wqm");
      print_barrier_reorder(mimg->can_reorder, mimg->barrier, output);
      print_sync(mimg->sync, output);
      break;
   }
   case Format::EXP: {

@ -439,6 +490,12 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
      fprintf(output, " cluster_size:%u", reduce->cluster_size);
      break;
   }
   case Format::PSEUDO_BARRIER: {
      const Pseudo_barrier_instruction* barrier = static_cast<const Pseudo_barrier_instruction*>(instr);
      print_sync(barrier->sync, output);
      print_scope(barrier->exec_scope, output, "exec_scope");
      break;
   }
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: {

@ -457,7 +514,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " nv");
      if (flat->disable_wqm)
         fprintf(output, " disable_wqm");
      print_barrier_reorder(flat->can_reorder, flat->barrier, output);
      print_sync(flat->sync, output);
      break;
   }
   case Format::MTBUF: {

@ -507,7 +564,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
         fprintf(output, " tfe");
      if (mtbuf->disable_wqm)
         fprintf(output, " disable_wqm");
      print_barrier_reorder(mtbuf->can_reorder, mtbuf->barrier, output);
      print_sync(mtbuf->sync, output);
      break;
   }
   case Format::VOP3P: {

@ -318,26 +318,6 @@ void MoveState::upwards_skip()
   source_idx++;
}

bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}

bool is_gs_or_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {

@ -357,96 +337,96 @@ bool is_done_sendmsg(const Instruction *instr)
   return false;
}

barrier_interaction get_barrier_interaction(const Instruction* instr)
memory_sync_info get_sync_info_with_hack(const Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<const SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<const MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<const MIMG_instruction*>(instr)->barrier;
   case Format::MTBUF:
      return static_cast<const MTBUF_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<const FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   case Format::SOPP:
      if (is_done_sendmsg(instr))
         return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
      else if (is_gs_or_done_sendmsg(instr))
         return barrier_gs_sendmsg;
      else
         return barrier_none;
   case Format::PSEUDO_BARRIER:
      return barrier_barrier;
   default:
      return barrier_none;
   memory_sync_info sync = get_sync_info(instr);
   if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
      // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
      sync.storage = (storage_class)(sync.storage | storage_buffer);
      sync.semantics = (memory_semantics)(sync.semantics | semantic_private);
   }
   return sync;
}

barrier_interaction parse_barrier(Instruction *instr)
{
   if (instr->format == Format::PSEUDO_BARRIER) {
      switch (instr->opcode) {
      case aco_opcode::p_memory_barrier_atomic:
         return barrier_atomic;
      /* For now, buffer and image barriers are treated the same. this is because of
       * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
       * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
       * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
       * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
       * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
       * Either way, this solution should work. */
      case aco_opcode::p_memory_barrier_buffer:
      case aco_opcode::p_memory_barrier_image:
         return (barrier_interaction)(barrier_image | barrier_buffer);
      case aco_opcode::p_memory_barrier_shared:
         return barrier_shared;
      case aco_opcode::p_memory_barrier_common:
         return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
      case aco_opcode::p_memory_barrier_gs_data:
         return barrier_gs_data;
      case aco_opcode::p_memory_barrier_gs_sendmsg:
         return barrier_gs_sendmsg;
      default:
         break;
      }
   } else if (instr->opcode == aco_opcode::s_barrier) {
      return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
   }
   return barrier_none;
}

struct memory_event_set {
   bool has_control_barrier;

   unsigned bar_acquire;
   unsigned bar_release;
   unsigned bar_classes;

   unsigned access_acquire;
   unsigned access_release;
   unsigned access_relaxed;
   unsigned access_atomic;
};

struct hazard_query {
   bool contains_spill;
   int barriers;
   int barrier_interaction;
   bool can_reorder_vmem;
   bool can_reorder_smem;
   bool contains_sendmsg;
   memory_event_set mem_events;
   unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
   unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};

void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->barriers = 0;
   query->barrier_interaction = 0;
   query->can_reorder_vmem = true;
   query->can_reorder_smem = true;
   query->contains_sendmsg = false;
   memset(&query->mem_events, 0, sizeof(query->mem_events));
   query->aliasing_storage = 0;
   query->aliasing_storage_smem = 0;
}

void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
{
   set->has_control_barrier |= is_done_sendmsg(instr);
   if (instr->opcode == aco_opcode::p_barrier) {
      Pseudo_barrier_instruction *bar = static_cast<Pseudo_barrier_instruction*>(instr);
      if (bar->sync.semantics & semantic_acquire)
         set->bar_acquire |= bar->sync.storage;
      if (bar->sync.semantics & semantic_release)
         set->bar_release |= bar->sync.storage;
      set->bar_classes |= bar->sync.storage;

      set->has_control_barrier |= bar->exec_scope > scope_invocation;
   }

   if (!sync->storage)
      return;

   if (sync->semantics & semantic_acquire)
      set->access_acquire |= sync->storage;
   if (sync->semantics & semantic_release)
      set->access_release |= sync->storage;

   if (!(sync->semantics & semantic_private)) {
      if (sync->semantics & semantic_atomic)
         set->access_atomic |= sync->storage;
      else
         set->access_relaxed |= sync->storage;
   }
}
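
As a worked example of what add_memory_event collects, consider two instructions: a coherent buffer load with acquire semantics (not private, not atomic) and a p_barrier with acquire+release semantics over shared memory at workgroup exec scope. Using the illustrative bit values from the earlier sketch, the resulting set would be roughly:

   /* assumed bit values, matching the earlier sketch */
   constexpr unsigned storage_buffer = 1 << 0;
   constexpr unsigned storage_shared = 1 << 3;

   struct event_set_sketch {
      bool has_control_barrier = false;
      unsigned bar_acquire = 0, bar_release = 0, bar_classes = 0;
      unsigned access_acquire = 0, access_release = 0, access_relaxed = 0, access_atomic = 0;
   };

   event_set_sketch example_events()
   {
      event_set_sketch set;
      /* buffer load with acquire semantics: recorded as an access */
      set.access_acquire |= storage_buffer;
      set.access_relaxed |= storage_buffer;   /* not private, not atomic */
      /* p_barrier acq+rel over shared memory: recorded as a barrier */
      set.bar_acquire |= storage_shared;
      set.bar_release |= storage_shared;
      set.bar_classes |= storage_shared;
      set.has_control_barrier = true;         /* exec_scope wider than an invocation */
      return set;
   }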

void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   query->barriers |= parse_barrier(instr);
   query->barrier_interaction |= get_barrier_interaction(instr);
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;
   query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;

   bool can_reorder_instr = can_reorder(instr);
   query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr;
   query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr;
   memory_sync_info sync = get_sync_info_with_hack(instr);

   add_memory_event(&query->mem_events, instr, &sync);

   if (!(sync.semantics & semantic_can_reorder)) {
      unsigned storage = sync.storage;
      /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      if (instr->format == Format::SMEM)
         query->aliasing_storage_smem |= storage;
      else
         query->aliasing_storage |= storage;
   }
}

enum HazardResult {
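
The aliasing masks built here are deliberately coarse: any buffer or image access is treated as touching both classes (since buffer images and buffer/global memory can alias), and SMEM accesses are tracked separately from the rest so the hazard check below can test a candidate against the matching set. A small illustration of the widening, with the assumed bit values from the earlier sketches:

   #include <cstdio>

   constexpr unsigned storage_buffer = 1 << 0; /* assumed values */
   constexpr unsigned storage_image  = 1 << 2;
   constexpr unsigned storage_shared = 1 << 3;

   unsigned widen_aliasing(unsigned storage)
   {
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      return storage;
   }

   int main()
   {
      printf("%u\n", widen_aliasing(storage_image));  /* buffer|image -> 5 */
      printf("%u\n", widen_aliasing(storage_shared)); /* unchanged    -> 8 */
      return 0;
   }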

@ -463,10 +443,8 @@ enum HazardResult
   hazard_fail_unreorderable,
};

HazardResult perform_hazard_query(hazard_query *query, Instruction *instr)
HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
{
   bool can_reorder_candidate = can_reorder(instr);

   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {

@ -484,27 +462,61 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr)
       instr->opcode == aco_opcode::s_setprio)
      return hazard_fail_unreorderable;

   barrier_interaction bar = parse_barrier(instr);
   if (query->barrier_interaction && (query->barrier_interaction & bar))
   memory_event_set instr_set;
   memset(&instr_set, 0, sizeof(instr_set));
   memory_sync_info sync = get_sync_info_with_hack(instr);
   add_memory_event(&instr_set, instr, &sync);

   memory_event_set *first = &instr_set;
   memory_event_set *second = &query->mem_events;
   if (upwards)
      std::swap(first, second);

   /* everything after barrier(acquire) happens after the atomics/control_barriers before
    * everything after load(acquire) happens after the load
    */
   if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
      return hazard_fail_barrier;
   if (bar && query->barriers && (query->barriers & ~bar))
      return hazard_fail_barrier;
   if (query->barriers && (query->barriers & get_barrier_interaction(instr)))
   if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
       ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
      return hazard_fail_barrier;

   if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS)
   /* everything before barrier(release) happens before the atomics/control_barriers after *
    * everything before store(release) happens before the store
    */
   if (first->bar_release && (second->has_control_barrier || second->access_atomic))
      return hazard_fail_barrier;
   if ((first->bar_classes && (second->bar_release || second->access_release)) ||
       ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
      return hazard_fail_barrier;

   /* don't move memory barriers around other memory barriers */
   if (first->bar_classes && second->bar_classes)
      return hazard_fail_barrier;

   /* Don't move memory loads/stores to before control barriers. This is to make
    * memory barriers followed by control barriers work. */
   if (first->has_control_barrier && (second->access_atomic | second->access_relaxed))
      return hazard_fail_barrier;

   /* don't move memory loads/stores past potentially aliasing loads/stores */
   unsigned aliasing_storage = instr->format == Format::SMEM ?
                               query->aliasing_storage_smem :
                               query->aliasing_storage;
   if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
      unsigned intersect = sync.storage & aliasing_storage;
      if (intersect & storage_shared)
         return hazard_fail_reorder_ds;
      if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr)))
         return hazard_fail_reorder_sendmsg;
      return hazard_fail_reorder_vmem_smem;
   }

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
      return hazard_fail_reorder_sendmsg;

   return hazard_success;
}
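
The upwards flag decides which event set plays the role of 'first', i.e. the side that is earlier in the original program order: when a candidate is hoisted upwards it crosses instructions that precede it, so the query's accumulated events become 'first' and the candidate's become 'second'. As a worked example, a relaxed buffer load cannot be hoisted above an acquire barrier on buffer storage; the second acquire rule above catches it. A condensed sketch of just that rule, with assumed bit values:

   #include <cassert>

   constexpr unsigned storage_buffer = 1 << 0; /* assumed value */

   struct event_set_sketch {
      unsigned bar_acquire = 0, access_acquire = 0;
      unsigned access_relaxed = 0, access_atomic = 0;
   };

   /* anything that was after an acquire (barrier or access) must stay after it */
   bool acquire_hazard(const event_set_sketch &first, const event_set_sketch &second)
   {
      return ((first.access_acquire | first.bar_acquire) &
              (second.access_relaxed | second.access_atomic)) != 0;
   }

   int main()
   {
      event_set_sketch window;    /* earlier in program order: contains the acquire barrier */
      window.bar_acquire = storage_buffer;
      event_set_sketch candidate; /* later: the relaxed buffer load we try to hoist */
      candidate.access_relaxed = storage_buffer;
      assert(acquire_hazard(window, candidate)); /* the hoist is rejected */
      return 0;
   }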

@ -546,7 +558,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)

@ -594,7 +606,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get());
         HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)

@ -686,7 +698,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get());
      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)

@ -735,7 +747,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get());
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)

@ -802,7 +814,7 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

@ -1565,10 +1565,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
            for (unsigned i = 0; i < temp.size(); i++)
               split->definitions[i] = bld.def(v1);
            bld.insert(split);
            for (unsigned i = 0; i < temp.size(); i++)
               bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
            for (unsigned i = 0; i < temp.size(); i++) {
               Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
               static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
            }
         } else {
            bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
            Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
            static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
         }
      } else {
         ctx.program->config->spilled_sgprs += (*it)->operands[0].size();

@ -1632,11 +1635,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
            for (unsigned i = 0; i < def.size(); i++) {
               Temp tmp = bld.tmp(v1);
               vec->operands[i] = Operand(tmp);
               bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
               Instruction *instr = bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
               static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
            }
            bld.insert(vec);
         } else {
            bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
            Instruction *instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
            static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
         }
      } else {
         uint32_t spill_slot = slots[spill_id];
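
Tagging the scratch spill loads/stores with storage_vgpr_spill and semantic_private gives them a storage class nothing else uses and keeps them out of the relaxed/atomic access sets, so the scheduler's barrier and aliasing checks do not order them against application memory operations. A minimal illustration under the assumed bit values from the earlier sketches:

   constexpr unsigned storage_buffer     = 1 << 0; /* assumed values */
   constexpr unsigned storage_vgpr_spill = 1 << 6;

   /* a spill access and a buffer access share no storage bits, so the aliasing
    * test (sync.storage & aliasing_storage) stays zero for this pair */
   static_assert((storage_vgpr_spill & storage_buffer) == 0,
                 "spill accesses do not alias application buffers");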