aco: use ac_hw_cache_flags

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29243>
Authored by Rhys Perry on 2024-05-14 18:34:01 +01:00; committed by Marge Bot
parent cdaf269924
commit b41f0f6cc1
9 changed files with 244 additions and 205 deletions
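
In short: the separate glc/slc/dlc (and MUBUF swizzled) booleans on each memory-instruction format are replaced by one packed ac_hw_cache_flags field, and every user now sets or tests bits of its raw byte. Below is a minimal, self-contained sketch of that pattern. The type and enumerants are stand-ins for the real amd/common definitions (presumably in ac_shader_util.h), and the bit values are illustrative assumptions; the diff only relies on ac_glc, ac_slc, ac_dlc and ac_swizzled being distinct bits:

#include <cstdint>
#include <cstdio>

/* Stand-ins for the real definitions (assumed, not copied from Mesa). The one
 * property the diff relies on: the flags are distinct bits of a single byte,
 * reachable through ".value". */
enum : uint8_t {
   ac_glc = 1u << 0,      /* globally coherent */
   ac_slc = 1u << 1,      /* system-level coherent */
   ac_dlc = 1u << 2,      /* device-level coherent, GFX10+ */
   ac_swizzled = 1u << 3, /* swizzled buffer access */
};

struct ac_hw_cache_flags { uint8_t value; };

int main()
{
   ac_hw_cache_flags cache{};
   cache.value |= ac_glc | ac_dlc;  /* set bits, as instruction selection does */
   bool glc = cache.value & ac_glc; /* test bits, as the assembler does */
   bool swz = cache.value & ac_swizzled;
   std::printf("glc=%d swizzled=%d\n", (int)glc, (int)swz);
   return 0;
}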


@ -126,11 +126,14 @@ template <typename T>
uint32_t
get_gfx12_cpol(const T& instr)
{
bool glc = instr.cache.value & ac_glc;
bool slc = instr.cache.value & ac_slc;
bool dlc = instr.cache.value & ac_dlc;
if (instr_info.is_atomic[(int)instr.opcode]) {
return (instr.glc ? 1 /*TH_ATOMIC_RETURN*/ : 0) << 2;
return (glc ? 1 /*TH_ATOMIC_RETURN*/ : 0) << 2;
} else {
return (instr.definitions.empty() || instr.glc || instr.slc || instr.dlc) ? 3 /*SCOPE_SYS*/
: 0 /*SCOPE_CU*/;
return (instr.definitions.empty() || glc || slc || dlc) ? 3 /*SCOPE_SYS*/
: 0 /*SCOPE_CU*/;
}
}
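
For reference, the full mapping the rewritten get_gfx12_cpol() above encodes, worked out case by case (names taken from the inline comments):

/* atomic with ac_glc set            -> 1 << 2  (TH_ATOMIC_RETURN)
 * atomic without ac_glc             -> 0
 * store (definitions empty)         -> 3       (SCOPE_SYS)
 * load with any of glc/slc/dlc set  -> 3       (SCOPE_SYS)
 * load with none of the flags       -> 0       (SCOPE_CU)
 */
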
@ -228,6 +231,8 @@ emit_smem_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
SMEM_instruction& smem = instr->smem();
bool glc = smem.cache.value & ac_glc;
bool dlc = smem.cache.value & ac_dlc;
bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
bool is_load = !instr->definitions.empty();
@ -258,22 +263,21 @@ emit_smem_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
if (ctx.gfx_level <= GFX9) {
encoding = (0b110000 << 26);
assert(!smem.dlc); /* Device-level coherent is not supported on GFX9 and lower */
encoding |= smem.nv ? 1 << 15 : 0;
assert(!dlc); /* Device-level coherent is not supported on GFX9 and lower */
/* We don't use the NV bit. */
} else {
encoding = (0b111101 << 26);
assert(!smem.nv); /* Non-volatile is not supported on GFX10 */
if (ctx.gfx_level <= GFX11_5)
encoding |= smem.dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 14) : 0;
encoding |= dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 14) : 0;
}
if (ctx.gfx_level <= GFX11_5) {
encoding |= opcode << 18;
encoding |= smem.glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
encoding |= glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
} else {
encoding |= opcode << 13;
if (is_load)
encoding |= ((smem.glc || smem.dlc) ? 3 /*SCOPE_SYS*/ : 0 /*SCOPE_CU*/) << 21;
encoding |= ((glc || dlc) ? 3 /*SCOPE_SYS*/ : 0 /*SCOPE_CU*/) << 21;
}
if (ctx.gfx_level <= GFX9) {
@ -536,6 +540,9 @@ emit_mubuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
MUBUF_instruction& mubuf = instr->mubuf();
bool glc = mubuf.cache.value & ac_glc;
bool slc = mubuf.cache.value & ac_slc;
bool dlc = mubuf.cache.value & ac_dlc;
uint32_t encoding = (0b111000 << 26);
if (ctx.gfx_level >= GFX11 && mubuf.lds) /* GFX11 has separate opcodes for LDS loads */
@ -543,7 +550,7 @@ emit_mubuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
else
encoding |= (mubuf.lds ? 1 : 0) << 16;
encoding |= opcode << 18;
encoding |= (mubuf.glc ? 1 : 0) << 14;
encoding |= (glc ? 1 : 0) << 14;
if (ctx.gfx_level <= GFX10_3)
encoding |= (mubuf.idxen ? 1 : 0) << 13;
assert(!mubuf.addr64 || ctx.gfx_level <= GFX7);
@ -552,19 +559,19 @@ emit_mubuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
if (ctx.gfx_level <= GFX10_3)
encoding |= (mubuf.offen ? 1 : 0) << 12;
if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) {
assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */
encoding |= (mubuf.slc ? 1 : 0) << 17;
assert(!dlc); /* Device-level coherent is not supported on GFX9 and lower */
encoding |= (slc ? 1 : 0) << 17;
} else if (ctx.gfx_level >= GFX11) {
encoding |= (mubuf.slc ? 1 : 0) << 12;
encoding |= (mubuf.dlc ? 1 : 0) << 13;
encoding |= (slc ? 1 : 0) << 12;
encoding |= (dlc ? 1 : 0) << 13;
} else if (ctx.gfx_level >= GFX10) {
encoding |= (mubuf.dlc ? 1 : 0) << 15;
encoding |= (dlc ? 1 : 0) << 15;
}
encoding |= 0x0FFF & mubuf.offset;
out.push_back(encoding);
encoding = 0;
if (ctx.gfx_level <= GFX7 || (ctx.gfx_level >= GFX10 && ctx.gfx_level <= GFX10_3)) {
encoding |= (mubuf.slc ? 1 : 0) << 22;
encoding |= (slc ? 1 : 0) << 22;
}
encoding |= reg(ctx, instr->operands[2]) << 24;
if (ctx.gfx_level >= GFX11) {
@ -625,24 +632,27 @@ emit_mtbuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
MTBUF_instruction& mtbuf = instr->mtbuf();
bool glc = mtbuf.cache.value & ac_glc;
bool slc = mtbuf.cache.value & ac_slc;
bool dlc = mtbuf.cache.value & ac_dlc;
uint32_t img_format = ac_get_tbuffer_format(ctx.gfx_level, mtbuf.dfmt, mtbuf.nfmt);
uint32_t encoding = (0b111010 << 26);
assert(img_format <= 0x7F);
assert(!mtbuf.dlc || ctx.gfx_level >= GFX10);
assert(!dlc || ctx.gfx_level >= GFX10);
if (ctx.gfx_level >= GFX11) {
encoding |= (mtbuf.slc ? 1 : 0) << 12;
encoding |= (mtbuf.dlc ? 1 : 0) << 13;
encoding |= (slc ? 1 : 0) << 12;
encoding |= (dlc ? 1 : 0) << 13;
} else {
/* DLC bit replaces one bit of the OPCODE on GFX10 */
encoding |= (mtbuf.dlc ? 1 : 0) << 15;
encoding |= (dlc ? 1 : 0) << 15;
}
if (ctx.gfx_level <= GFX10_3) {
encoding |= (mtbuf.idxen ? 1 : 0) << 13;
encoding |= (mtbuf.offen ? 1 : 0) << 12;
}
encoding |= (mtbuf.glc ? 1 : 0) << 14;
encoding |= (glc ? 1 : 0) << 14;
encoding |= 0x0FFF & mtbuf.offset;
encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */
@ -662,7 +672,7 @@ emit_mtbuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
encoding |= (mtbuf.idxen ? 1 : 0) << 23;
} else {
encoding |= (mtbuf.tfe ? 1 : 0) << 23;
encoding |= (mtbuf.slc ? 1 : 0) << 22;
encoding |= (slc ? 1 : 0) << 22;
}
encoding |= (reg(ctx, instr->operands[0]) >> 2) << 16;
if (instr->operands.size() > 3)
@ -721,6 +731,9 @@ emit_mimg_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
MIMG_instruction& mimg = instr->mimg();
bool glc = mimg.cache.value & ac_glc;
bool slc = mimg.cache.value & ac_slc;
bool dlc = mimg.cache.value & ac_dlc;
unsigned nsa_dwords = get_mimg_nsa_dwords(instr);
assert(!nsa_dwords || ctx.gfx_level >= GFX10);
@ -732,23 +745,23 @@ emit_mimg_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= mimg.dim << 2;
encoding |= mimg.unrm ? 1 << 7 : 0;
encoding |= (0xF & mimg.dmask) << 8;
encoding |= mimg.slc ? 1 << 12 : 0;
encoding |= mimg.dlc ? 1 << 13 : 0;
encoding |= mimg.glc ? 1 << 14 : 0;
encoding |= slc ? 1 << 12 : 0;
encoding |= dlc ? 1 << 13 : 0;
encoding |= glc ? 1 << 14 : 0;
encoding |= mimg.r128 ? 1 << 15 : 0;
encoding |= mimg.a16 ? 1 << 16 : 0;
encoding |= mimg.d16 ? 1 << 17 : 0;
encoding |= (opcode & 0xFF) << 18;
} else {
encoding |= mimg.slc ? 1 << 25 : 0;
encoding |= slc ? 1 << 25 : 0;
encoding |= (opcode & 0x7f) << 18;
encoding |= (opcode >> 7) & 1;
encoding |= mimg.lwe ? 1 << 17 : 0;
encoding |= mimg.tfe ? 1 << 16 : 0;
encoding |= mimg.glc ? 1 << 13 : 0;
encoding |= glc ? 1 << 13 : 0;
encoding |= mimg.unrm ? 1 << 12 : 0;
if (ctx.gfx_level <= GFX9) {
assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */
assert(!dlc); /* Device-level coherent is not supported on GFX9 and lower */
assert(!mimg.r128);
encoding |= mimg.a16 ? 1 << 15 : 0;
encoding |= mimg.da ? 1 << 14 : 0;
@ -757,7 +770,7 @@ emit_mimg_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
: 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
encoding |= nsa_dwords << 1;
encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
encoding |= mimg.dlc ? 1 << 7 : 0;
encoding |= dlc ? 1 << 7 : 0;
}
encoding |= (0xF & mimg.dmask) << 8;
}
@ -856,6 +869,9 @@ emit_flatlike_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruct
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
FLAT_instruction& flat = instr->flatlike();
bool glc = flat.cache.value & ac_glc;
bool slc = flat.cache.value & ac_slc;
bool dlc = flat.cache.value & ac_dlc;
uint32_t encoding = (0b110111 << 26);
encoding |= opcode << 18;
@ -879,13 +895,13 @@ emit_flatlike_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruct
else if (instr->isGlobal())
encoding |= 2 << (ctx.gfx_level >= GFX11 ? 16 : 14);
encoding |= flat.lds ? 1 << 13 : 0;
encoding |= flat.glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
encoding |= flat.slc ? 1 << (ctx.gfx_level >= GFX11 ? 15 : 17) : 0;
encoding |= glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
encoding |= slc ? 1 << (ctx.gfx_level >= GFX11 ? 15 : 17) : 0;
if (ctx.gfx_level >= GFX10) {
assert(!flat.nv);
encoding |= flat.dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 12) : 0;
encoding |= dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 12) : 0;
} else {
assert(!flat.dlc);
assert(!dlc);
}
out.push_back(encoding);
encoding = reg(ctx, instr->operands[0], 8);


@ -4423,6 +4423,35 @@ lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned
const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
ac_hw_cache_flags
get_gfx6_cache_flags(bool glc, bool slc, bool dlc)
{
uint8_t value = 0;
value |= glc ? ac_glc : 0;
value |= slc ? ac_slc : 0;
value |= dlc ? ac_dlc : 0;
return ac_hw_cache_flags{value};
}
ac_hw_cache_flags
get_load_cache_flags(Builder& bld, bool glc, bool slc)
{
bool dlc = glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
return get_gfx6_cache_flags(glc, slc, dlc);
}
ac_hw_cache_flags
get_store_cache_flags(Builder& bld, bool glc, bool slc)
{
return get_gfx6_cache_flags(glc, slc, false);
}
ac_hw_cache_flags
get_atomic_cache_flags(Builder& bld, bool return_previous)
{
return get_gfx6_cache_flags(return_previous, false, false);
}
Temp
smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
unsigned align, unsigned const_offset, Temp dst_hint)
@ -4478,9 +4507,7 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned
RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
load->definitions[0] = Definition(val);
load->smem().glc = info.glc;
load->smem().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
load->smem().cache = get_load_cache_flags(bld, info.glc, false);
load->smem().sync = info.sync;
bld.insert(std::move(load));
return val;
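
Worked examples of the three helpers introduced above, derived directly from their bodies (GFX10 and GFX10.3 are the only levels where the extra dlc bit is added):

// get_load_cache_flags(bld, true /*glc*/, false /*slc*/)
//   on GFX10 or GFX10_3  -> value == ac_glc | ac_dlc
//   on other gfx levels  -> value == ac_glc
// get_store_cache_flags(bld, glc, slc)          -> the glc/slc bits only, never dlc
// get_atomic_cache_flags(bld, return_previous)  -> ac_glc iff return_previous
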
@ -4539,13 +4566,11 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
mubuf->operands[2] = soffset;
mubuf->mubuf().offen = offen;
mubuf->mubuf().idxen = idxen;
mubuf->mubuf().glc = info.glc;
mubuf->mubuf().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
mubuf->mubuf().slc = info.slc;
mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, info.slc);
if (info.swizzle_component_size != 0)
mubuf->mubuf().cache.value |= ac_swizzled;
mubuf->mubuf().sync = info.sync;
mubuf->mubuf().offset = const_offset;
mubuf->mubuf().swizzled = info.swizzle_component_size != 0;
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
mubuf->definitions[0] = Definition(val);
@ -4607,10 +4632,7 @@ mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
mubuf->operands[2] = soffset;
mubuf->mubuf().offen = offen;
mubuf->mubuf().idxen = idxen;
mubuf->mubuf().glc = info.glc;
mubuf->mubuf().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
mubuf->mubuf().slc = info.slc;
mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, info.slc);
mubuf->mubuf().sync = info.sync;
mubuf->mubuf().offset = const_offset;
RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
@ -4818,8 +4840,7 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign
mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
mubuf->operands[2] = Operand(offset);
mubuf->mubuf().glc = info.glc;
mubuf->mubuf().dlc = false;
mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, false);
mubuf->mubuf().offset = const_offset;
mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
mubuf->mubuf().disable_wqm = false;
@ -4838,9 +4859,7 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign
flat->operands[0] = Operand(addr);
flat->operands[1] = Operand(s1);
}
flat->flatlike().glc = info.glc;
flat->flatlike().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
flat->flatlike().cache = get_load_cache_flags(bld, info.glc, false);
flat->flatlike().sync = info.sync;
assert(global || !const_offset);
flat->flatlike().offset = const_offset;
@ -5673,10 +5692,7 @@ mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
mtbuf->operands[2] = soffset;
mtbuf->mtbuf().offen = offen;
mtbuf->mtbuf().idxen = idxen;
mtbuf->mtbuf().glc = info.glc;
mtbuf->mtbuf().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
mtbuf->mtbuf().slc = info.slc;
mtbuf->mtbuf().cache = get_load_cache_flags(bld, info.glc, info.slc);
mtbuf->mtbuf().sync = info.sync;
mtbuf->mtbuf().offset = const_offset;
mtbuf->mtbuf().dfmt = fetch_fmt & 0xf;
@ -6220,6 +6236,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
unsigned access = nir_intrinsic_access(instr);
bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
unsigned result_size = instr->def.num_components - is_sparse;
unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
@ -6275,9 +6292,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
load->operands[2] = Operand::c32(0);
load->definitions[0] = Definition(tmp);
load->mubuf().idxen = true;
load->mubuf().glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
load->mubuf().dlc = load->mubuf().glc &&
(ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
load->mubuf().cache = get_load_cache_flags(bld, glc, false);
load->mubuf().sync = sync;
load->mubuf().tfe = is_sparse;
if (load->mubuf().tfe)
@ -6296,9 +6311,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
load->dlc =
load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
load->cache = get_load_cache_flags(bld, glc, false);
load->a16 = instr->src[1].ssa->bit_size == 16;
load->d16 = d16;
load->dmask = dmask;
@ -6422,8 +6435,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
store->operands[2] = Operand::c32(0);
store->operands[3] = Operand(data);
store->mubuf().idxen = true;
store->mubuf().glc = glc;
store->mubuf().dlc = false;
store->mubuf().cache = get_store_cache_flags(bld, glc, false);
store->mubuf().disable_wqm = true;
store->mubuf().sync = sync;
ctx->program->needs_exact = true;
@ -6440,8 +6452,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
MIMG_instruction* store =
emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
store->glc = glc;
store->dlc = false;
store->cache = get_store_cache_flags(bld, glc, false);
store->a16 = instr->src[1].ssa->bit_size == 16;
store->d16 = d16;
store->dmask = dmask;
@ -6581,8 +6592,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
mubuf->definitions[0] = def;
mubuf->mubuf().offset = 0;
mubuf->mubuf().idxen = true;
mubuf->mubuf().glc = return_previous;
mubuf->mubuf().dlc = false; /* Not needed for atomics */
mubuf->mubuf().cache = get_atomic_cache_flags(bld, return_previous);
mubuf->mubuf().disable_wqm = true;
mubuf->mubuf().sync = sync;
ctx->program->needs_exact = true;
@ -6597,8 +6607,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
MIMG_instruction* mimg =
emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
mimg->glc = return_previous;
mimg->dlc = false; /* Not needed for atomics */
mimg->cache = get_atomic_cache_flags(bld, return_previous);
mimg->dmask = (1 << data.size()) - 1;
mimg->a16 = instr->src[1].ssa->bit_size == 16;
mimg->unrm = true;
@ -6670,8 +6679,8 @@ visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
store->operands[3] = Operand(write_datas[i]);
store->mubuf().offset = offsets[i];
store->mubuf().offen = (offset.type() == RegType::vgpr);
store->mubuf().glc = glc || (ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4);
store->mubuf().dlc = false;
store->mubuf().cache = get_store_cache_flags(
bld, glc || (ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4), false);
store->mubuf().disable_wqm = true;
store->mubuf().sync = sync;
ctx->program->needs_exact = true;
@ -6712,8 +6721,7 @@ visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
mubuf->definitions[0] = def;
mubuf->mubuf().offset = 0;
mubuf->mubuf().offen = (offset.type() == RegType::vgpr);
mubuf->mubuf().glc = return_previous;
mubuf->mubuf().dlc = false; /* Not needed for atomics */
mubuf->mubuf().cache = get_atomic_cache_flags(bld, return_previous);
mubuf->mubuf().disable_wqm = true;
mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
@ -6846,8 +6854,7 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
flat->operands[1] = Operand(s1);
}
flat->operands[2] = Operand(write_datas[i]);
flat->flatlike().glc = glc;
flat->flatlike().dlc = false;
flat->flatlike().cache = get_store_cache_flags(bld, glc, false);
assert(global || !write_const_offset);
flat->flatlike().offset = write_const_offset;
flat->flatlike().disable_wqm = true;
@ -6867,8 +6874,8 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
mubuf->operands[2] = Operand(write_offset);
mubuf->operands[3] = Operand(write_datas[i]);
mubuf->mubuf().glc = glc || write_datas[i].bytes() < 4;
mubuf->mubuf().dlc = false;
mubuf->mubuf().cache =
get_store_cache_flags(bld, glc || write_datas[i].bytes() < 4, false);
mubuf->mubuf().offset = write_const_offset;
mubuf->mubuf().addr64 = write_address.type() == RegType::vgpr;
mubuf->mubuf().disable_wqm = true;
@ -6980,8 +6987,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
flat->operands[2] = Operand(data);
if (return_previous)
flat->definitions[0] = Definition(dst);
flat->flatlike().glc = return_previous;
flat->flatlike().dlc = false; /* Not needed for atomics */
flat->flatlike().cache = get_atomic_cache_flags(bld, return_previous);
assert(global || !const_offset);
flat->flatlike().offset = const_offset;
flat->flatlike().disable_wqm = true;
@ -7007,8 +7013,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
if (return_previous)
mubuf->definitions[0] = def;
mubuf->mubuf().glc = return_previous;
mubuf->mubuf().dlc = false;
mubuf->mubuf().cache = get_atomic_cache_flags(bld, return_previous);
mubuf->mubuf().offset = const_offset;
mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
mubuf->mubuf().disable_wqm = true;
@ -7167,6 +7172,9 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
glc |= ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4;
glc &= ctx->program->gfx_level < GFX11;
ac_hw_cache_flags cache = get_store_cache_flags(bld, glc, slc);
if (swizzled)
cache.value |= ac_swizzled;
Operand vaddr_op(v1);
if (offen && idxen)
@ -7177,9 +7185,8 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
vaddr_op = Operand(idx);
Instruction* mubuf = bld.mubuf(op, Operand(descriptor), vaddr_op, s_offset,
Operand(write_datas[i]), const_offset, offen, swizzled, idxen,
/* addr64 */ false, /* disable_wqm */ false, glc,
/* dlc */ false, slc)
Operand(write_datas[i]), const_offset, offen, idxen,
/* addr64 */ false, /* disable_wqm */ false, cache)
.instr;
mubuf->mubuf().sync = sync;
}
@ -7637,9 +7644,11 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
write_datas[i], offsets[i], true, true);
write_datas[i], offsets[i], true);
mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
mubuf->mubuf().glc = ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4;
bool glc = ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4;
mubuf->mubuf().cache = get_store_cache_flags(bld, glc, false);
mubuf->mubuf().cache.value |= ac_swizzled;
}
}
}
@ -12098,9 +12107,12 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade
bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
Operand::zero());
ac_hw_cache_flags cache_glc;
cache_glc.value = ac_glc;
/* Store TTMP0-TTMP1. */
bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
Operand(PhysReg{ttmp0}, s2), memory_sync_info(), cache_glc);
uint32_t hw_regs_idx[] = {
2, /* HW_REG_STATUS */
@ -12116,7 +12128,8 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade
((20 - 1) << 11) | hw_regs_idx[i]);
bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(),
cache_glc);
}
program->config->float_mode = program->blocks[0].fp_mode.val;
@ -12632,18 +12645,18 @@ load_unaligned_vs_attrib(Builder& bld, PhysReg dst, Operand desc, Operand index,
PhysReg scratch(load.scratch);
if (load.d16) {
bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(dst, v1), desc, index,
Operand::c32(0u), offset, false, false, true);
Operand::c32(0u), offset, false, true);
bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(dst, v1), desc, index,
Operand::c32(0u), offset + 2, false, false, true);
Operand::c32(0u), offset + 2, false, true);
bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(scratch, v1), desc, index,
Operand::c32(0u), offset + 1, false, false, true);
Operand::c32(0u), offset + 1, false, true);
bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(scratch, v1), desc, index,
Operand::c32(0u), offset + 3, false, false, true);
Operand::c32(0u), offset + 3, false, true);
} else {
for (unsigned i = 0; i < size; i++) {
Definition def(i ? scratch.advance(i * 4 - 4) : dst, v1);
bld.mubuf(aco_opcode::buffer_load_ubyte, def, desc, index, Operand::c32(0u), offset + i,
false, false, true);
false, true);
}
}
@ -12835,7 +12848,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
i += slots;
} else {
bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, true);
loc++;
i++;
}


@ -15,6 +15,7 @@
#include "ac_binary.h"
#include "ac_hw_stage.h"
#include "ac_shader_util.h"
#include "amd_family.h"
#include <algorithm>
#include <bitset>
@ -1309,11 +1310,7 @@ static_assert(sizeof(SALU_instruction) == sizeof(Instruction) + 4, "Unexpected p
*/
struct SMEM_instruction : public Instruction {
memory_sync_info sync;
bool glc : 1; /* VI+: globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
bool nv : 1; /* VEGA only: Non-volatile */
bool disable_wqm : 1;
uint8_t padding : 4;
ac_hw_cache_flags cache;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
@ -1492,19 +1489,16 @@ static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected
*/
struct MUBUF_instruction : public Instruction {
memory_sync_info sync;
ac_hw_cache_flags cache;
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
bool glc : 1; /* globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
bool slc : 1; /* system level coherent */
bool tfe : 1; /* texture fail enable */
bool lds : 1; /* Return read-data to LDS instead of VGPRs */
uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
uint16_t swizzled : 1;
uint16_t padding0 : 2;
uint16_t padding1;
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
uint8_t padding0 : 2;
uint8_t padding1;
uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
@ -1518,16 +1512,14 @@ static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected
*/
struct MTBUF_instruction : public Instruction {
memory_sync_info sync;
ac_hw_cache_flags cache;
uint8_t dfmt : 4; /* Data Format of data in memory buffer */
uint8_t nfmt : 3; /* Numeric format of data in memory */
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
uint16_t idxen : 1; /* Supply an index from VGPR (VADDR) */
uint16_t glc : 1; /* globally coherent */
uint16_t dlc : 1; /* NAVI: device level coherent */
uint16_t slc : 1; /* system level coherent */
uint16_t tfe : 1; /* texture fail enable */
uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
uint16_t padding : 10;
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
bool tfe : 1; /* texture fail enable */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
uint8_t padding : 5;
uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
@ -1543,12 +1535,10 @@ static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected
*/
struct MIMG_instruction : public Instruction {
memory_sync_info sync;
ac_hw_cache_flags cache;
uint8_t dmask; /* Data VGPR enable mask */
uint8_t dim : 3; /* NAVI: dimensionality */
bool unrm : 1; /* Force address to be un-normalized */
bool dlc : 1; /* NAVI: device level coherent */
bool glc : 1; /* globally coherent */
bool slc : 1; /* system level coherent */
bool tfe : 1; /* texture fail enable */
bool da : 1; /* declare an array */
bool lwe : 1; /* LOD warning enable */
@ -1557,9 +1547,8 @@ struct MIMG_instruction : public Instruction {
bool d16 : 1; /* Convert 32-bit data to 16-bit data */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
bool strict_wqm : 1; /* VADDR is a linear VGPR and additional VGPRs may be copied into it */
uint8_t padding0 : 1;
uint8_t padding0 : 4;
uint8_t padding1;
uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
@ -1572,15 +1561,13 @@ static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected p
*/
struct FLAT_instruction : public Instruction {
memory_sync_info sync;
bool slc : 1; /* system level coherent */
bool glc : 1; /* globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
ac_hw_cache_flags cache;
bool lds : 1;
bool nv : 1;
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
uint8_t padding0 : 2;
uint8_t padding0 : 5;
uint8_t padding1;
int16_t offset; /* Vega/Navi only */
uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
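
With the coherence bools gone from the structs above, callers touch the packed byte directly. Two representative one-liners, adapted from hunks elsewhere in this commit:

instr->mubuf().cache.value |= ac_swizzled;         /* optimizer/spiller: mark a swizzled access */
bool dlc = instr->flatlike().cache.value & ac_dlc; /* assembler: decode flags before encoding */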


@ -105,9 +105,7 @@ class Format(IntEnum):
return [('uint32_t', 'imm', '0')]
elif self == Format.SMEM:
return [('memory_sync_info', 'sync', 'memory_sync_info()'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'nv', 'false')]
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}')]
elif self == Format.DS:
return [('uint16_t', 'offset0', '0'),
('uint8_t', 'offset1', '0'),
@ -125,20 +123,15 @@ class Format(IntEnum):
('bool', 'offen', None),
('bool', 'idxen', 'false'),
('bool', 'disable_wqm', 'false'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'slc', 'false'),
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
('bool', 'tfe', 'false')]
elif self == Format.MUBUF:
return [('unsigned', 'offset', None),
('bool', 'offen', None),
('bool', 'swizzled', 'false'),
('bool', 'idxen', 'false'),
('bool', 'addr64', 'false'),
('bool', 'disable_wqm', 'false'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'slc', 'false'),
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
('bool', 'tfe', 'false'),
('bool', 'lds', 'false')]
elif self == Format.MIMG:
@ -146,9 +139,7 @@ class Format(IntEnum):
('bool', 'da', 'false'),
('bool', 'unrm', 'false'),
('bool', 'disable_wqm', 'false'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'slc', 'false'),
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
('bool', 'tfe', 'false'),
('bool', 'lwe', 'false'),
('bool', 'r128', 'false'),
@ -195,8 +186,7 @@ class Format(IntEnum):
elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
return [('int16_t', 'offset', 0),
('memory_sync_info', 'sync', 'memory_sync_info()'),
('bool', 'glc', 'false'),
('bool', 'slc', 'false'),
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
('bool', 'lds', 'false'),
('bool', 'nv', 'false')]
else:


@ -164,8 +164,7 @@ struct InstrPred {
case Format::SMEM: {
SMEM_instruction& aS = a->smem();
SMEM_instruction& bS = b->smem();
return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv &&
aS.disable_wqm == bS.disable_wqm;
return aS.sync == bS.sync && aS.cache.value == bS.cache.value;
}
case Format::VINTRP: {
VINTRP_instruction& aI = a->vintrp();
@ -203,21 +202,21 @@ struct InstrPred {
MTBUF_instruction& bM = b->mtbuf();
return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt &&
aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen &&
aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe &&
aM.cache.value == bM.cache.value && aM.tfe == bM.tfe &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::MUBUF: {
MUBUF_instruction& aM = a->mubuf();
MUBUF_instruction& bM = b->mubuf();
return aM.sync == bM.sync && aM.offset == bM.offset && aM.offen == bM.offen &&
aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc &&
aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
aM.idxen == bM.idxen && aM.cache.value == bM.cache.value && aM.tfe == bM.tfe &&
aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
}
case Format::MIMG: {
MIMG_instruction& aM = a->mimg();
MIMG_instruction& bM = b->mimg();
return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm &&
aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da &&
aM.cache.value == bM.cache.value && aM.tfe == bM.tfe && aM.da == bM.da &&
aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 &&
aM.disable_wqm == bM.disable_wqm;
}


@ -860,10 +860,7 @@ smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (!smem.definitions.empty())
new_instr->definitions[0] = smem.definitions[0];
new_instr->smem().sync = smem.sync;
new_instr->smem().glc = smem.glc;
new_instr->smem().dlc = smem.dlc;
new_instr->smem().nv = smem.nv;
new_instr->smem().disable_wqm = smem.disable_wqm;
new_instr->smem().cache = smem.cache;
instr.reset(new_instr);
}
}
@ -1429,13 +1426,14 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
while (info.is_temp())
info = ctx.info[info.temp.id()];
bool swizzled = mubuf.cache.value & ac_swizzled;
/* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
* overflow for scratch accesses works only on GFX9+ and saddr overflow
* never works. Since swizzling is the only thing that separates
* scratch accesses from other accesses, and swizzling significantly
* changes how addressing works, this probably applies to swizzled
* MUBUF accesses. */
bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->gfx_level < GFX9;
bool vaddr_prevent_overflow = swizzled && ctx.program->gfx_level < GFX9;
if (mubuf.offen && mubuf.idxen && i == 1 && info.is_vec() &&
info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
@ -1465,7 +1463,7 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
mubuf.offset += offset;
continue;
} else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
base.regClass() == s1 && mubuf.offset + offset < 4096 && !mubuf.swizzled) {
base.regClass() == s1 && mubuf.offset + offset < 4096 && !swizzled) {
instr->operands[i].setTemp(base);
mubuf.offset += offset;
continue;


@ -262,6 +262,20 @@ print_sync(memory_sync_info sync, FILE* output)
print_scope(sync.scope, output);
}
template <typename T>
static void
print_cache_flags(enum amd_gfx_level gfx_level, const T& instr, FILE* output)
{
if (instr.cache.value & ac_glc)
fprintf(output, " glc");
if (instr.cache.value & ac_slc)
fprintf(output, " slc");
if (instr.cache.value & ac_dlc)
fprintf(output, " dlc");
if (instr.cache.value & ac_swizzled)
fprintf(output, " swizzled");
}
static void
print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* output)
{
@ -428,12 +442,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
}
case Format::SMEM: {
const SMEM_instruction& smem = instr->smem();
if (smem.glc)
fprintf(output, " glc");
if (smem.dlc)
fprintf(output, " dlc");
if (smem.nv)
fprintf(output, " nv");
print_cache_flags(gfx_level, smem, output);
print_sync(smem.sync, output);
break;
}
@ -482,12 +491,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
fprintf(output, " idxen");
if (mubuf.addr64)
fprintf(output, " addr64");
if (mubuf.glc)
fprintf(output, " glc");
if (mubuf.dlc)
fprintf(output, " dlc");
if (mubuf.slc)
fprintf(output, " slc");
print_cache_flags(gfx_level, mubuf, output);
if (mubuf.tfe)
fprintf(output, " tfe");
if (mubuf.lds)
@ -517,12 +521,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
}
if (mimg.unrm)
fprintf(output, " unrm");
if (mimg.glc)
fprintf(output, " glc");
if (mimg.dlc)
fprintf(output, " dlc");
if (mimg.slc)
fprintf(output, " slc");
print_cache_flags(gfx_level, mimg, output);
if (mimg.tfe)
fprintf(output, " tfe");
if (mimg.da)
@ -594,12 +593,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
const FLAT_instruction& flat = instr->flatlike();
if (flat.offset)
fprintf(output, " offset:%d", flat.offset);
if (flat.glc)
fprintf(output, " glc");
if (flat.dlc)
fprintf(output, " dlc");
if (flat.slc)
fprintf(output, " slc");
print_cache_flags(gfx_level, flat, output);
if (flat.lds)
fprintf(output, " lds");
if (flat.nv)
@ -646,12 +640,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
fprintf(output, " offen");
if (mtbuf.idxen)
fprintf(output, " idxen");
if (mtbuf.glc)
fprintf(output, " glc");
if (mtbuf.dlc)
fprintf(output, " dlc");
if (mtbuf.slc)
fprintf(output, " slc");
print_cache_flags(gfx_level, mtbuf, output);
if (mtbuf.tfe)
fprintf(output, " tfe");
if (mtbuf.disable_wqm)


@ -1324,8 +1324,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
offset, memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc,
Operand(v1), scratch_offset, elem, offset, false, true);
Operand(v1), scratch_offset, elem, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
} else if (ctx.program->gfx_level >= GFX9) {
@ -1333,8 +1334,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
scratch_offset, temp, offset, false, true);
scratch_offset, temp, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
@ -1366,8 +1368,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
} else {
Instruction* instr =
bld.mubuf(aco_opcode::buffer_load_dword, Definition(tmp), ctx.scratch_rsrc,
Operand(v1), scratch_offset, offset, false, true);
Operand(v1), scratch_offset, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
bld.insert(vec);
@ -1376,8 +1379,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc,
Operand(v1), scratch_offset, offset, false, true);
Operand(v1), scratch_offset, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}


@ -411,13 +411,19 @@ BEGIN_TEST(assembler.smem)
//! s_load_b32 s4, s[16:17], s8 offset:0x2a ; f4000108 1000002a
bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42), op_s1);
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_non_temporal;
cache_coherent.value = ac_glc;
cache_non_temporal.value = ac_dlc;
//~gfx11! s_buffer_load_b32 s4, s[32:35], s8 glc ; f4204110 10000000
//~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 scope:SCOPE_SYS ; f4620110 10000000
bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().glc = true;
bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache = cache_coherent;
//~gfx11! s_buffer_load_b32 s4, s[32:35], s8 dlc ; f4202110 10000000
//~gfx12! (then repeated 1 times)
bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().dlc = true;
bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache =
cache_non_temporal;
finish_assembler_test();
}
@ -482,22 +488,31 @@ BEGIN_TEST(assembler.mubuf)
bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 84, false);
/* Various flags */
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_sys_coherent;
ac_hw_cache_flags cache_non_temporal;
ac_hw_cache_flags cache_atomic_rtn;
cache_coherent.value = ac_glc;
cache_sys_coherent.value = ac_slc;
cache_non_temporal.value = ac_dlc;
cache_atomic_rtn.value = ac_glc;
//~gfx11! buffer_load_b32 v42, off, s[32:35], 0 glc ; e0504000 80082a80
//~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_SYS ; c405007c 008c402a 00000000
bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
->mubuf()
.glc = true;
.cache = cache_coherent;
//~gfx11! buffer_load_b32 v42, off, s[32:35], 0 dlc ; e0502000 80082a80
//~gfx12! (then repeated 2 times)
bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
->mubuf()
.dlc = true;
.cache = cache_non_temporal;
//~gfx11! buffer_load_b32 v42, off, s[32:35], 0 slc ; e0501000 80082a80
bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
->mubuf()
.slc = true;
.cache = cache_sys_coherent;
//; if llvm_ver >= 16 and variant == 'gfx11':
//; insert_pattern('buffer_load_b32 v[42:43], off, s[32:35], 0 tfe ; e0500000 80282a80')
@ -562,7 +577,7 @@ BEGIN_TEST(assembler.mubuf)
bld.mubuf(aco_opcode::buffer_atomic_add, Definition(op_v1.physReg(), v1), op_s4, Operand(v1),
Operand::zero(), op_v1, 0, false)
->mubuf()
.glc = true;
.cache = cache_atomic_rtn;
finish_assembler_test();
}
@ -632,25 +647,32 @@ BEGIN_TEST(assembler.mtbuf)
false);
/* Various flags */
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_sys_coherent;
ac_hw_cache_flags cache_non_temporal;
cache_coherent.value = ac_glc;
cache_sys_coherent.value = ac_slc;
cache_non_temporal.value = ac_dlc;
//~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] glc ; e9904000 80082a80
//~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_SYS ; c420007c 190c402a 00000080
bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
nfmt, 0, false)
->mtbuf()
.glc = true;
.cache = cache_coherent;
//~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] dlc ; e9902000 80082a80
//~gfx12! (then repeated 2 times)
bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
nfmt, 0, false)
->mtbuf()
.dlc = true;
.cache = cache_non_temporal;
//~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] slc ; e9901000 80082a80
bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
nfmt, 0, false)
->mtbuf()
.slc = true;
.cache = cache_sys_coherent;
//; if llvm_ver >= 16 and variant == 'gfx11':
//; insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80282a80')
@ -718,19 +740,28 @@ BEGIN_TEST(assembler.mimg)
0x1;
/* Various flags */
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_sys_coherent;
ac_hw_cache_flags cache_non_temporal;
ac_hw_cache_flags cache_atomic_rtn;
cache_coherent.value = ac_glc;
cache_sys_coherent.value = ac_slc;
cache_non_temporal.value = ac_dlc;
cache_atomic_rtn.value = ac_glc;
//~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D dlc ; f06c2f00 2010540a
//~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_SYS ; e7c6c000 100c8054 0000000a
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().dlc =
true;
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
cache_non_temporal;
//~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; f06c4f00 2010540a
//~gfx12! (then repeated 2 times)
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().glc =
true;
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
cache_coherent;
//~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; f06c1f00 2010540a
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().slc =
true;
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
cache_sys_coherent;
//~gfx11! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; f06c0f00 2030540a
//~gfx12! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; e7c6c008 10008054 0000000a
@ -799,7 +830,7 @@ BEGIN_TEST(assembler.mimg)
//~gfx11! image_atomic_add v10, v20, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D glc ; f0304f04 00100a14
//~gfx12! image_atomic_add_uint v10, [v20, v21, v0, v0], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN ; d3c30001 0010800a 00001514
bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4),
op_v1, op_v2, 0xf, false, false, false, true)
op_v1, op_v2, 0xf, false, false, false, cache_atomic_rtn)
->mimg()
.dim = ac_image_2d;
@ -876,16 +907,28 @@ BEGIN_TEST(assembler.flat)
bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), 84);
/* Various flags */
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_sys_coherent;
ac_hw_cache_flags cache_non_temporal;
ac_hw_cache_flags cache_atomic_rtn;
cache_coherent.value = ac_glc;
cache_sys_coherent.value = ac_slc;
cache_non_temporal.value = ac_dlc;
cache_atomic_rtn.value = ac_glc;
//~gfx11! flat_load_b32 v42, v[20:21] slc ; dc508000 2a7c0014
//~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_SYS ; ec05007c 000c002a 00000014
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().slc = true;
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
cache_sys_coherent;
//~gfx11! flat_load_b32 v42, v[20:21] glc ; dc504000 2a7c0014
//~gfx12! (then repeated 2 times)
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().glc = true;
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
cache_coherent;
//~gfx11! flat_load_b32 v42, v[20:21] dlc ; dc502000 2a7c0014
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().dlc = true;
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
cache_non_temporal;
/* Stores */
//~gfx11! flat_store_b32 v[20:21], v10 ; dc680000 007c0a14
@ -895,8 +938,8 @@ BEGIN_TEST(assembler.flat)
/* Atomic with return */
//~gfx11! global_atomic_add_u32 v42, v[20:21], v10, off glc ; dcd64000 2a7c0a14
//~gfx12! global_atomic_add_u32 v42, v[20:21], v10, off th:TH_ATOMIC_RETURN ; ee0d407c 0510002a 00000014
bld.global(aco_opcode::global_atomic_add, dst_v1, op_v2, Operand(s1), op_v1)->global().glc =
true;
bld.global(aco_opcode::global_atomic_add, dst_v1, op_v2, Operand(s1), op_v1)->global().cache =
cache_atomic_rtn;
finish_assembler_test();
}