ac/llvm: rewrite and unify how GLC, DLC, SLC are set

Use ACCESS_* flags at call sites instead of GLC/DLC/SLC.

The ACCESS_* flags are extended to describe other aspects of memory instructions,
such as whether an operation is a load, store, atomic, or SMEM access.

A new function then converts these access flags into the hardware GLC, DLC, and SLC bits.

The new functions are also usable by ACO.
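
As an illustrative aside (not part of the original commit message), the call-site pattern changes like this; the example mirrors the si_write_tess_factors hunk at the end of the diff, with arguments simplified:

    /* before: callers passed hw-specific cache bits directly */
    ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, NULL, byteoffset, tf_base, ac_glc);
    /* after: callers pass chip-agnostic access flags */
    ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, NULL, byteoffset, tf_base, ACCESS_COHERENT);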

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22770>
Marek Olšák, 2023-04-27 03:49:10 -04:00, committed by Marge Bot
parent 968db0208d
commit f98871608c
6 changed files with 270 additions and 123 deletions

@@ -1014,3 +1014,143 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
*tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
}
/* Get chip-agnostic memory instruction access flags (as opposed to chip-specific GLC/DLC/SLC)
* from a NIR memory intrinsic.
*/
enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr)
{
enum gl_access_qualifier access =
nir_intrinsic_has_access(instr) ? nir_intrinsic_access(instr) : 0;
/* Determine ACCESS_MAY_STORE_SUBDWORD. (for the GFX6 TC L1 bug workaround) */
if (!nir_intrinsic_infos[instr->intrinsic].has_dest) {
switch (instr->intrinsic) {
case nir_intrinsic_bindless_image_store:
access |= ACCESS_MAY_STORE_SUBDWORD;
break;
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_buffer_amd:
case nir_intrinsic_store_global:
case nir_intrinsic_store_global_amd:
if (access & ACCESS_USES_FORMAT_AMD ||
(nir_intrinsic_has_align_offset(instr) && nir_intrinsic_align(instr) % 4 != 0) ||
((instr->src[0].ssa->bit_size / 8) * instr->src[0].ssa->num_components) % 4 != 0)
access |= ACCESS_MAY_STORE_SUBDWORD;
break;
default:
unreachable("unexpected store instruction");
}
}
return access;
}
/* Convert chip-agnostic memory access flags into hw-specific cache flags.
*
* "access" must be a result of ac_get_mem_access_flags() with the appropriate ACCESS_TYPE_*
* flags set.
*/
union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level,
enum gl_access_qualifier access)
{
union ac_hw_cache_flags result;
result.value = 0;
assert(util_bitcount(access & (ACCESS_TYPE_LOAD | ACCESS_TYPE_STORE |
ACCESS_TYPE_ATOMIC)) == 1);
assert(!(access & ACCESS_TYPE_SMEM) || access & ACCESS_TYPE_LOAD);
assert(!(access & ACCESS_IS_SWIZZLED_AMD) || !(access & ACCESS_TYPE_SMEM));
assert(!(access & ACCESS_MAY_STORE_SUBDWORD) || access & ACCESS_TYPE_STORE);
bool scope_is_device = access & (ACCESS_COHERENT | ACCESS_VOLATILE);
if (gfx_level >= GFX11) {
/* GFX11 simplified it and exposes what is actually useful.
*
* GLC means device scope for loads only. (stores and atomics are always device scope)
* SLC means non-temporal for GL1 and GL2 caches. (GL1 = hit-evict, GL2 = stream, unavailable in SMEM)
* DLC means non-temporal for MALL. (noalloc, i.e. coherent bypass)
*
* GL0 doesn't have a non-temporal flag, so you always get LRU caching in CU scope.
*/
if (access & ACCESS_TYPE_LOAD && scope_is_device)
result.value |= ac_glc;
if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
result.value |= ac_slc;
} else if (gfx_level >= GFX10) {
/* GFX10-10.3:
*
* VMEM and SMEM loads (SMEM only supports the first four):
* !GLC && !DLC && !SLC means CU scope <== use for normal loads with CU scope
* GLC && !DLC && !SLC means SA scope
* !GLC && DLC && !SLC means CU scope, GL1 bypass
* GLC && DLC && !SLC means device scope <== use for normal loads with device scope
* !GLC && !DLC && SLC means CU scope, non-temporal (GL0 = GL1 = hit-evict, GL2 = stream) <== use for non-temporal loads with CU scope
* GLC && !DLC && SLC means SA scope, non-temporal (GL1 = hit-evict, GL2 = stream)
* !GLC && DLC && SLC means CU scope, GL0 non-temporal, GL1-GL2 coherent bypass (GL0 = hit-evict, GL1 = bypass, GL2 = noalloc)
* GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc) <== use for non-temporal loads with device scope
*
* VMEM stores/atomics (stores are CU scope only if they overwrite the whole cache line,
* atomics are always device scope, GL1 is always bypassed):
* !GLC && !DLC && !SLC means CU scope <== use for normal stores with CU scope
* GLC && !DLC && !SLC means device scope <== use for normal stores with device scope
* !GLC && DLC && !SLC means CU scope, GL2 non-coherent bypass
* GLC && DLC && !SLC means device scope, GL2 non-coherent bypass
* !GLC && !DLC && SLC means CU scope, GL2 non-temporal (stream) <== use for non-temporal stores with CU scope
* GLC && !DLC && SLC means device scope, GL2 non-temporal (stream) <== use for non-temporal stores with device scope
* !GLC && DLC && SLC means CU scope, GL2 coherent bypass (noalloc)
* GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc)
*
* "stream" allows write combining in GL2. "coherent bypass" doesn't.
* "non-coherent bypass" doesn't guarantee ordering with any coherent stores.
*/
if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC))
result.value |= ac_glc | (access & ACCESS_TYPE_LOAD ? ac_dlc : 0);
if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
result.value |= ac_slc;
} else {
/* GFX6-GFX9:
*
* VMEM loads:
* !GLC && !SLC means CU scope
* GLC && !SLC means (GFX6: device scope, GFX7-9: device scope [*])
* !GLC && SLC means (GFX6: CU scope, GFX7: device scope, GFX8-9: CU scope), GL2 non-temporal (stream)
* GLC && SLC means device scope, GL2 non-temporal (stream)
*
* VMEM stores (atomics don't have [*]):
* !GLC && !SLC means (GFX6: CU scope, GFX7-9: device scope [*])
* GLC && !SLC means (GFX6-7: device scope, GFX8-9: device scope [*])
* !GLC && SLC means (GFX6: CU scope, GFX7-9: device scope [*]), GL2 non-temporal (stream)
* GLC && SLC means device scope, GL2 non-temporal (stream)
*
* [*] data can be cached in GL1 for future CU scope
*
* SMEM loads:
* GLC means device scope (available on GFX8+)
*/
if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC)) {
/* SMEM doesn't support the device scope on GFX6-7. */
assert(gfx_level >= GFX8 || !(access & ACCESS_TYPE_SMEM));
result.value |= ac_glc;
}
if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
result.value |= ac_slc;
/* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All store opcodes not
* aligned to a dword are affected.
*/
if (gfx_level == GFX6 && access & ACCESS_MAY_STORE_SUBDWORD)
result.value |= ac_glc;
}
if (access & ACCESS_IS_SWIZZLED_AMD)
result.value |= ac_swizzled;
return result;
}
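
Aside: a minimal sketch (not from the commit) of how the two new helpers compose when lowering a NIR store; gfx_level and instr are assumed to come from the surrounding lowering context:

    enum gl_access_qualifier access = ac_get_mem_access_flags(instr);
    union ac_hw_cache_flags cache =
       ac_get_hw_cache_flags(gfx_level, access | ACCESS_TYPE_STORE);
    /* e.g. a coherent store yields cache.value == ac_glc on GFX10-10.3,
     * but cache.value == 0 on GFX11, where stores are always device scope. */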

@@ -46,6 +46,41 @@ extern "C" {
#define AC_SENDMSG_GS_OP_EMIT (2 << 4)
#define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4)
/* An extension of gl_access_qualifier describing other aspects of memory operations
* for code generation.
*/
enum {
/* Only one of LOAD/STORE/ATOMIC can be set. */
ACCESS_TYPE_LOAD = BITFIELD_BIT(27),
ACCESS_TYPE_STORE = BITFIELD_BIT(28),
ACCESS_TYPE_ATOMIC = BITFIELD_BIT(29),
/* This access is expected to use an SMEM instruction if source operands are non-divergent.
* Only loads can set this.
*/
ACCESS_TYPE_SMEM = BITFIELD_BIT(30),
/* Whether a store offset or size alignment is less than 4. */
ACCESS_MAY_STORE_SUBDWORD = BITFIELD_BIT(31),
};
/* The meaning of these enums is different between chips. They match LLVM definitions,
* but they can also be used by ACO. Use ac_get_hw_cache_flags to get these.
*/
enum ac_cache_flags
{
ac_glc = BITFIELD_BIT(0),
ac_slc = BITFIELD_BIT(1),
ac_dlc = BITFIELD_BIT(2),
ac_swizzled = BITFIELD_BIT(3),
};
union ac_hw_cache_flags
{
/* NOTE: This will contain more fields in the future. */
enum ac_cache_flags value;
};
enum ac_image_dim
{
ac_image_1d,
@@ -199,6 +234,11 @@ ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
bool streamout_enabled,
bool can_cull);
enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr);
union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level,
enum gl_access_qualifier access);
#ifdef __cplusplus
}
#endif
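
Aside: an illustration (not part of the header) of composing the extended bits with the base qualifiers; the expected result follows from the GFX10 branch of ac_get_hw_cache_flags above:

    /* A device-scope SMEM load on GFX10-10.3 maps to GLC && DLC. */
    enum gl_access_qualifier acc = ACCESS_COHERENT | ACCESS_TYPE_LOAD | ACCESS_TYPE_SMEM;
    union ac_hw_cache_flags hw = ac_get_hw_cache_flags(GFX10, acc);
    assert(hw.value == (ac_glc | ac_dlc));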

@@ -1221,23 +1221,15 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
return ac_build_load_custom(ctx, ptr.t, ptr.v, index, true, true, false);
}
static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
static unsigned get_cache_flags(struct ac_llvm_context *ctx, enum gl_access_qualifier access)
{
return cache_policy |
(ctx->gfx_level >= GFX10 && ctx->gfx_level < GFX11 && cache_policy & ac_glc ? ac_dlc : 0);
}
static unsigned get_store_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
{
if (ctx->gfx_level >= GFX11)
cache_policy &= ~ac_glc; /* GLC has no effect on stores */
return cache_policy;
return ac_get_hw_cache_flags(ctx->gfx_level, access).value;
}
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef data, LLVMValueRef vindex,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy, bool use_format)
enum gl_access_qualifier access, bool use_format)
{
LLVMValueRef args[6];
int idx = 0;
@@ -1247,7 +1239,7 @@ static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueR
args[idx++] = vindex ? vindex : ctx->i32_0;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
args[idx++] = LLVMConstInt(ctx->i32, get_store_cache_policy(ctx, cache_policy), 0);
args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_STORE), 0);
const char *indexing_kind = vindex ? "struct" : "raw";
char name[256], type_name[8];
@@ -1264,15 +1256,15 @@ static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueR
}
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access)
{
ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true);
ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, access, true);
}
/* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy)
enum gl_access_qualifier access)
{
unsigned num_channels = ac_get_llvm_num_components(vdata);
@@ -1288,19 +1280,19 @@ void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
LLVMConstInt(ctx->i32, 8, 0), "");
ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, cache_policy);
ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, cache_policy);
ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, access);
ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, access);
return;
}
ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
cache_policy, false);
access, false);
}
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
LLVMValueRef soffset, unsigned num_channels,
LLVMTypeRef channel_type, unsigned cache_policy,
LLVMTypeRef channel_type, enum gl_access_qualifier access,
bool can_speculate, bool use_format)
{
LLVMValueRef args[5];
@@ -1310,7 +1302,7 @@ static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLV
args[idx++] = vindex;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
unsigned func =
!ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = vindex ? "struct" : "raw";
@@ -1339,11 +1331,10 @@ static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLV
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
LLVMTypeRef channel_type, unsigned cache_policy,
LLVMTypeRef channel_type, enum gl_access_qualifier access,
bool can_speculate, bool allow_smem)
{
if (allow_smem && !(cache_policy & ac_slc) &&
(!(cache_policy & ac_glc) || ctx->gfx_level >= GFX8)) {
if (allow_smem && (!(access & ACCESS_COHERENT) || ctx->gfx_level >= GFX8)) {
assert(vindex == NULL);
LLVMValueRef result[32];
@@ -1365,7 +1356,8 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc
LLVMValueRef args[3] = {
rsrc,
offset,
LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD |
ACCESS_TYPE_SMEM), 0),
};
result[i] = ac_build_intrinsic(ctx, name, channel_type, args, 3, AC_ATTR_INVARIANT_LOAD);
}
@@ -1386,7 +1378,7 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc
LLVMConstInt(ctx->i32, i * ac_get_type_size(channel_type), 0), "");
LLVMValueRef item =
ac_build_buffer_load_common(ctx, rsrc, vindex, fetch_voffset, soffset, fetch_num_channels,
channel_type, cache_policy, can_speculate, false);
channel_type, access, can_speculate, false);
result = ac_build_concat(ctx, result, item);
}
@@ -1395,13 +1387,13 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
unsigned num_channels, unsigned cache_policy,
unsigned num_channels, enum gl_access_qualifier access,
bool can_speculate, bool d16, bool tfe)
{
if (tfe) {
assert(!d16);
cache_policy = get_load_cache_policy(ctx, cache_policy);
unsigned cache_flags = get_cache_flags(ctx, access | ACCESS_TYPE_LOAD);
char code[256];
/* The definition in the assembly and the one in the constraint string
@@ -1415,9 +1407,9 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueR
"v_mov_b32 v4, 0\n"
"buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
"s_waitcnt vmcnt(0)",
cache_policy & ac_glc ? "glc" : "",
cache_policy & ac_slc ? "slc" : "",
cache_policy & ac_dlc ? "dlc" : "");
cache_flags & ac_glc ? "glc" : "",
cache_flags & ac_slc ? "slc" : "",
cache_flags & ac_dlc ? "dlc" : "");
LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
@@ -1435,7 +1427,7 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueR
}
return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
num_channels, d16 ? ctx->f16 : ctx->f32, cache_policy,
num_channels, d16 ? ctx->f16 : ctx->f32, access,
can_speculate, true);
}
@@ -1443,7 +1435,7 @@ static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValue
LLVMValueRef vindex, LLVMValueRef voffset,
LLVMValueRef soffset, unsigned num_channels,
unsigned tbuffer_format, LLVMTypeRef channel_type,
unsigned cache_policy, bool can_speculate)
enum gl_access_qualifier access, bool can_speculate)
{
LLVMValueRef args[6];
int idx = 0;
@@ -1453,7 +1445,7 @@ static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValue
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
args[idx++] = LLVMConstInt(ctx->i32, tbuffer_format, 0);
args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
const char *indexing_kind = vindex ? "struct" : "raw";
char name[256], type_name[8];
@@ -1474,7 +1466,7 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe
unsigned align_offset,
unsigned align_mul,
unsigned num_channels,
unsigned cache_policy,
enum gl_access_qualifier access,
bool can_speculate)
{
const unsigned max_channels = vtx_info->num_channels;
@@ -1503,7 +1495,7 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe
LLVMValueRef item =
ac_build_tbuffer_load(ctx, rsrc, vidx, fetch_voffset, soffset,
fetch_num_channels, fetch_format, channel_type,
cache_policy, can_speculate);
access, can_speculate);
result = ac_build_concat(ctx, result, item);
}
@@ -1513,35 +1505,35 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy)
enum gl_access_qualifier access)
{
return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
cache_policy, false, false);
access, false, false);
}
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy)
enum gl_access_qualifier access)
{
return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy,
return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, access,
false, false);
}
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy)
enum gl_access_qualifier access)
{
vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false);
ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, access, false);
}
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access)
{
vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false);
ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, access, false);
}
/**
@@ -2025,7 +2017,11 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_
args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
args[num_args++] = LLVMConstInt(
ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);
ctx->i32, get_cache_flags(ctx,
a->access |
(atomic ? ACCESS_TYPE_ATOMIC :
load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE)),
false);
const char *name;
const char *atomic_subop = "";

@@ -281,28 +281,28 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy);
enum gl_access_qualifier access);
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy);
LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access);
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
LLVMTypeRef channel_type, unsigned cache_policy,
LLVMTypeRef channel_type, enum gl_access_qualifier access,
bool can_speculate, bool allow_smem);
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
unsigned num_channels, unsigned cache_policy,
unsigned num_channels, enum gl_access_qualifier access,
bool can_speculate, bool d16, bool tfe);
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy);
enum gl_access_qualifier access);
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy);
enum gl_access_qualifier access);
LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
@@ -312,15 +312,15 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe
unsigned align_offset,
unsigned align_mul,
unsigned num_channels,
unsigned cache_policy,
enum gl_access_qualifier access,
bool can_speculate);
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy);
enum gl_access_qualifier access);
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy);
LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access);
void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
unsigned hi);
@@ -391,21 +391,12 @@ enum ac_atomic_op
ac_atomic_fmax,
};
/* These cache policy bits match the definitions used by the LLVM intrinsics. */
enum ac_image_cache_policy
{
ac_glc = 1 << 0, /* per-CU cache control */
ac_slc = 1 << 1, /* global L2 cache control */
ac_dlc = 1 << 2, /* per-shader-array cache control */
ac_swizzled = 1 << 3, /* the access is swizzled, disabling load/store merging */
};
struct ac_image_args {
enum ac_image_opcode opcode;
enum ac_atomic_op atomic; /* for the ac_image_atomic opcode */
enum ac_image_dim dim;
enum gl_access_qualifier access;
unsigned dmask : 4;
unsigned cache_policy : 3;
bool unorm : 1;
bool level_zero : 1;
bool d16 : 1; /* GFX8+: data and return values are 16-bit */
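
Aside: with enum ac_image_cache_policy gone, image call sites keep the NIR access flags in args.access, and the type bit is ORed in at emit time, as ac_build_image_opcode does above. A hedged sketch (atomic and load are assumed booleans from the surrounding code):

    enum gl_access_qualifier type_bit =
       atomic ? ACCESS_TYPE_ATOMIC : load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE;
    unsigned cache_flags = ac_get_hw_cache_flags(gfx_level, args.access | type_bit).value;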

@@ -1804,26 +1804,6 @@ static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueR
}
}
static unsigned get_cache_policy(struct ac_nir_context *ctx, enum gl_access_qualifier access,
bool may_store_unaligned)
{
unsigned cache_policy = 0;
/* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All
* store opcodes not aligned to a dword are affected. The only way to
* get unaligned stores is through shader images.
*/
if (((may_store_unaligned && ctx->ac.gfx_level == GFX6) ||
access & (ACCESS_COHERENT | ACCESS_VOLATILE))) {
cache_policy |= ac_glc;
}
if (access & ACCESS_NON_TEMPORAL)
cache_policy |= ac_slc | ac_glc;
return cache_policy;
}
static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx, struct waterfall_context *wctx,
const nir_intrinsic_instr *instr, nir_src src)
{
@@ -1841,8 +1821,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in
LLVMValueRef src_data = get_src(ctx, instr->src[0]);
int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8;
unsigned writemask = nir_intrinsic_write_mask(instr);
enum gl_access_qualifier access = nir_intrinsic_access(instr);
unsigned cache_policy = get_cache_policy(ctx, access, false);
enum gl_access_qualifier access = ac_get_mem_access_flags(instr);
struct waterfall_context wctx;
LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]);
@@ -1897,9 +1876,9 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in
LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), "");
if (num_bytes == 1) {
ac_build_buffer_store_byte(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy);
ac_build_buffer_store_byte(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, access);
} else if (num_bytes == 2) {
ac_build_buffer_store_short(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy);
ac_build_buffer_store_short(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, access);
} else {
switch (num_bytes) {
case 16: /* v4f32 */
@@ -1920,7 +1899,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in
data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, "");
ac_build_buffer_store_dword(&ctx->ac, rsrc, data, NULL, offset,
ctx->ac.i32_0, cache_policy);
ctx->ac.i32_0, access);
}
}
@@ -2066,11 +2045,16 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, nir_intrinsic_
data = ac_to_float(&ctx->ac, data);
return_type = LLVMTypeOf(data);
}
unsigned cache_flags =
ac_get_hw_cache_flags(ctx->ac.gfx_level,
ac_get_mem_access_flags(instr) | ACCESS_TYPE_ATOMIC).value;
params[arg_count++] = data;
params[arg_count++] = descriptor;
params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
params[arg_count++] = ctx->ac.i32_0; /* soffset */
params[arg_count++] = ctx->ac.i32_0; /* slc */
params[arg_count++] = LLVMConstInt(ctx->ac.i32, cache_flags, 0);
ac_build_type_name_for_intr(return_type, type, sizeof(type));
snprintf(name, sizeof(name), "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
@@ -2095,8 +2079,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, nir_intrinsic_
int elem_size_bytes = instr->dest.ssa.bit_size / 8;
int num_components = instr->num_components;
enum gl_access_qualifier access = nir_intrinsic_access(instr);
unsigned cache_policy = get_cache_policy(ctx, access, false);
enum gl_access_qualifier access = ac_get_mem_access_flags(instr);
LLVMValueRef offset = get_src(ctx, instr->src[1]);
LLVMValueRef rsrc = ctx->abi->load_ssbo ?
@@ -2122,16 +2105,16 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, nir_intrinsic_
if (load_bytes == 1) {
ret = ac_build_buffer_load_byte(&ctx->ac, rsrc, voffset, ctx->ac.i32_0,
cache_policy);
access);
} else if (load_bytes == 2) {
ret = ac_build_buffer_load_short(&ctx->ac, rsrc, voffset, ctx->ac.i32_0,
cache_policy);
access);
} else {
int num_channels = util_next_power_of_two(load_bytes) / 4;
bool can_speculate = access & ACCESS_CAN_REORDER;
ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, vindex, voffset, ctx->ac.i32_0,
ctx->ac.f32, cache_policy, can_speculate, false);
ctx->ac.f32, access, can_speculate, false);
}
LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret)));
@@ -2507,7 +2490,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri
struct ac_image_args args = {0};
args.cache_policy = get_cache_policy(ctx, access, false);
args.access = ac_get_mem_access_flags(instr);
args.tfe = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
if (dim == GLSL_SAMPLER_DIM_BUF) {
@@ -2523,7 +2506,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri
assert(instr->dest.is_ssa);
bool can_speculate = access & ACCESS_CAN_REORDER;
res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, ctx->ac.i32_0, num_channels,
args.cache_policy, can_speculate,
args.access, can_speculate,
instr->dest.ssa.bit_size == 16,
args.tfe);
res = ac_build_expand(&ctx->ac, res, num_channels, args.tfe ? 5 : 4);
@@ -2588,14 +2571,13 @@ static void visit_image_store(struct ac_nir_context *ctx, const nir_intrinsic_in
}
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
enum gl_access_qualifier access = nir_intrinsic_access(instr);
bool is_array = nir_intrinsic_image_array(instr);
struct waterfall_context wctx;
LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
struct ac_image_args args = {0};
args.cache_policy = get_cache_policy(ctx, access, true);
args.access = ac_get_mem_access_flags(instr);
LLVMValueRef src = get_src(ctx, instr->src[3]);
if (instr->src[3].ssa->bit_size == 64) {
@@ -2617,7 +2599,7 @@ static void visit_image_store(struct ac_nir_context *ctx, const nir_intrinsic_in
vindex =
LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, "");
ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, ctx->ac.i32_0, args.cache_policy);
ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, ctx->ac.i32_0, args.access);
} else {
bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
@@ -2730,9 +2712,12 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_int
} else {
LLVMTypeRef data_type = LLVMTypeOf(params[0]);
char type[8];
unsigned cache_flags =
ac_get_hw_cache_flags(ctx->ac.gfx_level,
ac_get_mem_access_flags(instr) | ACCESS_TYPE_ATOMIC).value;
params[param_count++] = ctx->ac.i32_0; /* soffset */
params[param_count++] = ctx->ac.i32_0; /* slc */
params[param_count++] = LLVMConstInt(ctx->ac.i32, cache_flags, 0);
ac_build_type_name_for_intr(data_type, type, sizeof(type));
length = snprintf(intrinsic_name, sizeof(intrinsic_name),
@@ -2752,6 +2737,7 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_int
args.resource = ctx->abi->load_sampler_desc(ctx->abi, dynamic_index, AC_DESC_IMAGE);
get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
args.dim = ac_get_image_dim(ctx->ac.gfx_level, dim, is_array);
args.access = ac_get_mem_access_flags(instr);
result = ac_build_image_opcode(&ctx->ac, &args);
}
@@ -3805,19 +3791,9 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
LLVMValueRef vidx = idxen ? get_src(ctx, instr->src[src_base + 3]) : NULL;
unsigned num_components = instr->dest.ssa.num_components;
unsigned const_offset = nir_intrinsic_base(instr);
bool swizzled = nir_intrinsic_access(instr) & ACCESS_IS_SWIZZLED_AMD;
bool reorder = nir_intrinsic_can_reorder(instr);
bool coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT;
bool slc = nir_intrinsic_access(instr) & ACCESS_NON_TEMPORAL;
bool uses_format = nir_intrinsic_access(instr) & ACCESS_USES_FORMAT_AMD;
enum ac_image_cache_policy cache_policy = 0;
if (swizzled)
cache_policy |= ac_swizzled;
if (slc)
cache_policy |= ac_slc;
if (coherent)
cache_policy |= ac_glc;
enum gl_access_qualifier access = ac_get_mem_access_flags(instr);
bool uses_format = access & ACCESS_USES_FORMAT_AMD;
LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, addr_voffset,
LLVMConstInt(ctx->ac.i32, const_offset, 0), "");
@@ -3825,12 +3801,12 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
if (instr->intrinsic == nir_intrinsic_load_buffer_amd && uses_format) {
assert(instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 32);
result = ac_build_buffer_load_format(&ctx->ac, descriptor, vidx, voffset, num_components,
cache_policy, reorder,
access, reorder,
instr->dest.ssa.bit_size == 16, false);
result = ac_to_integer(&ctx->ac, result);
} else if (instr->intrinsic == nir_intrinsic_store_buffer_amd && uses_format) {
assert(instr->src[0].ssa->bit_size == 16 || instr->src[0].ssa->bit_size == 32);
ac_build_buffer_store_format(&ctx->ac, descriptor, store_data, vidx, voffset, cache_policy);
ac_build_buffer_store_format(&ctx->ac, descriptor, store_data, vidx, voffset, access);
} else if (instr->intrinsic == nir_intrinsic_load_buffer_amd ||
instr->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
/* LLVM is unable to select instructions for larger than 32-bit channel types.
@@ -3843,7 +3819,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
if (instr->intrinsic == nir_intrinsic_load_buffer_amd) {
result = ac_build_buffer_load(&ctx->ac, descriptor, fetch_num_components, vidx, voffset,
addr_soffset, channel_type, cache_policy, reorder, false);
addr_soffset, channel_type, access, reorder, false);
} else {
const unsigned align_offset = nir_intrinsic_align_offset(instr);
const unsigned align_mul = nir_intrinsic_align_mul(instr);
@@ -3854,7 +3830,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
result =
ac_build_safe_tbuffer_load(&ctx->ac, descriptor, vidx, addr_voffset, addr_soffset,
channel_type, vtx_info, const_offset, align_offset,
align_mul, fetch_num_components, cache_policy, reorder);
align_mul, fetch_num_components, access, reorder);
}
/* Trim to needed vector components. */
@@ -3884,7 +3860,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
LLVMValueRef data = extract_vector_range(&ctx->ac, store_data, start, count);
ac_build_buffer_store_dword(&ctx->ac, descriptor, data, vidx, voffset, addr_soffset,
cache_policy);
access);
}
}
break;
@@ -3933,12 +3909,15 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
LLVMValueRef data = get_src(ctx, instr->src[1]);
unsigned base = nir_intrinsic_base(instr);
LLVMTypeRef return_type = LLVMTypeOf(data);
unsigned cache_flags =
ac_get_hw_cache_flags(ctx->ac.gfx_level,
ac_get_mem_access_flags(instr) | ACCESS_TYPE_ATOMIC).value;
LLVMValueRef args[] = {
data, desc,
LLVMConstInt(ctx->ac.i32, base, false),
ctx->ac.i32_0, /* soffset */
ctx->ac.i32_0, /* cachepolicy */
LLVMConstInt(ctx->ac.i32, cache_flags, 0),
};
char name[64], type[8];

@@ -372,7 +372,8 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader
ac_build_ifcc(&ctx->ac,
LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504);
ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0),
NULL, LLVMConstInt(ctx->ac.i32, offset, 0), tf_base, ac_glc);
NULL, LLVMConstInt(ctx->ac.i32, offset, 0), tf_base,
ACCESS_COHERENT);
ac_build_endif(&ctx->ac, 6504);
offset += 4;
}
@@ -381,13 +382,13 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader
ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, NULL,
LLVMBuildAdd(ctx->ac.builder, byteoffset,
LLVMConstInt(ctx->ac.i32, offset, 0), ""),
tf_base, ac_glc);
tf_base, ACCESS_COHERENT);
offset += 16;
if (vec1)
ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, NULL,
LLVMBuildAdd(ctx->ac.builder, byteoffset,
LLVMConstInt(ctx->ac.i32, offset, 0), ""),
tf_base, ac_glc);
tf_base, ACCESS_COHERENT);
/* Store the tess factors into the offchip buffer if TES reads them. */
if (shader->key.ge.part.tcs.epilog.tes_reads_tess_factors) {
@@ -405,7 +406,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader
outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_comps);
ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, NULL, tf_outer_offset,
base, ac_glc);
base, ACCESS_COHERENT);
if (inner_comps) {
param_inner = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER);
tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
@@ -413,7 +414,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader
inner_vec = ac_build_gather_values(&ctx->ac, inner, inner_comps);
ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, NULL,
tf_inner_offset, base, ac_glc);
tf_inner_offset, base, ACCESS_COHERENT);
}
}
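
Closing aside: a worked example (an assumption for illustration, not from the commit) of the GFX6 TC L1 workaround path in ac_get_hw_cache_flags:

    /* A sub-dword store on GFX6 gets GLC even without device scope. */
    enum gl_access_qualifier acc = ACCESS_TYPE_STORE | ACCESS_MAY_STORE_SUBDWORD;
    assert(ac_get_hw_cache_flags(GFX6, acc).value == ac_glc);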