util/cache_ops: Add some cache flush helpers

The x86 implementation was shamelessly stolen from intel_mem.c and the
aarch64 implementation was based on the code in Turnip.

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Tested-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37803>
Faith Ekstrand 2025-07-24 19:47:33 +00:00 committed by Marge Bot
parent 1dea86f773
commit 555881e574
6 changed files with 611 additions and 1 deletion

src/util/cache_ops.h

@@ -0,0 +1,115 @@
/*
* Copyright © 2025 Collabora Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef UTIL_CACHE_OPS_H
#define UTIL_CACHE_OPS_H
#include <stdbool.h>
#include <stddef.h>
#include "detect_arch.h"
#ifdef __cplusplus
extern "C" {
#endif
/** Returns true if we have cache operations available */
static inline bool
util_has_cache_ops(void)
{
/* TODO: Port to MSVC if and when we have Windows hardware drivers that
* need cache flushing ops.
*/
#if defined(_MSC_VER)
return false;
#endif
return DETECT_ARCH_X86 || DETECT_ARCH_X86_64 || DETECT_ARCH_AARCH64;
}
/** Returns the cache granularity
*
* This is the maximum number of bytes that may be overwritten as the result
* of a cache flush or cache line eviction. On big.LITTLE platforms, the
* cache flush helpers may sometimes operate at a smaller granularity, but
* will never affect more than util_cache_granularity() bytes.
*
* Vulkan drivers should return this as nonCoherentAtomSize.
*/
size_t util_cache_granularity(void);
/** Flushes a range to main memory */
void util_flush_range(void *start, size_t size);
/** Flushes a range to main memory and invalidates those cache lines */
void util_flush_inval_range(void *start, size_t size);
/** Flushes a range to main memory without fencing
*
* This is for the case where you have a lot of ranges to flush and want to
* avoid unnecessary fencing. In this case, call
*
* util_pre_flush_fence()
* util_flush_range_no_fence()
* util_flush_range_no_fence()
* util_post_flush_fence()
*/
void util_flush_range_no_fence(void *start, size_t size);
/** Flushes a range to main memory and invalidates those cache lines without
* fencing
*
* This is for the case where you have a lot of ranges to flush and invalidate
* and want to avoid unnecessary fencing. In this case, call
*
* util_pre_flush_fence()
* util_flush_inval_range_no_fence()
* util_flush_range_no_fence()
* util_flush_inval_range_no_fence()
* util_post_flush_inval_fence()
*/
void util_flush_inval_range_no_fence(void *start, size_t size);
/** Fence between memory access and cache flush operations
*
* see util_flush_range_no_fence()
*/
void util_pre_flush_fence(void);
/** Fence between cache flush operations and memory access
*
* see util_flush_range_no_fence()
*/
void util_post_flush_fence(void);
/** Fence between cache invalidate operations and memory access
*
* see util_flush_inval_range_no_fence()
*/
void util_post_flush_inval_fence(void);
#ifdef __cplusplus
}
#endif
#endif /* UTIL_CACHE_OPS_H */
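
The no-fence variants documented above are meant to be paired with the
explicit fence helpers. As a hedged usage sketch (not part of this commit;
the struct and function names below are invented for illustration), a driver
flushing several CPU-written ranges of a non-coherent buffer might do:

#include "util/cache_ops.h"

struct dirty_range {
   void *ptr;
   size_t size;
};

static void
flush_dirty_ranges(const struct dirty_range *ranges, unsigned count)
{
   /* One fence on each side covers all of the per-range flushes. */
   util_pre_flush_fence();
   for (unsigned i = 0; i < count; i++)
      util_flush_range_no_fence(ranges[i].ptr, ranges[i].size);
   util_post_flush_fence();
}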

src/util/cache_ops_aarch64.c

@@ -0,0 +1,228 @@
/*
* Copyright © 2025 Collabora Ltd. and Igalia S.L.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "cache_ops.h"
#include "util/macros.h"
#include "util/u_atomic.h"
static uint32_t
get_ctr_el0(void)
{
uint32_t ctr_el0;
__asm("mrs\t%x0, ctr_el0" : "=r"(ctr_el0));
return ctr_el0;
}
static uint32_t
get_ctr_cwg(void)
{
return (get_ctr_el0() >> 24) & 0xf;
}
size_t
util_cache_granularity(void)
{
static uint32_t cached_size = 0;
uint32_t size = p_atomic_read(&cached_size);
if (likely(size > 0))
return size;
/* We use CTR_EL0.CWG as the cache granularity. According to Arm:
*
* "CWG, [27:24]
*
* Cache write-back granule. Log2 of the number of words of the maximum
* size of memory that can be overwritten as a result of the eviction of
* a cache entry that has had a memory location in it modified"
*
* On big.LITTLE CPUs, Linux will trap on fetching CTR_EL0 and take the
* maximum across all CPU cores so this should really be the maximum that
* drivers and clients can assume.
*/
size = 4 << get_ctr_cwg();
p_atomic_set(&cached_size, size);
return size;
}
static size_t
get_dmin_line(void)
{
static uint32_t cached_size = 0;
uint32_t size = p_atomic_read(&cached_size);
if (likely(size > 0))
return size;
/* For walking cache lines, we want to use CTR_EL0.DminLine as the step
* size. According to Arm:
*
* "DminLine, [19:16]
*
* Log2 of the number of words in the smallest cache line of all the
* data and unified caches that the core controls"
*
* On big.LITTLE CPUs, Linux will trap on fetching CTR_EL0 and take the
* minimum across all CPU cores so this should be safe no matter what core
* we happen to be living on.
*/
size = 4 << ((get_ctr_el0() >> 16) & 0xf);
p_atomic_set(&cached_size, size);
return size;
}
static void
flush_l1_cacheline(UNUSED void *p)
{
/* Clean data cache. */
__asm volatile("dc cvac, %0" : : "r" (p) : "memory");
}
static void
flush_inval_l1_cacheline(UNUSED void *p)
{
/* Clean and Invalidate data cache, there is no separate Invalidate. */
__asm volatile("dc civac, %0" : : "r" (p) : "memory");
}
static void
data_sync_bar(void)
{
__asm volatile("dsb sy");
}
void
util_flush_range_no_fence(void *start, size_t size)
{
uintptr_t l1_cacheline_size = get_dmin_line();
char *p = (char *) (((uintptr_t) start) & ~(l1_cacheline_size - 1));
char *end = ((char *) start) + size;
while (p < end) {
flush_l1_cacheline(p);
p += l1_cacheline_size;
}
}
void
util_flush_inval_range_no_fence(void *start, size_t size)
{
uintptr_t l1_cacheline_size = get_dmin_line();
char *p = (char *) (((uintptr_t) start) & ~(l1_cacheline_size - 1));
char *end = ((char *) start) + size;
while (p < end) {
flush_inval_l1_cacheline(p);
p += l1_cacheline_size;
}
}
void
util_flush_range(void *p, size_t size)
{
if (size == 0)
return;
util_pre_flush_fence();
util_flush_range_no_fence(p, size);
util_post_flush_fence();
}
void
util_flush_inval_range(void *p, size_t size)
{
if (size == 0)
return;
util_pre_flush_fence();
util_flush_inval_range_no_fence(p, size);
util_post_flush_inval_fence();
}
void
util_pre_flush_fence(void)
{
/* From the Arm ® Architecture Reference Manual (revision L.b):
*
* "All data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
* that specify an address: [...] Execute in program order relative to
* other data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
* that specify an address within the same cache line of minimum size,
* as indicated by CTR_EL0.DMinLine."
*
* So cache flush operations are properly ordered against memory accesses
* and there's nothing we need to do to ensure that prior writes land
* before the cache flush operations flush the data.
*
* In the case where this pre_flush_fence() is called before a flush/inval
* used for a GPU -> CPU barrier, there is also nothing to do because it's
* the responsibility of the GPU to ensure that all memory writes have
* landed before we see this on the CPU side.
*/
}
void
util_post_flush_fence(void)
{
/* From the Arm ® Architecture Reference Manual (revision L.b):
*
* "A cache maintenance instruction can complete at any time after it is
* executed, but is only guaranteed to be complete, and its effects
* visible to other observers, following a DSB instruction executed by
* the PE that executed the cache maintenance instruction."
*
* In order to ensure that the GPU sees data flushed by prior cache flushes,
* we need to execute a DSB to ensure the flushes land.
*/
data_sync_bar();
}
void
util_post_flush_inval_fence(void)
{
/* From the Arm ® Architecture Reference Manual (revision L.b):
*
* "All data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
* that specify an address: [...] Execute in program order relative to
* other data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
* that specify an address within the same cache line of minimum size,
* as indicated by CTR_EL0.DMinLine."
*
* This seems to imply that memory access that happens after the cache
* flush/invalidate operation would be properly ordered with respect to it.
* However, the manual also says:
*
* "A cache maintenance instruction can complete at any time after it is
* executed, but is only guaranteed to be complete, and its effects
* visible to other observers, following a DSB instruction executed by
* the PE that executed the cache maintenance instruction."
*
* In practice, it appears that the ordering guarantees only really apply
* to the queue order in the data cache and not the order in which
* operations complete. In other words, a read which is queued after the
* invalidate may still use the stale cache line unless we explicitly
* insert a DSB between them.
*/
data_sync_bar();
}
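
To make the CTR_EL0 decoding above easier to follow, here is a small
illustrative sketch (not part of the commit; the register value is made up).
Both fields are log2 counts of 4-byte words, hence the "4 <<" in the code
above:

#include <stdint.h>
#include <stdio.h>

/* CWG, bits [27:24]: maximum write-back granule, in bytes. */
static uint32_t
ctr_cwg_bytes(uint32_t ctr_el0)
{
   return 4u << ((ctr_el0 >> 24) & 0xf);
}

/* DminLine, bits [19:16]: smallest D-cache line the core controls, in bytes. */
static uint32_t
ctr_dminline_bytes(uint32_t ctr_el0)
{
   return 4u << ((ctr_el0 >> 16) & 0xf);
}

int
main(void)
{
   uint32_t ctr = 0x84448004; /* hypothetical CTR_EL0 value */
   printf("granularity %u bytes, line step %u bytes\n",
          ctr_cwg_bytes(ctr), ctr_dminline_bytes(ctr));
   return 0;
}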

src/util/cache_ops_null.c

@@ -0,0 +1,70 @@
/*
* Copyright © 2025 Collabora Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "cache_ops.h"
#include "util/macros.h"
size_t
util_cache_granularity(void)
{
return 0;
}
void
util_flush_range(void *start, size_t size)
{
UNREACHABLE("Cache ops are not implemented on this platform");
}
void
util_flush_inval_range(void *start, size_t size)
{
UNREACHABLE("Cache ops are not implemented on this platform");
}
void
util_flush_range_no_fence(void *start, size_t size)
{
UNREACHABLE("Cache ops are not implemented on this platform");
}
void
util_flush_inval_range_no_fence(void *start, size_t size)
{
UNREACHABLE("Cache ops are not implemented on this platform");
}
void
util_pre_flush_fence(void)
{
UNREACHABLE("Cache ops are not implemented on this platform");
}
void
util_post_flush_fence(void)
{
UNREACHABLE("Cache ops are not implemented on this platform");
}
void
util_post_flush_inval_fence(void)
{
UNREACHABLE("Cache ops are not implemented on this platform");
}

src/util/cache_ops_x86.c

@@ -0,0 +1,129 @@
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "cache_ops.h"
#include "u_cpu_detect.h"
#define CACHELINE_SIZE 64
#define CACHELINE_MASK 63
size_t
util_cache_granularity(void)
{
return util_get_cpu_caps()->cacheline;
}
/* Defined in cache_ops_x86_clflushopt.c */
#ifdef HAVE___BUILTIN_IA32_CLFLUSHOPT
void util_clflushopt_range(void *start, size_t size);
#endif
static void
util_clflush_range(void *start, size_t size)
{
char *p = (char *) (((uintptr_t) start) & ~CACHELINE_MASK);
char *end = ((char *) start) + size;
while (p < end) {
__builtin_ia32_clflush(p);
p += CACHELINE_SIZE;
}
}
void
util_flush_range_no_fence(void *start, size_t size)
{
#ifdef HAVE___BUILTIN_IA32_CLFLUSHOPT
if (util_get_cpu_caps()->has_clflushopt) {
util_clflushopt_range(start, size);
return;
}
#endif
util_clflush_range(start, size);
}
void
util_flush_range(void *start, size_t size)
{
__builtin_ia32_mfence();
util_flush_range_no_fence(start, size);
#ifdef HAVE___BUILTIN_IA32_CLFLUSHOPT
/* clflushopt doesn't include an mfence like clflush */
if (util_get_cpu_caps()->has_clflushopt)
__builtin_ia32_mfence();
#endif
}
void
util_flush_inval_range_no_fence(void *start, size_t size)
{
if (size == 0)
return;
util_flush_range_no_fence(start, size);
/* Modern Atom CPUs (Baytrail+) have issues with clflush serialization,
* where mfence is not a sufficient synchronization barrier. We must
* double clflush the last cacheline. This guarantees it will be ordered
* after the preceding clflushes, and then the mfence guards against
* prefetches crossing the clflush boundary.
*
* See kernel commit 396f5d62d1a5fd99421855a08ffdef8edb43c76e
* ("drm: Restore double clflush on the last partial cacheline")
* and https://bugs.freedesktop.org/show_bug.cgi?id=92845.
*/
#ifdef HAVE___BUILTIN_IA32_CLFLUSHOPT
if (util_get_cpu_caps()->has_clflushopt) {
/* clflushopt doesn't include an mfence like clflush */
__builtin_ia32_mfence();
util_clflushopt_range((char *)start + size - 1, 1);
return;
}
#endif
__builtin_ia32_clflush((char *)start + size - 1);
}
void
util_flush_inval_range(void *start, size_t size)
{
util_flush_inval_range_no_fence(start, size);
__builtin_ia32_mfence();
}
void
util_pre_flush_fence(void)
{
__builtin_ia32_mfence();
}
void
util_post_flush_fence(void)
{
__builtin_ia32_mfence();
}
void
util_post_flush_inval_fence(void)
{
__builtin_ia32_mfence();
}
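
The clflush loops above round the start pointer down to a cache-line boundary
and then step by whole lines until they pass the end of the range, so partial
lines at either end are still flushed. A hedged, self-contained sketch of that
arithmetic (not part of the commit; the addresses are made up):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHELINE_SIZE 64
#define CACHELINE_MASK 63

/* Count how many 64-byte lines a [start, start + size) range touches,
 * mirroring the round-down-then-walk loop in util_clflush_range().
 */
static unsigned
lines_touched(uintptr_t start, size_t size)
{
   uintptr_t p = start & ~(uintptr_t)CACHELINE_MASK;
   uintptr_t end = start + size;
   unsigned n = 0;
   while (p < end) {
      n++;
      p += CACHELINE_SIZE;
   }
   return n;
}

int
main(void)
{
   /* 100 bytes starting 8 bytes into a line straddle two 64-byte lines. */
   printf("%u\n", lines_touched(0x1008, 100));
   return 0;
}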

src/util/cache_ops_x86_clflushopt.c

@@ -0,0 +1,46 @@
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "util/u_cpu_detect.h"
#ifndef HAVE___BUILTIN_IA32_CLFLUSHOPT
#error "Compiler doesn't support clflushopt!"
#endif
void util_clflushopt_range(void *start, size_t size);
void
util_clflushopt_range(void *start, size_t size)
{
const struct util_cpu_caps_t *cpu_caps = util_get_cpu_caps();
assert(cpu_caps->has_clflushopt);
assert(cpu_caps->cacheline > 0);
void *p = (void *) (((uintptr_t) start) &
~((uintptr_t)cpu_caps->cacheline - 1));
void *end = start + size;
while (p < end) {
__builtin_ia32_clflushopt(p);
p += cpu_caps->cacheline;
}
}

src/util/meson.build

@@ -25,6 +25,7 @@ files_mesa_util = files(
'box.h',
'build_id.c',
'build_id.h',
'cache_ops.h',
'cnd_monotonic.c',
'cnd_monotonic.h',
'compiler.h',
@@ -182,6 +183,26 @@ files_mesa_util = files(
'mesa_cache_db_multipart.h',
)
libmesa_util_links = []
if host_machine.cpu_family() == 'aarch64' and cc.get_id() != 'msvc'
files_mesa_util += files('cache_ops_aarch64.c')
elif host_machine.cpu_family() in ['x86', 'x86_64'] and cc.get_id() != 'msvc'
files_mesa_util += files('cache_ops_x86.c')
if with_clflushopt
libmesa_util_clflushopt = static_library(
'mesa_util_clflushopt',
['cache_ops_x86_clflushopt.c'],
include_directories : [inc_util],
c_args : [no_override_init_args] + clflushopt_args,
gnu_symbol_visibility : 'hidden',
)
libmesa_util_links += libmesa_util_clflushopt
endif
else
files_mesa_util += files('cache_ops_null.c')
endif
files_drirc = files('00-mesa-defaults.conf')
if with_amd_vk
@@ -304,13 +325,14 @@ libmesa_util_simd = static_library(
gnu_symbol_visibility : 'hidden',
build_by_default : false,
)
libmesa_util_links += libmesa_util_simd
_libmesa_util = static_library(
'mesa_util',
[files_mesa_util, files_debug_stack, format_srgb],
include_directories : [inc_util, include_directories('format')],
dependencies : deps_for_libmesa_util,
link_with: libmesa_util_links,
c_args : [c_msvc_compat_args],
gnu_symbol_visibility : 'hidden',
build_by_default : false