The x86 implementation was shamelessly stolen from intel_mem.c and the
aarch64 implementation was based on the code in Turnip.

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Tested-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37803>

/*
 * Copyright © 2025 Collabora Ltd. and Igalia S.L.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "cache_ops.h"

#include "util/macros.h"
#include "util/u_atomic.h"

static uint32_t
get_ctr_el0(void)
{
   uint32_t ctr_el0;
   __asm("mrs\t%x0, ctr_el0" : "=r"(ctr_el0));
   return ctr_el0;
}

static uint32_t
get_ctr_cwg(void)
{
   return (get_ctr_el0() >> 24) & 0xf;
}

size_t
util_cache_granularity(void)
{
   static uint32_t cached_size = 0;
   uint32_t size = p_atomic_read(&cached_size);
   if (likely(size > 0))
      return size;

   /* We use CTR_EL0.CWG as the cache granularity. According to Arm:
    *
    *    "CWG, [27:24]
    *
    *    Cache write-back granule. Log2 of the number of words of the maximum
    *    size of memory that can be overwritten as a result of the eviction of
    *    a cache entry that has had a memory location in it modified"
    *
    * On big.LITTLE CPUs, Linux will trap on fetching CTR_EL0 and take the
    * maximum across all CPU cores so this should really be the maximum that
    * drivers and clients can assume.
    */
   size = 4 << get_ctr_cwg();

   p_atomic_set(&cached_size, size);
   return size;
}
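
/* Worked example of the decode above: CWG is the log2 of a count of 4-byte
 * words, so a core reporting CWG = 4 has a 4 << 4 = 64-byte write-back
 * granule, which is the common case on aarch64.
 *
 * Illustrative sketch, not part of the upstream file: a caller could use
 * util_cache_granularity() to round CPU-visible sub-allocations up so that a
 * clean+invalidate on one range can never evict a neighbour's dirty lines.
 * The helper name is hypothetical.
 */
static inline size_t
example_align_to_cache_granule(size_t size)
{
   size_t granule = util_cache_granularity();

   /* granule is a power of two, so round up with a mask. */
   return (size + granule - 1) & ~(granule - 1);
}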

static size_t
get_dmin_line(void)
{
   static uint32_t cached_size = 0;
   uint32_t size = p_atomic_read(&cached_size);
   if (likely(size > 0))
      return size;

   /* For walking cache lines, we want to use CTR_EL0.DminLine as the step
    * size. According to Arm:
    *
    *    "DminLine, [19:16]
    *
    *    Log2 of the number of words in the smallest cache line of all the
    *    data and unified caches that the core controls"
    *
    * On big.LITTLE CPUs, Linux will trap on fetching CTR_EL0 and take the
    * minimum across all CPU cores so this should be safe no matter what core
    * we happen to be living on.
    */
   size = 4 << ((get_ctr_el0() >> 16) & 0xf);

   p_atomic_set(&cached_size, size);
   return size;
}

static void
flush_l1_cacheline(void *p)
{
   /* Clean data cache. */
   __asm volatile("dc cvac, %0" : : "r" (p) : "memory");
}

static void
flush_inval_l1_cacheline(void *p)
{
   /* Clean and Invalidate data cache, there is no separate Invalidate. */
   __asm volatile("dc civac, %0" : : "r" (p) : "memory");
}

static void
data_sync_bar(void)
{
   __asm volatile("dsb sy");
}

void
util_flush_range_no_fence(void *start, size_t size)
{
   uintptr_t l1_cacheline_size = get_dmin_line();
   char *p = (char *) (((uintptr_t) start) & ~(l1_cacheline_size - 1));
   char *end = ((char *) start) + size;

   while (p < end) {
      flush_l1_cacheline(p);
      p += l1_cacheline_size;
   }
}

void
util_flush_inval_range_no_fence(void *start, size_t size)
{
   uintptr_t l1_cacheline_size = get_dmin_line();
   char *p = (char *) (((uintptr_t) start) & ~(l1_cacheline_size - 1));
   char *end = ((char *) start) + size;

   while (p < end) {
      flush_inval_l1_cacheline(p);
      p += l1_cacheline_size;
   }
}

void
util_flush_range(void *p, size_t size)
{
   if (size == 0)
      return;

   util_pre_flush_fence();
   util_flush_range_no_fence(p, size);
   util_post_flush_fence();
}

void
util_flush_inval_range(void *p, size_t size)
{
   if (size == 0)
      return;

   util_pre_flush_fence();
   util_flush_inval_range_no_fence(p, size);
   util_post_flush_inval_fence();
}
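
/* Illustrative sketch, not part of the upstream file: the _no_fence variants
 * let a caller flushing many independent ranges batch them under a single
 * DSB instead of paying one barrier per range. The function and its
 * signature are hypothetical.
 */
static inline void
example_flush_many_ranges(void *const *ranges, const size_t *sizes,
                          unsigned count)
{
   util_pre_flush_fence();
   for (unsigned i = 0; i < count; i++)
      util_flush_range_no_fence(ranges[i], sizes[i]);

   /* One DSB covers all of the DC CVAC operations issued above. */
   util_post_flush_fence();
}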

void
util_pre_flush_fence(void)
{
   /* From the Arm® Architecture Reference Manual (revision L.b):
    *
    *    "All data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
    *    that specify an address: [...] Execute in program order relative to
    *    other data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
    *    that specify an address within the same cache line of minimum size,
    *    as indicated by CTR_EL0.DMinLine."
    *
    * So cache flush operations are properly ordered against memory accesses
    * and there's nothing we need to do to ensure that prior writes land
    * before the cache flush operations flush the data.
    *
    * In the case where this pre_flush_fence() is called before a flush/inval
    * used for a GPU -> CPU barrier, there is also nothing to do because it's
    * the responsibility of the GPU to ensure that all memory writes have
    * landed before we see this on the CPU side.
    */
}

void
util_post_flush_fence(void)
{
   /* From the Arm® Architecture Reference Manual (revision L.b):
    *
    *    "A cache maintenance instruction can complete at any time after it is
    *    executed, but is only guaranteed to be complete, and its effects
    *    visible to other observers, following a DSB instruction executed by
    *    the PE that executed the cache maintenance instruction."
    *
    * In order to ensure that the GPU sees data flushed by prior cache
    * flushes, we need to execute a DSB so that the flushes land.
    */
   data_sync_bar();
}

void
util_post_flush_inval_fence(void)
{
   /* From the Arm® Architecture Reference Manual (revision L.b):
    *
    *    "All data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
    *    that specify an address: [...] Execute in program order relative to
    *    other data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
    *    that specify an address within the same cache line of minimum size,
    *    as indicated by CTR_EL0.DMinLine."
    *
    * This seems to imply that memory accesses that happen after the cache
    * flush/invalidate operation would be properly ordered with respect to it.
    * However, the manual also says:
    *
    *    "A cache maintenance instruction can complete at any time after it is
    *    executed, but is only guaranteed to be complete, and its effects
    *    visible to other observers, following a DSB instruction executed by
    *    the PE that executed the cache maintenance instruction."
    *
    * In practice, it appears that the ordering guarantees only really apply
    * to the queue order in the data cache and not the order in which
    * operations complete. In other words, a read which is queued after the
    * invalidate may still use the stale cache line unless we explicitly
    * insert a DSB between them.
    */
   data_sync_bar();
}
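
/* Illustrative usage, hypothetical driver code: on a non-coherent mapping, a
 * plain flush publishes CPU writes to the GPU, while flush+invalidate is
 * used for the GPU -> CPU direction so the CPU does not read stale cache
 * lines in place of results the GPU wrote to memory.
 */
static inline void
example_publish_to_gpu(void *cpu_written, size_t size)
{
   util_flush_range(cpu_written, size);
}

static inline void
example_prepare_cpu_readback(void *gpu_written, size_t size)
{
   util_flush_inval_range(gpu_written, size);
}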