iris: Split system memory heap into cached-coherent and uncached heaps

On non-LLC systems, most system memory isn't coherent between the CPU
and GPU by default.  However, we can enable snooping or 1-way coherency
at a performance cost.  In the old days, we maintained one set of cache
buckets and toggled coherency as needed via I915_GEM_SET_CACHING.  But
in the modern uAPI (GEM_CREATE_EXT_SET_PAT) we have to select coherency
up front at BO creation time.  So this doesn't work out well anymore.
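
Roughly, the difference at the uAPI level looks like this (a sketch using the
kernel's i915_drm.h names, not code lifted from iris; fd, bo_handle, size, and
pat_index are placeholders):

   /* Needs <xf86drm.h> and <drm-uapi/i915_drm.h>. */

   /* Old: allocate first, then toggle coherency later as needed. */
   struct drm_i915_gem_caching caching = {
      .handle = bo_handle,
      .caching = I915_CACHING_CACHED,   /* or I915_CACHING_NONE */
   };
   drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_CACHING, &caching);

   /* New: the PAT index (and with it the coherency mode) is baked in at
    * creation time and cannot be changed afterwards.
    */
   struct drm_i915_gem_create_ext_set_pat set_pat = {
      .base.name = I915_GEM_CREATE_EXT_SET_PAT,
      .pat_index = pat_index,
   };
   struct drm_i915_gem_create_ext create = {
      .size = size,
      .extensions = (uintptr_t)&set_pat,
   };
   drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &create);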

This patch splits system memory into two distinct heaps:

   - IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT
   - IRIS_HEAP_SYSTEM_MEMORY_UNCACHED

The flags_to_heap() function will select a heap for a given allocation
based on the coherency/scanout requirements, as well as the hardware
configuration (LLC integrated, non-LLC integrated, or discrete card).
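
For reference, the selection boils down to (condensed from the flags_to_heap()
hunk below):

   Discrete:           SMEM or COHERENT -> SYSTEM_MEMORY_CACHED_COHERENT
                       LMEM, or SCANOUT without SHARED -> DEVICE_LOCAL
                       otherwise -> DEVICE_LOCAL_PREFERRED
   LLC integrated:     SCANOUT or SHARED -> SYSTEM_MEMORY_UNCACHED
                       otherwise -> SYSTEM_MEMORY_CACHED_COHERENT
   Non-LLC integrated: COHERENT -> SYSTEM_MEMORY_CACHED_COHERENT
                       otherwise -> SYSTEM_MEMORY_UNCACHED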

Once a heap is selected, it completely determines the cacheability and
coherency settings.  A given heap will always have the same mmap mode
and PAT index.  This enables us to simplify a lot of code.
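
Concretely, allocation now reduces to something like this (a sketch built from
the helpers this patch adds, not a verbatim excerpt; bufmgr, flags, and devinfo
stand for the usual bufmgr state):

   enum iris_heap heap = flags_to_heap(bufmgr, flags);

   /* Both of these are now pure functions of the heap: */
   enum iris_mmap_mode mmap_mode = heap_to_mmap_mode(bufmgr, heap);
   uint32_t pat_index = iris_heap_to_pat_entry(devinfo, heap)->index;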

Because each heap also has its own bucket cache, we no longer have to
disable bucket caching based on flags, cacheability, coherency, mmap
modes, and so on, as all BOs in each cache have matching settings.
This effectively enables bucket-caching for non-LLC systems.
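
The lookup itself just indexes the per-heap cache (as bucket_for_size() does
below); any BO it returns already has the right PAT index and mmap mode:

   struct iris_bucket_cache *cache = &bufmgr->bucket_cache[heap];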

Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25447>
Kenneth Graunke, 2023-11-20 17:42:22 -08:00 (committed by Marge Bot)
commit 21170a58d8, parent 81ebb6a10a
5 changed files with 94 additions and 124 deletions

@@ -110,8 +110,7 @@ i915_gem_create(struct iris_bufmgr *bufmgr,
    /* Set PAT param */
    struct drm_i915_gem_create_ext_set_pat set_pat_param = { 0 };
    if (devinfo->has_set_pat_uapi) {
-      set_pat_param.pat_index =
-         iris_bufmgr_get_pat_index_for_bo_flags(bufmgr, alloc_flags);
+      set_pat_param.pat_index = iris_heap_to_pat_entry(devinfo, heap)->index;
       intel_i915_gem_add_ext(&create.extensions,
                              I915_GEM_CREATE_EXT_SET_PAT,
                              &set_pat_param.base);

@@ -291,11 +291,6 @@ bucket_for_size(struct iris_bufmgr *bufmgr, uint64_t size,
    const struct intel_device_info *devinfo = &bufmgr->devinfo;
    struct iris_bucket_cache *cache = &bufmgr->bucket_cache[heap];
-   if (devinfo->has_set_pat_uapi &&
-       iris_bufmgr_get_pat_entry_for_bo_flags(bufmgr, flags) !=
-       iris_bufmgr_get_pat_entry_for_bo_flags(bufmgr, 0 /* alloc_flags */))
-      return NULL;
    if (devinfo->kmd_type == INTEL_KMD_TYPE_XE &&
        (flags & (BO_ALLOC_SHARED | BO_ALLOC_SCANOUT)))
       return NULL;
@@ -773,7 +768,8 @@ iris_slab_alloc(void *priv,
    }
    assert(slab_size != 0);
-   if (heap == IRIS_HEAP_SYSTEM_MEMORY)
+   if (heap == IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT ||
+       heap == IRIS_HEAP_SYSTEM_MEMORY_UNCACHED)
       flags = BO_ALLOC_SMEM;
    else if (heap == IRIS_HEAP_DEVICE_LOCAL)
       flags = BO_ALLOC_LMEM;
@@ -827,19 +823,41 @@ fail:
    return NULL;
 }
+/**
+ * Selects a heap for the given buffer allocation flags.
+ *
+ * This determines the cacheability, coherency, and mmap mode settings.
+ */
 static enum iris_heap
 flags_to_heap(struct iris_bufmgr *bufmgr, unsigned flags)
 {
+   const struct intel_device_info *devinfo = &bufmgr->devinfo;
    if (bufmgr->vram.size > 0) {
+      /* Discrete GPUs currently always snoop CPU caches. */
       if ((flags & BO_ALLOC_SMEM) || (flags & BO_ALLOC_COHERENT))
-         return IRIS_HEAP_SYSTEM_MEMORY;
+         return IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT;
       if ((flags & BO_ALLOC_LMEM) ||
           ((flags & BO_ALLOC_SCANOUT) && !(flags & BO_ALLOC_SHARED)))
          return IRIS_HEAP_DEVICE_LOCAL;
       return IRIS_HEAP_DEVICE_LOCAL_PREFERRED;
-   } else {
+   } else if (devinfo->has_llc) {
       assert(!(flags & BO_ALLOC_LMEM));
-      return IRIS_HEAP_SYSTEM_MEMORY;
+      if (flags & (BO_ALLOC_SCANOUT | BO_ALLOC_SHARED))
+         return IRIS_HEAP_SYSTEM_MEMORY_UNCACHED;
+      return IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT;
+   } else {
+      assert(!devinfo->has_llc);
+      assert(!(flags & BO_ALLOC_LMEM));
+      if (flags & BO_ALLOC_COHERENT)
+         return IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT;
+      return IRIS_HEAP_SYSTEM_MEMORY_UNCACHED;
    }
 }
@@ -1064,9 +1082,11 @@ alloc_fresh_bo(struct iris_bufmgr *bufmgr, uint64_t bo_size, unsigned flags)
    case IRIS_HEAP_DEVICE_LOCAL:
       regions[num_regions++] = bufmgr->vram.region;
       break;
-   case IRIS_HEAP_SYSTEM_MEMORY:
+   case IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT:
      regions[num_regions++] = bufmgr->sys.region;
       break;
+   case IRIS_HEAP_SYSTEM_MEMORY_UNCACHED:
+      /* not valid; discrete cards always enable snooping */
    case IRIS_HEAP_MAX:
       unreachable("invalid heap for BO");
    }
@@ -1090,34 +1110,28 @@ alloc_fresh_bo(struct iris_bufmgr *bufmgr, uint64_t bo_size, unsigned flags)
 const char *
 iris_heap_to_string[IRIS_HEAP_MAX] = {
-   [IRIS_HEAP_SYSTEM_MEMORY] = "system",
+   [IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT] = "system-cached-coherent",
+   [IRIS_HEAP_SYSTEM_MEMORY_UNCACHED] = "system-uncached",
    [IRIS_HEAP_DEVICE_LOCAL] = "local",
    [IRIS_HEAP_DEVICE_LOCAL_PREFERRED] = "local-preferred",
 };
 static enum iris_mmap_mode
-iris_bo_alloc_get_mmap_mode(struct iris_bufmgr *bufmgr, enum iris_heap heap,
-                            unsigned flags)
+heap_to_mmap_mode(struct iris_bufmgr *bufmgr, enum iris_heap heap)
 {
-   if (bufmgr->devinfo.kmd_type == INTEL_KMD_TYPE_XE)
-      return iris_xe_bo_flags_to_mmap_mode(bufmgr, heap, flags);
+   const struct intel_device_info *devinfo = &bufmgr->devinfo;
-   /* i915 */
-   const bool local = iris_heap_is_device_local(heap);
-   const bool is_coherent = bufmgr->devinfo.has_llc ||
-                            (bufmgr->vram.size > 0 && !local) ||
-                            (flags & BO_ALLOC_COHERENT);
-   const bool is_scanout = (flags & BO_ALLOC_SCANOUT) != 0;
-   enum iris_mmap_mode mmap_mode;
-   if (!intel_vram_all_mappable(&bufmgr->devinfo) && heap == IRIS_HEAP_DEVICE_LOCAL)
-      mmap_mode = IRIS_MMAP_NONE;
-   else if (!local && is_coherent && !is_scanout)
-      mmap_mode = IRIS_MMAP_WB;
-   else
-      mmap_mode = IRIS_MMAP_WC;
-   return mmap_mode;
+   switch (heap) {
+   case IRIS_HEAP_DEVICE_LOCAL:
+   case IRIS_HEAP_DEVICE_LOCAL_PREFERRED:
+      return intel_vram_all_mappable(devinfo) ? IRIS_MMAP_WC : IRIS_MMAP_NONE;
+   case IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT:
+      return IRIS_MMAP_WB;
+   case IRIS_HEAP_SYSTEM_MEMORY_UNCACHED:
+      return IRIS_MMAP_WC;
+   default:
+      unreachable("invalid heap");
+   }
 }
 struct iris_bo *
@@ -1147,7 +1161,7 @@ iris_bo_alloc(struct iris_bufmgr *bufmgr,
    */
    uint64_t bo_size =
       bucket ? bucket->size : MAX2(ALIGN(size, page_size), page_size);
-   enum iris_mmap_mode mmap_mode = iris_bo_alloc_get_mmap_mode(bufmgr, heap, flags);
+   enum iris_mmap_mode mmap_mode = heap_to_mmap_mode(bufmgr, heap);
    simple_mtx_lock(&bufmgr->lock);
@@ -1207,8 +1221,6 @@ iris_bo_alloc(struct iris_bufmgr *bufmgr,
        !bufmgr->devinfo.has_llc && bufmgr->devinfo.has_caching_uapi) {
       if (bufmgr->kmd_backend->bo_set_caching(bo, true) != 0)
          goto err_free;
-      bo->real.reusable = false;
    }
    DBG("bo_create: buf %d (%s) (%s memzone) (%s) %llub\n", bo->gem_handle,
@@ -1237,19 +1249,6 @@ iris_bo_close(int fd, uint32_t gem_handle)
    return intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
 }
-static enum iris_mmap_mode
-iris_bo_create_userptr_get_mmap_mode(struct iris_bufmgr *bufmgr)
-{
-   switch (bufmgr->devinfo.kmd_type) {
-   case INTEL_KMD_TYPE_I915:
-      return IRIS_MMAP_WB;
-   case INTEL_KMD_TYPE_XE:
-      return iris_xe_bo_flags_to_mmap_mode(bufmgr, IRIS_HEAP_SYSTEM_MEMORY, 0);
-   default:
-      return IRIS_MMAP_NONE;
-   }
-}
 struct iris_bo *
 iris_bo_create_userptr(struct iris_bufmgr *bufmgr, const char *name,
                        void *ptr, size_t size,
@@ -1286,8 +1285,8 @@ iris_bo_create_userptr(struct iris_bufmgr *bufmgr, const char *name,
    p_atomic_set(&bo->refcount, 1);
    bo->index = -1;
    bo->idle = true;
-   bo->real.heap = IRIS_HEAP_SYSTEM_MEMORY;
-   bo->real.mmap_mode = iris_bo_create_userptr_get_mmap_mode(bufmgr);
+   bo->real.heap = IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT;
+   bo->real.mmap_mode = heap_to_mmap_mode(bufmgr, bo->real.heap);
    bo->real.prime_fd = -1;
    if (!bufmgr->kmd_backend->gem_vm_bind(bo))
@@ -1387,6 +1386,8 @@ iris_bo_gem_create_from_name(struct iris_bufmgr *bufmgr,
    bo->real.prime_fd = -1;
    bo->real.reusable = false;
    bo->real.imported = true;
+   /* Xe KMD expects at least 1-way coherency for imports */
+   bo->real.heap = IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT;
    bo->real.mmap_mode = IRIS_MMAP_NONE;
    bo->real.kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED;
    if (INTEL_DEBUG(DEBUG_CAPTURE_ALL))
@@ -1906,6 +1907,8 @@ iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd,
    bo->name = "prime";
    bo->real.reusable = false;
    bo->real.imported = true;
+   /* Xe KMD expects at least 1-way coherency for imports */
+   bo->real.heap = IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT;
    bo->real.mmap_mode = IRIS_MMAP_NONE;
    bo->real.kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED;
    if (INTEL_DEBUG(DEBUG_CAPTURE_ALL))
@@ -2149,22 +2152,6 @@ init_cache_buckets(struct iris_bufmgr *bufmgr, enum iris_heap heap)
    }
 }
-static enum iris_mmap_mode
-iris_bo_alloc_aux_map_get_mmap_mode(struct iris_bufmgr *bufmgr,
-                                    enum iris_heap heap)
-{
-   switch (bufmgr->devinfo.kmd_type) {
-   case INTEL_KMD_TYPE_I915:
-      return iris_heap_is_device_local(heap) ||
-             bufmgr->devinfo.has_set_pat_uapi ?
-             IRIS_MMAP_WC : IRIS_MMAP_WB;
-   case INTEL_KMD_TYPE_XE:
-      return iris_xe_bo_flags_to_mmap_mode(bufmgr, heap, 0);
-   default:
-      return IRIS_MMAP_NONE;
-   }
-}
 static struct intel_buffer *
 intel_aux_map_buffer_alloc(void *driver_ctx, uint32_t size)
 {
@@ -2199,8 +2186,7 @@ intel_aux_map_buffer_alloc(void *driver_ctx, uint32_t size)
    bo->index = -1;
    bo->real.kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED |
                      EXEC_OBJECT_CAPTURE;
-   bo->real.mmap_mode = iris_bo_alloc_aux_map_get_mmap_mode(bufmgr,
-                                                            bo->real.heap);
+   bo->real.mmap_mode = heap_to_mmap_mode(bufmgr, bo->real.heap);
    bo->real.prime_fd = -1;
    buf->driver_bo = bo;
@@ -2547,17 +2533,17 @@ iris_bufmgr_use_global_vm_id(struct iris_bufmgr *bufmgr)
  * Return the pat entry based on the bo heap and allocation flags.
  */
 const struct intel_device_info_pat_entry *
-iris_bufmgr_get_pat_entry_for_bo_flags(const struct iris_bufmgr *bufmgr,
-                                       unsigned alloc_flags)
+iris_heap_to_pat_entry(const struct intel_device_info *devinfo,
+                       enum iris_heap heap)
 {
-   const struct intel_device_info *devinfo = &bufmgr->devinfo;
-   if (alloc_flags & BO_ALLOC_COHERENT)
+   switch (heap) {
+   case IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT:
       return &devinfo->pat.cached_coherent;
-   if (alloc_flags & (BO_ALLOC_SHARED | BO_ALLOC_SCANOUT))
-      return &devinfo->pat.scanout;
-   /* Iris don't have any clflush() calls so it can't use incoherent WB */
-   return &devinfo->pat.writecombining;
+   case IRIS_HEAP_SYSTEM_MEMORY_UNCACHED:
+      return &devinfo->pat.writecombining;
+   case IRIS_HEAP_DEVICE_LOCAL:
+   case IRIS_HEAP_DEVICE_LOCAL_PREFERRED:
+   default:
+      unreachable("invalid heap for platforms using PAT entries");
+   }
 }

@@ -156,9 +156,33 @@ enum iris_mmap_mode {
 };
 enum iris_heap {
-   IRIS_HEAP_SYSTEM_MEMORY,
+   /**
+    * System memory which is CPU-cached and (at least 1-way) coherent.
+    *
+    * This will use WB (write-back) CPU mappings.
+    *
+    * LLC systems and discrete cards (which enable snooping) will mostly use
+    * this heap.  Non-LLC systems will only use it when explicit coherency is
+    * required, as snooping is expensive there.
+    */
+   IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT,
+   /**
+    * System memory which is not CPU-cached.
+    *
+    * This will use WC (write-combining) CPU mappings, which have uncached
+    * read performance.  This can be used for scanout on integrated GPUs,
+    * where scanout is never coherent with CPU caches.  It will be used for
+    * most buffers on non-LLC platforms, where cache coherency is expensive.
+    */
+   IRIS_HEAP_SYSTEM_MEMORY_UNCACHED,
    /** Device-local memory (VRAM). Cannot be placed in system memory! */
    IRIS_HEAP_DEVICE_LOCAL,
    /** Device-local memory that may be evicted to system memory if needed. */
    IRIS_HEAP_DEVICE_LOCAL_PREFERRED,
    IRIS_HEAP_MAX,
 };
@@ -546,19 +570,12 @@ iris_bo_bump_seqno(struct iris_bo *bo, uint64_t seqno,
       prev_seqno = tmp;
 }
-const struct intel_device_info_pat_entry *
-iris_bufmgr_get_pat_entry_for_bo_flags(const struct iris_bufmgr *bufmgr,
-                                       unsigned alloc_flags);
 /**
- * Return the pat index based on the bo allocation flags.
+ * Return the PAT entry for the given heap.
  */
-static inline uint32_t
-iris_bufmgr_get_pat_index_for_bo_flags(const struct iris_bufmgr *bufmgr,
-                                       unsigned alloc_flags)
-{
-   return iris_bufmgr_get_pat_entry_for_bo_flags(bufmgr, alloc_flags)->index;
-}
+const struct intel_device_info_pat_entry *
+iris_heap_to_pat_entry(const struct intel_device_info *devinfo,
+                       enum iris_heap heap);
 enum iris_memory_zone iris_memzone_for_address(uint64_t address);

@@ -49,31 +49,3 @@ iris_xe_destroy_global_vm(struct iris_bufmgr *bufmgr)
    return intel_ioctl(iris_bufmgr_get_fd(bufmgr), DRM_IOCTL_XE_VM_DESTROY,
                       &destroy) == 0;
 }
-/*
- * Xe kmd has fixed caching modes for each heap, only scanout bos can change
- * it.
- */
-enum iris_mmap_mode
-iris_xe_bo_flags_to_mmap_mode(struct iris_bufmgr *bufmgr, enum iris_heap heap,
-                              unsigned flags)
-{
-   const struct intel_device_info *devinfo = iris_bufmgr_get_device_info(bufmgr);
-   /* TODO: might be different for MTL/platforms without LLC */
-   switch (heap) {
-   case IRIS_HEAP_DEVICE_LOCAL_PREFERRED:
-      /* TODO: Can vary on current placement?! */
-      return IRIS_MMAP_WC;
-   case IRIS_HEAP_DEVICE_LOCAL:
-      if (!intel_vram_all_mappable(devinfo))
-         return IRIS_MMAP_NONE;
-      return IRIS_MMAP_WC;
-   case IRIS_HEAP_SYSTEM_MEMORY:
-      if (flags & BO_ALLOC_SCANOUT)
-         return IRIS_MMAP_WC;
-      return IRIS_MMAP_WB;
-   default:
-      return IRIS_MMAP_NONE;
-   }
-}

@@ -30,7 +30,3 @@ enum iris_heap;
 bool iris_xe_init_global_vm(struct iris_bufmgr *bufmgr, uint32_t *vm_id);
 bool iris_xe_destroy_global_vm(struct iris_bufmgr *bufmgr);
-enum iris_mmap_mode
-iris_xe_bo_flags_to_mmap_mode(struct iris_bufmgr *bufmgr, enum iris_heap heap,
-                              unsigned flags);