anv: Store batch buffers in a null-initialized VMA heap

The command streamer will blindly prefetch up to 4KiB ahead of a batch buffer
depending on the engine. To avoid page faults with the scratch page disabled,
we can create a special VMA heap for batch buffers that has pages initialized
with the null tile bit by default.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40149>
Author: Calder Young
Date:   2026-02-06 16:12:29 -08:00 (committed by Marge Bot)
Commit: 5fb78a26db (parent 80e6b468f4)

5 changed files with 95 additions and 17 deletions
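For context, the "null" mapping this commit relies on is a kernel-side page-table feature: on the Xe KMD a VA range can be bound with no backing BO, so reads return zero and writes are dropped instead of faulting. A minimal sketch of such a binding through the Xe uAPI follows (assuming an already-created VM; bind_null_range is a hypothetical helper, not part of this change, and the header path is Mesa's vendored copy):

#include <stdint.h>
#include <xf86drm.h>
#include "drm-uapi/xe_drm.h"

/* Map [addr, addr + range) as null pages: reads return zero, writes are
 * dropped. With DRM_XE_VM_BIND_FLAG_NULL the BO handle must be zero. */
static int
bind_null_range(int fd, uint32_t vm_id, uint64_t addr, uint64_t range)
{
   struct drm_xe_vm_bind args = {
      .vm_id = vm_id,
      .num_binds = 1,
      .bind = {
         .obj = 0,   /* must be zero for a null binding */
         .addr = addr,
         .range = range,
         .op = DRM_XE_VM_BIND_OP_MAP,
         .flags = DRM_XE_VM_BIND_FLAG_NULL,
      },
   };
   return drmIoctl(fd, DRM_IOCTL_XE_VM_BIND, &args);
}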

src/intel/dev/intel_device_info.h

@@ -215,6 +215,16 @@ enum intel_wa_steppings intel_device_info_wa_stepping(struct intel_device_info *
 uint32_t intel_device_info_get_max_slm_size(const struct intel_device_info *devinfo);
 uint32_t intel_device_info_get_max_preferred_slm_size(const struct intel_device_info *devinfo);
+
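+/* Largest number of bytes any engine's command streamer may prefetch past
+ * the end of a batch buffer, across all engine classes. */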
+static inline unsigned
+intel_device_info_get_max_engine_prefetch(const struct intel_device_info *devinfo)
+{
+   unsigned max_prefetch = 0;
+   for (unsigned engine = INTEL_ENGINE_CLASS_RENDER;
+        engine < ARRAY_SIZE(devinfo->engine_class_prefetch); engine++)
+      max_prefetch = MAX2(max_prefetch, devinfo->engine_class_prefetch[engine]);
+   return max_prefetch;
+}
+
 /**
  * True if this device supports the Extended Bindless Surface Offset mode,
  * which offers 26-bit surface handles, instead of 20-bit. This effectively

src/intel/vulkan/anv_device.c

@@ -401,6 +401,25 @@ anv_device_finish_descriptors_view(struct anv_device *device)
                          device->descriptor_view_state);
 }
 
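+/* Bind or unbind the whole null-initialized VA range. Binding it with no
+ * backing BO maps the range to null pages: reads return zero and writes
+ * are dropped, so a CS prefetch into this range cannot fault. */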
+static VkResult
+anv_device_bind_null_va(struct anv_device *device,
+                        struct anv_va_range *range,
+                        enum anv_vm_bind_op op)
+{
+   struct anv_vm_bind bind = {
+      .address = range->addr,
+      .size = range->size,
+      .op = op,
+   };
+   struct anv_sparse_submission submit = {
+      .binds = &bind,
+      .binds_len = 1,
+      .binds_capacity = 1,
+   };
+   return device->kmd_backend->vm_bind(device, &submit,
+                                       ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE);
+}
+
 static VkResult
 anv_device_init_vma_heaps(struct anv_device *device)
 {
@@ -416,6 +435,16 @@ anv_device_init_vma_heaps(struct anv_device *device)
                      device->physical->va.high_heap.addr,
                      device->physical->va.high_heap.size);
 
+   /* Reduce the usable size of the null initialized heap by enough pages so
+    * that no batch buffers get placed where the CS could end up prefetching
+    * beyond the limit of the null pages.
+    */
+   unsigned max_prefetch = intel_device_info_get_max_engine_prefetch(device->info);
+   max_prefetch = align(max_prefetch, device->info->mem_alignment);
+   util_vma_heap_init(&device->vma_null_initialized,
+                      device->physical->va.null_initialized_heap.addr,
+                      device->physical->va.null_initialized_heap.size - max_prefetch);
+
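+   /* Hypothetical numbers: a 4 KiB max prefetch rounded up to a 64 KiB
+    * mem_alignment leaves 8 GiB - 64 KiB allocatable, so even a batch
+    * buffer placed flush against the allocator's limit still has a full
+    * alignment's worth of null pages behind it. */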
    if (device->physical->indirect_descriptors) {
       util_vma_heap_init(&device->vma_desc,
                          device->physical->va.indirect_descriptor_pool.addr,
@@ -442,6 +471,7 @@ anv_device_init_vma_heaps(struct anv_device *device)
 static void
 anv_device_finish_vma_heaps(struct anv_device *device)
 {
+   util_vma_heap_finish(&device->vma_null_initialized);
    util_vma_heap_finish(&device->vma_trtt);
    util_vma_heap_finish(&device->vma_dynamic_visible);
    util_vma_heap_finish(&device->vma_desc);
@@ -843,12 +873,18 @@ VkResult anv_CreateDevice(
       goto fail_vmas;
    }
 
+   result = anv_device_bind_null_va(device,
+                                    &device->physical->va.null_initialized_heap,
+                                    ANV_VM_BIND);
+   if (result != VK_SUCCESS)
+      goto fail_mutex;
+
    if (physical_device->instance->vk.trace_mode & VK_TRACE_MODE_RMV)
       anv_memory_trace_init(device);
 
    result = anv_bo_cache_init(&device->bo_cache, device);
    if (result != VK_SUCCESS)
-      goto fail_mutex;
+      goto fail_null_vma_init;
 
    if (!anv_slab_bo_init(device))
       goto fail_cache;
@@ -1281,6 +1317,10 @@ VkResult anv_CreateDevice(
    anv_slab_bo_deinit(device);
 fail_cache:
    anv_bo_cache_finish(&device->bo_cache);
+ fail_null_vma_init:
+   anv_device_bind_null_va(device,
+                           &device->physical->va.null_initialized_heap,
+                           ANV_VM_UNBIND);
 fail_mutex:
    pthread_mutex_destroy(&device->mutex);
 fail_vmas:
@@ -1485,6 +1525,9 @@ anv_vma_heap_for_flags(struct anv_device *device,
    if (alloc_flags & ANV_BO_ALLOC_DYNAMIC_VISIBLE_POOL)
       return &device->vma_dynamic_visible;
 
+   if (alloc_flags & ANV_BO_ALLOC_NULL_INITIALIZED_HEAP)
+      return &device->vma_null_initialized;
+
    return &device->vma_hi;
 }
@@ -1539,7 +1582,8 @@ anv_vma_free(struct anv_device *device,
           vma_heap == &device->vma_hi ||
           vma_heap == &device->vma_desc ||
           vma_heap == &device->vma_dynamic_visible ||
-          vma_heap == &device->vma_trtt);
+          vma_heap == &device->vma_trtt ||
+          vma_heap == &device->vma_null_initialized);
 
    const uint64_t addr_48b = intel_48b_address(address);

src/intel/vulkan/anv_private.h

@@ -474,6 +474,13 @@ enum anv_bo_alloc_flags {
    /** Specifies that this bo is a slab parent */
    ANV_BO_ALLOC_SLAB_PARENT = (1 << 22),
 
+   /** Specifies that the bo should be allocated from a special heap whose
+    * unused pages all map to null. Extra space is also reserved at the end
+    * of the heap so that batch buffers can be placed in it without the CS
+    * prefetching beyond the limit of the mapped null pages (which would
+    * cause a page fault).
+    */
+   ANV_BO_ALLOC_NULL_INITIALIZED_HEAP = (1 << 23),
 };
 
 /** Specifies that the BO should be cached and coherent. */
@@ -492,12 +499,14 @@ enum anv_bo_alloc_flags {
 #define ANV_BO_ALLOC_BATCH_BUFFER_FLAGS (ANV_BO_ALLOC_MAPPED | \
                                          ANV_BO_ALLOC_HOST_CACHED_COHERENT | \
-                                         ANV_BO_ALLOC_CAPTURE)
+                                         ANV_BO_ALLOC_CAPTURE | \
+                                         ANV_BO_ALLOC_NULL_INITIALIZED_HEAP)
 
 #define ANV_BO_ALLOC_BATCH_BUFFER_INTERNAL_FLAGS (ANV_BO_ALLOC_MAPPED | \
                                                   ANV_BO_ALLOC_HOST_COHERENT | \
                                                   ANV_BO_ALLOC_INTERNAL | \
-                                                  ANV_BO_ALLOC_CAPTURE)
+                                                  ANV_BO_ALLOC_CAPTURE | \
+                                                  ANV_BO_ALLOC_NULL_INITIALIZED_HEAP)
 
 struct anv_bo {
    const char *name;
@@ -1688,6 +1697,10 @@ struct anv_physical_device {
        * Push descriptor with descriptor buffers
        */
       struct anv_va_range push_descriptor_buffer_pool;
+      /**
+       * Null page initialized heap
+       */
+      struct anv_va_range null_initialized_heap;
       /**
        * AUX-TT
        */
@@ -2604,6 +2617,7 @@ struct anv_device {
    pthread_mutex_t vma_mutex;
    struct util_vma_heap vma_lo;
    struct util_vma_heap vma_hi;
+   struct util_vma_heap vma_null_initialized;
    struct util_vma_heap vma_desc;
    struct util_vma_heap vma_dynamic_visible;
    struct util_vma_heap vma_trtt;

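With these flags in place, any batch-buffer allocation gets routed to the null-initialized heap by anv_vma_heap_for_flags(). A sketch of a hypothetical call site (anv_device_alloc_bo's exact signature may differ across branches; treat this as illustrative, not verbatim):

   struct anv_bo *bo;
   VkResult result =
      anv_device_alloc_bo(device, "batch buffer", 4096,
                          ANV_BO_ALLOC_BATCH_BUFFER_INTERNAL_FLAGS,
                          0 /* explicit_address */, &bo);
   /* bo->offset now lies inside va.null_initialized_heap, at least
    * max_prefetch bytes short of the heap's end. */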
src/intel/vulkan/anv_va.c

@@ -62,6 +62,7 @@ anv_device_print_vas(struct anv_physical_device *device)
    PRINT_HEAP(dynamic_state_pool);
    PRINT_HEAP(dynamic_visible_pool);
    PRINT_HEAP(push_descriptor_buffer_pool);
+   PRINT_HEAP(null_initialized_heap);
    PRINT_HEAP(high_heap);
    PRINT_HEAP(trtt);
 }
@@ -144,8 +145,9 @@ anv_physical_device_init_va_ranges(struct anv_physical_device *device)
    if (device->info.verx10 >= 125)
       address = va_add(&device->va.push_descriptor_buffer_pool, address, _1Gb - 4096);
 
-   address = align64(address, device->info.mem_alignment);
+   address = align64(address, _1Gb);
    address = va_add(&device->va.aux_tt_pool, address, 2 * _1Gb);
+   address = va_add(&device->va.null_initialized_heap, address, _1Gb * 8);
 
    /* What's left to do for us is to set va.high_heap and va.trtt without
     * overlap, but there are a few things to be considered:

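Putting the two halves together: anv_va.c reserves 8 GiB of VA for the heap, while anv_device_init_vma_heaps() above exposes slightly less of it to the allocator. A sketch with hypothetical numbers (a 4 KiB max prefetch and a 64 KiB mem_alignment):

   uint64_t reserved = 8ull * 1024 * 1024 * 1024; /* va.null_initialized_heap.size    */
   uint64_t pad      = 64 * 1024;                 /* align(max_prefetch, mem_alignment) */
   uint64_t usable   = reserved - pad;            /* size handed to util_vma_heap_init  */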
src/intel/vulkan/xe/anv_kmd_backend.c

@@ -390,13 +390,26 @@ xe_vm_bind_bo(struct anv_device *device, struct anv_bo *bo)
 static VkResult
 xe_vm_unbind_bo(struct anv_device *device, struct anv_bo *bo)
 {
-   struct anv_vm_bind bind = {
-      .bo = bo,
-      .address = 0,
-      .bo_offset = 0,
-      .size = 0,
-      .op = ANV_VM_UNBIND_ALL,
-   };
+   struct anv_vm_bind bind;
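+   /* BOs placed in the null-initialized heap are not simply unbound:
+    * re-binding the range with no backing BO restores the default
+    * null-page mapping, so later CS prefetches into this VA still hit
+    * valid (null) PTEs instead of faulting. */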
+   if (bo->alloc_flags & ANV_BO_ALLOC_NULL_INITIALIZED_HEAP) {
+      bind = (struct anv_vm_bind) {
+         .address = bo->offset,
+         .size = bo->actual_size,
+         .op = ANV_VM_BIND,
+      };
+   } else if (bo->from_host_ptr) {
+      bind = (struct anv_vm_bind) {
+         .bo = bo,
+         .address = bo->offset,
+         .size = bo->actual_size,
+         .op = ANV_VM_UNBIND,
+      };
+   } else {
+      bind = (struct anv_vm_bind) {
+         .bo = bo,
+         .op = ANV_VM_UNBIND_ALL,
+      };
+   }
    struct anv_sparse_submission submit = {
       .queue = NULL,
       .binds = &bind,
@@ -405,11 +418,6 @@ xe_vm_unbind_bo(struct anv_device *device, struct anv_bo *bo)
       .wait_count = 0,
       .signal_count = 0,
    };
-   if (bo->from_host_ptr) {
-      bind.address = bo->offset;
-      bind.size = bo->actual_size;
-      bind.op = ANV_VM_UNBIND;
-   }
 
    return xe_vm_bind_op(device, &submit,
                         ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE);
}