mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-06 13:48:06 +02:00
anv: Store batch buffers in a null-initialized VMA heap
The command streamer will blindly prefetch up to 4KiB ahead of a batch buffer, depending on the engine. To avoid page faults with the scratch page disabled, we can create a special VMA heap for batch buffers that has pages initialized with the null tile bit by default.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40149>
This commit is contained in:
parent
80e6b468f4
commit
5fb78a26db
5 changed files with 95 additions and 17 deletions
|
|
@ -215,6 +215,16 @@ enum intel_wa_steppings intel_device_info_wa_stepping(struct intel_device_info *
|
|||
uint32_t intel_device_info_get_max_slm_size(const struct intel_device_info *devinfo);
|
||||
uint32_t intel_device_info_get_max_preferred_slm_size(const struct intel_device_info *devinfo);
|
||||
|
||||
static inline unsigned
|
||||
intel_device_info_get_max_engine_prefetch(const struct intel_device_info *devinfo)
|
||||
{
|
||||
unsigned max_prefetch = 0;
|
||||
for (unsigned engine = INTEL_ENGINE_CLASS_RENDER;
|
||||
engine < ARRAY_SIZE(devinfo->engine_class_prefetch); engine++)
|
||||
max_prefetch = MAX2(max_prefetch, devinfo->engine_class_prefetch[engine]);
|
||||
return max_prefetch;
|
||||
}
|
||||
|
||||
/**
|
||||
* True if this device supports the Extended Bindless Surface Offset mode,
|
||||
* which offers 26-bit surface handles, instead of 20-bit. This effectively
|
||||
|
|
|
|||
|
|
@ -401,6 +401,25 @@ anv_device_finish_descriptors_view(struct anv_device *device)
|
|||
device->descriptor_view_state);
|
||||
}
|
||||
|
||||
static VkResult
|
||||
anv_device_bind_null_va(struct anv_device *device,
|
||||
struct anv_va_range *range,
|
||||
enum anv_vm_bind_op op)
|
||||
{
|
||||
struct anv_vm_bind bind = {
|
||||
.address = range->addr,
|
||||
.size = range->size,
|
||||
.op = op,
|
||||
};
|
||||
struct anv_sparse_submission submit = {
|
||||
.binds = &bind,
|
||||
.binds_len = 1,
|
||||
.binds_capacity = 1,
|
||||
};
|
||||
return device->kmd_backend->vm_bind(device, &submit,
|
||||
ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE);
|
||||
}
|
||||
|
||||
static VkResult
|
||||
anv_device_init_vma_heaps(struct anv_device *device)
|
||||
{
|
||||
|
|
@ -416,6 +435,16 @@ anv_device_init_vma_heaps(struct anv_device *device)
|
|||
device->physical->va.high_heap.addr,
|
||||
device->physical->va.high_heap.size);
|
||||
|
||||
/* Reduce the usable size of the null initialized heap by enough pages so
|
||||
* that no batch buffers get placed where the CS could end up prefetching
|
||||
* beyond the limit of the null pages.
|
||||
*/
|
||||
unsigned max_prefetch = intel_device_info_get_max_engine_prefetch(device->info);
|
||||
max_prefetch = align(max_prefetch, device->info->mem_alignment);
|
||||
util_vma_heap_init(&device->vma_null_initialized,
|
||||
device->physical->va.null_initialized_heap.addr,
|
||||
device->physical->va.null_initialized_heap.size - max_prefetch);
|
||||
|
||||
if (device->physical->indirect_descriptors) {
|
||||
util_vma_heap_init(&device->vma_desc,
|
||||
device->physical->va.indirect_descriptor_pool.addr,
|
||||
|
|
@ -442,6 +471,7 @@ anv_device_init_vma_heaps(struct anv_device *device)
|
|||
static void
|
||||
anv_device_finish_vma_heaps(struct anv_device *device)
|
||||
{
|
||||
util_vma_heap_finish(&device->vma_null_initialized);
|
||||
util_vma_heap_finish(&device->vma_trtt);
|
||||
util_vma_heap_finish(&device->vma_dynamic_visible);
|
||||
util_vma_heap_finish(&device->vma_desc);
|
||||
|
|
@ -843,12 +873,18 @@ VkResult anv_CreateDevice(
|
|||
goto fail_vmas;
|
||||
}
|
||||
|
||||
result = anv_device_bind_null_va(device,
|
||||
&device->physical->va.null_initialized_heap,
|
||||
ANV_VM_BIND);
|
||||
if (result != VK_SUCCESS)
|
||||
goto fail_mutex;
|
||||
|
||||
if (physical_device->instance->vk.trace_mode & VK_TRACE_MODE_RMV)
|
||||
anv_memory_trace_init(device);
|
||||
|
||||
result = anv_bo_cache_init(&device->bo_cache, device);
|
||||
if (result != VK_SUCCESS)
|
||||
goto fail_mutex;
|
||||
goto fail_null_vma_init;
|
||||
|
||||
if (!anv_slab_bo_init(device))
|
||||
goto fail_cache;
|
||||
|
|
@ -1281,6 +1317,10 @@ VkResult anv_CreateDevice(
|
|||
anv_slab_bo_deinit(device);
|
||||
fail_cache:
|
||||
anv_bo_cache_finish(&device->bo_cache);
|
||||
fail_null_vma_init:
|
||||
anv_device_bind_null_va(device,
|
||||
&device->physical->va.null_initialized_heap,
|
||||
ANV_VM_UNBIND);
|
||||
fail_mutex:
|
||||
pthread_mutex_destroy(&device->mutex);
|
||||
fail_vmas:
|
||||
|
|
@ -1485,6 +1525,9 @@ anv_vma_heap_for_flags(struct anv_device *device,
|
|||
if (alloc_flags & ANV_BO_ALLOC_DYNAMIC_VISIBLE_POOL)
|
||||
return &device->vma_dynamic_visible;
|
||||
|
||||
if (alloc_flags & ANV_BO_ALLOC_NULL_INITIALIZED_HEAP)
|
||||
return &device->vma_null_initialized;
|
||||
|
||||
return &device->vma_hi;
|
||||
}
|
||||
|
||||
|
|
@ -1539,7 +1582,8 @@ anv_vma_free(struct anv_device *device,
|
|||
vma_heap == &device->vma_hi ||
|
||||
vma_heap == &device->vma_desc ||
|
||||
vma_heap == &device->vma_dynamic_visible ||
|
||||
vma_heap == &device->vma_trtt);
|
||||
vma_heap == &device->vma_trtt ||
|
||||
vma_heap == &device->vma_null_initialized);
|
||||
|
||||
const uint64_t addr_48b = intel_48b_address(address);
|
||||
|
||||
|
|
|
|||
|
|
@ -474,6 +474,13 @@ enum anv_bo_alloc_flags {
|
|||
|
||||
/** Specifies that this bo is a slab parent */
|
||||
ANV_BO_ALLOC_SLAB_PARENT = (1 << 22),
|
||||
|
||||
/** Specifies that the bo should be allocated from a special heap that maps
|
||||
* all unused pages to null. Extra space is also reserved at the end of the
|
||||
* heap so that we can put batch buffers in it without the CS prefetching
|
||||
* beyond the limit of the mapped null pages (causing a page fault).
|
||||
*/
|
||||
ANV_BO_ALLOC_NULL_INITIALIZED_HEAP = (1 << 23),
|
||||
};
|
||||
|
||||
/** Specifies that the BO should be cached and coherent. */
|
||||
|
|
@ -492,12 +499,14 @@ enum anv_bo_alloc_flags {
|
|||
|
||||
#define ANV_BO_ALLOC_BATCH_BUFFER_FLAGS (ANV_BO_ALLOC_MAPPED | \
|
||||
ANV_BO_ALLOC_HOST_CACHED_COHERENT | \
|
||||
ANV_BO_ALLOC_CAPTURE)
|
||||
ANV_BO_ALLOC_CAPTURE | \
|
||||
ANV_BO_ALLOC_NULL_INITIALIZED_HEAP)
|
||||
|
||||
#define ANV_BO_ALLOC_BATCH_BUFFER_INTERNAL_FLAGS (ANV_BO_ALLOC_MAPPED | \
|
||||
ANV_BO_ALLOC_HOST_COHERENT | \
|
||||
ANV_BO_ALLOC_INTERNAL | \
|
||||
ANV_BO_ALLOC_CAPTURE)
|
||||
ANV_BO_ALLOC_CAPTURE | \
|
||||
ANV_BO_ALLOC_NULL_INITIALIZED_HEAP)
|
||||
|
||||
struct anv_bo {
|
||||
const char *name;
|
||||
|
|
@ -1688,6 +1697,10 @@ struct anv_physical_device {
|
|||
* Push descriptor with descriptor buffers
|
||||
*/
|
||||
struct anv_va_range push_descriptor_buffer_pool;
|
||||
/**
|
||||
* Null page initialized heap
|
||||
*/
|
||||
struct anv_va_range null_initialized_heap;
|
||||
/**
|
||||
* AUX-TT
|
||||
*/
|
||||
|
|
@ -2604,6 +2617,7 @@ struct anv_device {
|
|||
pthread_mutex_t vma_mutex;
|
||||
struct util_vma_heap vma_lo;
|
||||
struct util_vma_heap vma_hi;
|
||||
struct util_vma_heap vma_null_initialized;
|
||||
struct util_vma_heap vma_desc;
|
||||
struct util_vma_heap vma_dynamic_visible;
|
||||
struct util_vma_heap vma_trtt;
|
||||
|
|
|
|||
|
|
@ -62,6 +62,7 @@ anv_device_print_vas(struct anv_physical_device *device)
|
|||
PRINT_HEAP(dynamic_state_pool);
|
||||
PRINT_HEAP(dynamic_visible_pool);
|
||||
PRINT_HEAP(push_descriptor_buffer_pool);
|
||||
PRINT_HEAP(null_initialized_heap);
|
||||
PRINT_HEAP(high_heap);
|
||||
PRINT_HEAP(trtt);
|
||||
}
|
||||
|
|
@ -144,8 +145,9 @@ anv_physical_device_init_va_ranges(struct anv_physical_device *device)
|
|||
if (device->info.verx10 >= 125)
|
||||
address = va_add(&device->va.push_descriptor_buffer_pool, address, _1Gb - 4096);
|
||||
|
||||
address = align64(address, device->info.mem_alignment);
|
||||
address = align64(address, _1Gb);
|
||||
address = va_add(&device->va.aux_tt_pool, address, 2 * _1Gb);
|
||||
address = va_add(&device->va.null_initialized_heap, address, _1Gb * 8);
|
||||
|
||||
/* What's left to do for us is to set va.high_heap and va.trtt without
|
||||
* overlap, but there are a few things to be considered:
|
||||
|
|
|
|||
|
|
@ -390,13 +390,26 @@ xe_vm_bind_bo(struct anv_device *device, struct anv_bo *bo)
|
|||
static VkResult
|
||||
xe_vm_unbind_bo(struct anv_device *device, struct anv_bo *bo)
|
||||
{
|
||||
struct anv_vm_bind bind = {
|
||||
.bo = bo,
|
||||
.address = 0,
|
||||
.bo_offset = 0,
|
||||
.size = 0,
|
||||
.op = ANV_VM_UNBIND_ALL,
|
||||
};
|
||||
struct anv_vm_bind bind;
|
||||
if (bo->alloc_flags & ANV_BO_ALLOC_NULL_INITIALIZED_HEAP) {
|
||||
bind = (struct anv_vm_bind) {
|
||||
.address = bo->offset,
|
||||
.size = bo->actual_size,
|
||||
.op = ANV_VM_BIND,
|
||||
};
|
||||
} else if (bo->from_host_ptr) {
|
||||
bind = (struct anv_vm_bind) {
|
||||
.bo = bo,
|
||||
.address = bo->offset,
|
||||
.size = bo->actual_size,
|
||||
.op = ANV_VM_UNBIND,
|
||||
};
|
||||
} else {
|
||||
bind = (struct anv_vm_bind) {
|
||||
.bo = bo,
|
||||
.op = ANV_VM_UNBIND_ALL,
|
||||
};
|
||||
}
|
||||
struct anv_sparse_submission submit = {
|
||||
.queue = NULL,
|
||||
.binds = &bind,
|
||||
|
|
@ -405,11 +418,6 @@ xe_vm_unbind_bo(struct anv_device *device, struct anv_bo *bo)
|
|||
.wait_count = 0,
|
||||
.signal_count = 0,
|
||||
};
|
||||
if (bo->from_host_ptr) {
|
||||
bind.address = bo->offset;
|
||||
bind.size = bo->actual_size;
|
||||
bind.op = ANV_VM_UNBIND;
|
||||
}
|
||||
return xe_vm_bind_op(device, &submit,
|
||||
ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue