diff --git a/src/intel/vulkan/anv_sparse.c b/src/intel/vulkan/anv_sparse.c index de31331b5a3..3a8e1f5152d 100644 --- a/src/intel/vulkan/anv_sparse.c +++ b/src/intel/vulkan/anv_sparse.c @@ -798,7 +798,76 @@ anv_sparse_bind_vm_bind(struct anv_device *device, if (!queue) assert(submit->wait_count == 0 && submit->signal_count == 0); - return device->kmd_backend->vm_bind(device, submit, ANV_VM_BIND_FLAG_NONE); + VkResult result = device->kmd_backend->vm_bind(device, submit, + ANV_VM_BIND_FLAG_NONE); + + if (result == VK_ERROR_OUT_OF_HOST_MEMORY) { + /* If we get this, the system is under memory pressure. First we + * manually wait for all our dependency syncobjs hoping that some memory + * will be released while we wait, then we try to issue each bind + * operation in a single ioctl as it requires less Kernel memory and so + * we may be able to move things forward, although slowly, while also + * waiting for each operation to complete before issuing the next. + * Performance isn't a concern at this point: we're just trying to move + * progress forward without crashing until whatever is eating too much + * memory goes away. + */ + + result = vk_sync_wait_many(&device->vk, submit->wait_count, + submit->waits, VK_SYNC_WAIT_COMPLETE, + INT64_MAX); + if (result != VK_SUCCESS) + return vk_queue_set_lost(&queue->vk, "vk_sync_wait_many failed"); + + struct vk_sync *sync; + result = vk_sync_create(&device->vk, + &device->physical->sync_syncobj_type, + VK_SYNC_IS_TIMELINE, 0 /* initial_value */, + &sync); + if (result != VK_SUCCESS) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + for (int b = 0; b < submit->binds_len; b++) { + struct vk_sync_signal sync_signal = { + .sync = sync, + .signal_value = b + 1, + }; + struct anv_sparse_submission s = { + .queue = submit->queue, + .binds = &submit->binds[b], + .binds_len = 1, + .binds_capacity = 1, + .wait_count = 0, + .signal_count = 1, + .waits = NULL, + .signals = &sync_signal, + }; + result = device->kmd_backend->vm_bind(device, &s, + ANV_VM_BIND_FLAG_NONE); + if (result != VK_SUCCESS) { + vk_sync_destroy(&device->vk, sync); + return vk_error(device, result); /* Well, at least we tried... */ + } + + result = vk_sync_wait(&device->vk, sync, sync_signal.signal_value, + VK_SYNC_WAIT_COMPLETE, UINT64_MAX); + if (result != VK_SUCCESS) { + vk_sync_destroy(&device->vk, sync); + return vk_queue_set_lost(&queue->vk, "vk_sync_wait failed"); + } + } + + vk_sync_destroy(&device->vk, sync); + + for (uint32_t i = 0; i < submit->signal_count; i++) { + struct vk_sync_signal *s = &submit->signals[i]; + result = vk_sync_signal(&device->vk, s->sync, s->signal_value); + if (result != VK_SUCCESS) + return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed"); + } + } + + return VK_SUCCESS; } VkResult diff --git a/src/intel/vulkan/xe/anv_kmd_backend.c b/src/intel/vulkan/xe/anv_kmd_backend.c index edbd2f53390..93d9d74eebe 100644 --- a/src/intel/vulkan/xe/anv_kmd_backend.c +++ b/src/intel/vulkan/xe/anv_kmd_backend.c @@ -242,10 +242,23 @@ xe_vm_bind_op(struct anv_device *device, if (signal_bind_timeline) intel_bind_timeline_bind_end(&device->bind_timeline); + /* The vm_bind ioctl can return a wide variety of error codes, but most of + * them shouldn't happen in the real world. Here we list the interesting + * error case: + * + * - EINVAL: shouldn't happen. This is most likely a bug in our driver. + * - ENOMEM: generic out-of-memory error. + * - ENOBUFS: an out-of-memory error that is related to having too many + * bind operations in the same ioctl, so the recommendation here is to + * try to issue fewer binds per ioctl (ideally 1). + * + * The xe.ko team has plans to differentiate between lack of device memory + * vs lack of host memory in the future. + */ if (ret) { assert(errno_ != EINVAL); - if (errno_ == ENOMEM) - result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + if (errno_ == ENOMEM || errno_ == ENOBUFS) + result = VK_ERROR_OUT_OF_HOST_MEMORY; else result = vk_device_set_lost(&device->vk, "vm_bind failed with errno %d", errno_);