/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "drm-uapi/v3d_drm.h"

#include "broadcom/clif/clif_dump.h"
#include "util/os_time.h"

#include <errno.h>
#include <time.h>

/* Dumps the command lists of a CL job in CLIF format to stderr.
 *
 * This is a no-op unless one of the V3D_DEBUG_CL, V3D_DEBUG_CL_NO_BIN or
 * V3D_DEBUG_CLIF debug flags is set. Every BO referenced by the job is mapped
 * and registered with the dumper; if any mapping fails the dump is abandoned.
 */
static void
v3dv_clif_dump(struct v3dv_device *device,
               struct v3dv_job *job,
               struct drm_v3d_submit_cl *submit)
{
   if (!(unlikely(V3D_DEBUG & (V3D_DEBUG_CL |
                               V3D_DEBUG_CL_NO_BIN |
                               V3D_DEBUG_CLIF))))
      return;

   struct clif_dump *clif = clif_dump_init(&device->devinfo,
                                           stderr,
                                           V3D_DEBUG & (V3D_DEBUG_CL |
                                                        V3D_DEBUG_CL_NO_BIN),
                                           V3D_DEBUG & V3D_DEBUG_CL_NO_BIN);

   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (void *)entry->key;
      char *name = ralloc_asprintf(NULL, "%s_0x%x",
                                   bo->name, bo->offset);

      bool ok = v3dv_bo_map(device, bo, bo->size);
      if (!ok) {
         fprintf(stderr, "failed to map BO for clif_dump.\n");
         ralloc_free(name);
         goto free_clif;
      }
      clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);

      ralloc_free(name);
   }

   clif_dump(clif, submit);

 free_clif:
   clif_dump_destroy(clif);
}

static VkResult
queue_submit_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 struct v3dv_submit_info_semaphores *sems_info,
                 pthread_t *wait_thread);

/* Waits for active CPU wait threads spawned before the current thread to
 * complete and submit all their GPU jobs.
 *
 * The wait is a polling loop: the queue mutex is dropped, we sleep for 0.5 ms
 * and then rescan the list of wait threads from the start.
 */
static void
cpu_queue_wait_idle(struct v3dv_queue *queue)
{
   const pthread_t this_thread = pthread_self();

retry:
   mtx_lock(&queue->mutex);
   list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
                       &queue->submit_wait_list, list_link) {
      for (uint32_t i = 0; i < info->wait_thread_count; i++) {
         if (info->wait_threads[i].finished)
            continue;

         /* Because we are testing this against the list of spawned threads
          * it will never match for the main thread, so when we call this from
          * the main thread we are effectively waiting for all active threads
          * to complete, and otherwise we are only waiting for work submitted
          * before the wait thread that called this (a wait thread should never
          * be waiting for work submitted after it).
          */
         if (info->wait_threads[i].thread == this_thread)
            goto done;

         /* Wait and try again */
         mtx_unlock(&queue->mutex);
         usleep(500); /* 0.5 ms */
         goto retry;
      }
   }

done:
   mtx_unlock(&queue->mutex);
}

/* Blocks until the last GPU jobs tracked in device->last_job_syncs have
 * completed.
 *
 * The syncobj handles are copied under the device mutex and then waited on
 * without holding it. With multisync support we wait for three syncobjs
 * (all of them must signal); otherwise only the V3DV_QUEUE_ANY one is used.
 *
 * Returns VK_SUCCESS, or the result of flagging the queue as lost if the
 * syncobj wait fails.
 */
static VkResult
gpu_queue_wait_idle(struct v3dv_queue *queue)
{
   struct v3dv_device *device = queue->device;
   int render_fd = device->pdevice->render_fd;
   struct v3dv_last_job_sync last_job_syncs;

   mtx_lock(&device->mutex);
   memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs));
   mtx_unlock(&device->mutex);

   if (device->pdevice->caps.multisync) {
      int ret = drmSyncobjWait(render_fd, (uint32_t *) &last_job_syncs.syncs,
                               3, INT64_MAX,
                               DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
      if (ret)
         return vk_queue_set_lost(&queue->vk, "Syncobj wait failed: %m");
   } else {
      int ret =
         drmSyncobjWait(render_fd, &last_job_syncs.syncs[V3DV_QUEUE_ANY],
                        1, INT64_MAX, 0, NULL);
      if (ret)
         return vk_queue_set_lost(&queue->vk, "Syncobj wait failed: %m");
   }

   return VK_SUCCESS;
}

/* vkQueueWaitIdle entry point: waits for CPU wait threads first and then for
 * outstanding GPU jobs. Returns VK_ERROR_DEVICE_LOST if the device has
 * already been flagged as lost.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueWaitIdle(VkQueue _queue)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);

   if (vk_device_is_lost(&queue->device->vk))
      return VK_ERROR_DEVICE_LOST;

   /* Check that we don't have any wait threads running in the CPU first,
    * as these can spawn new GPU jobs.
    */
   cpu_queue_wait_idle(queue);

   /* Check we don't have any GPU jobs running */
   return gpu_queue_wait_idle(queue);
}

/* Executes a CPU job that resets a range of queries in a query pool. */
static VkResult
handle_reset_query_cpu_job(struct v3dv_job *job)
{
   struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
   assert(info->pool);

   /* We are about to reset query counters so we need to make sure that
    * the GPU is not using them. The exception is timestamp queries, since
    * we handle those in the CPU.
    *
    * FIXME: we could avoid blocking the main thread for this if we use
    *        submission thread.
    */
   if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
      v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);

   v3dv_reset_query_pools(job->device, info->pool, info->first, info->count);

   return VK_SUCCESS;
}

/* Executes a CPU job that ends a range of queries: each query is flagged as
 * possibly available and anyone blocked on the device's query_ended condition
 * is woken up. All of this happens under the device's query mutex.
 */
static VkResult
handle_end_query_cpu_job(struct v3dv_job *job)
{
   mtx_lock(&job->device->query_mutex);

   struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];
      query->maybe_available = true;
   }

   cnd_broadcast(&job->device->query_ended);
   mtx_unlock(&job->device->query_mutex);

   return VK_SUCCESS;
}

/* Executes a CPU job that copies query pool results into a destination
 * buffer, mapping the buffer's BO first if it is not mapped yet.
 *
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the BO cannot be mapped.
 */
static VkResult
handle_copy_query_results_cpu_job(struct v3dv_job *job)
{
   struct v3dv_copy_query_results_cpu_job_info *info =
      &job->cpu.query_copy_results;

   assert(info->dst && info->dst->mem && info->dst->mem->bo);
   struct v3dv_bo *bo = info->dst->mem->bo;

   /* Map the entire dst buffer for the CPU copy if needed */
   assert(!bo->map || bo->map_size == bo->size);
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a
    * sync wait on the CPU for the corresponding GPU jobs to finish. We might
    * want to use a submission thread to avoid blocking on the main thread.
    */
   uint8_t *offset = ((uint8_t *) bo->map) +
                     info->offset + info->dst->mem_offset;
   v3dv_get_query_pool_results_cpu(job->device,
                                   info->pool,
                                   info->first,
                                   info->count,
                                   offset,
                                   info->stride,
                                   info->flags);

   return VK_SUCCESS;
}

/* Executes a CPU job that sets an event to the state recorded in the job,
 * after waiting for all previously submitted CPU and GPU work.
 */
static VkResult
handle_set_event_cpu_job(struct v3dv_job *job)
{
   /* From the Vulkan 1.0 spec:
    *
    *    "When vkCmdSetEvent is submitted to a queue, it defines an execution
    *     dependency on commands that were submitted before it, and defines an
    *     event signal operation which sets the event to the signaled state.
    *     The first synchronization scope includes every command previously
    *     submitted to the same queue, including those in the same command
    *     buffer and batch".
    *
    * So we should wait for all prior work to be completed before signaling
    * the event, this includes all active CPU wait threads spawned for any
    * command buffer submitted *before* this.
    *
    * FIXME: we could avoid blocking the main thread for this if we use a
    *        submission thread.
    */

   /* If we are calling this from a wait thread it will only wait for
    * wait threads spawned before it, otherwise it will wait for
    * all active threads to complete.
    */
   cpu_queue_wait_idle(&job->device->queue);

   VkResult result = gpu_queue_wait_idle(&job->device->queue);
   if (result != VK_SUCCESS)
      return result;

   struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set;
   p_atomic_set(&info->event->state, info->state);

   return VK_SUCCESS;
}

/* Duplicates an array of semaphore handles.
 *
 * On success *sems_dst points to a newly allocated copy (or is NULL when the
 * source is empty) and *sems_dst_count holds the number of entries. On
 * allocation failure *sems_dst_count is reset to 0 and
 * VK_ERROR_OUT_OF_HOST_MEMORY is returned. The caller owns the copy.
 */
static VkResult
copy_semaphores(struct v3dv_device *device,
                VkSemaphore *sems_src, uint32_t sems_src_count,
                VkSemaphore **sems_dst, uint32_t *sems_dst_count)
{
   *sems_dst_count = sems_src_count;

   if (*sems_dst_count == 0) {
      *sems_dst = NULL;
      return VK_SUCCESS;
   }

   *sems_dst = vk_alloc(&device->vk.alloc,
                        *sems_dst_count * sizeof(VkSemaphore), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   /* Check the allocation itself, not the out-parameter (which is never
    * NULL), otherwise an OOM would go unnoticed and the memcpy below would
    * write through a NULL pointer.
    */
   if (!*sems_dst) {
      *sems_dst_count = 0;
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   memcpy(*sems_dst, sems_src, *sems_dst_count * sizeof(VkSemaphore));

   return VK_SUCCESS;
}

/* Creates a heap copy of the wait and signal semaphore arrays in 'info'.
 *
 * Only the semaphore arrays and their counts are copied; the remaining
 * fields of the copy are zero-initialized. Returns NULL on allocation
 * failure, with any partial allocation released.
 */
static struct v3dv_submit_info_semaphores *
copy_semaphores_info(struct v3dv_device *device,
                     struct v3dv_submit_info_semaphores *info)
{
   VkResult result;
   struct v3dv_submit_info_semaphores *info_copy =
      vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_submit_info_semaphores),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!info_copy)
      return NULL;

   result = copy_semaphores(device, info->wait_sems, info->wait_sem_count,
                            &info_copy->wait_sems,
                            &info_copy->wait_sem_count);
   if (result != VK_SUCCESS)
      goto fail;

   result = copy_semaphores(device, info->signal_sems, info->signal_sem_count,
                            &info_copy->signal_sems,
                            &info_copy->signal_sem_count);
   if (result != VK_SUCCESS)
      goto fail;

   return info_copy;

fail:
   if (info_copy->wait_sem_count > 0)
      vk_free(&device->vk.alloc, info_copy->wait_sems);
   vk_free(&device->vk.alloc, info_copy);

   return NULL;
}

/* Allocates the argument block handed to a wait thread: the wait job plus a
 * private copy of the submission semaphores. Returns NULL on allocation
 * failure.
 */
static struct v3dv_wait_thread_info *
create_wait_thread_info(struct v3dv_job *job,
                        struct v3dv_submit_info_semaphores *sems_info)
{
   struct v3dv_wait_thread_info *info =
      vk_alloc(&job->device->vk.alloc, sizeof(*info), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!info)
      return NULL;

   info->job = job;
   info->sems_info = copy_semaphores_info(job->device, sems_info);
   if (!info->sems_info) {
      vk_free(&job->device->vk.alloc, info);
      return NULL;
   }

   return info;
}

/* Releases a wait thread argument block together with its semaphore copies. */
static void
free_wait_thread_info(struct v3dv_device *device,
                      struct v3dv_wait_thread_info *info)
{
   assert(info != NULL);

   if (info->sems_info->wait_sem_count > 0)
      vk_free(&device->vk.alloc, info->sems_info->wait_sems);

   if (info->sems_info->signal_sem_count > 0)
      vk_free(&device->vk.alloc, info->sems_info->signal_sems);

   vk_free(&device->vk.alloc, info->sems_info);
   vk_free(&device->vk.alloc, info);
}

/* Returns true if every event the wait job depends on has a non-zero state. */
static bool
check_wait_events_complete(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
   for (uint32_t i = 0; i < info->event_count; i++) {
      if (!p_atomic_read(&info->events[i]->state))
         return false;
   }
   return true;
}

/* Flags the given wait thread as finished in the queue's list of submit wait
 * infos. The thread is expected to be registered in that list.
 */
static void
wait_thread_finish(struct v3dv_queue *queue, pthread_t thread)
{
   mtx_lock(&queue->mutex);
   list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
                       &queue->submit_wait_list, list_link) {
      for (uint32_t i = 0; i < info->wait_thread_count; i++) {
         if (info->wait_threads[i].thread == thread) {
            info->wait_threads[i].finished = true;
            goto done;
         }
      }
   }

   unreachable(!"Failed to finish wait thread: not found");

done:
   mtx_unlock(&queue->mutex);
}

/* Entry point of a wait thread.
 *
 * Polls (1 ms interval) until the events of the wait job are signaled and
 * then submits the jobs that follow it in the same command buffer. The
 * argument block is owned by this thread and freed before returning.
 */
static void *
event_wait_thread_func(void *_info)
{
   struct v3dv_wait_thread_info *info = (struct v3dv_wait_thread_info *) _info;
   struct v3dv_job *job = info->job;
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   /* Wait for events to be signaled */
   const useconds_t wait_interval_ms = 1;
   while (!check_wait_events_complete(job))
      usleep(wait_interval_ms * 1000);

   /* Now continue submitting pending jobs for the same command buffer after
    * the wait job.
    */
   struct v3dv_queue *queue = &job->device->queue;
   list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next,
                            &job->cmd_buffer->jobs, list_link) {
      /* We can't signal semaphores from wait threads because in this case
       * we can't ensure job completion order any more (i.e. if the wait for
       * events is in the first command buffer of a batch then the last job
       * from the last command buffer in that batch can't signal). We always
       * need to signal from the master thread in that case, when we know we
       * are done submitting all jobs from all command buffers.
       */
      pjob->do_sem_signal = false;

      /* We don't want to spawn more than one wait thread per command buffer.
       * If this job also requires a wait for events, we will do the wait here.
       */
      VkResult result = queue_submit_job(queue, pjob, info->sems_info, NULL);
      if (result == VK_NOT_READY) {
         while (!check_wait_events_complete(pjob)) {
            usleep(wait_interval_ms * 1000);
         }
         result = VK_SUCCESS;
      }

      if (result != VK_SUCCESS) {
         fprintf(stderr, "Wait thread job execution failed.\n");
         break;
      }
   }

   wait_thread_finish(queue, pthread_self());
   free_wait_thread_info(job->device, info);

   return NULL;
}

/* Spawns the wait thread for a wait-events job.
 *
 * Returns VK_NOT_READY when the thread was created (the caller must stop
 * submitting the rest of the command buffer), or flags the queue as lost if
 * thread creation fails.
 */
static VkResult
spawn_event_wait_thread(struct v3dv_wait_thread_info *info,
                        pthread_t *wait_thread)
{
   assert(info->job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
   assert(info->job->cmd_buffer);
   assert(wait_thread != NULL);

   if (pthread_create(wait_thread, NULL, event_wait_thread_func, info))
      return vk_queue_set_lost(&info->job->device->queue.vk,
                               "Thread create failed: %m");

   return VK_NOT_READY;
}

/* Executes a wait-events CPU job.
 *
 * Returns VK_SUCCESS if all events are already signaled, VK_NOT_READY if the
 * rest of the command buffer has been (or already is being) deferred to a
 * wait thread, or an error code.
 */
static VkResult
handle_wait_events_cpu_job(struct v3dv_job *job,
                           struct v3dv_submit_info_semaphores *sems_info,
                           pthread_t *wait_thread)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   /* If all events are signaled then we are done and can continue submitting
    * the rest of the command buffer normally.
    */
   if (check_wait_events_complete(job))
      return VK_SUCCESS;

   /* Otherwise, we put the rest of the command buffer on a wait thread until
    * all events are signaled. We only spawn a new thread on the first
    * wait job we see for a command buffer, any additional wait jobs in the
    * same command buffer will run in that same wait thread and will get here
    * with a NULL wait_thread pointer.
    *
    * Also, whether we spawn a wait thread or not, we always return
    * VK_NOT_READY (unless an error happened), so we stop trying to submit
    * any jobs in the same command buffer after the wait job. The wait thread
    * will attempt to submit them after the wait completes.
    */
   if (!wait_thread)
      return VK_NOT_READY;

   /* As events can be signaled by the host, jobs after the event wait must
    * still wait for semaphores, if any. So, whenever we spawn a wait thread,
    * we keep a copy of the semaphores (info->sems_info) to be used when
    * submitting pending jobs in the wait thread context.
    */
   struct v3dv_wait_thread_info *info =
      create_wait_thread_info(job, sems_info);
   if (!info)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   return spawn_event_wait_thread(info, wait_thread);
}

/* Executes a CPU job that copies linear buffer data into a tiled image, one
 * layer at a time, after waiting for the queue to become idle.
 *
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY if either BO cannot be mapped.
 */
static VkResult
handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE);
   struct v3dv_copy_buffer_to_image_cpu_job_info *info =
      &job->cpu.copy_buffer_to_image;

   /* Wait for all GPU work to finish first, since we may be accessing
    * the BOs involved in the operation.
    */
   v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));

   /* Map BOs */
   struct v3dv_bo *dst_bo = info->image->mem->bo;
   assert(!dst_bo->map || dst_bo->map_size == dst_bo->size);
   if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   void *dst_ptr = dst_bo->map;

   struct v3dv_bo *src_bo = info->buffer->mem->bo;
   assert(!src_bo->map || src_bo->map_size == src_bo->size);
   if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   void *src_ptr = src_bo->map;

   const struct v3d_resource_slice *slice =
      &info->image->slices[info->mip_level];

   const struct pipe_box box = {
      info->image_offset.x, info->image_offset.y, info->base_layer,
      info->image_extent.width, info->image_extent.height, info->layer_count,
   };

   /* Copy each layer */
   for (uint32_t i = 0; i < info->layer_count; i++) {
      const uint32_t dst_offset =
         v3dv_layer_offset(info->image, info->mip_level, info->base_layer + i);
      const uint32_t src_offset =
         info->buffer->mem_offset + info->buffer_offset +
         info->buffer_layer_stride * i;
      v3d_store_tiled_image(
         dst_ptr + dst_offset, slice->stride,
         src_ptr + src_offset, info->buffer_stride,
         slice->tiling, info->image->cpp, slice->padded_height, &box);
   }

   return VK_SUCCESS;
}

/* Executes a timestamp query CPU job: waits for the queue to go idle, samples
 * CLOCK_MONOTONIC and stores the value (in nanoseconds) in the first query of
 * the range. All queries in the range are flagged as possibly available.
 */
static VkResult
handle_timestamp_query_cpu_job(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
   struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;

   /* Wait for completion of all work queued before the timestamp query */
   v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));

   mtx_lock(&job->device->query_mutex);

   /* Compute timestamp */
   struct timespec t;
   clock_gettime(CLOCK_MONOTONIC, &t);

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];
      query->maybe_available = true;
      if (i == 0)
         query->value = t.tv_sec * 1000000000ull + t.tv_nsec;
   }

   cnd_broadcast(&job->device->query_ended);
   mtx_unlock(&job->device->query_mutex);

   return VK_SUCCESS;
}

/* Executes the CPU half of an indirect compute dispatch: reads the workgroup
 * counts from the indirect buffer and, if they differ from the ones recorded
 * in the associated CSD job, rewrites that job. Nothing is rewritten when any
 * of the counts is zero.
 *
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the indirect buffer cannot be
 * mapped.
 */
static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                            struct v3dv_job *job,
                            struct v3dv_submit_info_semaphores *sems_info)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
   assert(info->csd_job);

   /* Make sure the GPU is no longer using the indirect buffer */
   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   v3dv_bo_wait(queue->device, info->buffer->mem->bo, PIPE_TIMEOUT_INFINITE);

   /* Map the indirect buffer and read the dispatch parameters */
   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   struct v3dv_bo *bo = info->buffer->mem->bo;
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   assert(bo->map);

   const uint32_t offset = info->buffer->mem_offset + info->offset;
   const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
   if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
      return VK_SUCCESS;

   if (memcmp(group_counts, info->csd_job->csd.wg_count,
              sizeof(info->csd_job->csd.wg_count)) != 0) {
      v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
   }

   return VK_SUCCESS;
}

/* Returns the syncobj currently backing a semaphore: the temporary one if the
 * semaphore has a temporary payload, the permanent one otherwise.
 */
static uint32_t
semaphore_get_sync(struct v3dv_semaphore *sem)
{
   if (!sem->has_temp)
      return sem->sync;

   assert(sem->temp_sync > 0);
   return sem->temp_sync;
}

/* Returns the syncobj currently backing a fence: the temporary one if the
 * fence has a temporary payload, the permanent one otherwise.
 */
static uint32_t
fence_get_sync(struct v3dv_fence *fence)
{
   if (!fence->has_temp)
      return fence->sync;

   assert(fence->temp_sync > 0);
   return fence->temp_sync;
}

/* Signals the given semaphores by importing into each of them a sync file
 * exported from the V3DV_QUEUE_ANY last-job syncobj.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if the export or any
 * import fails.
 */
static VkResult
process_semaphores_to_signal(struct v3dv_device *device,
                             uint32_t count, const VkSemaphore *sems,
                             bool is_master_thread)
{
   if (count == 0)
      return VK_SUCCESS;

   /* If multisync is supported, we are signalling semaphores in the last job
    * of the last command buffer and, therefore, we do not need to process any
    * semaphores here, unless we come from a wait thread, because in that case
    * we never signal.
    */
   if (device->pdevice->caps.multisync && is_master_thread)
      return VK_SUCCESS;

   int render_fd = device->pdevice->render_fd;

   /* Start from an invalid descriptor so that a failed export is detected
    * by the check below instead of reading an uninitialized value.
    */
   int fd = -1;
   mtx_lock(&device->mutex);
   drmSyncobjExportSyncFile(render_fd,
                            device->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                            &fd);
   mtx_unlock(&device->mutex);
   if (fd == -1)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < count; i++) {
      struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]);
      uint32_t sync = semaphore_get_sync(sem);
      int ret = drmSyncobjImportSyncFile(render_fd, sync, fd);
      if (ret) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         break;
      }
   }

   assert(fd >= 0);
   close(fd);

   return result;
}

static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
                      struct v3dv_submit_info_semaphores *sems_info,
                      bool do_sem_signal, bool serialize);

/* Arranges for the given fence (if any) to be signaled once the work
 * submitted so far completes.
 *
 * With multisync this submits a serialized noop job that signals the fence;
 * otherwise a sync file exported from the V3DV_QUEUE_ANY last-job syncobj is
 * imported into the fence.
 */
static VkResult
process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
{
   if (_fence == VK_NULL_HANDLE)
      return VK_SUCCESS;

   struct v3dv_fence *fence = v3dv_fence_from_handle(_fence);

   int render_fd = device->pdevice->render_fd;

   if (device->pdevice->caps.multisync) {
      struct v3dv_queue *queue = &device->queue;
      /* We signal the fence once all submitted command buffers have completed
       * execution. For this, we emit a noop job that waits on the completion
       * of all submitted jobs and signal the fence for this submission.
       * FIXME: In simpler cases (for instance, when all jobs were submitted to
       * the same queue), we can just import the last out sync produced into
       * the fence.
       */
      struct v3dv_submit_info_semaphores sems_info = {
         .wait_sem_count = 0,
         .wait_sems = NULL,
         .signal_sem_count = 0,
         .signal_sems = NULL,
         .fence = _fence,
      };

      return queue_submit_noop_job(queue, &sems_info, false, true);
   }

   /* Start from an invalid descriptor so that a failed export is detected
    * by the check below instead of reading an uninitialized value.
    */
   int fd = -1;
   mtx_lock(&device->mutex);
   drmSyncobjExportSyncFile(render_fd,
                            device->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                            &fd);
   mtx_unlock(&device->mutex);
   if (fd == -1)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint32_t sync = fence_get_sync(fence);
   int ret = drmSyncobjImportSyncFile(render_fd, sync, fd);

   assert(fd >= 0);
   close(fd);

   return ret ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS;
}

/* Frees the in/out syncobj arrays attached to a multisync extension. Both
 * pointers are zero in a zero-initialized extension.
 */
static void
multisync_free(struct v3dv_device *device,
               struct drm_v3d_multi_sync *ms)
{
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
}

/* Builds the array of syncobjs a job has to wait on before running.
 *
 * *count receives the number of entries. A NULL return with *count == 0 means
 * there is nothing to wait on; a NULL return with *count != 0 means the
 * allocation failed. The caller owns the returned array.
 */
static struct drm_v3d_sem *
set_in_syncs(struct v3dv_device *device,
             struct v3dv_job *job,
             enum v3dv_queue_type queue,
             uint32_t *count,
             struct v3dv_submit_info_semaphores *sems_info)
{
   uint32_t n_sems = 0;

   /* If this is the first job submitted to a given GPU queue in this cmd buf
    * batch, it has to wait on wait semaphores (if any) before running.
    */
   if (device->last_job_syncs.first[queue])
      n_sems = sems_info->wait_sem_count;

   /* If we don't need to wait on wait semaphores but the serialize flag is
    * set, this job waits for completion of all GPU jobs submitted in any
    * queue V3DV_QUEUE_(CL/TFU/CSD) before running.
    */
   *count = n_sems == 0 && job->serialize ? 3 : n_sems;

   if (!*count)
      return NULL;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   if (n_sems) {
      for (uint32_t i = 0; i < *count; i++) {
         struct v3dv_semaphore *sem =
            v3dv_semaphore_from_handle(sems_info->wait_sems[i]);
         syncs[i].handle = semaphore_get_sync(sem);

         /* From the Vulkan 1.0 spec:
          *
          *    "If the import is temporary, the implementation must restore
          *     the semaphore to its prior permanent state after submitting
          *     the next semaphore wait operation."
          *
          * We can't destroy the temporary sync until the kernel is done
          * with it, this is why we need to have this 'has_temp' flag instead
          * of checking temp_sync for 0 to know if we have a temporary
          * payload. The temporary sync will be destroyed if we import into
          * the semaphore again or if the semaphore is destroyed by the
          * client.
          */
         sem->has_temp = false;
      }
   } else {
      for (uint32_t i = 0; i < *count; i++)
         syncs[i].handle = device->last_job_syncs.syncs[i];
   }

   return syncs;
}

/* Builds the array of syncobjs a job signals on completion: the signal
 * semaphores (only if the job is flagged to signal them), the last-job
 * syncobj for the job's queue type and, if present, the submission fence.
 *
 * *count receives the number of entries (always at least 1). Returns NULL on
 * allocation failure. The caller owns the returned array.
 */
static struct drm_v3d_sem *
set_out_syncs(struct v3dv_device *device,
              struct v3dv_job *job,
              enum v3dv_queue_type queue,
              uint32_t *count,
              struct v3dv_submit_info_semaphores *sems_info)
{
   uint32_t n_sems = job->do_sem_signal ? sems_info->signal_sem_count : 0;

   /* We always signal the syncobj from `device->last_job_syncs` related to
    * this v3dv_queue_type to track the last job submitted to this queue.
    */
   (*count) = n_sems + 1;

   if (sems_info->fence)
      (*count)++;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   if (n_sems) {
      for (unsigned i = 0; i < n_sems; i++) {
         struct v3dv_semaphore *sem =
            v3dv_semaphore_from_handle(sems_info->signal_sems[i]);
         syncs[i].handle = semaphore_get_sync(sem);
      }
   }

   syncs[n_sems].handle = device->last_job_syncs.syncs[queue];

   if (sems_info->fence) {
      struct v3dv_fence *fence = v3dv_fence_from_handle(sems_info->fence);
      syncs[++n_sems].handle = fence_get_sync(fence);
   }

   return syncs;
}

/* Fills in the common header of a kernel submit extension. */
static void
set_ext(struct drm_v3d_extension *ext,
        struct drm_v3d_extension *next,
        uint32_t id,
        uintptr_t flags)
{
   ext->next = (uintptr_t)(void *)next;
   ext->id = id;
   ext->flags = flags;
}

/* This function sets the extension for multiple in/out syncobjs. When it is
 * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
 * Otherwise, the extension id is 0, which means an out-of-memory error.
 *
 * On success the in/out arrays stored in 'ms' are owned by the caller, which
 * releases them with multisync_free().
 */
static void
set_multisync(struct drm_v3d_multi_sync *ms,
              struct v3dv_submit_info_semaphores *sems_info,
              struct drm_v3d_extension *next,
              struct v3dv_device *device,
              struct v3dv_job *job,
              enum v3dv_queue_type queue_sync,
              enum v3d_queue wait_stage)
{
   uint32_t out_sync_count = 0, in_sync_count = 0;
   struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;

   in_syncs = set_in_syncs(device, job, queue_sync,
                           &in_sync_count, sems_info);
   if (!in_syncs && in_sync_count)
      goto fail;

   out_syncs = set_out_syncs(device, job, queue_sync,
                             &out_sync_count, sems_info);

   assert(out_sync_count > 0);

   if (!out_syncs)
      goto fail;

   set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
   ms->wait_stage = wait_stage;
   ms->out_sync_count = out_sync_count;
   ms->out_syncs = (uintptr_t)(void *)out_syncs;
   ms->in_sync_count = in_sync_count;
   ms->in_syncs = (uintptr_t)(void *)in_syncs;

   /* Later jobs on this queue type no longer need to wait on the wait
    * semaphores of the batch.
    */
   device->last_job_syncs.first[queue_sync] = false;

   return;

fail:
   if (in_syncs)
      vk_free(&device->vk.alloc, in_syncs);
   assert(!out_syncs);

   return;
}

/* Submits a CL (binning + render) job to the kernel.
 *
 * Returns VK_SUCCESS, VK_ERROR_OUT_OF_HOST_MEMORY if a host allocation
 * fails, or the result of flagging the queue as lost if the submit ioctl
 * fails.
 */
static VkResult
handle_cl_job(struct v3dv_queue *queue,
              struct v3dv_job *job,
              struct v3dv_submit_info_semaphores *sems_info)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_cl submit = { 0 };

   /* Sanity check: we should only flag a bcl sync on a job that needs to be
    * serialized.
    */
   assert(job->serialize || !job->needs_bcl_sync);

   /* We expect to have just one RCL per job which should fit in just one BO.
    * Our BCL, could chain multiple BOS together though.
    */
   assert(list_length(&job->rcl.bo_list) == 1);
   assert(list_length(&job->bcl.bo_list) >= 1);
   struct v3dv_bo *bcl_first_bo =
      list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
   submit.bcl_start = bcl_first_bo->offset;
   submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
   submit.rcl_start = job->rcl.bo->offset;
   submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);

   submit.qma = job->tile_alloc->offset;
   submit.qms = job->tile_alloc->size;
   submit.qts = job->tile_state->offset;

   submit.flags = 0;
   if (job->tmu_dirty_rcl)
      submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;

   submit.bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
   if (!bo_handles && submit.bo_handle_count > 0)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit.bo_handle_count);
   submit.bo_handles = (uintptr_t)(void *)bo_handles;

   /* We need a binning sync if we are waiting on a semaphore or if the job
    * comes after a pipeline barrier that involves geometry stages
    * (needs_bcl_sync).
    *
    * We need a render sync if the job doesn't need a binning sync but has
    * still been flagged for serialization. It should be noted that RCL jobs
    * don't start until the previous RCL job has finished so we don't really
    * need to add a fence for those, however, we might need to wait on a CSD or
    * TFU job, which are not automatically serialized with CL jobs.
    *
    * FIXME: see if we can do better and avoid bcl syncs for any jobs in the
    * command buffer after the first job where we should be able to track bcl
    * dependencies strictly through barriers.
    */
   const bool needs_bcl_sync =
      sems_info->wait_sem_count > 0 || job->needs_bcl_sync;
   const bool needs_rcl_sync = job->serialize && !needs_bcl_sync;

   mtx_lock(&queue->device->mutex);

   /* Replace single semaphore settings whenever our kernel-driver supports
    * multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
      set_multisync(&ms, sems_info, NULL, device, job,
                    V3DV_QUEUE_CL, wait_stage);
      if (!ms.base.id) {
         mtx_unlock(&queue->device->mutex);
         /* Don't leak the handle array on the error path */
         free(bo_handles);
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      submit.in_sync_rcl = 0;
      submit.in_sync_bcl = 0;
      submit.out_sync = 0;
   } else {
      uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0;
      submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0;
      submit.out_sync = last_job_sync;
   }

   v3dv_clif_dump(device, job, &submit);

   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CL, &submit);
   mtx_unlock(&queue->device->mutex);

   /* Only report the first failure to avoid flooding stderr */
   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);
   multisync_free(device, &ms);

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");

   return VK_SUCCESS;
}

/* Submits a TFU job to the kernel.
 *
 * Returns VK_SUCCESS, VK_ERROR_OUT_OF_HOST_MEMORY if the multisync setup
 * fails, or the result of flagging the queue as lost if the submit ioctl
 * fails.
 */
static VkResult
handle_tfu_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               struct v3dv_submit_info_semaphores *sems_info)
{
   struct v3dv_device *device = queue->device;

   const bool needs_sync = sems_info->wait_sem_count || job->serialize;

   mtx_lock(&device->mutex);

   /* Replace single semaphore settings whenever our kernel-driver supports
    * multiple semaphore extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      set_multisync(&ms, sems_info, NULL, device, job,
                    V3DV_QUEUE_TFU, V3D_TFU);
      if (!ms.base.id) {
         mtx_unlock(&device->mutex);
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
      job->tfu.extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      job->tfu.in_sync = 0;
      job->tfu.out_sync = 0;
   } else {
      uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      job->tfu.in_sync = needs_sync ? last_job_sync : 0;
      job->tfu.out_sync = last_job_sync;
   }

   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
   mtx_unlock(&device->mutex);

   multisync_free(device, &ms);

   if (ret != 0)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");

   return VK_SUCCESS;
}

/* Submits a CSD (compute) job to the kernel.
 *
 * Returns VK_SUCCESS, VK_ERROR_OUT_OF_HOST_MEMORY if a host allocation
 * fails, or the result of flagging the queue as lost if the submit ioctl
 * fails.
 */
static VkResult
handle_csd_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               struct v3dv_submit_info_semaphores *sems_info)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_csd *submit = &job->csd.submit;

   submit->bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) *
                          MAX2(4, submit->bo_handle_count * 2));
   if (!bo_handles)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit->bo_handle_count);
   submit->bo_handles = (uintptr_t)(void *)bo_handles;

   const bool needs_sync = sems_info->wait_sem_count || job->serialize;

   mtx_lock(&queue->device->mutex);

   /* Replace single semaphore settings whenever our kernel-driver supports
    * multiple semaphore extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      set_multisync(&ms, sems_info, NULL, device, job,
                    V3DV_QUEUE_CSD, V3D_CSD);
      if (!ms.base.id) {
         mtx_unlock(&queue->device->mutex);
         /* Don't leak the handle array on the error path */
         free(bo_handles);
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit->extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      submit->in_sync = 0;
      submit->out_sync = 0;
   } else {
      uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      submit->in_sync = needs_sync ? last_job_sync : 0;
      submit->out_sync = last_job_sync;
   }

   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CSD, submit);
   mtx_unlock(&queue->device->mutex);

   /* Only report the first failure to avoid flooding stderr */
   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);
   multisync_free(device, &ms);

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");

   return VK_SUCCESS;
}

/* Dispatches a job to the handler for its type.
 *
 * 'wait_thread' is only consumed by wait-events jobs. Returns the handler's
 * result; VK_NOT_READY means the remainder of the command buffer has been
 * deferred to a wait thread.
 */
static VkResult
queue_submit_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 struct v3dv_submit_info_semaphores *sems_info,
                 pthread_t *wait_thread)
{
   assert(job);

   /* CPU jobs typically execute explicit waits before they are processed. For
    * example, a query reset CPU job will explicitly wait for the queries
    * being unused before proceeding, etc. However, if we have any wait
    * semaphores, we need to honour that too for the first CPU job we process
    * in the command buffer batch. We do that by waiting for idle to ensure
    * that any previous work has been completed, at which point any wait
    * semaphores must be signalled, and we never need to do this again for the
    * same batch.
    *
    * There is a corner case here when the semaphore has been imported from
    * another instance/process. In that scenario, the Vulkan spec still requires
    * that a signaling operation has been submitted before this semaphore wait
    * but our wait for idle checks won't know about that submission (since they
    * are based on the last jobs sent from our instance). To fix that we submit
    * a noop job to "consume" the semaphores and then we wait for idle, which
    * will ensure that our CPU job waits for the semaphores to be signaled even
    * if they are signaled from another instance or process.
    */
   if (!v3dv_job_type_is_gpu(job) && sems_info->wait_sem_count) {
      queue_submit_noop_job(queue, sems_info, false, false);
      v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));
#ifdef DEBUG
      /* Loop through wait sems and check they are all signalled */
      for (uint32_t i = 0; i < sems_info->wait_sem_count; i++) {
         int render_fd = queue->device->pdevice->render_fd;
         struct v3dv_semaphore *sem =
            v3dv_semaphore_from_handle(sems_info->wait_sems[i]);
         uint32_t sem_sync = semaphore_get_sync(sem);
         int ret = drmSyncobjWait(render_fd, &sem_sync, 1, 0, 0, NULL);
         assert(ret == 0);
      }
#endif
      sems_info->wait_sem_count = 0;
   }

   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
      return handle_cl_job(queue, job, sems_info);
   case V3DV_JOB_TYPE_GPU_TFU:
      return handle_tfu_job(queue, job, sems_info);
   case V3DV_JOB_TYPE_GPU_CSD:
      return handle_csd_job(queue, job, sems_info);
   case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
      return handle_reset_query_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_END_QUERY:
      return handle_end_query_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
      return handle_copy_query_results_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_SET_EVENT:
      return handle_set_event_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
      return handle_wait_events_cpu_job(job, sems_info, wait_thread);
   case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
      return handle_copy_buffer_to_image_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
      return handle_csd_indirect_cpu_job(queue, job, sems_info);
   case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
      return handle_timestamp_query_cpu_job(job);
   default:
      unreachable("Unhandled job type");
   }
}

/* Allocates and records the queue's noop job (a CL job). Returns
 * VK_ERROR_OUT_OF_HOST_MEMORY if the job cannot be allocated.
 */
static VkResult
queue_create_noop_job(struct v3dv_queue *queue)
{
   struct v3dv_device *device = queue->device;
   queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!queue->noop_job)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);

   v3dv_X(device, job_emit_noop)(queue->noop_job);

   return VK_SUCCESS;
}

/* Submits the queue's noop job, creating it on first use.
 *
 * Nothing is submitted when the job would have no effect, i.e. when it
 * neither signals semaphores, nor serializes, nor has semaphores to wait on.
 */
static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
                      struct v3dv_submit_info_semaphores *sems_info,
                      bool do_sem_signal, bool serialize)
{
   if (!do_sem_signal && !serialize && !sems_info->wait_sem_count)
      return VK_SUCCESS;

   /* We need to protect noop_job against concurrent access. While
    * the client must externally synchronize queue submissions, we
    * may spawn threads that can submit noop jobs themselves.
    */
   mtx_lock(&queue->noop_mutex);
   if (!queue->noop_job) {
      VkResult result = queue_create_noop_job(queue);
      if (result != VK_SUCCESS) {
         mtx_unlock(&queue->noop_mutex);
         return result;
      }
   }
   queue->noop_job->do_sem_signal = do_sem_signal;
   queue->noop_job->serialize = serialize;

   VkResult result = queue_submit_job(queue, queue->noop_job, sems_info, NULL);

   mtx_unlock(&queue->noop_mutex);
   return result;
}

/* This function takes a job type and returns True if we have
 * previously submitted any jobs for the same command buffer batch
 * to a queue different to the one for this job type.
*/ static bool cmd_buffer_batch_is_multi_queue(struct v3dv_device *device, enum v3dv_job_type job_type) { enum v3dv_queue_type queue_type = V3DV_QUEUE_ANY; struct v3dv_last_job_sync last_job_syncs; mtx_lock(&device->mutex); memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs)); mtx_unlock(&device->mutex); switch (job_type) { case V3DV_JOB_TYPE_GPU_CL: case V3DV_JOB_TYPE_GPU_CL_SECONDARY: queue_type = V3DV_QUEUE_CL; break; case V3DV_JOB_TYPE_GPU_TFU: queue_type = V3DV_QUEUE_TFU; break; case V3DV_JOB_TYPE_GPU_CSD: queue_type = V3DV_QUEUE_CSD; break; default: unreachable("Queue type is undefined"); break; } for (int i = 0; i < V3DV_QUEUE_ANY; i++) { if (i != queue_type && !last_job_syncs.first[i]) { return true; } } return false; } static VkResult queue_submit_cmd_buffer(struct v3dv_queue *queue, struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_submit_info_semaphores *sems_info, bool is_last_cmd_buffer, pthread_t *wait_thread) { struct v3dv_job *last; bool do_sem_signal = is_last_cmd_buffer && sems_info->signal_sem_count > 0; assert(cmd_buffer); assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE); if (list_is_empty(&cmd_buffer->jobs)) return queue_submit_noop_job(queue, sems_info, do_sem_signal, false); /* When we are in the last cmd buffer and there are semaphores to signal, * we process semaphores in the last job, following these conditions: * - CPU-job: we can't signal until all GPU work has completed, so we * submit a serialized noop GPU job to handle signaling when all on-going * GPU work on all queues has completed. * - GPU-job: can signal semaphores only if we have not submitted jobs to * a queue other than the queue of this job. Otherwise, we submit a * serialized noop job to handle signaling. 
*/ if (do_sem_signal) { last = list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link); if (v3dv_job_type_is_gpu(last)) last->do_sem_signal = true; } list_for_each_entry_safe(struct v3dv_job, job, &cmd_buffer->jobs, list_link) { if (job->do_sem_signal && cmd_buffer_batch_is_multi_queue(queue->device, job->type)) job->do_sem_signal = false; VkResult result = queue_submit_job(queue, job, sems_info, wait_thread); if (result != VK_SUCCESS) return result; } /* If we are in the last cmd buffer batch, but the last job cannot handle * signal semaphores, we emit a serialized noop_job for signalling. */ if (do_sem_signal && !(last && last->do_sem_signal)) return queue_submit_noop_job(queue, sems_info, true, true); return VK_SUCCESS; } static void add_wait_thread_to_list(struct v3dv_device *device, pthread_t thread, struct v3dv_queue_submit_wait_info **wait_info) { /* If this is the first time we spawn a wait thread for this queue * submission create a v3dv_queue_submit_wait_info to track this and * any other threads in the same submission and add it to the global list * in the queue. */ if (*wait_info == NULL) { *wait_info = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_queue_submit_wait_info), 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); (*wait_info)->device = device; } /* And add the thread to the list of wait threads for this submission */ const uint32_t thread_idx = (*wait_info)->wait_thread_count; assert(thread_idx < 16); (*wait_info)->wait_threads[thread_idx].thread = thread; (*wait_info)->wait_threads[thread_idx].finished = false; (*wait_info)->wait_thread_count++; } static void add_signal_semaphores_to_wait_list(struct v3dv_device *device, const VkSubmitInfo *pSubmit, struct v3dv_queue_submit_wait_info *wait_info) { assert(wait_info); if (pSubmit->signalSemaphoreCount == 0) return; /* Otherwise, we put all the semaphores in a list and we signal all of them * together from the submit master thread when the last wait thread in the * submit completes. 
*/

   /* Check the size of the current semaphore list */
   const uint32_t prev_count = wait_info->signal_semaphore_count;
   const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore);
   VkSemaphore *prev_list = wait_info->signal_semaphores;

   /* Resize the list to hold the additional semaphores */
   const uint32_t extra_alloc_size =
      pSubmit->signalSemaphoreCount * sizeof(VkSemaphore);
   wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount;
   /* NOTE(review): the result of vk_alloc() is not checked before the
    * memcpy calls below write through it.
    */
   wait_info->signal_semaphores =
      vk_alloc(&device->vk.alloc, prev_alloc_size + extra_alloc_size, 8,
               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   /* Copy the old list to the new allocation and free the old list */
   if (prev_count > 0) {
      memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size);
      vk_free(&device->vk.alloc, prev_list);
   }

   /* Add the new semaphores to the list */
   memcpy(wait_info->signal_semaphores + prev_count,
          pSubmit->pSignalSemaphores, extra_alloc_size);
}

/* Submits one VkSubmitInfo batch: all of its command buffers, or a single
 * no-op job when it has none.
 *
 * Returns VK_NOT_READY when at least one wait thread was spawned for the
 * batch; in that case the batch's signal semaphores are appended to
 * *wait_info instead of being processed here. Otherwise returns the result
 * of process_semaphores_to_signal(), or the first submission error.
 */
static VkResult
queue_submit_cmd_buffer_batch(struct v3dv_queue *queue,
                              const VkSubmitInfo *pSubmit,
                              struct v3dv_queue_submit_wait_info **wait_info)
{
   VkResult result = VK_SUCCESS;
   bool has_wait_threads = false;

   /* Wrap wait semaphores info from VkSubmitInfo to use it whenever we need
    * the data to submit all jobs in the same command buffer batch.
    */
   struct v3dv_submit_info_semaphores sems_info = {
      .wait_sem_count = pSubmit->waitSemaphoreCount,
      .wait_sems = (VkSemaphore *) pSubmit->pWaitSemaphores,
      .signal_sem_count = pSubmit->signalSemaphoreCount,
      .signal_sems = (VkSemaphore *) pSubmit->pSignalSemaphores,
      .fence = 0,
   };

   /* In the beginning of a cmd buffer batch, we set all last_job_syncs as
    * first. It helps to determine wait semaphores conditions.
    */
   /* NOTE(review): this write is done without taking device->mutex, unlike
    * the read in cmd_buffer_batch_is_multi_queue(); confirm that is intended.
    */
   for (unsigned i = 0; i < V3DV_QUEUE_COUNT; i++)
      queue->device->last_job_syncs.first[i] = true;

   /* Even if we don't have any actual work to submit we still need to wait
    * on the wait semaphores and signal the signal semaphores and fence, so
    * in this scenario we just submit a trivial no-op job so we don't have
    * to do anything special, it should not be a common case anyway.
    */
   if (pSubmit->commandBufferCount == 0) {
      result = queue_submit_noop_job(queue, &sems_info,
                                     sems_info.signal_sem_count > 0, false);
   } else {
      const uint32_t last_cmd_buffer_idx = pSubmit->commandBufferCount - 1;
      for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) {
         /* Written by queue_submit_cmd_buffer() only when it reports
          * VK_NOT_READY (presumably; see the comment below).
          */
         pthread_t wait_thread;
         struct v3dv_cmd_buffer *cmd_buffer =
            v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]);
         result = queue_submit_cmd_buffer(queue, cmd_buffer, &sems_info,
                                          (i == last_cmd_buffer_idx),
                                          &wait_thread);

         /* We get VK_NOT_READY if we had to spawn a wait thread for the
          * command buffer. In that scenario, we want to continue submitting
          * any pending command buffers in the batch, but we don't want to
          * process any signal semaphores for the batch until we know we have
          * submitted every job for every command buffer in the batch.
          */
         if (result == VK_NOT_READY) {
            result = VK_SUCCESS;
            add_wait_thread_to_list(queue->device, wait_thread, wait_info);
            has_wait_threads = true;
         }

         if (result != VK_SUCCESS)
            break;
      }
   }

   if (result != VK_SUCCESS)
      return result;

   /* If had to emit any wait threads in this submit we need to wait for all
    * of them to complete before we can signal any semaphores.
*/ if (!has_wait_threads) { return process_semaphores_to_signal(queue->device, pSubmit->signalSemaphoreCount, pSubmit->pSignalSemaphores, false); } else { assert(*wait_info); add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info); return VK_NOT_READY; } } static void * master_wait_thread_func(void *_wait_info) { struct v3dv_queue_submit_wait_info *wait_info = (struct v3dv_queue_submit_wait_info *) _wait_info; struct v3dv_queue *queue = &wait_info->device->queue; /* Wait for all command buffer wait threads to complete */ for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) { int res = pthread_join(wait_info->wait_threads[i].thread, NULL); if (res != 0) fprintf(stderr, "Wait thread failed to join.\n"); } /* Signal semaphores and fences */ VkResult result; result = process_semaphores_to_signal(wait_info->device, wait_info->signal_semaphore_count, wait_info->signal_semaphores, true); if (result != VK_SUCCESS) fprintf(stderr, "Wait thread semaphore signaling failed."); result = process_fence_to_signal(wait_info->device, wait_info->fence); if (result != VK_SUCCESS) fprintf(stderr, "Wait thread fence signaling failed."); /* Release wait_info */ mtx_lock(&queue->mutex); list_del(&wait_info->list_link); mtx_unlock(&queue->mutex); vk_free(&wait_info->device->vk.alloc, wait_info->signal_semaphores); vk_free(&wait_info->device->vk.alloc, wait_info); return NULL; } static VkResult spawn_master_wait_thread(struct v3dv_queue *queue, struct v3dv_queue_submit_wait_info *wait_info) { VkResult result = VK_SUCCESS; mtx_lock(&queue->mutex); if (pthread_create(&wait_info->master_wait_thread, NULL, master_wait_thread_func, wait_info)) { result = vk_queue_set_lost(&queue->vk, "Thread create failed: %m"); goto done; } list_addtail(&wait_info->list_link, &queue->submit_wait_list); done: mtx_unlock(&queue->mutex); return result; } VKAPI_ATTR VkResult VKAPI_CALL v3dv_QueueSubmit(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence) { 
V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); if (vk_device_is_lost(&queue->device->vk)) return VK_ERROR_DEVICE_LOST; struct v3dv_queue_submit_wait_info *wait_info = NULL; VkResult result = VK_SUCCESS; for (uint32_t i = 0; i < submitCount; i++) { result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], &wait_info); if (result != VK_SUCCESS && result != VK_NOT_READY) goto done; } if (!wait_info) { assert(result != VK_NOT_READY); result = process_fence_to_signal(queue->device, fence); goto done; } /* We emitted wait threads, so we have to spwan a master thread for this * queue submission that waits for all other threads to complete and then * will signal any semaphores and fences. */ assert(wait_info); wait_info->fence = fence; result = spawn_master_wait_thread(queue, wait_info); done: return result; } static void destroy_syncobj(uint32_t device_fd, uint32_t *sync) { assert(sync); drmSyncobjDestroy(device_fd, *sync); *sync = 0; } VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateSemaphore(VkDevice _device, const VkSemaphoreCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSemaphore *pSemaphore) { V3DV_FROM_HANDLE(v3dv_device, device, _device); assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO); struct v3dv_semaphore *sem = vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore), VK_OBJECT_TYPE_SEMAPHORE); if (sem == NULL) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync); if (ret) { vk_object_free(&device->vk, pAllocator, sem); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } *pSemaphore = v3dv_semaphore_to_handle(sem); return VK_SUCCESS; } VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceExternalSemaphoreProperties( VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo, VkExternalSemaphoreProperties *pExternalSemaphoreProperties) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, 
physicalDevice);

   /* Only the two FD based handle types get non-zero properties */
   switch (pExternalSemaphoreInfo->handleType) {
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
      pExternalSemaphoreProperties->exportFromImportedHandleTypes =
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
      pExternalSemaphoreProperties->compatibleHandleTypes =
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;

      /* We need to have multisync support in our kernel interface to support
       * external semaphore imports because once we have an imported semaphore
       * in our list of semaphores to wait on, we can no longer use the
       * workaround of waiting on the last syncobj fence produced from the
       * device, since the imported semaphore may not (and in fact, it would
       * typically not) have been produced from same device.
       */
      pExternalSemaphoreProperties->externalSemaphoreFeatures =
         pdevice->caps.multisync ?
            VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT : 0;

      /* FIXME: See comment in GetPhysicalDeviceExternalFenceProperties
       * for details on why we can't export to SYNC_FD.
       */
      if (pExternalSemaphoreInfo->handleType !=
          VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) {
         pExternalSemaphoreProperties->externalSemaphoreFeatures |=
            VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT;
      }
      break;
   default:
      /* Any other handle type is reported as unsupported */
      pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0;
      pExternalSemaphoreProperties->compatibleHandleTypes = 0;
      pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
      break;
   }
}

/* vkImportSemaphoreFdKHR: imports a semaphore payload from a sync file or an
 * opaque (syncobj) file descriptor.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ImportSemaphoreFdKHR(
   VkDevice _device,
   const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_semaphore, sem, pImportSemaphoreFdInfo->semaphore);

   assert(pImportSemaphoreFdInfo->sType ==
          VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR);

   int fd = pImportSemaphoreFdInfo->fd;
   int render_fd = device->pdevice->render_fd;

   /* SYNC_FD imports are always treated as temporary; other handle types
    * only when the application sets the TEMPORARY flag.
    */
   bool is_temporary =
      pImportSemaphoreFdInfo->handleType ==
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT ||
      (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT);

   /* Handle of the syncobj that will carry the imported payload */
   uint32_t new_sync;
   switch (pImportSemaphoreFdInfo->handleType) {
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: {
      /* "If handleType is VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, the
       *  special value -1 for fd is treated like a valid sync file descriptor
       *  referring to an object that has already signaled. The import
       *  operation will succeed and the VkSemaphore will have a temporarily
       *  imported payload as if a valid file descriptor had been provided."
       */
      unsigned flags = fd == -1 ?
DRM_SYNCOBJ_CREATE_SIGNALED : 0; if (drmSyncobjCreate(render_fd, flags, &new_sync)) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (fd != -1) { if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { drmSyncobjDestroy(render_fd, new_sync); return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } } break; } case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: { if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); break; } default: return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } if (sem->temp_sync) { destroy_syncobj(render_fd, &sem->temp_sync); sem->has_temp = false; } if (is_temporary) { sem->temp_sync = new_sync; sem->has_temp = true; } else { destroy_syncobj(render_fd, &sem->sync); sem->sync = new_sync; } /* From the Vulkan 1.0.53 spec: * * "Importing a semaphore payload from a file descriptor transfers * ownership of the file descriptor from the application to the * Vulkan implementation. The application must not perform any * operations on the file descriptor after a successful import." * * If the import fails, we leave the file descriptor open. 
*/ if (fd != -1) close(fd); return VK_SUCCESS; } VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetSemaphoreFdKHR(VkDevice _device, const VkSemaphoreGetFdInfoKHR *pGetFdInfo, int *pFd) { V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_semaphore, sem, pGetFdInfo->semaphore); assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR); *pFd = -1; int render_fd = device->pdevice->render_fd; switch (pGetFdInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { drmSyncobjExportSyncFile(render_fd, sem->sync, pFd); if (*pFd == -1) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); break; case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: drmSyncobjHandleToFD(render_fd, sem->sync, pFd); if (*pFd == -1) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); break; } default: unreachable("Unsupported external semaphore handle type"); } return VK_SUCCESS; } VKAPI_ATTR void VKAPI_CALL v3dv_DestroySemaphore(VkDevice _device, VkSemaphore semaphore, const VkAllocationCallbacks *pAllocator) { V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_semaphore, sem, semaphore); if (sem == NULL) return; destroy_syncobj(device->pdevice->render_fd, &sem->sync); if (sem->temp_sync) destroy_syncobj(device->pdevice->render_fd, &sem->temp_sync); vk_object_free(&device->vk, pAllocator, sem); } VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateFence(VkDevice _device, const VkFenceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkFence *pFence) { V3DV_FROM_HANDLE(v3dv_device, device, _device); assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO); struct v3dv_fence *fence = vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence), VK_OBJECT_TYPE_FENCE); if (fence == NULL) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); unsigned flags = 0; if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) flags |= DRM_SYNCOBJ_CREATE_SIGNALED; int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, 
&fence->sync); if (ret) { vk_object_free(&device->vk, pAllocator, fence); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } *pFence = v3dv_fence_to_handle(fence); return VK_SUCCESS; } VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceExternalFenceProperties( VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo, VkExternalFenceProperties *pExternalFenceProperties) { switch (pExternalFenceInfo->handleType) { case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: pExternalFenceProperties->exportFromImportedHandleTypes = VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; pExternalFenceProperties->compatibleHandleTypes = VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; pExternalFenceProperties->externalFenceFeatures = VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT; /* FIXME: SYNC_FD exports the actual fence referenced by the syncobj, not * the syncobj itself, and that fence is only created after we have * submitted to the kernel and updated the syncobj for the fence to import * the actual DRM fence created with the submission. Unfortunately, if the * queue submission has a 'wait for events' we may hold any jobs after the * wait in a user-space thread until the events are signaled, and in that * case we don't update the out fence of the submit until the events are * signaled and we can submit all the jobs involved with the vkQueueSubmit * call. This means that if the applications submits with an out fence and * a wait for events, trying to export the out fence to a SYNC_FD rigth * after the submission and before the events are signaled will fail, * because the actual DRM fence won't exist yet. This is not a problem * with OPAQUE_FD because in this case we export the entire syncobj, not * the underlying DRM fence. 
To fix this we need to rework our kernel * interface to be more flexible and accept multiple in/out syncobjs so * we can implement event waits as regular fence waits on the kernel side, * until then, we can only reliably export OPAQUE_FD. */ if (pExternalFenceInfo->handleType != VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT) { pExternalFenceProperties->externalFenceFeatures |= VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT; } break; default: pExternalFenceProperties->exportFromImportedHandleTypes = 0; pExternalFenceProperties->compatibleHandleTypes = 0; pExternalFenceProperties->externalFenceFeatures = 0; break; } } VKAPI_ATTR VkResult VKAPI_CALL v3dv_ImportFenceFdKHR(VkDevice _device, const VkImportFenceFdInfoKHR *pImportFenceFdInfo) { V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_fence, fence, pImportFenceFdInfo->fence); assert(pImportFenceFdInfo->sType == VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR); int fd = pImportFenceFdInfo->fd; int render_fd = device->pdevice->render_fd; bool is_temporary = pImportFenceFdInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT || (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT); uint32_t new_sync; switch (pImportFenceFdInfo->handleType) { case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the * special value -1 for fd is treated like a valid sync file descriptor * referring to an object that has already signaled. The import * operation will succeed and the VkFence will have a temporarily * imported payload as if a valid file descriptor had been provided." */ unsigned flags = fd == -1 ? 
DRM_SYNCOBJ_CREATE_SIGNALED : 0; if (drmSyncobjCreate(render_fd, flags, &new_sync)) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (fd != -1) { if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { drmSyncobjDestroy(render_fd, new_sync); return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } } break; } case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: { if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); break; } default: return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } if (fence->temp_sync) { destroy_syncobj(render_fd, &fence->temp_sync); fence->has_temp = false; } if (is_temporary) { fence->temp_sync = new_sync; fence->has_temp = true; } else { destroy_syncobj(render_fd, &fence->sync); fence->sync = new_sync; } /* From the Vulkan 1.0.53 spec: * * "Importing a fence payload from a file descriptor transfers * ownership of the file descriptor from the application to the * Vulkan implementation. The application must not perform any * operations on the file descriptor after a successful import." * * If the import fails, we leave the file descriptor open. 
*/ if (fd != -1) close(fd); return VK_SUCCESS; } VKAPI_ATTR void VKAPI_CALL v3dv_DestroyFence(VkDevice _device, VkFence _fence, const VkAllocationCallbacks *pAllocator) { V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); if (fence == NULL) return; destroy_syncobj(device->pdevice->render_fd, &fence->sync); if (fence->temp_sync) destroy_syncobj(device->pdevice->render_fd, &fence->temp_sync); vk_object_free(&device->vk, pAllocator, fence); } VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetFenceStatus(VkDevice _device, VkFence _fence) { V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); if (vk_device_is_lost(&device->vk)) return VK_ERROR_DEVICE_LOST; uint32_t sync = fence_get_sync(fence); int ret = drmSyncobjWait(device->pdevice->render_fd, &sync, 1, 0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL); if (ret == -ETIME) return VK_NOT_READY; else if (ret) return vk_device_set_lost(&device->vk, "Syncobj wait failed: %m"); return VK_SUCCESS; } VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetFenceFdKHR(VkDevice _device, const VkFenceGetFdInfoKHR *pGetFdInfo, int *pFd) { V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_fence, fence, pGetFdInfo->fence); assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR); *pFd = -1; int render_fd = device->pdevice->render_fd; switch (pGetFdInfo->handleType) { case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { drmSyncobjExportSyncFile(render_fd, fence->sync, pFd); if (*pFd == -1) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); break; case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: drmSyncobjHandleToFD(render_fd, fence->sync, pFd); if (*pFd == -1) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); break; } default: unreachable("Unsupported external fence handle type"); } return VK_SUCCESS; } VKAPI_ATTR VkResult VKAPI_CALL v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences) { V3DV_FROM_HANDLE(v3dv_device, 
device, _device); uint32_t *syncobjs = vk_alloc(&device->vk.alloc, sizeof(*syncobjs) * fenceCount, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!syncobjs) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); int render_fd = device->pdevice->render_fd; uint32_t reset_count = 0; for (uint32_t i = 0; i < fenceCount; i++) { struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); /* From the Vulkan spec, section 'Importing Fence Payloads': * * "If the import is temporary, the fence will be restored to its * permanent state the next time that fence is passed to * vkResetFences. * * Note: Restoring a fence to its prior permanent payload is a * distinct operation from resetting a fence payload." * * To restore the previous state, we just need to destroy the temporary. */ if (fence->has_temp) { assert(fence->temp_sync); destroy_syncobj(render_fd, &fence->temp_sync); fence->has_temp = false; } else { syncobjs[reset_count++] = fence->sync; } } int ret = 0; if (reset_count > 0) ret = drmSyncobjReset(render_fd, syncobjs, reset_count); vk_free(&device->vk.alloc, syncobjs); if (ret) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); return VK_SUCCESS; } VKAPI_ATTR VkResult VKAPI_CALL v3dv_WaitForFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences, VkBool32 waitAll, uint64_t timeout) { V3DV_FROM_HANDLE(v3dv_device, device, _device); if (vk_device_is_lost(&device->vk)) return VK_ERROR_DEVICE_LOST; const uint64_t abs_timeout = os_time_get_absolute_timeout(timeout); uint32_t *syncobjs = vk_alloc(&device->vk.alloc, sizeof(*syncobjs) * fenceCount, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!syncobjs) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); for (uint32_t i = 0; i < fenceCount; i++) { struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); syncobjs[i] = fence_get_sync(fence); } unsigned flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT; if (waitAll) flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL; int ret; do { ret = 
drmSyncobjWait(device->pdevice->render_fd, syncobjs, fenceCount, timeout, flags, NULL); } while (ret == -ETIME && os_time_get_nano() < abs_timeout); vk_free(&device->vk.alloc, syncobjs); if (ret == -ETIME) return VK_TIMEOUT; else if (ret) return vk_device_set_lost(&device->vk, "Syncobj wait failed: %m"); return VK_SUCCESS; } VKAPI_ATTR VkResult VKAPI_CALL v3dv_QueueBindSparse(VkQueue _queue, uint32_t bindInfoCount, const VkBindSparseInfo *pBindInfo, VkFence fence) { V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT); }