From bce77e758acbb168de802d30fd0a6d6b7ea1d1a4 Mon Sep 17 00:00:00 2001
From: Melissa Wen
Date: Thu, 16 Dec 2021 22:43:49 +0000
Subject: [PATCH] v3dv: process signal semaphores in the very last job

With multiple semaphores support, we can use a GPU job to handle
multiple signal semaphores at the end of a cmd buffer batch. This means
the last job in the last cmd buffer will be in charge of signalling
semaphores, as long as it meets some conditions:

1 - A GPU-job signals semaphores only when all jobs submitted so far
    went to the same queue (there is no syncobj created for any other
    queue type). Otherwise, we emit a noop job that waits on the
    completion of all jobs submitted and then signals the semaphores.
2 - A CPU-job is never in charge of signalling semaphores. We process
    it first and emit a noop job that depends on all previously
    submitted jobs to signal the semaphores.

Signed-off-by: Melissa Wen
Reviewed-by: Iago Toral Quiroga
Part-of:
---
 src/broadcom/vulkan/v3dv_private.h |   3 +
 src/broadcom/vulkan/v3dv_queue.c   | 129 ++++++++++++++++++++++-------
 2 files changed, 101 insertions(+), 31 deletions(-)

diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 43ea60d4359..284fa7099e3 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -1102,6 +1102,9 @@ struct v3dv_job {
    /* Whether we need to serialize this job in our command stream */
    bool serialize;
 
+   /* Whether this job is in charge of signalling semaphores */
+   bool do_sem_signal;
+
    /* If this is a CL job, whether we should sync before binning */
    bool needs_bcl_sync;
 
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 0dcf13bd0eb..775777eabd0 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -592,6 +592,13 @@ process_semaphores_to_signal(struct v3dv_device *device,
    if (count == 0)
       return VK_SUCCESS;
 
+   /* If multisync is supported, we are signalling semaphores in the last job
+    * of the last command buffer and, therefore, we do not need to process any
+    * semaphores here.
+    */
+   if (device->pdevice->caps.multisync)
+      return VK_SUCCESS;
+
    int render_fd = device->pdevice->render_fd;
    int fd;
 
@@ -712,12 +719,12 @@ set_in_syncs(struct v3dv_device *device,
 
 static struct drm_v3d_sem *
 set_out_syncs(struct v3dv_device *device,
-              bool do_sem_signal,
+              struct v3dv_job *job,
               enum v3dv_queue_type queue,
               uint32_t *count,
               struct v3dv_submit_info_semaphores *sems_info)
 {
-   uint32_t n_sems = do_sem_signal ? sems_info->signal_sem_count : 0;
+   uint32_t n_sems = job->do_sem_signal ? sems_info->signal_sem_count : 0;
 
    /* We always signal the syncobj from `device->last_job_syncs` related to
     * this v3dv_queue_type to track the last job submitted to this queue. We
@@ -770,7 +777,6 @@ set_multisync(struct drm_v3d_multi_sync *ms,
               struct v3dv_job *job,
               struct drm_v3d_sem *out_syncs,
               struct drm_v3d_sem *in_syncs,
-              bool do_sem_signal,
               enum v3dv_queue_type queue_sync,
               enum v3d_queue wait_stage)
 {
@@ -781,7 +787,7 @@ set_multisync(struct drm_v3d_multi_sync *ms,
    if (!in_syncs && in_sync_count)
       goto fail;
 
-   out_syncs = set_out_syncs(device, do_sem_signal, queue_sync,
+   out_syncs = set_out_syncs(device, job, queue_sync,
                              &out_sync_count, sems_info);
 
    assert(out_sync_count > 0);
@@ -880,13 +886,8 @@ handle_cl_job(struct v3dv_queue *queue,
    if (device->pdevice->caps.multisync) {
       struct drm_v3d_multi_sync ms = { 0 };
       enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
-      /* We are processing all signal VkSemaphores together in the submit
-       * master thread and therefore we don't handle signal VkSemaphores in cl
-       * submission yet. For this reason, we set do_sem_signal to false in the
-       * multisync extension.
-       */
       set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
-                    false, V3DV_QUEUE_CL, wait_stage);
+                    V3DV_QUEUE_CL, wait_stage);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -942,13 +943,8 @@ handle_tfu_job(struct v3dv_queue *queue,
     */
    if (device->pdevice->caps.multisync) {
       struct drm_v3d_multi_sync ms = { 0 };
-      /* We are processing all signal VkSemaphores together in the submit
-       * master thread and therefore we don't handle signal VkSemaphores in
-       * tfu jobs yet. For this reason, we set do_sem_signal to false in the
-       * multisync extension.
-       */
       set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
-                    false, V3DV_QUEUE_TFU, V3D_TFU);
+                    V3DV_QUEUE_TFU, V3D_TFU);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -1006,13 +1002,8 @@ handle_csd_job(struct v3dv_queue *queue,
     */
    if (device->pdevice->caps.multisync) {
       struct drm_v3d_multi_sync ms = { 0 };
-      /* We are processing all signal VkSemaphores together in the submit
-       * master thread and therefore we don't handle signal VkSemaphores in
-       * csd jobs yet. For this reason, we set do_sem_signal to false in the
-       * multisync extension.
-       */
       set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
-                    false, V3DV_QUEUE_CSD, V3D_CSD);
+                    V3DV_QUEUE_CSD, V3D_CSD);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -1125,7 +1116,8 @@ queue_create_noop_job(struct v3dv_queue *queue)
 
 static VkResult
 queue_submit_noop_job(struct v3dv_queue *queue,
-                      struct v3dv_submit_info_semaphores *sems_info)
+                      struct v3dv_submit_info_semaphores *sems_info,
+                      bool do_sem_signal, bool serialize)
 {
    /* VkQueue host access is externally synchronized so we don't need to lock
     * here for the static variable.
@@ -1135,29 +1127,99 @@ queue_submit_noop_job(struct v3dv_queue *queue,
       if (result != VK_SUCCESS)
          return result;
    }
+   queue->noop_job->do_sem_signal = do_sem_signal;
+   queue->noop_job->serialize = serialize;
 
    return queue_submit_job(queue, queue->noop_job, sems_info, NULL);
 }
 
+/* This function takes a job type and returns true if we have
+ * previously submitted any jobs for the same command buffer batch
+ * to a queue different from the one for this job type.
+ */
+static bool
+cmd_buffer_batch_is_multi_queue(struct v3dv_device *device,
+                                enum v3dv_job_type job_type)
+{
+   enum v3dv_queue_type queue_type = V3DV_QUEUE_ANY;
+   struct v3dv_last_job_sync last_job_syncs;
+
+   mtx_lock(&device->mutex);
+   memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs));
+   mtx_unlock(&device->mutex);
+
+   switch (job_type) {
+   case V3DV_JOB_TYPE_GPU_CL:
+   case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
+      queue_type = V3DV_QUEUE_CL;
+      break;
+   case V3DV_JOB_TYPE_GPU_TFU:
+      queue_type = V3DV_QUEUE_TFU;
+      break;
+   case V3DV_JOB_TYPE_GPU_CSD:
+      queue_type = V3DV_QUEUE_CSD;
+      break;
+   default:
+      unreachable("Queue type is undefined");
+      break;
+   }
+
+   for (int i = 0; i < V3DV_QUEUE_ANY; i++) {
+      if (i != queue_type && !last_job_syncs.first[i]) {
+         return true;
+      }
+   }
+
+   return false;
+}
+
 static VkResult
 queue_submit_cmd_buffer(struct v3dv_queue *queue,
                         struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_submit_info_semaphores *sems_info,
+                        bool is_last_cmd_buffer,
                         pthread_t *wait_thread)
 {
+   struct v3dv_job *last;
+   bool do_sem_signal = is_last_cmd_buffer && sems_info->signal_sem_count > 0;
+
    assert(cmd_buffer);
    assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE);
 
    if (list_is_empty(&cmd_buffer->jobs))
-      return queue_submit_noop_job(queue, sems_info);
+      return queue_submit_noop_job(queue, sems_info, do_sem_signal, false);
+
+   /* When we are in the last cmd buffer and there are semaphores to signal,
+    * we process semaphores in the last job, following these conditions:
+    * - CPU-job: we can't signal until all GPU work has completed, so we
+    *   submit a serialized noop GPU job to handle signaling when all on-going
+    *   GPU work on all queues has completed.
+    * - GPU-job: can signal semaphores only if we have not submitted jobs to
+    *   a queue other than the queue of this job. Otherwise, we submit a
+    *   serialized noop job to handle signaling.
+    */
+   if (do_sem_signal) {
+      last = list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link);
+      if (v3dv_job_type_is_gpu(last))
+         last->do_sem_signal = true;
+   }
 
    list_for_each_entry_safe(struct v3dv_job, job,
                             &cmd_buffer->jobs, list_link) {
+      if (job->do_sem_signal &&
+          cmd_buffer_batch_is_multi_queue(queue->device, job->type))
+         job->do_sem_signal = false;
       VkResult result = queue_submit_job(queue, job, sems_info, wait_thread);
       if (result != VK_SUCCESS)
          return result;
    }
 
+   /* If we are in the last cmd buffer batch, but the last job cannot handle
+    * signal semaphores, we emit a serialized noop_job for signalling.
+    */
+   if (do_sem_signal && !(last && last->do_sem_signal))
+      return queue_submit_noop_job(queue, sems_info, true, true);
+
    return VK_SUCCESS;
 }
 
@@ -1196,13 +1258,16 @@ add_signal_semaphores_to_wait_list(struct v3dv_device *device,
    if (pSubmit->signalSemaphoreCount == 0)
       return;
 
-   /* FIXME: We put all the semaphores in a list and we signal all of them
+   /* If multisync is supported, we just signal semaphores in the last job of
+    * the last command buffer and, therefore, we do not need to add any
+    * semaphores here.
+    */
+   if (device->pdevice->caps.multisync)
+      return;
+
+   /* Otherwise, we put all the semaphores in a list and we signal all of them
     * together from the submit master thread when the last wait thread in the
-    * submit completes. We could do better though: group the semaphores per
-    * submit and signal them as soon as all wait threads for a particular
-    * submit completes. Not sure if the extra work would be worth it though,
-    * since we only spawn waith threads for event waits and only when the
-    * event if set from the host after the queue submission.
+    * submit completes.
     */
 
    /* Check the size of the current semaphore list */
@@ -1259,13 +1324,15 @@ queue_submit_cmd_buffer_batch(struct v3dv_queue *queue,
     * to do anything special, it should not be a common case anyway.
     */
    if (pSubmit->commandBufferCount == 0) {
-      result = queue_submit_noop_job(queue, &sems_info);
+      result = queue_submit_noop_job(queue, &sems_info, true, false);
    } else {
+      const uint32_t last_cmd_buffer_idx = pSubmit->commandBufferCount - 1;
       for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) {
          pthread_t wait_thread;
          struct v3dv_cmd_buffer *cmd_buffer =
            v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]);
         result = queue_submit_cmd_buffer(queue, cmd_buffer, &sems_info,
+                                          (i == last_cmd_buffer_idx),
                                           &wait_thread);
 
         /* We get VK_NOT_READY if we had to spawn a wait thread for the