From bce77e758acbb168de802d30fd0a6d6b7ea1d1a4 Mon Sep 17 00:00:00 2001
From: Melissa Wen
Date: Thu, 16 Dec 2021 22:43:49 +0000
Subject: [PATCH] v3dv: process signal semaphores in the very last job

With multiple semaphores support, we can use a GPU job to handle
multiple signal semaphores at the end of a cmd buffer batch. This means
the last job in the last cmd buffer will be in charge of signalling
semaphores, as long as it meets some conditions:

1 - A GPU-job signals semaphores only when all jobs submitted so far
    went to the same queue (there is no syncobj created for any other
    queue type). Otherwise, we emit a noop job that waits on the
    completion of all jobs submitted and then signals the semaphores.
2 - A CPU-job is never in charge of signalling semaphores. We process
    it first and emit a noop job that depends on all previously
    submitted jobs to signal the semaphores.

Signed-off-by: Melissa Wen
Reviewed-by: Iago Toral Quiroga
Part-of:
---
 src/broadcom/vulkan/v3dv_private.h |   3 +
 src/broadcom/vulkan/v3dv_queue.c   | 129 ++++++++++++++++++++++-------
 2 files changed, 101 insertions(+), 31 deletions(-)

diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 43ea60d4359..284fa7099e3 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -1102,6 +1102,9 @@ struct v3dv_job {
    /* Whether we need to serialize this job in our command stream */
    bool serialize;
 
+   /* Whether this job is in charge of signalling semaphores */
+   bool do_sem_signal;
+
    /* If this is a CL job, whether we should sync before binning */
    bool needs_bcl_sync;
 
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 0dcf13bd0eb..775777eabd0 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -592,6 +592,13 @@ process_semaphores_to_signal(struct v3dv_device *device,
    if (count == 0)
       return VK_SUCCESS;
 
+   /* If multisync is supported, we are signalling semaphores in the last job
+    * of the last command buffer and, therefore, we do not need to process any
+    * semaphores here.
+    */
+   if (device->pdevice->caps.multisync)
+      return VK_SUCCESS;
+
    int render_fd = device->pdevice->render_fd;
    int fd;
 
@@ -712,12 +719,12 @@ set_in_syncs(struct v3dv_device *device,
 
 static struct drm_v3d_sem *
 set_out_syncs(struct v3dv_device *device,
-              bool do_sem_signal,
+              struct v3dv_job *job,
               enum v3dv_queue_type queue,
               uint32_t *count,
               struct v3dv_submit_info_semaphores *sems_info)
 {
-   uint32_t n_sems = do_sem_signal ? sems_info->signal_sem_count : 0;
+   uint32_t n_sems = job->do_sem_signal ? sems_info->signal_sem_count : 0;
 
    /* We always signal the syncobj from `device->last_job_syncs` related to
     * this v3dv_queue_type to track the last job submitted to this queue. We
@@ -770,7 +777,6 @@ set_multisync(struct drm_v3d_multi_sync *ms,
               struct v3dv_job *job,
               struct drm_v3d_sem *out_syncs,
               struct drm_v3d_sem *in_syncs,
-              bool do_sem_signal,
               enum v3dv_queue_type queue_sync,
               enum v3d_queue wait_stage)
 {
@@ -781,7 +787,7 @@ set_multisync(struct drm_v3d_multi_sync *ms,
    if (!in_syncs && in_sync_count)
       goto fail;
 
-   out_syncs = set_out_syncs(device, do_sem_signal, queue_sync,
+   out_syncs = set_out_syncs(device, job, queue_sync,
                              &out_sync_count, sems_info);
 
    assert(out_sync_count > 0);
@@ -880,13 +886,8 @@ handle_cl_job(struct v3dv_queue *queue,
    if (device->pdevice->caps.multisync) {
       struct drm_v3d_multi_sync ms = { 0 };
       enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
-      /* We are processing all signal VkSemaphores together in the submit
-       * master thread and therefore we don't handle signal VkSemaphores in cl
-       * submission yet. For this reason, we set do_sem_signal to false in the
-       * multisync extension.
-       */
       set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
-                    false, V3DV_QUEUE_CL, wait_stage);
+                    V3DV_QUEUE_CL, wait_stage);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -942,13 +943,8 @@ handle_tfu_job(struct v3dv_queue *queue,
     */
    if (device->pdevice->caps.multisync) {
       struct drm_v3d_multi_sync ms = { 0 };
-      /* We are processing all signal VkSemaphores together in the submit
-       * master thread and therefore we don't handle signal VkSemaphores in
-       * tfu jobs yet. For this reason, we set do_sem_signal to false in the
-       * multisync extension.
-       */
       set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
-                    false, V3DV_QUEUE_TFU, V3D_TFU);
+                    V3DV_QUEUE_TFU, V3D_TFU);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -1006,13 +1002,8 @@ handle_csd_job(struct v3dv_queue *queue,
     */
    if (device->pdevice->caps.multisync) {
       struct drm_v3d_multi_sync ms = { 0 };
-      /* We are processing all signal VkSemaphores together in the submit
-       * master thread and therefore we don't handle signal VkSemaphores in
-       * csd jobs yet. For this reason, we set do_sem_signal to false in the
-       * multisync extension.
-       */
       set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
-                    false, V3DV_QUEUE_CSD, V3D_CSD);
+                    V3DV_QUEUE_CSD, V3D_CSD);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -1125,7 +1116,8 @@ queue_create_noop_job(struct v3dv_queue *queue)
 
 static VkResult
 queue_submit_noop_job(struct v3dv_queue *queue,
-                      struct v3dv_submit_info_semaphores *sems_info)
+                      struct v3dv_submit_info_semaphores *sems_info,
+                      bool do_sem_signal, bool serialize)
 {
    /* VkQueue host access is externally synchronized so we don't need to lock
     * here for the static variable.
@@ -1135,29 +1127,99 @@ queue_submit_noop_job(struct v3dv_queue *queue,
       if (result != VK_SUCCESS)
          return result;
    }
+   queue->noop_job->do_sem_signal = do_sem_signal;
+   queue->noop_job->serialize = serialize;
 
    return queue_submit_job(queue, queue->noop_job, sems_info, NULL);
 }
 
+/* This function takes a job type and returns true if we have
+ * previously submitted any jobs for the same command buffer batch
+ * to a queue different from the one for this job type.
+ */
+static bool
+cmd_buffer_batch_is_multi_queue(struct v3dv_device *device,
+                                enum v3dv_job_type job_type)
+{
+   enum v3dv_queue_type queue_type = V3DV_QUEUE_ANY;
+   struct v3dv_last_job_sync last_job_syncs;
+
+   mtx_lock(&device->mutex);
+   memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs));
+   mtx_unlock(&device->mutex);
+
+   switch (job_type) {
+   case V3DV_JOB_TYPE_GPU_CL:
+   case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
+      queue_type = V3DV_QUEUE_CL;
+      break;
+   case V3DV_JOB_TYPE_GPU_TFU:
+      queue_type = V3DV_QUEUE_TFU;
+      break;
+   case V3DV_JOB_TYPE_GPU_CSD:
+      queue_type = V3DV_QUEUE_CSD;
+      break;
+   default:
+      unreachable("Queue type is undefined");
+      break;
+   }
+
+   for (int i = 0; i < V3DV_QUEUE_ANY; i++) {
+      if (i != queue_type && !last_job_syncs.first[i]) {
+         return true;
+      }
+   }
+
+   return false;
+}
+
 static VkResult
 queue_submit_cmd_buffer(struct v3dv_queue *queue,
                         struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_submit_info_semaphores *sems_info,
+                        bool is_last_cmd_buffer,
                         pthread_t *wait_thread)
 {
+   struct v3dv_job *last;
+   bool do_sem_signal = is_last_cmd_buffer && sems_info->signal_sem_count > 0;
+
    assert(cmd_buffer);
    assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE);
 
    if (list_is_empty(&cmd_buffer->jobs))
-      return queue_submit_noop_job(queue, sems_info);
+      return queue_submit_noop_job(queue, sems_info, do_sem_signal, false);
+
+   /* When we are in the last cmd buffer and there are semaphores to signal,
+    * we process semaphores in the last job, following these conditions:
+    * - CPU-job: we can't signal until all GPU work has completed, so we
+    *   submit a serialized noop GPU job to handle signaling when all on-going
+    *   GPU work on all queues has completed.
+    * - GPU-job: can signal semaphores only if we have not submitted jobs to
+    *   a queue other than the queue of this job. Otherwise, we submit a
+    *   serialized noop job to handle signaling.
+    */
+   if (do_sem_signal) {
+      last = list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link);
+      if (v3dv_job_type_is_gpu(last))
+         last->do_sem_signal = true;
+   }
 
    list_for_each_entry_safe(struct v3dv_job, job,
                             &cmd_buffer->jobs, list_link) {
+      if (job->do_sem_signal &&
+          cmd_buffer_batch_is_multi_queue(queue->device, job->type))
+         job->do_sem_signal = false;
       VkResult result = queue_submit_job(queue, job, sems_info, wait_thread);
       if (result != VK_SUCCESS)
          return result;
    }
 
+   /* If we are in the last cmd buffer batch, but the last job cannot handle
+    * signal semaphores, we emit a serialized noop_job for signalling.
+    */
+   if (do_sem_signal && !(last && last->do_sem_signal))
+      return queue_submit_noop_job(queue, sems_info, true, true);
+
    return VK_SUCCESS;
 }
 
@@ -1196,13 +1258,16 @@ add_signal_semaphores_to_wait_list(struct v3dv_device *device,
    if (pSubmit->signalSemaphoreCount == 0)
       return;
 
-   /* FIXME: We put all the semaphores in a list and we signal all of them
+   /* If multisync is supported, we just signal semaphores in the last job of
+    * the last command buffer and, therefore, we do not need to add any
+    * semaphores here.
+    */
+   if (device->pdevice->caps.multisync)
+      return;
+
+   /* Otherwise, we put all the semaphores in a list and we signal all of them
     * together from the submit master thread when the last wait thread in the
-    * submit completes. We could do better though: group the semaphores per
-    * submit and signal them as soon as all wait threads for a particular
-    * submit completes. Not sure if the extra work would be worth it though,
-    * since we only spawn waith threads for event waits and only when the
-    * event if set from the host after the queue submission.
+    * submit completes.
     */
 
    /* Check the size of the current semaphore list */
@@ -1259,13 +1324,15 @@ queue_submit_cmd_buffer_batch(struct v3dv_queue *queue,
     * to do anything special, it should not be a common case anyway.
     */
    if (pSubmit->commandBufferCount == 0) {
-      result = queue_submit_noop_job(queue, &sems_info);
+      result = queue_submit_noop_job(queue, &sems_info, true, false);
    } else {
+      const uint32_t last_cmd_buffer_idx = pSubmit->commandBufferCount - 1;
       for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) {
          pthread_t wait_thread;
          struct v3dv_cmd_buffer *cmd_buffer =
            v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]);
         result = queue_submit_cmd_buffer(queue, cmd_buffer, &sems_info,
+                                          (i == last_cmd_buffer_idx),
                                           &wait_thread);
 
         /* We get VK_NOT_READY if we had to spawn a wait thread for the