v3dv: track submitted jobs by GPU queue type

The order in which GPU jobs are scheduled is guaranteed within the
same queue type (CL, TFU, CSD), but the order of completion of jobs
from different queues cannot be guaranteed. Since we have multiple
semaphore support now, we can track the completion of the last job
submitted to each queue and therefore better determine when the GPU
is idle. We do this using an array of syncobjs (last_job_syncs), one
for each GPU queue (CL, TFU, CSD). With this, job serialization also
becomes more accurate. We also keep tracking the very last job
submitted (last_job_sync became an element of the last_job_syncs
array as V3DV_QUEUE_ANY) for the case where we don't have multisync
support. To help in handling wait semaphores, we set a flag per queue
to indicate that we are starting a new cmd buffer batch, so a job
submitted to that queue will be the first one in the batch.

Signed-off-by: Melissa Wen <mwen@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13178>
This commit is contained in:
Melissa Wen 2021-12-14 13:40:34 -01:00
parent fd973218a6
commit 03a6a82740
3 changed files with 162 additions and 64 deletions

View file

@ -1745,6 +1745,16 @@ init_device_meta(struct v3dv_device *device)
v3dv_meta_texel_buffer_copy_init(device);
}
static void
destroy_device_syncs(struct v3dv_device *device,
int render_fd)
{
for (int i = 0; i < V3DV_QUEUE_COUNT; i++) {
if (device->last_job_syncs.syncs[i])
drmSyncobjDestroy(render_fd, device->last_job_syncs.syncs[i]);
}
}
static void
destroy_device_meta(struct v3dv_device *device)
{
@ -1829,12 +1839,15 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
if (device->features.robustBufferAccess)
perf_debug("Device created with Robust Buffer Access enabled.\n");
int ret = drmSyncobjCreate(physical_device->render_fd,
DRM_SYNCOBJ_CREATE_SIGNALED,
&device->last_job_sync);
if (ret) {
result = VK_ERROR_INITIALIZATION_FAILED;
goto fail;
for (int i = 0; i < V3DV_QUEUE_COUNT; i++) {
device->last_job_syncs.first[i] = true;
int ret = drmSyncobjCreate(physical_device->render_fd,
DRM_SYNCOBJ_CREATE_SIGNALED,
&device->last_job_syncs.syncs[i]);
if (ret) {
result = VK_ERROR_INITIALIZATION_FAILED;
goto fail;
}
}
#ifdef DEBUG
@ -1852,6 +1865,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
return VK_SUCCESS;
fail:
destroy_device_syncs(device, physical_device->render_fd);
vk_device_finish(&device->vk);
vk_free(&device->vk.alloc, device);
@ -1867,7 +1881,7 @@ v3dv_DestroyDevice(VkDevice _device,
v3dv_DeviceWaitIdle(_device);
queue_finish(&device->queue);
pthread_mutex_destroy(&device->mutex);
drmSyncobjDestroy(device->pdevice->render_fd, device->last_job_sync);
destroy_device_syncs(device, device->pdevice->render_fd);
destroy_device_meta(device);
v3dv_pipeline_cache_finish(&device->default_pipeline_cache);

View file

@ -434,6 +434,32 @@ struct v3dv_pipeline_cache {
bool externally_synchronized;
};
/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd,
 * tfu), we still need a syncobj to track the last overall job submitted
 * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can
 * start expecting multisync to be present and drop the legacy implementation
 * together with this V3DV_QUEUE_ANY tracker.
 */
enum v3dv_queue_type {
V3DV_QUEUE_CL = 0, /* Command list (bin/render) queue */
V3DV_QUEUE_CSD, /* Compute shader dispatch queue */
V3DV_QUEUE_TFU, /* Texture formatting unit queue */
V3DV_QUEUE_ANY, /* Last job on any queue (legacy, non-multisync path) */
V3DV_QUEUE_COUNT, /* Number of entries; sizes the last_job_syncs arrays */
};
/* For each GPU queue, we use a syncobj to track the last job submitted. We
 * set the flag `first` to determine when we are starting a new cmd buffer
 * batch and therefore a job submitted to a given queue will be the first in a
 * cmd buf batch.
 */
struct v3dv_last_job_sync {
/* If the job is the first submitted to a GPU queue in a cmd buffer batch */
bool first[V3DV_QUEUE_COUNT];
/* Array of syncobj to track the last job submitted to a GPU queue;
 * indexed by enum v3dv_queue_type (handles are DRM syncobj handles). */
uint32_t syncs[V3DV_QUEUE_COUNT];
};
struct v3dv_device {
struct vk_device vk;
@ -443,8 +469,8 @@ struct v3dv_device {
struct v3d_device_info devinfo;
struct v3dv_queue queue;
/* A sync object to track the last job submitted to the GPU. */
uint32_t last_job_sync;
/* Syncobjs to track the last job submitted to any GPU queue */
struct v3dv_last_job_sync last_job_syncs;
/* A mutex to prevent concurrent access to last_job_sync from the queue */
mtx_t mutex;

View file

@ -133,15 +133,26 @@ static VkResult
gpu_queue_wait_idle(struct v3dv_queue *queue)
{
struct v3dv_device *device = queue->device;
int render_fd = device->pdevice->render_fd;
struct v3dv_last_job_sync last_job_syncs;
mtx_lock(&device->mutex);
uint32_t last_job_sync = device->last_job_sync;
memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs));
mtx_unlock(&device->mutex);
int ret = drmSyncobjWait(device->pdevice->render_fd,
&last_job_sync, 1, INT64_MAX, 0, NULL);
if (ret)
return VK_ERROR_DEVICE_LOST;
if (device->pdevice->caps.multisync) {
int ret = drmSyncobjWait(render_fd, (uint32_t *) &last_job_syncs.syncs,
3, INT64_MAX,
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
if (ret)
return VK_ERROR_DEVICE_LOST;
} else {
int ret =
drmSyncobjWait(render_fd, &last_job_syncs.syncs[V3DV_QUEUE_ANY], 1,
INT64_MAX, 0, NULL);
if (ret)
return VK_ERROR_DEVICE_LOST;
}
return VK_SUCCESS;
}
@ -585,7 +596,9 @@ process_semaphores_to_signal(struct v3dv_device *device,
int fd;
mtx_lock(&device->mutex);
drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
drmSyncobjExportSyncFile(render_fd,
device->last_job_syncs.syncs[V3DV_QUEUE_ANY],
&fd);
mtx_unlock(&device->mutex);
if (fd == -1)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
@ -624,7 +637,9 @@ process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
int fd;
mtx_lock(&device->mutex);
drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
drmSyncobjExportSyncFile(render_fd,
device->last_job_syncs.syncs[V3DV_QUEUE_ANY],
&fd);
mtx_unlock(&device->mutex);
if (fd == -1)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
@ -651,14 +666,17 @@ multisync_free(struct v3dv_device *device,
}
static struct drm_v3d_sem *
set_syncs(struct v3dv_device *device,
uint32_t *count, VkSemaphore *sems,
uint32_t last_job_sync)
set_in_syncs(struct v3dv_device *device,
struct v3dv_job *job,
uint32_t *count,
struct v3dv_submit_info_semaphores *sems_info)
{
uint32_t n_sem = *count;
if (last_job_sync)
(*count)++;
/* If we are serializing a job in a cmd buffer, we are already making it
* wait until the last job submitted to each queue completes before
* running, so in that case we can skip waiting for any additional
* semaphores.
*/
*count = job->serialize ? 3 : sems_info->wait_sem_count;
if (!*count)
return NULL;
@ -670,14 +688,53 @@ set_syncs(struct v3dv_device *device,
if (!syncs)
return NULL;
if (n_sem)
for (unsigned i = 0; i < n_sem; i++) {
struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]);
if (!job->serialize) {
for (int i = 0; i < *count; i++) {
struct v3dv_semaphore *sem =
v3dv_semaphore_from_handle(sems_info->wait_sems[i]);
syncs[i].handle = sem->sync;
}
} else {
for (int i = 0; i < *count; i++)
syncs[i].handle = device->last_job_syncs.syncs[i];
}
if (last_job_sync)
syncs[n_sem].handle = last_job_sync;
return syncs;
}
/* Build the out_syncs array for a multisync submission.
 *
 * The array contains the caller's signal semaphores (only when
 * do_sem_signal is set), followed by two syncobjs we always signal:
 * the tracker for this queue type and the overall V3DV_QUEUE_ANY
 * tracker, which is used to process signal semaphores and fences.
 *
 * On success returns a vk_zalloc'ed array (caller frees) and stores
 * its length in *count; returns NULL on allocation failure.
 */
static struct drm_v3d_sem *
set_out_syncs(struct v3dv_device *device,
              bool do_sem_signal,
              enum v3dv_queue_type queue,
              uint32_t *count,
              struct v3dv_submit_info_semaphores *sems_info)
{
   /* Signal semaphores are attached only when requested by the caller. */
   const uint32_t n_sems = do_sem_signal ? sems_info->signal_sem_count : 0;

   /* Two extra slots: the per-queue tracker and the V3DV_QUEUE_ANY one. */
   *count = n_sems + 2;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!syncs)
      return NULL;

   for (uint32_t i = 0; i < n_sems; i++) {
      struct v3dv_semaphore *sem =
         v3dv_semaphore_from_handle(sems_info->signal_sems[i]);
      syncs[i].handle = sem->sync;
   }

   syncs[n_sems + 0].handle = device->last_job_syncs.syncs[queue];
   syncs[n_sems + 1].handle = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];

   return syncs;
}
@ -702,45 +759,42 @@ set_multisync(struct drm_v3d_multi_sync *ms,
struct v3dv_submit_info_semaphores *sems_info,
struct drm_v3d_extension *next,
struct v3dv_device *device,
struct v3dv_job *job,
struct drm_v3d_sem *out_syncs,
struct drm_v3d_sem *in_syncs,
bool do_sem_signal,
bool serialize,
enum v3d_queue queue)
enum v3dv_queue_type queue_sync,
enum v3d_queue wait_stage)
{
uint32_t out_sync_count = 0, in_sync_count = 0;
/* We only want to signal out semaphores for this submission upon
* completion of the last job involved with it. We still want to always
* signal last_job_sync so we can serialize jobs when needed.
*/
out_sync_count = do_sem_signal ? sems_info->signal_sem_count : 0;
out_syncs = set_syncs(device, &out_sync_count, sems_info->signal_sems,
device->last_job_sync);
in_syncs = set_in_syncs(device, job, &in_sync_count, sems_info);
if (!in_syncs && in_sync_count)
goto fail;
out_syncs = set_out_syncs(device, do_sem_signal, queue_sync,
&out_sync_count, sems_info);
assert(out_sync_count > 0);
if (!out_syncs)
return;
/* If we are serializing a job in a command buffer, we are already making
* it wait for completion of the last job submitted, so in that case we can
* skip waiting for any additional semaphores.
*/
in_sync_count = serialize ? 0 : sems_info->wait_sem_count;
in_syncs = set_syncs(device, &in_sync_count, sems_info->wait_sems,
(serialize ? device->last_job_sync : 0));
if (!in_syncs && in_sync_count) {
vk_free(&device->vk.alloc, out_syncs);
return;
}
goto fail;
set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
ms->wait_stage = queue;
ms->wait_stage = wait_stage;
ms->out_sync_count = out_sync_count;
ms->out_syncs = (uintptr_t)(void *)out_syncs;
ms->in_sync_count = in_sync_count;
ms->in_syncs = (uintptr_t)(void *)in_syncs;
return;
fail:
if (in_syncs)
vk_free(&device->vk.alloc, in_syncs);
assert(!out_syncs);
return;
}
static VkResult
@ -814,13 +868,14 @@ handle_cl_job(struct v3dv_queue *queue,
*/
if (device->pdevice->caps.multisync) {
struct drm_v3d_multi_sync ms = { 0 };
enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
/* We are processing all signal VkSemaphores together in the submit
* master thread and therefore we don't handle signal VkSemaphores in cl
* submission yet. For this reason, we set do_sem_signal to false in the
* multisync extension.
*/
set_multisync(&ms, sems_info, NULL, device, out_syncs, in_syncs, false,
job->serialize, needs_rcl_sync ? V3D_RENDER : V3D_BIN);
set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
false, V3DV_QUEUE_CL, wait_stage);
if (!ms.base.id)
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
@ -831,9 +886,10 @@ handle_cl_job(struct v3dv_queue *queue,
submit.in_sync_bcl = 0;
submit.out_sync = 0;
} else {
submit.in_sync_bcl = needs_bcl_sync ? device->last_job_sync : 0;
submit.in_sync_rcl = needs_rcl_sync ? device->last_job_sync : 0;
submit.out_sync = device->last_job_sync;
uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0;
submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0;
submit.out_sync = last_job_sync;
}
v3dv_clif_dump(device, job, &submit);
@ -880,8 +936,8 @@ handle_tfu_job(struct v3dv_queue *queue,
* tfu jobs yet. For this reason, we set do_sem_signal to false in the
* multisync extension.
*/
set_multisync(&ms, sems_info, NULL, device, out_syncs, in_syncs, false,
job->serialize, V3D_TFU);
set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
false, V3DV_QUEUE_TFU, V3D_TFU);
if (!ms.base.id)
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
@ -891,8 +947,9 @@ handle_tfu_job(struct v3dv_queue *queue,
job->tfu.in_sync = 0;
job->tfu.out_sync = 0;
} else {
job->tfu.in_sync = needs_sync ? device->last_job_sync : 0;
job->tfu.out_sync = device->last_job_sync;
uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
job->tfu.in_sync = needs_sync ? last_job_sync : 0;
job->tfu.out_sync = last_job_sync;
}
int ret = v3dv_ioctl(device->pdevice->render_fd,
DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
@ -943,8 +1000,8 @@ handle_csd_job(struct v3dv_queue *queue,
* csd jobs yet. For this reason, we set do_sem_signal to false in the
* multisync extension.
*/
set_multisync(&ms, sems_info, NULL, device, out_syncs, in_syncs, false,
job->serialize, V3D_CSD);
set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
false, V3DV_QUEUE_CSD, V3D_CSD);
if (!ms.base.id)
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
@ -954,8 +1011,9 @@ handle_csd_job(struct v3dv_queue *queue,
submit->in_sync = 0;
submit->out_sync = 0;
} else {
submit->in_sync = needs_sync ? device->last_job_sync : 0;
submit->out_sync = device->last_job_sync;
uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
submit->in_sync = needs_sync ? last_job_sync : 0;
submit->out_sync = last_job_sync;
}
int ret = v3dv_ioctl(device->pdevice->render_fd,
DRM_IOCTL_V3D_SUBMIT_CSD, submit);