v3dv: reimplement occlusion queries

Our implementation was mostly CPU-based, with things such as query resets and result copying handled on the CPU, as well as some aspects of query availability tracking. This new implementation handles all GPU-side query functions by dispatching compute shaders to push the work to the GPU. This involves query availability, reset and result copying. For now, only occlusion queries are managed this way. Performance queries can also be implemented in a similar fashion in the future with some additional work. For timestamp queries, however, our only option to improve this would be to take the actual timestamp in the kernel, since we can't take a timestamp from a shader.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19770>
2026-05-08 19:58:09 +02:00 · 2022-10-28 12:07:07 +02:00 · 2022-10-28 12:07:07 +02:00 · 7a65b3f006
commit 7a65b3f006
parent 4050086439
6 changed files with 1379 additions and 226 deletions
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@ -565,22 +565,41 @@ v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
 }

 static void
-cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
+cmd_buffer_emit_end_query_cpu(struct v3dv_cmd_buffer *cmd_buffer,
+                              struct v3dv_query_pool *pool,
+                              uint32_t query, uint32_t count)
+{
+   assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+   struct v3dv_job *job =
+      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                     V3DV_JOB_TYPE_CPU_END_QUERY,
+                                     cmd_buffer, -1);
+   v3dv_return_if_oom(cmd_buffer, NULL);
+
+   job->cpu.query_end.pool = pool;
+   job->cpu.query_end.query = query;
+   job->cpu.query_end.count = count;
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
+}
+
+static void
+cmd_buffer_add_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
 {
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   if (state->query.end.used_count > 0) {
-      const uint32_t query_count = state->query.end.used_count;
-      for (uint32_t i = 0; i < query_count; i++) {
+      const uint32_t count = state->query.end.used_count;
+      for (uint32_t i = 0; i < count; i++) {
         assert(i < state->query.end.used_count);
-         struct v3dv_job *job =
-            v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
-                                           V3DV_JOB_TYPE_CPU_END_QUERY,
-                                           cmd_buffer, -1);
-         v3dv_return_if_oom(cmd_buffer, NULL);
-
-         job->cpu.query_end = state->query.end.states[i];
-         list_addtail(&job->list_link, &cmd_buffer->jobs);
+         struct v3dv_end_query_info *info = &state->query.end.states[i];
+          if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+            v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, info->pool,
+                                                        info->query, info->count, 1);
+         } else {
+            cmd_buffer_emit_end_query_cpu(cmd_buffer, info->pool,
+                                          info->query, info->count);
+         }
      }
      state->query.end.used_count = 0;
   }
@ -650,14 +669,14 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
   cmd_buffer->state.job = NULL;

   /* If we have recorded any state with this last GPU job that requires to
-    * emit CPU jobs after the job is completed, add them now. The only
-    * exception is secondary command buffers inside a render pass, because in
+    * emit jobs after the job is completed, add them now. The only exception
+    * is secondary command buffers inside a render pass, because in
    * that case we want to defer this until we finish recording the primary
    * job into which we execute the secondary.
    */
   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
       !cmd_buffer->state.pass) {
-      cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
+      cmd_buffer_add_jobs_for_pending_state(cmd_buffer);
   }
 }

@ -765,7 +784,7 @@ v3dv_job_init(struct v3dv_job *job,
      cmd_buffer->state.dirty = ~0;
      cmd_buffer->state.dirty_descriptor_stages = ~0;

-      /* Honor inheritance of occlussion queries in secondaries if requested */
+      /* Honor inheritance of occlusion queries in secondaries if requested */
      if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
          cmd_buffer->state.inheritance.occlusion_query_enable) {
         cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
@ -3495,34 +3514,6 @@ v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer,
   state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
 }

-void
-v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,
-                              struct v3dv_query_pool *pool,
-                              uint32_t first,
-                              uint32_t count)
-{
-   /* Resets can only happen outside a render pass instance so we should not
-    * be in the middle of job recording.
-    */
-   assert(cmd_buffer->state.pass == NULL);
-   assert(cmd_buffer->state.job == NULL);
-
-   assert(first < pool->query_count);
-   assert(first + count <= pool->query_count);
-
-   struct v3dv_job *job =
-      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
-                                     V3DV_JOB_TYPE_CPU_RESET_QUERIES,
-                                     cmd_buffer, -1);
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   job->cpu.query_reset.pool = pool;
-   job->cpu.query_reset.first = first;
-   job->cpu.query_reset.count = count;
-
-   list_addtail(&job->list_link, &cmd_buffer->jobs);
-}
-
 void
 v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
                                   uint32_t slot_size,
@ -3562,8 +3553,9 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
      /* FIXME: we only support one active occlusion query for now */
      assert(cmd_buffer->state.query.active_query.bo == NULL);

-      cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
-      cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
+      cmd_buffer->state.query.active_query.bo = pool->occlusion.bo;
+      cmd_buffer->state.query.active_query.offset =
+         pool->queries[query].occlusion.offset;
      cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
@ -3591,22 +3583,25 @@ v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
                                   uint32_t query)
 {
   assert(query < pool->query_count);
+   assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
+          pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

+   /* For occlusion queries in the middle of a render pass we don't want to
+    * split the current job at the EndQuery just to emit query availability.
+    * Instead, we queue this state in the command buffer and emit it when
+    * we finish the current job.
+    */
   if  (cmd_buffer->state.pass &&
-        pool->query_type != VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
-      /* Queue the EndQuery in the command buffer state, we will create a CPU
-       * job to flag all of these queries as possibly available right after the
-       * render pass job in which they have been recorded.
-       */
+        pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
      struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
      v3dv_cmd_buffer_ensure_array_state(cmd_buffer,
-                                         sizeof(struct v3dv_end_query_cpu_job_info),
+                                         sizeof(struct v3dv_end_query_info),
                                         state->query.end.used_count,
                                         &state->query.end.alloc_count,
                                         (void **) &state->query.end.states);
      v3dv_return_if_oom(cmd_buffer, NULL);

-      struct v3dv_end_query_cpu_job_info *info =
+      struct v3dv_end_query_info *info =
         &state->query.end.states[state->query.end.used_count++];

      info->pool = pool;
@ -3633,20 +3628,15 @@ v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
         info->count = util_bitcount(subpass->view_mask);
      }
   } else {
-      /* Otherwise, schedule the CPU job immediately */
-      struct v3dv_job *job =
-         v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
-                                        V3DV_JOB_TYPE_CPU_END_QUERY,
-                                        cmd_buffer, -1);
-      v3dv_return_if_oom(cmd_buffer, NULL);
-
-      job->cpu.query_end.pool = pool;
-      job->cpu.query_end.query = query;
-
-      /* Multiview queries cannot cross subpass boundaries */
-      job->cpu.query_end.count = 1;
-
-      list_addtail(&job->list_link, &cmd_buffer->jobs);
+      /* Otherwise, schedule the end query job immediately.
+       *
+       * Multiview queries cannot cross subpass boundaries, so query count is
+       * always 1.
+       */
+       if (pool->query_type == VK_QUERY_TYPE_OCCLUSION)
+         v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, pool, query, 1, 1);
+       else
+         cmd_buffer_emit_end_query_cpu(cmd_buffer, pool, query, 1);
   }
 }

@ -3699,42 +3689,6 @@ void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
   }
 }

-void
-v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
-                                   struct v3dv_query_pool *pool,
-                                   uint32_t first,
-                                   uint32_t count,
-                                   struct v3dv_buffer *dst,
-                                   uint32_t offset,
-                                   uint32_t stride,
-                                   VkQueryResultFlags flags)
-{
-   /* Copies can only happen outside a render pass instance so we should not
-    * be in the middle of job recording.
-    */
-   assert(cmd_buffer->state.pass == NULL);
-   assert(cmd_buffer->state.job == NULL);
-
-   assert(first < pool->query_count);
-   assert(first + count <= pool->query_count);
-
-   struct v3dv_job *job =
-      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
-                                     V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
-                                     cmd_buffer, -1);
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   job->cpu.query_copy_results.pool = pool;
-   job->cpu.query_copy_results.first = first;
-   job->cpu.query_copy_results.count = count;
-   job->cpu.query_copy_results.dst = dst;
-   job->cpu.query_copy_results.offset = offset;
-   job->cpu.query_copy_results.stride = stride;
-   job->cpu.query_copy_results.flags = flags;
-
-   list_addtail(&job->list_link, &cmd_buffer->jobs);
-}
-
 void
 v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
                            struct drm_v3d_submit_tfu *tfu)
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@ -2113,6 +2113,10 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
      goto fail;
   }

+   result = v3dv_query_allocate_resources(device);
+   if (result != VK_SUCCESS)
+      goto fail;
+
   *pDevice = v3dv_device_to_handle(device);

   return VK_SUCCESS;
@ -2123,6 +2127,8 @@ fail:
   queue_finish(&device->queue);
   destroy_device_meta(device);
   v3dv_pipeline_cache_finish(&device->default_pipeline_cache);
+   v3dv_event_free_resources(device);
+   v3dv_query_free_resources(device);
   vk_device_finish(&device->vk);
   vk_free(&device->vk.alloc, device);

@ -2141,6 +2147,8 @@ v3dv_DestroyDevice(VkDevice _device,
   v3dv_event_free_resources(device);
   mtx_destroy(&device->events.lock);

+   v3dv_query_free_resources(device);
+
   destroy_device_meta(device);
   v3dv_pipeline_cache_finish(&device->default_pipeline_cache);

--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@ -556,6 +556,31 @@ struct v3dv_device {
      VkPipeline wait_event_pipeline;
   } events;

+   /* Query handling resources.
+    *
+    * Our implementation of occlusion queries uses a BO per pool to keep track
+    * of the per-query availability state and dispatches compute shaders to
+    * handle GPU query functions that read and write that state. This struct
+    * holds Vulkan resources that can be shared across all query pools to
+    * implement this. This framework may be extended in the future to handle
+    * more query types.
+    */
+   struct {
+      VkDescriptorSetLayout buf_descriptor_set_layout;
+
+      /* Set query availability */
+      VkPipelineLayout avail_pipeline_layout;
+      VkPipeline avail_pipeline;
+
+      /* Reset query availability and clear occlusion counters */
+      VkPipelineLayout reset_occlusion_pipeline_layout;
+      VkPipeline reset_occlusion_pipeline;
+
+      /* Copy query results */
+      VkPipelineLayout copy_pipeline_layout;
+      VkPipeline copy_pipeline;
+   } queries;
+
   struct v3dv_pipeline_cache default_pipeline_cache;

   /* GL_SHADER_STATE_RECORD needs to specify default attribute values. The
@ -1026,7 +1051,7 @@ struct v3dv_reset_query_cpu_job_info {
   uint32_t count;
 };

-struct v3dv_end_query_cpu_job_info {
+struct v3dv_end_query_info {
   struct v3dv_query_pool *pool;
   uint32_t query;

@ -1218,7 +1243,7 @@ struct v3dv_job {
   /* Job specs for CPU jobs */
   union {
      struct v3dv_reset_query_cpu_job_info          query_reset;
-      struct v3dv_end_query_cpu_job_info            query_end;
+      struct v3dv_end_query_info                    query_end;
      struct v3dv_copy_query_results_cpu_job_info   query_copy_results;
      struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
      struct v3dv_csd_indirect_cpu_job_info         csd_indirect;
@ -1449,7 +1474,7 @@ struct v3dv_cmd_buffer_state {
      struct {
         uint32_t used_count;
         uint32_t alloc_count;
-         struct v3dv_end_query_cpu_job_info *states;
+         struct v3dv_end_query_info *states;
      } end;

      struct {
@ -1498,13 +1523,19 @@ struct v3dv_descriptor {
 };

 struct v3dv_query {
+   /* Used by queries where we implement result copying in the CPU so we can
+    * tell if the relevant jobs have been submitted for execution. Currently
+    * this covers every query type except occlusion queries.
+    */
   bool maybe_available;
+
   union {
-      /* Used by GPU queries (occlusion) */
+      /* Used by occlusion queries */
      struct {
-         struct v3dv_bo *bo;
+         /* Offset of this query in the occlusion query counter BO */
         uint32_t offset;
-      };
+      } occlusion;
+
      /* Used by CPU queries (timestamp) */
      uint64_t value;

@ -1516,7 +1547,36 @@ struct v3dv_query {
 struct v3dv_query_pool {
   struct vk_object_base base;

-   struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */
+   /* Availability state for each query in the pool. Only used with occlusion
+    * queries for now, but could be used by other query types in the future.
+    */
+   struct v3dv_bo *avail_bo;
+
+   /* Per-pool Vulkan resources required to implement GPU-side query
+    * functions (only occlusion queries for now).
+    */
+   struct {
+      /* Buffer to access query availability state */
+      VkBuffer avail_buf;
+      VkDeviceMemory avail_mem;
+
+      /* Buffer to access occlusion query results */
+      VkBuffer res_buf;
+      VkDeviceMemory res_mem;
+
+      VkDescriptorPool descriptor_pool;
+
+      /* Two descriptor sets: one for accessing the availability buffer and
+       * another for the buffer with the occlusion query results.
+       */
+      VkDescriptorSet descriptor_sets[2];
+   } meta;
+
+   /* Only used with occlusion queries */
+   struct {
+      /* BO with the occlusion counters */
+      struct v3dv_bo *bo;
+   } occlusion;

   /* Only used with performance queries */
   struct {
@ -1537,18 +1597,29 @@ struct v3dv_query_pool {
   struct v3dv_query *queries;
 };

-VkResult v3dv_get_query_pool_results(struct v3dv_device *device,
-                                     struct v3dv_query_pool *pool,
-                                     uint32_t first,
-                                     uint32_t count,
-                                     void *data,
-                                     VkDeviceSize stride,
-                                     VkQueryResultFlags flags);
+VkResult
+v3dv_query_allocate_resources(struct v3dv_device *device);

-void v3dv_reset_query_pools(struct v3dv_device *device,
-                            struct v3dv_query_pool *query_pool,
-                            uint32_t first,
-                            uint32_t last);
+void
+v3dv_query_free_resources(struct v3dv_device *device);
+
+VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
+                                         struct v3dv_query_pool *pool,
+                                         uint32_t first,
+                                         uint32_t count,
+                                         void *data,
+                                         VkDeviceSize stride,
+                                         VkQueryResultFlags flags);
+
+void v3dv_reset_query_pool_cpu(struct v3dv_device *device,
+                               struct v3dv_query_pool *query_pool,
+                               uint32_t first,
+                               uint32_t last);
+
+void v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer,
+                                                 struct v3dv_query_pool *pool,
+                                                 uint32_t query, uint32_t count,
+                                                 uint8_t availability);

 typedef void (*v3dv_cmd_buffer_private_obj_destroy_cb)(VkDevice device,
                                                       uint64_t pobj,
@ -1597,6 +1668,10 @@ struct v3dv_cmd_buffer {
         /* The current descriptor pool for texel buffer copy sources */
         VkDescriptorPool dspool;
      } texel_buffer_copy;
+      struct {
+         /* The current descriptor pool for the copy query results output buffer */
+         VkDescriptorPool dspool;
+      } query;
   } meta;

   /* List of jobs in the command buffer. For primary command buffers it
@ -1625,11 +1700,6 @@ void v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
                                    uint32_t dirty_dynamic_state,
                                    bool needs_subpass_resume);

-void v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,
-                                   struct v3dv_query_pool *pool,
-                                   uint32_t first,
-                                   uint32_t count);
-
 void v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
                                 struct v3dv_query_pool *pool,
                                 uint32_t query,
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@ -135,7 +135,7 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
    * we handle those in the CPU.
    */
   if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
-      v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
+      v3dv_bo_wait(job->device, info->pool->occlusion.bo, PIPE_TIMEOUT_INFINITE);

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      struct vk_sync_wait waits[info->count];
@ -160,7 +160,7 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
         return result;
   }

-   v3dv_reset_query_pools(job->device, info->pool, info->first, info->count);
+   v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);

   return VK_SUCCESS;
 }
@ -218,12 +218,15 @@ handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)

   mtx_lock(&job->device->query_mutex);

-   struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
+   struct v3dv_end_query_info *info = &job->cpu.query_end;
   struct v3dv_queue *queue = &job->device->queue;

   int err = 0;
   int fd = -1;

+   assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
+          info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
+
   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      result = export_perfmon_last_job_sync(queue, job, &fd);

@ -268,6 +271,9 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job)
   struct v3dv_copy_query_results_cpu_job_info *info =
      &job->cpu.query_copy_results;

+   assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
+          info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
+
   assert(info->dst && info->dst->mem && info->dst->mem->bo);
   struct v3dv_bo *bo = info->dst->mem->bo;

@ -278,13 +284,13 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job)

   uint8_t *offset = ((uint8_t *) bo->map) +
                     info->offset + info->dst->mem_offset;
-   v3dv_get_query_pool_results(job->device,
-                               info->pool,
-                               info->first,
-                               info->count,
-                               offset,
-                               info->stride,
-                               info->flags);
+   v3dv_get_query_pool_results_cpu(job->device,
+                                   info->pool,
+                                   info->first,
+                                   info->count,
+                                   offset,
+                                   info->stride,
+                                   info->flags);

   return VK_SUCCESS;
 }
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@ -1598,17 +1598,17 @@ cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
   const uint32_t total_state_count =
      p_state->query.end.used_count + s_state->query.end.used_count;
   v3dv_cmd_buffer_ensure_array_state(primary,
-                                      sizeof(struct v3dv_end_query_cpu_job_info),
+                                      sizeof(struct v3dv_end_query_info),
                                      total_state_count,
                                      &p_state->query.end.alloc_count,
                                      (void **) &p_state->query.end.states);
   v3dv_return_if_oom(primary, NULL);

   for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
-      const struct v3dv_end_query_cpu_job_info *s_qstate =
+      const struct v3dv_end_query_info *s_qstate =
         &secondary->state.query.end.states[i];

-      struct v3dv_end_query_cpu_job_info *p_qstate =
+      struct v3dv_end_query_info *p_qstate =
         &p_state->query.end.states[p_state->query.end.used_count++];

      p_qstate->pool = s_qstate->pool;