mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-25 00:00:11 +01:00
nvk/query: Add an interleaved query layout
This gives us two options for how to lay out queries. One is optimized for space while the other is optimized for safely touching from the CPU with cached maps. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33959>
This commit is contained in:
parent
65642d9e81
commit
36fa755b19
3 changed files with 90 additions and 37 deletions
|
|
@ -6,25 +6,25 @@
|
|||
#include "nvk_query.h"
|
||||
|
||||
void
|
||||
nvk_copy_queries(uint64_t pool_addr, uint query_start, uint query_stride,
|
||||
uint first_query, uint query_count, uint64_t dst_addr,
|
||||
uint64_t dst_stride, uint flags)
|
||||
nvk_copy_queries(uint64_t pool_addr, uint available_stride,
|
||||
uint reports_start, uint report_count, uint query_stride,
|
||||
uint first_query, uint query_count,
|
||||
uint64_t dst_addr, uint64_t dst_stride, uint flags)
|
||||
{
|
||||
uint i = get_sub_group_local_id() + cl_group_id.x * 32;
|
||||
if (i >= query_count)
|
||||
return;
|
||||
|
||||
uint query = first_query + i;
|
||||
global uint *available_arr = (global uint *)pool_addr;
|
||||
bool available = available_arr[query] != 0;
|
||||
uint64_t available_offs = (uint64_t)query * (uint64_t)available_stride;
|
||||
bool available = *(global uint *)(pool_addr + available_offs);
|
||||
bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
|
||||
|
||||
uint64_t report_offs = query_start + (uint64_t)query * (uint64_t)query_stride;
|
||||
uint64_t report_offs = reports_start + (uint64_t)query * (uint64_t)query_stride;
|
||||
global struct nvk_query_report *report =
|
||||
(global void *)(pool_addr + report_offs);
|
||||
|
||||
uint64_t dst_offset = dst_stride * (uint64_t)i;
|
||||
uint num_reports = 1;
|
||||
|
||||
if (flags & NVK_QUERY_IS_TIMESTAMP) {
|
||||
/* Timestamp queries are the only ones that use a single report */
|
||||
|
|
@ -32,14 +32,8 @@ nvk_copy_queries(uint64_t pool_addr, uint query_start, uint query_stride,
|
|||
vk_write_query(dst_addr + dst_offset, 0, flags, report->timestamp);
|
||||
}
|
||||
} else {
|
||||
/* Everything that isn't a timestamp has the invariant that the
|
||||
* number of destination entries is equal to the query stride divided
|
||||
* by the size of two reports.
|
||||
*/
|
||||
num_reports = query_stride / (2 * sizeof(struct nvk_query_report));
|
||||
|
||||
if (write_results) {
|
||||
for (uint r = 0; r < num_reports; ++r) {
|
||||
for (uint r = 0; r < report_count; ++r) {
|
||||
uint delta = report[(r * 2) + 1].value - report[r * 2].value;
|
||||
|
||||
vk_write_query(dst_addr + dst_offset, r, flags, delta);
|
||||
|
|
@ -48,6 +42,6 @@ nvk_copy_queries(uint64_t pool_addr, uint query_start, uint query_stride,
|
|||
}
|
||||
|
||||
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
|
||||
vk_write_query(dst_addr + dst_offset, num_reports, flags, available);
|
||||
vk_write_query(dst_addr + dst_offset, report_count, flags, available);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -66,9 +66,7 @@ nvk_CreateQueryPool(VkDevice device,
|
|||
if (!pool)
|
||||
return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
|
||||
|
||||
/* We place the availability first and then data */
|
||||
pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
|
||||
sizeof(struct nvk_query_report));
|
||||
pool->layout = NVK_QUERY_POOL_LAYOUT_SEPARATE;
|
||||
|
||||
uint32_t reports_per_query;
|
||||
if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) {
|
||||
|
|
@ -78,11 +76,30 @@ nvk_CreateQueryPool(VkDevice device,
|
|||
/* Everything else is two queries because we have to compute a delta */
|
||||
reports_per_query = 2 * vk_query_pool_report_count(&pool->vk);
|
||||
}
|
||||
pool->query_stride = reports_per_query * sizeof(struct nvk_query_report);
|
||||
|
||||
if (pool->vk.query_count > 0) {
|
||||
uint32_t mem_size = pool->query_start +
|
||||
pool->query_stride * pool->vk.query_count;
|
||||
uint64_t mem_size = 0;
|
||||
switch (pool->layout) {
|
||||
case NVK_QUERY_POOL_LAYOUT_SEPARATE:
|
||||
pool->reports_start = align(pool->vk.query_count * sizeof(uint32_t),
|
||||
sizeof(struct nvk_query_report));
|
||||
pool->query_stride = reports_per_query * sizeof(struct nvk_query_report);
|
||||
mem_size = pool->reports_start +
|
||||
pool->vk.query_count * (uint64_t)pool->query_stride;
|
||||
break;
|
||||
|
||||
case NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED:
|
||||
pool->reports_start = sizeof(struct nvk_query_report);
|
||||
pool->query_stride =
|
||||
align((reports_per_query + 1) * sizeof(struct nvk_query_report),
|
||||
pdev->info.nc_atom_size_B);
|
||||
mem_size = pool->vk.query_count * (uint64_t)pool->query_stride;
|
||||
break;
|
||||
|
||||
default:
|
||||
UNREACHABLE("Unsupported query layout");
|
||||
}
|
||||
|
||||
if (mem_size > 0) {
|
||||
result = nvkmd_dev_alloc_mapped_mem(dev->nvkmd, &dev->vk.base,
|
||||
mem_size, 0 /* align_B */,
|
||||
NVKMD_MEM_GART,
|
||||
|
|
@ -119,11 +136,18 @@ nvk_DestroyQueryPool(VkDevice device,
|
|||
vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
nvk_query_available_stride_B(struct nvk_query_pool *pool)
|
||||
{
|
||||
return pool->layout == NVK_QUERY_POOL_LAYOUT_SEPARATE ?
|
||||
sizeof(uint32_t) : pool->query_stride;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
nvk_query_available_offset_B(struct nvk_query_pool *pool, uint32_t query)
|
||||
{
|
||||
assert(query < pool->vk.query_count);
|
||||
return query * sizeof(uint32_t);
|
||||
return query * nvk_query_available_stride_B(pool);
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
|
|
@ -142,7 +166,7 @@ static uint64_t
|
|||
nvk_query_report_offset_B(struct nvk_query_pool *pool, uint32_t query)
|
||||
{
|
||||
assert(query < pool->vk.query_count);
|
||||
return pool->query_start + query * pool->query_stride;
|
||||
return pool->reports_start + query * pool->query_stride;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
|
|
@ -200,8 +224,15 @@ nvk_ResetQueryPool(VkDevice device,
|
|||
{
|
||||
VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
|
||||
|
||||
uint32_t *available = nvk_query_available_map(pool, firstQuery);
|
||||
memset(available, 0, queryCount * sizeof(*available));
|
||||
if (pool->layout == NVK_QUERY_POOL_LAYOUT_SEPARATE) {
|
||||
uint32_t *available = nvk_query_available_map(pool, firstQuery);
|
||||
memset(available, 0, queryCount * sizeof(*available));
|
||||
} else {
|
||||
for (uint32_t i = 0; i < queryCount; i++) {
|
||||
uint32_t *available = nvk_query_available_map(pool, firstQuery + i);
|
||||
*available = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
|
|
@ -679,7 +710,9 @@ nvk_GetQueryPoolResults(VkDevice device,
|
|||
|
||||
struct nvk_copy_query_push {
|
||||
uint64_t pool_addr;
|
||||
uint32_t query_start;
|
||||
uint32_t available_stride;
|
||||
uint32_t reports_start;
|
||||
uint32_t report_count;
|
||||
uint32_t query_stride;
|
||||
uint32_t first_query;
|
||||
uint32_t query_count;
|
||||
|
|
@ -706,13 +739,15 @@ build_copy_queries_shader(void)
|
|||
|
||||
struct glsl_struct_field push_fields[] = {
|
||||
{ .type = glsl_uint64_t_type(), .name = "pool_addr", .offset = 0 },
|
||||
{ .type = glsl_uint_type(), .name = "query_start", .offset = 8 },
|
||||
{ .type = glsl_uint_type(), .name = "query_stride", .offset = 12 },
|
||||
{ .type = glsl_uint_type(), .name = "first_query", .offset = 16 },
|
||||
{ .type = glsl_uint_type(), .name = "query_count", .offset = 20 },
|
||||
{ .type = glsl_uint64_t_type(), .name = "dst_addr", .offset = 24 },
|
||||
{ .type = glsl_uint64_t_type(), .name = "dst_stride", .offset = 32 },
|
||||
{ .type = glsl_uint_type(), .name = "flags", .offset = 40 },
|
||||
{ .type = glsl_uint_type(), .name = "available_stride", .offset = 8 },
|
||||
{ .type = glsl_uint_type(), .name = "reports_start", .offset = 12 },
|
||||
{ .type = glsl_uint_type(), .name = "report_count", .offset = 16 },
|
||||
{ .type = glsl_uint_type(), .name = "query_stride", .offset = 20 },
|
||||
{ .type = glsl_uint_type(), .name = "first_query", .offset = 24 },
|
||||
{ .type = glsl_uint_type(), .name = "query_count", .offset = 28 },
|
||||
{ .type = glsl_uint64_t_type(), .name = "dst_addr", .offset = 32 },
|
||||
{ .type = glsl_uint64_t_type(), .name = "dst_stride", .offset = 40 },
|
||||
{ .type = glsl_uint_type(), .name = "flags", .offset = 48 },
|
||||
};
|
||||
const struct glsl_type *push_iface_type =
|
||||
glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
|
||||
|
|
@ -726,7 +761,8 @@ build_copy_queries_shader(void)
|
|||
nvk_copy_queries(b, load_struct_var(b, push, 0), load_struct_var(b, push, 1),
|
||||
load_struct_var(b, push, 2), load_struct_var(b, push, 3),
|
||||
load_struct_var(b, push, 4), load_struct_var(b, push, 5),
|
||||
load_struct_var(b, push, 6), load_struct_var(b, push, 7));
|
||||
load_struct_var(b, push, 6), load_struct_var(b, push, 7),
|
||||
load_struct_var(b, push, 8), load_struct_var(b, push, 9));
|
||||
|
||||
return build.shader;
|
||||
}
|
||||
|
|
@ -790,7 +826,9 @@ nvk_meta_copy_query_pool_results(struct nvk_cmd_buffer *cmd,
|
|||
|
||||
const struct nvk_copy_query_push push = {
|
||||
.pool_addr = pool->mem->va->addr,
|
||||
.query_start = pool->query_start,
|
||||
.available_stride = nvk_query_available_stride_B(pool),
|
||||
.reports_start = pool->reports_start,
|
||||
.report_count = vk_query_pool_report_count(&pool->vk),
|
||||
.query_stride = pool->query_stride,
|
||||
.first_query = first_query,
|
||||
.query_count = query_count,
|
||||
|
|
|
|||
|
|
@ -11,10 +11,31 @@
|
|||
|
||||
struct nvkmd_mem;
|
||||
|
||||
enum nvk_query_pool_layout {
|
||||
/** Stores the availables and query reports as separate arrays.
|
||||
*
|
||||
* This uses less memory and is optimized for being able to memset a pile
|
||||
* of availables in one go. In this layout, the query reports start at
|
||||
* reports_start and are every query_stride.
|
||||
*/
|
||||
NVK_QUERY_POOL_LAYOUT_SEPARATE,
|
||||
|
||||
/** Stores the availables and reports interleaved in aligned chunks
|
||||
*
|
||||
* This uses more memory but ensures that each query is aligned to a CPU
|
||||
 * cache line boundary for safe non-coherent access. In this layout, the
|
||||
* available is the first 4 bytes of the query and the reports start at
|
||||
* byte 16.
|
||||
*/
|
||||
NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED,
|
||||
};
|
||||
|
||||
struct nvk_query_pool {
|
||||
struct vk_query_pool vk;
|
||||
|
||||
uint32_t query_start;
|
||||
enum nvk_query_pool_layout layout;
|
||||
|
||||
uint32_t reports_start;
|
||||
uint32_t query_stride;
|
||||
|
||||
struct nvkmd_mem *mem;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue