From 7961641592ad2fc5f64775cc645633e02398ff7d Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Sun, 3 May 2026 18:50:14 +0200 Subject: [PATCH 01/12] nvk: Use SET_REFERENCE in nvk_CmdResetQueryPool Instead of trying to wait for each available values to be zeroed, we now use NV906F_SET_REFERENCE. NV906F_SET_REFERENCE behaves like a FE WFI+MEMBAR on the command processor itself, ensuring ordering. Signed-off-by: Mary Guillemard Reviewed-by: Mel Henning --- src/nouveau/vulkan/nvk_query_pool.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index 7082d95dc96..9b8dcca645c 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -29,6 +29,8 @@ #include "nv_push_cla0c0.h" #include "nv_push_clc597.h" #include "nv_push_clc7c0.h" +#include "nv_push_clc86f.h" +#include "nv_push_clcb97.h" static uint32_t vk_query_pool_report_count(const struct vk_query_pool *vk_pool) @@ -277,6 +279,8 @@ nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer, { VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); + const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); + const struct nvk_physical_device *pdev = nvk_device_physical(dev); for (uint32_t i = 0; i < queryCount; i++) { uint64_t addr = nvk_query_available_addr(pool, firstQuery + i); @@ -299,19 +303,17 @@ nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer, * will see the query as unavailable if it happens before the query is * completed again. 
*/ - for (uint32_t i = 0; i < queryCount; i++) { - uint64_t addr = nvk_query_available_addr(pool, firstQuery + i); - - struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); - __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA); - P_NV906F_SEMAPHOREA(p, addr >> 32); - P_NV906F_SEMAPHOREB(p, (addr & UINT32_MAX) >> 2); - P_NV906F_SEMAPHOREC(p, 0); - P_NV906F_SEMAPHORED(p, { - .operation = OPERATION_ACQUIRE, - .acquire_switch = ACQUIRE_SWITCH_ENABLED, - .release_size = RELEASE_SIZE_4BYTE, - }); + if (pdev->info.cls_eng3d >= HOPPER_A) { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 7); + P_IMMD(p, NVC86F, WFI, 0); + P_MTHD(p, NVC86F, MEM_OP_A); + P_NVC86F_MEM_OP_A(p, {}); + P_NVC86F_MEM_OP_B(p, 0); + P_NVC86F_MEM_OP_C(p, { .membar_type = 0 }); + P_NVC86F_MEM_OP_D(p, { .operation = OPERATION_MEMBAR }); + } else { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 1); + __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0); } } From cd4c873c12bb0c6634e36a7fe62d4441e7f6f178 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 2 May 2026 18:02:49 +0200 Subject: [PATCH 02/12] nvk: Use nvk_cmd_fill_memory in CmdResetQueryPool when possible This avoid relying on semaphore releases when we have multiple queries being reset. 
NVIDIA proprietary driver is performing the same however it do fill the full report instead (as it also contain the available part packed with it) Signed-off-by: Mary Guillemard Reviewed-by: Mel Henning --- src/nouveau/vulkan/nvk_query_pool.c | 31 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index 9b8dcca645c..76043708477 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -282,20 +282,25 @@ nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer, const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); const struct nvk_physical_device *pdev = nvk_device_physical(dev); - for (uint32_t i = 0; i < queryCount; i++) { - uint64_t addr = nvk_query_available_addr(pool, firstQuery + i); + if (queryCount > 1 && pool->layout == NVK_QUERY_POOL_LAYOUT_SEPARATE) { + uint64_t addr = nvk_query_available_addr(pool, firstQuery); + nvk_cmd_fill_memory(cmd, addr, queryCount * sizeof(uint32_t), 0); + } else { + for (uint32_t i = 0; i < queryCount; i++) { + uint64_t addr = nvk_query_available_addr(pool, firstQuery + i); - struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); - P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); - P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32); - P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr); - P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); - P_NV9097_SET_REPORT_SEMAPHORE_D(p, { - .operation = OPERATION_RELEASE, - .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, - .pipeline_location = PIPELINE_LOCATION_ALL, - .structure_size = STRUCTURE_SIZE_ONE_WORD, - }); + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, 
+ .pipeline_location = PIPELINE_LOCATION_ALL, + .structure_size = STRUCTURE_SIZE_ONE_WORD, + }); + } } /* Wait for the above writes to complete. This prevents WaW hazards on any From 9d0532a669c162844f36d6f6ffd650f283609b0e Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Sun, 3 May 2026 15:31:40 +0200 Subject: [PATCH 03/12] nvk: use MME shadow RAM in nvk_meta begin/end We are going to conditionally change SET_ZPASS_PIXEL_COUNT and SET_STATISTICS_COUNTER in queries handling. To prepare for that we now use the MME shadow RAM to restore the previous state. Signed-off-by: Mary Guillemard Reviewed-by: Mel Henning --- src/nouveau/vulkan/nvk_cmd_meta.c | 37 +++++++++---------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/src/nouveau/vulkan/nvk_cmd_meta.c b/src/nouveau/vulkan/nvk_cmd_meta.c index d58f2ca4df2..a1f79df3c7f 100644 --- a/src/nouveau/vulkan/nvk_cmd_meta.c +++ b/src/nouveau/vulkan/nvk_cmd_meta.c @@ -78,10 +78,9 @@ nvk_meta_begin(struct nvk_cmd_buffer *cmd, { const struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors; - struct nv_push *p = nvk_cmd_buffer_push(cmd, 6); - + struct nv_push *p = nvk_cmd_buffer_push(cmd, 10); + P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_PASSTHROUGH); P_IMMD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE, MODE_ALWAYS_RENDER); - P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, { .da_vertices_generated_enable = false, .da_primitives_generated_enable = false, @@ -99,8 +98,8 @@ nvk_meta_begin(struct nvk_cmd_buffer *cmd, .total_streaming_primitives_needed_succeeded_enable = false, .vtg_primitives_out_enable = false, }); - - P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, false); + P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_FALSE); + P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_TRACK_WITH_FILTER); save->dynamic = cmd->vk.dynamic_graphics_state; save->_dynamic_vi = cmd->state.gfx._dynamic_vi; @@ -189,29 +188,13 @@ nvk_meta_end(struct nvk_cmd_buffer *cmd, 
nvk_descriptor_state_set_root_array(cmd, desc, push, 0, sizeof(save->push), save->push); - struct nv_push *p = nvk_cmd_buffer_push(cmd, 6); - - P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, true); - - P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, { - .da_vertices_generated_enable = true, - .da_primitives_generated_enable = true, - .vs_invocations_enable = true, - .gs_invocations_enable = true, - .gs_primitives_generated_enable = true, - .streaming_primitives_succeeded_enable = true, - .streaming_primitives_needed_enable = true, - .clipper_invocations_enable = true, - .clipper_primitives_generated_enable = true, - .ps_invocations_enable = true, - .ti_invocations_enable = true, - .ts_invocations_enable = true, - .ts_primitives_generated_enable = true, - .total_streaming_primitives_needed_succeeded_enable = true, - .vtg_primitives_out_enable = true, - }); - + /* Replay the previous state from shadow RAM */ + struct nv_push *p = nvk_cmd_buffer_push(cmd, 10); + P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_REPLAY); + P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_FALSE); + P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, {}); P_IMMD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE, MODE_USE_RENDER_ENABLE); + P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_TRACK_WITH_FILTER); } VKAPI_ATTR void VKAPI_CALL From d0426cd9045b44830dd1e536c534a91596dba175 Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Sun, 3 May 2026 15:58:03 +0200 Subject: [PATCH 04/12] nvk: Move nv_push closer to their uses in nvk_cmd_begin_end_query It was a nightmare to track, let's simplify this. 
Signed-off-by: Mary Guillemard Reviewed-by: Mel Henning --- src/nouveau/vulkan/nvk_query_pool.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index 76043708477..e8fe46c5bb0 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -471,12 +471,9 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, uint64_t report_addr = nvk_query_report_addr(pool, query) + end * sizeof(struct nvk_query_report); - uint32_t end_size = 7 * end; - - struct nv_push *p; switch (pool->vk.query_type) { - case VK_QUERY_TYPE_OCCLUSION: - p = nvk_cmd_buffer_push(cmd, 5 + end_size); + case VK_QUERY_TYPE_OCCLUSION: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); @@ -490,10 +487,11 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, .flush_disable = true, }); break; + } case VK_QUERY_TYPE_PIPELINE_STATISTICS: { uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics); - p = nvk_cmd_buffer_push(cmd, stat_count * 5 + end_size); + struct nv_push *p = nvk_cmd_buffer_push(cmd, stat_count * 5); ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics; for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) { @@ -536,7 +534,7 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_SUCCEEDED, NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_NEEDED, }; - p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports) + end_size); + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports)); for (uint32_t i = 0; i < ARRAY_SIZE(xfb_reports); ++i) { P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); @@ -555,8 +553,8 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, break; } - case 
VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - p = nvk_cmd_buffer_push(cmd, 5 + end_size); + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); @@ -571,12 +569,14 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, .flush_disable = true, }); break; + } default: UNREACHABLE("Unsupported query type"); } if (end) { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 7); P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0); uint64_t available_addr = nvk_query_available_addr(pool, query); From 1a8e75f56af21a0c27a7e76e1baa2ee86dbae6ac Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Sun, 3 May 2026 16:59:08 +0200 Subject: [PATCH 05/12] nvk: Clear counters at the begin of a query Like the proprietary driver, let's clear counters at the beginning of a query. Signed-off-by: Mary Guillemard Reviewed-by: Mel Henning --- src/nouveau/vulkan/nvk_query_pool.c | 152 ++++++++++++++++++++++------ 1 file changed, 122 insertions(+), 30 deletions(-) diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index e8fe46c5bb0..fbeb365f3c1 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -384,49 +384,60 @@ struct nvk_3d_stat_query { VkQueryPipelineStatisticFlagBits flag; uint8_t loc; uint8_t report; + uint8_t clear_type; }; /* This must remain sorted in flag order */ static const struct nvk_3d_stat_query nvk_3d_stat_queries[] = {{ - .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED, + .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED, + .clear_type = 
NV9097_CLEAR_REPORT_VALUE_TYPE_DA_VERTICES_GENERATED, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED, + .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_DA_PRIMITIVES_GENERATED, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_VS_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_GS_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED, + .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED, + .clear_type = 
NV9097_CLEAR_REPORT_VALUE_TYPE_GS_PRIMITIVES_GENERATED, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */ - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_CLIPPER_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */ - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED, + .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_CLIPPER_PRIMITIVES_GENERATED, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_PS_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS, + .clear_type = 
NV9097_CLEAR_REPORT_VALUE_TYPE_TI_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_TS_INVOCATIONS, }, { .flag = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT, .loc = UINT8_MAX, @@ -459,6 +470,68 @@ nvk_mme_write_cs_invocations(struct mme_builder *b) mme_store_global(b, mme_add64(b, dst_addr, mme_imm64(4)), accum.hi); } +static void +nvk_cmd_clear_report_value(struct nvk_cmd_buffer *cmd, + struct nvk_query_pool *pool) +{ + const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); + const struct nvk_physical_device *pdev = nvk_device_physical(dev); + + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_ZPASS_PIXEL_CNT); + break; + } + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics); + struct nv_push *p = nvk_cmd_buffer_push(cmd, stat_count * 2); + + ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics; + for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) { + const struct nvk_3d_stat_query *sq = &nvk_3d_stat_queries[i]; + if (!(stats_left & sq->flag)) + continue; + + /* The 3D stat queries array MUST be sorted */ + assert(!(stats_left & (sq->flag - 1))); + + if (sq->flag == VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT) { + if (pdev->info.cls_compute >= AMPERE_COMPUTE_B) { + P_IMMD_WORD(p, NVC7C0, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI), 0); + P_IMMD_WORD(p, NVC7C0, 
SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO), 0); + } + else { + P_IMMD_WORD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI), 0); + P_IMMD_WORD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO), 0); + } + } else { + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, sq->clear_type); + } + + stats_left &= ~sq->flag; + } + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 4); + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_STREAMING_PRIMITIVES_SUCCEEDED); + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_STREAMING_PRIMITIVES_NEEDED); + break; + } + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_VTG_PRIMITIVES_OUT); + break; + } + + default: + UNREACHABLE("Unsupported query type"); + } +} + static void nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, struct nvk_query_pool *pool, @@ -603,6 +676,25 @@ nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); + /* From the Vulkan 1.4.350 spec, vkCmdBeginQuery: + * + * VUID-vkCmdBeginQuery-queryPool-01922 + * + * "queryPool must have been created with a queryType that differs from + * that of any queries that are active within commandBuffer" + * + * and + * + * "After beginning a query, that query is considered active within the + * command buffer it was called in until that same query is ended. + * Queries active in a primary command buffer when secondary command + * buffers are executed are considered active for those secondary command + * buffers." + * + * This means we will never have two queries with the same type active and + * can rely on cleaning counters. 
+ */ + nvk_cmd_clear_report_value(cmd, pool); nvk_cmd_begin_end_query(cmd, pool, query, index, false); } From 75f898057902aeed7f595efc729ab2eeeb4fd87f Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Sun, 3 May 2026 17:29:46 +0200 Subject: [PATCH 06/12] nvk: Remove delta handling from query pool Now that we reset counters, we do not need to handle delta for anything. Signed-off-by: Mary Guillemard Reviewed-by: Mel Henning --- src/nouveau/vulkan/cl/nvk_query.cl | 4 +- src/nouveau/vulkan/nvk_query_pool.c | 144 ++++++++++++---------------- 2 files changed, 60 insertions(+), 88 deletions(-) diff --git a/src/nouveau/vulkan/cl/nvk_query.cl b/src/nouveau/vulkan/cl/nvk_query.cl index 72b3a08e5e6..580521ca4ca 100644 --- a/src/nouveau/vulkan/cl/nvk_query.cl +++ b/src/nouveau/vulkan/cl/nvk_query.cl @@ -34,9 +34,7 @@ nvk_copy_queries(uint64_t pool_addr, uint available_stride, } else { if (write_results) { for (uint r = 0; r < report_count; ++r) { - uint delta = report[(r * 2) + 1].value - report[r * 2].value; - - vk_write_query(dst_addr + dst_offset, r, flags, delta); + vk_write_query(dst_addr + dst_offset, r, flags, report[r].value); } } } diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index fbeb365f3c1..0436d2df662 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -76,14 +76,8 @@ nvk_CreateQueryPool(VkDevice device, else pool->layout = NVK_QUERY_POOL_LAYOUT_SEPARATE; - uint32_t reports_per_query; - if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) { - /* Timestamps are just a single timestamp */ - reports_per_query = 1; - } else { - /* Everything else is two queries because we have to compute a delta */ - reports_per_query = 2 * vk_query_pool_report_count(&pool->vk); - } + /* Everything is a single query per report */ + uint32_t reports_per_query = vk_query_pool_report_count(&pool->vk); uint64_t mem_size = 0; switch (pool->layout) { @@ -532,17 +526,50 @@ 
nvk_cmd_clear_report_value(struct nvk_cmd_buffer *cmd, } } -static void -nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, - struct nvk_query_pool *pool, - uint32_t query, uint32_t index, - bool end) +VKAPI_ATTR void VKAPI_CALL +nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + VkQueryControlFlags flags, + uint32_t index) { + VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); + + /* From the Vulkan 1.4.350 spec, vkCmdBeginQuery: + * + * VUID-vkCmdBeginQuery-queryPool-01922 + * + * "queryPool must have been created with a queryType that differs from + * that of any queries that are active within commandBuffer" + * + * and + * + * "After beginning a query, that query is considered active within the + * command buffer it was called in until that same query is ended. + * Queries active in a primary command buffer when secondary command + * buffers are executed are considered active for those secondary command + * buffers." + * + * This means we will never have two queries with the same type active and + * can rely on cleaning counters. 
+ */ + nvk_cmd_clear_report_value(cmd, pool); +} + +VKAPI_ATTR void VKAPI_CALL +nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + uint32_t index) +{ + VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); + const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); const struct nvk_physical_device *pdev = nvk_device_physical(dev); - uint64_t report_addr = nvk_query_report_addr(pool, query) + - end * sizeof(struct nvk_query_report); + uint64_t report_addr = nvk_query_report_addr(pool, query); switch (pool->vk.query_type) { case VK_QUERY_TYPE_OCCLUSION: { @@ -596,7 +623,7 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, }); } - report_addr += 2 * sizeof(struct nvk_query_report); + report_addr += sizeof(struct nvk_query_report); stats_left &= ~sq->flag; } break; @@ -621,7 +648,7 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, .sub_report = index, .flush_disable = true, }); - report_addr += 2 * sizeof(struct nvk_query_report); + report_addr += sizeof(struct nvk_query_report); } break; } @@ -648,66 +675,21 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, UNREACHABLE("Unsupported query type"); } - if (end) { - struct nv_push *p = nvk_cmd_buffer_push(cmd, 7); - P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0); - uint64_t available_addr = nvk_query_available_addr(pool, query); - P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); - P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); - P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); - P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); - P_NV9097_SET_REPORT_SEMAPHORE_D(p, { - .operation = OPERATION_RELEASE, - .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, - .pipeline_location = PIPELINE_LOCATION_ALL, - .structure_size = STRUCTURE_SIZE_ONE_WORD, - }); - } -} + struct nv_push *p = nvk_cmd_buffer_push(cmd, 7); + P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0); -VKAPI_ATTR void VKAPI_CALL 
-nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query, - VkQueryControlFlags flags, - uint32_t index) -{ - VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); - VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); - - /* From the Vulkan 1.4.350 spec, vkCmdBeginQuery: - * - * VUID-vkCmdBeginQuery-queryPool-01922 - * - * "queryPool must have been created with a queryType that differs from - * that of any queries that are active within commandBuffer" - * - * and - * - * "After beginning a query, that query is considered active within the - * command buffer it was called in until that same query is ended. - * Queries active in a primary command buffer when secondary command - * buffers are executed are considered active for those secondary command - * buffers." - * - * This means we will never have two queries with the same type active and - * can rely on cleaning counters. - */ - nvk_cmd_clear_report_value(cmd, pool); - nvk_cmd_begin_end_query(cmd, pool, query, index, false); -} - -VKAPI_ATTR void VKAPI_CALL -nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query, - uint32_t index) -{ - VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); - VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); - - nvk_cmd_begin_end_query(cmd, pool, query, index, true); + uint64_t available_addr = nvk_query_available_addr(pool, query); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, + .pipeline_location = PIPELINE_LOCATION_ALL, + .structure_size = STRUCTURE_SIZE_ONE_WORD, + }); /* From the Vulkan spec: * @@ -775,14 +757,6 @@ cpu_write_query_result(void *dst, uint32_t idx, } } -static void -cpu_get_query_delta(void *dst, const 
struct nvk_query_report *src, - uint32_t idx, VkQueryResultFlags flags) -{ - uint64_t delta = src[idx * 2 + 1].value - src[idx * 2].value; - cpu_write_query_result(dst, idx, flags, delta); -} - VKAPI_ATTR VkResult VKAPI_CALL nvk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool, @@ -831,10 +805,10 @@ nvk_GetQueryPoolResults(VkDevice device, if (write_results) cpu_write_query_result(dst, 0, flags, src->timestamp); } else { - /* For everything else, we have to compute deltas */ + /* For everything else, we can just write it */ if (write_results) { for (uint32_t j = 0; j < report_count; j++) - cpu_get_query_delta(dst, src, j, flags); + cpu_write_query_result(dst, j, flags, src[j].value); } } From 5ae92173b181a3ca47eacbefb80f3fdac2890964 Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Sun, 3 May 2026 17:46:36 +0200 Subject: [PATCH 07/12] nvk: Conditionally enable counters when needed We only need some counters when the user request to track certain queries, let's not keep them enabled all the time. Following the proprietary driver here, unsure if that will have any impact. Signed-off-by: Mary Guillemard Reviewed-by: Mel Henning --- src/nouveau/vulkan/nvk_cmd_draw.c | 20 +++- src/nouveau/vulkan/nvk_mme.c | 2 + src/nouveau/vulkan/nvk_mme.h | 4 + src/nouveau/vulkan/nvk_query_pool.c | 142 +++++++++++++++++++++++++++- src/nouveau/vulkan/nvk_query_pool.h | 1 + 5 files changed, 165 insertions(+), 4 deletions(-) diff --git a/src/nouveau/vulkan/nvk_cmd_draw.c b/src/nouveau/vulkan/nvk_cmd_draw.c index 2d12dcaef1b..19ca89acbe6 100644 --- a/src/nouveau/vulkan/nvk_cmd_draw.c +++ b/src/nouveau/vulkan/nvk_cmd_draw.c @@ -399,8 +399,24 @@ nvk_push_draw_state_init(struct nvk_queue *queue, struct nv_push *p) .output7 = OUTPUT7_FALSE, }); - /* The blob driver just always leaves this on. 
*/ - P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_TRUE); + P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_FALSE); + P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, { + .da_vertices_generated_enable = false, + .da_primitives_generated_enable = false, + .vs_invocations_enable = false, + .gs_invocations_enable = false, + .gs_primitives_generated_enable = false, + .streaming_primitives_succeeded_enable = false, + .streaming_primitives_needed_enable = false, + .clipper_invocations_enable = false, + .clipper_primitives_generated_enable = false, + .ps_invocations_enable = false, + .ti_invocations_enable = false, + .ts_invocations_enable = false, + .ts_primitives_generated_enable = false, + .total_streaming_primitives_needed_succeeded_enable = false, + .vtg_primitives_out_enable = false, + }); P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0)); P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, { .enable = ENABLE_TRUE }); diff --git a/src/nouveau/vulkan/nvk_mme.c b/src/nouveau/vulkan/nvk_mme.c index d4e36f7c78d..4d8b364375a 100644 --- a/src/nouveau/vulkan/nvk_mme.c +++ b/src/nouveau/vulkan/nvk_mme.c @@ -37,6 +37,7 @@ static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = { [NVK_MME_SET_CONSERVATIVE_RASTER_STATE] = nvk_mme_set_conservative_raster_state, [NVK_MME_SET_VIEWPORT_MIN_MAX_Z] = nvk_mme_set_viewport_min_max_z, [NVK_MME_SET_Z_CLAMP] = nvk_mme_set_z_clamp, + [NVK_MME_SET_STATISTICS_COUNTERS] = nvk_mme_set_statistics_counters, }; static const struct nvk_mme_test_case *mme_tests[NVK_MME_COUNT] = { @@ -45,6 +46,7 @@ static const struct nvk_mme_test_case *mme_tests[NVK_MME_COUNT] = { [NVK_MME_SET_TESS_PARAMS] = nvk_mme_set_tess_params_tests, [NVK_MME_SET_SHADING_RATE_CONTROL] = nvk_mme_set_shading_rate_control_tests, [NVK_MME_SET_ANTI_ALIAS] = nvk_mme_set_anti_alias_tests, + [NVK_MME_SET_STATISTICS_COUNTERS] = nvk_mme_set_statistics_counters_tests, }; uint32_t * diff --git a/src/nouveau/vulkan/nvk_mme.h b/src/nouveau/vulkan/nvk_mme.h index a4d7f7ba309..afffcc6cc19 100644 --- 
a/src/nouveau/vulkan/nvk_mme.h +++ b/src/nouveau/vulkan/nvk_mme.h @@ -41,6 +41,7 @@ enum nvk_mme { NVK_MME_SET_CONSERVATIVE_RASTER_STATE, NVK_MME_SET_VIEWPORT_MIN_MAX_Z, NVK_MME_SET_Z_CLAMP, + NVK_MME_SET_STATISTICS_COUNTERS, NVK_MME_COUNT, }; @@ -68,6 +69,7 @@ enum nvk_mme_scratch { NVK_MME_SCRATCH_WRITE_MASK_DYN, NVK_MME_SCRATCH_WRITE_MASK_PIPELINE, NVK_MME_SCRATCH_CONSERVATIVE_RASTER_STATE, + NVK_MME_SCRATCH_STATISTICS_COUNTER_STATE, /* Copy of SET_WINDOW_CLIP_ENABLE */ NVK_MME_SCRATCH_WINDOW_CLIP_ENABLED, /* TODO: can we use shadow-ram? */ @@ -249,6 +251,7 @@ void nvk_mme_set_write_mask(struct mme_builder *b); void nvk_mme_set_conservative_raster_state(struct mme_builder *b); void nvk_mme_set_viewport_min_max_z(struct mme_builder *b); void nvk_mme_set_z_clamp(struct mme_builder *b); +void nvk_mme_set_statistics_counters(struct mme_builder *b); uint32_t nvk_mme_tess_params(mesa_shader_stage stage, enum nak_ts_domain domain, @@ -278,6 +281,7 @@ extern const struct nvk_mme_test_case nvk_mme_bind_vb_tests[]; extern const struct nvk_mme_test_case nvk_mme_set_tess_params_tests[]; extern const struct nvk_mme_test_case nvk_mme_set_shading_rate_control_tests[]; extern const struct nvk_mme_test_case nvk_mme_set_anti_alias_tests[]; +extern const struct nvk_mme_test_case nvk_mme_set_statistics_counters_tests[]; void nvk_test_all_mmes(const struct nv_device_info *devinfo); diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index 0436d2df662..a7f08bc49d6 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -52,6 +52,53 @@ vk_query_pool_report_count(const struct vk_query_pool *vk_pool) } } +static uint32_t +vk_query_pool_statistics_counter_mask(const struct vk_query_pool *vk_pool) +{ + uint32_t result = 0; + + switch (vk_pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_TIMESTAMP: + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + const VkQueryPipelineStatisticFlags stats 
= vk_pool->pipeline_statistics; + V_NV9097_SET_STATISTICS_COUNTER(result, { + .da_vertices_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT) != 0, + .da_primitives_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT) != 0, + .vs_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT) != 0, + .gs_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT) != 0, + .gs_primitives_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) != 0, + .clipper_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT) != 0, + .clipper_primitives_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT) != 0, + .ps_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) != 0, + .ti_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT) != 0, + .ts_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT) != 0, + }); + break; + } + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + V_NV9097_SET_STATISTICS_COUNTER(result, { + .vtg_primitives_out_enable = true, + }); + break; + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + V_NV9097_SET_STATISTICS_COUNTER(result, { + .streaming_primitives_succeeded_enable = true, + .streaming_primitives_needed_enable = true, + }); + break; + + default: + UNREACHABLE("Unsupported query type"); + } + + return result; +} + VKAPI_ATTR VkResult VKAPI_CALL nvk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, @@ -76,6 +123,8 @@ nvk_CreateQueryPool(VkDevice device, else pool->layout = NVK_QUERY_POOL_LAYOUT_SEPARATE; + pool->statistics_counter_mask = vk_query_pool_statistics_counter_mask(&pool->vk); + /* Everything is a single query per report */ uint32_t reports_per_query = 
vk_query_pool_report_count(&pool->vk); @@ -526,6 +575,33 @@ nvk_cmd_clear_report_value(struct nvk_cmd_buffer *cmd, } } +static void +nvk_cmd_set_statistics_counters(struct nvk_cmd_buffer *cmd, + struct nvk_query_pool *pool, bool enable) +{ + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); + P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, enable); + break; + } + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + if (pool->statistics_counter_mask != 0) { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 3); + P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_STATISTICS_COUNTERS)); + P_INLINE_DATA(p, enable); + P_INLINE_DATA(p, pool->statistics_counter_mask); + } + break; + } + + default: + UNREACHABLE("Unsupported query type"); + } +} + VKAPI_ATTR void VKAPI_CALL nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -552,9 +628,10 @@ nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, * buffers." * * This means we will never have two queries with the same type active and - * can rely on cleaning counters. + * can rely on cleaning and toggling counters. 
*/ nvk_cmd_clear_report_value(cmd, pool); + nvk_cmd_set_statistics_counters(cmd, pool, true); } VKAPI_ATTR void VKAPI_CALL @@ -676,10 +753,13 @@ nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, } - struct nv_push *p = nvk_cmd_buffer_push(cmd, 7); + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0); + nvk_cmd_set_statistics_counters(cmd, pool, false); + uint64_t available_addr = nvk_query_available_addr(pool, query); + p = nvk_cmd_buffer_push(cmd, 5); P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); @@ -990,3 +1070,61 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, dst_addr, stride, flags); } +void +nvk_mme_set_statistics_counters(struct mme_builder *b) +{ + struct mme_value enable = mme_load(b); + struct mme_value mask = mme_load(b); + struct mme_value state = nvk_mme_load_scratch(b, STATISTICS_COUNTER_STATE); + + mme_if(b, ieq, enable, mme_imm(0)) { + mme_and_not_to(b, state, state, mask); + } + + mme_if(b, ine, enable, mme_imm(0)) { + mme_or_to(b, state, state, mask); + } + + nvk_mme_store_scratch(b, STATISTICS_COUNTER_STATE, state); + mme_mthd(b, NV9097_SET_STATISTICS_COUNTER); + mme_emit(b, state); +} + +const struct nvk_mme_test_case nvk_mme_set_statistics_counters_tests[] = {{ + /* This case doesn't change the state so it should do nothing */ + .init = + (struct nvk_mme_mthd_data[]){ + {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0}, + {NV9097_SET_STATISTICS_COUNTER, 0}, + {}}, + .params = (uint32_t[]){1, 0}, + .expected = + (struct nvk_mme_mthd_data[]){ + {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0}, + {NV9097_SET_STATISTICS_COUNTER, 0}, + {}}, +}, { + .init = + (struct nvk_mme_mthd_data[]){ + {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x100}, + {NV9097_SET_STATISTICS_COUNTER, 0x100}, + {}}, + .params = (uint32_t[]){1, 0x200}, + .expected = + (struct nvk_mme_mthd_data[]){ + 
{NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x300},
+ {NV9097_SET_STATISTICS_COUNTER, 0x300},
+ {}},
+}, {
+ .init =
+ (struct nvk_mme_mthd_data[]){
+ {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x300},
+ {NV9097_SET_STATISTICS_COUNTER, 0x300},
+ {}},
+ .params = (uint32_t[]){0, 0x200},
+ .expected =
+ (struct nvk_mme_mthd_data[]){
+ {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x100},
+ {NV9097_SET_STATISTICS_COUNTER, 0x100},
+ {}},
+}, {}};
diff --git a/src/nouveau/vulkan/nvk_query_pool.h b/src/nouveau/vulkan/nvk_query_pool.h
index 37b6fd2657f..9d1ae4e90f3 100644
--- a/src/nouveau/vulkan/nvk_query_pool.h
+++ b/src/nouveau/vulkan/nvk_query_pool.h
@@ -37,6 +37,7 @@ struct nvk_query_pool {
 uint32_t reports_start;
 uint32_t query_stride;
+ uint32_t statistics_counter_mask;
 struct nvkmd_mem *mem;
};
From c6e9dd8af6ad00e62849a8287562d977724f18cb Mon Sep 17 00:00:00 2001
From: Mary Guillemard
Date: Mon, 4 May 2026 19:06:30 +0200
Subject: [PATCH 08/12] nvk: Move report offset to reports_start for
nvk_CmdCopyQueryPoolResults
That removes the need for a special case for timestamps and will allow some
simplification for MME copies.
Signed-off-by: Mary Guillemard Reviewed-by: Mel Henning --- src/nouveau/vulkan/cl/nvk_query.cl | 16 +++------------- src/nouveau/vulkan/cl/nvk_query.h | 2 -- src/nouveau/vulkan/nvk_query_pool.c | 7 +++++-- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/src/nouveau/vulkan/cl/nvk_query.cl b/src/nouveau/vulkan/cl/nvk_query.cl index 580521ca4ca..737cf9c2246 100644 --- a/src/nouveau/vulkan/cl/nvk_query.cl +++ b/src/nouveau/vulkan/cl/nvk_query.cl @@ -21,22 +21,12 @@ nvk_copy_queries(uint64_t pool_addr, uint available_stride, bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); uint64_t report_offs = reports_start + (uint64_t)query * (uint64_t)query_stride; - global struct nvk_query_report *report = - (global void *)(pool_addr + report_offs); + global uint64_t *report = (global uint64_t *)(pool_addr + report_offs); uint64_t dst_offset = dst_stride * (uint64_t)i; - if (flags & NVK_QUERY_IS_TIMESTAMP) { - /* Timestamp queries are the only ones use a single report */ - if (write_results) { - vk_write_query(dst_addr + dst_offset, 0, flags, report->timestamp); - } - } else { - if (write_results) { - for (uint r = 0; r < report_count; ++r) { - vk_write_query(dst_addr + dst_offset, r, flags, report[r].value); - } - } + for (uint r = 0; r < report_count; ++r) { + vk_write_query(dst_addr + dst_offset, r, flags, report[r * 2]); } if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { diff --git a/src/nouveau/vulkan/cl/nvk_query.h b/src/nouveau/vulkan/cl/nvk_query.h index ed697253340..ca917047251 100644 --- a/src/nouveau/vulkan/cl/nvk_query.h +++ b/src/nouveau/vulkan/cl/nvk_query.h @@ -6,8 +6,6 @@ #include "compiler/libcl/libcl.h" -#define NVK_QUERY_IS_TIMESTAMP 0x80000000u - struct nvk_query_report { uint64_t value; uint64_t timestamp; diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index a7f08bc49d6..e5b583efa13 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ 
-1015,13 +1015,16 @@ nvk_meta_copy_query_pool_results(struct nvk_cmd_buffer *cmd,
 return;
 }
+ uint64_t reports_start = pool->reports_start;
 if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
- flags |= NVK_QUERY_IS_TIMESTAMP;
+ reports_start += offsetof(struct nvk_query_report, timestamp);
+ else
+ reports_start += offsetof(struct nvk_query_report, value);
 const struct nvk_copy_query_push push = {
 .pool_addr = pool->mem->va->addr,
 .available_stride = nvk_query_available_stride_B(pool),
- .reports_start = pool->reports_start,
+ .reports_start = reports_start,
 .report_count = vk_query_pool_report_count(&pool->vk),
 .query_stride = pool->query_stride,
 .first_query = first_query,
From b48c6af0bf9414616f1cdc8cfa98e29439066814 Mon Sep 17 00:00:00 2001
From: Mary Guillemard
Date: Tue, 5 May 2026 11:37:56 +0200
Subject: [PATCH 09/12] nvk: Handle zero queries in CmdCopyQueryPoolResults and
CmdResetQueryPool
Nothing disallows it by spec, let's avoid doing anything in that case.
Signed-off-by: Mary Guillemard
Reviewed-by: Mel Henning
---
 src/nouveau/vulkan/nvk_query_pool.c | 7 +++++++
 1 file changed, 7 insertions(+)
diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c
index e5b583efa13..0d0ab24f35b 100644
--- a/src/nouveau/vulkan/nvk_query_pool.c
+++ b/src/nouveau/vulkan/nvk_query_pool.c
@@ -322,6 +322,10 @@ nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer,
 {
 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
+
+ if (unlikely(!queryCount))
+ return;
+
 const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
 const struct nvk_physical_device *pdev = nvk_device_physical(dev);
@@ -1051,6 +1055,9 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
 VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer);
+ if (unlikely(!queryCount))
+ return;
+
 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
 for (uint32_t i = 0; i < queryCount; i++) {
 uint64_t
avail_addr = nvk_query_available_addr(pool, firstQuery + i);
From 1a48288455f8d2d7579139e0858702029fdbffc0 Mon Sep 17 00:00:00 2001
From: Mary Guillemard
Date: Tue, 5 May 2026 13:37:36 +0200
Subject: [PATCH 10/12] nvk: Store available and timestamps packed together
This changes timestamps so they are written with their available part
directly. This allows us to save a bit of memory and just write the
timestamp with only one operation.
Signed-off-by: Mary Guillemard
Reviewed-by: Mel Henning
---
 src/nouveau/vulkan/nvk_query_pool.c | 56 ++++++++++++++++++-----------
 src/nouveau/vulkan/nvk_query_pool.h | 6 ++++
 2 files changed, 41 insertions(+), 21 deletions(-)
diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c
index 0d0ab24f35b..2da57a461cb 100644
--- a/src/nouveau/vulkan/nvk_query_pool.c
+++ b/src/nouveau/vulkan/nvk_query_pool.c
@@ -115,10 +115,12 @@ nvk_CreateQueryPool(VkDevice device,
 if (!pool)
 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
- /* Use interleaved layouts on Tegra so we can safely handle non-coherent
- * maps
+ /* Use a packed layout for timestamps.
For other queries, interleaved + * layouts on Tegra so we can safely handle non-coherent maps */ - if (pdev->info.type == NV_DEVICE_TYPE_SOC) + if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) + pool->layout = NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED; + else if (pdev->info.type == NV_DEVICE_TYPE_SOC) pool->layout = NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED; else pool->layout = NVK_QUERY_POOL_LAYOUT_SEPARATE; @@ -146,6 +148,16 @@ nvk_CreateQueryPool(VkDevice device, mem_size = pool->vk.query_count * (uint64_t)pool->query_stride; break; + case NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED: + pool->reports_start = 0; + pool->query_stride = reports_per_query * sizeof(struct nvk_query_report); + + if (pdev->info.type == NV_DEVICE_TYPE_SOC) + pool->query_stride = align(pool->query_stride, pdev->info.nc_atom_size_B); + + mem_size = pool->vk.query_count * (uint64_t)pool->query_stride; + break; + default: UNREACHABLE("Unsupported query layout"); } @@ -241,7 +253,7 @@ nvk_sync_queries_to_gpu(struct nvk_query_pool *pool, if (pool->mem->flags & NVKMD_MEM_COHERENT) return; - assert(pool->layout == NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED); + assert(pool->layout != NVK_QUERY_POOL_LAYOUT_SEPARATE); nvkmd_mem_sync_map_to_gpu(pool->mem, first_query * pool->query_stride, count * pool->query_stride); } @@ -253,7 +265,7 @@ nvk_sync_queries_from_gpu(struct nvk_query_pool *pool, if (pool->mem->flags & NVKMD_MEM_COHERENT) return; - assert(pool->layout == NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED); + assert(pool->layout != NVK_QUERY_POOL_LAYOUT_SEPARATE); nvkmd_mem_sync_map_from_gpu(pool->mem, first_query * pool->query_stride, count * pool->query_stride); } @@ -305,6 +317,10 @@ nvk_ResetQueryPool(VkDevice device, assert(pool->mem->flags & NVKMD_MEM_COHERENT); uint32_t *available = nvk_query_available_map(pool, firstQuery); memset(available, 0, queryCount * sizeof(*available)); + } else if (pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED) { + struct nvk_query_report *reports = 
nvk_query_report_map(pool, firstQuery); + memset(reports, 0, queryCount * pool->query_stride); + nvk_sync_queries_to_gpu(pool, firstQuery, queryCount); } else { for (uint32_t i = 0; i < queryCount; i++) { uint32_t *available = nvk_query_available_map(pool, firstQuery + i); @@ -329,9 +345,17 @@ nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer, const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); const struct nvk_physical_device *pdev = nvk_device_physical(dev); - if (queryCount > 1 && pool->layout == NVK_QUERY_POOL_LAYOUT_SEPARATE) { + if (queryCount > 1 && pool->layout != NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED) { + uint64_t clear_size; + if (pool->layout == NVK_QUERY_POOL_LAYOUT_SEPARATE) + clear_size = queryCount * sizeof(uint32_t); + else if (pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED) + clear_size = queryCount * pool->query_stride; + else + UNREACHABLE("Unsupported query type"); + uint64_t addr = nvk_query_available_addr(pool, firstQuery); - nvk_cmd_fill_memory(cmd, addr, queryCount * sizeof(uint32_t), 0); + nvk_cmd_fill_memory(cmd, addr, clear_size, 0); } else { for (uint32_t i = 0; i < queryCount; i++) { uint64_t addr = nvk_query_available_addr(pool, firstQuery + i); @@ -378,29 +402,19 @@ nvk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); - struct nv_push *p = nvk_cmd_buffer_push(cmd, 10); + assert(pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED); + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); uint64_t report_addr = nvk_query_report_addr(pool, query); P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr); - P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); - P_NV9097_SET_REPORT_SEMAPHORE_D(p, { - .operation = OPERATION_REPORT_ONLY, - .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage), - .structure_size = 
STRUCTURE_SIZE_FOUR_WORDS,
- });
-
- uint64_t available_addr = nvk_query_available_addr(pool, query);
- P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
- P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
- P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
 .operation = OPERATION_RELEASE,
 .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
- .pipeline_location = PIPELINE_LOCATION_ALL,
- .structure_size = STRUCTURE_SIZE_ONE_WORD,
+ .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
+ .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
 });
 /* From the Vulkan spec:
diff --git a/src/nouveau/vulkan/nvk_query_pool.h b/src/nouveau/vulkan/nvk_query_pool.h
index 9d1ae4e90f3..d4b5a64e5ea 100644
--- a/src/nouveau/vulkan/nvk_query_pool.h
+++ b/src/nouveau/vulkan/nvk_query_pool.h
@@ -28,6 +28,12 @@ enum nvk_query_pool_layout {
 * byte 16.
 */
 NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED,
+
+ /* Stores the availables and the timestamp in nvk_query_report
+ *
+ * This allows to write a timestamp with only one command.
+ */
+ NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED,
};
struct nvk_query_pool {
From 1aaeb207dc8da96c2278671eaf500335bac133da Mon Sep 17 00:00:00 2001
From: Mary Guillemard
Date: Mon, 4 May 2026 19:39:17 +0200
Subject: [PATCH 11/12] nvk: Add a MME based CmdCopyQueryPoolResults
implementation
This adds an MME based approach to the query copy that allows us to not
switch subchannel when possible.
Signed-off-by: Mary Guillemard --- src/nouveau/mme/mme_builder.h | 8 ++ src/nouveau/vulkan/nvk_mme.c | 1 + src/nouveau/vulkan/nvk_mme.h | 2 + src/nouveau/vulkan/nvk_query_pool.c | 150 +++++++++++++++++++++++++++- 4 files changed, 157 insertions(+), 4 deletions(-) diff --git a/src/nouveau/mme/mme_builder.h b/src/nouveau/mme/mme_builder.h index 8e4bd15a711..5433d018d6f 100644 --- a/src/nouveau/mme/mme_builder.h +++ b/src/nouveau/mme/mme_builder.h @@ -556,6 +556,14 @@ mme_load(struct mme_builder *b) UNREACHABLE("Unsupported GPU class"); } +static inline struct mme_value64 +mme_load_value64(struct mme_builder *b) +{ + struct mme_value lo = mme_load(b); + struct mme_value hi = mme_load(b); + return mme_value64(lo, hi); +} + static inline struct mme_value64 mme_load_addr64(struct mme_builder *b) { diff --git a/src/nouveau/vulkan/nvk_mme.c b/src/nouveau/vulkan/nvk_mme.c index 4d8b364375a..01f95302715 100644 --- a/src/nouveau/vulkan/nvk_mme.c +++ b/src/nouveau/vulkan/nvk_mme.c @@ -38,6 +38,7 @@ static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = { [NVK_MME_SET_VIEWPORT_MIN_MAX_Z] = nvk_mme_set_viewport_min_max_z, [NVK_MME_SET_Z_CLAMP] = nvk_mme_set_z_clamp, [NVK_MME_SET_STATISTICS_COUNTERS] = nvk_mme_set_statistics_counters, + [NVK_MME_COPY_QUERIES] = nvk_mme_copy_queries, }; static const struct nvk_mme_test_case *mme_tests[NVK_MME_COUNT] = { diff --git a/src/nouveau/vulkan/nvk_mme.h b/src/nouveau/vulkan/nvk_mme.h index afffcc6cc19..7ad4e99b531 100644 --- a/src/nouveau/vulkan/nvk_mme.h +++ b/src/nouveau/vulkan/nvk_mme.h @@ -42,6 +42,7 @@ enum nvk_mme { NVK_MME_SET_VIEWPORT_MIN_MAX_Z, NVK_MME_SET_Z_CLAMP, NVK_MME_SET_STATISTICS_COUNTERS, + NVK_MME_COPY_QUERIES, NVK_MME_COUNT, }; @@ -252,6 +253,7 @@ void nvk_mme_set_conservative_raster_state(struct mme_builder *b); void nvk_mme_set_viewport_min_max_z(struct mme_builder *b); void nvk_mme_set_z_clamp(struct mme_builder *b); void nvk_mme_set_statistics_counters(struct mme_builder *b); +void 
nvk_mme_copy_queries(struct mme_builder *b); uint32_t nvk_mme_tess_params(mesa_shader_stage stage, enum nak_ts_domain domain, diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index 2da57a461cb..e7f762f83b4 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -1068,6 +1068,8 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer); + const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); + const struct nvk_physical_device *pdev = nvk_device_physical(dev); if (unlikely(!queryCount)) return; @@ -1077,7 +1079,7 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, uint64_t avail_addr = nvk_query_available_addr(pool, firstQuery + i); struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); - __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA); + __push_mthd(p, nvk_cmd_buffer_last_subchannel(cmd), NV906F_SEMAPHOREA); P_NV906F_SEMAPHOREA(p, avail_addr >> 32); P_NV906F_SEMAPHOREB(p, (avail_addr & UINT32_MAX) >> 2); P_NV906F_SEMAPHOREC(p, 1); @@ -1089,9 +1091,49 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, } } - uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset); - nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount, - dst_addr, stride, flags); + const uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset); + + /* Allow to use MME for copy only if we have a small amount of queries on + * Turing+. We also ensure it doesn't cause a switch to 3D subchannel on + * Turing as it's missing MME on compute. 
+ */ + const bool should_use_mme_copy = + queryCount <= 5 && pdev->info.cls_eng3d >= TURING_A && + (nvk_cmd_buffer_last_subchannel(cmd) != SUBC_NV90C0 || + pdev->info.cls_compute >= AMPERE_COMPUTE_B); + + if (!should_use_mme_copy) { + nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount, + dst_addr, stride, flags); + } else { + uint64_t report_addr = nvk_query_report_addr(pool, firstQuery); + const uint64_t available_addr = nvk_query_available_addr(pool, firstQuery); + + if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) + report_addr += offsetof(struct nvk_query_report, timestamp); + else + report_addr += offsetof(struct nvk_query_report, value); + + struct nv_push *p = nvk_cmd_buffer_push(cmd, 14); + if (nvk_cmd_buffer_last_subchannel(cmd) == SUBC_NV90C0 && + pdev->info.cls_compute >= AMPERE_COMPUTE_B) + P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_COPY_QUERIES)); + else + P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_COPY_QUERIES)); + P_INLINE_DATA(p, report_addr >> 32); + P_INLINE_DATA(p, report_addr); + P_INLINE_DATA(p, available_addr >> 32); + P_INLINE_DATA(p, available_addr); + P_INLINE_DATA(p, nvk_query_available_stride_B(pool)); + P_INLINE_DATA(p, vk_query_pool_report_count(&pool->vk)); + P_INLINE_DATA(p, pool->query_stride); + P_INLINE_DATA(p, queryCount); + P_INLINE_DATA(p, dst_addr >> 32); + P_INLINE_DATA(p, dst_addr); + P_INLINE_DATA(p, stride >> 32); + P_INLINE_DATA(p, stride); + P_INLINE_DATA(p, flags); + } } void @@ -1152,3 +1194,103 @@ const struct nvk_mme_test_case nvk_mme_set_statistics_counters_tests[] = {{ {NV9097_SET_STATISTICS_COUNTER, 0x100}, {}}, }, {}}; + +/* This helper is quite convoluted because we only have 4 registers to work + * with when writing a report result */ +static void +nvk_mme_write_query(struct mme_builder *b, + struct mme_value64 dst_addr, + struct mme_value idx, + struct mme_value flags, + struct mme_value64 result) +{ + struct mme_value result_64_bit = mme_and(b, flags, mme_imm(VK_QUERY_RESULT_64_BIT)); + 
mme_if(b, ine, result_64_bit, mme_zero()) { + struct mme_value report_offset = mme_sll(b, idx, mme_imm(3)); + struct mme_value64 report_addr = + mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero())); + mme_free_reg(b, report_offset); + + mme_store_global(b, report_addr, result.lo); + + mme_add64_to(b, report_addr, report_addr, mme_imm64(4)); + mme_store_global(b, report_addr, result.hi); + mme_free_reg64(b, report_addr); + } + + mme_if(b, ieq, result_64_bit, mme_zero()) { + struct mme_value report_offset = mme_sll(b, idx, mme_imm(2)); + struct mme_value64 report_addr = + mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero())); + mme_free_reg(b, report_offset); + + mme_store_global(b, report_addr, result.lo); + mme_free_reg64(b, report_addr); + } + mme_free_reg(b, result_64_bit); +} + +void +nvk_mme_copy_queries(struct mme_builder *b) +{ + if (b->devinfo->cls_eng3d < TURING_A) + return; + + struct mme_value64 report_addr = mme_load_addr64(b); + struct mme_value64 available_addr = mme_load_addr64(b); + struct mme_value available_stride = mme_load(b); + struct mme_value report_count = mme_load(b); + struct mme_value query_stride = mme_load(b); + struct mme_value query_count = mme_load(b); + struct mme_value64 dst_addr = mme_load_addr64(b); + struct mme_value64 dst_stride = mme_load_addr64(b); + struct mme_value flags = mme_load(b); + + /* Now handle queries */ + mme_while(b, ine, query_count, mme_zero()) { + /* We load available and determine if a result need to be written */ + mme_tu104_read_fifoed(b, available_addr, mme_imm(1)); + struct mme_value available = mme_load(b); + struct mme_value write_results = + mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); + mme_or_to(b, write_results, write_results, available); + + mme_if(b, ine, write_results, mme_zero()) { + struct mme_value r = mme_mov(b, mme_zero()); + mme_while(b, ine, r, report_count) { + /* Setup MME fifo read, we only have 7 registers to work with so + * we agressively free 
registers */ + STATIC_ASSERT(sizeof(struct nvk_query_report) % 2 == 0); + struct mme_value current_report_offs = mme_sll( + b, r, mme_imm(util_logbase2(sizeof(struct nvk_query_report)))); + struct mme_value64 current_report_addr = mme_add64( + b, report_addr, mme_value64(current_report_offs, mme_zero())); + mme_tu104_read_fifoed(b, current_report_addr, mme_imm(2)); + mme_free_reg(b, current_report_offs); + mme_free_reg64(b, current_report_addr); + + struct mme_value64 report = mme_load_value64(b); + nvk_mme_write_query(b, dst_addr, r, flags, report); + mme_free_reg64(b, report); + + mme_add_to(b, r, r, mme_imm(1)); + } + } + mme_free_reg(b, write_results); + + /* Finally write available if needed */ + struct mme_value with_availability = + mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); + mme_if(b, ine, with_availability, mme_zero()) { + nvk_mme_write_query(b, dst_addr, report_count, flags, + mme_value64(available, mme_zero())); + } + mme_free_reg(b, with_availability); + mme_free_reg(b, available); + + mme_sub_to(b, query_count, query_count, mme_imm(1)); + mme_add64_to(b, report_addr, report_addr, mme_value64(query_stride, mme_zero())); + mme_add64_to(b, available_addr, available_addr, mme_value64(available_stride, mme_zero())); + mme_add64_to(b, dst_addr, dst_addr, dst_stride); + } +} From 7c329f1f9ec4f4359ee31a1915f4567e56850300 Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Tue, 5 May 2026 14:16:00 +0200 Subject: [PATCH 12/12] nvk: Implement support for non graphics timestamp Now that we have a unified layout for timestamp, we can implement timestamp writes on DMA and Compute sub channels. This also expose timestamp on non graphics queues. 
Signed-off-by: Mary Guillemard --- src/nouveau/vulkan/nvk_physical_device.c | 5 +- src/nouveau/vulkan/nvk_query_pool.c | 67 ++++++++++++++++++++---- 2 files changed, 59 insertions(+), 13 deletions(-) diff --git a/src/nouveau/vulkan/nvk_physical_device.c b/src/nouveau/vulkan/nvk_physical_device.c index 1a610c5e4c2..45aecb643ce 100644 --- a/src/nouveau/vulkan/nvk_physical_device.c +++ b/src/nouveau/vulkan/nvk_physical_device.c @@ -1759,10 +1759,11 @@ nvk_GetPhysicalDeviceQueueFamilyProperties2( vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) { p->queueFamilyProperties.queueFlags = queue_family->queue_flags; p->queueFamilyProperties.queueCount = queue_family->queue_count; - if (queue_family->queue_flags & VK_QUEUE_GRAPHICS_BIT) { + if (queue_family->queue_flags & + (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT)) { p->queueFamilyProperties.timestampValidBits = 64; } else { - /* TODO: Timestamps on non-graphics queues */ p->queueFamilyProperties.timestampValidBits = 0; } p->queueFamilyProperties.minImageTransferGranularity = diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index e7f762f83b4..33172046416 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -16,6 +16,7 @@ #include "vk_common_entrypoints.h" #include "vk_meta.h" #include "vk_pipeline.h" +#include "vk_synchronization.h" #include "cl/nvk_query.h" #include "compiler/nir/nir.h" @@ -26,6 +27,8 @@ #include "nv_push_cl906f.h" #include "nv_push_cl9097.h" +#include "nv_push_cl90b5.h" +#include "nv_push_cl90c0.h" #include "nv_push_cla0c0.h" #include "nv_push_clc597.h" #include "nv_push_clc7c0.h" @@ -404,18 +407,60 @@ nvk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, assert(pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED); - struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); uint64_t report_addr = nvk_query_report_addr(pool, query); - P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); - 
P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
-   P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
-   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
-   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
-      .operation = OPERATION_RELEASE,
-      .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
-      .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
-      .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
-   });
+   uint8_t subc = nvk_cmd_buffer_last_subchannel(cmd);
+   if (subc == SUBC_NV9097) {
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
+      P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0);
+      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
+      P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
+      P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
+      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
+      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
+         .operation = OPERATION_RELEASE,
+         .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
+         .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
+         .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
+      });
+   } else if (subc == SUBC_NV90C0) {
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
+
+      /* Compute SET_REPORT_SEMAPHORE_D doesn't provide a pipeline location,
+       * meaning that we need to handle the first synchronization scope here.
+       *
+       * Considering that if we are on the compute subchannel, we only really
+       * need to wait on anything that runs on compute. 
+ */ + if (vk_expand_src_stage_flags2(stage) & + (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | + VK_PIPELINE_STAGE_2_BLIT_BIT)) + P_IMMD(p, NV90C0, WAIT_FOR_IDLE, 0); + + P_MTHD(p, NV90C0, SET_REPORT_SEMAPHORE_A); + P_NV90C0_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); + P_NV90C0_SET_REPORT_SEMAPHORE_B(p, report_addr); + P_NV90C0_SET_REPORT_SEMAPHORE_C(p, 1); + P_NV90C0_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .structure_size = STRUCTURE_SIZE_FOUR_WORDS, + }); + } else { + assert(subc == SUBC_NV90B5); + struct nv_push *p = nvk_cmd_buffer_push(cmd, 6); + + P_MTHD(p, NV90B5, SET_SEMAPHORE_A); + P_NV90B5_SET_SEMAPHORE_A(p, report_addr >> 32); + P_NV90B5_SET_SEMAPHORE_B(p, report_addr); + P_NV90B5_SET_SEMAPHORE_PAYLOAD(p, 1); + + P_IMMD(p, NV90B5, LAUNCH_DMA, { + .data_transfer_type = DATA_TRANSFER_TYPE_NONE, + .semaphore_type = SEMAPHORE_TYPE_RELEASE_FOUR_WORD_SEMAPHORE, + .flush_enable = FLUSH_ENABLE_TRUE, + /* Note: FLUSH_TYPE=SYS implicitly for NVC3B5+ */ + }); + } /* From the Vulkan spec: *