nvk: Use the compute MME for compute dispatch

Switching from compute to 3D and vice versa leads to a long stall, which destroys compute performance. On Ampere and later (where the compute MME was added), use the compute MME for compute dispatches; this eliminates the stalls caused by sub-channel switching in these cases.

Reviewed-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37671>
2026-06-15 19:58:31 +02:00 · 2025-08-27 01:31:57 +03:00 · 2025-08-27 01:31:57 +03:00 · 7a0e7d24bb
commit 7a0e7d24bb
parent 146a64524d
3 changed files with 22 additions and 4 deletions
--- a/src/nouveau/vulkan/nvk_cmd_dispatch.c
+++ b/src/nouveau/vulkan/nvk_cmd_dispatch.c
@ -25,6 +25,7 @@
 #include "nv_push_clc3c0.h"
 #include "nv_push_clc597.h"
 #include "nv_push_clc6c0.h"
+#include "nv_push_clc7c0.h"
 #include "nv_push_clc86f.h"

 VkResult
@ -315,7 +316,10 @@ nvk_CmdDispatchBase(VkCommandBuffer commandBuffer,

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);

-   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
+   if (nvk_cmd_buffer_compute_cls(cmd) >= AMPERE_COMPUTE_B)
+      P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
+   else
+      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
   P_INLINE_DATA(p, cs_invocations >> 32);
   P_INLINE_DATA(p, cs_invocations);

@ -562,7 +566,10 @@ nvk_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
      p = nvk_cmd_buffer_push(cmd, 14);
      if (nvk_cmd_buffer_compute_cls(cmd) < BLACKWELL_COMPUTE_A)
         P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
-      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
+      if (nvk_cmd_buffer_compute_cls(cmd) >= AMPERE_COMPUTE_B)
+         P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
+      else
+         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
      P_INLINE_DATA(p, dispatch_addr >> 32);
      P_INLINE_DATA(p, dispatch_addr);
      P_INLINE_DATA(p, root_desc_addr >> 32);
--- a/src/nouveau/vulkan/nvk_cmd_indirect.c
+++ b/src/nouveau/vulkan/nvk_cmd_indirect.c
@ -20,6 +20,7 @@
 #include "nv_push_cla0c0.h"
 #include "nv_push_clb1c0.h"
 #include "nv_push_clc6c0.h"
+#include "nv_push_clc7c0.h"
 #include "nv_push_clc86f.h"

 struct nvk_indirect_commands_layout {
@ -395,7 +396,10 @@ build_process_cs_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
            /* Now emit commands */
            nir_def *invoc = nir_imul_2x32_64(b, disp_size_x, disp_size_y);
            invoc = nir_imul(b, invoc, nir_u2u64(b, disp_size_z));
-            nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
+            if (pdev->info.cls_compute >= AMPERE_COMPUTE_B)
+               nvk_nir_P_1INC(b, p, NVC7C0, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
+            else
+               nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_y(b, invoc));
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_x(b, invoc));

--- a/src/nouveau/vulkan/nvk_query_pool.c
+++ b/src/nouveau/vulkan/nvk_query_pool.c
@ -28,6 +28,7 @@
 #include "nv_push_cl9097.h"
 #include "nv_push_cla0c0.h"
 #include "nv_push_clc597.h"
+#include "nv_push_clc7c0.h"

 VKAPI_ATTR VkResult VKAPI_CALL
 nvk_CreateQueryPool(VkDevice device,
@ -378,6 +379,9 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
                        uint32_t query, uint32_t index,
                        bool end)
 {
+   const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
+   const struct nvk_physical_device *pdev = nvk_device_physical(dev);
+
   uint64_t report_addr = nvk_query_report_addr(pool, query) +
                          end * sizeof(struct nvk_query_report);

@ -417,7 +421,10 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
         assert(!(stats_left & (sq->flag - 1)));

         if (sq->flag == VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT) {
-            P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
+            if (pdev->info.cls_compute >= AMPERE_COMPUTE_B)
+               P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
+            else
+               P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
            P_INLINE_DATA(p, report_addr >> 32);
            P_INLINE_DATA(p, report_addr);
         } else {