nvk: Use the compute MME for compute dispatch

Switching from compute to 3D and vice versa leads to a long stall, which
destroys compute performance. For compute dispatches, call the MME macros
through the compute MME on Ampere (AMPERE_COMPUTE_B) onwards, which is where
the compute MME was added. This eliminates the stall from sub-channel
switching in these cases.

Reviewed-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37671>
This commit is contained in:
Mohamed Ahmed 2025-08-27 01:31:57 +03:00 committed by Marge Bot
parent 146a64524d
commit 7a0e7d24bb
3 changed files with 22 additions and 4 deletions

View file

@ -25,6 +25,7 @@
#include "nv_push_clc3c0.h"
#include "nv_push_clc597.h"
#include "nv_push_clc6c0.h"
#include "nv_push_clc7c0.h"
#include "nv_push_clc86f.h"
VkResult
@ -315,7 +316,10 @@ nvk_CmdDispatchBase(VkCommandBuffer commandBuffer,
struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
if (nvk_cmd_buffer_compute_cls(cmd) >= AMPERE_COMPUTE_B)
P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
else
P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
P_INLINE_DATA(p, cs_invocations >> 32);
P_INLINE_DATA(p, cs_invocations);
@ -562,7 +566,10 @@ nvk_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
p = nvk_cmd_buffer_push(cmd, 14);
if (nvk_cmd_buffer_compute_cls(cmd) < BLACKWELL_COMPUTE_A)
P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
if (nvk_cmd_buffer_compute_cls(cmd) >= AMPERE_COMPUTE_B)
P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
else
P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
P_INLINE_DATA(p, dispatch_addr >> 32);
P_INLINE_DATA(p, dispatch_addr);
P_INLINE_DATA(p, root_desc_addr >> 32);

View file

@ -20,6 +20,7 @@
#include "nv_push_cla0c0.h"
#include "nv_push_clb1c0.h"
#include "nv_push_clc6c0.h"
#include "nv_push_clc7c0.h"
#include "nv_push_clc86f.h"
struct nvk_indirect_commands_layout {
@ -395,7 +396,10 @@ build_process_cs_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
/* Now emit commands */
nir_def *invoc = nir_imul_2x32_64(b, disp_size_x, disp_size_y);
invoc = nir_imul(b, invoc, nir_u2u64(b, disp_size_z));
nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
if (pdev->info.cls_compute >= AMPERE_COMPUTE_B)
nvk_nir_P_1INC(b, p, NVC7C0, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
else
nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_y(b, invoc));
nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_x(b, invoc));

View file

@ -28,6 +28,7 @@
#include "nv_push_cl9097.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clc597.h"
#include "nv_push_clc7c0.h"
VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateQueryPool(VkDevice device,
@ -378,6 +379,9 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
uint32_t query, uint32_t index,
bool end)
{
const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
const struct nvk_physical_device *pdev = nvk_device_physical(dev);
uint64_t report_addr = nvk_query_report_addr(pool, query) +
end * sizeof(struct nvk_query_report);
@ -417,7 +421,10 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
assert(!(stats_left & (sq->flag - 1)));
if (sq->flag == VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT) {
P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
if (pdev->info.cls_compute >= AMPERE_COMPUTE_B)
P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
else
P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
P_INLINE_DATA(p, report_addr >> 32);
P_INLINE_DATA(p, report_addr);
} else {