nvk: Add an MME-based CmdCopyQueryPoolResults implementation

This adds an MME-based approach to the query copy that allows us to avoid
switching subchannels when possible.
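
For context, an MME macro call is nothing more than a method header followed
by inline parameter dwords, emitted against the macro slot of an engine class
that is already active, which is what lets us skip the subchannel switch. A
minimal sketch using the helpers that appear in the diff below (the wrapper
function itself is hypothetical):

    /* Hypothetical illustration: call the copy-queries macro on the 3D
     * class and feed it one parameter dword. */
    static void
    emit_copy_queries_call(struct nvk_cmd_buffer *cmd, uint32_t first_param)
    {
       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);

       /* P_1INC opens an increment-once method: the dwords that follow
        * all land on the macro's parameter method. */
       P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_COPY_QUERIES));
       P_INLINE_DATA(p, first_param);
    }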

Signed-off-by: Mary Guillemard <mary@mary.zone>
Author: Mary Guillemard <mary@mary.zone>
Date:   2026-05-04 19:39:17 +02:00
Parent: 1a48288455
Commit: 1aaeb207dc

4 changed files with 157 additions and 4 deletions

@@ -556,6 +556,14 @@ mme_load(struct mme_builder *b)
    UNREACHABLE("Unsupported GPU class");
 }
 
+static inline struct mme_value64
+mme_load_value64(struct mme_builder *b)
+{
+   struct mme_value lo = mme_load(b);
+   struct mme_value hi = mme_load(b);
+   return mme_value64(lo, hi);
+}
+
 static inline struct mme_value64
 mme_load_addr64(struct mme_builder *b)
 {
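
Note that mme_load() pops the next dword from the macro's input FIFO, so the
new helper simply composes two pops, low dword first, into one 64-bit value.
A sketch of the intended pairing with a two-dword memory read, matching how
nvk_mme_copy_queries() uses it later in this commit (addr is a placeholder):

    /* Queue the lo and hi dwords of a 64-bit report field into the FIFO,
     * then pop them as a single 64-bit value. */
    mme_tu104_read_fifoed(b, addr, mme_imm(2));
    struct mme_value64 value = mme_load_value64(b);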

@@ -38,6 +38,7 @@ static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = {
    [NVK_MME_SET_VIEWPORT_MIN_MAX_Z] = nvk_mme_set_viewport_min_max_z,
    [NVK_MME_SET_Z_CLAMP] = nvk_mme_set_z_clamp,
    [NVK_MME_SET_STATISTICS_COUNTERS] = nvk_mme_set_statistics_counters,
+   [NVK_MME_COPY_QUERIES] = nvk_mme_copy_queries,
 };
 
 static const struct nvk_mme_test_case *mme_tests[NVK_MME_COUNT] = {

@@ -42,6 +42,7 @@ enum nvk_mme {
    NVK_MME_SET_VIEWPORT_MIN_MAX_Z,
    NVK_MME_SET_Z_CLAMP,
    NVK_MME_SET_STATISTICS_COUNTERS,
+   NVK_MME_COPY_QUERIES,
 
    NVK_MME_COUNT,
 };
@@ -252,6 +253,7 @@ void nvk_mme_set_conservative_raster_state(struct mme_builder *b);
 void nvk_mme_set_viewport_min_max_z(struct mme_builder *b);
 void nvk_mme_set_z_clamp(struct mme_builder *b);
 void nvk_mme_set_statistics_counters(struct mme_builder *b);
+void nvk_mme_copy_queries(struct mme_builder *b);
 
 uint32_t nvk_mme_tess_params(mesa_shader_stage stage,
                              enum nak_ts_domain domain,

@@ -1068,6 +1068,8 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
    VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
    VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer);
+   const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
+   const struct nvk_physical_device *pdev = nvk_device_physical(dev);
 
    if (unlikely(!queryCount))
       return;
@@ -1077,7 +1079,7 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
          uint64_t avail_addr = nvk_query_available_addr(pool, firstQuery + i);
 
          struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
-         __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
+         __push_mthd(p, nvk_cmd_buffer_last_subchannel(cmd), NV906F_SEMAPHOREA);
          P_NV906F_SEMAPHOREA(p, avail_addr >> 32);
          P_NV906F_SEMAPHOREB(p, (avail_addr & UINT32_MAX) >> 2);
          P_NV906F_SEMAPHOREC(p, 1);
@@ -1089,9 +1091,49 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
       }
    }
 
-   uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset);
-   nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount,
-                                    dst_addr, stride, flags);
+   const uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset);
+
+   /* Only use the MME for the copy if we have a small number of queries on
+    * Turing+. We also make sure it doesn't force a switch to the 3D
+    * subchannel on Turing, as the Turing compute class is missing the MME.
+    */
+   const bool should_use_mme_copy =
+      queryCount <= 5 && pdev->info.cls_eng3d >= TURING_A &&
+      (nvk_cmd_buffer_last_subchannel(cmd) != SUBC_NV90C0 ||
+       pdev->info.cls_compute >= AMPERE_COMPUTE_B);
+
+   if (!should_use_mme_copy) {
+      nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount,
+                                       dst_addr, stride, flags);
+   } else {
+      uint64_t report_addr = nvk_query_report_addr(pool, firstQuery);
+      const uint64_t available_addr = nvk_query_available_addr(pool, firstQuery);
+
+      if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
+         report_addr += offsetof(struct nvk_query_report, timestamp);
+      else
+         report_addr += offsetof(struct nvk_query_report, value);
+
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 14);
+
+      if (nvk_cmd_buffer_last_subchannel(cmd) == SUBC_NV90C0 &&
+          pdev->info.cls_compute >= AMPERE_COMPUTE_B)
+         P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_COPY_QUERIES));
+      else
+         P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_COPY_QUERIES));
+
+      P_INLINE_DATA(p, report_addr >> 32);
+      P_INLINE_DATA(p, report_addr);
+      P_INLINE_DATA(p, available_addr >> 32);
+      P_INLINE_DATA(p, available_addr);
+      P_INLINE_DATA(p, nvk_query_available_stride_B(pool));
+      P_INLINE_DATA(p, vk_query_pool_report_count(&pool->vk));
+      P_INLINE_DATA(p, pool->query_stride);
+      P_INLINE_DATA(p, queryCount);
+      P_INLINE_DATA(p, dst_addr >> 32);
+      P_INLINE_DATA(p, dst_addr);
+      P_INLINE_DATA(p, stride >> 32);
+      P_INLINE_DATA(p, stride);
+      P_INLINE_DATA(p, flags);
+   }
 }
 
 void
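
The thirteen P_INLINE_DATA() dwords above have to match, in order, the
mme_load()/mme_load_addr64() calls at the top of nvk_mme_copy_queries() in the
next hunk. Restated as a struct, purely for illustration (each 64-bit field
is pushed high dword first):

    struct nvk_mme_copy_queries_params {  /* hypothetical */
       uint64_t report_addr;      /* 2 dwords -> mme_load_addr64() */
       uint64_t available_addr;   /* 2 dwords -> mme_load_addr64() */
       uint32_t available_stride; /* 1 dword  -> mme_load() */
       uint32_t report_count;     /* 1 dword  -> mme_load() */
       uint32_t query_stride;     /* 1 dword  -> mme_load() */
       uint32_t query_count;      /* 1 dword  -> mme_load() */
       uint64_t dst_addr;         /* 2 dwords -> mme_load_addr64() */
       uint64_t dst_stride;       /* 2 dwords -> mme_load_addr64() */
       uint32_t flags;            /* 1 dword  -> VkQueryResultFlags */
    };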
@@ -1152,3 +1194,103 @@ const struct nvk_mme_test_case nvk_mme_set_statistics_counters_tests[] = {{
       {NV9097_SET_STATISTICS_COUNTER, 0x100},
       {}},
 }, {}};
+
+/* This helper is quite convoluted because we only have 4 registers to work
+ * with when writing a report result. */
+static void
+nvk_mme_write_query(struct mme_builder *b,
+                    struct mme_value64 dst_addr,
+                    struct mme_value idx,
+                    struct mme_value flags,
+                    struct mme_value64 result)
+{
+   struct mme_value result_64_bit =
+      mme_and(b, flags, mme_imm(VK_QUERY_RESULT_64_BIT));
+
+   mme_if(b, ine, result_64_bit, mme_zero()) {
+      struct mme_value report_offset = mme_sll(b, idx, mme_imm(3));
+      struct mme_value64 report_addr =
+         mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero()));
+      mme_free_reg(b, report_offset);
+
+      mme_store_global(b, report_addr, result.lo);
+      mme_add64_to(b, report_addr, report_addr, mme_imm64(4));
+      mme_store_global(b, report_addr, result.hi);
+      mme_free_reg64(b, report_addr);
+   }
+
+   mme_if(b, ieq, result_64_bit, mme_zero()) {
+      struct mme_value report_offset = mme_sll(b, idx, mme_imm(2));
+      struct mme_value64 report_addr =
+         mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero()));
+      mme_free_reg(b, report_offset);
+
+      mme_store_global(b, report_addr, result.lo);
+      mme_free_reg64(b, report_addr);
+   }
+
+   mme_free_reg(b, result_64_bit);
+}
+
+void
+nvk_mme_copy_queries(struct mme_builder *b)
+{
+   if (b->devinfo->cls_eng3d < TURING_A)
+      return;
+
+   struct mme_value64 report_addr = mme_load_addr64(b);
+   struct mme_value64 available_addr = mme_load_addr64(b);
+   struct mme_value available_stride = mme_load(b);
+   struct mme_value report_count = mme_load(b);
+   struct mme_value query_stride = mme_load(b);
+   struct mme_value query_count = mme_load(b);
+   struct mme_value64 dst_addr = mme_load_addr64(b);
+   struct mme_value64 dst_stride = mme_load_addr64(b);
+   struct mme_value flags = mme_load(b);
+
+   /* Now handle the queries */
+   mme_while(b, ine, query_count, mme_zero()) {
+      /* Load available and determine whether a result needs to be written */
+      mme_tu104_read_fifoed(b, available_addr, mme_imm(1));
+      struct mme_value available = mme_load(b);
+      struct mme_value write_results =
+         mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
+      mme_or_to(b, write_results, write_results, available);
+
+      mme_if(b, ine, write_results, mme_zero()) {
+         struct mme_value r = mme_mov(b, mme_zero());
+         mme_while(b, ine, r, report_count) {
+            /* Set up the MME FIFO read; we only have 7 registers to work
+             * with so we aggressively free registers. */
+            STATIC_ASSERT(sizeof(struct nvk_query_report) % 2 == 0);
+            struct mme_value current_report_offs = mme_sll(
+               b, r, mme_imm(util_logbase2(sizeof(struct nvk_query_report))));
+            struct mme_value64 current_report_addr = mme_add64(
+               b, report_addr, mme_value64(current_report_offs, mme_zero()));
+            mme_tu104_read_fifoed(b, current_report_addr, mme_imm(2));
+            mme_free_reg(b, current_report_offs);
+            mme_free_reg64(b, current_report_addr);
+
+            struct mme_value64 report = mme_load_value64(b);
+            nvk_mme_write_query(b, dst_addr, r, flags, report);
+            mme_free_reg64(b, report);
+
+            mme_add_to(b, r, r, mme_imm(1));
+         }
+      }
+      mme_free_reg(b, write_results);
+
+      /* Finally, write available if requested */
+      struct mme_value with_availability =
+         mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
+      mme_if(b, ine, with_availability, mme_zero()) {
+         nvk_mme_write_query(b, dst_addr, report_count, flags,
+                             mme_value64(available, mme_zero()));
+      }
+      mme_free_reg(b, with_availability);
+      mme_free_reg(b, available);
+
+      mme_sub_to(b, query_count, query_count, mme_imm(1));
+      mme_add64_to(b, report_addr, report_addr,
+                   mme_value64(query_stride, mme_zero()));
+      mme_add64_to(b, available_addr, available_addr,
+                   mme_value64(available_stride, mme_zero()));
+      mme_add64_to(b, dst_addr, dst_addr, dst_stride);
+   }
+}
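
As a worked example (not part of the commit), this is the destination layout
nvk_mme_write_query() produces for one query with report_count == 2 and
VK_QUERY_RESULT_WITH_AVAILABILITY_BIT set; the offsets follow from the
idx << 3 and idx << 2 shifts above, and the availability word comes from the
final call with idx == report_count:

    /* With VK_QUERY_RESULT_64_BIT:     Without VK_QUERY_RESULT_64_BIT:
     *   dst +  0: result[0]    (u64)     dst + 0: result[0]    (u32)
     *   dst +  8: result[1]    (u64)     dst + 4: result[1]    (u32)
     *   dst + 16: availability (u64)     dst + 8: availability (u32)
     */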