diff --git a/src/nouveau/mme/mme_builder.h b/src/nouveau/mme/mme_builder.h index 8e4bd15a711..5433d018d6f 100644 --- a/src/nouveau/mme/mme_builder.h +++ b/src/nouveau/mme/mme_builder.h @@ -556,6 +556,14 @@ mme_load(struct mme_builder *b) UNREACHABLE("Unsupported GPU class"); } +static inline struct mme_value64 +mme_load_value64(struct mme_builder *b) +{ + struct mme_value lo = mme_load(b); + struct mme_value hi = mme_load(b); + return mme_value64(lo, hi); +} + static inline struct mme_value64 mme_load_addr64(struct mme_builder *b) { diff --git a/src/nouveau/vulkan/nvk_mme.c b/src/nouveau/vulkan/nvk_mme.c index 4d8b364375a..01f95302715 100644 --- a/src/nouveau/vulkan/nvk_mme.c +++ b/src/nouveau/vulkan/nvk_mme.c @@ -38,6 +38,7 @@ static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = { [NVK_MME_SET_VIEWPORT_MIN_MAX_Z] = nvk_mme_set_viewport_min_max_z, [NVK_MME_SET_Z_CLAMP] = nvk_mme_set_z_clamp, [NVK_MME_SET_STATISTICS_COUNTERS] = nvk_mme_set_statistics_counters, + [NVK_MME_COPY_QUERIES] = nvk_mme_copy_queries, }; static const struct nvk_mme_test_case *mme_tests[NVK_MME_COUNT] = { diff --git a/src/nouveau/vulkan/nvk_mme.h b/src/nouveau/vulkan/nvk_mme.h index afffcc6cc19..7ad4e99b531 100644 --- a/src/nouveau/vulkan/nvk_mme.h +++ b/src/nouveau/vulkan/nvk_mme.h @@ -42,6 +42,7 @@ enum nvk_mme { NVK_MME_SET_VIEWPORT_MIN_MAX_Z, NVK_MME_SET_Z_CLAMP, NVK_MME_SET_STATISTICS_COUNTERS, + NVK_MME_COPY_QUERIES, NVK_MME_COUNT, }; @@ -252,6 +253,7 @@ void nvk_mme_set_conservative_raster_state(struct mme_builder *b); void nvk_mme_set_viewport_min_max_z(struct mme_builder *b); void nvk_mme_set_z_clamp(struct mme_builder *b); void nvk_mme_set_statistics_counters(struct mme_builder *b); +void nvk_mme_copy_queries(struct mme_builder *b); uint32_t nvk_mme_tess_params(mesa_shader_stage stage, enum nak_ts_domain domain, diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index 2da57a461cb..e7f762f83b4 100644 --- 
a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -1068,6 +1068,8 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer); + const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); + const struct nvk_physical_device *pdev = nvk_device_physical(dev); if (unlikely(!queryCount)) return; @@ -1077,7 +1079,7 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, uint64_t avail_addr = nvk_query_available_addr(pool, firstQuery + i); struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); - __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA); + __push_mthd(p, nvk_cmd_buffer_last_subchannel(cmd), NV906F_SEMAPHOREA); P_NV906F_SEMAPHOREA(p, avail_addr >> 32); P_NV906F_SEMAPHOREB(p, (avail_addr & UINT32_MAX) >> 2); P_NV906F_SEMAPHOREC(p, 1); @@ -1089,9 +1091,49 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, } } - uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset); - nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount, - dst_addr, stride, flags); + const uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset); + + /* Only use the MME for the copy if we have a small number of queries on + * Turing+. We also ensure it doesn't cause a switch to 3D subchannel on + * Turing as it's missing MME on compute. 
+ */ + const bool should_use_mme_copy = + queryCount <= 5 && pdev->info.cls_eng3d >= TURING_A && + (nvk_cmd_buffer_last_subchannel(cmd) != SUBC_NV90C0 || + pdev->info.cls_compute >= AMPERE_COMPUTE_B); + + if (!should_use_mme_copy) { + nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount, + dst_addr, stride, flags); + } else { + uint64_t report_addr = nvk_query_report_addr(pool, firstQuery); + const uint64_t available_addr = nvk_query_available_addr(pool, firstQuery); + + if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) + report_addr += offsetof(struct nvk_query_report, timestamp); + else + report_addr += offsetof(struct nvk_query_report, value); + + struct nv_push *p = nvk_cmd_buffer_push(cmd, 14); + if (nvk_cmd_buffer_last_subchannel(cmd) == SUBC_NV90C0 && + pdev->info.cls_compute >= AMPERE_COMPUTE_B) + P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_COPY_QUERIES)); + else + P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_COPY_QUERIES)); + P_INLINE_DATA(p, report_addr >> 32); + P_INLINE_DATA(p, report_addr); + P_INLINE_DATA(p, available_addr >> 32); + P_INLINE_DATA(p, available_addr); + P_INLINE_DATA(p, nvk_query_available_stride_B(pool)); + P_INLINE_DATA(p, vk_query_pool_report_count(&pool->vk)); + P_INLINE_DATA(p, pool->query_stride); + P_INLINE_DATA(p, queryCount); + P_INLINE_DATA(p, dst_addr >> 32); + P_INLINE_DATA(p, dst_addr); + P_INLINE_DATA(p, stride >> 32); + P_INLINE_DATA(p, stride); + P_INLINE_DATA(p, flags); + } } void @@ -1152,3 +1194,103 @@ const struct nvk_mme_test_case nvk_mme_set_statistics_counters_tests[] = {{ {NV9097_SET_STATISTICS_COUNTER, 0x100}, {}}, }, {}}; + +/* This helper is quite convoluted because we only have 4 registers to work + * with when writing a report result */ +static void +nvk_mme_write_query(struct mme_builder *b, + struct mme_value64 dst_addr, + struct mme_value idx, + struct mme_value flags, + struct mme_value64 result) +{ + struct mme_value result_64_bit = mme_and(b, flags, mme_imm(VK_QUERY_RESULT_64_BIT)); + 
mme_if(b, ine, result_64_bit, mme_zero()) { + struct mme_value report_offset = mme_sll(b, idx, mme_imm(3)); + struct mme_value64 report_addr = + mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero())); + mme_free_reg(b, report_offset); + + mme_store_global(b, report_addr, result.lo); + + mme_add64_to(b, report_addr, report_addr, mme_imm64(4)); + mme_store_global(b, report_addr, result.hi); + mme_free_reg64(b, report_addr); + } + + mme_if(b, ieq, result_64_bit, mme_zero()) { + struct mme_value report_offset = mme_sll(b, idx, mme_imm(2)); + struct mme_value64 report_addr = + mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero())); + mme_free_reg(b, report_offset); + + mme_store_global(b, report_addr, result.lo); + mme_free_reg64(b, report_addr); + } + mme_free_reg(b, result_64_bit); +} + +void +nvk_mme_copy_queries(struct mme_builder *b) +{ + if (b->devinfo->cls_eng3d < TURING_A) + return; + + struct mme_value64 report_addr = mme_load_addr64(b); + struct mme_value64 available_addr = mme_load_addr64(b); + struct mme_value available_stride = mme_load(b); + struct mme_value report_count = mme_load(b); + struct mme_value query_stride = mme_load(b); + struct mme_value query_count = mme_load(b); + struct mme_value64 dst_addr = mme_load_addr64(b); + struct mme_value64 dst_stride = mme_load_addr64(b); + struct mme_value flags = mme_load(b); + + /* Now handle queries */ + mme_while(b, ine, query_count, mme_zero()) { + /* We load available and determine if a result needs to be written */ + mme_tu104_read_fifoed(b, available_addr, mme_imm(1)); + struct mme_value available = mme_load(b); + struct mme_value write_results = + mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); + mme_or_to(b, write_results, write_results, available); + + mme_if(b, ine, write_results, mme_zero()) { + struct mme_value r = mme_mov(b, mme_zero()); + mme_while(b, ine, r, report_count) { + /* Set up MME fifo read, we only have 7 registers to work with so + * we aggressively free 
registers */ + STATIC_ASSERT(sizeof(struct nvk_query_report) % 2 == 0); + struct mme_value current_report_offs = mme_sll( + b, r, mme_imm(util_logbase2(sizeof(struct nvk_query_report)))); + struct mme_value64 current_report_addr = mme_add64( + b, report_addr, mme_value64(current_report_offs, mme_zero())); + mme_tu104_read_fifoed(b, current_report_addr, mme_imm(2)); + mme_free_reg(b, current_report_offs); + mme_free_reg64(b, current_report_addr); + + struct mme_value64 report = mme_load_value64(b); + nvk_mme_write_query(b, dst_addr, r, flags, report); + mme_free_reg64(b, report); + + mme_add_to(b, r, r, mme_imm(1)); + } + } + mme_free_reg(b, write_results); + + /* Finally write available if needed */ + struct mme_value with_availability = + mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); + mme_if(b, ine, with_availability, mme_zero()) { + nvk_mme_write_query(b, dst_addr, report_count, flags, + mme_value64(available, mme_zero())); + } + mme_free_reg(b, with_availability); + mme_free_reg(b, available); + + mme_sub_to(b, query_count, query_count, mme_imm(1)); + mme_add64_to(b, report_addr, report_addr, mme_value64(query_stride, mme_zero())); + mme_add64_to(b, available_addr, available_addr, mme_value64(available_stride, mme_zero())); + mme_add64_to(b, dst_addr, dst_addr, dst_stride); + } +}