diff --git a/src/nouveau/mme/mme_builder.h b/src/nouveau/mme/mme_builder.h index 8e4bd15a711..5433d018d6f 100644 --- a/src/nouveau/mme/mme_builder.h +++ b/src/nouveau/mme/mme_builder.h @@ -556,6 +556,14 @@ mme_load(struct mme_builder *b) UNREACHABLE("Unsupported GPU class"); } +static inline struct mme_value64 +mme_load_value64(struct mme_builder *b) +{ + struct mme_value lo = mme_load(b); + struct mme_value hi = mme_load(b); + return mme_value64(lo, hi); +} + static inline struct mme_value64 mme_load_addr64(struct mme_builder *b) { diff --git a/src/nouveau/vulkan/cl/nvk_query.cl b/src/nouveau/vulkan/cl/nvk_query.cl index 72b3a08e5e6..737cf9c2246 100644 --- a/src/nouveau/vulkan/cl/nvk_query.cl +++ b/src/nouveau/vulkan/cl/nvk_query.cl @@ -21,24 +21,12 @@ nvk_copy_queries(uint64_t pool_addr, uint available_stride, bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); uint64_t report_offs = reports_start + (uint64_t)query * (uint64_t)query_stride; - global struct nvk_query_report *report = - (global void *)(pool_addr + report_offs); + global uint64_t *report = (global uint64_t *)(pool_addr + report_offs); uint64_t dst_offset = dst_stride * (uint64_t)i; - if (flags & NVK_QUERY_IS_TIMESTAMP) { - /* Timestamp queries are the only ones use a single report */ - if (write_results) { - vk_write_query(dst_addr + dst_offset, 0, flags, report->timestamp); - } - } else { - if (write_results) { - for (uint r = 0; r < report_count; ++r) { - uint delta = report[(r * 2) + 1].value - report[r * 2].value; - - vk_write_query(dst_addr + dst_offset, r, flags, delta); - } - } + for (uint r = 0; r < report_count; ++r) { + vk_write_query(dst_addr + dst_offset, r, flags, report[r * 2]); } if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { diff --git a/src/nouveau/vulkan/cl/nvk_query.h b/src/nouveau/vulkan/cl/nvk_query.h index ed697253340..ca917047251 100644 --- a/src/nouveau/vulkan/cl/nvk_query.h +++ b/src/nouveau/vulkan/cl/nvk_query.h @@ -6,8 +6,6 @@ #include "compiler/libcl/libcl.h" -#define NVK_QUERY_IS_TIMESTAMP 0x80000000u - struct nvk_query_report { uint64_t value; uint64_t timestamp; diff --git a/src/nouveau/vulkan/nvk_cmd_draw.c b/src/nouveau/vulkan/nvk_cmd_draw.c index 2d12dcaef1b..19ca89acbe6 100644 --- a/src/nouveau/vulkan/nvk_cmd_draw.c +++ b/src/nouveau/vulkan/nvk_cmd_draw.c @@ -399,8 +399,24 @@ nvk_push_draw_state_init(struct nvk_queue *queue, struct nv_push *p) .output7 = OUTPUT7_FALSE, }); - /* The blob driver just always leaves this on. 
*/ - P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_TRUE); + P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_FALSE); + P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, { + .da_vertices_generated_enable = false, + .da_primitives_generated_enable = false, + .vs_invocations_enable = false, + .gs_invocations_enable = false, + .gs_primitives_generated_enable = false, + .streaming_primitives_succeeded_enable = false, + .streaming_primitives_needed_enable = false, + .clipper_invocations_enable = false, + .clipper_primitives_generated_enable = false, + .ps_invocations_enable = false, + .ti_invocations_enable = false, + .ts_invocations_enable = false, + .ts_primitives_generated_enable = false, + .total_streaming_primitives_needed_succeeded_enable = false, + .vtg_primitives_out_enable = false, + }); P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0)); P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, { .enable = ENABLE_TRUE }); diff --git a/src/nouveau/vulkan/nvk_cmd_meta.c b/src/nouveau/vulkan/nvk_cmd_meta.c index d58f2ca4df2..a1f79df3c7f 100644 --- a/src/nouveau/vulkan/nvk_cmd_meta.c +++ b/src/nouveau/vulkan/nvk_cmd_meta.c @@ -78,10 +78,9 @@ nvk_meta_begin(struct nvk_cmd_buffer *cmd, { const struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors; - struct nv_push *p = nvk_cmd_buffer_push(cmd, 6); - + struct nv_push *p = nvk_cmd_buffer_push(cmd, 10); + P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_PASSTHROUGH); P_IMMD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE, MODE_ALWAYS_RENDER); - P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, { .da_vertices_generated_enable = false, .da_primitives_generated_enable = false, @@ -99,8 +98,8 @@ nvk_meta_begin(struct nvk_cmd_buffer *cmd, .total_streaming_primitives_needed_succeeded_enable = false, .vtg_primitives_out_enable = false, }); - - P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, false); + P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_FALSE); + P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_TRACK_WITH_FILTER); save->dynamic = cmd->vk.dynamic_graphics_state; save->_dynamic_vi = cmd->state.gfx._dynamic_vi; @@ -189,29 +188,13 @@ nvk_meta_end(struct nvk_cmd_buffer *cmd, nvk_descriptor_state_set_root_array(cmd, desc, push, 0, sizeof(save->push), save->push); - struct nv_push *p = nvk_cmd_buffer_push(cmd, 6); - - P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, true); - - P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, { - .da_vertices_generated_enable = true, - .da_primitives_generated_enable = true, - .vs_invocations_enable = true, - .gs_invocations_enable = true, - .gs_primitives_generated_enable = true, - .streaming_primitives_succeeded_enable = true, - .streaming_primitives_needed_enable = true, - .clipper_invocations_enable = true, - .clipper_primitives_generated_enable = true, - .ps_invocations_enable = true, - .ti_invocations_enable = true, - .ts_invocations_enable = true, - .ts_primitives_generated_enable = true, - .total_streaming_primitives_needed_succeeded_enable = true, - .vtg_primitives_out_enable = true, - }); - + /* Replay the previous state from shadow RAM */ + struct nv_push *p = nvk_cmd_buffer_push(cmd, 10); + P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_REPLAY); + P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_FALSE); + P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, {}); P_IMMD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE, MODE_USE_RENDER_ENABLE); + P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_TRACK_WITH_FILTER); } VKAPI_ATTR void VKAPI_CALL diff --git a/src/nouveau/vulkan/nvk_mme.c b/src/nouveau/vulkan/nvk_mme.c index d4e36f7c78d..01f95302715 
100644 --- a/src/nouveau/vulkan/nvk_mme.c +++ b/src/nouveau/vulkan/nvk_mme.c @@ -37,6 +37,8 @@ static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = { [NVK_MME_SET_CONSERVATIVE_RASTER_STATE] = nvk_mme_set_conservative_raster_state, [NVK_MME_SET_VIEWPORT_MIN_MAX_Z] = nvk_mme_set_viewport_min_max_z, [NVK_MME_SET_Z_CLAMP] = nvk_mme_set_z_clamp, + [NVK_MME_SET_STATISTICS_COUNTERS] = nvk_mme_set_statistics_counters, + [NVK_MME_COPY_QUERIES] = nvk_mme_copy_queries, }; static const struct nvk_mme_test_case *mme_tests[NVK_MME_COUNT] = { @@ -45,6 +47,7 @@ static const struct nvk_mme_test_case *mme_tests[NVK_MME_COUNT] = { [NVK_MME_SET_TESS_PARAMS] = nvk_mme_set_tess_params_tests, [NVK_MME_SET_SHADING_RATE_CONTROL] = nvk_mme_set_shading_rate_control_tests, [NVK_MME_SET_ANTI_ALIAS] = nvk_mme_set_anti_alias_tests, + [NVK_MME_SET_STATISTICS_COUNTERS] = nvk_mme_set_statistics_counters_tests, }; uint32_t * diff --git a/src/nouveau/vulkan/nvk_mme.h b/src/nouveau/vulkan/nvk_mme.h index a4d7f7ba309..7ad4e99b531 100644 --- a/src/nouveau/vulkan/nvk_mme.h +++ b/src/nouveau/vulkan/nvk_mme.h @@ -41,6 +41,8 @@ enum nvk_mme { NVK_MME_SET_CONSERVATIVE_RASTER_STATE, NVK_MME_SET_VIEWPORT_MIN_MAX_Z, NVK_MME_SET_Z_CLAMP, + NVK_MME_SET_STATISTICS_COUNTERS, + NVK_MME_COPY_QUERIES, NVK_MME_COUNT, }; @@ -68,6 +70,7 @@ enum nvk_mme_scratch { NVK_MME_SCRATCH_WRITE_MASK_DYN, NVK_MME_SCRATCH_WRITE_MASK_PIPELINE, NVK_MME_SCRATCH_CONSERVATIVE_RASTER_STATE, + NVK_MME_SCRATCH_STATISTICS_COUNTER_STATE, /* Copy of SET_WINDOW_CLIP_ENABLE */ NVK_MME_SCRATCH_WINDOW_CLIP_ENABLED, /* TODO: can we use shadow-ram? */ @@ -249,6 +252,8 @@ void nvk_mme_set_write_mask(struct mme_builder *b); void nvk_mme_set_conservative_raster_state(struct mme_builder *b); void nvk_mme_set_viewport_min_max_z(struct mme_builder *b); void nvk_mme_set_z_clamp(struct mme_builder *b); +void nvk_mme_set_statistics_counters(struct mme_builder *b); +void nvk_mme_copy_queries(struct mme_builder *b); uint32_t nvk_mme_tess_params(mesa_shader_stage stage, enum nak_ts_domain domain, @@ -278,6 +283,7 @@ extern const struct nvk_mme_test_case nvk_mme_bind_vb_tests[]; extern const struct nvk_mme_test_case nvk_mme_set_tess_params_tests[]; extern const struct nvk_mme_test_case nvk_mme_set_shading_rate_control_tests[]; extern const struct nvk_mme_test_case nvk_mme_set_anti_alias_tests[]; +extern const struct nvk_mme_test_case nvk_mme_set_statistics_counters_tests[]; void nvk_test_all_mmes(const struct nv_device_info *devinfo); diff --git a/src/nouveau/vulkan/nvk_physical_device.c b/src/nouveau/vulkan/nvk_physical_device.c index 1a610c5e4c2..45aecb643ce 100644 --- a/src/nouveau/vulkan/nvk_physical_device.c +++ b/src/nouveau/vulkan/nvk_physical_device.c @@ -1759,10 +1759,11 @@ nvk_GetPhysicalDeviceQueueFamilyProperties2( vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) { p->queueFamilyProperties.queueFlags = queue_family->queue_flags; p->queueFamilyProperties.queueCount = queue_family->queue_count; - if (queue_family->queue_flags & VK_QUEUE_GRAPHICS_BIT) { + if (queue_family->queue_flags & + (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT)) { p->queueFamilyProperties.timestampValidBits = 64; } else { - /* TODO: Timestamps on non-graphics queues */ p->queueFamilyProperties.timestampValidBits = 0; } p->queueFamilyProperties.minImageTransferGranularity = diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index 7082d95dc96..33172046416 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ 
b/src/nouveau/vulkan/nvk_query_pool.c @@ -16,6 +16,7 @@ #include "vk_common_entrypoints.h" #include "vk_meta.h" #include "vk_pipeline.h" +#include "vk_synchronization.h" #include "cl/nvk_query.h" #include "compiler/nir/nir.h" @@ -26,9 +27,13 @@ #include "nv_push_cl906f.h" #include "nv_push_cl9097.h" +#include "nv_push_cl90b5.h" +#include "nv_push_cl90c0.h" #include "nv_push_cla0c0.h" #include "nv_push_clc597.h" #include "nv_push_clc7c0.h" +#include "nv_push_clc86f.h" +#include "nv_push_clcb97.h" static uint32_t vk_query_pool_report_count(const struct vk_query_pool *vk_pool) @@ -50,6 +55,53 @@ vk_query_pool_report_count(const struct vk_query_pool *vk_pool) } } +static uint32_t +vk_query_pool_statistics_counter_mask(const struct vk_query_pool *vk_pool) +{ + uint32_t result = 0; + + switch (vk_pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_TIMESTAMP: + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + const VkQueryPipelineStatisticFlags stats = vk_pool->pipeline_statistics; + V_NV9097_SET_STATISTICS_COUNTER(result, { + .da_vertices_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT) != 0, + .da_primitives_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT) != 0, + .vs_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT) != 0, + .gs_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT) != 0, + .gs_primitives_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) != 0, + .clipper_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT) != 0, + .clipper_primitives_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT) != 0, + .ps_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) != 0, + .ti_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT) != 0, + .ts_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT) != 0, + }); + break; + } + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + V_NV9097_SET_STATISTICS_COUNTER(result, { + .vtg_primitives_out_enable = true, + }); + break; + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + V_NV9097_SET_STATISTICS_COUNTER(result, { + .streaming_primitives_succeeded_enable = true, + .streaming_primitives_needed_enable = true, + }); + break; + + default: + UNREACHABLE("Unsupported query type"); + } + + return result; +} + VKAPI_ATTR VkResult VKAPI_CALL nvk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, @@ -66,22 +118,20 @@ nvk_CreateQueryPool(VkDevice device, if (!pool) return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); - /* Use interleaved layouts on Tegra so we can safely handle non-coherent - * maps + /* Use a packed layout for timestamps. 
For other queries, interleaved + * layouts on Tegra so we can safely handle non-coherent maps */ - if (pdev->info.type == NV_DEVICE_TYPE_SOC) + if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) + pool->layout = NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED; + else if (pdev->info.type == NV_DEVICE_TYPE_SOC) pool->layout = NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED; else pool->layout = NVK_QUERY_POOL_LAYOUT_SEPARATE; - uint32_t reports_per_query; - if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) { - /* Timestamps are just a single timestamp */ - reports_per_query = 1; - } else { - /* Everything else is two queries because we have to compute a delta */ - reports_per_query = 2 * vk_query_pool_report_count(&pool->vk); - } + pool->statistics_counter_mask = vk_query_pool_statistics_counter_mask(&pool->vk); + + /* Everything is a single query per report */ + uint32_t reports_per_query = vk_query_pool_report_count(&pool->vk); uint64_t mem_size = 0; switch (pool->layout) { @@ -101,6 +151,16 @@ nvk_CreateQueryPool(VkDevice device, mem_size = pool->vk.query_count * (uint64_t)pool->query_stride; break; + case NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED: + pool->reports_start = 0; + pool->query_stride = reports_per_query * sizeof(struct nvk_query_report); + + if (pdev->info.type == NV_DEVICE_TYPE_SOC) + pool->query_stride = align(pool->query_stride, pdev->info.nc_atom_size_B); + + mem_size = pool->vk.query_count * (uint64_t)pool->query_stride; + break; + default: UNREACHABLE("Unsupported query layout"); } @@ -196,7 +256,7 @@ nvk_sync_queries_to_gpu(struct nvk_query_pool *pool, if (pool->mem->flags & NVKMD_MEM_COHERENT) return; - assert(pool->layout == NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED); + assert(pool->layout != NVK_QUERY_POOL_LAYOUT_SEPARATE); nvkmd_mem_sync_map_to_gpu(pool->mem, first_query * pool->query_stride, count * pool->query_stride); } @@ -208,7 +268,7 @@ nvk_sync_queries_from_gpu(struct nvk_query_pool *pool, if (pool->mem->flags & NVKMD_MEM_COHERENT) return; - assert(pool->layout == NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED); + assert(pool->layout != NVK_QUERY_POOL_LAYOUT_SEPARATE); nvkmd_mem_sync_map_from_gpu(pool->mem, first_query * pool->query_stride, count * pool->query_stride); } @@ -260,6 +320,10 @@ nvk_ResetQueryPool(VkDevice device, assert(pool->mem->flags & NVKMD_MEM_COHERENT); uint32_t *available = nvk_query_available_map(pool, firstQuery); memset(available, 0, queryCount * sizeof(*available)); + } else if (pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED) { + struct nvk_query_report *reports = nvk_query_report_map(pool, firstQuery); + memset(reports, 0, queryCount * pool->query_stride); + nvk_sync_queries_to_gpu(pool, firstQuery, queryCount); } else { for (uint32_t i = 0; i < queryCount; i++) { uint32_t *available = nvk_query_available_map(pool, firstQuery + i); @@ -278,20 +342,39 @@ nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); - for (uint32_t i = 0; i < queryCount; i++) { - uint64_t addr = nvk_query_available_addr(pool, firstQuery + i); + if (unlikely(!queryCount)) + return; - struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); - P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); - P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32); - P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr); - P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); - P_NV9097_SET_REPORT_SEMAPHORE_D(p, { - .operation = OPERATION_RELEASE, - .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, - .pipeline_location = 
PIPELINE_LOCATION_ALL, - .structure_size = STRUCTURE_SIZE_ONE_WORD, - }); + const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); + const struct nvk_physical_device *pdev = nvk_device_physical(dev); + + if (queryCount > 1 && pool->layout != NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED) { + uint64_t clear_size; + if (pool->layout == NVK_QUERY_POOL_LAYOUT_SEPARATE) + clear_size = queryCount * sizeof(uint32_t); + else if (pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED) + clear_size = queryCount * pool->query_stride; + else + UNREACHABLE("Unsupported query type"); + + uint64_t addr = nvk_query_available_addr(pool, firstQuery); + nvk_cmd_fill_memory(cmd, addr, clear_size, 0); + } else { + for (uint32_t i = 0; i < queryCount; i++) { + uint64_t addr = nvk_query_available_addr(pool, firstQuery + i); + + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, + .pipeline_location = PIPELINE_LOCATION_ALL, + .structure_size = STRUCTURE_SIZE_ONE_WORD, + }); + } } /* Wait for the above writes to complete. This prevents WaW hazards on any @@ -299,19 +382,17 @@ nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer, * will see the query as unavailable if it happens before the query is * completed again. */ - for (uint32_t i = 0; i < queryCount; i++) { - uint64_t addr = nvk_query_available_addr(pool, firstQuery + i); - - struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); - __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA); - P_NV906F_SEMAPHOREA(p, addr >> 32); - P_NV906F_SEMAPHOREB(p, (addr & UINT32_MAX) >> 2); - P_NV906F_SEMAPHOREC(p, 0); - P_NV906F_SEMAPHORED(p, { - .operation = OPERATION_ACQUIRE, - .acquire_switch = ACQUIRE_SWITCH_ENABLED, - .release_size = RELEASE_SIZE_4BYTE, - }); + if (pdev->info.cls_eng3d >= HOPPER_A) { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 7); + P_IMMD(p, NVC86F, WFI, 0); + P_MTHD(p, NVC86F, MEM_OP_A); + P_NVC86F_MEM_OP_A(p, {}); + P_NVC86F_MEM_OP_B(p, 0); + P_NVC86F_MEM_OP_C(p, { .membar_type = 0 }); + P_NVC86F_MEM_OP_D(p, { .operation = OPERATION_MEMBAR }); + } else { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 1); + __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0); } } @@ -324,30 +405,62 @@ nvk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); - struct nv_push *p = nvk_cmd_buffer_push(cmd, 10); + assert(pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED); uint64_t report_addr = nvk_query_report_addr(pool, query); - P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); - P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); - P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr); - P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); - P_NV9097_SET_REPORT_SEMAPHORE_D(p, { - .operation = OPERATION_REPORT_ONLY, - .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage), - .structure_size = STRUCTURE_SIZE_FOUR_WORDS, - }); + uint8_t subc = nvk_cmd_buffer_last_subchannel(cmd); + if (subc == SUBC_NV9097) { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 7); + P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr); + 
P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, + .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage), + .structure_size = STRUCTURE_SIZE_FOUR_WORDS, + }); + } else if (subc == SUBC_NV90C0) { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 7); - uint64_t available_addr = nvk_query_available_addr(pool, query); - P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); - P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); - P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); - P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); - P_NV9097_SET_REPORT_SEMAPHORE_D(p, { - .operation = OPERATION_RELEASE, - .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, - .pipeline_location = PIPELINE_LOCATION_ALL, - .structure_size = STRUCTURE_SIZE_ONE_WORD, - }); + /* Compute SET_REPORT_SEMAPHORE_D doesn't provide a pipeline location + * meaning that we need to handle first synchronization scope here. + * + * Considering that if we are on the compute subchannel, we only really + * need to wait on anything that runs on compute. + */ + if (vk_expand_src_stage_flags2(stage) & + (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | + VK_PIPELINE_STAGE_2_BLIT_BIT)) + P_IMMD(p, NV90C0, WAIT_FOR_IDLE, 0); + + P_MTHD(p, NV90C0, SET_REPORT_SEMAPHORE_A); + P_NV90C0_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); + P_NV90C0_SET_REPORT_SEMAPHORE_B(p, report_addr); + P_NV90C0_SET_REPORT_SEMAPHORE_C(p, 1); + P_NV90C0_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .structure_size = STRUCTURE_SIZE_FOUR_WORDS, + }); + } else { + assert(subc == SUBC_NV90B5); + struct nv_push *p = nvk_cmd_buffer_push(cmd, 6); + + P_MTHD(p, NV90B5, SET_SEMAPHORE_A); + P_NV90B5_SET_SEMAPHORE_A(p, report_addr >> 32); + P_NV90B5_SET_SEMAPHORE_B(p, report_addr); + P_NV90B5_SET_SEMAPHORE_PAYLOAD(p, 1); + + P_IMMD(p, NV90B5, LAUNCH_DMA, { + .data_transfer_type = DATA_TRANSFER_TYPE_NONE, + .semaphore_type = SEMAPHORE_TYPE_RELEASE_FOUR_WORD_SEMAPHORE, + .flush_enable = FLUSH_ENABLE_TRUE, + /* Note: FLUSH_TYPE=SYS implicitly for NVC3B5+ */ + }); + } /* From the Vulkan spec: * @@ -377,49 +490,60 @@ struct nvk_3d_stat_query { VkQueryPipelineStatisticFlagBits flag; uint8_t loc; uint8_t report; + uint8_t clear_type; }; /* This must remain sorted in flag order */ static const struct nvk_3d_stat_query nvk_3d_stat_queries[] = {{ - .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED, + .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_DA_VERTICES_GENERATED, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED, + .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_DA_PRIMITIVES_GENERATED, }, { - .flag = 
VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_VS_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_GS_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED, + .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_GS_PRIMITIVES_GENERATED, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */ - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_CLIPPER_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */ - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED, + .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_CLIPPER_PRIMITIVES_GENERATED, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_PS_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT, - .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_TI_INVOCATIONS, }, { - .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT, - .loc = 
NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER, - .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS, + .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT, + .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER, + .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS, + .clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_TS_INVOCATIONS, }, { .flag = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT, .loc = UINT8_MAX, @@ -453,23 +577,143 @@ nvk_mme_write_cs_invocations(struct mme_builder *b) } static void -nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, - struct nvk_query_pool *pool, - uint32_t query, uint32_t index, - bool end) +nvk_cmd_clear_report_value(struct nvk_cmd_buffer *cmd, + struct nvk_query_pool *pool) { const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); const struct nvk_physical_device *pdev = nvk_device_physical(dev); - uint64_t report_addr = nvk_query_report_addr(pool, query) + - end * sizeof(struct nvk_query_report); - - uint32_t end_size = 7 * end; - - struct nv_push *p; switch (pool->vk.query_type) { - case VK_QUERY_TYPE_OCCLUSION: - p = nvk_cmd_buffer_push(cmd, 5 + end_size); + case VK_QUERY_TYPE_OCCLUSION: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_ZPASS_PIXEL_CNT); + break; + } + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics); + struct nv_push *p = nvk_cmd_buffer_push(cmd, stat_count * 2); + + ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics; + for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) { + const struct nvk_3d_stat_query *sq = &nvk_3d_stat_queries[i]; + if (!(stats_left & sq->flag)) + continue; + + /* The 3D stat queries array MUST be sorted */ + assert(!(stats_left & (sq->flag - 1))); + + if (sq->flag == VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT) { + if (pdev->info.cls_compute >= AMPERE_COMPUTE_B) { + P_IMMD_WORD(p, NVC7C0, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI), 0); + P_IMMD_WORD(p, NVC7C0, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO), 0); + } + else { + P_IMMD_WORD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI), 0); + P_IMMD_WORD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO), 0); + } + } else { + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, sq->clear_type); + } + + stats_left &= ~sq->flag; + } + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 4); + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_STREAMING_PRIMITIVES_SUCCEEDED); + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_STREAMING_PRIMITIVES_NEEDED); + break; + } + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); + P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_VTG_PRIMITIVES_OUT); + break; + } + + default: + UNREACHABLE("Unsupported query type"); + } +} + +static void +nvk_cmd_set_statistics_counters(struct nvk_cmd_buffer *cmd, + struct nvk_query_pool *pool, bool enable) +{ + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); + P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, enable); + break; + } + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + if (pool->statistics_counter_mask != 0) { + struct nv_push *p = 
nvk_cmd_buffer_push(cmd, 3); + P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_STATISTICS_COUNTERS)); + P_INLINE_DATA(p, enable); + P_INLINE_DATA(p, pool->statistics_counter_mask); + } + break; + } + + default: + UNREACHABLE("Unsupported query type"); + } +} + +VKAPI_ATTR void VKAPI_CALL +nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + VkQueryControlFlags flags, + uint32_t index) +{ + VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); + + /* From the Vulkan 1.4.350 spec, vkCmdBeginQuery: + * + * VUID-vkCmdBeginQuery-queryPool-01922 + * + * "queryPool must have been created with a queryType that differs from + * that of any queries that are active within commandBuffer" + * + * and + * + * "After beginning a query, that query is considered active within the + * command buffer it was called in until that same query is ended. + * Queries active in a primary command buffer when secondary command + * buffers are executed are considered active for those secondary command + * buffers." + * + * This means we will never have two queries with the same type active and + * can rely on cleaning and toggling counters. + */ + nvk_cmd_clear_report_value(cmd, pool); + nvk_cmd_set_statistics_counters(cmd, pool, true); +} + +VKAPI_ATTR void VKAPI_CALL +nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + uint32_t index) +{ + VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); + + const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); + const struct nvk_physical_device *pdev = nvk_device_physical(dev); + + uint64_t report_addr = nvk_query_report_addr(pool, query); + + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); @@ -483,10 +727,11 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, .flush_disable = true, }); break; + } case VK_QUERY_TYPE_PIPELINE_STATISTICS: { uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics); - p = nvk_cmd_buffer_push(cmd, stat_count * 5 + end_size); + struct nv_push *p = nvk_cmd_buffer_push(cmd, stat_count * 5); ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics; for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) { @@ -518,7 +763,7 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, }); } - report_addr += 2 * sizeof(struct nvk_query_report); + report_addr += sizeof(struct nvk_query_report); stats_left &= ~sq->flag; } break; @@ -529,7 +774,7 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_SUCCEEDED, NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_NEEDED, }; - p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports) + end_size); + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports)); for (uint32_t i = 0; i < ARRAY_SIZE(xfb_reports); ++i) { P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); @@ -543,13 +788,13 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, .sub_report = index, .flush_disable = true, }); - report_addr += 2 * sizeof(struct nvk_query_report); + report_addr += sizeof(struct nvk_query_report); } break; } - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - p = nvk_cmd_buffer_push(cmd, 5 + end_size); + case 
VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); @@ -564,51 +809,30 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd, .flush_disable = true, }); break; + } default: UNREACHABLE("Unsupported query type"); } - if (end) { - P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0); - uint64_t available_addr = nvk_query_available_addr(pool, query); - P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); - P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); - P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); - P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); - P_NV9097_SET_REPORT_SEMAPHORE_D(p, { - .operation = OPERATION_RELEASE, - .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, - .pipeline_location = PIPELINE_LOCATION_ALL, - .structure_size = STRUCTURE_SIZE_ONE_WORD, - }); - } -} + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); + P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0); -VKAPI_ATTR void VKAPI_CALL -nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query, - VkQueryControlFlags flags, - uint32_t index) -{ - VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); - VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); + nvk_cmd_set_statistics_counters(cmd, pool, false); - nvk_cmd_begin_end_query(cmd, pool, query, index, false); -} - -VKAPI_ATTR void VKAPI_CALL -nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query, - uint32_t index) -{ - VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); - VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); - - nvk_cmd_begin_end_query(cmd, pool, query, index, true); + uint64_t available_addr = nvk_query_available_addr(pool, query); + p = nvk_cmd_buffer_push(cmd, 5); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, + .pipeline_location = PIPELINE_LOCATION_ALL, + .structure_size = STRUCTURE_SIZE_ONE_WORD, + }); /* From the Vulkan spec: * @@ -676,14 +900,6 @@ cpu_write_query_result(void *dst, uint32_t idx, } } -static void -cpu_get_query_delta(void *dst, const struct nvk_query_report *src, - uint32_t idx, VkQueryResultFlags flags) -{ - uint64_t delta = src[idx * 2 + 1].value - src[idx * 2].value; - cpu_write_query_result(dst, idx, flags, delta); -} - VKAPI_ATTR VkResult VKAPI_CALL nvk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool, @@ -732,10 +948,10 @@ nvk_GetQueryPoolResults(VkDevice device, if (write_results) cpu_write_query_result(dst, 0, flags, src->timestamp); } else { - /* For everything else, we have to compute deltas */ + /* For everything else, we can just write it */ if (write_results) { for (uint32_t j = 0; j < report_count; j++) - cpu_get_query_delta(dst, src, j, flags); + cpu_write_query_result(dst, j, flags, src[j].value); } } @@ -862,13 +1078,16 @@ nvk_meta_copy_query_pool_results(struct nvk_cmd_buffer *cmd, return; } + uint64_t reports_start = pool->reports_start; if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) - flags |= NVK_QUERY_IS_TIMESTAMP; + reports_start += offsetof(struct nvk_query_report, timestamp); + else + reports_start += offsetof(struct nvk_query_report, value); const struct nvk_copy_query_push push = { .pool_addr = 
pool->mem->va->addr, .available_stride = nvk_query_available_stride_B(pool), - .reports_start = pool->reports_start, + .reports_start = reports_start, .report_count = vk_query_pool_report_count(&pool->vk), .query_stride = pool->query_stride, .first_query = first_query, @@ -894,13 +1113,18 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); VK_FROM_HANDLE(nvk_query_pool, pool, queryPool); VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer); + const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); + const struct nvk_physical_device *pdev = nvk_device_physical(dev); + + if (unlikely(!queryCount)) + return; if (flags & VK_QUERY_RESULT_WAIT_BIT) { for (uint32_t i = 0; i < queryCount; i++) { uint64_t avail_addr = nvk_query_available_addr(pool, firstQuery + i); struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); - __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA); + __push_mthd(p, nvk_cmd_buffer_last_subchannel(cmd), NV906F_SEMAPHOREA); P_NV906F_SEMAPHOREA(p, avail_addr >> 32); P_NV906F_SEMAPHOREB(p, (avail_addr & UINT32_MAX) >> 2); P_NV906F_SEMAPHOREC(p, 1); @@ -912,8 +1136,206 @@ } } - uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset); - nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount, - dst_addr, stride, flags); + const uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset); + + /* Only use the MME for the copy when there is a small number of queries, on + * Turing+. We also make sure this doesn't force a switch to the 3D subchannel + * on Turing, which has no MME on its compute class. + */ + const bool should_use_mme_copy = + queryCount <= 5 && pdev->info.cls_eng3d >= TURING_A && + (nvk_cmd_buffer_last_subchannel(cmd) != SUBC_NV90C0 || + pdev->info.cls_compute >= AMPERE_COMPUTE_B); + + if (!should_use_mme_copy) { + nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount, + dst_addr, stride, flags); + } else { + uint64_t report_addr = nvk_query_report_addr(pool, firstQuery); + const uint64_t available_addr = nvk_query_available_addr(pool, firstQuery); + + if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) + report_addr += offsetof(struct nvk_query_report, timestamp); + else + report_addr += offsetof(struct nvk_query_report, value); + + struct nv_push *p = nvk_cmd_buffer_push(cmd, 14); + if (nvk_cmd_buffer_last_subchannel(cmd) == SUBC_NV90C0 && + pdev->info.cls_compute >= AMPERE_COMPUTE_B) + P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_COPY_QUERIES)); + else + P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_COPY_QUERIES)); + P_INLINE_DATA(p, report_addr >> 32); + P_INLINE_DATA(p, report_addr); + P_INLINE_DATA(p, available_addr >> 32); + P_INLINE_DATA(p, available_addr); + P_INLINE_DATA(p, nvk_query_available_stride_B(pool)); + P_INLINE_DATA(p, vk_query_pool_report_count(&pool->vk)); + P_INLINE_DATA(p, pool->query_stride); + P_INLINE_DATA(p, queryCount); + P_INLINE_DATA(p, dst_addr >> 32); + P_INLINE_DATA(p, dst_addr); + P_INLINE_DATA(p, stride >> 32); + P_INLINE_DATA(p, stride); + P_INLINE_DATA(p, flags); + } } +void +nvk_mme_set_statistics_counters(struct mme_builder *b) +{ + struct mme_value enable = mme_load(b); + struct mme_value mask = mme_load(b); + struct mme_value state = nvk_mme_load_scratch(b, STATISTICS_COUNTER_STATE); + + mme_if(b, ieq, enable, mme_imm(0)) { + mme_and_not_to(b, state, state, mask); + } + + mme_if(b, ine, enable, mme_imm(0)) { + mme_or_to(b, state, state, mask); + } + + nvk_mme_store_scratch(b, STATISTICS_COUNTER_STATE,
state); + mme_mthd(b, NV9097_SET_STATISTICS_COUNTER); + mme_emit(b, state); +} + +const struct nvk_mme_test_case nvk_mme_set_statistics_counters_tests[] = {{ + /* This case doesn't change the state so it should do nothing */ + .init = + (struct nvk_mme_mthd_data[]){ + {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0}, + {NV9097_SET_STATISTICS_COUNTER, 0}, + {}}, + .params = (uint32_t[]){1, 0}, + .expected = + (struct nvk_mme_mthd_data[]){ + {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0}, + {NV9097_SET_STATISTICS_COUNTER, 0}, + {}}, +}, { + .init = + (struct nvk_mme_mthd_data[]){ + {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x100}, + {NV9097_SET_STATISTICS_COUNTER, 0x100}, + {}}, + .params = (uint32_t[]){1, 0x200}, + .expected = + (struct nvk_mme_mthd_data[]){ + {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x300}, + {NV9097_SET_STATISTICS_COUNTER, 0x300}, + {}}, +}, { + .init = + (struct nvk_mme_mthd_data[]){ + {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x300}, + {NV9097_SET_STATISTICS_COUNTER, 0x300}, + {}}, + .params = (uint32_t[]){0, 0x200}, + .expected = + (struct nvk_mme_mthd_data[]){ + {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x100}, + {NV9097_SET_STATISTICS_COUNTER, 0x100}, + {}}, +}, {}}; + +/* This helper is quite convoluted because we only have 4 registers to work + * with when writing a report result */ +static void +nvk_mme_write_query(struct mme_builder *b, + struct mme_value64 dst_addr, + struct mme_value idx, + struct mme_value flags, + struct mme_value64 result) +{ + struct mme_value result_64_bit = mme_and(b, flags, mme_imm(VK_QUERY_RESULT_64_BIT)); + mme_if(b, ine, result_64_bit, mme_zero()) { + struct mme_value report_offset = mme_sll(b, idx, mme_imm(3)); + struct mme_value64 report_addr = + mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero())); + mme_free_reg(b, report_offset); + + mme_store_global(b, report_addr, result.lo); + + mme_add64_to(b, report_addr, report_addr, mme_imm64(4)); + mme_store_global(b, report_addr, result.hi); + mme_free_reg64(b, report_addr); + } + + mme_if(b, ieq, result_64_bit, mme_zero()) { + struct mme_value report_offset = mme_sll(b, idx, mme_imm(2)); + struct mme_value64 report_addr = + mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero())); + mme_free_reg(b, report_offset); + + mme_store_global(b, report_addr, result.lo); + mme_free_reg64(b, report_addr); + } + mme_free_reg(b, result_64_bit); +} + +void +nvk_mme_copy_queries(struct mme_builder *b) +{ + if (b->devinfo->cls_eng3d < TURING_A) + return; + + struct mme_value64 report_addr = mme_load_addr64(b); + struct mme_value64 available_addr = mme_load_addr64(b); + struct mme_value available_stride = mme_load(b); + struct mme_value report_count = mme_load(b); + struct mme_value query_stride = mme_load(b); + struct mme_value query_count = mme_load(b); + struct mme_value64 dst_addr = mme_load_addr64(b); + struct mme_value64 dst_stride = mme_load_addr64(b); + struct mme_value flags = mme_load(b); + + /* Now handle queries */ + mme_while(b, ine, query_count, mme_zero()) { + /* We load available and determine if a result need to be written */ + mme_tu104_read_fifoed(b, available_addr, mme_imm(1)); + struct mme_value available = mme_load(b); + struct mme_value write_results = + mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); + mme_or_to(b, write_results, write_results, available); + + mme_if(b, ine, write_results, mme_zero()) { + struct mme_value r = mme_mov(b, mme_zero()); + mme_while(b, ine, r, report_count) { + /* Setup MME fifo read, 
we only have 7 registers to work with so + * we aggressively free registers */ + STATIC_ASSERT(sizeof(struct nvk_query_report) % 2 == 0); + struct mme_value current_report_offs = mme_sll( + b, r, mme_imm(util_logbase2(sizeof(struct nvk_query_report)))); + struct mme_value64 current_report_addr = mme_add64( + b, report_addr, mme_value64(current_report_offs, mme_zero())); + mme_tu104_read_fifoed(b, current_report_addr, mme_imm(2)); + mme_free_reg(b, current_report_offs); + mme_free_reg64(b, current_report_addr); + + struct mme_value64 report = mme_load_value64(b); + nvk_mme_write_query(b, dst_addr, r, flags, report); + mme_free_reg64(b, report); + + mme_add_to(b, r, r, mme_imm(1)); + } + } + mme_free_reg(b, write_results); + + /* Finally, write the availability value if needed */ + struct mme_value with_availability = + mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); + mme_if(b, ine, with_availability, mme_zero()) { + nvk_mme_write_query(b, dst_addr, report_count, flags, + mme_value64(available, mme_zero())); + } + mme_free_reg(b, with_availability); + mme_free_reg(b, available); + + mme_sub_to(b, query_count, query_count, mme_imm(1)); + mme_add64_to(b, report_addr, report_addr, mme_value64(query_stride, mme_zero())); + mme_add64_to(b, available_addr, available_addr, mme_value64(available_stride, mme_zero())); + mme_add64_to(b, dst_addr, dst_addr, dst_stride); + } +} diff --git a/src/nouveau/vulkan/nvk_query_pool.h b/src/nouveau/vulkan/nvk_query_pool.h index 37b6fd2657f..d4b5a64e5ea 100644 --- a/src/nouveau/vulkan/nvk_query_pool.h +++ b/src/nouveau/vulkan/nvk_query_pool.h @@ -28,6 +28,12 @@ enum nvk_query_pool_layout { * byte 16. */ NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED, + + /* Stores the available value and the timestamp in nvk_query_report + * + * This allows us to write a timestamp with only one command. + */ + NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED, }; struct nvk_query_pool { @@ -37,6 +43,7 @@ struct nvk_query_pool { uint32_t reports_start; uint32_t query_stride; + uint32_t statistics_counter_mask; struct nvkmd_mem *mem; };
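
Reviewer note (not part of the patch): with this change, the relevant hardware counter is cleared via CLEAR_REPORT_VALUE at vkCmdBeginQuery, so the single report written at vkCmdEndQuery already holds the final value; previously each counter was sampled at both Begin and End and the result was the delta of a report pair. A minimal CPU-side sketch of the result math, with illustrative helper names only:

static uint64_t
query_result_old(const struct nvk_query_report *reports, uint32_t r)
{
   /* Begin/End report pair: the result is the delta of the two samples */
   return reports[r * 2 + 1].value - reports[r * 2].value;
}

static uint64_t
query_result_new(const struct nvk_query_report *reports, uint32_t r)
{
   /* The counter was cleared at vkCmdBeginQuery, so the single report
    * written at vkCmdEndQuery already is the result.
    */
   return reports[r].value;
}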
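
A rough CPU-side model of what NVK_MME_SET_STATISTICS_COUNTERS does with its two inline-data parameters (enable, mask), matching the behaviour exercised by nvk_mme_set_statistics_counters_tests above; this sketch is illustrative only and not part of the patch:

static uint32_t
model_set_statistics_counters(uint32_t state, uint32_t enable, uint32_t mask)
{
   /* NVK_MME_SCRATCH_STATISTICS_COUNTER_STATE accumulates the counter bits
    * of the currently active query types.
    */
   if (enable)
      state |= mask;   /* mme_or_to(b, state, state, mask) */
   else
      state &= ~mask;  /* mme_and_not_to(b, state, state, mask) */

   /* The macro then stores the new state back to the scratch register and
    * emits NV9097_SET_STATISTICS_COUNTER with it.
    */
   return state;
}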
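
The inline data pushed by nvk_CmdCopyQueryPoolResults has to stay in sync with the mme_load()/mme_load_addr64() order in nvk_mme_copy_queries. For documentation only (this struct does not exist in the tree), the 13 data words following CALL_MME_MACRO(NVK_MME_COPY_QUERIES) are, in order:

/* Hypothetical, for documentation only: layout of the parameter stream. */
struct nvk_mme_copy_queries_args {
   uint32_t report_addr_hi, report_addr_lo;       /* address of the first report */
   uint32_t available_addr_hi, available_addr_lo; /* address of the first available word */
   uint32_t available_stride;                     /* nvk_query_available_stride_B(pool) */
   uint32_t report_count;                         /* reports per query */
   uint32_t query_stride;                         /* pool->query_stride */
   uint32_t query_count;                          /* queryCount */
   uint32_t dst_addr_hi, dst_addr_lo;             /* destination buffer address */
   uint32_t dst_stride_hi, dst_stride_lo;         /* destination stride */
   uint32_t flags;                                /* VkQueryResultFlags */
};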