Merge branch 'nvk-query-rework' into 'main'

nvk: Rework queries

See merge request mesa/mesa!41363
This commit is contained in:
Mary Guillemard 2026-05-08 02:11:17 +02:00
commit 2f330b8bbf
10 changed files with 641 additions and 209 deletions

View file

@ -556,6 +556,14 @@ mme_load(struct mme_builder *b)
UNREACHABLE("Unsupported GPU class");
}
/* Load a 64-bit value from the MME parameter stream.
 *
 * Issues two mme_load() calls, consuming two consecutive 32-bit
 * parameters; the first becomes the low half and the second the high
 * half of the returned mme_value64. The two loads must stay in this
 * order since each mme_load() advances the parameter stream.
 */
static inline struct mme_value64
mme_load_value64(struct mme_builder *b)
{
struct mme_value lo = mme_load(b);
struct mme_value hi = mme_load(b);
return mme_value64(lo, hi);
}
static inline struct mme_value64
mme_load_addr64(struct mme_builder *b)
{

View file

@ -21,24 +21,12 @@ nvk_copy_queries(uint64_t pool_addr, uint available_stride,
bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
uint64_t report_offs = reports_start + (uint64_t)query * (uint64_t)query_stride;
global struct nvk_query_report *report =
(global void *)(pool_addr + report_offs);
global uint64_t *report = (global uint64_t *)(pool_addr + report_offs);
uint64_t dst_offset = dst_stride * (uint64_t)i;
if (flags & NVK_QUERY_IS_TIMESTAMP) {
/* Timestamp queries are the only ones use a single report */
if (write_results) {
vk_write_query(dst_addr + dst_offset, 0, flags, report->timestamp);
}
} else {
if (write_results) {
for (uint r = 0; r < report_count; ++r) {
uint delta = report[(r * 2) + 1].value - report[r * 2].value;
vk_write_query(dst_addr + dst_offset, r, flags, delta);
}
}
for (uint r = 0; r < report_count; ++r) {
vk_write_query(dst_addr + dst_offset, r, flags, report[r * 2]);
}
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {

View file

@ -6,8 +6,6 @@
#include "compiler/libcl/libcl.h"
#define NVK_QUERY_IS_TIMESTAMP 0x80000000u
struct nvk_query_report {
uint64_t value;
uint64_t timestamp;

View file

@ -399,8 +399,24 @@ nvk_push_draw_state_init(struct nvk_queue *queue, struct nv_push *p)
.output7 = OUTPUT7_FALSE,
});
/* The blob driver just always leaves this on. */
P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_TRUE);
P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_FALSE);
P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, {
.da_vertices_generated_enable = false,
.da_primitives_generated_enable = false,
.vs_invocations_enable = false,
.gs_invocations_enable = false,
.gs_primitives_generated_enable = false,
.streaming_primitives_succeeded_enable = false,
.streaming_primitives_needed_enable = false,
.clipper_invocations_enable = false,
.clipper_primitives_generated_enable = false,
.ps_invocations_enable = false,
.ti_invocations_enable = false,
.ts_invocations_enable = false,
.ts_primitives_generated_enable = false,
.total_streaming_primitives_needed_succeeded_enable = false,
.vtg_primitives_out_enable = false,
});
P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0));
P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, { .enable = ENABLE_TRUE });

View file

@ -78,10 +78,9 @@ nvk_meta_begin(struct nvk_cmd_buffer *cmd,
{
const struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_PASSTHROUGH);
P_IMMD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE, MODE_ALWAYS_RENDER);
P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, {
.da_vertices_generated_enable = false,
.da_primitives_generated_enable = false,
@ -99,8 +98,8 @@ nvk_meta_begin(struct nvk_cmd_buffer *cmd,
.total_streaming_primitives_needed_succeeded_enable = false,
.vtg_primitives_out_enable = false,
});
P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, false);
P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_FALSE);
P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_TRACK_WITH_FILTER);
save->dynamic = cmd->vk.dynamic_graphics_state;
save->_dynamic_vi = cmd->state.gfx._dynamic_vi;
@ -189,29 +188,13 @@ nvk_meta_end(struct nvk_cmd_buffer *cmd,
nvk_descriptor_state_set_root_array(cmd, desc, push, 0, sizeof(save->push),
save->push);
struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, true);
P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, {
.da_vertices_generated_enable = true,
.da_primitives_generated_enable = true,
.vs_invocations_enable = true,
.gs_invocations_enable = true,
.gs_primitives_generated_enable = true,
.streaming_primitives_succeeded_enable = true,
.streaming_primitives_needed_enable = true,
.clipper_invocations_enable = true,
.clipper_primitives_generated_enable = true,
.ps_invocations_enable = true,
.ti_invocations_enable = true,
.ts_invocations_enable = true,
.ts_primitives_generated_enable = true,
.total_streaming_primitives_needed_succeeded_enable = true,
.vtg_primitives_out_enable = true,
});
/* Replay the previous state from shadow RAM */
struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_REPLAY);
P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, ENABLE_FALSE);
P_IMMD(p, NV9097, SET_STATISTICS_COUNTER, {});
P_IMMD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE, MODE_USE_RENDER_ENABLE);
P_IMMD(p, NV9097, SET_MME_SHADOW_RAM_CONTROL, MODE_METHOD_TRACK_WITH_FILTER);
}
VKAPI_ATTR void VKAPI_CALL

View file

@ -37,6 +37,8 @@ static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = {
[NVK_MME_SET_CONSERVATIVE_RASTER_STATE] = nvk_mme_set_conservative_raster_state,
[NVK_MME_SET_VIEWPORT_MIN_MAX_Z] = nvk_mme_set_viewport_min_max_z,
[NVK_MME_SET_Z_CLAMP] = nvk_mme_set_z_clamp,
[NVK_MME_SET_STATISTICS_COUNTERS] = nvk_mme_set_statistics_counters,
[NVK_MME_COPY_QUERIES] = nvk_mme_copy_queries,
};
static const struct nvk_mme_test_case *mme_tests[NVK_MME_COUNT] = {
@ -45,6 +47,7 @@ static const struct nvk_mme_test_case *mme_tests[NVK_MME_COUNT] = {
[NVK_MME_SET_TESS_PARAMS] = nvk_mme_set_tess_params_tests,
[NVK_MME_SET_SHADING_RATE_CONTROL] = nvk_mme_set_shading_rate_control_tests,
[NVK_MME_SET_ANTI_ALIAS] = nvk_mme_set_anti_alias_tests,
[NVK_MME_SET_STATISTICS_COUNTERS] = nvk_mme_set_statistics_counters_tests,
};
uint32_t *

View file

@ -41,6 +41,8 @@ enum nvk_mme {
NVK_MME_SET_CONSERVATIVE_RASTER_STATE,
NVK_MME_SET_VIEWPORT_MIN_MAX_Z,
NVK_MME_SET_Z_CLAMP,
NVK_MME_SET_STATISTICS_COUNTERS,
NVK_MME_COPY_QUERIES,
NVK_MME_COUNT,
};
@ -68,6 +70,7 @@ enum nvk_mme_scratch {
NVK_MME_SCRATCH_WRITE_MASK_DYN,
NVK_MME_SCRATCH_WRITE_MASK_PIPELINE,
NVK_MME_SCRATCH_CONSERVATIVE_RASTER_STATE,
NVK_MME_SCRATCH_STATISTICS_COUNTER_STATE,
/* Copy of SET_WINDOW_CLIP_ENABLE */
NVK_MME_SCRATCH_WINDOW_CLIP_ENABLED, /* TODO: can we use shadow-ram? */
@ -249,6 +252,8 @@ void nvk_mme_set_write_mask(struct mme_builder *b);
void nvk_mme_set_conservative_raster_state(struct mme_builder *b);
void nvk_mme_set_viewport_min_max_z(struct mme_builder *b);
void nvk_mme_set_z_clamp(struct mme_builder *b);
void nvk_mme_set_statistics_counters(struct mme_builder *b);
void nvk_mme_copy_queries(struct mme_builder *b);
uint32_t nvk_mme_tess_params(mesa_shader_stage stage,
enum nak_ts_domain domain,
@ -278,6 +283,7 @@ extern const struct nvk_mme_test_case nvk_mme_bind_vb_tests[];
extern const struct nvk_mme_test_case nvk_mme_set_tess_params_tests[];
extern const struct nvk_mme_test_case nvk_mme_set_shading_rate_control_tests[];
extern const struct nvk_mme_test_case nvk_mme_set_anti_alias_tests[];
extern const struct nvk_mme_test_case nvk_mme_set_statistics_counters_tests[];
void nvk_test_all_mmes(const struct nv_device_info *devinfo);

View file

@ -1759,10 +1759,11 @@ nvk_GetPhysicalDeviceQueueFamilyProperties2(
vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) {
p->queueFamilyProperties.queueFlags = queue_family->queue_flags;
p->queueFamilyProperties.queueCount = queue_family->queue_count;
if (queue_family->queue_flags & VK_QUEUE_GRAPHICS_BIT) {
if (queue_family->queue_flags &
(VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT |
VK_QUEUE_TRANSFER_BIT)) {
p->queueFamilyProperties.timestampValidBits = 64;
} else {
/* TODO: Timestamps on non-graphics queues */
p->queueFamilyProperties.timestampValidBits = 0;
}
p->queueFamilyProperties.minImageTransferGranularity =

View file

@ -16,6 +16,7 @@
#include "vk_common_entrypoints.h"
#include "vk_meta.h"
#include "vk_pipeline.h"
#include "vk_synchronization.h"
#include "cl/nvk_query.h"
#include "compiler/nir/nir.h"
@ -26,9 +27,13 @@
#include "nv_push_cl906f.h"
#include "nv_push_cl9097.h"
#include "nv_push_cl90b5.h"
#include "nv_push_cl90c0.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clc597.h"
#include "nv_push_clc7c0.h"
#include "nv_push_clc86f.h"
#include "nv_push_clcb97.h"
static uint32_t
vk_query_pool_report_count(const struct vk_query_pool *vk_pool)
@ -50,6 +55,53 @@ vk_query_pool_report_count(const struct vk_query_pool *vk_pool)
}
}
/* Compute the NV9097 SET_STATISTICS_COUNTER enable mask for a query pool.
 *
 * Returns the packed register value (built via
 * V_NV9097_SET_STATISTICS_COUNTER) with one enable bit set per hardware
 * counter the pool's query type needs, or 0 when the type uses no
 * statistics counters.
 */
static uint32_t
vk_query_pool_statistics_counter_mask(const struct vk_query_pool *vk_pool)
{
uint32_t result = 0;
switch (vk_pool->query_type) {
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_TIMESTAMP:
/* Occlusion and timestamp queries don't use statistics counters */
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
/* Enable exactly the counters requested at pool creation time */
const VkQueryPipelineStatisticFlags stats = vk_pool->pipeline_statistics;
V_NV9097_SET_STATISTICS_COUNTER(result, {
.da_vertices_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT) != 0,
.da_primitives_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT) != 0,
.vs_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT) != 0,
.gs_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT) != 0,
.gs_primitives_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) != 0,
.clipper_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT) != 0,
.clipper_primitives_generated_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT) != 0,
.ps_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) != 0,
.ti_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT) != 0,
.ts_invocations_enable = (stats & VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT) != 0,
});
break;
}
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
V_NV9097_SET_STATISTICS_COUNTER(result, {
.vtg_primitives_out_enable = true,
});
break;
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
/* XFB queries report both primitives written and primitives needed */
V_NV9097_SET_STATISTICS_COUNTER(result, {
.streaming_primitives_succeeded_enable = true,
.streaming_primitives_needed_enable = true,
});
break;
default:
UNREACHABLE("Unsupported query type");
}
return result;
}
VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateQueryPool(VkDevice device,
const VkQueryPoolCreateInfo *pCreateInfo,
@ -66,22 +118,20 @@ nvk_CreateQueryPool(VkDevice device,
if (!pool)
return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
/* Use interleaved layouts on Tegra so we can safely handle non-coherent
* maps
/* Use a packed layout for timestamps. For other queries, interleaved
* layouts on Tegra so we can safely handle non-coherent maps
*/
if (pdev->info.type == NV_DEVICE_TYPE_SOC)
if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
pool->layout = NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED;
else if (pdev->info.type == NV_DEVICE_TYPE_SOC)
pool->layout = NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED;
else
pool->layout = NVK_QUERY_POOL_LAYOUT_SEPARATE;
uint32_t reports_per_query;
if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) {
/* Timestamps are just a single timestamp */
reports_per_query = 1;
} else {
/* Everything else is two queries because we have to compute a delta */
reports_per_query = 2 * vk_query_pool_report_count(&pool->vk);
}
pool->statistics_counter_mask = vk_query_pool_statistics_counter_mask(&pool->vk);
/* Everything is a single query per report */
uint32_t reports_per_query = vk_query_pool_report_count(&pool->vk);
uint64_t mem_size = 0;
switch (pool->layout) {
@ -101,6 +151,16 @@ nvk_CreateQueryPool(VkDevice device,
mem_size = pool->vk.query_count * (uint64_t)pool->query_stride;
break;
case NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED:
pool->reports_start = 0;
pool->query_stride = reports_per_query * sizeof(struct nvk_query_report);
if (pdev->info.type == NV_DEVICE_TYPE_SOC)
pool->query_stride = align(pool->query_stride, pdev->info.nc_atom_size_B);
mem_size = pool->vk.query_count * (uint64_t)pool->query_stride;
break;
default:
UNREACHABLE("Unsupported query layout");
}
@ -196,7 +256,7 @@ nvk_sync_queries_to_gpu(struct nvk_query_pool *pool,
if (pool->mem->flags & NVKMD_MEM_COHERENT)
return;
assert(pool->layout == NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED);
assert(pool->layout != NVK_QUERY_POOL_LAYOUT_SEPARATE);
nvkmd_mem_sync_map_to_gpu(pool->mem, first_query * pool->query_stride,
count * pool->query_stride);
}
@ -208,7 +268,7 @@ nvk_sync_queries_from_gpu(struct nvk_query_pool *pool,
if (pool->mem->flags & NVKMD_MEM_COHERENT)
return;
assert(pool->layout == NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED);
assert(pool->layout != NVK_QUERY_POOL_LAYOUT_SEPARATE);
nvkmd_mem_sync_map_from_gpu(pool->mem, first_query * pool->query_stride,
count * pool->query_stride);
}
@ -260,6 +320,10 @@ nvk_ResetQueryPool(VkDevice device,
assert(pool->mem->flags & NVKMD_MEM_COHERENT);
uint32_t *available = nvk_query_available_map(pool, firstQuery);
memset(available, 0, queryCount * sizeof(*available));
} else if (pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED) {
struct nvk_query_report *reports = nvk_query_report_map(pool, firstQuery);
memset(reports, 0, queryCount * pool->query_stride);
nvk_sync_queries_to_gpu(pool, firstQuery, queryCount);
} else {
for (uint32_t i = 0; i < queryCount; i++) {
uint32_t *available = nvk_query_available_map(pool, firstQuery + i);
@ -278,20 +342,39 @@ nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer,
VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
for (uint32_t i = 0; i < queryCount; i++) {
uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);
if (unlikely(!queryCount))
return;
struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
.operation = OPERATION_RELEASE,
.release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
.pipeline_location = PIPELINE_LOCATION_ALL,
.structure_size = STRUCTURE_SIZE_ONE_WORD,
});
const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
const struct nvk_physical_device *pdev = nvk_device_physical(dev);
if (queryCount > 1 && pool->layout != NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED) {
uint64_t clear_size;
if (pool->layout == NVK_QUERY_POOL_LAYOUT_SEPARATE)
clear_size = queryCount * sizeof(uint32_t);
else if (pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED)
clear_size = queryCount * pool->query_stride;
else
UNREACHABLE("Unsupported query type");
uint64_t addr = nvk_query_available_addr(pool, firstQuery);
nvk_cmd_fill_memory(cmd, addr, clear_size, 0);
} else {
for (uint32_t i = 0; i < queryCount; i++) {
uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);
struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
.operation = OPERATION_RELEASE,
.release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
.pipeline_location = PIPELINE_LOCATION_ALL,
.structure_size = STRUCTURE_SIZE_ONE_WORD,
});
}
}
/* Wait for the above writes to complete. This prevents WaW hazards on any
@ -299,19 +382,17 @@ nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer,
* will see the query as unavailable if it happens before the query is
* completed again.
*/
for (uint32_t i = 0; i < queryCount; i++) {
uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);
struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
__push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
P_NV906F_SEMAPHOREA(p, addr >> 32);
P_NV906F_SEMAPHOREB(p, (addr & UINT32_MAX) >> 2);
P_NV906F_SEMAPHOREC(p, 0);
P_NV906F_SEMAPHORED(p, {
.operation = OPERATION_ACQUIRE,
.acquire_switch = ACQUIRE_SWITCH_ENABLED,
.release_size = RELEASE_SIZE_4BYTE,
});
if (pdev->info.cls_eng3d >= HOPPER_A) {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
P_IMMD(p, NVC86F, WFI, 0);
P_MTHD(p, NVC86F, MEM_OP_A);
P_NVC86F_MEM_OP_A(p, {});
P_NVC86F_MEM_OP_B(p, 0);
P_NVC86F_MEM_OP_C(p, { .membar_type = 0 });
P_NVC86F_MEM_OP_D(p, { .operation = OPERATION_MEMBAR });
} else {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 1);
__push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);
}
}
@ -324,30 +405,62 @@ nvk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
assert(pool->layout == NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED);
uint64_t report_addr = nvk_query_report_addr(pool, query);
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
.operation = OPERATION_REPORT_ONLY,
.pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
.structure_size = STRUCTURE_SIZE_FOUR_WORDS,
});
uint8_t subc = nvk_cmd_buffer_last_subchannel(cmd);
if (subc == SUBC_NV9097) {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0);
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
.operation = OPERATION_RELEASE,
.release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
.pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
.structure_size = STRUCTURE_SIZE_FOUR_WORDS,
});
} else if (subc == SUBC_NV90C0) {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
uint64_t available_addr = nvk_query_available_addr(pool, query);
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
.operation = OPERATION_RELEASE,
.release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
.pipeline_location = PIPELINE_LOCATION_ALL,
.structure_size = STRUCTURE_SIZE_ONE_WORD,
});
/* Compute SET_REPORT_SEMAPHORE_D doesn't provide a pipeline location
* meaning that we need to handle first synchronization scope here.
*
* Considering that if we are on the compute subchannel, we only really
* need to wait on anything that runs on compute.
*/
if (vk_expand_src_stage_flags2(stage) &
(VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT |
VK_PIPELINE_STAGE_2_BLIT_BIT))
P_IMMD(p, NV90C0, WAIT_FOR_IDLE, 0);
P_MTHD(p, NV90C0, SET_REPORT_SEMAPHORE_A);
P_NV90C0_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
P_NV90C0_SET_REPORT_SEMAPHORE_B(p, report_addr);
P_NV90C0_SET_REPORT_SEMAPHORE_C(p, 1);
P_NV90C0_SET_REPORT_SEMAPHORE_D(p, {
.operation = OPERATION_RELEASE,
.structure_size = STRUCTURE_SIZE_FOUR_WORDS,
});
} else {
assert(subc == SUBC_NV90B5);
struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
P_MTHD(p, NV90B5, SET_SEMAPHORE_A);
P_NV90B5_SET_SEMAPHORE_A(p, report_addr >> 32);
P_NV90B5_SET_SEMAPHORE_B(p, report_addr);
P_NV90B5_SET_SEMAPHORE_PAYLOAD(p, 1);
P_IMMD(p, NV90B5, LAUNCH_DMA, {
.data_transfer_type = DATA_TRANSFER_TYPE_NONE,
.semaphore_type = SEMAPHORE_TYPE_RELEASE_FOUR_WORD_SEMAPHORE,
.flush_enable = FLUSH_ENABLE_TRUE,
/* Note: FLUSH_TYPE=SYS implicitly for NVC3B5+ */
});
}
/* From the Vulkan spec:
*
@ -377,49 +490,60 @@ struct nvk_3d_stat_query {
VkQueryPipelineStatisticFlagBits flag;
uint8_t loc;
uint8_t report;
uint8_t clear_type;
};
/* This must remain sorted in flag order */
static const struct nvk_3d_stat_query nvk_3d_stat_queries[] = {{
.flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED,
.flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_DA_VERTICES_GENERATED,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED,
.flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_DA_PRIMITIVES_GENERATED,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS,
.flag = VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_VS_INVOCATIONS,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS,
.flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_GS_INVOCATIONS,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED,
.flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_GS_PRIMITIVES_GENERATED,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS,
.flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_CLIPPER_INVOCATIONS,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED,
.flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_CLIPPER_PRIMITIVES_GENERATED,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS,
.flag = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_PS_INVOCATIONS,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS,
.flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_TI_INVOCATIONS,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS,
.flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT,
.loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER,
.report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS,
.clear_type = NV9097_CLEAR_REPORT_VALUE_TYPE_TS_INVOCATIONS,
}, {
.flag = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT,
.loc = UINT8_MAX,
@ -453,23 +577,143 @@ nvk_mme_write_cs_invocations(struct mme_builder *b)
}
static void
nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
struct nvk_query_pool *pool,
uint32_t query, uint32_t index,
bool end)
nvk_cmd_clear_report_value(struct nvk_cmd_buffer *cmd,
struct nvk_query_pool *pool)
{
const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
const struct nvk_physical_device *pdev = nvk_device_physical(dev);
uint64_t report_addr = nvk_query_report_addr(pool, query) +
end * sizeof(struct nvk_query_report);
uint32_t end_size = 7 * end;
struct nv_push *p;
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
p = nvk_cmd_buffer_push(cmd, 5 + end_size);
case VK_QUERY_TYPE_OCCLUSION: {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_ZPASS_PIXEL_CNT);
break;
}
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
struct nv_push *p = nvk_cmd_buffer_push(cmd, stat_count * 2);
ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics;
for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) {
const struct nvk_3d_stat_query *sq = &nvk_3d_stat_queries[i];
if (!(stats_left & sq->flag))
continue;
/* The 3D stat queries array MUST be sorted */
assert(!(stats_left & (sq->flag - 1)));
if (sq->flag == VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT) {
if (pdev->info.cls_compute >= AMPERE_COMPUTE_B) {
P_IMMD_WORD(p, NVC7C0, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI), 0);
P_IMMD_WORD(p, NVC7C0, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO), 0);
}
else {
P_IMMD_WORD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI), 0);
P_IMMD_WORD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO), 0);
}
} else {
P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, sq->clear_type);
}
stats_left &= ~sq->flag;
}
break;
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_STREAMING_PRIMITIVES_SUCCEEDED);
P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_STREAMING_PRIMITIVES_NEEDED);
break;
}
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
P_IMMD(p, NV9097, CLEAR_REPORT_VALUE, TYPE_VTG_PRIMITIVES_OUT);
break;
}
default:
UNREACHABLE("Unsupported query type");
}
}
/* Enable or disable the hardware counters backing a query pool's type.
 *
 * For occlusion queries this toggles SET_ZPASS_PIXEL_COUNT directly.
 * For the statistics-counter-based query types it invokes the
 * NVK_MME_SET_STATISTICS_COUNTERS macro with the enable flag and the
 * pool's precomputed counter mask, skipping the call entirely when the
 * mask is empty (nothing to toggle).
 */
static void
nvk_cmd_set_statistics_counters(struct nvk_cmd_buffer *cmd,
struct nvk_query_pool *pool, bool enable)
{
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION: {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, enable);
break;
}
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
if (pool->statistics_counter_mask != 0) {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
/* The MME macro reads two inline parameters: enable, then mask */
P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_STATISTICS_COUNTERS));
P_INLINE_DATA(p, enable);
P_INLINE_DATA(p, pool->statistics_counter_mask);
}
break;
}
default:
UNREACHABLE("Unsupported query type");
}
}
/* vkCmdBeginQueryIndexedEXT: begin a query by zeroing the relevant
 * hardware report values and turning the needed counters on. No
 * per-query report is written here; the `query`, `flags` and `index`
 * parameters are consumed when the query is ended.
 */
VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
uint32_t query,
VkQueryControlFlags flags,
uint32_t index)
{
VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
/* From the Vulkan 1.4.350 spec, vkCmdBeginQuery:
 *
 * VUID-vkCmdBeginQuery-queryPool-01922
 *
 * "queryPool must have been created with a queryType that differs from
 * that of any queries that are active within commandBuffer"
 *
 * and
 *
 * "After beginning a query, that query is considered active within the
 * command buffer it was called in until that same query is ended.
 * Queries active in a primary command buffer when secondary command
 * buffers are executed are considered active for those secondary command
 * buffers."
 *
 * This means we will never have two queries with the same type active and
 * can rely on cleaning and toggling counters.
 */
nvk_cmd_clear_report_value(cmd, pool);
nvk_cmd_set_statistics_counters(cmd, pool, true);
}
VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
uint32_t query,
uint32_t index)
{
VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
const struct nvk_physical_device *pdev = nvk_device_physical(dev);
uint64_t report_addr = nvk_query_report_addr(pool, query);
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION: {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
@ -483,10 +727,11 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
.flush_disable = true,
});
break;
}
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
p = nvk_cmd_buffer_push(cmd, stat_count * 5 + end_size);
struct nv_push *p = nvk_cmd_buffer_push(cmd, stat_count * 5);
ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics;
for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) {
@ -518,7 +763,7 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
});
}
report_addr += 2 * sizeof(struct nvk_query_report);
report_addr += sizeof(struct nvk_query_report);
stats_left &= ~sq->flag;
}
break;
@ -529,7 +774,7 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_SUCCEEDED,
NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_NEEDED,
};
p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports) + end_size);
struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports));
for (uint32_t i = 0; i < ARRAY_SIZE(xfb_reports); ++i) {
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
@ -543,13 +788,13 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
.sub_report = index,
.flush_disable = true,
});
report_addr += 2 * sizeof(struct nvk_query_report);
report_addr += sizeof(struct nvk_query_report);
}
break;
}
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
p = nvk_cmd_buffer_push(cmd, 5 + end_size);
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
@ -564,51 +809,30 @@ nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
.flush_disable = true,
});
break;
}
default:
UNREACHABLE("Unsupported query type");
}
if (end) {
P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0);
uint64_t available_addr = nvk_query_available_addr(pool, query);
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
.operation = OPERATION_RELEASE,
.release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
.pipeline_location = PIPELINE_LOCATION_ALL,
.structure_size = STRUCTURE_SIZE_ONE_WORD,
});
}
}
struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0);
VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                            VkQueryPool queryPool,
                            uint32_t query,
                            VkQueryControlFlags flags,
                            uint32_t index)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   /* Enable any pipeline statistics counters this pool needs before the
    * begin report is written.  The NVK_MME_SET_STATISTICS_COUNTERS macro
    * clears the pool's counter mask bits when the enable parameter is 0, so
    * passing false here would turn the counters off for the duration of the
    * query and every statistic would read back as zero.
    */
   nvk_cmd_set_statistics_counters(cmd, pool, true);

   nvk_cmd_begin_end_query(cmd, pool, query, index, false);
}
VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
uint32_t query,
uint32_t index)
{
VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
nvk_cmd_begin_end_query(cmd, pool, query, index, true);
uint64_t available_addr = nvk_query_available_addr(pool, query);
p = nvk_cmd_buffer_push(cmd, 5);
P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
.operation = OPERATION_RELEASE,
.release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
.pipeline_location = PIPELINE_LOCATION_ALL,
.structure_size = STRUCTURE_SIZE_ONE_WORD,
});
/* From the Vulkan spec:
*
@ -676,14 +900,6 @@ cpu_write_query_result(void *dst, uint32_t idx,
}
}
/* Write report pair idx's delta (end value minus begin value) to dst */
static void
cpu_get_query_delta(void *dst, const struct nvk_query_report *src,
                    uint32_t idx, VkQueryResultFlags flags)
{
   const struct nvk_query_report *pair = &src[idx * 2];
   cpu_write_query_result(dst, idx, flags, pair[1].value - pair[0].value);
}
VKAPI_ATTR VkResult VKAPI_CALL
nvk_GetQueryPoolResults(VkDevice device,
VkQueryPool queryPool,
@ -732,10 +948,10 @@ nvk_GetQueryPoolResults(VkDevice device,
if (write_results)
cpu_write_query_result(dst, 0, flags, src->timestamp);
} else {
/* For everything else, we have to compute deltas */
/* For everything else, we can just write it */
if (write_results) {
for (uint32_t j = 0; j < report_count; j++)
cpu_get_query_delta(dst, src, j, flags);
cpu_write_query_result(dst, j, flags, src[j].value);
}
}
@ -862,13 +1078,16 @@ nvk_meta_copy_query_pool_results(struct nvk_cmd_buffer *cmd,
return;
}
uint64_t reports_start = pool->reports_start;
if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
flags |= NVK_QUERY_IS_TIMESTAMP;
reports_start += offsetof(struct nvk_query_report, timestamp);
else
reports_start += offsetof(struct nvk_query_report, value);
const struct nvk_copy_query_push push = {
.pool_addr = pool->mem->va->addr,
.available_stride = nvk_query_available_stride_B(pool),
.reports_start = pool->reports_start,
.reports_start = reports_start,
.report_count = vk_query_pool_report_count(&pool->vk),
.query_stride = pool->query_stride,
.first_query = first_query,
@ -894,13 +1113,18 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer);
const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
const struct nvk_physical_device *pdev = nvk_device_physical(dev);
if (unlikely(!queryCount))
return;
if (flags & VK_QUERY_RESULT_WAIT_BIT) {
for (uint32_t i = 0; i < queryCount; i++) {
uint64_t avail_addr = nvk_query_available_addr(pool, firstQuery + i);
struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
__push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
__push_mthd(p, nvk_cmd_buffer_last_subchannel(cmd), NV906F_SEMAPHOREA);
P_NV906F_SEMAPHOREA(p, avail_addr >> 32);
P_NV906F_SEMAPHOREB(p, (avail_addr & UINT32_MAX) >> 2);
P_NV906F_SEMAPHOREC(p, 1);
@ -912,8 +1136,206 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
}
}
uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset);
nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount,
dst_addr, stride, flags);
const uint64_t dst_addr = vk_buffer_address(&dst_buffer->vk, dstOffset);
/* Allow to use MME for copy only if we have a small amount of queries on
* Turing+. We also ensure it doesn't cause a switch to 3D subchannel on
* Turing as it's missing MME on compute.
*/
const bool should_use_mme_copy =
queryCount <= 5 && pdev->info.cls_eng3d >= TURING_A &&
(nvk_cmd_buffer_last_subchannel(cmd) != SUBC_NV90C0 ||
pdev->info.cls_compute >= AMPERE_COMPUTE_B);
if (!should_use_mme_copy) {
nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount,
dst_addr, stride, flags);
} else {
uint64_t report_addr = nvk_query_report_addr(pool, firstQuery);
const uint64_t available_addr = nvk_query_available_addr(pool, firstQuery);
if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
report_addr += offsetof(struct nvk_query_report, timestamp);
else
report_addr += offsetof(struct nvk_query_report, value);
struct nv_push *p = nvk_cmd_buffer_push(cmd, 14);
if (nvk_cmd_buffer_last_subchannel(cmd) == SUBC_NV90C0 &&
pdev->info.cls_compute >= AMPERE_COMPUTE_B)
P_1INC(p, NVC7C0, CALL_MME_MACRO(NVK_MME_COPY_QUERIES));
else
P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_COPY_QUERIES));
P_INLINE_DATA(p, report_addr >> 32);
P_INLINE_DATA(p, report_addr);
P_INLINE_DATA(p, available_addr >> 32);
P_INLINE_DATA(p, available_addr);
P_INLINE_DATA(p, nvk_query_available_stride_B(pool));
P_INLINE_DATA(p, vk_query_pool_report_count(&pool->vk));
P_INLINE_DATA(p, pool->query_stride);
P_INLINE_DATA(p, queryCount);
P_INLINE_DATA(p, dst_addr >> 32);
P_INLINE_DATA(p, dst_addr);
P_INLINE_DATA(p, stride >> 32);
P_INLINE_DATA(p, stride);
P_INLINE_DATA(p, flags);
}
}
void
nvk_mme_set_statistics_counters(struct mme_builder *b)
{
struct mme_value enable = mme_load(b);
struct mme_value mask = mme_load(b);
struct mme_value state = nvk_mme_load_scratch(b, STATISTICS_COUNTER_STATE);
mme_if(b, ieq, enable, mme_imm(0)) {
mme_and_not_to(b, state, state, mask);
}
mme_if(b, ine, enable, mme_imm(0)) {
mme_or_to(b, state, state, mask);
}
nvk_mme_store_scratch(b, STATISTICS_COUNTER_STATE, state);
mme_mthd(b, NV9097_SET_STATISTICS_COUNTER);
mme_emit(b, state);
}
const struct nvk_mme_test_case nvk_mme_set_statistics_counters_tests[] = {{
   /* This case doesn't change the state so it should do nothing */
   .init =
      (struct nvk_mme_mthd_data[]){
         {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0},
         {NV9097_SET_STATISTICS_COUNTER, 0},
         {}},
   .params = (uint32_t[]){1, 0},
   .expected =
      (struct nvk_mme_mthd_data[]){
         {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0},
         {NV9097_SET_STATISTICS_COUNTER, 0},
         {}},
}, {
   /* Enable: 0x200 is ORed into the existing 0x100 state */
   .init =
      (struct nvk_mme_mthd_data[]){
         {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x100},
         {NV9097_SET_STATISTICS_COUNTER, 0x100},
         {}},
   .params = (uint32_t[]){1, 0x200},
   .expected =
      (struct nvk_mme_mthd_data[]){
         {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x300},
         {NV9097_SET_STATISTICS_COUNTER, 0x300},
         {}},
}, {
   /* Disable: 0x200 is cleared from the 0x300 state, leaving 0x100 */
   .init =
      (struct nvk_mme_mthd_data[]){
         {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x300},
         {NV9097_SET_STATISTICS_COUNTER, 0x300},
         {}},
   .params = (uint32_t[]){0, 0x200},
   .expected =
      (struct nvk_mme_mthd_data[]){
         {NVK_SET_MME_SCRATCH(STATISTICS_COUNTER_STATE), 0x100},
         {NV9097_SET_STATISTICS_COUNTER, 0x100},
         {}},
}, {}};
/* Writes a single query result to the destination buffer, honoring the
 * VK_QUERY_RESULT_64_BIT flag.
 *
 * This helper is quite convoluted because we only have 4 registers to work
 * with when writing a report result, so addresses are computed and freed
 * inside each branch rather than hoisted.
 */
static void
nvk_mme_write_query(struct mme_builder *b,
                    struct mme_value64 dst_addr,
                    struct mme_value idx,
                    struct mme_value flags,
                    struct mme_value64 result)
{
   struct mme_value result_64_bit = mme_and(b, flags, mme_imm(VK_QUERY_RESULT_64_BIT));
   mme_if(b, ine, result_64_bit, mme_zero()) {
      /* 64-bit results: element stride is 8 bytes (idx << 3) */
      struct mme_value report_offset = mme_sll(b, idx, mme_imm(3));
      struct mme_value64 report_addr =
         mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero()));
      mme_free_reg(b, report_offset);

      /* Write the low dword, then the high dword 4 bytes later */
      mme_store_global(b, report_addr, result.lo);
      mme_add64_to(b, report_addr, report_addr, mme_imm64(4));
      mme_store_global(b, report_addr, result.hi);
      mme_free_reg64(b, report_addr);
   }
   mme_if(b, ieq, result_64_bit, mme_zero()) {
      /* 32-bit results: element stride is 4 bytes (idx << 2); the high
       * dword is dropped.
       */
      struct mme_value report_offset = mme_sll(b, idx, mme_imm(2));
      struct mme_value64 report_addr =
         mme_add64(b, dst_addr, mme_value64(report_offset, mme_zero()));
      mme_free_reg(b, report_offset);

      mme_store_global(b, report_addr, result.lo);
      mme_free_reg64(b, report_addr);
   }
   mme_free_reg(b, result_64_bit);
}
void
nvk_mme_copy_queries(struct mme_builder *b)
{
   /* This macro relies on MME FIFO reads which only exist on Turing+ */
   if (b->devinfo->cls_eng3d < TURING_A)
      return;

   /* Parameters, in the order nvk_CmdCopyQueryPoolResults() pushes them */
   struct mme_value64 report_addr = mme_load_addr64(b);
   struct mme_value64 available_addr = mme_load_addr64(b);
   struct mme_value available_stride = mme_load(b);
   struct mme_value report_count = mme_load(b);
   struct mme_value query_stride = mme_load(b);
   struct mme_value query_count = mme_load(b);
   struct mme_value64 dst_addr = mme_load_addr64(b);
   struct mme_value64 dst_stride = mme_load_addr64(b);
   struct mme_value flags = mme_load(b);

   /* Now handle queries */
   mme_while(b, ine, query_count, mme_zero()) {
      /* We load available and determine if a result need to be written.
       * Results are written when the query is available or when the caller
       * set VK_QUERY_RESULT_PARTIAL_BIT, which requires an intermediate
       * value to be written even for unavailable queries.  This matches the
       * CPU path in nvk_GetQueryPoolResults() and the CL copy shader.
       */
      mme_tu104_read_fifoed(b, available_addr, mme_imm(1));
      struct mme_value available = mme_load(b);
      struct mme_value write_results =
         mme_and(b, flags, mme_imm(VK_QUERY_RESULT_PARTIAL_BIT));
      mme_or_to(b, write_results, write_results, available);

      mme_if(b, ine, write_results, mme_zero()) {
         struct mme_value r = mme_mov(b, mme_zero());
         mme_while(b, ine, r, report_count) {
            /* Setup MME fifo read, we only have 7 registers to work with so
             * we aggressively free registers */
            STATIC_ASSERT(sizeof(struct nvk_query_report) % 2 == 0);
            struct mme_value current_report_offs = mme_sll(
               b, r, mme_imm(util_logbase2(sizeof(struct nvk_query_report))));
            struct mme_value64 current_report_addr = mme_add64(
               b, report_addr, mme_value64(current_report_offs, mme_zero()));

            mme_tu104_read_fifoed(b, current_report_addr, mme_imm(2));

            mme_free_reg(b, current_report_offs);
            mme_free_reg64(b, current_report_addr);

            struct mme_value64 report = mme_load_value64(b);
            nvk_mme_write_query(b, dst_addr, r, flags, report);
            mme_free_reg64(b, report);

            mme_add_to(b, r, r, mme_imm(1));
         }
      }
      mme_free_reg(b, write_results);

      /* Finally write available if needed; it lands after the report
       * values, at index report_count.
       */
      struct mme_value with_availability =
         mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
      mme_if(b, ine, with_availability, mme_zero()) {
         nvk_mme_write_query(b, dst_addr, report_count, flags,
                             mme_value64(available, mme_zero()));
      }
      mme_free_reg(b, with_availability);
      mme_free_reg(b, available);

      /* Advance to the next query */
      mme_sub_to(b, query_count, query_count, mme_imm(1));
      mme_add64_to(b, report_addr, report_addr, mme_value64(query_stride, mme_zero()));
      mme_add64_to(b, available_addr, available_addr, mme_value64(available_stride, mme_zero()));
      mme_add64_to(b, dst_addr, dst_addr, dst_stride);
   }
}

View file

@ -28,6 +28,12 @@ enum nvk_query_pool_layout {
* byte 16.
*/
NVK_QUERY_POOL_LAYOUT_ALIGNED_INTERLEAVED,
   /* Stores the available bit and the timestamp together in nvk_query_report
    *
    * This allows writing a timestamp with only one command.
    */
NVK_QUERY_POOL_LAYOUT_TIMESTAMP_PACKED,
};
struct nvk_query_pool {
@ -37,6 +43,7 @@ struct nvk_query_pool {
uint32_t reports_start;
uint32_t query_stride;
uint32_t statistics_counter_mask;
struct nvkmd_mem *mem;
};