From de00fe3f6629d77e380cd730684d50d09785028d Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Wed, 4 Dec 2024 16:08:55 +0200 Subject: [PATCH] anv: add BVH building tracking through u_trace Signed-off-by: Lionel Landwerlin Reviewed-by: Kevin Chuang Reviewed-by: Sagar Ghuge Part-of: --- src/intel/ds/intel_driver_ds.cc | 7 ++ src/intel/ds/intel_tracepoints.py | 7 ++ src/intel/vulkan/anv_private.h | 3 + .../vulkan/genX_acceleration_structure.c | 90 ++++++++++++++++++- src/intel/vulkan/genX_cmd_compute.c | 28 ++++-- 5 files changed, 123 insertions(+), 12 deletions(-) diff --git a/src/intel/ds/intel_driver_ds.cc b/src/intel/ds/intel_driver_ds.cc index cb4ff78afba..7c017fc5af6 100644 --- a/src/intel/ds/intel_driver_ds.cc +++ b/src/intel/ds/intel_driver_ds.cc @@ -438,6 +438,13 @@ CREATE_DUAL_EVENT_CALLBACK(query_copy_shader, INTEL_DS_QUEUE_STAGE_INTERNAL_OPS) CREATE_DUAL_EVENT_CALLBACK(write_buffer_marker, INTEL_DS_QUEUE_STAGE_CMD_BUFFER) CREATE_DUAL_EVENT_CALLBACK(rays, INTEL_DS_QUEUE_STAGE_RT) CREATE_DUAL_EVENT_CALLBACK(as_build, INTEL_DS_QUEUE_STAGE_AS) +CREATE_DUAL_EVENT_CALLBACK(as_build_leaves, INTEL_DS_QUEUE_STAGE_AS) +CREATE_DUAL_EVENT_CALLBACK(as_morton_generate, INTEL_DS_QUEUE_STAGE_AS) +CREATE_DUAL_EVENT_CALLBACK(as_morton_sort, INTEL_DS_QUEUE_STAGE_AS) +CREATE_DUAL_EVENT_CALLBACK(as_lbvh_build_internal, INTEL_DS_QUEUE_STAGE_AS) +CREATE_DUAL_EVENT_CALLBACK(as_ploc_build_internal, INTEL_DS_QUEUE_STAGE_AS) +CREATE_DUAL_EVENT_CALLBACK(as_encode, INTEL_DS_QUEUE_STAGE_AS) +CREATE_DUAL_EVENT_CALLBACK(as_copy, INTEL_DS_QUEUE_STAGE_AS) void intel_ds_begin_cmd_buffer_annotation(struct intel_ds_device *device, diff --git a/src/intel/ds/intel_tracepoints.py b/src/intel/ds/intel_tracepoints.py index 902aeeb7743..029d189997b 100644 --- a/src/intel/ds/intel_tracepoints.py +++ b/src/intel/ds/intel_tracepoints.py @@ -199,6 +199,13 @@ def define_tracepoints(args): need_cs_param=True) begin_end_tp('as_build') + begin_end_tp('as_build_leaves', compute=True) + 
begin_end_tp('as_morton_generate', compute=True) + begin_end_tp('as_morton_sort', compute=True) + begin_end_tp('as_lbvh_build_internal', compute=True) + begin_end_tp('as_ploc_build_internal', compute=True) + begin_end_tp('as_encode', compute=True) + begin_end_tp('as_copy', compute=True) begin_end_tp('rays', tp_args=[Arg(type='uint32_t', var='group_x', c_format='%u'), diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 5142443a186..287d3983b96 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -4028,6 +4028,9 @@ struct anv_cmd_ray_tracing_state { struct brw_rt_scratch_layout layout; } scratch; + uint32_t debug_marker_count; + enum vk_acceleration_structure_build_step debug_markers[5]; + struct anv_address build_priv_mem_addr; size_t build_priv_mem_size; }; diff --git a/src/intel/vulkan/genX_acceleration_structure.c b/src/intel/vulkan/genX_acceleration_structure.c index f153fad5e0a..fd504e3ce1a 100644 --- a/src/intel/vulkan/genX_acceleration_structure.c +++ b/src/intel/vulkan/genX_acceleration_structure.c @@ -40,6 +40,77 @@ static uint32_t tlas_id = 0; static struct bvh_dump_struct *bvhDumpArray = NULL; static uint32_t bvh_dump_array_size = 0; +static void +begin_debug_marker(VkCommandBuffer commandBuffer, + enum vk_acceleration_structure_build_step step, + const char *format, ...) 
+{ /* Pushes `step` onto the command buffer's state.rt.debug_markers array (asserting debug_marker_count is still below the array size) and emits the matching trace_intel_begin_as_* tracepoint; STEP_TOP maps to as_build. Any other step value reaches unreachable(). `format` and the variadic arguments are not read here. */ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + assert(cmd_buffer->state.rt.debug_marker_count < + ARRAY_SIZE(cmd_buffer->state.rt.debug_markers)); + cmd_buffer->state.rt.debug_markers[cmd_buffer->state.rt.debug_marker_count++] = + step; + switch (step) { + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_TOP: + trace_intel_begin_as_build(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_BUILD_LEAVES: + trace_intel_begin_as_build_leaves(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_GENERATE: + trace_intel_begin_as_morton_generate(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_SORT: + trace_intel_begin_as_morton_sort(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_LBVH_BUILD_INTERNAL: + trace_intel_begin_as_lbvh_build_internal(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_PLOC_BUILD_INTERNAL: + trace_intel_begin_as_ploc_build_internal(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_ENCODE: + trace_intel_begin_as_encode(&cmd_buffer->trace); + break; + default: + unreachable("Invalid build step"); + } +} + +static void +end_debug_marker(VkCommandBuffer commandBuffer) +{ /* Pops the most recently pushed step by decrementing debug_marker_count and emits the matching trace_intel_end_as_* tracepoint. NOTE(review): unlike begin_debug_marker there is no assert here; if this runs with debug_marker_count == 0 the uint32_t counter wraps and the array read is out of bounds -- consider asserting count > 0 before the decrement. */ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->state.rt.debug_marker_count--; + switch (cmd_buffer->state.rt.debug_markers[cmd_buffer->state.rt.debug_marker_count]) { + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_TOP: + trace_intel_end_as_build(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_BUILD_LEAVES: + trace_intel_end_as_build_leaves(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_GENERATE: + trace_intel_end_as_morton_generate(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_SORT: + trace_intel_end_as_morton_sort(&cmd_buffer->trace); + break; + case 
VK_ACCELERATION_STRUCTURE_BUILD_STEP_LBVH_BUILD_INTERNAL: + trace_intel_end_as_lbvh_build_internal(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_PLOC_BUILD_INTERNAL: + trace_intel_end_as_ploc_build_internal(&cmd_buffer->trace); + break; + case VK_ACCELERATION_STRUCTURE_BUILD_STEP_ENCODE: + trace_intel_end_as_encode(&cmd_buffer->trace); + break; + default: + unreachable("Invalid build step"); + } +} + /* clear out everything from (header + bvh_offset) to the end */ static void clear_out_anv_bvh(struct anv_cmd_buffer *cmd_buffer, @@ -533,6 +604,8 @@ anv_init_header(VkCommandBuffer commandBuffer, } static const struct vk_acceleration_structure_build_ops anv_build_ops = { + .begin_debug_marker = begin_debug_marker, + .end_debug_marker = end_debug_marker, .get_as_size = anv_get_as_size, .get_encode_key = { anv_get_encode_key, anv_get_header_key }, .encode_bind_pipeline = { anv_encode_bind_pipeline, @@ -582,6 +655,7 @@ anv_device_init_accel_struct_build_state(struct anv_device *device) device->accel_struct_build.build_args = (struct vk_acceleration_structure_build_args) { + .emit_markers = u_trace_enabled(&device->ds.trace_context), .subgroup_size = device->info->ver >= 20 ? 
16 : 8, .radix_sort = device->accel_struct_build.radix_sort, /* See struct anv_accel_struct_header from anv_bvh.h @@ -640,7 +714,6 @@ genX(CmdBuildAccelerationStructuresKHR)( const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - trace_intel_begin_as_build(&cmd_buffer->trace); struct anv_device *device = cmd_buffer->device; @@ -662,7 +735,6 @@ genX(CmdBuildAccelerationStructuresKHR)( &device->accel_struct_build.build_args); anv_cmd_buffer_restore_state(cmd_buffer, &saved); - trace_intel_end_as_build(&cmd_buffer->trace); } void @@ -686,6 +758,8 @@ genX(CmdCopyAccelerationStructureKHR)( VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); + trace_intel_begin_as_copy(&cmd_buffer->trace); + VkPipeline pipeline; VkPipelineLayout layout; VkResult result = get_pipeline_spv(cmd_buffer->device, "copy", copy_spv, @@ -737,6 +811,8 @@ genX(CmdCopyAccelerationStructureKHR)( copy_dispatch_size)); anv_cmd_buffer_restore_state(cmd_buffer, &saved); + + trace_intel_end_as_copy(&cmd_buffer->trace); } void @@ -746,8 +822,10 @@ genX(CmdCopyAccelerationStructureToMemoryKHR)( { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); - struct anv_device *device = cmd_buffer->device; + + trace_intel_begin_as_copy(&cmd_buffer->trace); + VkPipeline pipeline; VkPipelineLayout layout; VkResult result = get_pipeline_spv(device, "copy", copy_spv, @@ -804,6 +882,8 @@ genX(CmdCopyAccelerationStructureToMemoryKHR)( copy_dispatch_size)); anv_cmd_buffer_restore_state(cmd_buffer, &saved); + + trace_intel_end_as_copy(&cmd_buffer->trace); } void @@ -814,6 +894,8 @@ genX(CmdCopyMemoryToAccelerationStructureKHR)( ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); + trace_intel_begin_as_copy(&cmd_buffer->trace); + VkPipeline 
pipeline; VkPipelineLayout layout; VkResult result = get_pipeline_spv(cmd_buffer->device, "copy", copy_spv, @@ -853,6 +935,8 @@ genX(CmdCopyMemoryToAccelerationStructureKHR)( vk_common_CmdDispatch(commandBuffer, 512, 1, 1); anv_cmd_buffer_restore_state(cmd_buffer, &saved); + + trace_intel_end_as_copy(&cmd_buffer->trace); } /* TODO: Host commands */ diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c index 401b8f4c539..cc1aea5e80e 100644 --- a/src/intel/vulkan/genX_cmd_compute.c +++ b/src/intel/vulkan/genX_cmd_compute.c @@ -598,7 +598,8 @@ void genX(CmdDispatchBase)( prog_data->local_size[0] * prog_data->local_size[1] * prog_data->local_size[2]); - trace_intel_begin_compute(&cmd_buffer->trace); + if (cmd_buffer->state.rt.debug_marker_count == 0) + trace_intel_begin_compute(&cmd_buffer->trace); genX(cmd_buffer_flush_compute_state)(cmd_buffer); @@ -610,8 +611,10 @@ void genX(CmdDispatchBase)( groupCountX, groupCountY, groupCountZ, false); - trace_intel_end_compute(&cmd_buffer->trace, - groupCountX, groupCountY, groupCountZ); + if (cmd_buffer->state.rt.debug_marker_count == 0) { + trace_intel_end_compute(&cmd_buffer->trace, + groupCountX, groupCountY, groupCountZ); + } } static void @@ -654,7 +657,8 @@ emit_unaligned_cs_walker( prog_data->local_size[0] * prog_data->local_size[1] * prog_data->local_size[2]); - trace_intel_begin_compute(&cmd_buffer->trace); + if (cmd_buffer->state.rt.debug_marker_count == 0) + trace_intel_begin_compute(&cmd_buffer->trace); assert(!prog_data->uses_num_work_groups); genX(cmd_buffer_flush_compute_state)(cmd_buffer); @@ -667,8 +671,10 @@ emit_unaligned_cs_walker( dispatch, groupCountX, groupCountY, groupCountZ); #endif - trace_intel_end_compute(&cmd_buffer->trace, - groupCountX, groupCountY, groupCountZ); + if (cmd_buffer->state.rt.debug_marker_count == 0) { + trace_intel_end_compute(&cmd_buffer->trace, + groupCountX, groupCountY, groupCountZ); + } } /* @@ -758,7 +764,9 @@ 
genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer, INTEL_SNAPSHOT_COMPUTE, "compute indirect", 0); - trace_intel_begin_compute_indirect(&cmd_buffer->trace); + + if (cmd_buffer->state.rt.debug_marker_count == 0) + trace_intel_begin_compute_indirect(&cmd_buffer->trace); genX(cmd_buffer_flush_compute_state)(cmd_buffer); @@ -768,8 +776,10 @@ genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer, emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch, indirect_addr, 0, 0, 0, is_unaligned_size_x); - trace_intel_end_compute_indirect(&cmd_buffer->trace, - anv_address_utrace(indirect_addr)); + if (cmd_buffer->state.rt.debug_marker_count == 0) { + trace_intel_end_compute_indirect(&cmd_buffer->trace, + anv_address_utrace(indirect_addr)); + } } void genX(CmdDispatchIndirect)(