diff --git a/src/intel/vulkan/bvh/copy.comp b/src/intel/vulkan/bvh/copy.comp
new file mode 100644
index 00000000000..070eee3307e
--- /dev/null
+++ b/src/intel/vulkan/bvh/copy.comp
@@ -0,0 +1,147 @@
+/* Copyright © 2022 Bas Nieuwenhuizen
+ * Copyright © 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+
+layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
+
+#include "anv_build_interface.h"
+
+layout(push_constant) uniform CONSTS {
+   copy_args args;
+};
+
+// Layout of serialized data
+/**************************************|
+| vk_accel_struct_serialization_header |
+|--------------------------------------|
+| For a TLAS, all handles to the BLAS  |
+| within this TLAS.                    |
+| For a BLAS, nothing.                 |
+|--------------------------------------|
+| Driver-specific part.                |
+| For Intel, this starts with          |
+| anv_accel_struct_header as drawn     |
+| in anv_bvh.h                         |
+|**************************************/
+
+/*
+ * Explanation of BLAS handles:
+ * According to the spec of vkCmdCopyAccelerationStructureToMemoryKHR,
+ * for a TLAS, the handles of all BLAS/instances within this TLAS are
+ * tightly stored after vk_accel_struct_serialization_header, making this
+ * serialized memory a semi-opaque object. The application might be able
+ * to swap/replace these handles with other handles. In fact this is what
+ * dEQP-VK.ray_tracing_pipeline.acceleration_structures.header_bottom_address.*
+ * is doing.
+ *
+ * Therefore, if the application updates the handles, we need to replace
+ * the old handles in anv_instance_leaf with the new ones. To access
+ * anv_instance_leaf without traversing the TLAS, pointers to these
+ * anv_instance_leaf are stored right after anv_accel_struct_header,
+ * allowing us to know where they are in the TLAS instantly.
+ *
+ * That said, the fact that the application can swap in new BLAS handles
+ * without rebuilding the TLAS sounds a bit odd.
+ */
+
+/* Copy an acceleration structure, driven by args.mode:
+ *   - ANV_COPY_MODE_COPY:        plain device-to-device copy of the BVH.
+ *   - ANV_COPY_MODE_SERIALIZE:   write a vk_accel_struct_serialization_header
+ *                                and the BLAS handle table to dst, then copy
+ *                                the BVH after them.
+ *   - ANV_COPY_MODE_DESERIALIZE: copy the BVH out of a serialized blob and
+ *                                patch every anv_instance_leaf with the
+ *                                (possibly swapped) handles stored after the
+ *                                serialization header.
+ * Every invocation copies one 8-byte word per iteration of a grid-wide
+ * strided loop, so together the dispatch covers the whole compacted_size
+ * range of the source structure.
+ */
+void
+main(void)
+{
+   uint32_t global_id = gl_GlobalInvocationID.x;
+   /* Total number of invocations in the dispatch; 128 mirrors local_size_x
+    * declared above.
+    */
+   uint32_t lanes = gl_NumWorkGroups.x * 128;
+   /* Byte stride between two successive iterations of one invocation
+    * (each lane moves 8 bytes per iteration).
+    */
+   uint32_t increment = lanes * 8;
+
+   uint64_t copy_src_addr = args.src_addr;
+   uint64_t copy_dst_addr = args.dst_addr;
+
+   if (args.mode == ANV_COPY_MODE_DESERIALIZE) {
+      /* A serialized blob starts with the serialization header followed by
+       * instance_count BLAS handles (see layout drawing above); skip both to
+       * reach the driver-specific BVH data we actually copy.
+       */
+      copy_src_addr += SIZEOF(vk_accel_struct_serialization_header) +
+                       DEREF(REF(vk_accel_struct_serialization_header)(args.src_addr)).instance_count * SIZEOF(uint64_t);
+   }
+
+   /* In every mode copy_src_addr now points at the anv_accel_struct_header
+    * that leads the driver-specific BVH data.
+    */
+   REF(anv_accel_struct_header) header = REF(anv_accel_struct_header)(copy_src_addr);
+
+   /* Address of the BLAS handle table inside the serialized blob: read from
+    * src when deserializing; overridden below to point into dst when
+    * serializing.
+    */
+   uint64_t instance_base = args.src_addr + SIZEOF(vk_accel_struct_serialization_header);
+   /* Byte offset, from the start of the BVH, of the instance-leaf pointer
+    * array.
+    */
+   uint64_t instance_offset = SIZEOF(anv_accel_struct_header);
+
+   /* We store the address of instance_leaf after bvh header */
+   uint64_t instance_end = DEREF(header).instance_count * SIZEOF(uint64_t);
+
+   /* Turn instance_end into the end offset of that pointer array; it stays 0
+    * when there are no instances (e.g. a BLAS), disabling the patch path.
+    */
+   if (instance_end > 0)
+      instance_end += instance_offset;
+
+   if (args.mode == ANV_COPY_MODE_SERIALIZE) {
+      /* Leave room in dst for the serialization header and handle table. */
+      copy_dst_addr += SIZEOF(vk_accel_struct_serialization_header) +
+                       DEREF(REF(anv_accel_struct_header)(args.src_addr)).instance_count * SIZEOF(uint64_t);
+
+      /* A single invocation fills in the serialization header. */
+      if (global_id == 0) {
+         REF(vk_accel_struct_serialization_header) ser_header =
+            REF(vk_accel_struct_serialization_header)(args.dst_addr);
+         DEREF(ser_header).serialization_size = DEREF(header).serialization_size;
+         DEREF(ser_header).deserialization_size = DEREF(header).compacted_size;
+         DEREF(ser_header).instance_count = DEREF(header).instance_count;
+
+         /* UUIDs are copied element-wise from the push constants. */
+         for (uint32_t offset = 0; offset < VK_UUID_SIZE; offset++) {
+            DEREF(ser_header).driver_uuid[offset] = args.driver_uuid[offset];
+         }
+
+         for (uint32_t offset = 0; offset < VK_UUID_SIZE; offset++) {
+            DEREF(ser_header).accel_struct_compat[offset] = args.accel_struct_compat[offset];
+         }
+      }
+
+      /* When serializing, the handle table is written into dst. */
+      instance_base = args.dst_addr + SIZEOF(vk_accel_struct_serialization_header);
+   } else if (args.mode == ANV_COPY_MODE_COPY) {
+      /* Plain copy: no handle table to read or patch. */
+      instance_end = 0;
+   }
+
+   uint64_t size = DEREF(header).compacted_size;
+   for (uint64_t offset = global_id * 8; offset < size; offset += increment) {
+      /* copy 8 bytes per iteration */
+      DEREF(REF(uint64_t)(copy_dst_addr + offset)) =
+         DEREF(REF(uint64_t)(copy_src_addr + offset));
+
+      /* Do the adjustment inline in the same invocation that copies the data so that we don't have
+       * to synchronize.
+       */
+      /* Only offsets that land inside the instance-leaf pointer array need
+       * handle fix-up.
+       */
+      if (offset < instance_end && offset >= instance_offset &&
+          (offset - instance_offset) % SIZEOF(uint64_t) == 0) {
+         uint64_t idx = (offset - instance_offset) / SIZEOF(uint64_t);
+
+         if (args.mode == ANV_COPY_MODE_SERIALIZE) {
+            /* Indirectly access the anv_instance_leaf, and store the blas_ptrs after ser_header */
+            /* NOTE(review): the 48-bit mask suggests the low 48 bits of
+             * bvh_ptr hold the address and the upper bits hold flags/metadata
+             * — confirm against anv_bvh.h.
+             */
+            uint64_t instance_leaf_addr = DEREF(REF(uint64_t)(copy_src_addr + offset));
+            REF(anv_instance_leaf) instance_leaf = REF(anv_instance_leaf)(instance_leaf_addr);
+            uint64_t blas_ptr = DEREF(instance_leaf).part1.bvh_ptr & 0xfffffffffffful;
+            DEREF(INDEX(uint64_t, instance_base, idx)) = blas_ptr;
+         } else { /* ANV_COPY_MODE_DESERIALIZE */
+            /* Indirectly access the anv_instance_leaf, and replace the bvh_ptr with the ones after ser_header */
+            uint64_t instance_leaf_addr = DEREF(REF(uint64_t)(copy_dst_addr + offset));
+            REF(anv_instance_leaf) instance_leaf = REF(anv_instance_leaf)(instance_leaf_addr);
+            uint64_t blas_ptr = DEREF(INDEX(uint64_t, instance_base, idx));
+            DEREF(instance_leaf).part1.bvh_ptr = (blas_ptr & 0xfffffffffffful);
+
+            /* set the startNodePtr to blas_ptr + ANV_HEADER_SIZE */
+            uint64_t mask = 0x0000fffffffffffful;
+            uint64_t new_startNodePtr = blas_ptr + ANV_RT_BVH_HEADER_SIZE;
+            /* clear bits and set: keep the instance flags in the upper bits,
+             * replace the 48-bit start-node pointer in the lower bits.
+             */
+            DEREF(instance_leaf).part0.start_node_ptr_and_inst_flags =
+               (DEREF(instance_leaf).part0.start_node_ptr_and_inst_flags & ~mask) | (new_startNodePtr & mask);
+         }
+      }
+   }
+}