diff --git a/src/vulkan/runtime/bvh/hploc_internal.comp b/src/vulkan/runtime/bvh/hploc_internal.comp
new file mode 100644
index 00000000000..b733899aca0
--- /dev/null
+++ b/src/vulkan/runtime/bvh/hploc_internal.comp
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2025 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#version 460
+
+#include "vk_build_interface.h"
+#include "vk_debug.h"
+
+layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in;
+
+layout(push_constant) uniform CONSTS
+{
+   hploc_args args;
+};
+
+/* Returns a value that grows with the highest bit in which the sort keys of
+ * the adjacent entries index and index + 1 differ. Entries with equal keys
+ * compare below all others and are told apart by their indices instead.
+ */
+uint32_t
+delta(uint32_t index)
+{
+   uint32_t left_index = index;
+   uint32_t right_index = index + 1;
+
+   uint32_t left_key = DEREF(INDEX(key_id_pair, args.ids, left_index)).key;
+   uint32_t right_key = DEREF(INDEX(key_id_pair, args.ids, right_index)).key;
+
+   return left_key != right_key ? (32 + findMSB(left_key ^ right_key)) : findMSB(left_index ^ right_index);
+}
+
+#define SEARCH_RADIUS 16
+
+shared uint32_t node_ids[SUBGROUP_SIZE];
+shared vk_aabb node_aabbs[SUBGROUP_SIZE];
+shared uint32_t candidate_infos[SUBGROUP_SIZE];
+
+/* Every invocation starts with the range covering only its own leaf and keeps
+ * joining the neighboring range selected via delta(). The first invocation to
+ * reach a parent stores its range bound and stops; the second one picks that
+ * bound up and continues with the combined range.
+ */
+void
+main(void)
+{
+   uint32_t global_id = gl_GlobalInvocationID.x;
+   REF(vk_ir_header) header = REF(vk_ir_header)(args.header);
+   uint32_t active_leaf_count = DEREF(header).active_leaf_count;
+
+   if (active_leaf_count <= 1) {
+      if (global_id > 0)
+         return;
+
+      DEREF(header).ir_internal_node_count = 1;
+
+      uint32_t child_id = VK_BVH_INVALID_NODE;
+      vk_ir_node child = vk_ir_node(vk_aabb(vec3(0.0), vec3(0.0)));
+      if (active_leaf_count > 0) {
+         REF(key_id_pair) key_id = INDEX(key_id_pair, args.ids, global_id);
+         child_id = DEREF(key_id).id;
+         child = DEREF(REF(vk_ir_node)(OFFSET(args.bvh, ir_id_to_offset(child_id))));
+      }
+
+      REF(vk_ir_box_node) node = REF(vk_ir_box_node)(OFFSET(args.bvh, args.internal_node_base));
+      DEREF(node).base.aabb = child.aabb;
+      DEREF(node).children[0] = child_id;
+      DEREF(node).children[1] = VK_BVH_INVALID_NODE;
+      DEREF(node).bvh_offset = VK_UNKNOWN_BVH_OFFSET;
+      if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS) && active_leaf_count > 0)
+         DEREF(node).flags = fetch_child_flags(args.bvh, child_id);
+
+      return;
+   }
+
+   /* Start at the leaf nodes which cover only one primitive => start=end. */
+   uint32_t range_start = global_id;
+   uint32_t range_end = global_id;
+
+   uint32_t internal_node_count = active_leaf_count - 1;
+
+   bool is_active = global_id < active_leaf_count;
+
+   while (subgroupAny(is_active)) {
+      uint32_t parent_index = 0xffffffff;
+      if (is_active) {
+         /* The parent node has either the index range_start-1 or range_end. Avoid indexing -1 or active_leaf_count. */
+         bool use_right_parent = range_start == 0 || (range_end < internal_node_count && delta(range_end) < delta(range_start - 1));
+
+         parent_index = use_right_parent ? range_end : (range_start - 1);
+         if (parent_index == internal_node_count) {
+            is_active = false;
+         } else {
+            uint32_t prev_range = atomicExchange(DEREF(INDEX(uint32_t, args.ranges, parent_index)), use_right_parent ? range_start : range_end);
+            if (prev_range == 0xffffffff) {
+               is_active = false;
+            } else {
+               if (use_right_parent)
+                  range_end = prev_range;
+               else
+                  range_start = prev_range;
+            }
+         }
+      }
+
+      /* Merging phase for every invocation that has a range with more than SUBGROUP_SIZE / 2 nodes.
+       * The nodes are merged until the number of nodes is below SUBGROUP_SIZE / 2 which ensures that
+       * the invocation handling the parent node can load its child nodes.
+       */
+      const uint32_t cluster_threshold = SUBGROUP_SIZE / 2;
+      uint32_t range_size = range_end - range_start + 1;
+      bool range_is_root = subgroupAny(range_size == active_leaf_count);
+      uint64_t cluster_mask = packUint2x32(subgroupBallot(is_active && (range_is_root || range_size > cluster_threshold)).xy);
+      while (cluster_mask != 0) {
+         uint32_t cluster_invoc = uint32_t(findLSB(cluster_mask));
+         /* Clear the LSB. */
+         cluster_mask &= cluster_mask - 1;
+
+         uint32_t start = subgroupShuffle(range_start, cluster_invoc);
+         uint32_t split = subgroupShuffle(parent_index, cluster_invoc);
+         uint32_t end = subgroupShuffle(range_end, cluster_invoc);
+
+         uint32_t load_index, load_base, index_range;
+         if (gl_SubgroupInvocationID >= cluster_threshold) {
+            load_index = gl_SubgroupInvocationID - cluster_threshold;
+            load_base = split + 1;
+            index_range = end - split;
+         } else {
+            load_index = gl_SubgroupInvocationID;
+            load_base = start;
+            index_range = split + 1 - start;
+         }
+         uint32_t node_id_index = load_base + load_index;
+         uint32_t node_id = VK_BVH_INVALID_NODE;
+         if (load_index < index_range)
+            node_id = DEREF(INDEX(key_id_pair, args.ids, node_id_index)).id;
+
+         uvec4 node_valid_mask = subgroupBallot(node_id != VK_BVH_INVALID_NODE);
+         uint32_t node_prefix_sum = subgroupBallotExclusiveBitCount(node_valid_mask);
+         uint32_t node_count = subgroupBallotBitCount(node_valid_mask);
+         if (node_id != VK_BVH_INVALID_NODE) {
+            node_ids[node_prefix_sum] = node_id;
+            node_aabbs[node_prefix_sum] = DEREF(REF(vk_ir_node)(OFFSET(args.bvh, ir_id_to_offset(node_id)))).aabb;
+         }
+
+         while (node_count > (range_is_root ? 1 : cluster_threshold)) {
+            node_id = VK_BVH_INVALID_NODE;
+            vk_aabb node_aabb = node_aabbs[gl_SubgroupInvocationID];
+            if (gl_SubgroupInvocationID < node_count) {
+               candidate_infos[gl_SubgroupInvocationID] = 0xffffffff;
+               uint32_t best_candidate = 0xffffffff;
+               for (uint32_t i = 1; i <= SEARCH_RADIUS; i++) {
+                  int32_t index = int32_t(gl_SubgroupInvocationID) - int(i);
+                  /* index only decreases from here on; stop before it is used to
+                   * read node_aabbs out of bounds. */
+                  if (index < 0)
+                     break;
+
+                  vk_aabb shared_bounds;
+                  shared_bounds.min = min(node_aabbs[index].min, node_aabb.min);
+                  shared_bounds.max = max(node_aabbs[index].max, node_aabb.max);
+
+                  uint32_t shared_sa = (floatBitsToUint(aabb_surface_area(shared_bounds)) << 1u) & (~(SUBGROUP_SIZE - 1));
+                  uint32_t candidate = shared_sa | index;
+                  best_candidate = min(best_candidate, candidate);
+
+                  candidate = shared_sa | gl_SubgroupInvocationID;
+                  atomicMin(candidate_infos[index], candidate);
+               }
+
+               best_candidate = min(best_candidate, candidate_infos[gl_SubgroupInvocationID]);
+               uint32_t best_index = best_candidate & (SUBGROUP_SIZE - 1);
+               uint32_t other_node_id = node_ids[best_index];
+
+               vk_aabb shared_bounds;
+               shared_bounds.min = min(node_aabbs[best_index].min, node_aabb.min);
+               shared_bounds.max = max(node_aabbs[best_index].max, node_aabb.max);
+
+               /* There is always at least one pair of invocations that can be merged because there is a finite number of pairs and
+                * one of them therefore has a minimum surface area. If more than two nodes have the exact same surface area, the
+                * neighbor search prioritizes lower invocation indices.
+                */
+               bool merge = best_index < SUBGROUP_SIZE && subgroupShuffle(best_index, best_index) == gl_SubgroupInvocationID;
+
+               node_id = node_ids[gl_SubgroupInvocationID];
+
+               if (merge) {
+                  if (gl_SubgroupInvocationID < best_index) {
+                     uint32_t dst_index = atomicAdd(DEREF(header).ir_internal_node_count, 1);
+                     uint32_t dst_offset = args.internal_node_base + dst_index * SIZEOF(vk_ir_box_node);
+
+                     node_aabb = shared_bounds;
+
+                     REF(vk_ir_box_node) node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset));
+                     DEREF(node).base.aabb = shared_bounds;
+                     DEREF(node).children[0] = node_id;
+                     DEREF(node).children[1] = other_node_id;
+                     DEREF(node).bvh_offset = VK_UNKNOWN_BVH_OFFSET;
+                     if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS)) {
+                        DEREF(node).flags = fetch_child_flags(args.bvh, node_id) & fetch_child_flags(args.bvh, other_node_id);
+                     }
+
+                     node_id = pack_ir_node_id(dst_offset, vk_ir_node_internal);
+                  } else {
+                     node_id = VK_BVH_INVALID_NODE;
+                  }
+               }
+            }
+
+            node_count = subgroupBallotBitCount(subgroupBallot(node_id != VK_BVH_INVALID_NODE));
+            uint32_t node_prefix_sum = subgroupBallotExclusiveBitCount(subgroupBallot(node_id != VK_BVH_INVALID_NODE));
+            if (node_id != VK_BVH_INVALID_NODE) {
+               node_ids[node_prefix_sum] = node_id;
+               node_aabbs[node_prefix_sum] = node_aabb;
+            }
+         }
+
+         if (gl_SubgroupInvocationID < min(end - start + 1, cluster_threshold)) {
+            uint32_t dst_node = gl_SubgroupInvocationID < node_count ? node_ids[gl_SubgroupInvocationID] : VK_BVH_INVALID_NODE;
+            DEREF(INDEX(key_id_pair, args.ids, start + gl_SubgroupInvocationID)).id = dst_node;
+         }
+
+         memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                       gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+      }
+   }
+}
diff --git a/src/vulkan/runtime/bvh/meson.build b/src/vulkan/runtime/bvh/meson.build
index add1590b70f..991ad43fcd6 100644
--- a/src/vulkan/runtime/bvh/meson.build
+++ b/src/vulkan/runtime/bvh/meson.build
@@ -40,6 +40,10 @@ bvh_shaders = [
     'ploc_internal.comp',
     'ploc_internal',
   ],
+  [
+    'hploc_internal.comp',
+    'hploc_internal',
+  ],
 ]
 
 spirv_include_dir = dir_source_root + '/src/compiler/spirv'
@@ -70,6 +74,7 @@ vk_glsl_shader_extensions = [
   'GL_KHR_shader_subgroup_shuffle',
   'GL_KHR_shader_subgroup_ballot',
   'GL_KHR_shader_subgroup_clustered',
+  'GL_KHR_shader_subgroup_vote',
   'GL_EXT_shader_atomic_int64',
   'GL_EXT_spirv_intrinsics',
 ]
diff --git a/src/vulkan/runtime/bvh/vk_build_interface.h b/src/vulkan/runtime/bvh/vk_build_interface.h
index 714b6485887..99ce8daa9d4 100644
--- a/src/vulkan/runtime/bvh/vk_build_interface.h
+++ b/src/vulkan/runtime/bvh/vk_build_interface.h
@@ -110,4 +110,13 @@ struct ploc_args {
    uint32_t internal_node_offset;
 };
 
+/* Push constants of hploc_internal.comp. */
+struct hploc_args {
+   REF(vk_ir_header) header;
+   VOID_REF bvh;
+   REF(key_id_pair) ids;
+   VOID_REF ranges;
+   uint32_t internal_node_base;
+};
+
 #endif
diff --git a/src/vulkan/runtime/vk_acceleration_structure.c b/src/vulkan/runtime/vk_acceleration_structure.c
index d7575fcd846..86752113959 100644
--- a/src/vulkan/runtime/vk_acceleration_structure.c
+++ b/src/vulkan/runtime/vk_acceleration_structure.c
@@ -61,6 +61,10 @@ static const uint32_t ploc_spv[] = {
 #include "bvh/ploc_internal.spv.h"
 };
 
+static const uint32_t hploc_spv[] = {
+#include "bvh/hploc_internal.spv.h"
+};
+
 VKAPI_ATTR VkResult VKAPI_CALL
 vk_common_CreateAccelerationStructureKHR(VkDevice _device,
                                          const VkAccelerationStructureCreateInfoKHR *pCreateInfo,
@@ -172,10 +176,13 @@ vk_acceleration_structure_build_state_init(struct vk_acceleration_structure_buil
    uint32_t offset = 0;
 
    uint32_t ploc_scratch_space = 0;
+   uint32_t hploc_scratch_space = 0;
    uint32_t lbvh_node_space = 0;
 
    if (state->config.internal_type == VK_INTERNAL_BUILD_TYPE_PLOC)
       ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition);
+   else if (state->config.internal_type == VK_INTERNAL_BUILD_TYPE_HPLOC)
+      hploc_scratch_space = sizeof(uint32_t) * internal_count;
    else
       lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count;
 
@@ -199,8 +206,11 @@ vk_acceleration_structure_build_state_init(struct vk_acceleration_structure_buil
    /* Internal sorting data is not needed when PLOC/LBVH are invoked,
     * save space by aliasing them */
    state->scratch.ploc_prefix_sum_partition_offset = offset;
+   offset += MAX2(requirements.internal_size, ploc_scratch_space);
+
    state->scratch.lbvh_node_offset = offset;
-   offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space);
+   state->scratch.hploc_ranges_offset = offset;
+   offset += MAX2(hploc_scratch_space, lbvh_node_space);
 
    /* Make sure encode scratch space does not overlap the BVH. */
    offset = MAX2(offset, encode_scratch_end);
@@ -242,6 +252,7 @@ struct bvh_batch_state {
    bool any_updateable;
    bool any_non_updateable;
    bool any_ploc;
+   bool any_hploc;
    bool any_lbvh;
    bool any_update;
 };
@@ -1069,6 +1080,78 @@ ploc_build_internal(VkCommandBuffer commandBuffer,
    return VK_SUCCESS;
 }
 
+/* Records the HPLOC internal-node build for every BVH in the batch whose
+ * internal build type is HPLOC: binds the pipeline once, then pushes the
+ * constants and dispatches DIV_ROUND_UP(leaf_node_count, subgroup_size)
+ * workgroups (at least one) per BVH. Returns the pipeline or pipeline layout
+ * creation error if there is one, VK_SUCCESS otherwise.
+ */
+static VkResult
+hploc_build_internal(VkCommandBuffer commandBuffer,
+                     struct vk_device *device, struct vk_meta_device *meta,
+                     const struct vk_acceleration_structure_build_args *args,
+                     uint32_t infoCount,
+                     const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states)
+{
+   VkPipeline pipeline;
+   VkPipelineLayout layout;
+
+   uint32_t flags = 0;
+   if (args->propagate_cull_flags)
+      flags |= VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS;
+
+   VkResult result = vk_get_bvh_build_pipeline_spv(device, meta, VK_META_OBJECT_KEY_HPLOC, hploc_spv,
+                                                   sizeof(hploc_spv), sizeof(struct hploc_args),
+                                                   args, flags, &pipeline,
+                                                   false /* unaligned_dispatch */);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = vk_get_bvh_build_pipeline_layout(device, meta, sizeof(struct hploc_args), &layout);
+   if (result != VK_SUCCESS)
+      return result;
+
+   if (args->emit_markers) {
+      struct vk_acceleration_structure_build_marker marker = {
+         .step = VK_ACCELERATION_STRUCTURE_BUILD_STEP_HPLOC_BUILD_INTERNAL,
+      };
+      device->as_build_ops->begin_debug_marker(commandBuffer, &marker);
+   }
+
+   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
+   disp->CmdBindPipeline(
+      commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
+
+   for (uint32_t i = 0; i < infoCount; ++i) {
+      if (bvh_states[i].vk.config.internal_type != VK_INTERNAL_BUILD_TYPE_HPLOC)
+         continue;
+
+      /* The shader packs its subgroup ballots into a 64-bit mask. */
+      assert(args->subgroup_size <= 64);
+
+      uint64_t scratch_addr = pInfos[i].scratchData.deviceAddress;
+      const struct hploc_args consts = {
+         .header = scratch_addr + bvh_states[i].vk.scratch.header_offset,
+         .bvh = scratch_addr + bvh_states[i].vk.scratch.ir_offset,
+         .ranges = scratch_addr + bvh_states[i].vk.scratch.hploc_ranges_offset,
+         .ids = scratch_addr + bvh_states[i].scratch_offset,
+         .internal_node_base = bvh_states[i].vk.scratch.internal_node_offset - bvh_states[i].vk.scratch.ir_offset,
+      };
+
+      disp->CmdPushConstants(commandBuffer, layout,
+                             VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);
+      disp->CmdDispatch(commandBuffer, MAX2(DIV_ROUND_UP(bvh_states[i].vk.leaf_node_count, args->subgroup_size), 1), 1, 1);
+   }
+
+   if (args->emit_markers) {
+      struct vk_acceleration_structure_build_marker marker = {
+         .step = VK_ACCELERATION_STRUCTURE_BUILD_STEP_HPLOC_BUILD_INTERNAL,
+      };
+      device->as_build_ops->end_debug_marker(commandBuffer, &marker);
+   }
+
+   return VK_SUCCESS;
+}
+
 void
 vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
                                      struct vk_device *device,
@@ -1124,6 +1207,8 @@ vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
 
       if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_PLOC) {
          batch_state.any_ploc = true;
+      } else if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_HPLOC) {
+         batch_state.any_hploc = true;
       } else if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_LBVH) {
          batch_state.any_lbvh = true;
       } else if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_UPDATE) {
@@ -1172,7 +1257,7 @@ vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
                               .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
                            }, 0, NULL, 0, NULL);
 
-   if (batch_state.any_lbvh || batch_state.any_ploc) {
+   if (batch_state.any_lbvh || batch_state.any_ploc || batch_state.any_hploc) {
       VkResult result;
 
       if (batch_state.any_non_updateable) {
@@ -1199,6 +1284,19 @@ vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
          }
       }
 
+      /* Reset the HPLOC range slots; the shader treats 0xffffffff as "no
+       * invocation has reached this parent yet". */
+      if (batch_state.any_hploc) {
+         for (uint32_t i = 0; i < infoCount; ++i) {
+            uint32_t internal_count = MAX2(bvh_states[i].vk.leaf_node_count, 2) - 1;
+            if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_HPLOC) {
+               device->cmd_fill_buffer_addr(commandBuffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.hploc_ranges_offset,
+                                            sizeof(uint32_t) * internal_count, 0xffffffff);
+            }
+         }
+         vk_barrier_transfer_w_to_compute_r(commandBuffer);
+      }
+
       vk_barrier_compute_w_to_compute_r(commandBuffer);
 
       result =
@@ -1237,6 +1335,16 @@ vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
          }
       }
 
+      if (batch_state.any_hploc) {
+         result =
+            hploc_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states);
+
+         if (result != VK_SUCCESS) {
+            vk_command_buffer_set_error(cmd_buffer, result);
+            return;
+         }
+      }
+
       vk_barrier_compute_w_to_compute_r(commandBuffer);
       vk_barrier_compute_w_to_indirect_compute_r(commandBuffer);
       flushed_compute_after_init_update_scratch = true;
diff --git a/src/vulkan/runtime/vk_acceleration_structure.h b/src/vulkan/runtime/vk_acceleration_structure.h
index 2f81807f216..3d27538b6b4 100644
--- a/src/vulkan/runtime/vk_acceleration_structure.h
+++ b/src/vulkan/runtime/vk_acceleration_structure.h
@@ -43,6 +43,7 @@ enum vk_acceleration_structure_build_step {
    VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_SORT,
    VK_ACCELERATION_STRUCTURE_BUILD_STEP_LBVH_BUILD_INTERNAL,
    VK_ACCELERATION_STRUCTURE_BUILD_STEP_PLOC_BUILD_INTERNAL,
+   VK_ACCELERATION_STRUCTURE_BUILD_STEP_HPLOC_BUILD_INTERNAL,
    VK_ACCELERATION_STRUCTURE_BUILD_STEP_ENCODE,
    VK_ACCELERATION_STRUCTURE_BUILD_STEP_UPDATE,
 };
@@ -88,6 +89,7 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(vk_acceleration_structure, base, VkAccelerationSt
 enum vk_internal_build_type {
    VK_INTERNAL_BUILD_TYPE_LBVH,
    VK_INTERNAL_BUILD_TYPE_PLOC,
+   VK_INTERNAL_BUILD_TYPE_HPLOC,
    VK_INTERNAL_BUILD_TYPE_UPDATE,
 };
 
@@ -111,6 +113,7 @@ struct vk_scratch_layout {
 
    uint32_t ploc_prefix_sum_partition_offset;
    uint32_t lbvh_node_offset;
+   uint32_t hploc_ranges_offset;
 
    uint32_t ir_offset;
    uint32_t internal_node_offset;
diff --git a/src/vulkan/runtime/vk_meta.h b/src/vulkan/runtime/vk_meta.h
index 641d1b929af..b1a7cf5adf6 100644
--- a/src/vulkan/runtime/vk_meta.h
+++ b/src/vulkan/runtime/vk_meta.h
@@ -180,6 +180,7 @@ enum vk_meta_object_key_type {
    VK_META_OBJECT_KEY_LBVH_MAIN,
    VK_META_OBJECT_KEY_LBVH_GENERATE_IR,
    VK_META_OBJECT_KEY_PLOC,
+   VK_META_OBJECT_KEY_HPLOC,
 
    /* Should be used as an offset for driver-specific object types. */
    VK_META_OBJECT_KEY_DRIVER_OFFSET = 0x80000000,