vulkan: Implement HPLOC

Reviewed-by: Natalie Vock <natalie.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39116>
This commit is contained in:
Konstantin Seurer 2025-11-23 13:59:43 +01:00
parent ab9d3528dc
commit a6a62363df
6 changed files with 354 additions and 2 deletions

View file

@ -0,0 +1,235 @@
/*
* Copyright © 2025 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#version 460
#include "vk_build_interface.h"
#include "vk_debug.h"
/* The workgroup width comes from the specialization constant SUBGROUP_SIZE_ID; the y and z
 * dimensions are fixed to 1.
 * NOTE(review): presumably the width is specialized to the subgroup size so that one workgroup is
 * exactly one subgroup -- confirm against the pipeline creation code.
 */
layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in;
/* Per-dispatch build arguments, supplied through push constants. */
layout(push_constant) uniform CONSTS
{
hploc_args args;
};
/* Distance metric between the sorted entries at `index` and `index + 1` of args.ids.
 *
 * When the two keys differ, the result is 32 plus the position of the highest differing key bit.
 * When the keys are equal, the entry indices are compared instead, without the +32 offset, so a
 * tie on the keys always yields a smaller value than any real key difference.
 */
uint32_t
delta(uint32_t index)
{
   uint32_t next_index = index + 1;

   uint32_t key_a = DEREF(INDEX(key_id_pair, args.ids, index)).key;
   uint32_t key_b = DEREF(INDEX(key_id_pair, args.ids, next_index)).key;

   if (key_a != key_b)
      return 32 + findMSB(key_a ^ key_b);

   /* Identical keys: fall back to the (always different) indices. */
   return findMSB(index ^ next_index);
}
/* How many lower-indexed slots each node examines when looking for a merge partner. */
#define SEARCH_RADIUS 16
/* Shared staging arrays for the merge phase, indexed by compacted node slot:
 * node_ids holds the IR node id stored in each slot, node_aabbs its bounds, and candidate_infos
 * the best merge candidate that other slots have proposed for it (see main()).
 */
shared uint32_t node_ids[SUBGROUP_SIZE];
shared vk_aabb node_aabbs[SUBGROUP_SIZE];
shared uint32_t candidate_infos[SUBGROUP_SIZE];
/* Builds the internal nodes of the IR BVH from the sorted key/id pairs in args.ids.
 *
 * Each invocation starts at one leaf and walks up an implicit hierarchy: at every step it picks a
 * parent split (via delta()), and exchanges its range boundary with the parent's slot in
 * args.ranges. The first invocation to arrive at a parent finds 0xffffffff there and stops; the
 * second one picks up the sibling's boundary and continues with the combined range.
 *
 * Whenever an invocation owns a range that is too large (or is the root range), the whole subgroup
 * cooperatively loads the nodes of that range into shared memory and merges nearest neighbors
 * pairwise, emitting vk_ir_box_node entries, until few enough nodes remain. The surviving node ids
 * are written back to the start of the range in args.ids.
 */
void
main(void)
{
   uint32_t global_id = gl_GlobalInvocationID.x;

   REF(vk_ir_header) header = REF(vk_ir_header)(args.header);
   uint32_t active_leaf_count = DEREF(header).active_leaf_count;

   /* Degenerate builds (zero or one leaf): invocation 0 writes a single box node by hand. */
   if (active_leaf_count <= 1) {
      if (global_id > 0)
         return;

      DEREF(header).ir_internal_node_count = 1;

      uint32_t child_id = VK_BVH_INVALID_NODE;
      vk_ir_node child = vk_ir_node(vk_aabb(vec3(0.0), vec3(0.0)));
      if (active_leaf_count > 0) {
         REF(key_id_pair) key_id = INDEX(key_id_pair, args.ids, global_id);
         child_id = DEREF(key_id).id;
         child = DEREF(REF(vk_ir_node)(OFFSET(args.bvh, ir_id_to_offset(child_id))));
      }

      REF(vk_ir_box_node) node = REF(vk_ir_box_node)(OFFSET(args.bvh, args.internal_node_base));
      DEREF(node).base.aabb = child.aabb;
      DEREF(node).children[0] = child_id;
      DEREF(node).children[1] = VK_BVH_INVALID_NODE;
      DEREF(node).bvh_offset = VK_UNKNOWN_BVH_OFFSET;
      if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS) && active_leaf_count > 0)
         DEREF(node).flags = fetch_child_flags(args.bvh, child_id);

      return;
   }

   /* Start at the leaf nodes which cover only one primitive => start=end. */
   uint32_t range_start = global_id;
   uint32_t range_end = global_id;

   uint32_t internal_node_count = active_leaf_count - 1;

   /* Invocations past the last leaf never take part in the traversal, but they still execute the
    * subgroup operations below.
    */
   bool is_active = global_id < active_leaf_count;
   while (subgroupAny(is_active)) {
      uint32_t parent_index = 0xffffffff;
      if (is_active) {
         /* The parent node has either the index range_start-1 or range_end. Avoid indexing -1 or active_leaf_count. */
         bool use_right_parent = range_start == 0 || (range_end < internal_node_count && delta(range_end) < delta(range_start - 1));
         parent_index = use_right_parent ? range_end : (range_start - 1);

         if (parent_index == internal_node_count) {
            /* The range already spans every leaf; there is no parent left to visit. */
            is_active = false;
         } else {
            /* Publish our far boundary in the parent's slot and fetch whatever the sibling left. */
            uint32_t prev_range = atomicExchange(DEREF(INDEX(uint32_t, args.ranges, parent_index)), use_right_parent ? range_start : range_end);
            if (prev_range == 0xffffffff) {
               /* First to arrive: the sibling will continue with the combined range. */
               is_active = false;
            } else {
               if (use_right_parent)
                  range_end = prev_range;
               else
                  range_start = prev_range;
            }
         }
      }

      /* Merging phase for every invocation that has a range with more than SUBGROUP_SIZE / 2 nodes.
       * The nodes are merged until the number of nodes is at most SUBGROUP_SIZE / 2 which ensures that
       * the invocation handling the parent node can load its child nodes.
       */
      const uint32_t cluster_threshold = SUBGROUP_SIZE / 2;

      uint32_t range_size = range_end - range_start + 1;
      bool range_is_root = subgroupAny(range_size == active_leaf_count);

      /* One bit per invocation whose range has to be merged by the whole subgroup. Only the low
       * 64 bits of the ballot are kept.
       */
      uint64_t cluster_mask = packUint2x32(subgroupBallot(is_active && (range_is_root || range_size > cluster_threshold)).xy);
      while (cluster_mask != 0) {
         uint32_t cluster_invoc = uint32_t(findLSB(cluster_mask));
         /* Clear the LSB. */
         cluster_mask &= cluster_mask - 1;

         /* Broadcast the range (and its split position) owned by the selected invocation. */
         uint32_t start = subgroupShuffle(range_start, cluster_invoc);
         uint32_t split = subgroupShuffle(parent_index, cluster_invoc);
         uint32_t end = subgroupShuffle(range_end, cluster_invoc);

         /* The lower half of the subgroup loads from the left child range [start, split], the
          * upper half from the right child range [split + 1, end].
          */
         uint32_t load_index, load_base, index_range;
         if (gl_SubgroupInvocationID >= cluster_threshold) {
            load_index = gl_SubgroupInvocationID - cluster_threshold;
            load_base = split + 1;
            index_range = end - split;
         } else {
            load_index = gl_SubgroupInvocationID;
            load_base = start;
            index_range = split + 1 - start;
         }

         uint32_t node_id_index = load_base + load_index;

         uint32_t node_id = VK_BVH_INVALID_NODE;
         if (load_index < index_range)
            node_id = DEREF(INDEX(key_id_pair, args.ids, node_id_index)).id;

         /* Compact the valid nodes to the front of the shared arrays. */
         uvec4 node_valid_mask = subgroupBallot(node_id != VK_BVH_INVALID_NODE);
         uint32_t node_prefix_sum = subgroupBallotExclusiveBitCount(node_valid_mask);
         uint32_t node_count = subgroupBallotBitCount(node_valid_mask);

         if (node_id != VK_BVH_INVALID_NODE) {
            node_ids[node_prefix_sum] = node_id;
            node_aabbs[node_prefix_sum] = DEREF(REF(vk_ir_node)(OFFSET(args.bvh, ir_id_to_offset(node_id)))).aabb;
         }

         /* The root range is merged down to a single node, every other range to cluster_threshold. */
         while (node_count > (range_is_root ? 1 : cluster_threshold)) {
            node_id = VK_BVH_INVALID_NODE;
            vk_aabb node_aabb = node_aabbs[gl_SubgroupInvocationID];

            if (gl_SubgroupInvocationID < node_count) {
               candidate_infos[gl_SubgroupInvocationID] = 0xffffffff;

               uint32_t best_candidate = 0xffffffff;

               for (uint32_t i = 1; i <= SEARCH_RADIUS; i++) {
                  int32_t index = int32_t(gl_SubgroupInvocationID) - int(i);

                  /* Slots below 0 do not exist. The guard has to cover the node_aabbs[] loads as
                   * well, otherwise the low invocations index shared memory out of bounds.
                   */
                  if (index >= 0) {
                     vk_aabb shared_bounds;
                     shared_bounds.min = min(node_aabbs[index].min, node_aabb.min);
                     shared_bounds.max = max(node_aabbs[index].max, node_aabb.max);

                     /* Candidates pack the float bits of the merged surface area (shifted left by
                      * one) into the high bits and a slot index into the low bits that the mask
                      * clears, so a plain integer min() selects the partner.
                      */
                     uint32_t shared_sa = (floatBitsToUint(aabb_surface_area(shared_bounds)) << 1u) & (~(SUBGROUP_SIZE - 1));

                     uint32_t candidate = shared_sa | index;
                     best_candidate = min(best_candidate, candidate);

                     /* Also propose this slot as a partner to the lower-indexed neighbor. */
                     candidate = shared_sa | gl_SubgroupInvocationID;
                     atomicMin(candidate_infos[index], candidate);
                  }
               }

               /* Combine our own search result with what higher-indexed slots proposed to us. */
               best_candidate = min(best_candidate, candidate_infos[gl_SubgroupInvocationID]);

               uint32_t best_index = best_candidate & (SUBGROUP_SIZE - 1);
               uint32_t other_node_id = node_ids[best_index];

               vk_aabb shared_bounds;
               shared_bounds.min = min(node_aabbs[best_index].min, node_aabb.min);
               shared_bounds.max = max(node_aabbs[best_index].max, node_aabb.max);

               /* There is always at least one pair of invocations that can be merged because there is a finite number of pairs and
                * one of them therefore has a minimum surface area. If more than two nodes have the exact same surface area, the
                * neighbor search prioritizes lower invocation indices.
                */
               bool merge = best_index < SUBGROUP_SIZE && subgroupShuffle(best_index, best_index) == gl_SubgroupInvocationID;

               node_id = node_ids[gl_SubgroupInvocationID];

               if (merge) {
                  /* Of a mutually agreeing pair, the lower slot emits the new box node and the
                   * higher slot drops out.
                   */
                  if (gl_SubgroupInvocationID < best_index) {
                     uint32_t dst_index = atomicAdd(DEREF(header).ir_internal_node_count, 1);
                     uint32_t dst_offset = args.internal_node_base + dst_index * SIZEOF(vk_ir_box_node);

                     node_aabb = shared_bounds;

                     REF(vk_ir_box_node) node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset));
                     DEREF(node).base.aabb = shared_bounds;
                     DEREF(node).children[0] = node_id;
                     DEREF(node).children[1] = other_node_id;
                     DEREF(node).bvh_offset = VK_UNKNOWN_BVH_OFFSET;
                     if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS)) {
                        DEREF(node).flags = fetch_child_flags(args.bvh, node_id) & fetch_child_flags(args.bvh, other_node_id);
                     }

                     node_id = pack_ir_node_id(dst_offset, vk_ir_node_internal);
                  } else {
                     node_id = VK_BVH_INVALID_NODE;
                  }
               }
            }

            /* Re-compact the surviving nodes for the next merge round. */
            node_count = subgroupBallotBitCount(subgroupBallot(node_id != VK_BVH_INVALID_NODE));
            uint32_t node_prefix_sum = subgroupBallotExclusiveBitCount(subgroupBallot(node_id != VK_BVH_INVALID_NODE));
            if (node_id != VK_BVH_INVALID_NODE) {
               node_ids[node_prefix_sum] = node_id;
               node_aabbs[node_prefix_sum] = node_aabb;
            }
         }

         /* Write the remaining nodes back to the start of the range and pad the rest of the
          * written window with invalid ids.
          */
         if (gl_SubgroupInvocationID < min(end - start + 1, cluster_threshold)) {
            uint32_t dst_node = gl_SubgroupInvocationID < node_count ? node_ids[gl_SubgroupInvocationID] : VK_BVH_INVALID_NODE;
            DEREF(INDEX(key_id_pair, args.ids, start + gl_SubgroupInvocationID)).id = dst_node;
         }

         /* Device-scope acquire/release barrier on buffer memory after the id and node writes. */
         memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                       gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
      }
   }
}

View file

@ -40,6 +40,10 @@ bvh_shaders = [
'ploc_internal.comp',
'ploc_internal',
],
[
'hploc_internal.comp',
'hploc_internal',
],
]
spirv_include_dir = dir_source_root + '/src/compiler/spirv'
@ -70,6 +74,7 @@ vk_glsl_shader_extensions = [
'GL_KHR_shader_subgroup_shuffle',
'GL_KHR_shader_subgroup_ballot',
'GL_KHR_shader_subgroup_clustered',
'GL_KHR_shader_subgroup_vote',
'GL_EXT_shader_atomic_int64',
'GL_EXT_spirv_intrinsics',
]

View file

@ -110,4 +110,12 @@ struct ploc_args {
uint32_t internal_node_offset;
};
/* Push-constant arguments of the H-PLOC internal-node build shader (hploc_internal.comp). */
struct hploc_args {
/* IR header; the shader reads active_leaf_count and updates ir_internal_node_count. */
REF(vk_ir_header) header;
/* Base address of the IR nodes; node offsets used by the shader are relative to it. */
VOID_REF bvh;
/* Key/id pairs; the shader reads keys and ids and overwrites ids with merged node ids. */
REF(key_id_pair) ids;
/* One uint32_t per internal node, used with atomicExchange to hand range boundaries between
 * invocations. The host fills it with 0xffffffff before the dispatch.
 */
VOID_REF ranges;
/* Byte offset, relative to bvh, at which the shader writes vk_ir_box_node entries. */
uint32_t internal_node_base;
};
#endif

View file

@ -61,6 +61,10 @@ static const uint32_t ploc_spv[] = {
#include "bvh/ploc_internal.spv.h"
};
/* Shader binary for the H-PLOC internal-node build, passed to vk_get_bvh_build_pipeline_spv().
 * NOTE(review): presumably generated from bvh/hploc_internal.comp by the build system -- confirm.
 */
static const uint32_t hploc_spv[] = {
#include "bvh/hploc_internal.spv.h"
};
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_CreateAccelerationStructureKHR(VkDevice _device,
const VkAccelerationStructureCreateInfoKHR *pCreateInfo,
@ -172,10 +176,13 @@ vk_acceleration_structure_build_state_init(struct vk_acceleration_structure_buil
uint32_t offset = 0;
uint32_t ploc_scratch_space = 0;
uint32_t hploc_scratch_space = 0;
uint32_t lbvh_node_space = 0;
if (state->config.internal_type == VK_INTERNAL_BUILD_TYPE_PLOC)
ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition);
else if (state->config.internal_type == VK_INTERNAL_BUILD_TYPE_HPLOC)
hploc_scratch_space = sizeof(uint32_t) * internal_count;
else
lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count;
@ -199,8 +206,11 @@ vk_acceleration_structure_build_state_init(struct vk_acceleration_structure_buil
/* Internal sorting data is not needed when PLOC/LBVH are invoked,
* save space by aliasing them */
state->scratch.ploc_prefix_sum_partition_offset = offset;
offset += MAX2(requirements.internal_size, ploc_scratch_space);
state->scratch.lbvh_node_offset = offset;
offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space);
state->scratch.hploc_ranges_offset = offset;
offset += MAX2(hploc_scratch_space, lbvh_node_space);
/* Make sure encode scratch space does not overlap the BVH. */
offset = MAX2(offset, encode_scratch_end);
@ -242,6 +252,7 @@ struct bvh_batch_state {
bool any_updateable;
bool any_non_updateable;
bool any_ploc;
bool any_hploc;
bool any_lbvh;
bool any_update;
};
@ -1069,6 +1080,72 @@ ploc_build_internal(VkCommandBuffer commandBuffer,
return VK_SUCCESS;
}
/* Records the H-PLOC internal-node build for every entry of the batch whose internal build type
 * is VK_INTERNAL_BUILD_TYPE_HPLOC.
 *
 * The compute pipeline and its layout are fetched once, the pipeline is bound, and each matching
 * build then gets its own push constants and one dispatch. The whole sequence is wrapped in
 * begin/end debug markers when args->emit_markers is set.
 *
 * Returns VK_SUCCESS, or the error reported while obtaining the pipeline or the pipeline layout
 * (in which case nothing has been recorded).
 */
static VkResult
hploc_build_internal(VkCommandBuffer commandBuffer,
                     struct vk_device *device, struct vk_meta_device *meta,
                     const struct vk_acceleration_structure_build_args *args,
                     uint32_t infoCount,
                     const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states)
{
   const uint32_t build_flags = args->propagate_cull_flags ? VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS : 0;

   VkPipeline hploc_pipeline;
   VkResult status = vk_get_bvh_build_pipeline_spv(device, meta, VK_META_OBJECT_KEY_HPLOC, hploc_spv,
                                                   sizeof(hploc_spv), sizeof(struct hploc_args),
                                                   args, build_flags, &hploc_pipeline,
                                                   false /* unaligned_dispatch */);
   if (status != VK_SUCCESS)
      return status;

   VkPipelineLayout hploc_layout;
   status = vk_get_bvh_build_pipeline_layout(device, meta, sizeof(struct hploc_args), &hploc_layout);
   if (status != VK_SUCCESS)
      return status;

   if (args->emit_markers) {
      struct vk_acceleration_structure_build_marker begin_marker = {
         .step = VK_ACCELERATION_STRUCTURE_BUILD_STEP_HPLOC_BUILD_INTERNAL,
      };
      device->as_build_ops->begin_debug_marker(commandBuffer, &begin_marker);
   }

   const struct vk_device_dispatch_table *dispatch = &device->dispatch_table;

   dispatch->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, hploc_pipeline);

   for (uint32_t idx = 0; idx < infoCount; ++idx) {
      struct bvh_state *state = &bvh_states[idx];

      /* Only builds that selected H-PLOC get a dispatch. */
      if (state->vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_HPLOC) {
         /* The shader keeps its subgroup ballot in a 64-bit mask. */
         assert(args->subgroup_size <= 64);

         const uint64_t scratch_base = pInfos[idx].scratchData.deviceAddress;

         const struct hploc_args push = {
            .header = scratch_base + state->vk.scratch.header_offset,
            .bvh = scratch_base + state->vk.scratch.ir_offset,
            .ids = scratch_base + state->scratch_offset,
            .ranges = scratch_base + state->vk.scratch.hploc_ranges_offset,
            /* The shader addresses internal nodes relative to the IR base. */
            .internal_node_base = state->vk.scratch.internal_node_offset - state->vk.scratch.ir_offset,
         };

         dispatch->CmdPushConstants(commandBuffer, hploc_layout,
                                    VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push), &push);
         /* One workgroup per subgroup_size leaves, and at least one so empty builds still run. */
         dispatch->CmdDispatch(commandBuffer, MAX2(DIV_ROUND_UP(state->vk.leaf_node_count, args->subgroup_size), 1), 1, 1);
      }
   }

   if (args->emit_markers) {
      struct vk_acceleration_structure_build_marker end_marker = {
         .step = VK_ACCELERATION_STRUCTURE_BUILD_STEP_HPLOC_BUILD_INTERNAL,
      };
      device->as_build_ops->end_debug_marker(commandBuffer, &end_marker);
   }

   return VK_SUCCESS;
}
void
vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
struct vk_device *device,
@ -1124,6 +1201,8 @@ vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_PLOC) {
batch_state.any_ploc = true;
} else if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_HPLOC) {
batch_state.any_hploc = true;
} else if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_LBVH) {
batch_state.any_lbvh = true;
} else if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_UPDATE) {
@ -1172,7 +1251,7 @@ vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
}, 0, NULL, 0, NULL);
if (batch_state.any_lbvh || batch_state.any_ploc) {
if (batch_state.any_lbvh || batch_state.any_ploc || batch_state.any_hploc) {
VkResult result;
if (batch_state.any_non_updateable) {
@ -1199,6 +1278,17 @@ vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
}
}
if (batch_state.any_hploc) {
for (uint32_t i = 0; i < infoCount; ++i) {
uint32_t internal_count = MAX2(bvh_states[i].vk.leaf_node_count, 2) - 1;
if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_HPLOC) {
device->cmd_fill_buffer_addr(commandBuffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.hploc_ranges_offset,
sizeof(uint32_t) * internal_count, 0xffffffff);
}
}
vk_barrier_transfer_w_to_compute_r(commandBuffer);
}
vk_barrier_compute_w_to_compute_r(commandBuffer);
result =
@ -1237,6 +1327,16 @@ vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
}
}
if (batch_state.any_hploc) {
result =
hploc_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(cmd_buffer, result);
return;
}
}
vk_barrier_compute_w_to_compute_r(commandBuffer);
vk_barrier_compute_w_to_indirect_compute_r(commandBuffer);
flushed_compute_after_init_update_scratch = true;

View file

@ -43,6 +43,7 @@ enum vk_acceleration_structure_build_step {
VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_SORT,
VK_ACCELERATION_STRUCTURE_BUILD_STEP_LBVH_BUILD_INTERNAL,
VK_ACCELERATION_STRUCTURE_BUILD_STEP_PLOC_BUILD_INTERNAL,
VK_ACCELERATION_STRUCTURE_BUILD_STEP_HPLOC_BUILD_INTERNAL,
VK_ACCELERATION_STRUCTURE_BUILD_STEP_ENCODE,
VK_ACCELERATION_STRUCTURE_BUILD_STEP_UPDATE,
};
@ -88,6 +89,7 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(vk_acceleration_structure, base, VkAccelerationSt
enum vk_internal_build_type {
VK_INTERNAL_BUILD_TYPE_LBVH,
VK_INTERNAL_BUILD_TYPE_PLOC,
VK_INTERNAL_BUILD_TYPE_HPLOC,
VK_INTERNAL_BUILD_TYPE_UPDATE,
};
@ -111,6 +113,7 @@ struct vk_scratch_layout {
uint32_t ploc_prefix_sum_partition_offset;
uint32_t lbvh_node_offset;
uint32_t hploc_ranges_offset;
uint32_t ir_offset;
uint32_t internal_node_offset;

View file

@ -180,6 +180,7 @@ enum vk_meta_object_key_type {
VK_META_OBJECT_KEY_LBVH_MAIN,
VK_META_OBJECT_KEY_LBVH_GENERATE_IR,
VK_META_OBJECT_KEY_PLOC,
VK_META_OBJECT_KEY_HPLOC,
/* Should be used as an offset for driver-specific object types. */
VK_META_OBJECT_KEY_DRIVER_OFFSET = 0x80000000,