From 2d48b2cb47365cb36b84800c937164dcfecade07 Mon Sep 17 00:00:00 2001
From: Konstantin Seurer
Date: Sun, 11 May 2025 11:02:33 +0200
Subject: [PATCH] radv: Use subgroup OPs for BVH updates on GFX12

This patch changes the update code to launch 8 invocations for every
internal node. The internal nodes update their child leaf nodes using
the geometry index and primitive index stored inside the primitive
node. Processing 8 child nodes in parallel is faster than looping over
them. Moving to one dispatch that updates all nodes in one go lets us
get rid of atomics and will also enable updatable BVHs to use pair
compression.

Improves Elden Ring (high settings, max RT settings, 1080p) by around
10%.

Reviewed-by: Natalie Vock
Part-of:
---
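The heart of the change: rather than one invocation per leaf walking up
the tree and looping over the 8 children of each internal node, a
cluster of 8 invocations owns one internal node, one lane per child
slot, and the merged parent bounds come from clustered subgroup
reductions. The sketch below illustrates only that reduction pattern;
the buffer names and layout are invented for the example, it assumes
invocations map linearly onto subgroup lanes (the real shader goes
through the invocation_cluster.h helper for this), and it leaves out
the readiness spin loop, leaf rebuilding, and child box quantization
that the actual update shader performs.

    #version 460
    #extension GL_KHR_shader_subgroup_clustered : require

    layout(local_size_x = 64) in;

    /* Hypothetical layout: 8 child AABBs per internal node, stored as
     * min/max vec4 pairs, plus one merged AABB per node. */
    layout(std430, binding = 0) readonly buffer ChildBounds { vec4 child_bounds[]; };
    layout(std430, binding = 1) writeonly buffer NodeBounds { vec4 node_bounds[]; };

    void main()
    {
       uint node = gl_GlobalInvocationID.x / 8; /* one node per cluster of 8 */
       uint slot = gl_GlobalInvocationID.x % 8; /* this lane's child slot    */

       vec3 lo = child_bounds[(node * 8 + slot) * 2 + 0].xyz;
       vec3 hi = child_bounds[(node * 8 + slot) * 2 + 1].xyz;

       /* All 8 children are reduced at once instead of in a loop. */
       vec3 merged_lo = subgroupClusteredMin(lo, 8u);
       vec3 merged_hi = subgroupClusteredMax(hi, 8u);

       /* The first lane of each cluster writes the parent's bounds. */
       if (slot == 0) {
          node_bounds[node * 2 + 0] = vec4(merged_lo, 0.0);
          node_bounds[node * 2 + 1] = vec4(merged_hi, 0.0);
       }
    }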
 src/amd/vulkan/bvh/build_interface.h         |  10 +
 src/amd/vulkan/bvh/bvh.h                     |   3 +
 src/amd/vulkan/bvh/header.comp               |   5 +
 src/amd/vulkan/bvh/update_gfx12.comp         | 250 +++++++++----------
 src/amd/vulkan/radv_acceleration_structure.c | 107 +++++++-
 5 files changed, 228 insertions(+), 147 deletions(-)

diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h
index a48f8b26793..d46d6571e67 100644
--- a/src/amd/vulkan/bvh/build_interface.h
+++ b/src/amd/vulkan/bvh/build_interface.h
@@ -55,6 +55,7 @@ struct header_args {
    REF(vk_ir_header) src;
    REF(radv_accel_struct_header) dst;
    uint32_t bvh_offset;
+   uint32_t internal_nodes_offset;
    uint32_t instance_count;
 };
 
@@ -68,4 +69,13 @@ struct update_args {
    vk_bvh_geometry_data geom_data;
 };
 
+struct update_gfx12_args {
+   REF(radv_accel_struct_header) src;
+   REF(radv_accel_struct_header) dst;
+   REF(vk_bvh_geometry_data) geom_data;
+   REF(vk_aabb) bounds;
+   REF(uint32_t) internal_ready_count;
+   uint32_t leaf_node_count;
+};
+
 #endif /* BUILD_INTERFACE_H */
diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h
index 162242f88a3..07ad8190b77 100644
--- a/src/amd/vulkan/bvh/bvh.h
+++ b/src/amd/vulkan/bvh/bvh.h
@@ -51,6 +51,9 @@ struct radv_accel_struct_header {
    uint32_t reserved;
    vk_aabb aabb;
 
+   /* GFX12 */
+   uint32_t update_dispatch_size[3];
+
    /* Everything after this gets either updated/copied from the CPU or written by header.comp. */
    uint64_t compacted_size;
    uint64_t serialization_size;
diff --git a/src/amd/vulkan/bvh/header.comp b/src/amd/vulkan/bvh/header.comp
index 509e874cac3..7b5771aa299 100644
--- a/src/amd/vulkan/bvh/header.comp
+++ b/src/amd/vulkan/bvh/header.comp
@@ -29,6 +29,11 @@ layout(push_constant) uniform CONSTS
 void
 main(void)
 {
+   DEREF(args.dst).update_dispatch_size[0] =
+      ((DEREF(args.src).dst_node_offset - args.internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1) * 8;
+   DEREF(args.dst).update_dispatch_size[1] = 1;
+   DEREF(args.dst).update_dispatch_size[2] = 1;
+
    uint32_t compacted_size = args.bvh_offset + DEREF(args.src).dst_node_offset;
 
    uint32_t serialization_size =
diff --git a/src/amd/vulkan/bvh/update_gfx12.comp b/src/amd/vulkan/bvh/update_gfx12.comp
index 284bd4d769e..a2898420e53 100644
--- a/src/amd/vulkan/bvh/update_gfx12.comp
+++ b/src/amd/vulkan/bvh/update_gfx12.comp
@@ -17,22 +17,32 @@
 #extension GL_EXT_buffer_reference : require
 #extension GL_EXT_buffer_reference2 : require
 #extension GL_KHR_memory_scope_semantics : require
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_clustered : require
 
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 
 #include "build_interface.h"
+#include "invocation_cluster.h"
 #include "update.h"
 
 layout(push_constant) uniform CONSTS
 {
-   update_args args;
+   update_gfx12_args args;
 };
 
 uint32_t
-fetch_parent_node(VOID_REF bvh, uint32_t node)
+read_bits(VOID_REF data, uint32_t start, uint32_t count)
 {
-   uint64_t addr = bvh - node / 16 * 4 - 4;
-   return DEREF(REF(uint32_t)(addr));
+   uint32_t shift = start % 32;
+   uint32_t lower = DEREF(INDEX(uint32_t, data, start / 32)) >> shift;
+   uint32_t upper = 0;
+   if (shift != 0 && shift + count > 32)
+      upper = DEREF(INDEX(uint32_t, data, start / 32 + 1)) << (32 - shift);
+   uint32_t total = lower | upper;
+   return count != 32 ? total & ((1u << count) - 1u) : total;
 }
 
 void
@@ -45,110 +55,92 @@ main()
 
    VOID_REF leaf_node_offsets = OFFSET(args.src, DEREF(args.src).leaf_node_offsets_offset);
 
-   uint32_t leaf_node_size;
-   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
-      leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
-   else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR)
-      leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
-   else
-      leaf_node_size = SIZEOF(radv_gfx12_instance_node) + SIZEOF(radv_gfx12_instance_node_user_data);
-
-   uint32_t leaf_node_id = args.geom_data.first_id + gl_GlobalInvocationID.x;
+   uint32_t leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
 
    uint32_t first_leaf_offset = id_to_offset(RADV_BVH_ROOT_NODE) + SIZEOF(radv_gfx12_box_node);
+   uint32_t internal_nodes_offset = first_leaf_offset + args.leaf_node_count * leaf_node_size;
 
-   uint32_t dst_offset = DEREF(INDEX(uint32_t, leaf_node_offsets, leaf_node_id));
-   VOID_REF dst_ptr = OFFSET(dst_bvh, dst_offset);
-   uint32_t src_offset = gl_GlobalInvocationID.x * args.geom_data.stride;
+   uint32_t node_count = DEREF(args.src).update_dispatch_size[0] / 8;
+   uint32_t node_index = node_count - 1 - gl_GlobalInvocationID.x / 8;
 
-   vk_aabb bounds;
-   bool is_active;
-   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
-      is_active = radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x, true);
+   bool is_root_node = node_index == 0;
+
+   /* Each invocation cluster updates one internal node. */
+   radv_invocation_cluster cluster;
+   radv_invocation_cluster_init(cluster, 8);
+
+   uint32_t compacted_size = uint32_t(DEREF(args.src).compacted_size);
+
+   uint32_t node_offset = internal_nodes_offset + SIZEOF(radv_gfx12_box_node) * (node_index - 1);
+   if (is_root_node)
+      node_offset = id_to_offset(RADV_BVH_ROOT_NODE);
+
+   radv_gfx12_box_node src_node = DEREF(REF(radv_gfx12_box_node) OFFSET(src_bvh, node_offset));
+   REF(radv_gfx12_box_node) dst_node = REF(radv_gfx12_box_node) OFFSET(dst_bvh, node_offset);
+
+   uint32_t valid_child_count_minus_one = src_node.child_count_exponents >> 28;
+
+   radv_gfx12_box_child child = src_node.children[cluster.invocation_index];
+   uint32_t child_type = (child.dword2 >> 24) & 0xf;
+
+   bool is_leaf_or_invalid = child_type == radv_bvh_node_triangle;
+   bool is_valid = cluster.invocation_index <= valid_child_count_minus_one;
+   bool is_leaf = is_leaf_or_invalid && is_valid;
+
+   uint32_t child_offset;
+   if (is_leaf_or_invalid) {
+      child_offset = id_to_offset(src_node.primitive_base_id);
+      uint32_t child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
+      child_offset += leaf_node_size * child_index;
    } else {
-      VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
-      is_active = radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x, true);
+      child_offset = id_to_offset(src_node.internal_base_id);
+      uint32_t child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
+      child_offset += SIZEOF(radv_gfx12_box_node) * child_index;
    }
 
-   if (!is_active)
-      return;
+   uint32_t child_index = (child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node);
 
-   DEREF(INDEX(vk_aabb, args.leaf_bounds, (dst_offset - first_leaf_offset) / leaf_node_size)) = bounds;
-   memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
-                 gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+   bool is_ready = is_leaf_or_invalid;
+   while (true) {
+      if (!is_ready)
+         is_ready = DEREF(INDEX(uint32_t, args.internal_ready_count, child_index)) != 0;
 
-   uint32_t node_id = pack_node_id(dst_offset, 0);
-   uint32_t parent_id = fetch_parent_node(src_bvh, node_id);
-   uint32_t internal_nodes_offset = first_leaf_offset + args.leaf_node_count * leaf_node_size;
-   while (parent_id != RADV_BVH_INVALID_NODE) {
-      uint32_t offset = id_to_offset(parent_id);
+      if (radv_ballot(cluster, is_ready) != 0xff)
+         continue;
 
-      uint32_t parent_index = (offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1;
-      if (parent_id == RADV_BVH_ROOT_NODE)
-         parent_index = 0;
+      vk_aabb bounds;
+      bounds.min = vec3(INFINITY);
+      bounds.max = vec3(-INFINITY);
 
-      /* Make accesses to internal nodes in dst_bvh available and visible */
-      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
-                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+      if (is_leaf) {
+         VOID_REF src_leaf_node = OFFSET(src_bvh, child_offset);
+         uint32_t indices_midpoint = read_bits(src_leaf_node, 42, 10);
+         uint32_t geometry_index = read_bits(src_leaf_node, indices_midpoint - 28, 28);
+         uint32_t primitive_index = read_bits(src_leaf_node, indices_midpoint, 28);
 
-      REF(radv_gfx12_box_node) src_node = REF(radv_gfx12_box_node) OFFSET(src_bvh, offset);
-      REF(radv_gfx12_box_node) dst_node = REF(radv_gfx12_box_node) OFFSET(dst_bvh, offset);
+         vk_bvh_geometry_data geom_data = DEREF(INDEX(vk_bvh_geometry_data, args.geom_data, geometry_index));
 
-      uint32_t valid_child_count_minus_one = DEREF(src_node).child_count_exponents >> 28;
-
-      /* Check if all children have been processed. As this is an atomic the last path coming from
-       * a child will pass here, while earlier paths break.
-       */
-      uint32_t ready_child_count = atomicAdd(
-         DEREF(INDEX(uint32_t, args.internal_ready_count, parent_index)), 1, gl_ScopeDevice, gl_StorageSemanticsBuffer,
-         gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
-
-      if (ready_child_count != valid_child_count_minus_one)
-         break;
-
-      uint32_t child_internal_id = DEREF(src_node).internal_base_id;
-      uint32_t child_primitive_id = DEREF(src_node).primitive_base_id;
-
-      if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE)) {
-         DEREF(dst_node).internal_base_id = child_internal_id;
-         DEREF(dst_node).primitive_base_id = child_primitive_id;
+         VOID_REF dst_leaf_node = OFFSET(dst_bvh, child_offset);
+         if (geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
+            radv_build_triangle(bounds, dst_leaf_node, geom_data, primitive_index, true);
+         } else {
+            VOID_REF src_ptr = OFFSET(geom_data.data, primitive_index * geom_data.stride);
+            radv_build_aabb(bounds, src_ptr, dst_leaf_node, geometry_index, primitive_index, true);
+         }
+      } else if (is_valid) {
+         bounds = DEREF(INDEX(vk_aabb, args.bounds, child_index));
       }
 
-      uint32_t child_offsets[8];
-      vk_aabb total_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY));
-      for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
-         radv_gfx12_box_child child = DEREF(src_node).children[i];
-         uint32_t child_type = (child.dword2 >> 24) & 0xf;
-         uint32_t child_size_id = (child.dword2 >> 28) * RADV_GFX12_BVH_NODE_SIZE / 8;
+      vk_aabb total_bounds;
+      total_bounds.min.x = subgroupClusteredMin(bounds.min.x, 8);
+      total_bounds.min.y = subgroupClusteredMin(bounds.min.y, 8);
+      total_bounds.min.z = subgroupClusteredMin(bounds.min.z, 8);
+      total_bounds.max.x = subgroupClusteredMax(bounds.max.x, 8);
+      total_bounds.max.y = subgroupClusteredMax(bounds.max.y, 8);
+      total_bounds.max.z = subgroupClusteredMax(bounds.max.z, 8);
 
-         uint32_t child_id;
-         if (child_type == radv_bvh_node_box32) {
-            child_id = child_internal_id;
-            child_internal_id += child_size_id;
-         } else {
-            child_id = child_primitive_id;
-            child_primitive_id += child_size_id;
-         }
-
-         child_offsets[i] = id_to_offset(child_id);
-
-         uint32_t child_offset = child_offsets[i];
-         vk_aabb child_aabb;
-         if (child_offset == dst_offset) {
-            child_aabb = bounds;
-         } else {
-            uint32_t child_index;
-            if (child_offset >= internal_nodes_offset) {
-               child_index =
-                  (child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1 + args.leaf_node_count;
-            } else {
-               child_index = (child_offset - first_leaf_offset) / leaf_node_size;
-            }
-
-            child_aabb = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index));
-         }
-
-         total_bounds.min = min(total_bounds.min, child_aabb.min);
-         total_bounds.max = max(total_bounds.max, child_aabb.max);
+      if (!is_root_node) {
+         DEREF(INDEX(vk_aabb, args.bounds, node_index - 1)) = total_bounds;
+         DEREF(INDEX(uint32_t, args.internal_ready_count, node_index - 1)) = 1;
       }
 
       vec3 origin = total_bounds.min;
@@ -157,62 +149,44 @@ main()
       extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
      uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
 
-      DEREF(dst_node).origin = origin;
-      DEREF(dst_node).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
-                                              (extent_exponents.z << 16) | (valid_child_count_minus_one << 28);
-      if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE))
-         DEREF(dst_node).obb_matrix_index = 0x7f;
-
-      for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
-         uint32_t child_offset = child_offsets[i];
-         vk_aabb child_aabb;
-         if (child_offset == dst_offset) {
-            child_aabb = bounds;
-         } else {
-            uint32_t child_index;
-            if (child_offset >= internal_nodes_offset) {
-               child_index =
-                  (child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1 + args.leaf_node_count;
-            } else {
-               child_index = (child_offset - first_leaf_offset) / leaf_node_size;
-            }
-
-            child_aabb = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index));
-         }
-
-         radv_gfx12_box_child child = DEREF(src_node).children[i];
+      if (cluster.invocation_index == 0) {
+         DEREF(dst_node).origin = origin;
+         DEREF(dst_node).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
+                                                 (extent_exponents.z << 16) | (valid_child_count_minus_one << 28);
+         if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE))
+            DEREF(dst_node).obb_matrix_index = 0x7f;
+      }
 
+      if (is_valid) {
         radv_gfx12_box_child box_child;
-         box_child.dword0 =
-            (child.dword0 & 0xFF000000) |
-            min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
-            (min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
+         box_child.dword0 = (child.dword0 & 0xFF000000) |
+                            min(uint32_t(floor((bounds.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
+                            (min(uint32_t(floor((bounds.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
         box_child.dword1 = (child.dword1 & 0xFF000000) |
-                            min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
-                            (min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12);
+                            min(uint32_t(floor((bounds.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
+                            (min(uint32_t(ceil((bounds.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12);
         box_child.dword2 = (child.dword2 & 0xFF000000) |
-                            min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
-                            (min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12);
-         DEREF(dst_node).children[i] = box_child;
+                            min(uint32_t(ceil((bounds.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
+                            (min(uint32_t(ceil((bounds.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12);
+         DEREF(dst_node).children[cluster.invocation_index] = box_child;
       }
 
-      if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE)) {
-         for (uint32_t i = valid_child_count_minus_one + 1; i < 8; i++) {
-            radv_gfx12_box_child null_child;
-            null_child.dword0 = 0xffffffff;
-            null_child.dword1 = 0xfff;
-            null_child.dword2 = 0;
-            DEREF(dst_node).children[i] = null_child;
-         }
+      if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE) && !is_valid) {
+         radv_gfx12_box_child null_child;
+         null_child.dword0 = 0xffffffff;
+         null_child.dword1 = 0xfff;
+         null_child.dword2 = 0;
+         DEREF(dst_node).children[cluster.invocation_index] = null_child;
       }
 
-      if (parent_id == RADV_BVH_ROOT_NODE)
+      if (is_root_node)
         DEREF(args.dst).aabb = total_bounds;
 
-      DEREF(INDEX(vk_aabb, args.leaf_bounds, parent_index + args.leaf_node_count)) = total_bounds;
-
-      parent_id = fetch_parent_node(src_bvh, parent_id);
+      /* Make changes to the children's BVH offset value available to the other invocations. */
+      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+      break;
    }
 }
diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c
index f81541fca39..ce32880f657 100644
--- a/src/amd/vulkan/radv_acceleration_structure.c
+++ b/src/amd/vulkan/radv_acceleration_structure.c
@@ -59,6 +59,7 @@ struct acceleration_structure_layout {
 };
 
 struct update_scratch_layout {
+   uint32_t geometry_data_offset;
    uint32_t bounds_offsets;
    uint32_t internal_ready_count_offset;
    uint32_t size;
@@ -159,14 +160,26 @@ radv_get_acceleration_structure_layout(struct radv_device *device, uint32_t leaf
 }
 
 static void
-radv_get_update_scratch_layout(struct radv_device *device, uint32_t leaf_count, struct update_scratch_layout *scratch)
+radv_get_update_scratch_layout(struct radv_device *device,
+                               const VkAccelerationStructureBuildGeometryInfoKHR *build_info, uint32_t leaf_count,
+                               struct update_scratch_layout *scratch)
 {
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+
    uint32_t internal_count = MAX2(leaf_count, 2) - 1;
 
    uint32_t offset = 0;
 
-   scratch->bounds_offsets = offset;
-   offset += sizeof(vk_aabb) * (leaf_count + internal_count);
+   if (radv_use_bvh8(pdev)) {
+      scratch->geometry_data_offset = offset;
+      offset += sizeof(struct vk_bvh_geometry_data) * build_info->geometryCount;
+
+      scratch->bounds_offsets = offset;
+      offset += sizeof(vk_aabb) * internal_count;
+   } else {
+      scratch->bounds_offsets = offset;
+      offset += sizeof(vk_aabb) * leaf_count;
+   }
 
    scratch->internal_ready_count_offset = offset;
    offset += sizeof(uint32_t) * internal_count;
@@ -362,12 +375,13 @@ radv_get_as_size(VkDevice _device, const VkAccelerationStructureBuildGeometryInf
 }
 
 static VkDeviceSize
-radv_get_update_scratch_size(struct vk_device *vk_device, uint32_t leaf_count)
+radv_get_update_scratch_size(struct vk_device *vk_device, const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
+                             uint32_t leaf_count)
 {
    struct radv_device *device = container_of(vk_device, struct radv_device, vk);
 
    struct update_scratch_layout scratch;
-   radv_get_update_scratch_layout(device, leaf_count, &scratch);
+   radv_get_update_scratch_layout(device, build_info, leaf_count, &scratch);
 
    return scratch.size;
 }
@@ -594,6 +608,7 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui
       .src = intermediate_header_addr,
      .dst = vk_acceleration_structure_get_va(dst),
      .bvh_offset = layout.bvh_offset,
+      .internal_nodes_offset = layout.internal_nodes_offset - layout.bvh_offset,
      .instance_count = instance_count,
    };
    radv_bvh_build_set_args(commandBuffer, &args, sizeof(args));
@@ -671,18 +686,47 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui
 }
 
 static void
-radv_init_update_scratch(VkCommandBuffer commandBuffer, VkDeviceAddress scratch, uint32_t leaf_count,
+radv_init_update_scratch(VkCommandBuffer commandBuffer, VkDeviceAddress scratch,
+                         const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
+                         const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, uint32_t leaf_count,
                          struct vk_acceleration_structure *src_as, struct vk_acceleration_structure *dst_as)
 {
    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
+   const struct radv_physical_device *pdev = radv_device_physical(device);
 
    struct update_scratch_layout layout;
-   radv_get_update_scratch_layout(device, leaf_count, &layout);
+   radv_get_update_scratch_layout(device, build_info, leaf_count, &layout);
 
    /* Prepare ready counts for internal nodes */
    radv_fill_memory(cmd_buffer, scratch + layout.internal_ready_count_offset,
                     layout.size - layout.internal_ready_count_offset, 0x0, RADV_COPY_FLAGS_DEVICE_LOCAL);
+
+   if (radv_use_bvh8(pdev)) {
+      uint32_t data_size = sizeof(struct vk_bvh_geometry_data) * build_info->geometryCount;
+      struct vk_bvh_geometry_data *data = malloc(data_size);
+      if (!data) {
+         vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
+         return;
+      }
+
+      uint32_t first_id = 0;
+      for (uint32_t i = 0; i < build_info->geometryCount; i++) {
+         const VkAccelerationStructureGeometryKHR *geom =
+            build_info->pGeometries ? &build_info->pGeometries[i] : build_info->ppGeometries[i];
+
+         const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &build_range_infos[i];
+
+         data[i] = vk_fill_geometry_data(build_info->type, first_id, i, geom, build_range_info);
+
+         first_id += build_range_info->primitiveCount;
+      }
+
+      radv_update_memory(cmd_buffer, scratch + layout.geometry_data_offset, data_size, data,
+                         RADV_COPY_FLAGS_DEVICE_LOCAL);
+
+      free(data);
+   }
 }
 
 static void
@@ -742,7 +786,7 @@ radv_update_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuild
    }
 
    struct update_scratch_layout layout;
-   radv_get_update_scratch_layout(device, leaf_count, &layout);
+   radv_get_update_scratch_layout(device, build_info, leaf_count, &layout);
 
    struct update_args update_consts = {
      .src = vk_acceleration_structure_get_va(src),
@@ -769,6 +813,50 @@ radv_update_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuild
    }
 }
 
+static void
+radv_update_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
+                     const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, uint32_t leaf_count,
+                     uint32_t key, struct vk_acceleration_structure *src, struct vk_acceleration_structure *dst)
+{
+   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
+
+   if (src != dst) {
+      struct acceleration_structure_layout layout;
+      radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout);
+
+      /* Copy header/metadata */
+      const uint64_t src_va = vk_acceleration_structure_get_va(src);
+      const uint64_t dst_va = vk_acceleration_structure_get_va(dst);
+
+      radv_copy_memory(cmd_buffer, src_va, dst_va, layout.bvh_offset, RADV_COPY_FLAGS_DEVICE_LOCAL,
+                       RADV_COPY_FLAGS_DEVICE_LOCAL);
+   }
+
+   struct update_scratch_layout layout;
+   radv_get_update_scratch_layout(device, build_info, leaf_count, &layout);
+
+   struct update_gfx12_args update_consts = {
+      .src = vk_acceleration_structure_get_va(src),
+      .dst = vk_acceleration_structure_get_va(dst),
+      .geom_data = build_info->scratchData.deviceAddress + layout.geometry_data_offset,
+      .bounds = build_info->scratchData.deviceAddress + layout.bounds_offsets,
+      .internal_ready_count = build_info->scratchData.deviceAddress + layout.internal_ready_count_offset,
+      .leaf_node_count = leaf_count,
+   };
+
+   radv_bvh_build_set_args(commandBuffer, &update_consts, sizeof(update_consts));
+
+   struct radv_dispatch_info dispatch = {
+      .ordered = true,
+      .unaligned = true,
+      .indirect_va =
+         vk_acceleration_structure_get_va(src) + offsetof(struct radv_accel_struct_header, update_dispatch_size[0]),
+   };
+
+   radv_compute_dispatch(cmd_buffer, &dispatch);
+}
+
 static const struct radix_sort_vk_target_config radix_sort_config = {
    .keyval_dwords = 2,
    .fill.workgroup_size_log2 = 7,
@@ -840,13 +928,14 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
       .init_update_scratch = radv_init_update_scratch,
      .get_update_key[0] = radv_get_update_key,
      .update_bind_pipeline[0] = radv_update_bind_pipeline,
-      .update_as[0] = radv_update_as,
    };
 
    if (radv_use_bvh8(pdev)) {
+      device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as_gfx12;
      device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline_gfx12;
      device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as_gfx12;
    } else {
+      device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as;
      device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline;
      device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as;
      device->meta_state.accel_struct_build.build_ops.leaf_spirv_override = leaf_spv;