radv: Use subgroup ops for BVH updates on GFX12

This patch changes the update code to launch 8 invocations for every
internal node. Each internal node updates its child leaf nodes using the
geometry index and primitive index stored inside the primitive node.

Processing 8 child nodes in parallel is faster than looping over them.
Moving to one dispatch that updates all nodes in one go lets us get rid
of atomics and will also enable updatable BVHs to use pair compression.

Improves Elden Ring (high settings, max RT settings, 1080p) by around
10%.

Reviewed-by: Natalie Vock <natalie.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34601>
Konstantin Seurer 2025-05-11 11:02:33 +02:00
parent c6fdf11303
commit 2d48b2cb47
5 changed files with 228 additions and 147 deletions

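For context before the diffs: header.comp below stores an indirect dispatch size of eight invocations per internal node, and each 8-lane invocation cluster then refits one box node, with every lane handling one child so the parent bounds fall out of a clustered subgroup reduction instead of a serial loop. A minimal sketch of that reduction pattern, with illustrative buffer bindings and names rather than the in-tree vk_aabb/buffer-reference interface:

#version 460
#extension GL_KHR_shader_subgroup_clustered : require

layout(local_size_x = 64) in;

/* Illustrative layout; the patch reads vk_aabb through buffer references.
 * (std430 rounds each vec3 up to 16 bytes, which is fine for a sketch.) */
struct aabb {
   vec3 lo;
   vec3 hi;
};

layout(set = 0, binding = 0) readonly buffer ChildBounds { aabb child_bounds[]; };
layout(set = 0, binding = 1) writeonly buffer NodeBounds { aabb node_bounds[]; };

void main()
{
   /* Eight consecutive lanes form one cluster per box node; assumes the
    * usual linear mapping of invocations to subgroup lanes. */
   uint node = gl_GlobalInvocationID.x / 8u;
   uint lane = gl_GlobalInvocationID.x % 8u;
   aabb b = child_bounds[node * 8u + lane];

   /* Component-wise min/max across the cluster replaces a serial loop
    * over up to 8 children. */
   aabb total;
   total.lo.x = subgroupClusteredMin(b.lo.x, 8u);
   total.lo.y = subgroupClusteredMin(b.lo.y, 8u);
   total.lo.z = subgroupClusteredMin(b.lo.z, 8u);
   total.hi.x = subgroupClusteredMax(b.hi.x, 8u);
   total.hi.y = subgroupClusteredMax(b.hi.y, 8u);
   total.hi.z = subgroupClusteredMax(b.hi.z, 8u);

   /* One lane per cluster writes the refit parent bounds. */
   if (lane == 0u)
      node_bounds[node] = total;
}

Keeping one node per 8-lane cluster also means a single ready flag per node is enough for parent/child ordering, which is how the patch replaces the old atomicAdd-based walk up the tree.
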
View file

@@ -55,6 +55,7 @@ struct header_args {
REF(vk_ir_header) src;
REF(radv_accel_struct_header) dst;
uint32_t bvh_offset;
uint32_t internal_nodes_offset;
uint32_t instance_count;
};
@@ -68,4 +69,13 @@ struct update_args {
vk_bvh_geometry_data geom_data;
};
struct update_gfx12_args {
REF(radv_accel_struct_header) src;
REF(radv_accel_struct_header) dst;
REF(vk_bvh_geometry_data) geom_data;
REF(vk_aabb) bounds;
REF(uint32_t) internal_ready_count;
uint32_t leaf_node_count;
};
#endif /* BUILD_INTERFACE_H */

View file

@@ -51,6 +51,9 @@ struct radv_accel_struct_header {
uint32_t reserved;
vk_aabb aabb;
/* GFX12 */
uint32_t update_dispatch_size[3];
/* Everything after this gets either updated/copied from the CPU or written by header.comp. */
uint64_t compacted_size;
uint64_t serialization_size;

View file

@@ -29,6 +29,11 @@ layout(push_constant) uniform CONSTS
void
main(void)
{
DEREF(args.dst).update_dispatch_size[0] =
((DEREF(args.src).dst_node_offset - args.internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1) * 8;
DEREF(args.dst).update_dispatch_size[1] = 1;
DEREF(args.dst).update_dispatch_size[2] = 1;
uint32_t compacted_size = args.bvh_offset + DEREF(args.src).dst_node_offset;
uint32_t serialization_size =

View file

@@ -17,22 +17,32 @@
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_shuffle : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_clustered : require
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
#include "build_interface.h"
#include "invocation_cluster.h"
#include "update.h"
layout(push_constant) uniform CONSTS
{
update_args args;
update_gfx12_args args;
};
uint32_t
fetch_parent_node(VOID_REF bvh, uint32_t node)
read_bits(VOID_REF data, uint32_t start, uint32_t count)
{
uint64_t addr = bvh - node / 16 * 4 - 4;
return DEREF(REF(uint32_t)(addr));
uint32_t shift = start % 32;
uint32_t lower = DEREF(INDEX(uint32_t, data, start / 32)) >> shift;
uint32_t upper = 0;
if (shift != 0 && shift + count > 32)
upper = DEREF(INDEX(uint32_t, data, start / 32 + 1)) << (32 - shift);
uint32_t total = lower | upper;
return count != 32 ? total & ((1u << count) - 1u) : total;
}
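
read_bits above pulls a count-bit field starting at bit start out of a dword stream, stitching two words together whenever the field straddles a 32-bit boundary; main() below uses it to recover the packed geometry and primitive indices from a GFX12 primitive node. A standalone restatement over a local array, with a worked straddling case (the in-tree version reads through buffer references):

#version 460

layout(local_size_x = 1) in;

layout(set = 0, binding = 0) writeonly buffer Result { uint result; };

/* Same logic as read_bits above, restated over a local array. */
uint read_bits_local(uint data[2], uint start, uint count)
{
   uint shift = start % 32u;
   uint lower = data[start / 32u] >> shift;
   uint upper = 0u;
   /* Only touch the next dword when the field actually straddles it. */
   if (shift != 0u && shift + count > 32u)
      upper = data[start / 32u + 1u] << (32u - shift);
   uint total = lower | upper;
   return count != 32u ? total & ((1u << count) - 1u) : total;
}

void main()
{
   uint words[2] = uint[2](0xAAAAAAAAu, 0x55555555u);
   /* Bits 28..35 straddle the dword boundary: the low nibble comes from
    * the top of word 0 (0xA), the high nibble from the bottom of word 1
    * (0x5), so the result is 0x5A. */
   result = read_bits_local(words, 28u, 8u);
}
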
void
@@ -45,110 +55,92 @@ main()
VOID_REF leaf_node_offsets = OFFSET(args.src, DEREF(args.src).leaf_node_offsets_offset);
uint32_t leaf_node_size;
if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR)
leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
else
leaf_node_size = SIZEOF(radv_gfx12_instance_node) + SIZEOF(radv_gfx12_instance_node_user_data);
uint32_t leaf_node_id = args.geom_data.first_id + gl_GlobalInvocationID.x;
uint32_t leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
uint32_t first_leaf_offset = id_to_offset(RADV_BVH_ROOT_NODE) + SIZEOF(radv_gfx12_box_node);
uint32_t internal_nodes_offset = first_leaf_offset + args.leaf_node_count * leaf_node_size;
uint32_t dst_offset = DEREF(INDEX(uint32_t, leaf_node_offsets, leaf_node_id));
VOID_REF dst_ptr = OFFSET(dst_bvh, dst_offset);
uint32_t src_offset = gl_GlobalInvocationID.x * args.geom_data.stride;
uint32_t node_count = DEREF(args.src).update_dispatch_size[0] / 8;
uint32_t node_index = node_count - 1 - gl_GlobalInvocationID.x / 8;
vk_aabb bounds;
bool is_active;
if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
is_active = radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x, true);
bool is_root_node = node_index == 0;
/* Each invocation cluster updates one internal node. */
radv_invocation_cluster cluster;
radv_invocation_cluster_init(cluster, 8);
uint32_t compacted_size = uint32_t(DEREF(args.src).compacted_size);
uint32_t node_offset = internal_nodes_offset + SIZEOF(radv_gfx12_box_node) * (node_index - 1);
if (is_root_node)
node_offset = id_to_offset(RADV_BVH_ROOT_NODE);
radv_gfx12_box_node src_node = DEREF(REF(radv_gfx12_box_node) OFFSET(src_bvh, node_offset));
REF(radv_gfx12_box_node) dst_node = REF(radv_gfx12_box_node) OFFSET(dst_bvh, node_offset);
uint32_t valid_child_count_minus_one = src_node.child_count_exponents >> 28;
radv_gfx12_box_child child = src_node.children[cluster.invocation_index];
uint32_t child_type = (child.dword2 >> 24) & 0xf;
bool is_leaf_or_invalid = child_type == radv_bvh_node_triangle;
bool is_valid = cluster.invocation_index <= valid_child_count_minus_one;
bool is_leaf = is_leaf_or_invalid && is_valid;
uint32_t child_offset;
if (is_leaf_or_invalid) {
child_offset = id_to_offset(src_node.primitive_base_id);
uint32_t child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
child_offset += leaf_node_size * child_index;
} else {
VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
is_active = radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x, true);
child_offset = id_to_offset(src_node.internal_base_id);
uint32_t child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
child_offset += SIZEOF(radv_gfx12_box_node) * child_index;
}
if (!is_active)
return;
uint32_t child_index = (child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node);
DEREF(INDEX(vk_aabb, args.leaf_bounds, (dst_offset - first_leaf_offset) / leaf_node_size)) = bounds;
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
bool is_ready = is_leaf_or_invalid;
while (true) {
if (!is_ready)
is_ready = DEREF(INDEX(uint32_t, args.internal_ready_count, child_index)) != 0;
uint32_t node_id = pack_node_id(dst_offset, 0);
uint32_t parent_id = fetch_parent_node(src_bvh, node_id);
uint32_t internal_nodes_offset = first_leaf_offset + args.leaf_node_count * leaf_node_size;
while (parent_id != RADV_BVH_INVALID_NODE) {
uint32_t offset = id_to_offset(parent_id);
if (radv_ballot(cluster, is_ready) != 0xff)
continue;
uint32_t parent_index = (offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1;
if (parent_id == RADV_BVH_ROOT_NODE)
parent_index = 0;
vk_aabb bounds;
bounds.min = vec3(INFINITY);
bounds.max = vec3(-INFINITY);
/* Make accesses to internal nodes in dst_bvh available and visible */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
if (is_leaf) {
VOID_REF src_leaf_node = OFFSET(src_bvh, child_offset);
uint32_t indices_midpoint = read_bits(src_leaf_node, 42, 10);
uint32_t geometry_index = read_bits(src_leaf_node, indices_midpoint - 28, 28);
uint32_t primitive_index = read_bits(src_leaf_node, indices_midpoint, 28);
REF(radv_gfx12_box_node) src_node = REF(radv_gfx12_box_node) OFFSET(src_bvh, offset);
REF(radv_gfx12_box_node) dst_node = REF(radv_gfx12_box_node) OFFSET(dst_bvh, offset);
vk_bvh_geometry_data geom_data = DEREF(INDEX(vk_bvh_geometry_data, args.geom_data, geometry_index));
uint32_t valid_child_count_minus_one = DEREF(src_node).child_count_exponents >> 28;
/* Check if all children have been processed. As this is an atomic the last path coming from
* a child will pass here, while earlier paths break.
*/
uint32_t ready_child_count = atomicAdd(
DEREF(INDEX(uint32_t, args.internal_ready_count, parent_index)), 1, gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
if (ready_child_count != valid_child_count_minus_one)
break;
uint32_t child_internal_id = DEREF(src_node).internal_base_id;
uint32_t child_primitive_id = DEREF(src_node).primitive_base_id;
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE)) {
DEREF(dst_node).internal_base_id = child_internal_id;
DEREF(dst_node).primitive_base_id = child_primitive_id;
VOID_REF dst_leaf_node = OFFSET(dst_bvh, child_offset);
if (geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
radv_build_triangle(bounds, dst_leaf_node, geom_data, primitive_index, true);
} else {
VOID_REF src_ptr = OFFSET(geom_data.data, primitive_index * geom_data.stride);
radv_build_aabb(bounds, src_ptr, dst_leaf_node, geometry_index, primitive_index, true);
}
} else if (is_valid) {
bounds = DEREF(INDEX(vk_aabb, args.bounds, child_index));
}
uint32_t child_offsets[8];
vk_aabb total_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY));
for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
radv_gfx12_box_child child = DEREF(src_node).children[i];
uint32_t child_type = (child.dword2 >> 24) & 0xf;
uint32_t child_size_id = (child.dword2 >> 28) * RADV_GFX12_BVH_NODE_SIZE / 8;
vk_aabb total_bounds;
total_bounds.min.x = subgroupClusteredMin(bounds.min.x, 8);
total_bounds.min.y = subgroupClusteredMin(bounds.min.y, 8);
total_bounds.min.z = subgroupClusteredMin(bounds.min.z, 8);
total_bounds.max.x = subgroupClusteredMax(bounds.max.x, 8);
total_bounds.max.y = subgroupClusteredMax(bounds.max.y, 8);
total_bounds.max.z = subgroupClusteredMax(bounds.max.z, 8);
uint32_t child_id;
if (child_type == radv_bvh_node_box32) {
child_id = child_internal_id;
child_internal_id += child_size_id;
} else {
child_id = child_primitive_id;
child_primitive_id += child_size_id;
}
child_offsets[i] = id_to_offset(child_id);
uint32_t child_offset = child_offsets[i];
vk_aabb child_aabb;
if (child_offset == dst_offset) {
child_aabb = bounds;
} else {
uint32_t child_index;
if (child_offset >= internal_nodes_offset) {
child_index =
(child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1 + args.leaf_node_count;
} else {
child_index = (child_offset - first_leaf_offset) / leaf_node_size;
}
child_aabb = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index));
}
total_bounds.min = min(total_bounds.min, child_aabb.min);
total_bounds.max = max(total_bounds.max, child_aabb.max);
if (!is_root_node) {
DEREF(INDEX(vk_aabb, args.bounds, node_index - 1)) = total_bounds;
DEREF(INDEX(uint32_t, args.internal_ready_count, node_index - 1)) = 1;
}
vec3 origin = total_bounds.min;
@@ -157,62 +149,44 @@ main()
extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
DEREF(dst_node).origin = origin;
DEREF(dst_node).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
(extent_exponents.z << 16) | (valid_child_count_minus_one << 28);
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE))
DEREF(dst_node).obb_matrix_index = 0x7f;
for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
uint32_t child_offset = child_offsets[i];
vk_aabb child_aabb;
if (child_offset == dst_offset) {
child_aabb = bounds;
} else {
uint32_t child_index;
if (child_offset >= internal_nodes_offset) {
child_index =
(child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1 + args.leaf_node_count;
} else {
child_index = (child_offset - first_leaf_offset) / leaf_node_size;
}
child_aabb = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index));
}
radv_gfx12_box_child child = DEREF(src_node).children[i];
if (cluster.invocation_index == 0) {
DEREF(dst_node).origin = origin;
DEREF(dst_node).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
(extent_exponents.z << 16) | (valid_child_count_minus_one << 28);
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE))
DEREF(dst_node).obb_matrix_index = 0x7f;
}
if (is_valid) {
radv_gfx12_box_child box_child;
box_child.dword0 =
(child.dword0 & 0xFF000000) |
min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
(min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
box_child.dword0 = (child.dword0 & 0xFF000000) |
min(uint32_t(floor((bounds.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
(min(uint32_t(floor((bounds.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
box_child.dword1 =
(child.dword1 & 0xFF000000) |
min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
(min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12);
min(uint32_t(floor((bounds.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
(min(uint32_t(ceil((bounds.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12);
box_child.dword2 =
(child.dword2 & 0xFF000000) |
min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
(min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12);
DEREF(dst_node).children[i] = box_child;
min(uint32_t(ceil((bounds.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
(min(uint32_t(ceil((bounds.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12);
DEREF(dst_node).children[cluster.invocation_index] = box_child;
}
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE)) {
for (uint32_t i = valid_child_count_minus_one + 1; i < 8; i++) {
radv_gfx12_box_child null_child;
null_child.dword0 = 0xffffffff;
null_child.dword1 = 0xfff;
null_child.dword2 = 0;
DEREF(dst_node).children[i] = null_child;
}
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE) && !is_valid) {
radv_gfx12_box_child null_child;
null_child.dword0 = 0xffffffff;
null_child.dword1 = 0xfff;
null_child.dword2 = 0;
DEREF(dst_node).children[cluster.invocation_index] = null_child;
}
if (parent_id == RADV_BVH_ROOT_NODE)
if (is_root_node)
DEREF(args.dst).aabb = total_bounds;
DEREF(INDEX(vk_aabb, args.leaf_bounds, parent_index + args.leaf_node_count)) = total_bounds;
parent_id = fetch_parent_node(src_bvh, parent_id);
/* Make changes to the children's BVH offset value available to the other invocations. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
break;
}
}
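
One detail worth spelling out in the shader above: per-child offsets are reconstructed with a ballot trick. Each lane classifies its child as leaf or internal, and bitCount(ballot & ((1u << lane) - 1)) counts the lower lanes of the same kind (the patch ballots true under a divergent branch, so only same-kind lanes are active), which is exactly this child's index into the contiguous run behind primitive_base_id or internal_base_id. A sketch of the same rank computation using the equivalent subgroupBallotExclusiveBitCount builtin, with illustrative bindings and a whole-subgroup ballot where the patch works within an 8-lane cluster:

#version 460
#extension GL_KHR_shader_subgroup_ballot : require

layout(local_size_x = 64) in;

layout(set = 0, binding = 0) readonly buffer Kinds { uint is_leaf_in[]; };
layout(set = 0, binding = 1) writeonly buffer Ranks { uint rank_out[]; };

void main()
{
   bool is_leaf = is_leaf_in[gl_GlobalInvocationID.x] != 0u;

   /* One bit per lane that voted true. */
   uvec4 vote = subgroupBallot(is_leaf);

   /* Number of lower lanes that also voted true: this lane's rank in
    * the contiguous run of leaf children, i.e. the multiplier applied
    * to the leaf node size when rebuilding the child offset. */
   uint rank = subgroupBallotExclusiveBitCount(vote);

   rank_out[gl_GlobalInvocationID.x] = rank;
}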

View file

@@ -59,6 +59,7 @@ struct acceleration_structure_layout {
};
struct update_scratch_layout {
uint32_t geometry_data_offset;
uint32_t bounds_offsets;
uint32_t internal_ready_count_offset;
uint32_t size;
@@ -159,14 +160,26 @@ radv_get_acceleration_structure_layout(struct radv_device *device, uint32_t leaf
}
static void
radv_get_update_scratch_layout(struct radv_device *device, uint32_t leaf_count, struct update_scratch_layout *scratch)
radv_get_update_scratch_layout(struct radv_device *device,
const VkAccelerationStructureBuildGeometryInfoKHR *build_info, uint32_t leaf_count,
struct update_scratch_layout *scratch)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
uint32_t internal_count = MAX2(leaf_count, 2) - 1;
uint32_t offset = 0;
scratch->bounds_offsets = offset;
offset += sizeof(vk_aabb) * (leaf_count + internal_count);
if (radv_use_bvh8(pdev)) {
scratch->geometry_data_offset = offset;
offset += sizeof(struct vk_bvh_geometry_data) * build_info->geometryCount;
scratch->bounds_offsets = offset;
offset += sizeof(vk_aabb) * internal_count;
} else {
scratch->bounds_offsets = offset;
offset += sizeof(vk_aabb) * leaf_count;
}
scratch->internal_ready_count_offset = offset;
offset += sizeof(uint32_t) * internal_count;
@@ -362,12 +375,13 @@ radv_get_as_size(VkDevice _device, const VkAccelerationStructureBuildGeometryInf
}
static VkDeviceSize
radv_get_update_scratch_size(struct vk_device *vk_device, uint32_t leaf_count)
radv_get_update_scratch_size(struct vk_device *vk_device, const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
uint32_t leaf_count)
{
struct radv_device *device = container_of(vk_device, struct radv_device, vk);
struct update_scratch_layout scratch;
radv_get_update_scratch_layout(device, leaf_count, &scratch);
radv_get_update_scratch_layout(device, build_info, leaf_count, &scratch);
return scratch.size;
}
@@ -594,6 +608,7 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui
.src = intermediate_header_addr,
.dst = vk_acceleration_structure_get_va(dst),
.bvh_offset = layout.bvh_offset,
.internal_nodes_offset = layout.internal_nodes_offset - layout.bvh_offset,
.instance_count = instance_count,
};
radv_bvh_build_set_args(commandBuffer, &args, sizeof(args));
@@ -671,18 +686,47 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui
}
static void
radv_init_update_scratch(VkCommandBuffer commandBuffer, VkDeviceAddress scratch, uint32_t leaf_count,
radv_init_update_scratch(VkCommandBuffer commandBuffer, VkDeviceAddress scratch,
const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, uint32_t leaf_count,
struct vk_acceleration_structure *src_as, struct vk_acceleration_structure *dst_as)
{
VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
struct update_scratch_layout layout;
radv_get_update_scratch_layout(device, leaf_count, &layout);
radv_get_update_scratch_layout(device, build_info, leaf_count, &layout);
/* Prepare ready counts for internal nodes */
radv_fill_memory(cmd_buffer, scratch + layout.internal_ready_count_offset,
layout.size - layout.internal_ready_count_offset, 0x0, RADV_COPY_FLAGS_DEVICE_LOCAL);
if (radv_use_bvh8(pdev)) {
uint32_t data_size = sizeof(struct vk_bvh_geometry_data) * build_info->geometryCount;
struct vk_bvh_geometry_data *data = malloc(data_size);
if (!data) {
vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
return;
}
uint32_t first_id = 0;
for (uint32_t i = 0; i < build_info->geometryCount; i++) {
const VkAccelerationStructureGeometryKHR *geom =
build_info->pGeometries ? &build_info->pGeometries[i] : build_info->ppGeometries[i];
const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &build_range_infos[i];
data[i] = vk_fill_geometry_data(build_info->type, first_id, i, geom, build_range_info);
first_id += build_range_info->primitiveCount;
}
radv_update_memory(cmd_buffer, scratch + layout.geometry_data_offset, data_size, data,
RADV_COPY_FLAGS_DEVICE_LOCAL);
free(data);
}
}
static void
@@ -742,7 +786,7 @@ radv_update_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuild
}
struct update_scratch_layout layout;
radv_get_update_scratch_layout(device, leaf_count, &layout);
radv_get_update_scratch_layout(device, build_info, leaf_count, &layout);
struct update_args update_consts = {
.src = vk_acceleration_structure_get_va(src),
@@ -769,6 +813,50 @@ radv_update_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuild
}
}
static void
radv_update_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, uint32_t leaf_count,
uint32_t key, struct vk_acceleration_structure *src, struct vk_acceleration_structure *dst)
{
VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
if (src != dst) {
struct acceleration_structure_layout layout;
radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout);
/* Copy header/metadata */
const uint64_t src_va = vk_acceleration_structure_get_va(src);
const uint64_t dst_va = vk_acceleration_structure_get_va(dst);
radv_copy_memory(cmd_buffer, src_va, dst_va, layout.bvh_offset, RADV_COPY_FLAGS_DEVICE_LOCAL,
RADV_COPY_FLAGS_DEVICE_LOCAL);
}
struct update_scratch_layout layout;
radv_get_update_scratch_layout(device, build_info, leaf_count, &layout);
struct update_gfx12_args update_consts = {
.src = vk_acceleration_structure_get_va(src),
.dst = vk_acceleration_structure_get_va(dst),
.geom_data = build_info->scratchData.deviceAddress + layout.geometry_data_offset,
.bounds = build_info->scratchData.deviceAddress + layout.bounds_offsets,
.internal_ready_count = build_info->scratchData.deviceAddress + layout.internal_ready_count_offset,
.leaf_node_count = leaf_count,
};
radv_bvh_build_set_args(commandBuffer, &update_consts, sizeof(update_consts));
struct radv_dispatch_info dispatch = {
.ordered = true,
.unaligned = true,
.indirect_va =
vk_acceleration_structure_get_va(src) + offsetof(struct radv_accel_struct_header, update_dispatch_size[0]),
};
radv_compute_dispatch(cmd_buffer, &dispatch);
}
static const struct radix_sort_vk_target_config radix_sort_config = {
.keyval_dwords = 2,
.fill.workgroup_size_log2 = 7,
@@ -840,13 +928,14 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
.init_update_scratch = radv_init_update_scratch,
.get_update_key[0] = radv_get_update_key,
.update_bind_pipeline[0] = radv_update_bind_pipeline,
.update_as[0] = radv_update_as,
};
if (radv_use_bvh8(pdev)) {
device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as_gfx12;
device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline_gfx12;
device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as_gfx12;
} else {
device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as;
device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline;
device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as;
device->meta_state.accel_struct_build.build_ops.leaf_spirv_override = leaf_spv;