From 76031ba53dd96ebec73e8dd4e90ea322ece991e0 Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Tue, 1 Apr 2025 15:48:03 +0200 Subject: [PATCH] radv: Optimize the gfx12 encode shader Reviewed-by: Natalie Vock Part-of: --- src/amd/vulkan/bvh/encode_gfx12.comp | 447 ++++++++++--------- src/amd/vulkan/bvh/invocation_cluster.h | 39 ++ src/amd/vulkan/radv_acceleration_structure.c | 29 +- 3 files changed, 297 insertions(+), 218 deletions(-) create mode 100644 src/amd/vulkan/bvh/invocation_cluster.h diff --git a/src/amd/vulkan/bvh/encode_gfx12.comp b/src/amd/vulkan/bvh/encode_gfx12.comp index 08fbf8bf97b..805b662e6fd 100644 --- a/src/amd/vulkan/bvh/encode_gfx12.comp +++ b/src/amd/vulkan/bvh/encode_gfx12.comp @@ -18,14 +18,20 @@ #extension GL_EXT_buffer_reference : require #extension GL_EXT_buffer_reference2 : require #extension GL_KHR_memory_scope_semantics : require +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_clustered : require layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; #define GFX12 +#define USE_GLOBAL_SYNC #include "build_helpers.h" #include "build_interface.h" #include "encode.h" +#include "invocation_cluster.h" layout(push_constant) uniform CONSTS { @@ -39,15 +45,234 @@ set_parent(uint32_t child, uint32_t parent) DEREF(REF(uint32_t)(addr)) = parent; } +void +encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_internal_nodes, uint32_t node_index) +{ + /* Each invocation cluster encodes one internal node. */ + radv_invocation_cluster cluster; + radv_invocation_cluster_init(cluster, 8); + + REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, node_index); + vk_ir_box_node src = DEREF(src_node); + bool is_root_node = node_index == DEREF(args.header).ir_internal_node_count - 1; + + for (;;) { + /* Make changes to the current node's BVH offset value visible. */ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + + uint32_t bvh_offset; + if (cluster.invocation_index == 0) { + bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset; + } + bvh_offset = radv_read_invocation(cluster, 0, bvh_offset); + + if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) + continue; + + if (bvh_offset == VK_NULL_BVH_OFFSET) + break; + + REF(radv_gfx12_box_node) dst = REF(radv_gfx12_box_node)(args.output_base + (args.output_bvh_offset + bvh_offset)); + + uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32); + + uint32_t child = RADV_BVH_INVALID_NODE; + if (cluster.invocation_index < 2) + child = src.children[cluster.invocation_index]; + + while (true) { + uint32_t valid_children = radv_ballot(cluster, child != RADV_BVH_INVALID_NODE); + if ((valid_children & 0x80) != 0 || valid_children == 0) + break; + + float surface_area = -1.0; + bool is_valid_internal = child != RADV_BVH_INVALID_NODE && ir_id_to_type(child) == vk_ir_node_internal; + if (is_valid_internal) { + vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child))).aabb; + surface_area = aabb_surface_area(child_aabb); + } + + float max_surface_area = subgroupClusteredMax(surface_area, 8); + + uint32_t collapse_index = findLSB(radv_ballot(cluster, is_valid_internal && surface_area == max_surface_area)); + if (collapse_index == 0xffffffff) + break; + + uint32_t right; + if (cluster.invocation_index == collapse_index) { + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child)); + DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; + + uint32_t left = DEREF(child_node).children[0]; + right = DEREF(child_node).children[1]; + + if (left == RADV_BVH_INVALID_NODE) { + left = right; + right = RADV_BVH_INVALID_NODE; + } + + child = left; + } + right = radv_read_invocation(cluster, collapse_index, right); + + if (cluster.invocation_index == findMSB(valid_children) + 1) + child = right; + } + + bool is_valid = child != RADV_BVH_INVALID_NODE; + bool is_valid_primitive = is_valid && ir_id_to_type(child) != vk_ir_node_internal; + bool is_valid_internal = is_valid && ir_id_to_type(child) == vk_ir_node_internal; + + uint32_t child_leaf_node_count = bitCount(radv_ballot(cluster, is_valid_primitive)); + uint32_t child_internal_node_count = bitCount(radv_ballot(cluster, is_valid_internal)); + + uint32_t leaf_node_size; + switch (args.geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + case VK_GEOMETRY_TYPE_AABBS_KHR: + leaf_node_size = RADV_GFX12_BVH_NODE_SIZE; + break; + default: + /* instances */ + leaf_node_size = 2 * RADV_GFX12_BVH_NODE_SIZE; + break; + } + + uint32_t child_leaf_nodes_size = child_leaf_node_count * leaf_node_size; + uint32_t child_internal_nodes_size = child_internal_node_count * RADV_GFX12_BVH_NODE_SIZE; + + uint32_t dst_leaf_offset; + uint32_t dst_internal_offset; + if (cluster.invocation_index == 0) { + dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size); + dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size); + } + dst_leaf_offset = radv_read_invocation(cluster, 0, dst_leaf_offset); + dst_internal_offset = radv_read_invocation(cluster, 0, dst_internal_offset); + + uint32_t child_index = 0; + uint32_t dst_offset = 0; + if (is_valid_internal) { + child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1)); + dst_offset = dst_internal_offset + child_index * RADV_GFX12_BVH_NODE_SIZE; + + uint32_t offset = ir_id_to_offset(child); + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset); + DEREF(child_node).bvh_offset = dst_offset; + } + if (is_valid_primitive) { + child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1)); + dst_offset = dst_leaf_offset + child_index * leaf_node_size; + child_index += child_internal_node_count; + } + + vec3 origin = src.base.aabb.min; + vec3 extent = src.base.aabb.max - src.base.aabb.min; + + extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000); + uvec3 extent_exponents = floatBitsToUint(extent) >> 23; + + uint32_t valid_child_count = child_leaf_node_count + child_internal_node_count; + if (cluster.invocation_index == 0) { + DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0); + DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0); + DEREF(dst).origin = origin; + DEREF(dst).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) | + (extent_exponents.z << 16) | ((valid_child_count - 1) << 28); + DEREF(dst).obb_matrix_index = 0x7f; + } + + if (is_valid) { + uint32_t type = ir_id_to_type(child); + uint32_t offset = ir_id_to_offset(child); + + uint32_t child_node_size_128b = 1; + uint32_t encoded_type = 0; + uint32_t cull_mask = 0xff; + if (type == vk_ir_node_internal) { + encoded_type = 5; + } else { + /* Write leaf node offset. */ + uint32_t leaf_index = offset / ir_leaf_node_size; + REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset); + child_dst_offset = INDEX(uint32_t, child_dst_offset, leaf_index); + DEREF(child_dst_offset) = dst_offset; + + VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_offset; + + switch (args.geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { + vk_ir_triangle_node src_node = DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, offset))); + radv_encode_triangle_gfx12(dst_leaf_addr, src_node); + break; + } + case VK_GEOMETRY_TYPE_AABBS_KHR: { + vk_ir_aabb_node src_node = DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, offset))); + radv_encode_aabb_gfx12(dst_leaf_addr, src_node); + break; + } + default: + /* instances */ + encoded_type = 6; + child_node_size_128b = 2; + + vk_ir_instance_node src_node = DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, offset))); + radv_encode_instance_gfx12(dst_leaf_addr, src_node); + + cull_mask = src_node.custom_instance_and_mask >> 24; + + break; + } + } + + vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, offset)).aabb; + + radv_gfx12_box_child box_child; + /* TODO: subtree flags culling */ + box_child.dword0 = + min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) | + (min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12); + /* TODO: subtree mask culling */ + box_child.dword1 = + min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) | + (min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12) | + (cull_mask << 24); + box_child.dword2 = + min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) | + (min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12) | + (encoded_type << 24) | (child_node_size_128b << 28); + DEREF(dst).children[child_index] = box_child; + + set_parent(pack_node_id(dst_offset, encoded_type), node_id); + } else { + child_index = + bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1)) + valid_child_count; + radv_gfx12_box_child null_child; + null_child.dword0 = 0xffffffff; + null_child.dword1 = 0xfff; + null_child.dword2 = 0; + DEREF(dst).children[child_index] = null_child; + } + + /* Make changes to the children's BVH offset value available to the other invocations. */ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + break; + } + + if (is_root_node && cluster.invocation_index == 0) { + REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(args.output_base); + DEREF(header).aabb = src.base.aabb; + DEREF(header).bvh_offset = args.output_bvh_offset; + + set_parent(RADV_BVH_ROOT_NODE, RADV_BVH_INVALID_NODE); + } +} + void main() { - if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count) - return; - - /* Revert the order so we start at the root */ - uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; - uint32_t ir_leaf_node_size; switch (args.geometry_type) { case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { @@ -65,211 +290,17 @@ main() } uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * ir_leaf_node_size; - uint32_t dst_internal_offset = id_to_offset(RADV_BVH_ROOT_NODE); - REF(vk_ir_box_node) intermediate_internal_nodes = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); - REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id); - vk_ir_box_node src = DEREF(src_node); - bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1; + uint32_t ir_internal_node_count = DEREF(args.header).ir_internal_node_count; + uint32_t encode_invocation_count = ir_internal_node_count * 8; - for (;;) { - /* Make changes to the current node's BVH offset value visible. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + uint32_t global_id = gl_GlobalInvocationID.x; + if (global_id >= encode_invocation_count) + return; - uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset; - if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) - continue; - - if (bvh_offset == VK_NULL_BVH_OFFSET) - break; - - REF(radv_gfx12_box_node) dst = REF(radv_gfx12_box_node)(args.output_base + (args.output_bvh_offset + bvh_offset)); - - uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32); - - uint32_t children[8]; - - uint32_t found_child_count = 0; - for (uint32_t i = 0; i < 2; i++) { - if (src.children[i] != RADV_BVH_INVALID_NODE) { - children[found_child_count] = src.children[i]; - found_child_count++; - } - } - - /* TODO: Collapse child nodes with high SAH values. */ - while (found_child_count < 8) { - bool progress = false; - for (int32_t i = 0; i < found_child_count; i++) { - uint32_t child_id = children[i]; - if (ir_id_to_type(child_id) != vk_ir_node_internal) - continue; - - progress = true; - - REF(vk_ir_box_node) child_node = - REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child_id)); - uint32_t grandchildren[2] = DEREF(child_node).children; - uint32_t valid_grandchild_count = 0; - - if (grandchildren[1] != RADV_BVH_INVALID_NODE) - valid_grandchild_count++; - - if (grandchildren[0] != RADV_BVH_INVALID_NODE) - valid_grandchild_count++; - else - grandchildren[0] = grandchildren[1]; - - if (valid_grandchild_count > 1) { - children[found_child_count] = grandchildren[1]; - found_child_count++; - } - - if (valid_grandchild_count > 0) { - children[i] = grandchildren[0]; - } else { - found_child_count--; - children[i] = children[found_child_count]; - } - - DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; - - if (found_child_count == 8) - break; - } - - if (!progress) - break; - } - - uint32_t child_leaf_nodes_size = 0; - uint32_t child_internal_nodes_size = 0; - for (uint32_t i = 0; i < found_child_count; i++) { - uint32_t type = ir_id_to_type(children[i]); - if (type == vk_ir_node_internal) - child_internal_nodes_size += RADV_GFX12_BVH_NODE_SIZE; - else if (type == vk_ir_node_instance) - child_leaf_nodes_size += 2 * RADV_GFX12_BVH_NODE_SIZE; - else - child_leaf_nodes_size += RADV_GFX12_BVH_NODE_SIZE; - } - - uint32_t dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size); - uint32_t dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size); - - vec3 origin = src.base.aabb.min; - vec3 extent = src.base.aabb.max - src.base.aabb.min; - - extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000); - uvec3 extent_exponents = floatBitsToUint(extent) >> 23; - - DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0); - DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0); - DEREF(dst).origin = origin; - DEREF(dst).child_count_exponents = - extent_exponents.x | (extent_exponents.y << 8) | (extent_exponents.z << 16) | ((found_child_count - 1) << 28); - DEREF(dst).obb_matrix_index = 0x7f; - - for (uint32_t i = 0; i < found_child_count; i++) { - uint32_t child_id = children[i]; - uint32_t type = ir_id_to_type(child_id); - uint32_t offset = ir_id_to_offset(child_id); - - uint32_t child_node_size_128b = 1; - uint32_t encoded_type = 0; - uint32_t dst_offset = 0; - uint32_t cull_mask = 0xff; - if (type == vk_ir_node_internal) { - encoded_type = 5; - dst_offset = dst_internal_offset; - - REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset); - DEREF(child_node).bvh_offset = dst_internal_offset; - - dst_internal_offset += RADV_GFX12_BVH_NODE_SIZE; - } else { - dst_offset = dst_leaf_offset; - - /* Write leaf node offset. */ - uint32_t child_index = offset / ir_leaf_node_size; - REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset); - child_dst_offset = INDEX(uint32_t, child_dst_offset, child_index); - DEREF(child_dst_offset) = dst_offset; - - VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_leaf_offset; - - switch (args.geometry_type) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { - vk_ir_triangle_node src_node = DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, offset))); - radv_encode_triangle_gfx12(dst_leaf_addr, src_node); - dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE; - break; - } - case VK_GEOMETRY_TYPE_AABBS_KHR: { - vk_ir_aabb_node src_node = DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, offset))); - radv_encode_aabb_gfx12(dst_leaf_addr, src_node); - dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE; - break; - } - default: - /* instances */ - encoded_type = 6; - child_node_size_128b = 2; - - vk_ir_instance_node src_node = DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, offset))); - radv_encode_instance_gfx12(dst_leaf_addr, src_node); - - cull_mask = src_node.custom_instance_and_mask >> 24; - - dst_leaf_offset += 2 * RADV_GFX12_BVH_NODE_SIZE; - - break; - } - } - - vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, offset)).aabb; - - radv_gfx12_box_child child; - /* TODO: subtree flags culling */ - child.dword0 = min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) | - (min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12); - /* TODO: subtree mask culling */ - child.dword1 = - min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) | - (min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12) | - (cull_mask << 24); - child.dword2 = - min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) | - (min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12) | - (encoded_type << 24) | (child_node_size_128b << 28); - DEREF(dst).children[i] = child; - - set_parent(pack_node_id(dst_offset, encoded_type), node_id); - } - - /* Set remaining children to invalid */ - for (uint32_t i = found_child_count; i < 8; i++) { - radv_gfx12_box_child null_child; - null_child.dword0 = 0xffffffff; - null_child.dword1 = 0xfff; - null_child.dword2 = 0; - DEREF(dst).children[i] = null_child; - } - - /* Make changes to the children's BVH offset value available to the other invocations. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - break; - } - - if (is_root_node) { - REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(args.output_base); - DEREF(header).aabb = src.base.aabb; - DEREF(header).bvh_offset = args.output_bvh_offset; - - set_parent(RADV_BVH_ROOT_NODE, RADV_BVH_INVALID_NODE); - } + /* Revert the order so we start at the root */ + uint32_t node_index = ir_internal_node_count - 1 - global_id / 8; + encode_gfx12(ir_leaf_node_size, intermediate_internal_nodes, node_index); } diff --git a/src/amd/vulkan/bvh/invocation_cluster.h b/src/amd/vulkan/bvh/invocation_cluster.h new file mode 100644 index 00000000000..ba206ae46bf --- /dev/null +++ b/src/amd/vulkan/bvh/invocation_cluster.h @@ -0,0 +1,39 @@ +/* + * Copyright © 2025 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +/* Helpers for encoding BVH nodes on different HW generations. */ + +#ifndef RADV_BVH_INVOCATION_CLUSTER_H +#define RADV_BVH_INVOCATION_CLUSTER_H + +struct radv_invocation_cluster { + uint32_t invocation_index; + uint32_t cluster_index; + uint32_t cluster_size; +}; + +/* cluster_size has to be a power of two and <32. */ +void +radv_invocation_cluster_init(out radv_invocation_cluster cluster, uint32_t cluster_size) +{ + cluster.invocation_index = gl_SubgroupInvocationID & (cluster_size - 1); + cluster.cluster_index = gl_SubgroupInvocationID / cluster_size; + cluster.cluster_size = cluster_size; +} + +#define radv_read_invocation(cluster, index, value) \ + subgroupShuffle(value, (gl_SubgroupInvocationID & (~(cluster.cluster_size - 1))) + index) + +uint32_t +radv_ballot(radv_invocation_cluster cluster, bool value) +{ + uvec4 ballot = subgroupBallot(value); + uint64_t ballot64 = uint64_t(ballot.x) | (uint64_t(ballot.y) << 32ul); + uint32_t cluster_shift = gl_SubgroupInvocationID & (~(cluster.cluster_size - 1)); + return uint32_t((ballot64 >> cluster_shift) & ((1u << cluster.cluster_size) - 1)); +} + +#endif diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index b882f8461d6..fa3ae328261 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -540,11 +540,20 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructur struct acceleration_structure_layout layout; radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); - uint32_t dst_internal_nodes_offset = layout.internal_nodes_offset - layout.bvh_offset; - uint32_t dst_leaf_nodes_offset = layout.leaf_nodes_offset - layout.bvh_offset; - uint32_t offsets[2] = {dst_internal_nodes_offset, dst_leaf_nodes_offset}; - radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, dst_node_offset), offsets, - sizeof(offsets)); + struct vk_ir_header header = { + .sync_data = + { + .current_phase_end_counter = TASK_INDEX_INVALID, + /* Will be updated by the first PLOC shader invocation */ + .task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID}, + }, + .dst_node_offset = layout.internal_nodes_offset - layout.bvh_offset, + .dst_leaf_node_offset = layout.leaf_nodes_offset - layout.bvh_offset, + }; + + const uint8_t *update_data = ((const uint8_t *)&header + offsetof(struct vk_ir_header, sync_data)); + radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, sync_data), update_data, + sizeof(struct vk_ir_header) - offsetof(struct vk_ir_header, sync_data)); if (radv_device_physical(device)->info.cp_sdma_ge_use_system_memory_scope) cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2; @@ -560,10 +569,11 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructur vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); + uint32_t internal_count = MAX2(leaf_count, 2) - 1; + struct radv_dispatch_info dispatch = { - .unaligned = true, .ordered = true, - .blocks = {MAX2(leaf_count, 1), 1, 1}, + .blocks = {DIV_ROUND_UP(internal_count * 8, 64), 1, 1}, }; radv_compute_dispatch(cmd_buffer, &dispatch); @@ -664,9 +674,8 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui geometry_infos[i].primitive_count = build_range_infos[i].primitiveCount; } - radv_CmdUpdateBuffer(commandBuffer, vk_buffer_to_handle(dst->buffer), - dst->offset + layout.geometry_info_offset, geometry_infos_size, - geometry_infos); + radv_CmdUpdateBuffer(commandBuffer, vk_buffer_to_handle(dst->buffer), dst->offset + layout.geometry_info_offset, + geometry_infos_size, geometry_infos); free(geometry_infos); }