From 19a4c1c4b59f02ca9d679faf6e762ef42e8e301c Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Wed, 15 Oct 2025 12:30:36 +0200 Subject: [PATCH] radv/bvh: Use box16 nodes when bvh8 is not used Using box16 nodes trades bvh quality for memory bandwidth which seems to be roughly equal in performance. Stats assuming box16 nodes are as expensive as box32 nodes: Totals from 7668 (79.68% of 9624) affected BVHs: compacted_size: 951666944 -> 742347648 (-22.00%) max_depth: 57606 -> 57615 (+0.02%) sah: 129114796242 -> 129998517775 (+0.68%); split: -0.00%, +0.68% scene_sah: 188564162 -> 192063633 (+1.86%); split: -0.02%, +1.88% box16_node_count: 0 -> 3270600 (+inf%) box32_node_count: 3365707 -> 95100 (-97.17%) --- src/amd/vulkan/bvh/build_helpers.h | 4 + src/amd/vulkan/bvh/build_interface.h | 1 + src/amd/vulkan/bvh/encode.comp | 126 +++++++++++++------ src/amd/vulkan/bvh/meson.build | 2 +- src/amd/vulkan/radv_acceleration_structure.c | 9 +- src/vulkan/runtime/bvh/meson.build | 5 +- src/vulkan/runtime/bvh/vk_build_helpers.h | 1 + 7 files changed, 109 insertions(+), 39 deletions(-) diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h index a63a534d349..895e1606d6d 100644 --- a/src/amd/vulkan/bvh/build_helpers.h +++ b/src/amd/vulkan/bvh/build_helpers.h @@ -8,6 +8,7 @@ #define BVH_BUILD_HELPERS_H #include "bvh.h" +#include "spirv_internal_exts.h" #include "vk_build_helpers.h" TYPE(radv_accel_struct_serialization_header, 8); @@ -110,4 +111,7 @@ radv_encode_blas_pointer_flags(uint32_t flags, uint32_t geometry_type) return ptr_flags; } +spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRUMesa) float16_t radv_f32_to_f16_pos_inf(float f); +spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRDMesa) float16_t radv_f32_to_f16_neg_inf(float f); + #endif /* BUILD_HELPERS_H */ diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h index 15a7a2aaf5e..d3b726d296b 100644 --- a/src/amd/vulkan/bvh/build_interface.h +++ b/src/amd/vulkan/bvh/build_interface.h @@ -26,6 +26,7 @@ #define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 5)) #define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6)) #define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 7)) +#define RADV_BUILD_FLAG_USE_BOX16 (1u << (VK_BUILD_FLAG_COUNT + 8)) #define RADV_COPY_MODE_COPY 0 #define RADV_COPY_MODE_SERIALIZE 1 diff --git a/src/amd/vulkan/bvh/encode.comp b/src/amd/vulkan/bvh/encode.comp index 53c6f853d2c..1fb4dc5d728 100644 --- a/src/amd/vulkan/bvh/encode.comp +++ b/src/amd/vulkan/bvh/encode.comp @@ -22,6 +22,32 @@ void set_parent(uint32_t child, uint32_t parent) DEREF(REF(uint32_t)(addr)) = parent; } +radv_aabb16 +radv_aabb_f32_to_f16(vk_aabb aabb) +{ + radv_aabb16 aabb16; + aabb16.min_x = radv_f32_to_f16_neg_inf(aabb.min.x); + aabb16.min_y = radv_f32_to_f16_neg_inf(aabb.min.y); + aabb16.min_z = radv_f32_to_f16_neg_inf(aabb.min.z); + aabb16.max_x = radv_f32_to_f16_pos_inf(aabb.max.x); + aabb16.max_y = radv_f32_to_f16_pos_inf(aabb.max.y); + aabb16.max_z = radv_f32_to_f16_pos_inf(aabb.max.z); + return aabb16; +} + +vk_aabb +radv_aabb_f16_to_f32(radv_aabb16 aabb16) +{ + vk_aabb aabb; + aabb.min.x = float(aabb16.min_x); + aabb.min.y = float(aabb16.min_y); + aabb.min.z = float(aabb16.min_z); + aabb.max.x = float(aabb16.max_x); + aabb.max.y = float(aabb16.max_y); + aabb.max.z = float(aabb16.max_z); + return aabb; +} + void main() { @@ -89,18 +115,15 @@ main() memoryBarrier(gl_ScopeDevice, 
gl_StorageSemanticsBuffer, gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset; - if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) + uint32_t node_id = is_root_node ? RADV_BVH_ROOT_NODE : DEREF(src_node).bvh_offset; + if (node_id == VK_UNKNOWN_BVH_OFFSET) continue; - if (bvh_offset == VK_NULL_BVH_OFFSET) + if (node_id == VK_NULL_BVH_OFFSET) break; uint32_t flags = 0; - REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset)); - uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32); - uint32_t found_child_count = 0; uint32_t children[4] = {RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE}; @@ -158,20 +181,33 @@ main() break; } + REF(radv_bvh_box16_node) dst_node_f16 = REF(radv_bvh_box16_node)(OFFSET(args.output_bvh, id_to_offset(node_id))); + REF(radv_bvh_box32_node) dst_node_f32 = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, id_to_offset(node_id))); + bool is_box16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && id_to_type(node_id) == radv_bvh_node_box16; + for (uint32_t i = 0; i < found_child_count; ++i) { uint32_t type = ir_id_to_type(children[i]); uint32_t offset = ir_id_to_offset(children[i]); - uint32_t dst_offset; + uint32_t child_node_id; + + vk_aabb child_aabb = DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; if (type == vk_ir_node_internal) { - dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node)); + radv_aabb16 child_aabb16 = radv_aabb_f32_to_f16(child_aabb); + float surface_area_f16 = aabb_surface_area(radv_aabb_f16_to_f32(child_aabb16)); + float surface_area_f32 = aabb_surface_area(child_aabb); + bool child_use_f16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && surface_area_f16 < surface_area_f32 * 1.5; - REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset); - DEREF(child_node).bvh_offset = dst_offset; + uint32_t dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, + child_use_f16 ? SIZEOF(radv_bvh_box16_node) : SIZEOF(radv_bvh_box32_node)); + child_node_id = pack_node_id(dst_offset, child_use_f16 ? radv_bvh_node_box16 : radv_bvh_node_box32); + + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset); + DEREF(child_node).bvh_offset = child_node_id; flags |= (DEREF(child_node).flags & 0x3) << i * 8; } else { uint32_t child_index = offset / ir_leaf_node_size; - dst_offset = dst_leaf_offset + child_index * output_leaf_node_size; + uint32_t dst_offset = dst_leaf_offset + child_index * output_leaf_node_size; if (type == vk_ir_node_instance) { vk_ir_instance_node src_node = @@ -182,47 +218,65 @@ main() uint32_t child_flags = fetch_child_flags(args.intermediate_bvh, children[i]); flags |= (child_flags & 0x3) << i * 8; } + + child_node_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type)); } - vk_aabb child_aabb = - DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; - - /* On gfx11, infinities in AABB coords can cause garbage child nodes to be - * returned by box intersection tests with non-default box sorting modes. - * Subtract 1 from the integer representation of inf/-inf to turn it into - * the maximum/minimum representable floating-point value as a workaround. 
- */ - if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) { - for (uint32_t i = 0; i < 3; ++i) { - if (isinf(child_aabb.min[i])) - child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1); - if (isinf(child_aabb.max[i])) - child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1); + if (is_box16) { + DEREF(dst_node_f16).coords[i] = radv_aabb_f32_to_f16(child_aabb); + } else { + /* On gfx11, infinities in AABB coords can cause garbage child nodes to be + * returned by box intersection tests with non-default box sorting modes. + * Subtract 1 from the integer representation of inf/-inf to turn it into + * the maximum/minimum representable floating-point value as a workaround. + */ + if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) { + for (uint32_t i = 0; i < 3; ++i) { + if (isinf(child_aabb.min[i])) + child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1); + if (isinf(child_aabb.max[i])) + child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1); + } } + + DEREF(dst_node_f32).coords[i] = child_aabb; } - DEREF(dst_node).coords[i] = child_aabb; - - uint32_t child_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type)); - children[i] = child_id; - set_parent(child_id, node_id); + children[i] = child_node_id; + set_parent(child_node_id, node_id); } - for (uint i = found_child_count; i < 4; ++i) { + if (is_box16) { + radv_aabb16 null_aabb; + null_aabb.min_x = NAN_F16; + null_aabb.min_y = NAN_F16; + null_aabb.min_z = NAN_F16; + null_aabb.max_x = NAN_F16; + null_aabb.max_y = NAN_F16; + null_aabb.max_z = NAN_F16; + for (uint i = found_child_count; i < 4; ++i) + DEREF(dst_node_f16).coords[i] = null_aabb; + } else { + for (uint i = found_child_count; i < 4; ++i) { for (uint comp = 0; comp < 3; ++comp) { - DEREF(dst_node).coords[i].min[comp] = NAN; - DEREF(dst_node).coords[i].max[comp] = NAN; + DEREF(dst_node_f32).coords[i].min[comp] = NAN; + DEREF(dst_node_f32).coords[i].max[comp] = NAN; } + } } /* Make changes to the children's BVH offset value available to the other invocations. 
*/ memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - DEREF(dst_node).children = children; + if (is_box16) { + DEREF(dst_node_f16).children = children; + } else { + DEREF(dst_node_f32).children = children; - if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS)) - DEREF(dst_node).flags = flags; + if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS)) + DEREF(dst_node_f32).flags = flags; + } break; } diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build index 3320ef67428..c0328db82c7 100644 --- a/src/amd/vulkan/bvh/meson.build +++ b/src/amd/vulkan/bvh/meson.build @@ -56,7 +56,7 @@ bvh_includes = files( bvh_spv = [] foreach s : bvh_shaders command = [ - prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', + prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet, ] command += vk_glsl_shader_preamble diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index 4271a0be143..607d29866dd 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -75,6 +75,7 @@ enum radv_encode_key_bits { RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 0), RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 1), RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 2), + RADV_ENCODE_KEY_USE_BOX16 = (1 << 3), }; static void @@ -287,6 +288,8 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s VK_FROM_HANDLE(radv_device, device, _device); struct radv_physical_device *pdev = radv_device_physical(device); + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info); + uint32_t encode_key = 0; if (radv_use_bvh8(pdev)) { /* @@ -302,11 +305,13 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s state->build_info->type != VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) encode_key |= RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS; - VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info); if (!(state->build_info->flags & (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR | VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) && geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + } else if (!radv_emulate_rt(pdev)) { + if (!(state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR)) + encode_key |= RADV_ENCODE_KEY_USE_BOX16; } state->config.encode_key[0] = encode_key; @@ -391,6 +396,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key) flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES; if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES; + if (key & RADV_ENCODE_KEY_USE_BOX16) + flags |= RADV_BUILD_FLAG_USE_BOX16; return flags; } diff --git a/src/vulkan/runtime/bvh/meson.build b/src/vulkan/runtime/bvh/meson.build index 02b2afb4163..add1590b70f 100644 --- a/src/vulkan/runtime/bvh/meson.build +++ b/src/vulkan/runtime/bvh/meson.build @@ -42,6 +42,7 @@ bvh_shaders = [ ], ] +spirv_include_dir = dir_source_root + '/src/compiler/spirv' vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh' vk_bvh_includes = files( @@ -50,6 +51,7 @@ vk_bvh_includes = files( 'vk_build_interface.h', 'vk_bvh.h', 'vk_debug.h', + 
spirv_include_dir + '/spirv_internal_exts.h', ) vk_glsl_shader_extensions = [ @@ -69,6 +71,7 @@ vk_glsl_shader_extensions = [ 'GL_KHR_shader_subgroup_ballot', 'GL_KHR_shader_subgroup_clustered', 'GL_EXT_shader_atomic_int64', + 'GL_EXT_spirv_intrinsics', ] vk_glsl_shader_preamble = [] @@ -79,7 +82,7 @@ endforeach bvh_spv = [] foreach s : bvh_shaders command = [ - prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' ] + (with_mesa_debug ? ['-g'] : []) command += glslang_quiet command += vk_glsl_shader_preamble diff --git a/src/vulkan/runtime/bvh/vk_build_helpers.h b/src/vulkan/runtime/bvh/vk_build_helpers.h index 01acb4db715..dd5795855b2 100644 --- a/src/vulkan/runtime/bvh/vk_build_helpers.h +++ b/src/vulkan/runtime/bvh/vk_build_helpers.h @@ -180,6 +180,7 @@ #define INFINITY (1.0 / 0.0) #define NAN (0.0 / 0.0) +#define NAN_F16 (0.0hf / 0.0hf) #define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))
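
Note on the box16 encoding above: the encode shader converts a child AABB to f16 with directed rounding (radv_f32_to_f16_neg_inf / radv_f32_to_f16_pos_inf, lowered through the MesaInternal SpvOpFConvertRDMesa / SpvOpFConvertRUMesa instructions) so the quantized box always encloses the f32 box, and it only emits a box16 child when the quantized surface area stays below 1.5x the f32 surface area. Below is a rough host-side C sketch of that idea, not the Mesa code: it assumes a compiler with _Float16 support, and helper names such as f32_to_f16_rd, f16_prev and aabb_to_f16 are made up for illustration.

/* Sketch only -- not the Mesa implementation.  Requires _Float16 support
 * (recent GCC/Clang).  NaN inputs are not handled. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { float min[3], max[3]; } aabb_f32;
typedef struct { _Float16 min[3], max[3]; } aabb_f16;

static uint16_t f16_bits(_Float16 h) { uint16_t b; memcpy(&b, &h, 2); return b; }
static _Float16 f16_from_bits(uint16_t b) { _Float16 h; memcpy(&h, &b, 2); return h; }

/* Step one f16 ULP toward -inf / +inf. */
static _Float16 f16_prev(_Float16 h)
{
   uint16_t b = f16_bits(h);
   if (b == 0x0000) /* +0 -> smallest negative subnormal */
      return f16_from_bits(0x8001);
   return f16_from_bits((b & 0x8000) ? b + 1 : b - 1);
}

static _Float16 f16_next(_Float16 h)
{
   uint16_t b = f16_bits(h);
   if (b == 0x8000) /* -0 -> smallest positive subnormal */
      return f16_from_bits(0x0001);
   return f16_from_bits((b & 0x8000) ? b - 1 : b + 1);
}

/* Directed f32 -> f16 conversions built on the default round-to-nearest cast:
 * if the nearest value landed on the wrong side of the input, step one ULP
 * back out. */
static _Float16 f32_to_f16_rd(float x) /* round toward -inf */
{
   _Float16 h = (_Float16)x;
   return ((float)h > x) ? f16_prev(h) : h;
}

static _Float16 f32_to_f16_ru(float x) /* round toward +inf */
{
   _Float16 h = (_Float16)x;
   return ((float)h < x) ? f16_next(h) : h;
}

/* Quantize conservatively: min rounds down, max rounds up, so the f16 box
 * always contains the f32 box. */
static aabb_f16 aabb_to_f16(aabb_f32 b)
{
   aabb_f16 r;
   for (int i = 0; i < 3; i++) {
      r.min[i] = f32_to_f16_rd(b.min[i]);
      r.max[i] = f32_to_f16_ru(b.max[i]);
   }
   return r;
}

static float surface_area(aabb_f32 b)
{
   float e[3] = { b.max[0] - b.min[0], b.max[1] - b.min[1], b.max[2] - b.min[2] };
   return 2.0f * (e[0] * e[1] + e[1] * e[2] + e[2] * e[0]);
}

int main(void)
{
   aabb_f32 child = { { 0.125f, -3.7f, 10.01f }, { 91.3f, 250.7f, 10.02f } };
   aabb_f16 quant = aabb_to_f16(child);

   aabb_f32 widened;
   for (int i = 0; i < 3; i++) {
      widened.min[i] = (float)quant.min[i];
      widened.max[i] = (float)quant.max[i];
   }

   /* Same shape of check as the encode shader: only use a box16 child if the
    * f16 rounding does not inflate the surface area by more than 1.5x. */
   int use_box16 = surface_area(widened) < surface_area(child) * 1.5f;
   printf("use_box16 = %d\n", use_box16);
   return 0;
}

Rounding min toward -inf and max toward +inf is what keeps the quantization conservative: a ray that intersects the f32 box can never miss the f16 box, so the only cost of box16 is some false-positive traversal, which the 1.5x surface-area threshold is meant to bound.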