radv/bvh: Use box16 nodes when bvh8 is not used

Using box16 nodes trades some BVH quality for reduced memory bandwidth; the
two effects appear to roughly cancel out, leaving performance about equal.

Stats assuming box16 nodes are as expensive as box32 nodes:
Totals from 7668 (79.68% of 9624) affected BVHs:
compacted_size: 951666944 -> 742347648 (-22.00%)
max_depth: 57606 -> 57615 (+0.02%)
sah: 129114796242 -> 129998517775 (+0.68%); split: -0.00%, +0.68%
scene_sah: 188564162 -> 192063633 (+1.86%); split: -0.02%, +1.88%
box16_node_count: 0 -> 3270600 (+inf%)
box32_node_count: 3365707 -> 95100 (-97.17%)
This commit is contained in:
Konstantin Seurer 2025-10-15 12:30:36 +02:00
parent 4950f6e23d
commit 19a4c1c4b5
7 changed files with 109 additions and 39 deletions

View file

@ -8,6 +8,7 @@
#define BVH_BUILD_HELPERS_H
#include "bvh.h"
#include "spirv_internal_exts.h"
#include "vk_build_helpers.h"
TYPE(radv_accel_struct_serialization_header, 8);
@ -110,4 +111,7 @@ radv_encode_blas_pointer_flags(uint32_t flags, uint32_t geometry_type)
return ptr_flags;
}
spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRUMesa) float16_t radv_f32_to_f16_pos_inf(float f);
spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRDMesa) float16_t radv_f32_to_f16_neg_inf(float f);
#endif /* BUILD_HELPERS_H */

View file

@ -26,6 +26,7 @@
#define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 5))
#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6))
#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 7))
#define RADV_BUILD_FLAG_USE_BOX16 (1u << (VK_BUILD_FLAG_COUNT + 8))
#define RADV_COPY_MODE_COPY 0
#define RADV_COPY_MODE_SERIALIZE 1

View file

@ -22,6 +22,32 @@ void set_parent(uint32_t child, uint32_t parent)
DEREF(REF(uint32_t)(addr)) = parent;
}
radv_aabb16
radv_aabb_f32_to_f16(vk_aabb aabb)
{
   /* Quantize a full-precision AABB to float16 with conservative rounding:
    * min components round toward -inf and max components round toward +inf,
    * so the resulting f16 box always fully encloses the original f32 box.
    */
   radv_aabb16 result;
   result.max_x = radv_f32_to_f16_pos_inf(aabb.max.x);
   result.max_y = radv_f32_to_f16_pos_inf(aabb.max.y);
   result.max_z = radv_f32_to_f16_pos_inf(aabb.max.z);
   result.min_x = radv_f32_to_f16_neg_inf(aabb.min.x);
   result.min_y = radv_f32_to_f16_neg_inf(aabb.min.y);
   result.min_z = radv_f32_to_f16_neg_inf(aabb.min.z);
   return result;
}
vk_aabb
radv_aabb_f16_to_f32(radv_aabb16 aabb16)
{
   /* Widen a float16 AABB back to full precision. Every finite f16 value is
    * exactly representable as an f32, so this conversion introduces no
    * additional rounding error.
    */
   vk_aabb result;
   result.max.x = float(aabb16.max_x);
   result.max.y = float(aabb16.max_y);
   result.max.z = float(aabb16.max_z);
   result.min.x = float(aabb16.min_x);
   result.min.y = float(aabb16.min_y);
   result.min.z = float(aabb16.min_z);
   return result;
}
void
main()
{
@ -89,18 +115,15 @@ main()
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
uint32_t node_id = is_root_node ? RADV_BVH_ROOT_NODE : DEREF(src_node).bvh_offset;
if (node_id == VK_UNKNOWN_BVH_OFFSET)
continue;
if (bvh_offset == VK_NULL_BVH_OFFSET)
if (node_id == VK_NULL_BVH_OFFSET)
break;
uint32_t flags = 0;
REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset));
uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
uint32_t found_child_count = 0;
uint32_t children[4] = {RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE,
RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE};
@ -158,20 +181,33 @@ main()
break;
}
REF(radv_bvh_box16_node) dst_node_f16 = REF(radv_bvh_box16_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
REF(radv_bvh_box32_node) dst_node_f32 = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
bool is_box16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && id_to_type(node_id) == radv_bvh_node_box16;
for (uint32_t i = 0; i < found_child_count; ++i) {
uint32_t type = ir_id_to_type(children[i]);
uint32_t offset = ir_id_to_offset(children[i]);
uint32_t dst_offset;
uint32_t child_node_id;
vk_aabb child_aabb = DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
if (type == vk_ir_node_internal) {
dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node));
radv_aabb16 child_aabb16 = radv_aabb_f32_to_f16(child_aabb);
float surface_area_f16 = aabb_surface_area(radv_aabb_f16_to_f32(child_aabb16));
float surface_area_f32 = aabb_surface_area(child_aabb);
bool child_use_f16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && surface_area_f16 < surface_area_f32 * 1.5;
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset);
DEREF(child_node).bvh_offset = dst_offset;
uint32_t dst_offset = atomicAdd(DEREF(args.header).dst_node_offset,
child_use_f16 ? SIZEOF(radv_bvh_box16_node) : SIZEOF(radv_bvh_box32_node));
child_node_id = pack_node_id(dst_offset, child_use_f16 ? radv_bvh_node_box16 : radv_bvh_node_box32);
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
DEREF(child_node).bvh_offset = child_node_id;
flags |= (DEREF(child_node).flags & 0x3) << i * 8;
} else {
uint32_t child_index = offset / ir_leaf_node_size;
dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
uint32_t dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
if (type == vk_ir_node_instance) {
vk_ir_instance_node src_node =
@ -182,47 +218,65 @@ main()
uint32_t child_flags = fetch_child_flags(args.intermediate_bvh, children[i]);
flags |= (child_flags & 0x3) << i * 8;
}
child_node_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
}
vk_aabb child_aabb =
DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
/* On gfx11, infinities in AABB coords can cause garbage child nodes to be
* returned by box intersection tests with non-default box sorting modes.
* Subtract 1 from the integer representation of inf/-inf to turn it into
* the maximum/minimum representable floating-point value as a workaround.
*/
if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
for (uint32_t i = 0; i < 3; ++i) {
if (isinf(child_aabb.min[i]))
child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
if (isinf(child_aabb.max[i]))
child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
if (is_box16) {
DEREF(dst_node_f16).coords[i] = radv_aabb_f32_to_f16(child_aabb);
} else {
/* On gfx11, infinities in AABB coords can cause garbage child nodes to be
* returned by box intersection tests with non-default box sorting modes.
* Subtract 1 from the integer representation of inf/-inf to turn it into
* the maximum/minimum representable floating-point value as a workaround.
*/
if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
for (uint32_t i = 0; i < 3; ++i) {
if (isinf(child_aabb.min[i]))
child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
if (isinf(child_aabb.max[i]))
child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
}
}
DEREF(dst_node_f32).coords[i] = child_aabb;
}
DEREF(dst_node).coords[i] = child_aabb;
uint32_t child_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
children[i] = child_id;
set_parent(child_id, node_id);
children[i] = child_node_id;
set_parent(child_node_id, node_id);
}
for (uint i = found_child_count; i < 4; ++i) {
if (is_box16) {
radv_aabb16 null_aabb;
null_aabb.min_x = NAN_F16;
null_aabb.min_y = NAN_F16;
null_aabb.min_z = NAN_F16;
null_aabb.max_x = NAN_F16;
null_aabb.max_y = NAN_F16;
null_aabb.max_z = NAN_F16;
for (uint i = found_child_count; i < 4; ++i)
DEREF(dst_node_f16).coords[i] = null_aabb;
} else {
for (uint i = found_child_count; i < 4; ++i) {
for (uint comp = 0; comp < 3; ++comp) {
DEREF(dst_node).coords[i].min[comp] = NAN;
DEREF(dst_node).coords[i].max[comp] = NAN;
DEREF(dst_node_f32).coords[i].min[comp] = NAN;
DEREF(dst_node_f32).coords[i].max[comp] = NAN;
}
}
}
/* Make changes to the children's BVH offset value available to the other invocations. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
DEREF(dst_node).children = children;
if (is_box16) {
DEREF(dst_node_f16).children = children;
} else {
DEREF(dst_node_f32).children = children;
if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
DEREF(dst_node).flags = flags;
if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
DEREF(dst_node_f32).flags = flags;
}
break;
}

View file

@ -56,7 +56,7 @@ bvh_includes = files(
bvh_spv = []
foreach s : bvh_shaders
command = [
prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5',
prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5',
'-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet,
]
command += vk_glsl_shader_preamble

View file

@ -75,6 +75,7 @@ enum radv_encode_key_bits {
RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 0),
RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 1),
RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 2),
RADV_ENCODE_KEY_USE_BOX16 = (1 << 3),
};
static void
@ -287,6 +288,8 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
VK_FROM_HANDLE(radv_device, device, _device);
struct radv_physical_device *pdev = radv_device_physical(device);
VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
uint32_t encode_key = 0;
if (radv_use_bvh8(pdev)) {
/*
@ -302,11 +305,13 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
state->build_info->type != VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR)
encode_key |= RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS;
VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
if (!(state->build_info->flags & (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR |
VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) &&
geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
} else if (!radv_emulate_rt(pdev)) {
if (!(state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR))
encode_key |= RADV_ENCODE_KEY_USE_BOX16;
}
state->config.encode_key[0] = encode_key;
@ -391,6 +396,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key)
flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES;
if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)
flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES;
if (key & RADV_ENCODE_KEY_USE_BOX16)
flags |= RADV_BUILD_FLAG_USE_BOX16;
return flags;
}

View file

@ -42,6 +42,7 @@ bvh_shaders = [
],
]
spirv_include_dir = dir_source_root + '/src/compiler/spirv'
vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh'
vk_bvh_includes = files(
@ -50,6 +51,7 @@ vk_bvh_includes = files(
'vk_build_interface.h',
'vk_bvh.h',
'vk_debug.h',
spirv_include_dir + '/spirv_internal_exts.h',
)
vk_glsl_shader_extensions = [
@ -69,6 +71,7 @@ vk_glsl_shader_extensions = [
'GL_KHR_shader_subgroup_ballot',
'GL_KHR_shader_subgroup_clustered',
'GL_EXT_shader_atomic_int64',
'GL_EXT_spirv_intrinsics',
]
vk_glsl_shader_preamble = []
@ -79,7 +82,7 @@ endforeach
bvh_spv = []
foreach s : bvh_shaders
command = [
prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
] + (with_mesa_debug ? ['-g'] : [])
command += glslang_quiet
command += vk_glsl_shader_preamble

View file

@ -180,6 +180,7 @@
#define INFINITY (1.0 / 0.0)
#define NAN (0.0 / 0.0)
#define NAN_F16 (0.0hf / 0.0hf)
#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))