radv/bvh: Use box16 nodes when bvh8 is not used
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Using box16 nodes trades BVH quality for reduced memory bandwidth, which seems
to result in roughly equal performance.

Stats assuming box16 nodes are as expensive as box32 nodes:
Totals from 7668 (79.68% of 9624) affected BVHs:
compacted_size: 951666944 -> 742347648 (-22.00%)
max_depth: 57606 -> 57615 (+0.02%)
sah: 129114796242 -> 129998517775 (+0.68%); split: -0.00%, +0.68%
scene_sah: 188564162 -> 192063633 (+1.86%); split: -0.02%, +1.88%
box16_node_count: 0 -> 3270600 (+inf%)
box32_node_count: 3365707 -> 95100 (-97.17%)

Reviewed-by: Natalie Vock <natalie.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37883>
This commit is contained in:
Konstantin Seurer 2025-10-15 12:30:36 +02:00
parent 543a88af99
commit 077292f65b
7 changed files with 109 additions and 39 deletions

View file

@ -8,6 +8,7 @@
#define BVH_BUILD_HELPERS_H
#include "bvh.h"
#include "spirv_internal_exts.h"
#include "vk_build_helpers.h"
TYPE(radv_accel_struct_serialization_header, 8);
@ -110,4 +111,7 @@ radv_encode_blas_pointer_flags(uint32_t flags, uint32_t geometry_type)
return ptr_flags;
}
spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRUMesa) float16_t radv_f32_to_f16_pos_inf(float f);
spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRDMesa) float16_t radv_f32_to_f16_neg_inf(float f);
#endif /* BVH_BUILD_HELPERS_H */

View file

@ -26,6 +26,7 @@
#define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 5))
#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6))
#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 7))
#define RADV_BUILD_FLAG_USE_BOX16 (1u << (VK_BUILD_FLAG_COUNT + 8))
#define RADV_COPY_MODE_COPY 0
#define RADV_COPY_MODE_SERIALIZE 1

View file

@ -22,6 +22,32 @@ void set_parent(uint32_t child, uint32_t parent)
DEREF(REF(uint32_t)(addr)) = parent;
}
/* Converts a 32-bit float AABB into its 16-bit float representation.
 *
 * Every minimum coordinate goes through radv_f32_to_f16_neg_inf() and every
 * maximum coordinate through radv_f32_to_f16_pos_inf().
 * NOTE(review): presumably this directed rounding guarantees the f16 box
 * always encloses the original f32 box - confirm against the conversion
 * helpers.
 */
radv_aabb16
radv_aabb_f32_to_f16(vk_aabb aabb)
{
   radv_aabb16 result;

   /* X axis */
   result.min_x = radv_f32_to_f16_neg_inf(aabb.min.x);
   result.max_x = radv_f32_to_f16_pos_inf(aabb.max.x);

   /* Y axis */
   result.min_y = radv_f32_to_f16_neg_inf(aabb.min.y);
   result.max_y = radv_f32_to_f16_pos_inf(aabb.max.y);

   /* Z axis */
   result.min_z = radv_f32_to_f16_neg_inf(aabb.min.z);
   result.max_z = radv_f32_to_f16_pos_inf(aabb.max.z);

   return result;
}
/* Widens a 16-bit float AABB back to 32-bit floats.
 *
 * Each of the six coordinates is converted individually with float().
 */
vk_aabb
radv_aabb_f16_to_f32(radv_aabb16 aabb16)
{
   vk_aabb widened;

   widened.min = vec3(float(aabb16.min_x), float(aabb16.min_y), float(aabb16.min_z));
   widened.max = vec3(float(aabb16.max_x), float(aabb16.max_y), float(aabb16.max_z));

   return widened;
}
void
main()
{
@ -89,18 +115,15 @@ main()
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
uint32_t node_id = is_root_node ? RADV_BVH_ROOT_NODE : DEREF(src_node).bvh_offset;
if (node_id == VK_UNKNOWN_BVH_OFFSET)
continue;
if (bvh_offset == VK_NULL_BVH_OFFSET)
if (node_id == VK_NULL_BVH_OFFSET)
break;
uint32_t flags = 0;
REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset));
uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
uint32_t found_child_count = 0;
uint32_t children[4] = {RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE,
RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE};
@ -158,20 +181,33 @@ main()
break;
}
REF(radv_bvh_box16_node) dst_node_f16 = REF(radv_bvh_box16_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
REF(radv_bvh_box32_node) dst_node_f32 = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
bool is_box16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && id_to_type(node_id) == radv_bvh_node_box16;
for (uint32_t i = 0; i < found_child_count; ++i) {
uint32_t type = ir_id_to_type(children[i]);
uint32_t offset = ir_id_to_offset(children[i]);
uint32_t dst_offset;
uint32_t child_node_id;
vk_aabb child_aabb = DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
if (type == vk_ir_node_internal) {
dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node));
radv_aabb16 child_aabb16 = radv_aabb_f32_to_f16(child_aabb);
float surface_area_f16 = aabb_surface_area(radv_aabb_f16_to_f32(child_aabb16));
float surface_area_f32 = aabb_surface_area(child_aabb);
bool child_use_f16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && surface_area_f16 < surface_area_f32 * 1.5;
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset);
DEREF(child_node).bvh_offset = dst_offset;
uint32_t dst_offset = atomicAdd(DEREF(args.header).dst_node_offset,
child_use_f16 ? SIZEOF(radv_bvh_box16_node) : SIZEOF(radv_bvh_box32_node));
child_node_id = pack_node_id(dst_offset, child_use_f16 ? radv_bvh_node_box16 : radv_bvh_node_box32);
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
DEREF(child_node).bvh_offset = child_node_id;
flags |= (DEREF(child_node).flags & 0x3) << i * 8;
} else {
uint32_t child_index = offset / ir_leaf_node_size;
dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
uint32_t dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
if (type == vk_ir_node_instance) {
vk_ir_instance_node src_node =
@ -182,47 +218,65 @@ main()
uint32_t child_flags = fetch_child_flags(args.intermediate_bvh, children[i]);
flags |= (child_flags & 0x3) << i * 8;
}
child_node_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
}
vk_aabb child_aabb =
DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
/* On gfx11, infinities in AABB coords can cause garbage child nodes to be
* returned by box intersection tests with non-default box sorting modes.
* Subtract 1 from the integer representation of inf/-inf to turn it into
* the maximum/minimum representable floating-point value as a workaround.
*/
if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
for (uint32_t i = 0; i < 3; ++i) {
if (isinf(child_aabb.min[i]))
child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
if (isinf(child_aabb.max[i]))
child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
if (is_box16) {
DEREF(dst_node_f16).coords[i] = radv_aabb_f32_to_f16(child_aabb);
} else {
/* On gfx11, infinities in AABB coords can cause garbage child nodes to be
* returned by box intersection tests with non-default box sorting modes.
* Subtract 1 from the integer representation of inf/-inf to turn it into
* the maximum/minimum representable floating-point value as a workaround.
*/
if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
for (uint32_t i = 0; i < 3; ++i) {
if (isinf(child_aabb.min[i]))
child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
if (isinf(child_aabb.max[i]))
child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
}
}
DEREF(dst_node_f32).coords[i] = child_aabb;
}
DEREF(dst_node).coords[i] = child_aabb;
uint32_t child_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
children[i] = child_id;
set_parent(child_id, node_id);
children[i] = child_node_id;
set_parent(child_node_id, node_id);
}
for (uint i = found_child_count; i < 4; ++i) {
if (is_box16) {
radv_aabb16 null_aabb;
null_aabb.min_x = NAN_F16;
null_aabb.min_y = NAN_F16;
null_aabb.min_z = NAN_F16;
null_aabb.max_x = NAN_F16;
null_aabb.max_y = NAN_F16;
null_aabb.max_z = NAN_F16;
for (uint i = found_child_count; i < 4; ++i)
DEREF(dst_node_f16).coords[i] = null_aabb;
} else {
for (uint i = found_child_count; i < 4; ++i) {
for (uint comp = 0; comp < 3; ++comp) {
DEREF(dst_node).coords[i].min[comp] = NAN;
DEREF(dst_node).coords[i].max[comp] = NAN;
DEREF(dst_node_f32).coords[i].min[comp] = NAN;
DEREF(dst_node_f32).coords[i].max[comp] = NAN;
}
}
}
/* Make changes to the children's BVH offset value available to the other invocations. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
DEREF(dst_node).children = children;
if (is_box16) {
DEREF(dst_node_f16).children = children;
} else {
DEREF(dst_node_f32).children = children;
if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
DEREF(dst_node).flags = flags;
if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
DEREF(dst_node_f32).flags = flags;
}
break;
}

View file

@ -56,7 +56,7 @@ bvh_includes = files(
bvh_spv = []
foreach s : bvh_shaders
command = [
prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5',
prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5',
'-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet,
]
command += vk_glsl_shader_preamble

View file

@ -75,6 +75,7 @@ enum radv_encode_key_bits {
RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 0),
RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 1),
RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 2),
RADV_ENCODE_KEY_USE_BOX16 = (1 << 3),
};
static void
@ -284,6 +285,8 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
VK_FROM_HANDLE(radv_device, device, _device);
struct radv_physical_device *pdev = radv_device_physical(device);
VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
uint32_t encode_key = 0;
if (radv_use_bvh8(pdev)) {
/*
@ -299,11 +302,13 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
state->build_info->type != VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR)
encode_key |= RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS;
VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
if (!(state->build_info->flags & (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR |
VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) &&
geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
} else if (!radv_emulate_rt(pdev)) {
if (!(state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR))
encode_key |= RADV_ENCODE_KEY_USE_BOX16;
}
state->config.encode_key[0] = encode_key;
@ -388,6 +393,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key)
flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES;
if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)
flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES;
if (key & RADV_ENCODE_KEY_USE_BOX16)
flags |= RADV_BUILD_FLAG_USE_BOX16;
return flags;
}

View file

@ -42,6 +42,7 @@ bvh_shaders = [
],
]
spirv_include_dir = dir_source_root + '/src/compiler/spirv'
vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh'
vk_bvh_includes = files(
@ -50,6 +51,7 @@ vk_bvh_includes = files(
'vk_build_interface.h',
'vk_bvh.h',
'vk_debug.h',
spirv_include_dir + '/spirv_internal_exts.h',
)
vk_glsl_shader_extensions = [
@ -69,6 +71,7 @@ vk_glsl_shader_extensions = [
'GL_KHR_shader_subgroup_ballot',
'GL_KHR_shader_subgroup_clustered',
'GL_EXT_shader_atomic_int64',
'GL_EXT_spirv_intrinsics',
]
vk_glsl_shader_preamble = []
@ -79,7 +82,7 @@ endforeach
bvh_spv = []
foreach s : bvh_shaders
command = [
prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
] + (with_mesa_debug ? ['-g'] : [])
command += glslang_quiet
command += vk_glsl_shader_preamble

View file

@ -180,6 +180,7 @@
#define INFINITY (1.0 / 0.0)
#define NAN (0.0 / 0.0)
#define NAN_F16 (0.0hf / 0.0hf)
#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))