radv: Use the BVH8 format on GFX12

Reviewed-by: Natalie Vock <natalie.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34273>
This commit is contained in:
Konstantin Seurer 2025-03-18 13:41:02 +01:00 committed by Marge Bot
parent 95e7343a7d
commit 97f6287827
26 changed files with 1948 additions and 204 deletions

View file

@ -1461,6 +1461,8 @@ RADV driver environment variables
Dump backend IR (ACO or LLVM) for selected shader stages.
``asm``
Dump shader disassembly for selected shader stages.
``bvh4``
Use the BVH4 encoding on GPUs that support the BVH8 encoding.
.. envvar:: RADV_FORCE_FAMILY

View file

@ -17,6 +17,10 @@ TYPE(radv_bvh_aabb_node, 4);
TYPE(radv_bvh_instance_node, 8);
TYPE(radv_bvh_box16_node, 4);
TYPE(radv_bvh_box32_node, 4);
TYPE(radv_gfx12_box_node, 4);
TYPE(radv_gfx12_instance_node, 8);
TYPE(radv_gfx12_instance_node_user_data, 4);
TYPE(radv_gfx12_primitive_node, 4);
uint32_t
id_to_offset(uint32_t id)

View file

@ -35,6 +35,16 @@ struct encode_args {
uint32_t geometry_type;
};
struct encode_gfx12_args {
VOID_REF intermediate_bvh;
VOID_REF output_base;
REF(vk_ir_header) header;
uint32_t output_bvh_offset;
uint32_t leaf_node_offsets_offset;
uint32_t leaf_node_count;
uint32_t geometry_type;
};
struct header_args {
REF(vk_ir_header) src;
REF(radv_accel_struct_header) dst;

View file

@ -58,10 +58,12 @@ struct radv_accel_struct_header {
uint64_t size;
/* Everything after this gets updated/copied from the CPU. */
uint32_t geometry_type;
uint32_t geometry_count;
uint32_t primitive_base_indices_offset;
uint64_t instance_offset;
uint64_t instance_count;
uint32_t leaf_node_offsets_offset;
uint32_t build_flags;
};
@ -114,4 +116,60 @@ struct radv_bvh_box32_node {
#define RADV_BVH_ROOT_NODE radv_bvh_node_box32
#define RADV_BVH_INVALID_NODE 0xffffffffu
/* GFX12 */
#define RADV_GFX12_BVH_NODE_SIZE 128
/* Packed per-child data of a GFX12 box node: quantized child bounds plus cull
 * mask, child type and child size (see the encoder in encode_gfx12.comp).
 */
struct radv_gfx12_box_child {
   uint32_t dword0; /* min_x (12 bits) | min_y (12 bits, bits 12-23) */
   uint32_t dword1; /* min_z (12 bits) | max_x-1 (12 bits) | cull_mask (bits 24-31) */
   uint32_t dword2; /* max_y-1 (12 bits) | max_z-1 (12 bits) | type (bits 24-27) | size in 128B units (bits 28-31) */
};
#ifndef VULKAN
typedef struct radv_gfx12_box_child radv_gfx12_box_child;
#endif
/* GFX12 8-wide internal (box) node; occupies one RADV_GFX12_BVH_NODE_SIZE slot. */
struct radv_gfx12_box_node {
   uint32_t internal_base_id;      /* node id of the first internal child */
   uint32_t primitive_base_id;     /* node id of the first leaf child */
   uint32_t unused;
   vec3 origin;                    /* origin that child bounds are quantized against */
   uint32_t child_count_exponents; /* x/y/z extent exponents (8 bits each) | child_count-1 in bits 28+ */
   uint32_t obb_matrix_index;      /* encoder writes 0x7f — presumably "no OBB"; confirm against HW docs */
   radv_gfx12_box_child children[8];
};
/* GFX12 hardware instance node (one RADV_GFX12_BVH_NODE_SIZE slot). It is
 * immediately followed in memory by radv_gfx12_instance_node_user_data.
 */
struct radv_gfx12_instance_node {
   mat3x4 wto_matrix;               /* world-to-object transform */
   uint64_t pointer_flags_bvh_addr; /* BLAS root pointer; instance flags packed into bits 54+ */
   uint32_t unused;                 /* NOTE(review): the encoder stores the 24-bit custom instance index here */
   uint32_t cull_mask_user_data;    /* sbt_offset_and_flags low 24 bits | cull mask in bits 24-31 */
   vec3 origin;                     /* embedded box node: BLAS AABB min */
   uint32_t child_count_exponents;
   radv_gfx12_box_child children[4];
};
/* Software-defined metadata stored in the 128-byte slot directly after each
 * radv_gfx12_instance_node (read back during copy/serialize and update).
 */
struct radv_gfx12_instance_node_user_data {
   mat3x4 otw_matrix;                      /* object-to-world transform */
   uint32_t custom_instance;               /* 24-bit custom instance index */
   uint32_t instance_index;
   uint32_t bvh_offset;                    /* offset of the BLAS root relative to the BLAS base address */
   uint32_t padding;
   uint64_t blas_addr;                     /* base address of the referenced BLAS */
   uint32_t primitive_base_indices_offset; /* copied from the BLAS header */
   uint32_t leaf_node_offsets_offset;      /* copied from the BLAS header */
   uint32_t unused[12];
};
/* Size of the primitive header section in bits. */
#define RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE 52
/* Size of a primitive pair description in bits. */
#define RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE 29
/* 1024-bit bit-packed GFX12 primitive node; the layout is produced by
 * radv_encode_triangle_gfx12/radv_encode_aabb_gfx12 in encode.h.
 */
struct radv_gfx12_primitive_node {
   uint32_t dwords[32];
};
#endif /* BVH_H */

View file

@ -71,7 +71,10 @@ main(void)
DEREF(REF(uvec4)(copy_src_addr + offset));
/* Do the adjustment inline in the same invocation that copies the data so that we don't have
* to synchronize. */
* to synchronize. This is only possible on pre-GFX12 HW because leaf nodes have a different
* order on GFX12.
*/
#if !GFX12
if (offset < node_end && offset >= node_offset &&
(offset - node_offset) % SIZEOF(radv_bvh_instance_node) == 0) {
uint64_t idx = (offset - node_offset) / SIZEOF(radv_bvh_instance_node);
@ -85,5 +88,6 @@ main(void)
DEREF(REF(radv_bvh_instance_node)(copy_dst_addr + offset)).bvh_ptr = addr_to_node(blas_addr + bvh_offset);
}
}
#endif
}
}

View file

@ -0,0 +1,65 @@
/*
* Copyright © 2022 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#version 460
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
#include "build_interface.h"
layout(push_constant) uniform CONSTS
{
copy_args args;
};
/* Rewrites BLAS addresses of GFX12 TLAS instance nodes when serializing or
 * deserializing an acceleration structure. Instances are located through the
 * leaf-node offset table because GFX12 leaves are not stored contiguously.
 */
void
main(void)
{
   bool serialize = args.mode == RADV_COPY_MODE_SERIALIZE;

   uint64_t accel_struct_addr = serialize ? args.src_addr : args.dst_addr;
   uint64_t serialized_addr = serialize ? args.dst_addr : args.src_addr;
   /* The BLAS address array follows the serialization header. */
   uint64_t blas_addrs = serialized_addr + SIZEOF(radv_accel_struct_serialization_header);

   radv_accel_struct_serialization_header serialization_header =
      DEREF(REF(radv_accel_struct_serialization_header)(serialized_addr));
   radv_accel_struct_header header = DEREF(REF(radv_accel_struct_header)(accel_struct_addr));

   uint32_t stride = gl_NumWorkGroups.x * 64;
   for (uint32_t i = gl_GlobalInvocationID.x; i < serialization_header.instance_count; i += stride) {
      uint64_t instance_offset_addr = accel_struct_addr + (header.leaf_node_offsets_offset + i * 4);
      uint64_t instance_addr = accel_struct_addr + (header.bvh_offset + DEREF(REF(uint32_t)(instance_offset_addr)));

      REF(radv_gfx12_instance_node) instance_node = REF(radv_gfx12_instance_node)(instance_addr);
      REF(radv_gfx12_instance_node_user_data) instance_data =
         REF(radv_gfx12_instance_node_user_data)(instance_addr + SIZEOF(radv_gfx12_instance_node));

      if (serialize) {
         /* Record the absolute BLAS address for later deserialization. */
         DEREF(INDEX(uint64_t, blas_addrs, i)) = DEREF(instance_data).blas_addr;
      } else {
         uint32_t bvh_offset = DEREF(instance_data).bvh_offset;
         /* Replace the address while keeping the pointer flags. */
         uint64_t pointer_flags_bvh_addr = DEREF(instance_node).pointer_flags_bvh_addr;
         uint64_t blas_addr = DEREF(INDEX(uint64_t, blas_addrs, i));
         DEREF(instance_node).pointer_flags_bvh_addr =
            (pointer_flags_bvh_addr & 0xFFC0000000000000ul) | addr_to_node(blas_addr + bvh_offset);
         DEREF(instance_data).blas_addr = blas_addr;
      }
   }
}

View file

@ -52,4 +52,273 @@ radv_encode_instance_gfx10_3(VOID_REF dst_addr, vk_ir_instance_node src)
DEREF(dst).instance_id = src.instance_id;
}
/* Incremental bit-stream writer used to pack the tightly bit-packed GFX12
 * node formats. Bits accumulate in `temp` and are flushed one dword at a time.
 */
struct bit_writer {
   uint64_t addr;        /* base device address of the destination */
   uint32_t offset;      /* byte offset of the next dword to flush */
   uint32_t temp;        /* partially filled dword */
   uint32_t count;       /* number of valid bits currently in temp */
   uint32_t total_count; /* total bits written to the stream so far */
};
/* Starts a fresh bit stream at the given device address. */
void
bit_writer_init(out bit_writer writer, uint64_t addr)
{
   /* Fields: addr, offset, temp, count, total_count. */
   writer = bit_writer(addr, 0, 0, 0, 0);
}
/* Appends the low `bit_size` bits of `data` to the stream. `bit_size` must be
 * at most 32, so at most one dword is flushed per call.
 */
void
bit_writer_write(inout bit_writer writer, uint32_t data, uint32_t bit_size)
{
   writer.total_count += bit_size;

   if (writer.count + bit_size >= 32) {
      /* Fill up the current dword and flush it. */
      writer.temp = writer.temp | (data << writer.count);
      REF(uint32_t) dst = REF(uint32_t)(writer.addr + writer.offset);
      DEREF(dst) = writer.temp;
      writer.offset += 4;
      /* Keep only the bits that did not fit into the flushed dword. */
      bit_size = bit_size - (32 - writer.count);
      if (writer.count == 0)
         data = 0; /* count == 0 implies bit_size was 32; avoid the undefined shift by 32 */
      else
         data = data >> (32 - writer.count);
      writer.temp = 0;
      writer.count = 0;
   }

   writer.temp = writer.temp | (data << writer.count);
   writer.count += bit_size;
}
/* Repositions the writer at the absolute bit offset `target`, flushing any
 * pending bits first. Bits between the old and new position are left
 * untouched.
 */
void
bit_writer_skip_to(inout bit_writer writer, uint32_t target)
{
   /* Flush the remaining data. */
   if (writer.count > 0) {
      REF(uint32_t) dst = REF(uint32_t)(writer.addr + writer.offset);
      DEREF(dst) = writer.temp;
   }

   /* Reset the accumulator. Without this, stale bits of the dword flushed
    * above would be OR-ed into the dword eventually written at the new
    * position whenever target is not dword-aligned (e.g. when skipping to
    * the pair descriptors at the top of a primitive node).
    */
   writer.temp = 0;

   writer.count = target % 32;
   writer.total_count = target;
   writer.offset = (target / 32) * 4;
}
/* Flushes any pending bits and resets the writer state. */
void
bit_writer_finish(inout bit_writer writer)
{
   /* Flush the remaining data. */
   if (writer.count != 0)
      DEREF(REF(uint32_t)(writer.addr + writer.offset)) = writer.temp;

   writer.temp = 0;
   writer.count = 0;
   writer.total_count = 0;
}
/* Encodes one IR triangle into a GFX12 bit-packed primitive node at dst.
 * A single triangle pair is emitted (tri1 unused) with uncompressed 32-bit
 * vertex components and 28-bit geometry/primitive indices.
 */
void
radv_encode_triangle_gfx12(VOID_REF dst, vk_ir_triangle_node src)
{
   bit_writer child_writer;
   bit_writer_init(child_writer, dst);

   /* Header: vertex components are stored uncompressed (32 bits, no trailing
    * zero compression); geometry/primitive indices use the full 28 bits. */
   bit_writer_write(child_writer, 31, 5); /* x_vertex_bits_minus_one */
   bit_writer_write(child_writer, 31, 5); /* y_vertex_bits_minus_one */
   bit_writer_write(child_writer, 31, 5); /* z_vertex_bits_minus_one */
   bit_writer_write(child_writer, 0, 5); /* trailing_zero_bits */
   bit_writer_write(child_writer, 14, 4); /* geometry_index_base_bits_div_2 */
   bit_writer_write(child_writer, 14, 4); /* geometry_index_bits_div_2 */
   bit_writer_write(child_writer, 0, 3); /* triangle_pair_count_minus_one */
   bit_writer_write(child_writer, 0, 1); /* vertex_type */
   bit_writer_write(child_writer, 28, 5); /* primitive_index_base_bits */
   bit_writer_write(child_writer, 28, 5); /* primitive_index_bits */
   /* header + 9 floats + geometry_id */
   bit_writer_write(child_writer, RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE + 9 * 32 + 28, 10);

   /* Payload: three vertices, then geometry and primitive index. */
   bit_writer_write(child_writer, floatBitsToUint(src.coords[0][0]), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.coords[0][1]), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.coords[0][2]), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.coords[1][0]), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.coords[1][1]), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.coords[1][2]), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.coords[2][0]), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.coords[2][1]), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.coords[2][2]), 32);
   bit_writer_write(child_writer, src.geometry_id_and_flags & 0xfffffff, 28);
   bit_writer_write(child_writer, src.triangle_id, 28);

   /* The pair descriptors live at the very top of the 1024-bit node. */
   bit_writer_skip_to(child_writer, 32 * 32 - RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE);

   uint32_t opaque = (src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0 ? 1 : 0;

   bit_writer_write(child_writer, 1, 1); /* prim_range_stop */
   bit_writer_write(child_writer, 0, 1); /* tri1_double_sided */
   bit_writer_write(child_writer, 0, 1); /* tri1_opaque */
   bit_writer_write(child_writer, 0, 4); /* tri1_v0_index */
   bit_writer_write(child_writer, 0, 4); /* tri1_v1_index */
   bit_writer_write(child_writer, 0, 4); /* tri1_v2_index */
   bit_writer_write(child_writer, 0, 1); /* tri0_double_sided */
   bit_writer_write(child_writer, opaque, 1); /* tri0_opaque */
   bit_writer_write(child_writer, 0, 4); /* tri0_v0_index */
   bit_writer_write(child_writer, 1, 4); /* tri0_v1_index */
   bit_writer_write(child_writer, 2, 4); /* tri0_v2_index */

   bit_writer_finish(child_writer);
}
/* Encodes one IR AABB into a GFX12 bit-packed primitive node at dst. The box
 * min/max take the place of the vertex payload; the 0xf v0/v1 indices in the
 * pair descriptor presumably mark the primitive as procedural — confirm
 * against the HW node format documentation.
 */
void
radv_encode_aabb_gfx12(VOID_REF dst, vk_ir_aabb_node src)
{
   bit_writer child_writer;
   bit_writer_init(child_writer, dst);

   /* Header: mirrors the triangle encoding but with zero-width vertex fields. */
   bit_writer_write(child_writer, 0, 5); /* x_vertex_bits_minus_one */
   bit_writer_write(child_writer, 0, 5); /* y_vertex_bits_minus_one */
   bit_writer_write(child_writer, 0, 5); /* z_vertex_bits_minus_one */
   bit_writer_write(child_writer, 0, 5); /* trailing_zero_bits */
   bit_writer_write(child_writer, 14, 4); /* geometry_index_base_bits_div_2 */
   bit_writer_write(child_writer, 14, 4); /* geometry_index_bits_div_2 */
   bit_writer_write(child_writer, 0, 3); /* triangle_pair_count_minus_one */
   bit_writer_write(child_writer, 0, 1); /* vertex_type */
   bit_writer_write(child_writer, 28, 5); /* primitive_index_base_bits */
   bit_writer_write(child_writer, 28, 5); /* primitive_index_bits */
   /* header + 6 floats + geometry_id */
   bit_writer_write(child_writer, RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE + 6 * 32 + 28, 10);

   /* Payload: the AABB bounds, then geometry and primitive index. */
   bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.min.x), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.min.y), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.min.z), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.max.x), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.max.y), 32);
   bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.max.z), 32);
   bit_writer_write(child_writer, src.geometry_id_and_flags & 0xfffffff, 28);
   bit_writer_write(child_writer, src.primitive_id, 28);

   /* The pair descriptors live at the very top of the 1024-bit node. */
   bit_writer_skip_to(child_writer, 32 * 32 - RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE);

   uint32_t opaque = (src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0 ? 1 : 0;

   bit_writer_write(child_writer, 1, 1); /* prim_range_stop */
   bit_writer_write(child_writer, 0, 1); /* tri1_double_sided */
   bit_writer_write(child_writer, 0, 1); /* tri1_opaque */
   bit_writer_write(child_writer, 0, 4); /* tri1_v0_index */
   bit_writer_write(child_writer, 0, 4); /* tri1_v1_index */
   bit_writer_write(child_writer, 0, 4); /* tri1_v2_index */
   bit_writer_write(child_writer, 0, 1); /* tri0_double_sided */
   bit_writer_write(child_writer, opaque, 1); /* tri0_opaque */
   bit_writer_write(child_writer, 0xf, 4); /* tri0_v0_index */
   bit_writer_write(child_writer, 0xf, 4); /* tri0_v1_index */
   bit_writer_write(child_writer, 0, 4); /* tri0_v2_index */

   bit_writer_finish(child_writer);
}
/* Writes both the HW node and user data. The HW instance node embeds a
 * 4-wide box node with one valid child pointing at the BLAS root; the user
 * data slot directly after it stores software metadata needed for copy,
 * serialization and update.
 */
void
radv_encode_instance_gfx12(VOID_REF dst, vk_ir_instance_node src)
{
   bit_writer child_writer;
   bit_writer_init(child_writer, dst);

   radv_accel_struct_header blas_header = DEREF(REF(radv_accel_struct_header)(src.base_ptr));

   mat4 transform = mat4(src.otw_matrix);
   /* NOTE(review): transpose(inverse(transpose(M))) is mathematically just
    * inverse(M); presumably written this way because of how mat3x4 rows map
    * to mat4 columns — confirm before simplifying. */
   mat4 wto_matrix = transpose(inverse(transpose(transform)));

   /* World-to-object matrix (12 dwords). */
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[0][0]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[0][1]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[0][2]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[0][3]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[1][0]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[1][1]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[1][2]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[1][3]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[2][0]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[2][1]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[2][2]), 32);
   bit_writer_write(child_writer, floatBitsToUint(wto_matrix[2][3]), 32);

   /* Translate VkGeometryInstanceFlags (high byte of sbt_offset_and_flags)
    * into HW pointer flags:
    *   1 = force opaque, 2 = force non-opaque,
    *   4 = facing cull disable (always set for AABB-only BLASes),
    *   8 = flip facing,
    *   256/512 presumably select procedural vs. triangle BLAS — confirm. */
   uint32_t flags = src.sbt_offset_and_flags >> 24;
   uint32_t instance_pointer_flags = 0;
   if ((flags & VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR) != 0)
      instance_pointer_flags |= 1;
   if ((flags & VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR) != 0)
      instance_pointer_flags |= 2;
   if ((flags & VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR) != 0 ||
       blas_header.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR)
      instance_pointer_flags |= 4;
   if ((flags & VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR) != 0)
      instance_pointer_flags |= 8;
   if (blas_header.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
      instance_pointer_flags |= 512;
   else
      instance_pointer_flags |= 256;

   /* BLAS root pointer with the flags above packed into bits 54+. */
   uint64_t bvh_addr = addr_to_node(src.base_ptr + blas_header.bvh_offset);
   bit_writer_write(child_writer, uint32_t(bvh_addr & 0xffffffff), 32);
   bit_writer_write(child_writer, uint32_t(bvh_addr >> 32) | (instance_pointer_flags << (54 - 32)), 32);

   /* Custom instance index, SBT offset and cull mask. */
   bit_writer_write(child_writer, src.custom_instance_and_mask & 0xffffff, 32);
   bit_writer_write(child_writer, src.sbt_offset_and_flags & 0xffffff, 24);
   bit_writer_write(child_writer, src.custom_instance_and_mask >> 24, 8);

   /* Embedded box node: origin is the BLAS AABB min ... */
   bit_writer_write(child_writer, floatBitsToUint(blas_header.aabb.min.x), 32);
   bit_writer_write(child_writer, floatBitsToUint(blas_header.aabb.min.y), 32);
   bit_writer_write(child_writer, floatBitsToUint(blas_header.aabb.min.z), 32);

   /* ... with per-axis power-of-two extent exponents (biased by 127). */
   vec3 child_extent = blas_header.aabb.max - blas_header.aabb.min;
   uvec3 child_extent_exponents = uvec3(ceil(clamp(log2(child_extent) + 127.0, vec3(0.0), vec3(255))));
   bit_writer_write(child_writer, child_extent_exponents.x, 8);
   bit_writer_write(child_writer, child_extent_exponents.y, 8);
   bit_writer_write(child_writer, child_extent_exponents.z, 8);
   bit_writer_write(child_writer, 0, 4);
   bit_writer_write(child_writer, 0, 4);

   /* One valid child spanning the full quantized range, typed as the BLAS
    * root (radv_bvh_node_box32); field layout mirrors radv_gfx12_box_child. */
   bit_writer_write(child_writer, 0, 12);
   bit_writer_write(child_writer, 0, 12);
   bit_writer_write(child_writer, 4, 8);
   bit_writer_write(child_writer, 0, 12);
   bit_writer_write(child_writer, 0xfff, 12);
   bit_writer_write(child_writer, 0xff, 8);
   bit_writer_write(child_writer, 0xfff, 12);
   bit_writer_write(child_writer, 0xfff, 12);
   bit_writer_write(child_writer, radv_bvh_node_box32, 4);
   bit_writer_write(child_writer, 1, 4);

   /* Invalidate the three remaining child slots (min > max). */
   for (uint32_t remaining_child_index = 0; remaining_child_index < 3; remaining_child_index++) {
      bit_writer_write(child_writer, 0xfff, 12);
      bit_writer_write(child_writer, 0xfff, 12);
      bit_writer_write(child_writer, 0xff, 8);
      bit_writer_write(child_writer, 0xfff, 12);
      bit_writer_write(child_writer, 0, 12);
      bit_writer_write(child_writer, 0, 8);
      bit_writer_write(child_writer, 0, 12);
      bit_writer_write(child_writer, 0, 12);
      bit_writer_write(child_writer, 0, 8);
   }

   bit_writer_finish(child_writer);

   /* Software metadata in the second 128-byte slot. */
   REF(radv_gfx12_instance_node_user_data) user_data =
      REF(radv_gfx12_instance_node_user_data)(dst + RADV_GFX12_BVH_NODE_SIZE);
   DEREF(user_data).otw_matrix = src.otw_matrix;
   DEREF(user_data).custom_instance = src.custom_instance_and_mask & 0xffffff;
   DEREF(user_data).instance_index = src.instance_id;
   DEREF(user_data).bvh_offset = blas_header.bvh_offset;
   DEREF(user_data).blas_addr = src.base_ptr;
   DEREF(user_data).primitive_base_indices_offset = blas_header.primitive_base_indices_offset;
   DEREF(user_data).leaf_node_offsets_offset = blas_header.leaf_node_offsets_offset;
}
#endif

View file

@ -0,0 +1,275 @@
/*
* Copyright © 2022 Friedrich Vock
* Copyright © 2025 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#version 460
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
#define GFX12
#include "build_helpers.h"
#include "build_interface.h"
#include "encode.h"
layout(push_constant) uniform CONSTS
{
encode_gfx12_args args;
};
/* Stores the parent id of a child node. Parent ids are laid out in reverse
 * order directly below the output BVH — one dword per 128-byte node slot
 * (node ids carry the offset divided by 8, hence child / 16).
 */
void
set_parent(uint32_t child, uint32_t parent)
{
   uint64_t entry_addr = args.output_base + args.output_bvh_offset;
   entry_addr -= uint64_t(child / 16 * 4 + 4);
   DEREF(REF(uint32_t)(entry_addr)) = parent;
}
/* Converts the binary IR BVH into the GFX12 BVH8 format. Each invocation owns
 * one IR internal node and spins until its parent has assigned it an output
 * offset; internal nodes are widened to up to 8 children by repeatedly
 * replacing internal children with their grandchildren.
 */
void
main()
{
   if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count)
      return;

   /* Revert the order so we start at the root */
   uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x;

   uint32_t ir_leaf_node_size;
   switch (args.geometry_type) {
   case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
      ir_leaf_node_size = SIZEOF(vk_ir_triangle_node);
      break;
   }
   case VK_GEOMETRY_TYPE_AABBS_KHR: {
      ir_leaf_node_size = SIZEOF(vk_ir_aabb_node);
      break;
   }
   default:
      /* instances */
      ir_leaf_node_size = SIZEOF(vk_ir_instance_node);
      break;
   }

   uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * ir_leaf_node_size;

   /* NOTE(review): this declaration is shadowed by the per-iteration
    * atomicAdd result inside the loop below and appears unused — confirm it
    * is intentional. */
   uint32_t dst_internal_offset = id_to_offset(RADV_BVH_ROOT_NODE);

   /* IR internal nodes are stored after all IR leaf nodes. */
   REF(vk_ir_box_node) intermediate_internal_nodes =
      REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size);
   REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id);
   vk_ir_box_node src = DEREF(src_node);

   bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1;

   /* Spin until the parent invocation publishes this node's output offset. */
   for (;;) {
      /* Make changes to the current node's BVH offset value visible. */
      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

      uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
      if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
         continue; /* Not assigned yet — keep polling. */
      if (bvh_offset == VK_NULL_BVH_OFFSET)
         break; /* This node was collapsed into its parent. */

      REF(radv_gfx12_box_node) dst = REF(radv_gfx12_box_node)(args.output_base + (args.output_bvh_offset + bvh_offset));
      uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);

      /* Start with the (up to two) children of the binary IR node. */
      uint32_t children[8];
      uint32_t found_child_count = 0;
      for (uint32_t i = 0; i < 2; i++) {
         if (src.children[i] != RADV_BVH_INVALID_NODE) {
            children[found_child_count] = src.children[i];
            found_child_count++;
         }
      }

      /* Widen to 8 children by replacing internal children with their
       * grandchildren; collapsed children are marked VK_NULL_BVH_OFFSET so
       * their invocations exit. */
      /* TODO: Collapse child nodes with high SAH values. */
      while (found_child_count < 8) {
         bool progress = false;
         for (int32_t i = 0; i < found_child_count; i++) {
            uint32_t child_id = children[i];
            if (ir_id_to_type(child_id) != vk_ir_node_internal)
               continue;
            progress = true;

            REF(vk_ir_box_node) child_node =
               REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child_id));
            uint32_t grandchildren[2] = DEREF(child_node).children;

            uint32_t valid_grandchild_count = 0;
            if (grandchildren[1] != RADV_BVH_INVALID_NODE)
               valid_grandchild_count++;
            if (grandchildren[0] != RADV_BVH_INVALID_NODE)
               valid_grandchild_count++;
            else
               grandchildren[0] = grandchildren[1];

            if (valid_grandchild_count > 1) {
               children[found_child_count] = grandchildren[1];
               found_child_count++;
            }

            if (valid_grandchild_count > 0) {
               children[i] = grandchildren[0];
            } else {
               /* The child had no valid grandchildren; drop it. */
               found_child_count--;
               children[i] = children[found_child_count];
            }

            DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;

            if (found_child_count == 8)
               break;
         }
         if (!progress)
            break;
      }

      /* Reserve contiguous output space for internal and leaf children
       * (instance leaves occupy two 128-byte slots). */
      uint32_t child_leaf_nodes_size = 0;
      uint32_t child_internal_nodes_size = 0;
      for (uint32_t i = 0; i < found_child_count; i++) {
         uint32_t type = ir_id_to_type(children[i]);
         if (type == vk_ir_node_internal)
            child_internal_nodes_size += RADV_GFX12_BVH_NODE_SIZE;
         else if (type == vk_ir_node_instance)
            child_leaf_nodes_size += 2 * RADV_GFX12_BVH_NODE_SIZE;
         else
            child_leaf_nodes_size += RADV_GFX12_BVH_NODE_SIZE;
      }

      uint32_t dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size);
      uint32_t dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size);

      /* Quantization grid: round the extent up to a power of two per axis. */
      vec3 origin = src.base.aabb.min;
      vec3 extent = src.base.aabb.max - src.base.aabb.min;
      extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
      uvec3 extent_exponents = floatBitsToUint(extent) >> 23;

      DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0);
      DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0);
      DEREF(dst).origin = origin;
      DEREF(dst).child_count_exponents =
         extent_exponents.x | (extent_exponents.y << 8) | (extent_exponents.z << 16) | ((found_child_count - 1) << 28);
      DEREF(dst).obb_matrix_index = 0x7f;

      for (uint32_t i = 0; i < found_child_count; i++) {
         uint32_t child_id = children[i];
         uint32_t type = ir_id_to_type(child_id);
         uint32_t offset = ir_id_to_offset(child_id);

         uint32_t child_node_size_128b = 1;
         uint32_t encoded_type = 0;
         uint32_t dst_offset = 0;
         uint32_t cull_mask = 0xff;
         if (type == vk_ir_node_internal) {
            encoded_type = 5;
            dst_offset = dst_internal_offset;

            /* Publish the child's output offset; its invocation spins on it. */
            REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
            DEREF(child_node).bvh_offset = dst_internal_offset;

            dst_internal_offset += RADV_GFX12_BVH_NODE_SIZE;
         } else {
            dst_offset = dst_leaf_offset;

            /* Write leaf node offset. */
            uint32_t child_index = offset / ir_leaf_node_size;
            REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset);
            child_dst_offset = INDEX(uint32_t, child_dst_offset, child_index);
            DEREF(child_dst_offset) = dst_offset;

            VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_leaf_offset;
            switch (args.geometry_type) {
            case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
               vk_ir_triangle_node src_node = DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, offset)));
               radv_encode_triangle_gfx12(dst_leaf_addr, src_node);
               dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE;
               break;
            }
            case VK_GEOMETRY_TYPE_AABBS_KHR: {
               vk_ir_aabb_node src_node = DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, offset)));
               radv_encode_aabb_gfx12(dst_leaf_addr, src_node);
               dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE;
               break;
            }
            default:
               /* instances */
               encoded_type = 6;
               child_node_size_128b = 2;
               vk_ir_instance_node src_node = DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, offset)));
               radv_encode_instance_gfx12(dst_leaf_addr, src_node);
               cull_mask = src_node.custom_instance_and_mask >> 24;
               dst_leaf_offset += 2 * RADV_GFX12_BVH_NODE_SIZE;
               break;
            }
         }

         vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, offset)).aabb;

         /* Quantize child bounds to 12-bit grid coords (floor for min,
          * ceil-1 for max so the box stays conservative). */
         radv_gfx12_box_child child;
         /* TODO: subtree flags culling */
         child.dword0 = min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
                        (min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
         /* TODO: subtree mask culling */
         child.dword1 =
            min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
            (min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12) |
            (cull_mask << 24);
         child.dword2 =
            min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
            (min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12) |
            (encoded_type << 24) | (child_node_size_128b << 28);
         DEREF(dst).children[i] = child;

         set_parent(pack_node_id(dst_offset, encoded_type), node_id);
      }

      /* Set remaining children to invalid */
      for (uint32_t i = found_child_count; i < 8; i++) {
         radv_gfx12_box_child null_child;
         null_child.dword0 = 0xffffffff;
         null_child.dword1 = 0xfff;
         null_child.dword2 = 0;
         DEREF(dst).children[i] = null_child;
      }

      /* Make changes to the children's BVH offset value available to the other invocations. */
      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
      break;
   }

   if (is_root_node) {
      REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(args.output_base);
      DEREF(header).aabb = src.base.aabb;
      DEREF(header).bvh_offset = args.output_bvh_offset;

      set_parent(RADV_BVH_ROOT_NODE, RADV_BVH_INVALID_NODE);
   }
}

View file

@ -3,9 +3,24 @@
# source file, output name, defines
bvh_shaders = [
[
'copy_blas_addrs_gfx12.comp',
'copy_blas_addrs_gfx12',
[],
],
[
'copy.comp',
'copy',
['GFX12=0'],
],
[
'copy.comp',
'copy_gfx12',
['GFX12=1'],
],
[
'encode_gfx12.comp',
'encode_gfx12',
[],
],
[
@ -28,6 +43,11 @@ bvh_shaders = [
'update',
[],
],
[
'update_gfx12.comp',
'update_gfx12',
[],
],
[
'leaf.comp',
'radv_leaf',

View file

@ -57,10 +57,10 @@ void main() {
vk_aabb bounds;
bool is_active;
if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
is_active = radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x);
is_active = radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x, false);
} else {
VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
is_active = radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x);
is_active = radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x, false);
}
if (!is_active)

View file

@ -11,7 +11,8 @@
#include "encode.h"
bool
radv_build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id)
radv_build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id,
bool gfx12)
{
bool is_valid = true;
triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id);
@ -56,13 +57,17 @@ radv_build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data
node.triangle_id = global_id;
node.geometry_id_and_flags = geom_data.geometry_id;
radv_encode_triangle_gfx10_3(dst_ptr, node);
if (gfx12)
radv_encode_triangle_gfx12(dst_ptr, node);
else
radv_encode_triangle_gfx10_3(dst_ptr, node);
return is_valid;
}
bool
radv_build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id)
radv_build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id,
bool gfx12)
{
bool is_valid = true;
@ -87,10 +92,14 @@ radv_build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32
#endif
vk_ir_aabb_node node;
node.base.aabb = bounds;
node.primitive_id = global_id;
node.geometry_id_and_flags = geometry_id;
radv_encode_aabb_gfx10_3(dst_ptr, node);
if (gfx12)
radv_encode_aabb_gfx12(dst_ptr, node);
else
radv_encode_aabb_gfx10_3(dst_ptr, node);
return is_valid;
}

View file

@ -0,0 +1,213 @@
/*
* Copyright © 2025 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#version 460
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
#include "build_interface.h"
#include "update.h"
layout(push_constant) uniform CONSTS
{
update_args args;
};
/* Loads the parent id of a node from the parent-pointer array stored in
 * reverse order directly below the BVH (one dword per 128-byte node slot).
 */
uint32_t
fetch_parent_node(VOID_REF bvh, uint32_t node)
{
   return DEREF(REF(uint32_t)(bvh - uint64_t(node / 16 * 4 + 4)));
}
void
main()
{
   /* BVH update (refit) for the GFX12 BVH8 layout. One invocation per leaf
    * primitive: re-encode the leaf from the updated application data, then
    * walk towards the root, refitting every internal (box) node whose
    * children have all been processed.
    */
   uint32_t bvh_offset = DEREF(args.src).bvh_offset;

   VOID_REF src_bvh = OFFSET(args.src, bvh_offset);
   VOID_REF dst_bvh = OFFSET(args.dst, bvh_offset);
   /* Per-leaf byte offsets of the leaf nodes inside the BVH. */
   VOID_REF leaf_node_offsets = OFFSET(args.src, DEREF(args.src).leaf_node_offsets_offset);

   /* Triangle and AABB geometry both use primitive nodes; instance nodes
    * carry an extra user-data trailer. */
   uint32_t leaf_node_size;
   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
      leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
   else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR)
      leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
   else
      leaf_node_size = SIZEOF(radv_gfx12_instance_node) + SIZEOF(radv_gfx12_instance_node_user_data);

   uint32_t leaf_node_id = args.geom_data.first_id + gl_GlobalInvocationID.x;
   /* Leaf nodes start directly after the root box node. */
   uint32_t first_leaf_offset = id_to_offset(RADV_BVH_ROOT_NODE) + SIZEOF(radv_gfx12_box_node);

   uint32_t dst_offset = DEREF(INDEX(uint32_t, leaf_node_offsets, leaf_node_id));
   VOID_REF dst_ptr = OFFSET(dst_bvh, dst_offset);
   uint32_t src_offset = gl_GlobalInvocationID.x * args.geom_data.stride;

   /* Re-encode the leaf node and compute its updated AABB. */
   vk_aabb bounds;
   bool is_active;
   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
      is_active = radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x, true);
   } else {
      VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
      is_active = radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x, true);
   }

   /* Inactive primitives are not referenced by the tree; nothing to refit. */
   if (!is_active)
      return;

   /* Publish the leaf bounds so invocations refitting shared ancestors can
    * read them. */
   DEREF(INDEX(vk_aabb, args.leaf_bounds, (dst_offset - first_leaf_offset) / leaf_node_size)) = bounds;

   memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                 gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

   uint32_t node_id = pack_node_id(dst_offset, 0);
   uint32_t parent_id = fetch_parent_node(src_bvh, node_id);
   uint32_t internal_nodes_offset = first_leaf_offset + args.leaf_node_count * leaf_node_size;
   /* Ascend towards the root. Only the last invocation arriving at a node
    * (tracked through internal_ready_count) continues upwards. */
   while (parent_id != RADV_BVH_INVALID_NODE) {
      uint32_t offset = id_to_offset(parent_id);

      /* internal_ready_count slot 0 is reserved for the root node. */
      uint32_t parent_index = (offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1;
      if (parent_id == RADV_BVH_ROOT_NODE)
         parent_index = 0;

      /* Make accesses to internal nodes in dst_bvh available and visible */
      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

      REF(radv_gfx12_box_node) src_node = REF(radv_gfx12_box_node) OFFSET(src_bvh, offset);
      REF(radv_gfx12_box_node) dst_node = REF(radv_gfx12_box_node) OFFSET(dst_bvh, offset);

      uint32_t valid_child_count_minus_one = DEREF(src_node).child_count_exponents >> 28;

      /* Check if all children have been processed. As this is an atomic the last path coming from
       * a child will pass here, while earlier paths break.
       */
      uint32_t ready_child_count = atomicAdd(
         DEREF(INDEX(uint32_t, args.internal_ready_count, parent_index)), 1, gl_ScopeDevice, gl_StorageSemanticsBuffer,
         gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
      if (ready_child_count != valid_child_count_minus_one)
         break;

      /* Child offsets are not stored directly: they are reconstructed by
       * accumulating the per-child encoded sizes on top of the per-type base
       * ids (internal vs. primitive children). */
      uint32_t child_internal_id = DEREF(src_node).internal_base_id;
      uint32_t child_primitive_id = DEREF(src_node).primitive_base_id;
      DEREF(dst_node).internal_base_id = child_internal_id;
      DEREF(dst_node).primitive_base_id = child_primitive_id;

      uint32_t child_offsets[8];

      /* First pass: gather all child AABBs and merge them into the new
       * parent bounds. */
      vk_aabb total_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY));
      for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
         radv_gfx12_box_child child = DEREF(src_node).children[i];
         uint32_t child_type = (child.dword2 >> 24) & 0xf;
         /* Child size is encoded in units of RADV_GFX12_BVH_NODE_SIZE / 8. */
         uint32_t child_size_id = (child.dword2 >> 28) * RADV_GFX12_BVH_NODE_SIZE / 8;
         uint32_t child_id;
         if (child_type == radv_bvh_node_box32) {
            child_id = child_internal_id;
            child_internal_id += child_size_id;
         } else {
            child_id = child_primitive_id;
            child_primitive_id += child_size_id;
         }
         child_offsets[i] = id_to_offset(child_id);

         uint32_t child_offset = child_offsets[i];
         vk_aabb child_aabb;
         if (child_offset == dst_offset) {
            /* Our own leaf: the bounds are still live in registers. */
            child_aabb = bounds;
         } else {
            uint32_t child_index;
            if (child_offset >= internal_nodes_offset) {
               /* Internal-node bounds are stored after all leaf bounds in
                * the leaf_bounds array. */
               child_index =
                  (child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1 + args.leaf_node_count;
            } else {
               child_index = (child_offset - first_leaf_offset) / leaf_node_size;
            }
            child_aabb = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index));
         }

         total_bounds.min = min(total_bounds.min, child_aabb.min);
         total_bounds.max = max(total_bounds.max, child_aabb.max);
      }

      vec3 origin = total_bounds.min;
      vec3 extent = total_bounds.max - total_bounds.min;
      /* Round each extent component up to the next power of two so it is
       * representable by an exponent alone. */
      extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);

      uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
      DEREF(dst_node).origin = origin;
      DEREF(dst_node).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
                                              (extent_exponents.z << 16) | (valid_child_count_minus_one << 28);
      /* NOTE(review): 0x7f presumably means "no OBB" — confirm against the
       * GFX12 node layout. */
      DEREF(dst_node).obb_matrix_index = 0x7f;

      /* Second pass: re-quantize every child's AABB into the new parent
       * frame (12-bit coordinates relative to origin/extent). */
      for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
         uint32_t child_offset = child_offsets[i];
         vk_aabb child_aabb;
         if (child_offset == dst_offset) {
            child_aabb = bounds;
         } else {
            uint32_t child_index;
            if (child_offset >= internal_nodes_offset) {
               child_index =
                  (child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1 + args.leaf_node_count;
            } else {
               child_index = (child_offset - first_leaf_offset) / leaf_node_size;
            }
            child_aabb = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index));
         }

         radv_gfx12_box_child child = DEREF(src_node).children[i];
         radv_gfx12_box_child box_child;
         /* Keep the top byte of each dword (type/size metadata, masked with
          * 0xFF000000) and rewrite the quantized coordinates: mins round
          * down, maxs round up and are stored minus one. */
         box_child.dword0 =
            (child.dword0 & 0xFF000000) |
            min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
            (min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
         box_child.dword1 =
            (child.dword1 & 0xFF000000) |
            min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
            (min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12);
         box_child.dword2 =
            (child.dword2 & 0xFF000000) |
            min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
            (min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12);
         DEREF(dst_node).children[i] = box_child;
      }

      /* Disable the unused child slots.
       * NOTE(review): min=0xfff/max=0 (inverted quantized bounds) appears to
       * mark the child invalid — confirm against the HW node encoding. */
      for (uint32_t i = valid_child_count_minus_one + 1; i < 8; i++) {
         radv_gfx12_box_child null_child;
         null_child.dword0 = 0xffffffff;
         null_child.dword1 = 0xfff;
         null_child.dword2 = 0;
         DEREF(dst_node).children[i] = null_child;
      }

      if (parent_id == RADV_BVH_ROOT_NODE)
         DEREF(args.dst).aabb = total_bounds;
      /* Publish this node's bounds for the next level up. */
      DEREF(INDEX(vk_aabb, args.leaf_bounds, parent_index + args.leaf_node_count)) = total_bounds;

      parent_id = fetch_parent_node(src_bvh, parent_id);
   }
}

View file

@ -147,6 +147,7 @@ libradv_files = files(
'radv_rmv.c',
'radv_rmv.h',
'radv_rra_gfx10_3.c',
'radv_rra_gfx12.c',
'radv_rra.c',
'radv_rra.h',
'radv_sampler.c',

View file

@ -241,8 +241,11 @@ enum rq_intersection_type { intersection_type_none, intersection_type_triangle,
static void
lower_rq_initialize(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query_vars *vars, nir_deref_instr *rq,
struct radv_instance *instance)
struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_instance *instance = radv_physical_device_instance(pdev);
nir_deref_instr *closest = rq_deref(b, rq, closest);
nir_deref_instr *candidate = rq_deref(b, rq, candidate);
@ -270,7 +273,7 @@ lower_rq_initialize(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query
b, 1, 32, nir_iadd_imm(b, accel_struct, offsetof(struct radv_accel_struct_header, bvh_offset)),
.access = ACCESS_NON_WRITEABLE);
nir_def *bvh_base = nir_iadd(b, accel_struct, nir_u2u64(b, bvh_offset));
bvh_base = build_addr_to_node(b, bvh_base);
bvh_base = build_addr_to_node(device, b, bvh_base, instr->src[2].ssa);
rq_store(b, rq, root_bvh_base, bvh_base);
rq_store(b, rq, trav_bvh_base, bvh_base);
@ -320,44 +323,27 @@ lower_rq_load(struct radv_device *device, nir_builder *b, nir_intrinsic_instr *i
return isec_load(b, intersection, frontface);
case nir_ray_query_value_intersection_geometry_index:
return nir_iand_imm(b, isec_load(b, intersection, geometry_id_and_flags), 0xFFFFFF);
case nir_ray_query_value_intersection_instance_custom_index: {
nir_def *instance_node_addr = isec_load(b, intersection, instance_addr);
return nir_iand_imm(
b,
nir_build_load_global(
b, 1, 32,
nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, custom_instance_and_mask))),
0xFFFFFF);
}
case nir_ray_query_value_intersection_instance_id: {
nir_def *instance_node_addr = isec_load(b, intersection, instance_addr);
return nir_build_load_global(
b, 1, 32, nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, instance_id)));
}
case nir_ray_query_value_intersection_instance_custom_index:
return radv_load_custom_instance(device, b, isec_load(b, intersection, instance_addr));
case nir_ray_query_value_intersection_instance_id:
return radv_load_instance_id(device, b, isec_load(b, intersection, instance_addr));
case nir_ray_query_value_intersection_instance_sbt_index:
return nir_iand_imm(b, isec_load(b, intersection, sbt_offset_and_flags), 0xFFFFFF);
case nir_ray_query_value_intersection_object_ray_direction: {
nir_def *instance_node_addr = isec_load(b, intersection, instance_addr);
nir_def *wto_matrix[3];
nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix);
radv_load_wto_matrix(device, b, isec_load(b, intersection, instance_addr), wto_matrix);
return nir_build_vec3_mat_mult(b, rq_load(b, rq, direction), wto_matrix, false);
}
case nir_ray_query_value_intersection_object_ray_origin: {
nir_def *instance_node_addr = isec_load(b, intersection, instance_addr);
nir_def *wto_matrix[3];
nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix);
radv_load_wto_matrix(device, b, isec_load(b, intersection, instance_addr), wto_matrix);
return nir_build_vec3_mat_mult(b, rq_load(b, rq, origin), wto_matrix, true);
}
case nir_ray_query_value_intersection_object_to_world: {
nir_def *instance_node_addr = isec_load(b, intersection, instance_addr);
nir_def *rows[3];
for (unsigned r = 0; r < 3; ++r)
rows[r] = nir_build_load_global(
b, 4, 32,
nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, otw_matrix) + r * 16));
return nir_vec3(b, nir_channel(b, rows[0], column), nir_channel(b, rows[1], column),
nir_channel(b, rows[2], column));
nir_def *otw_matrix[3];
radv_load_otw_matrix(device, b, isec_load(b, intersection, instance_addr), otw_matrix);
return nir_vec3(b, nir_channel(b, otw_matrix[0], column), nir_channel(b, otw_matrix[1], column),
nir_channel(b, otw_matrix[2], column));
}
case nir_ray_query_value_intersection_primitive_index:
return isec_load(b, intersection, primitive_id);
@ -371,10 +357,8 @@ lower_rq_load(struct radv_device *device, nir_builder *b, nir_intrinsic_instr *i
return intersection_type;
}
case nir_ray_query_value_intersection_world_to_object: {
nir_def *instance_node_addr = isec_load(b, intersection, instance_addr);
nir_def *wto_matrix[3];
nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix);
radv_load_wto_matrix(device, b, isec_load(b, intersection, instance_addr), wto_matrix);
nir_def *vals[3];
for (unsigned i = 0; i < 3; ++i)
@ -477,6 +461,8 @@ static nir_def *
lower_rq_proceed(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query_vars *vars, nir_deref_instr *rq,
struct radv_device *device)
{
struct radv_physical_device *pdev = radv_device_physical(device);
nir_deref_instr *closest = rq_deref(b, rq, closest);
nir_deref_instr *candidate = rq_deref(b, rq, candidate);
@ -543,7 +529,11 @@ lower_rq_proceed(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query_va
nir_push_if(b, rq_load(b, rq, incomplete));
{
nir_def *incomplete = radv_build_ray_traversal(device, b, &args);
nir_def *incomplete;
if (radv_use_bvh8(pdev))
incomplete = radv_build_ray_traversal_gfx12(device, b, &args);
else
incomplete = radv_build_ray_traversal(device, b, &args);
rq_store(b, rq, incomplete, nir_iand(b, rq_load(b, rq, incomplete), incomplete));
}
nir_pop_if(b, NULL);
@ -571,7 +561,7 @@ bool
radv_nir_lower_ray_queries(struct nir_shader *shader, struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_instance *instance = radv_physical_device_instance(pdev);
bool progress = false;
struct hash_table *query_ht = _mesa_pointer_hash_table_create(NULL);
@ -626,7 +616,7 @@ radv_nir_lower_ray_queries(struct nir_shader *shader, struct radv_device *device
lower_rq_generate_intersection(&builder, intrinsic, rq);
break;
case nir_intrinsic_rq_initialize:
lower_rq_initialize(&builder, intrinsic, vars, rq, instance);
lower_rq_initialize(&builder, intrinsic, vars, rq, device);
break;
case nir_intrinsic_rq_load:
new_dest = lower_rq_load(device, &builder, intrinsic, rq);

View file

@ -267,11 +267,27 @@ intersect_ray_amd_software_tri(struct radv_device *device, nir_builder *b, nir_d
}
nir_def *
build_addr_to_node(nir_builder *b, nir_def *addr)
build_addr_to_node(struct radv_device *device, nir_builder *b, nir_def *addr, nir_def *flags)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const uint64_t bvh_size = 1ull << 42;
nir_def *node = nir_ushr_imm(b, addr, 3);
return nir_iand_imm(b, node, (bvh_size - 1) << 3);
node = nir_iand_imm(b, node, (bvh_size - 1) << 3);
if (radv_use_bvh8(pdev)) {
/* The HW ray flags are the same bits as the API flags.
* - SpvRayFlagsTerminateOnFirstHitKHRMask, SpvRayFlagsSkipClosestHitShaderKHRMask are handled in shader code.
* - SpvRayFlagsSkipTrianglesKHRMask, SpvRayFlagsSkipAABBsKHRMask do not work.
*/
flags = nir_iand_imm(b, flags,
SpvRayFlagsOpaqueKHRMask | SpvRayFlagsNoOpaqueKHRMask |
SpvRayFlagsCullBackFacingTrianglesKHRMask | SpvRayFlagsCullFrontFacingTrianglesKHRMask |
SpvRayFlagsCullOpaqueKHRMask | SpvRayFlagsCullNoOpaqueKHRMask);
node = nir_ior(b, node, nir_ishl_imm(b, nir_u2u64(b, flags), 54));
}
return node;
}
static nir_def *
@ -302,20 +318,57 @@ nir_build_vec3_mat_mult(nir_builder *b, nir_def *vec, nir_def *matrix[], bool tr
return nir_vec(b, result_components, 3);
}
/* Load the 3x4 world-to-object matrix of a radv_bvh_instance_node as three
 * vec4 rows into out[0..2].
 * NOTE(review): .align_offset is the raw byte offset and can exceed
 * .align_mul (64) for the later rows — the radv_load_wto_matrix replacement
 * applies "% 64"; confirm nir_build_load_global's expectations. */
void
nir_build_wto_matrix_load(nir_builder *b, nir_def *instance_addr, nir_def **out)
{
   unsigned offset = offsetof(struct radv_bvh_instance_node, wto_matrix);
   for (unsigned i = 0; i < 3; ++i) {
      out[i] = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, instance_addr, offset + i * 16), .align_mul = 64,
                                     .align_offset = offset + i * 16);
   }
}
nir_def *
radv_load_vertex_position(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def *geometry_id,
nir_def *primitive_id, uint32_t index)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
if (radv_use_bvh8(pdev)) {
nir_def *addr_offsets =
nir_build_load_global(b, 4, 32,
nir_iadd_imm(b, instance_addr,
sizeof(struct radv_gfx12_instance_node) +
offsetof(struct radv_gfx12_instance_node_user_data, blas_addr)));
nir_def *bvh_offset =
nir_build_load_global(b, 1, 32,
nir_iadd_imm(b, instance_addr,
sizeof(struct radv_gfx12_instance_node) +
offsetof(struct radv_gfx12_instance_node_user_data, bvh_offset)));
nir_def *addr = nir_pack_64_2x32(b, nir_channels(b, addr_offsets, 0x3));
nir_def *base_index_offset =
nir_iadd(b, nir_channel(b, addr_offsets, 2), nir_imul_imm(b, geometry_id, sizeof(uint32_t)));
nir_def *base_index = nir_build_load_global(b, 1, 32, nir_iadd(b, addr, nir_u2u64(b, base_index_offset)));
nir_def *offset_offset = nir_iadd(b, nir_channel(b, addr_offsets, 3),
nir_imul_imm(b, nir_iadd(b, base_index, primitive_id), sizeof(uint32_t)));
nir_def *offset = nir_build_load_global(b, 1, 32, nir_iadd(b, addr, nir_u2u64(b, offset_offset)));
offset = nir_iadd(b, offset, bvh_offset);
/* Assume that vertices are uncompressed. */
offset = nir_iadd_imm(b, offset,
ROUND_DOWN_TO(RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE / 8, 4) + index * 3 * sizeof(float));
nir_def *data[4];
for (uint32_t i = 0; i < ARRAY_SIZE(data); i++) {
data[i] = nir_build_load_global(b, 1, 32, nir_iadd(b, addr, nir_u2u64(b, offset)));
offset = nir_iadd_imm(b, offset, 4);
}
uint32_t subdword_offset = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE % 32;
nir_def *vertices[3];
for (uint32_t i = 0; i < ARRAY_SIZE(vertices); i++) {
nir_def *lo = nir_ubitfield_extract_imm(b, data[i], subdword_offset, 32 - subdword_offset);
nir_def *hi = nir_ubitfield_extract_imm(b, data[i + 1], 0, subdword_offset);
vertices[i] = nir_ior(b, lo, nir_ishl_imm(b, hi, 32 - subdword_offset));
}
return nir_vec3(b, vertices[0], vertices[1], vertices[2]);
}
nir_def *bvh_addr_id =
nir_build_load_global(b, 1, 64, nir_iadd_imm(b, instance_addr, offsetof(struct radv_bvh_instance_node, bvh_ptr)));
nir_def *bvh_addr = build_node_to_addr(device, b, bvh_addr_id, true);
@ -335,6 +388,74 @@ radv_load_vertex_position(struct radv_device *device, nir_builder *b, nir_def *i
return nir_build_load_global(b, 3, 32, nir_iadd(b, bvh_addr, nir_u2u64(b, offset)));
}
/* Load the 3x4 world-to-object matrix of an instance node as three vec4 rows
 * into out[0..2]. The matrix location depends on the node layout: BVH8
 * (GFX12) instance nodes store it at a different offset than the legacy
 * radv_bvh_instance_node. */
void
radv_load_wto_matrix(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def **out)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   const unsigned base = radv_use_bvh8(pdev) ? offsetof(struct radv_gfx12_instance_node, wto_matrix)
                                             : offsetof(struct radv_bvh_instance_node, wto_matrix);

   for (unsigned row = 0; row < 3; row++) {
      const unsigned row_offset = base + row * 16;
      out[row] = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, instance_addr, row_offset), .align_mul = 64,
                                       .align_offset = row_offset % 64);
   }
}
/* Load the 3x4 object-to-world matrix of an instance node as three vec4 rows
 * into out[0..2]. On BVH8 (GFX12) the matrix lives in the user-data trailer
 * that follows the HW instance node. */
void
radv_load_otw_matrix(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def **out)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   unsigned base;
   if (radv_use_bvh8(pdev))
      base = sizeof(struct radv_gfx12_instance_node) +
             offsetof(struct radv_gfx12_instance_node_user_data, otw_matrix);
   else
      base = offsetof(struct radv_bvh_instance_node, otw_matrix);

   for (unsigned row = 0; row < 3; row++) {
      const unsigned row_offset = base + row * 16;
      out[row] = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, instance_addr, row_offset), .align_mul = 64,
                                       .align_offset = row_offset % 64);
   }
}
/* Load the 24-bit application-provided custom instance index of an instance
 * node. BVH8 (GFX12) nodes store it unpacked in the user-data trailer, the
 * legacy layout packs it together with the cull mask. */
nir_def *
radv_load_custom_instance(struct radv_device *device, nir_builder *b, nir_def *instance_addr)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (radv_use_bvh8(pdev)) {
      const unsigned offset = sizeof(struct radv_gfx12_instance_node) +
                              offsetof(struct radv_gfx12_instance_node_user_data, custom_instance);
      return nir_build_load_global(b, 1, 32, nir_iadd_imm(b, instance_addr, offset));
   }

   /* Legacy layout: custom index in the low 24 bits, cull mask in the top 8. */
   nir_def *packed = nir_build_load_global(
      b, 1, 32, nir_iadd_imm(b, instance_addr, offsetof(struct radv_bvh_instance_node, custom_instance_and_mask)));
   return nir_iand_imm(b, packed, 0xFFFFFF);
}
/* Load the instance index (position within the TLAS build input) of an
 * instance node. On BVH8 (GFX12) it lives in the user-data trailer following
 * the HW instance node. */
nir_def *
radv_load_instance_id(struct radv_device *device, nir_builder *b, nir_def *instance_addr)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   unsigned offset;
   if (radv_use_bvh8(pdev))
      offset = sizeof(struct radv_gfx12_instance_node) +
               offsetof(struct radv_gfx12_instance_node_user_data, instance_index);
   else
      offset = offsetof(struct radv_bvh_instance_node, instance_id);

   return nir_build_load_global(b, 1, 32, nir_iadd_imm(b, instance_addr, offset));
}
/* When a hit is opaque the any_hit shader is skipped for this hit and the hit
* is assumed to be an actual hit. */
static nir_def *
@ -356,11 +477,12 @@ create_bvh_descriptor(nir_builder *b, const struct radv_physical_device *pdev, s
* instances at the cost of having to use 64-bit node ids. */
const uint64_t bvh_size = 1ull << 42;
const uint32_t sort_triangles_first = radv_use_bvh8(pdev) ? BITFIELD_BIT(52 - 32) : 0;
const uint32_t box_sort_enable = BITFIELD_BIT(63 - 32);
const uint32_t triangle_return_mode = BITFIELD_BIT(120 - 96); /* Return IJ for triangles */
uint32_t dword0 = 0;
nir_def *dword1 = nir_imm_intN_t(b, box_sort_enable, 32);
nir_def *dword1 = nir_imm_intN_t(b, sort_triangles_first | box_sort_enable, 32);
uint32_t dword2 = (bvh_size - 1) & 0xFFFFFFFFu;
uint32_t dword3 = ((bvh_size - 1) >> 32) | triangle_return_mode | (1u << 31);
@ -373,9 +495,20 @@ create_bvh_descriptor(nir_builder *b, const struct radv_physical_device *pdev, s
/* Only use largest/midpoint sorting when all invocations have the same ray flags, otherwise
* fall back to the default closest point. */
dword1 = nir_bcsel(b, nir_vote_any(b, 1, ray_flags->terminate_on_first_hit), dword1,
nir_imm_int(b, (box_sort_midpoint << 21) | box_sort_enable));
nir_imm_int(b, (box_sort_midpoint << 21) | sort_triangles_first | box_sort_enable));
dword1 = nir_bcsel(b, nir_vote_all(b, 1, ray_flags->terminate_on_first_hit),
nir_imm_int(b, (box_sort_largest << 21) | box_sort_enable), dword1);
nir_imm_int(b, (box_sort_largest << 21) | sort_triangles_first | box_sort_enable), dword1);
}
if (radv_use_bvh8(pdev)) {
/* compressed_format_en */
dword3 |= BITFIELD_BIT(115 - 96);
/* wide_sort_en */
dword3 |= BITFIELD_BIT(117 - 96);
/* instance_en */
dword3 |= BITFIELD_BIT(118 - 96);
/* pointer_flags */
dword3 |= BITFIELD_BIT(119 - 96);
}
return nir_vec4(b, nir_imm_intN_t(b, dword0, 32), dword1, nir_imm_intN_t(b, dword2, 32), nir_imm_intN_t(b, dword3, 32));
@ -439,6 +572,36 @@ insert_traversal_triangle_case(struct radv_device *device, nir_builder *b, const
nir_pop_if(b, NULL);
}
/* Emit the triangle-hit handling for the GFX12 BVH8 traversal loop. Unpacks
 * the triangle intersection from the bvh8_intersect_ray result vector and,
 * if it lies within (tmin, tmax) and passes the opaque/non-opaque culling
 * flags, invokes the caller-provided triangle callback.
 *
 * Result-vector layout used here (as consumed below): channel 0 = t,
 * channels 1-2 = barycentrics, channel 3 = frontface bit (bit 0, inverted)
 * plus primitive id (bits 1+), channel 2 bit 31 = non-opaque flag (inverted
 * to get "opaque"), channel 8 bits 2+ = geometry id and flags.
 * NOTE(review): layout inferred from the unpacking code below — confirm
 * against the HW instruction's documented return format. */
static void
insert_traversal_triangle_case_gfx12(struct radv_device *device, nir_builder *b,
                                     const struct radv_ray_traversal_args *args, const struct radv_ray_flags *ray_flags,
                                     nir_def *result, nir_def *bvh_node)
{
   /* Nothing to do if the caller does not handle triangle hits. */
   if (!args->triangle_cb)
      return;

   struct radv_triangle_intersection intersection;
   intersection.t = nir_channel(b, result, 0);

   /* Only accept hits strictly inside the current (tmin, tmax) interval. */
   nir_push_if(b, nir_iand(b, nir_flt(b, intersection.t, nir_load_deref(b, args->vars.tmax)),
                           nir_flt(b, args->tmin, intersection.t)));
   {
      intersection.frontface = nir_inot(b, nir_test_mask(b, nir_channel(b, result, 3), 1));
      intersection.base.node_addr = build_node_to_addr(device, b, bvh_node, false);
      intersection.base.primitive_id = nir_ishr_imm(b, nir_channel(b, result, 3), 1);
      intersection.base.geometry_id_and_flags = nir_ishr_imm(b, nir_channel(b, result, 8), 2);
      intersection.base.opaque = nir_inot(b, nir_test_mask(b, nir_channel(b, result, 2), 1u << 31));
      intersection.barycentrics = nir_fabs(b, nir_channels(b, result, 0x3 << 1));

      /* Apply CullOpaque/CullNoOpaque before invoking the callback. */
      nir_push_if(b, nir_bcsel(b, intersection.base.opaque, ray_flags->no_cull_opaque, ray_flags->no_cull_no_opaque));
      {
         args->triangle_cb(b, &intersection, args, ray_flags);
      }
      nir_pop_if(b, NULL);
   }
   nir_pop_if(b, NULL);
}
static void
insert_traversal_aabb_case(struct radv_device *device, nir_builder *b, const struct radv_ray_traversal_args *args,
const struct radv_ray_flags *ray_flags, nir_def *bvh_node)
@ -466,11 +629,31 @@ insert_traversal_aabb_case(struct radv_device *device, nir_builder *b, const str
nir_pop_if(b, NULL);
}
static nir_def *
fetch_parent_node(nir_builder *b, nir_def *bvh, nir_def *node)
static void
insert_traversal_aabb_case_gfx12(struct radv_device *device, nir_builder *b, const struct radv_ray_traversal_args *args,
const struct radv_ray_flags *ray_flags, nir_def *result, nir_def *bvh_node)
{
nir_def *offset = nir_iadd_imm(b, nir_imul_imm(b, nir_udiv_imm(b, node, 8), 4), 4);
if (!args->aabb_cb)
return;
struct radv_leaf_intersection intersection;
intersection.node_addr = build_node_to_addr(device, b, bvh_node, false);
intersection.primitive_id = nir_ishr_imm(b, nir_channel(b, result, 3), 1);
intersection.geometry_id_and_flags = nir_ishr_imm(b, nir_channel(b, result, 8), 2);
intersection.opaque = nir_inot(b, nir_test_mask(b, nir_channel(b, result, 2), 1u << 31));
nir_push_if(b, nir_bcsel(b, intersection.opaque, ray_flags->no_cull_opaque, ray_flags->no_cull_no_opaque));
{
args->aabb_cb(b, &intersection, args);
}
nir_pop_if(b, NULL);
}
/* Emit a load of the parent link of 'node'. Parent links are stored as a
 * uint32_t array growing downwards from the BVH base address, indexed by the
 * node id divided by the per-link granule (16 for BVH8 nodes, 8 otherwise). */
static nir_def *
fetch_parent_node(struct radv_device *device, nir_builder *b, nir_def *bvh, nir_def *node)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   const uint32_t ids_per_link = radv_use_bvh8(pdev) ? 16 : 8;
   nir_def *link_index = nir_udiv_imm(b, node, ids_per_link);
   nir_def *offset = nir_iadd_imm(b, nir_imul_imm(b, link_index, 4), 4);

   return nir_build_load_global(b, 1, 32, nir_isub(b, bvh, nir_u2u64(b, offset)), .align_mul = 4);
}
@ -547,7 +730,7 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc
nir_def *prev = nir_load_deref(b, args->vars.previous_node);
nir_def *bvh_addr = build_node_to_addr(device, b, nir_load_deref(b, args->vars.bvh_base), true);
nir_def *parent = fetch_parent_node(b, bvh_addr, prev);
nir_def *parent = fetch_parent_node(device, b, bvh_addr, prev);
nir_push_if(b, nir_ieq_imm(b, parent, RADV_BVH_INVALID_NODE));
{
nir_store_var(b, incomplete, nir_imm_false(b), 0x1);
@ -615,7 +798,7 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc
nir_build_load_global(b, 4, 32, instance_node_addr, .align_mul = 64, .align_offset = 0);
nir_def *wto_matrix[3];
nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix);
radv_load_wto_matrix(device, b, instance_node_addr, wto_matrix);
nir_store_deref(b, args->vars.sbt_offset_and_flags, nir_channel(b, instance_data, 3), 1);
@ -718,3 +901,205 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc
return nir_load_var(b, incomplete);
}
/* Emit the GFX12 (BVH8) ray traversal loop, built around the
 * nir_bvh8_intersect_ray_amd HW intersection instruction. Mirrors
 * radv_build_ray_traversal for the legacy node formats: the loop pops nodes
 * from a short stack, falls back to parent-pointer backtracking when the
 * stack under-runs its low watermark, and dispatches box/instance/leaf
 * results to the caller-provided callbacks in args.
 *
 * Returns a boolean nir_def that is true while traversal is incomplete
 * (i.e. the loop exited for a reason other than exhausting the tree). */
nir_def *
radv_build_ray_traversal_gfx12(struct radv_device *device, nir_builder *b, const struct radv_ray_traversal_args *args)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   nir_variable *incomplete = nir_local_variable_create(b->impl, glsl_bool_type(), "incomplete");
   nir_store_var(b, incomplete, nir_imm_true(b), 0x1);

   /* Decompose the API ray flags into per-flag booleans once, outside the loop. */
   struct radv_ray_flags ray_flags = {
      .force_opaque = radv_test_flag(b, args, SpvRayFlagsOpaqueKHRMask, true),
      .force_not_opaque = radv_test_flag(b, args, SpvRayFlagsNoOpaqueKHRMask, true),
      .terminate_on_first_hit = radv_test_flag(b, args, SpvRayFlagsTerminateOnFirstHitKHRMask, true),
      .no_cull_front = radv_test_flag(b, args, SpvRayFlagsCullFrontFacingTrianglesKHRMask, false),
      .no_cull_back = radv_test_flag(b, args, SpvRayFlagsCullBackFacingTrianglesKHRMask, false),
      .no_cull_opaque = radv_test_flag(b, args, SpvRayFlagsCullOpaqueKHRMask, false),
      .no_cull_no_opaque = radv_test_flag(b, args, SpvRayFlagsCullNoOpaqueKHRMask, false),
      .no_skip_triangles = radv_test_flag(b, args, SpvRayFlagsSkipTrianglesKHRMask, false),
      .no_skip_aabbs = radv_test_flag(b, args, SpvRayFlagsSkipAABBsKHRMask, false),
   };
   nir_def *desc = create_bvh_descriptor(b, pdev, &ray_flags);

   nir_push_loop(b);
   {
      /* No current node: pop the next node from the stack, or backtrack via
       * parent pointers if the stack has dropped below the low watermark. */
      nir_push_if(b, nir_ieq_imm(b, nir_load_deref(b, args->vars.current_node), RADV_BVH_INVALID_NODE));
      {
         /* Early exit if we never overflowed the stack, to avoid having to backtrack to
          * the root for no reason. */
         nir_push_if(b, nir_ilt_imm(b, nir_load_deref(b, args->vars.stack), args->stack_base + args->stack_stride));
         {
            nir_store_var(b, incomplete, nir_imm_false(b), 0x1);
            nir_jump(b, nir_jump_break);
         }
         nir_pop_if(b, NULL);

         /* Leaving a BLAS: restore the TLAS base, the world-space ray and the
          * instance traversal state. */
         nir_def *stack_instance_exit =
            nir_ige(b, nir_load_deref(b, args->vars.top_stack), nir_load_deref(b, args->vars.stack));
         nir_def *root_instance_exit =
            nir_ieq(b, nir_load_deref(b, args->vars.previous_node), nir_load_deref(b, args->vars.instance_bottom_node));
         nir_if *instance_exit = nir_push_if(b, nir_ior(b, stack_instance_exit, root_instance_exit));
         instance_exit->control = nir_selection_control_dont_flatten;
         {
            nir_store_deref(b, args->vars.top_stack, nir_imm_int(b, -1), 1);
            nir_store_deref(b, args->vars.previous_node, nir_load_deref(b, args->vars.instance_top_node), 1);
            nir_store_deref(b, args->vars.instance_bottom_node, nir_imm_int(b, RADV_BVH_NO_INSTANCE_ROOT), 1);

            nir_store_deref(b, args->vars.bvh_base, args->root_bvh_base, 1);
            nir_store_deref(b, args->vars.origin, args->origin, 7);
            nir_store_deref(b, args->vars.dir, args->dir, 7);
         }
         nir_pop_if(b, NULL);

         /* Stack under-ran the watermark: some pushed entries were overwritten,
          * so walk up via parent pointers instead of popping. */
         nir_push_if(
            b, nir_ige(b, nir_load_deref(b, args->vars.stack_low_watermark), nir_load_deref(b, args->vars.stack)));
         {
            nir_def *prev = nir_load_deref(b, args->vars.previous_node);
            nir_def *bvh_addr = build_node_to_addr(device, b, nir_load_deref(b, args->vars.bvh_base), true);

            nir_def *parent = fetch_parent_node(device, b, bvh_addr, prev);
            nir_push_if(b, nir_ieq_imm(b, parent, RADV_BVH_INVALID_NODE));
            {
               /* Backtracked past the root: traversal is done. */
               nir_store_var(b, incomplete, nir_imm_false(b), 0x1);
               nir_jump(b, nir_jump_break);
            }
            nir_pop_if(b, NULL);
            nir_store_deref(b, args->vars.current_node, parent, 0x1);
         }
         nir_push_else(b, NULL);
         {
            /* Normal path: pop the next node from the (wrapping) stack. */
            nir_store_deref(b, args->vars.stack,
                            nir_iadd_imm(b, nir_load_deref(b, args->vars.stack), -args->stack_stride), 1);

            nir_def *stack_ptr =
               nir_umod_imm(b, nir_load_deref(b, args->vars.stack), args->stack_stride * args->stack_entries);
            nir_def *bvh_node = args->stack_load_cb(b, stack_ptr, args);
            nir_store_deref(b, args->vars.current_node, bvh_node, 0x1);
            nir_store_deref(b, args->vars.previous_node, nir_imm_int(b, RADV_BVH_INVALID_NODE), 0x1);
         }
         nir_pop_if(b, NULL);
      }
      nir_push_else(b, NULL);
      {
         nir_store_deref(b, args->vars.previous_node, nir_imm_int(b, RADV_BVH_INVALID_NODE), 0x1);
      }
      nir_pop_if(b, NULL);

      nir_def *bvh_node = nir_load_deref(b, args->vars.current_node);

      /* prev_node != INVALID means we re-entered this box node while
       * backtracking and must resume after that child. */
      nir_def *prev_node = nir_load_deref(b, args->vars.previous_node);
      nir_store_deref(b, args->vars.previous_node, bvh_node, 0x1);
      nir_store_deref(b, args->vars.current_node, nir_imm_int(b, RADV_BVH_INVALID_NODE), 0x1);

      nir_def *global_bvh_node = nir_iadd(b, nir_load_deref(b, args->vars.bvh_base), nir_u2u64(b, bvh_node));

      /* HW intersection: handles boxes, instances and primitives; cull mask
       * is in the top 8 bits of args->cull_mask. */
      nir_def *result =
         nir_bvh8_intersect_ray_amd(b, 32, desc, nir_unpack_64_2x32(b, nir_load_deref(b, args->vars.bvh_base)),
                                    nir_ishr_imm(b, args->cull_mask, 24), nir_load_deref(b, args->vars.tmax),
                                    nir_load_deref(b, args->vars.origin), nir_load_deref(b, args->vars.dir), bvh_node);

      /* Non-leaf node (box or instance, distinguished by node-id type bits). */
      nir_push_if(b, nir_test_mask(b, bvh_node, BITFIELD64_BIT(ffs(radv_bvh_node_box16) - 1)));
      {
         nir_push_if(b, nir_test_mask(b, bvh_node, BITFIELD64_BIT(ffs(radv_bvh_node_instance) - 1)));
         {
            if (args->vars.iteration_instance_count) {
               /* Upper 16 bits count visited instances (profiling/statistics). */
               nir_def *iteration_instance_count = nir_load_deref(b, args->vars.iteration_instance_count);
               iteration_instance_count = nir_iadd_imm(b, iteration_instance_count, 1 << 16);
               nir_store_deref(b, args->vars.iteration_instance_count, iteration_instance_count, 0x1);
            }

            /* Low byte of channel 7 is the BLAS entry node; 0xff means the
             * instance was culled (e.g. by mask) and is skipped entirely. */
            nir_def *next_node = nir_iand_imm(b, nir_channel(b, result, 7), 0xff);
            nir_push_if(b, nir_ieq_imm(b, next_node, 0xff));
            nir_jump(b, nir_jump_continue);
            nir_pop_if(b, NULL);

            /* instance */
            nir_def *instance_node_addr = build_node_to_addr(device, b, global_bvh_node, false);
            nir_store_deref(b, args->vars.instance_addr, instance_node_addr, 1);

            nir_store_deref(b, args->vars.sbt_offset_and_flags, nir_channel(b, result, 6), 1);

            /* HW returns the object-space ray (channels 10-12 origin,
             * 13-15 direction) and the BLAS base (channels 2-3). */
            nir_store_deref(b, args->vars.origin, nir_channels(b, result, 0x7 << 10), 0x7);
            nir_store_deref(b, args->vars.dir, nir_channels(b, result, 0x7 << 13), 0x7);

            nir_store_deref(b, args->vars.top_stack, nir_load_deref(b, args->vars.stack), 1);
            nir_store_deref(b, args->vars.bvh_base, nir_pack_64_2x32(b, nir_channels(b, result, 0x3 << 2)), 1);

            /* Push the instance root node onto the stack */
            nir_store_deref(b, args->vars.current_node, next_node, 0x1);

            nir_store_deref(b, args->vars.instance_bottom_node, next_node, 1);
            nir_store_deref(b, args->vars.instance_top_node, bvh_node, 1);
         }
         nir_push_else(b, NULL);
         {
            /* box */
            nir_push_if(b, nir_ieq_imm(b, prev_node, RADV_BVH_INVALID_NODE));
            {
               /* Fresh visit: channels 0-7 are the (sorted) child node ids.
                * Descend into child 0 and push children 7..1 onto the stack,
                * updating the low watermark when the wrapping stack
                * overwrites old entries. */
               nir_def *new_nodes[8];
               for (unsigned i = 0; i < 8; ++i)
                  new_nodes[i] = nir_channel(b, result, i);

               for (unsigned i = 1; i < 8; ++i)
                  nir_push_if(b, nir_ine_imm(b, new_nodes[i], RADV_BVH_INVALID_NODE));

               for (unsigned i = 8; i-- > 1;) {
                  nir_def *stack = nir_load_deref(b, args->vars.stack);
                  nir_def *stack_ptr = nir_umod_imm(b, stack, args->stack_entries * args->stack_stride);
                  args->stack_store_cb(b, stack_ptr, new_nodes[i], args);
                  nir_store_deref(b, args->vars.stack, nir_iadd_imm(b, stack, args->stack_stride), 1);

                  if (i == 1) {
                     nir_def *new_watermark =
                        nir_iadd_imm(b, nir_load_deref(b, args->vars.stack), -args->stack_entries * args->stack_stride);
                     new_watermark = nir_imax(b, nir_load_deref(b, args->vars.stack_low_watermark), new_watermark);
                     nir_store_deref(b, args->vars.stack_low_watermark, new_watermark, 0x1);
                  }

                  nir_pop_if(b, NULL);
               }
               nir_store_deref(b, args->vars.current_node, new_nodes[0], 0x1);
            }
            nir_push_else(b, NULL);
            {
               /* Re-visit while backtracking: continue with the child that
                * follows prev_node in the sorted child list. */
               nir_def *next = nir_imm_int(b, RADV_BVH_INVALID_NODE);
               for (unsigned i = 0; i < 7; ++i) {
                  next = nir_bcsel(b, nir_ieq(b, prev_node, nir_channel(b, result, i)), nir_channel(b, result, i + 1),
                                   next);
               }
               nir_store_deref(b, args->vars.current_node, next, 0x1);
            }
            nir_pop_if(b, NULL);
         }
         nir_pop_if(b, NULL);
      }
      nir_push_else(b, NULL);
      {
         /* Leaf node: bit 31 of channel 1 selects the procedural (AABB) path,
          * otherwise handle a triangle hit. Skip* flags gate the callbacks. */
         nir_push_if(b, nir_test_mask(b, nir_channel(b, result, 1), 1u << 31));
         {
            nir_push_if(b, ray_flags.no_skip_aabbs);
            insert_traversal_aabb_case_gfx12(device, b, args, &ray_flags, result, global_bvh_node);
            nir_pop_if(b, NULL);
         }
         nir_push_else(b, NULL);
         {
            nir_push_if(b, ray_flags.no_skip_triangles);
            insert_traversal_triangle_case_gfx12(device, b, args, &ray_flags, result, global_bvh_node);
            nir_pop_if(b, NULL);
         }
         nir_pop_if(b, NULL);
      }
      nir_pop_if(b, NULL);

      if (args->vars.iteration_instance_count) {
         /* Low 16 bits count loop iterations. */
         nir_def *iteration_instance_count = nir_load_deref(b, args->vars.iteration_instance_count);
         iteration_instance_count = nir_iadd_imm(b, iteration_instance_count, 1);
         nir_store_deref(b, args->vars.iteration_instance_count, iteration_instance_count, 0x1);
      }
   }
   nir_pop_loop(b, NULL);

   return nir_load_var(b, incomplete);
}

View file

@ -14,15 +14,21 @@
struct radv_device;
nir_def *build_addr_to_node(nir_builder *b, nir_def *addr);
nir_def *build_addr_to_node(struct radv_device *device, nir_builder *b, nir_def *addr, nir_def *flags);
nir_def *nir_build_vec3_mat_mult(nir_builder *b, nir_def *vec, nir_def *matrix[], bool translation);
void nir_build_wto_matrix_load(nir_builder *b, nir_def *instance_addr, nir_def **out);
nir_def *radv_load_vertex_position(struct radv_device *device, nir_builder *b, nir_def *instance_addr,
nir_def *geometry_id, nir_def *primitive_id, uint32_t index);
void radv_load_wto_matrix(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def **out);
void radv_load_otw_matrix(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def **out);
nir_def *radv_load_custom_instance(struct radv_device *device, nir_builder *b, nir_def *instance_addr);
nir_def *radv_load_instance_id(struct radv_device *device, nir_builder *b, nir_def *instance_addr);
struct radv_ray_traversal_args;
struct radv_ray_flags {
@ -146,4 +152,7 @@ struct radv_ray_traversal_args {
nir_def *radv_build_ray_traversal(struct radv_device *device, nir_builder *b,
const struct radv_ray_traversal_args *args);
nir_def *radv_build_ray_traversal_gfx12(struct radv_device *device, nir_builder *b,
const struct radv_ray_traversal_args *args);
#endif /* RADV_NIR_RT_COMMON_H */

View file

@ -530,11 +530,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
break;
}
case nir_intrinsic_load_ray_instance_custom_index: {
nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr);
nir_def *custom_instance_and_mask = nir_build_load_global(
b, 1, 32,
nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, custom_instance_and_mask)));
ret = nir_iand_imm(b, custom_instance_and_mask, 0xFFFFFF);
ret = radv_load_custom_instance(vars->device, b, nir_load_var(b, vars->instance_addr));
break;
}
case nir_intrinsic_load_primitive_id: {
@ -547,9 +543,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
break;
}
case nir_intrinsic_load_instance_id: {
nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr);
ret = nir_build_load_global(
b, 1, 32, nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, instance_id)));
ret = radv_load_instance_id(vars->device, b, nir_load_var(b, vars->instance_addr));
break;
}
case nir_intrinsic_load_ray_flags: {
@ -564,7 +558,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
unsigned c = nir_intrinsic_column(intr);
nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr);
nir_def *wto_matrix[3];
nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix);
radv_load_wto_matrix(vars->device, b, instance_node_addr, wto_matrix);
nir_def *vals[3];
for (unsigned i = 0; i < 3; ++i)
@ -575,26 +569,21 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
}
case nir_intrinsic_load_ray_object_to_world: {
unsigned c = nir_intrinsic_column(intr);
nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr);
nir_def *rows[3];
for (unsigned r = 0; r < 3; ++r)
rows[r] = nir_build_load_global(
b, 4, 32,
nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, otw_matrix) + r * 16));
ret = nir_vec3(b, nir_channel(b, rows[0], c), nir_channel(b, rows[1], c), nir_channel(b, rows[2], c));
nir_def *otw_matrix[3];
radv_load_otw_matrix(vars->device, b, nir_load_var(b, vars->instance_addr), otw_matrix);
ret = nir_vec3(b, nir_channel(b, otw_matrix[0], c), nir_channel(b, otw_matrix[1], c),
nir_channel(b, otw_matrix[2], c));
break;
}
case nir_intrinsic_load_ray_object_origin: {
nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr);
nir_def *wto_matrix[3];
nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix);
radv_load_wto_matrix(vars->device, b, nir_load_var(b, vars->instance_addr), wto_matrix);
ret = nir_build_vec3_mat_mult(b, nir_load_var(b, vars->origin), wto_matrix, true);
break;
}
case nir_intrinsic_load_ray_object_direction: {
nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr);
nir_def *wto_matrix[3];
nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix);
radv_load_wto_matrix(vars->device, b, nir_load_var(b, vars->instance_addr), wto_matrix);
ret = nir_build_vec3_mat_mult(b, nir_load_var(b, vars->direction), wto_matrix, false);
break;
}
@ -1526,6 +1515,8 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
struct rt_traversal_vars trav_vars = init_traversal_vars(b);
nir_def *cull_mask_and_flags = nir_load_var(b, vars->cull_mask_and_flags);
nir_store_var(b, trav_vars.hit, nir_imm_false(b), 1);
nir_def *accel_struct = nir_load_var(b, vars->accel_struct);
@ -1533,7 +1524,7 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
b, 1, 32, nir_iadd_imm(b, accel_struct, offsetof(struct radv_accel_struct_header, bvh_offset)),
.access = ACCESS_NON_WRITEABLE);
nir_def *root_bvh_base = nir_iadd(b, accel_struct, nir_u2u64(b, bvh_offset));
root_bvh_base = build_addr_to_node(b, root_bvh_base);
root_bvh_base = build_addr_to_node(device, b, root_bvh_base, cull_mask_and_flags);
nir_store_var(b, trav_vars.bvh_base, root_bvh_base, 1);
@ -1589,7 +1580,6 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
.pipeline = pipeline,
};
nir_def *cull_mask_and_flags = nir_load_var(b, vars->cull_mask_and_flags);
struct radv_ray_traversal_args args = {
.root_bvh_base = root_bvh_base,
.flags = cull_mask_and_flags,
@ -1617,7 +1607,10 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
nir_def *original_tmax = nir_load_var(b, vars->tmax);
radv_build_ray_traversal(device, b, &args);
if (radv_use_bvh8(pdev))
radv_build_ray_traversal_gfx12(device, b, &args);
else
radv_build_ray_traversal(device, b, &args);
if (vars->device->rra_trace.ray_history_addr)
radv_build_end_trace_token(b, vars, original_tmax, nir_load_var(b, trav_vars.hit),

View file

@ -16,10 +16,18 @@
#include "vk_acceleration_structure.h"
#include "vk_common_entrypoints.h"
static const uint32_t copy_blas_addrs_gfx12_spv[] = {
#include "bvh/copy_blas_addrs_gfx12.spv.h"
};
static const uint32_t copy_spv[] = {
#include "bvh/copy.spv.h"
};
static const uint32_t copy_gfx12_spv[] = {
#include "bvh/copy_gfx12.spv.h"
};
static const uint32_t encode_spv[] = {
#include "bvh/encode.spv.h"
};
@ -28,6 +36,10 @@ static const uint32_t encode_compact_spv[] = {
#include "bvh/encode_compact.spv.h"
};
static const uint32_t encode_gfx12_spv[] = {
#include "bvh/encode_gfx12.spv.h"
};
static const uint32_t header_spv[] = {
#include "bvh/header.spv.h"
};
@ -36,6 +48,10 @@ static const uint32_t update_spv[] = {
#include "bvh/update.spv.h"
};
static const uint32_t update_gfx12_spv[] = {
#include "bvh/update_gfx12.spv.h"
};
static const uint32_t leaf_spv[] = {
#include "bvh/radv_leaf.spv.h"
};
@ -47,6 +63,7 @@ static const uint32_t leaf_always_active_spv[] = {
struct acceleration_structure_layout {
uint32_t geometry_info_offset;
uint32_t primitive_base_indices_offset;
uint32_t leaf_node_offsets_offset;
uint32_t bvh_offset;
uint32_t leaf_nodes_offset;
uint32_t internal_nodes_offset;
@ -68,26 +85,50 @@ radv_get_acceleration_structure_layout(struct radv_device *device, uint32_t leaf
const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
struct acceleration_structure_layout *accel_struct)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
uint32_t internal_count = MAX2(leaf_count, 2) - 1;
VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info);
uint32_t bvh_leaf_size;
switch (geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
bvh_leaf_size = sizeof(struct radv_bvh_triangle_node);
break;
case VK_GEOMETRY_TYPE_AABBS_KHR:
bvh_leaf_size = sizeof(struct radv_bvh_aabb_node);
break;
case VK_GEOMETRY_TYPE_INSTANCES_KHR:
bvh_leaf_size = sizeof(struct radv_bvh_instance_node);
break;
default:
unreachable("Unknown VkGeometryTypeKHR");
uint32_t bvh_node_size_gcd;
if (radv_use_bvh8(pdev)) {
switch (geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
bvh_leaf_size = sizeof(struct radv_gfx12_primitive_node);
break;
case VK_GEOMETRY_TYPE_AABBS_KHR:
bvh_leaf_size = sizeof(struct radv_gfx12_primitive_node);
break;
case VK_GEOMETRY_TYPE_INSTANCES_KHR:
bvh_leaf_size = sizeof(struct radv_gfx12_instance_node) + sizeof(struct radv_gfx12_instance_node_user_data);
break;
default:
unreachable("Unknown VkGeometryTypeKHR");
}
bvh_node_size_gcd = RADV_GFX12_BVH_NODE_SIZE;
} else {
switch (geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
bvh_leaf_size = sizeof(struct radv_bvh_triangle_node);
break;
case VK_GEOMETRY_TYPE_AABBS_KHR:
bvh_leaf_size = sizeof(struct radv_bvh_aabb_node);
break;
case VK_GEOMETRY_TYPE_INSTANCES_KHR:
bvh_leaf_size = sizeof(struct radv_bvh_instance_node);
break;
default:
unreachable("Unknown VkGeometryTypeKHR");
}
bvh_node_size_gcd = 64;
}
uint64_t bvh_size = bvh_leaf_size * leaf_count + sizeof(struct radv_bvh_box32_node) * internal_count;
uint32_t internal_node_size =
radv_use_bvh8(pdev) ? sizeof(struct radv_gfx12_box_node) : sizeof(struct radv_bvh_box32_node);
uint64_t bvh_size = bvh_leaf_size * leaf_count + internal_node_size * internal_count;
uint32_t offset = 0;
offset += sizeof(struct radv_accel_struct_header);
@ -101,23 +142,30 @@ radv_get_acceleration_structure_layout(struct radv_device *device, uint32_t leaf
offset += sizeof(uint32_t) * build_info->geometryCount;
}
/* On GFX12, we need additional space for leaf node offsets since they do not have the same
* order as the application provided data.
*/
accel_struct->leaf_node_offsets_offset = offset;
if (radv_use_bvh8(pdev))
offset += leaf_count * 4;
/* Parent links, which have to go directly before bvh_offset as we index them using negative
* offsets from there. */
offset += bvh_size / 64 * 4;
offset += bvh_size / bvh_node_size_gcd * 4;
/* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. */
offset = ALIGN(offset, 64);
accel_struct->bvh_offset = offset;
/* root node */
offset += sizeof(struct radv_bvh_box32_node);
offset += internal_node_size;
accel_struct->leaf_nodes_offset = offset;
offset += bvh_leaf_size * leaf_count;
accel_struct->internal_nodes_offset = offset;
/* Factor out the root node. */
offset += sizeof(struct radv_bvh_box32_node) * (internal_count - 1);
offset += internal_node_size * (internal_count - 1);
accel_struct->size = offset;
}
@ -134,7 +182,7 @@ radv_get_scratch_layout(struct radv_device *device, uint32_t leaf_count, struct
uint32_t update_offset = 0;
update_offset += sizeof(vk_aabb) * leaf_count;
update_offset += sizeof(vk_aabb) * (leaf_count + internal_count);
scratch->internal_ready_count_offset = update_offset;
update_offset += sizeof(uint32_t) * internal_count;
@ -154,6 +202,10 @@ radv_GetAccelerationStructureBuildSizesKHR(VkDevice _device, VkAccelerationStruc
STATIC_ASSERT(sizeof(struct radv_bvh_instance_node) == 128);
STATIC_ASSERT(sizeof(struct radv_bvh_box16_node) == 64);
STATIC_ASSERT(sizeof(struct radv_bvh_box32_node) == 128);
STATIC_ASSERT(sizeof(struct radv_gfx12_box_node) == RADV_GFX12_BVH_NODE_SIZE);
STATIC_ASSERT(sizeof(struct radv_gfx12_primitive_node) == RADV_GFX12_BVH_NODE_SIZE);
STATIC_ASSERT(sizeof(struct radv_gfx12_instance_node) == RADV_GFX12_BVH_NODE_SIZE);
STATIC_ASSERT(sizeof(struct radv_gfx12_instance_node_user_data) == RADV_GFX12_BVH_NODE_SIZE);
if (radv_device_init_accel_struct_build_state(device) != VK_SUCCESS)
return;
@ -170,6 +222,7 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device)
struct vk_device_dispatch_table *dispatch = &device->vk.dispatch_table;
dispatch->DestroyPipeline(_device, state->accel_struct_build.copy_pipeline, &state->alloc);
dispatch->DestroyPipeline(_device, state->accel_struct_build.copy_blas_addrs_gfx12_pipeline, &state->alloc);
dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_pipeline, &state->alloc);
dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_compact_pipeline, &state->alloc);
dispatch->DestroyPipeline(_device, state->accel_struct_build.header_pipeline, &state->alloc);
@ -257,7 +310,11 @@ radv_device_init_null_accel_struct(struct radv_device *device)
VkDevice _device = radv_device_to_handle(device);
uint32_t bvh_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64);
uint32_t size = bvh_offset + sizeof(struct radv_bvh_box32_node);
uint32_t size = bvh_offset;
if (radv_use_bvh8(pdev))
size += sizeof(struct radv_gfx12_box_node);
else
size += sizeof(struct radv_bvh_box32_node);
VkResult result;
@ -321,28 +378,44 @@ radv_device_init_null_accel_struct(struct radv_device *device)
};
memcpy(data, &header, sizeof(struct radv_accel_struct_header));
struct radv_bvh_box32_node root = {
.children =
{
RADV_BVH_INVALID_NODE,
RADV_BVH_INVALID_NODE,
RADV_BVH_INVALID_NODE,
RADV_BVH_INVALID_NODE,
},
};
for (uint32_t child = 0; child < 4; child++) {
root.coords[child] = (vk_aabb){
.min.x = NAN,
.min.y = NAN,
.min.z = NAN,
.max.x = NAN,
.max.y = NAN,
.max.z = NAN,
if (radv_use_bvh8(pdev)) {
struct radv_gfx12_box_node root = {
.obb_matrix_index = 0x7f,
};
}
memcpy((uint8_t *)data + bvh_offset, &root, sizeof(struct radv_bvh_box32_node));
for (uint32_t child = 0; child < 8; child++) {
root.children[child] = (struct radv_gfx12_box_child){
.dword0 = 0xffffffff,
.dword1 = 0xfff,
.dword2 = 0,
};
}
memcpy((uint8_t *)data + bvh_offset, &root, sizeof(struct radv_gfx12_box_node));
} else {
struct radv_bvh_box32_node root = {
.children =
{
RADV_BVH_INVALID_NODE,
RADV_BVH_INVALID_NODE,
RADV_BVH_INVALID_NODE,
RADV_BVH_INVALID_NODE,
},
};
for (uint32_t child = 0; child < 4; child++) {
root.coords[child] = (vk_aabb){
.min.x = NAN,
.min.y = NAN,
.min.z = NAN,
.max.x = NAN,
.max.y = NAN,
.max.z = NAN,
};
}
memcpy((uint8_t *)data + bvh_offset, &root, sizeof(struct radv_bvh_box32_node));
}
vk_common_UnmapMemory(_device, memory);
@ -385,9 +458,15 @@ radv_get_update_scratch_size(struct vk_device *vk_device, uint32_t leaf_count)
}
static uint32_t
radv_get_encode_key(struct vk_device *device, VkAccelerationStructureTypeKHR type,
radv_get_encode_key(struct vk_device *vk_device, VkAccelerationStructureTypeKHR type,
VkBuildAccelerationStructureFlagBitsKHR flags)
{
struct radv_device *device = container_of(vk_device, struct radv_device, vk);
struct radv_physical_device *pdev = radv_device_physical(device);
if (radv_use_bvh8(pdev))
return RADV_ENCODE_KEY_COMPACT;
if (flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR)
return RADV_ENCODE_KEY_COMPACT;
@ -401,9 +480,10 @@ radv_encode_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key)
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
bool compact = key & RADV_ENCODE_KEY_COMPACT;
device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
compact ? device->meta_state.accel_struct_build.encode_compact_pipeline
: device->meta_state.accel_struct_build.encode_pipeline);
VkPipeline pipeline = compact ? device->meta_state.accel_struct_build.encode_compact_pipeline
: device->meta_state.accel_struct_build.encode_pipeline;
device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
return VK_SUCCESS;
}
@ -448,6 +528,47 @@ radv_encode_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuild
radv_compute_dispatch(cmd_buffer, &dispatch);
}
static void
/* Encode an intermediate (vk_ir) BVH into the GFX12 BVH8 on-device format.
 *
 * Records the encode compute pass into the command buffer: the destination
 * node offsets are first patched into the intermediate header with the CP,
 * then one encode dispatch sized by the leaf count is launched.
 *
 * commandBuffer             - command buffer to record into
 * build_info                - app-provided build description (geometry type etc.)
 * build_range_infos         - per-geometry primitive ranges (unused here; kept for
 *                             the common vk_acceleration_structure encode_as signature)
 * intermediate_as_addr      - GPU VA of the intermediate BVH produced by the builder
 * intermediate_header_addr  - GPU VA of the intermediate vk_ir_header
 * leaf_count                - number of leaf primitives in the build
 * key                       - encode key (unused; GFX12 always encodes compacted)
 * dst                       - destination acceleration structure
 */
radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
                     const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos,
                     VkDeviceAddress intermediate_as_addr, VkDeviceAddress intermediate_header_addr,
                     uint32_t leaf_count, uint32_t key, struct vk_acceleration_structure *dst)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);

   struct acceleration_structure_layout layout;
   radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout);

   /* The encode shader consumes node offsets relative to bvh_offset, not to the
    * start of the acceleration structure, so rebase them here.
    */
   uint32_t dst_internal_nodes_offset = layout.internal_nodes_offset - layout.bvh_offset;
   uint32_t dst_leaf_nodes_offset = layout.leaf_nodes_offset - layout.bvh_offset;

   /* Seed dst_node_offset in the intermediate header with the CP so the encode
    * shader can allocate internal/leaf nodes from the right base offsets.
    */
   uint32_t offsets[2] = {dst_internal_nodes_offset, dst_leaf_nodes_offset};
   radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, dst_node_offset), offsets,
                         sizeof(offsets));
   /* On parts where CP/SDMA writes land with system memory scope, invalidate L2
    * so the following shader dispatch observes the CP write above.
    */
   if (radv_device_physical(device)->info.cp_sdma_ge_use_system_memory_scope)
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2;

   const struct encode_gfx12_args args = {
      .intermediate_bvh = intermediate_as_addr,
      .output_base = vk_acceleration_structure_get_va(dst),
      .header = intermediate_header_addr,
      .output_bvh_offset = layout.bvh_offset,
      .leaf_node_offsets_offset = layout.leaf_node_offsets_offset,
      .leaf_node_count = leaf_count,
      .geometry_type = vk_get_as_geometry_type(build_info),
   };
   vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout,
                              VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args);

   /* Unaligned, ordered dispatch with one block per leaf (at least one so an
    * empty build still writes a valid root node).
    */
   struct radv_dispatch_info dispatch = {
      .unaligned = true,
      .ordered = true,
      .blocks = {MAX2(leaf_count, 1), 1, 1},
   };

   radv_compute_dispatch(cmd_buffer, &dispatch);
}
static VkResult
radv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key)
{
@ -487,7 +608,7 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui
radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout);
if (key & RADV_ENCODE_KEY_COMPACT) {
base = offsetof(struct radv_accel_struct_header, geometry_count);
base = offsetof(struct radv_accel_struct_header, geometry_type);
struct header_args args = {
.src = intermediate_header_addr,
@ -506,6 +627,7 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui
header.instance_offset = layout.bvh_offset + sizeof(struct radv_bvh_box32_node);
header.instance_count = instance_count;
header.leaf_node_offsets_offset = layout.leaf_node_offsets_offset;
header.compacted_size = layout.size;
header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64);
@ -520,6 +642,7 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui
sizeof(uint64_t) * header.instance_count;
header.build_flags = build_info->flags;
header.geometry_type = vk_get_as_geometry_type(build_info);
header.geometry_count = build_info->geometryCount;
header.primitive_base_indices_offset = layout.primitive_base_indices_offset;
@ -674,26 +797,6 @@ static const struct radix_sort_vk_target_config radix_sort_config = {
.scatter.block_rows = 14,
};
static const struct vk_acceleration_structure_build_ops build_ops = {
.begin_debug_marker = vk_accel_struct_cmd_begin_debug_marker,
.end_debug_marker = vk_accel_struct_cmd_end_debug_marker,
.get_as_size = radv_get_as_size,
.get_update_scratch_size = radv_get_update_scratch_size,
.get_encode_key[0] = radv_get_encode_key,
.get_encode_key[1] = radv_get_encode_key,
.encode_bind_pipeline[0] = radv_encode_bind_pipeline,
.encode_bind_pipeline[1] = radv_init_header_bind_pipeline,
.encode_as[0] = radv_encode_as,
.encode_as[1] = radv_init_header,
.init_update_scratch = radv_init_update_scratch,
.update_bind_pipeline[0] = radv_update_bind_pipeline,
.update_as[0] = radv_update_as,
.leaf_spirv_override = leaf_spv,
.leaf_spirv_override_size = sizeof(leaf_spv),
.leaf_always_active_spirv_override = leaf_always_active_spv,
.leaf_always_active_spirv_override_size = sizeof(leaf_always_active_spv),
};
static void
radv_write_buffer_cp(VkCommandBuffer commandBuffer, VkDeviceAddress addr, void *data, uint32_t size)
{
@ -729,24 +832,49 @@ radv_cmd_fill_buffer_addr(VkCommandBuffer commandBuffer, VkDeviceAddress addr, V
VkResult
radv_device_init_accel_struct_build_state(struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
VkResult result = VK_SUCCESS;
mtx_lock(&device->meta_state.mtx);
if (device->meta_state.accel_struct_build.radix_sort)
goto exit;
result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args),
&device->meta_state.accel_struct_build.encode_pipeline,
&device->meta_state.accel_struct_build.encode_p_layout);
if (result != VK_SUCCESS)
goto exit;
if (radv_use_bvh8(pdev)) {
result =
create_build_pipeline_spv(device, encode_gfx12_spv, sizeof(encode_gfx12_spv), sizeof(struct encode_gfx12_args),
&device->meta_state.accel_struct_build.encode_compact_pipeline,
&device->meta_state.accel_struct_build.encode_p_layout);
if (result != VK_SUCCESS)
goto exit;
result =
create_build_pipeline_spv(device, encode_compact_spv, sizeof(encode_compact_spv), sizeof(struct encode_args),
&device->meta_state.accel_struct_build.encode_compact_pipeline,
&device->meta_state.accel_struct_build.encode_p_layout);
if (result != VK_SUCCESS)
goto exit;
result = create_build_pipeline_spv(device, update_gfx12_spv, sizeof(update_gfx12_spv), sizeof(struct update_args),
&device->meta_state.accel_struct_build.update_pipeline,
&device->meta_state.accel_struct_build.update_p_layout);
if (result != VK_SUCCESS)
goto exit;
} else {
result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args),
&device->meta_state.accel_struct_build.encode_pipeline,
&device->meta_state.accel_struct_build.encode_p_layout);
if (result != VK_SUCCESS)
goto exit;
result =
create_build_pipeline_spv(device, encode_compact_spv, sizeof(encode_compact_spv), sizeof(struct encode_args),
&device->meta_state.accel_struct_build.encode_compact_pipeline,
&device->meta_state.accel_struct_build.encode_p_layout);
if (result != VK_SUCCESS)
goto exit;
result = create_build_pipeline_spv(device, update_spv, sizeof(update_spv), sizeof(struct update_args),
&device->meta_state.accel_struct_build.update_pipeline,
&device->meta_state.accel_struct_build.update_p_layout);
if (result != VK_SUCCESS)
goto exit;
}
result = create_build_pipeline_spv(device, header_spv, sizeof(header_spv), sizeof(struct header_args),
&device->meta_state.accel_struct_build.header_pipeline,
@ -754,16 +882,36 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
if (result != VK_SUCCESS)
goto exit;
result = create_build_pipeline_spv(device, update_spv, sizeof(update_spv), sizeof(struct update_args),
&device->meta_state.accel_struct_build.update_pipeline,
&device->meta_state.accel_struct_build.update_p_layout);
if (result != VK_SUCCESS)
goto exit;
device->meta_state.accel_struct_build.radix_sort = vk_create_radix_sort_u64(
radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache, radix_sort_config);
device->vk.as_build_ops = &build_ops;
device->meta_state.accel_struct_build.build_ops = (struct vk_acceleration_structure_build_ops){
.begin_debug_marker = vk_accel_struct_cmd_begin_debug_marker,
.end_debug_marker = vk_accel_struct_cmd_end_debug_marker,
.get_as_size = radv_get_as_size,
.get_update_scratch_size = radv_get_update_scratch_size,
.get_encode_key[0] = radv_get_encode_key,
.get_encode_key[1] = radv_get_encode_key,
.encode_bind_pipeline[0] = radv_encode_bind_pipeline,
.encode_bind_pipeline[1] = radv_init_header_bind_pipeline,
.encode_as[1] = radv_init_header,
.init_update_scratch = radv_init_update_scratch,
.update_bind_pipeline[0] = radv_update_bind_pipeline,
.update_as[0] = radv_update_as,
};
if (radv_use_bvh8(pdev)) {
device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as_gfx12;
} else {
device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as;
device->meta_state.accel_struct_build.build_ops.leaf_spirv_override = leaf_spv;
device->meta_state.accel_struct_build.build_ops.leaf_spirv_override_size = sizeof(leaf_spv);
device->meta_state.accel_struct_build.build_ops.leaf_always_active_spirv_override = leaf_always_active_spv;
device->meta_state.accel_struct_build.build_ops.leaf_always_active_spirv_override_size =
sizeof(leaf_always_active_spv);
}
device->vk.as_build_ops = &device->meta_state.accel_struct_build.build_ops;
device->vk.write_buffer_cp = radv_write_buffer_cp;
device->vk.flush_buffer_write_cp = radv_flush_buffer_write_cp;
device->vk.cmd_dispatch_unaligned = radv_cmd_dispatch_unaligned;
@ -783,12 +931,30 @@ exit:
static VkResult
radv_device_init_accel_struct_copy_state(struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
VkResult result;
mtx_lock(&device->meta_state.mtx);
VkResult result = create_build_pipeline_spv(device, copy_spv, sizeof(copy_spv), sizeof(struct copy_args),
&device->meta_state.accel_struct_build.copy_pipeline,
&device->meta_state.accel_struct_build.copy_p_layout);
if (radv_use_bvh8(pdev)) {
result = create_build_pipeline_spv(device, copy_gfx12_spv, sizeof(copy_gfx12_spv), sizeof(struct copy_args),
&device->meta_state.accel_struct_build.copy_pipeline,
&device->meta_state.accel_struct_build.copy_p_layout);
if (result != VK_SUCCESS)
goto exit;
result = create_build_pipeline_spv(device, copy_blas_addrs_gfx12_spv, sizeof(copy_blas_addrs_gfx12_spv),
sizeof(struct copy_args),
&device->meta_state.accel_struct_build.copy_blas_addrs_gfx12_pipeline,
&device->meta_state.accel_struct_build.copy_p_layout);
} else {
result = create_build_pipeline_spv(device, copy_spv, sizeof(copy_spv), sizeof(struct copy_args),
&device->meta_state.accel_struct_build.copy_pipeline,
&device->meta_state.accel_struct_build.copy_p_layout);
}
exit:
mtx_unlock(&device->meta_state.mtx);
return result;
}
@ -879,6 +1045,7 @@ radv_CmdCopyMemoryToAccelerationStructureKHR(VkCommandBuffer commandBuffer,
VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_meta_saved_state saved_state;
VkResult result = radv_device_init_accel_struct_copy_state(device);
@ -904,6 +1071,21 @@ radv_CmdCopyMemoryToAccelerationStructureKHR(VkCommandBuffer commandBuffer,
sizeof(consts), &consts);
vk_common_CmdDispatch(commandBuffer, 512, 1, 1);
if (radv_use_bvh8(pdev)) {
/* Wait for the main copy dispatch to finish. */
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_2_SHADER_WRITE_BIT, 0, NULL, NULL) |
radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_2_SHADER_READ_BIT, 0, NULL, NULL);
radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
device->meta_state.accel_struct_build.copy_blas_addrs_gfx12_pipeline);
vk_common_CmdDispatch(commandBuffer, 256, 1, 1);
}
radv_meta_restore(&saved_state, cmd_buffer);
}
@ -945,6 +1127,20 @@ radv_CmdCopyAccelerationStructureToMemoryKHR(VkCommandBuffer commandBuffer,
radv_CmdDispatchIndirect(commandBuffer, vk_buffer_to_handle(src->buffer),
src->offset + offsetof(struct radv_accel_struct_header, copy_dispatch_size));
if (radv_use_bvh8(pdev)) {
/* Wait for the main copy dispatch to finish. */
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_2_SHADER_WRITE_BIT, 0, NULL, NULL) |
radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_2_SHADER_READ_BIT, 0, NULL, NULL);
radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
device->meta_state.accel_struct_build.copy_blas_addrs_gfx12_pipeline);
vk_common_CmdDispatch(commandBuffer, 256, 1, 1);
}
radv_meta_restore(&saved_state, cmd_buffer);
/* Set the header of the serialized data. */

View file

@ -71,6 +71,7 @@ enum {
RADV_DEBUG_DUMP_ASM = 1ull << 56,
RADV_DEBUG_DUMP_BACKEND_IR = 1ull << 57,
RADV_DEBUG_PSO_HISTORY = 1ull << 58,
RADV_DEBUG_BVH4 = 1ull << 59,
RADV_DEBUG_DUMP_SHADERS = RADV_DEBUG_DUMP_VS | RADV_DEBUG_DUMP_TCS | RADV_DEBUG_DUMP_TES | RADV_DEBUG_DUMP_GS |
RADV_DEBUG_DUMP_PS | RADV_DEBUG_DUMP_TASK | RADV_DEBUG_DUMP_MESH | RADV_DEBUG_DUMP_CS |
RADV_DEBUG_DUMP_NIR | RADV_DEBUG_DUMP_ASM | RADV_DEBUG_DUMP_BACKEND_IR,

View file

@ -100,8 +100,10 @@ struct radv_meta_state {
VkPipeline update_pipeline;
VkPipelineLayout copy_p_layout;
VkPipeline copy_pipeline;
VkPipeline copy_blas_addrs_gfx12_pipeline;
struct radix_sort_vk *radix_sort;
struct vk_acceleration_structure_build_ops build_ops;
struct vk_acceleration_structure_build_args build_args;
struct {

View file

@ -86,6 +86,7 @@ static const struct debug_control radv_debug_options[] = {{"nofastclears", RADV_
{"asm", RADV_DEBUG_DUMP_ASM},
{"ir", RADV_DEBUG_DUMP_BACKEND_IR},
{"pso_history", RADV_DEBUG_PSO_HISTORY},
{"bvh4", RADV_DEBUG_BVH4},
{NULL, 0}};
const char *

View file

@ -157,6 +157,13 @@ radv_emulate_rt(const struct radv_physical_device *pdev)
return !pdev->info.has_image_bvh_intersect_ray && instance->drirc.emulate_rt;
}
bool
radv_use_bvh8(const struct radv_physical_device *pdev)
{
const struct radv_instance *instance = radv_physical_device_instance(pdev);
return pdev->info.gfx_level >= GFX12 && !radv_emulate_rt(pdev) && !(instance->debug_flags & RADV_DEBUG_BVH4);
}
static void
parse_hex(char *out, const char *in, unsigned length)
{
@ -186,6 +193,7 @@ radv_physical_device_init_cache_key(struct radv_physical_device *pdev)
key->disable_sinking_load_input_fs = instance->drirc.disable_sinking_load_input_fs;
key->disable_trunc_coord = instance->drirc.disable_trunc_coord;
key->emulate_rt = radv_emulate_rt(pdev);
key->bvh8 = radv_use_bvh8(pdev);
key->ge_wave32 = pdev->ge_wave_size == 32;
key->invariant_geom = !!(instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM);
key->no_fmask = !!(instance->debug_flags & RADV_DEBUG_NO_FMASK);

View file

@ -48,6 +48,7 @@ struct radv_physical_device_cache_key {
uint32_t disable_sinking_load_input_fs : 1;
uint32_t disable_trunc_coord : 1;
uint32_t emulate_rt : 1;
uint32_t bvh8 : 1;
uint32_t ge_wave32 : 1;
uint32_t invariant_geom : 1;
uint32_t no_fmask : 1;
@ -258,6 +259,8 @@ bool radv_enable_rt(const struct radv_physical_device *pdev);
bool radv_emulate_rt(const struct radv_physical_device *pdev);
bool radv_use_bvh8(const struct radv_physical_device *pdev);
uint32_t radv_find_memory_index(const struct radv_physical_device *pdev, VkMemoryPropertyFlags flags);
VkResult create_null_physical_device(struct vk_instance *vk_instance);

View file

@ -184,8 +184,8 @@ rra_dump_asic_info(const struct radeon_info *gpu_info, FILE *output)
}
static struct rra_accel_struct_header
rra_fill_accel_struct_header_common(struct radv_accel_struct_header *header, size_t parent_id_table_size,
size_t leaf_node_data_size, size_t internal_node_data_size,
rra_fill_accel_struct_header_common(const struct radv_physical_device *pdev, struct radv_accel_struct_header *header,
size_t parent_id_table_size, struct rra_bvh_info *bvh_info,
uint64_t primitive_count)
{
struct rra_accel_struct_header result = {
@ -199,32 +199,39 @@ rra_fill_accel_struct_header_common(struct radv_accel_struct_header *header, siz
/* TODO: calculate active primitives */
.active_primitive_count = primitive_count,
.geometry_description_count = header->geometry_count,
.interior_fp32_node_count = internal_node_data_size / sizeof(struct radv_bvh_box32_node),
.interior_fp32_node_count = bvh_info->internal_nodes_size / sizeof(struct radv_bvh_box32_node),
.leaf_node_count = primitive_count,
.rt_driver_interface_version = 8 << 16,
.rt_ip_version = pdev->info.rt_ip_version,
};
if (!radv_use_bvh8(pdev))
result.rt_ip_version = MIN2(result.rt_ip_version, RT_1_1);
result.metadata_size = sizeof(struct rra_accel_struct_metadata) + parent_id_table_size;
result.file_size =
result.metadata_size + sizeof(struct rra_accel_struct_header) + internal_node_data_size + leaf_node_data_size;
result.file_size = result.metadata_size + sizeof(struct rra_accel_struct_header) + bvh_info->internal_nodes_size +
bvh_info->leaf_nodes_size;
result.internal_nodes_offset = sizeof(struct rra_accel_struct_metadata);
result.leaf_nodes_offset = result.internal_nodes_offset + internal_node_data_size;
result.geometry_infos_offset = result.leaf_nodes_offset + leaf_node_data_size;
result.leaf_nodes_offset = result.internal_nodes_offset + bvh_info->internal_nodes_size;
result.geometry_infos_offset = result.leaf_nodes_offset + bvh_info->leaf_nodes_size;
result.leaf_ids_offset = result.geometry_infos_offset;
if (!header->instance_count)
if (header->instance_count) {
if (radv_use_bvh8(pdev))
result.leaf_ids_offset += bvh_info->instance_sideband_data_size;
} else {
result.leaf_ids_offset += header->geometry_count * sizeof(struct rra_geometry_info);
}
return result;
}
static void
rra_dump_tlas_header(struct radv_accel_struct_header *header, size_t parent_id_table_size, size_t leaf_node_data_size,
size_t internal_node_data_size, uint64_t primitive_count, FILE *output)
rra_dump_tlas_header(const struct radv_physical_device *pdev, struct radv_accel_struct_header *header,
size_t parent_id_table_size, struct rra_bvh_info *bvh_info, uint64_t primitive_count, FILE *output)
{
struct rra_accel_struct_header file_header = rra_fill_accel_struct_header_common(
header, parent_id_table_size, leaf_node_data_size, internal_node_data_size, primitive_count);
struct rra_accel_struct_header file_header =
rra_fill_accel_struct_header_common(pdev, header, parent_id_table_size, bvh_info, primitive_count);
file_header.post_build_info.bvh_type = RRA_BVH_TYPE_TLAS;
file_header.geometry_type = VK_GEOMETRY_TYPE_INSTANCES_KHR;
@ -232,12 +239,12 @@ rra_dump_tlas_header(struct radv_accel_struct_header *header, size_t parent_id_t
}
static void
rra_dump_blas_header(struct radv_accel_struct_header *header, size_t parent_id_table_size,
struct radv_accel_struct_geometry_info *geometry_infos, size_t leaf_node_data_size,
size_t internal_node_data_size, uint64_t primitive_count, FILE *output)
rra_dump_blas_header(const struct radv_physical_device *pdev, struct radv_accel_struct_header *header,
size_t parent_id_table_size, struct radv_accel_struct_geometry_info *geometry_infos,
struct rra_bvh_info *bvh_info, uint64_t primitive_count, FILE *output)
{
struct rra_accel_struct_header file_header = rra_fill_accel_struct_header_common(
header, parent_id_table_size, leaf_node_data_size, internal_node_data_size, primitive_count);
struct rra_accel_struct_header file_header =
rra_fill_accel_struct_header_common(pdev, header, parent_id_table_size, bvh_info, primitive_count);
file_header.post_build_info.bvh_type = RRA_BVH_TYPE_BLAS;
file_header.geometry_type = header->geometry_count ? geometry_infos->type : VK_GEOMETRY_TYPE_TRIANGLES_KHR;
@ -281,7 +288,8 @@ rra_validate_header(struct radv_rra_accel_struct_data *accel_struct, const struc
}
static VkResult
rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, uint8_t *data,
rra_dump_acceleration_structure(const struct radv_physical_device *pdev,
struct radv_rra_accel_struct_data *accel_struct, uint8_t *data,
struct hash_table_u64 *accel_struct_vas, bool should_validate, FILE *output)
{
struct radv_accel_struct_header *header = (struct radv_accel_struct_header *)data;
@ -297,10 +305,18 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct,
if (rra_validate_header(accel_struct, header)) {
return VK_ERROR_VALIDATION_FAILED_EXT;
}
if (rra_validate_node_gfx10_3(accel_struct_vas, data + header->bvh_offset,
data + header->bvh_offset + src_root_offset, header->geometry_count,
accel_struct->size, !is_tlas, 0)) {
return VK_ERROR_VALIDATION_FAILED_EXT;
if (radv_use_bvh8(pdev)) {
if (rra_validate_node_gfx12(accel_struct_vas, data + header->bvh_offset,
data + header->bvh_offset + src_root_offset, header->geometry_count,
accel_struct->size, !is_tlas, 0)) {
return VK_ERROR_VALIDATION_FAILED_EXT;
}
} else {
if (rra_validate_node_gfx10_3(accel_struct_vas, data + header->bvh_offset,
data + header->bvh_offset + src_root_offset, header->geometry_count,
accel_struct->size, !is_tlas, 0)) {
return VK_ERROR_VALIDATION_FAILED_EXT;
}
}
}
@ -321,7 +337,10 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct,
struct rra_bvh_info bvh_info = {
.geometry_infos = rra_geometry_infos,
};
rra_gather_bvh_info_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, &bvh_info);
if (radv_use_bvh8(pdev))
rra_gather_bvh_info_gfx12(data + header->bvh_offset, RADV_BVH_ROOT_NODE, &bvh_info);
else
rra_gather_bvh_info_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, &bvh_info);
leaf_indices = calloc(header->geometry_count, sizeof(struct rra_geometry_info));
if (!leaf_indices) {
@ -343,6 +362,8 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct,
uint32_t node_parent_table_size =
((bvh_info.leaf_nodes_size + bvh_info.internal_nodes_size) / 64) * sizeof(uint32_t);
if (radv_use_bvh8(pdev))
node_parent_table_size = 0;
node_parent_table = calloc(node_parent_table_size, 1);
if (!node_parent_table) {
@ -355,7 +376,9 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct,
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto exit;
}
dst_structure_data = calloc(RRA_ROOT_NODE_OFFSET + bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size, 1);
dst_structure_data = calloc(RRA_ROOT_NODE_OFFSET + bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size +
bvh_info.instance_sideband_data_size,
1);
if (!dst_structure_data) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto exit;
@ -366,13 +389,20 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct,
.dst = dst_structure_data,
.dst_leaf_offset = RRA_ROOT_NODE_OFFSET + bvh_info.internal_nodes_size,
.dst_internal_offset = RRA_ROOT_NODE_OFFSET,
.dst_instance_sideband_data_offset =
RRA_ROOT_NODE_OFFSET + bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size,
.parent_id_table = node_parent_table,
.parent_id_table_size = node_parent_table_size,
.leaf_node_ids = leaf_node_ids,
.leaf_indices = leaf_indices,
};
rra_transcode_node_gfx10_3(&ctx, 0xFFFFFFFF, RADV_BVH_ROOT_NODE, header->aabb);
if (radv_use_bvh8(pdev)) {
ctx.dst_internal_offset += sizeof(struct radv_gfx12_box_node);
rra_transcode_node_gfx12(&ctx, 0xFFFFFFFF, RADV_BVH_ROOT_NODE, RRA_ROOT_NODE_OFFSET);
} else {
rra_transcode_node_gfx10_3(&ctx, 0xFFFFFFFF, RADV_BVH_ROOT_NODE, header->aabb);
}
struct rra_accel_struct_chunk_header chunk_header = {
.metadata_offset = 0,
@ -395,8 +425,12 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct,
* the top bits are masked away.
* In order to make sure BLASes can be found in the hashmap, we have
* to replicate that mask here.
* On GFX12, we mask away the top 16 bits because the instance BLAS addresses
* use pointer flags.
*/
uint64_t va = accel_struct->va & 0x1FFFFFFFFFFFFFF;
if (radv_use_bvh8(pdev))
va &= 0xFFFFFFFFFFFF;
memcpy(chunk_header.virtual_address, &va, sizeof(uint64_t));
struct rra_accel_struct_metadata rra_metadata = {
@ -411,15 +445,13 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct,
fwrite(node_parent_table, 1, node_parent_table_size, output);
if (is_tlas)
rra_dump_tlas_header(header, node_parent_table_size, bvh_info.leaf_nodes_size, bvh_info.internal_nodes_size,
primitive_count, output);
rra_dump_tlas_header(pdev, header, node_parent_table_size, &bvh_info, primitive_count, output);
else
rra_dump_blas_header(header, node_parent_table_size, geometry_infos, bvh_info.leaf_nodes_size,
bvh_info.internal_nodes_size, primitive_count, output);
rra_dump_blas_header(pdev, header, node_parent_table_size, geometry_infos, &bvh_info, primitive_count, output);
/* Write acceleration structure data */
fwrite(dst_structure_data + RRA_ROOT_NODE_OFFSET, 1, bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size,
output);
fwrite(dst_structure_data + RRA_ROOT_NODE_OFFSET, 1,
bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size + bvh_info.instance_sideband_data_size, output);
if (!is_tlas)
fwrite(rra_geometry_infos, sizeof(struct rra_geometry_info), header->geometry_count, output);
@ -974,7 +1006,7 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
continue;
accel_struct_offsets[written_accel_struct_count] = (uint64_t)ftell(file);
result = rra_dump_acceleration_structure(data, mapped_data, device->rra_trace.accel_struct_vas,
result = rra_dump_acceleration_structure(pdev, data, mapped_data, device->rra_trace.accel_struct_vas,
device->rra_trace.validate_as, file);
rra_unmap_accel_struct_data(&copy_ctx, i);

View file

@ -285,6 +285,7 @@ radv_node_to_addr(uint64_t node)
struct rra_bvh_info {
uint32_t leaf_nodes_size;
uint32_t internal_nodes_size;
uint32_t instance_sideband_data_size;
struct rra_geometry_info *geometry_infos;
};
@ -293,6 +294,7 @@ struct rra_transcoding_context {
uint8_t *dst;
uint32_t dst_leaf_offset;
uint32_t dst_internal_offset;
uint32_t dst_instance_sideband_data_offset;
uint32_t *parent_id_table;
uint32_t parent_id_table_size;
uint32_t *leaf_node_ids;
@ -307,4 +309,12 @@ void rra_gather_bvh_info_gfx10_3(const uint8_t *bvh, uint32_t node_id, struct rr
uint32_t rra_transcode_node_gfx10_3(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id,
vk_aabb bounds);
bool rra_validate_node_gfx12(struct hash_table_u64 *accel_struct_vas, uint8_t *data, void *node,
uint32_t geometry_count, uint32_t size, bool is_bottom_level, uint32_t depth);
void rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_bvh_info *dst);
void rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id,
uint32_t dst_offset);
#endif /* RADV_RRA_H */

View file

@ -0,0 +1,184 @@
/*
* Copyright © 2025 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
/* GFX12 specific code for RRA. */
#include "bvh/bvh.h"
#include "radv_rra.h"
#include "util/bitset.h"
/* Per-instance data the RRA capture stores out-of-band, after the leaf nodes.
 * Filled from radv_gfx12_instance_node_user_data while transcoding instance
 * nodes (see rra_transcode_node_gfx12). */
struct rra_instance_sideband_data {
   uint32_t instance_index;
   /* Custom instance index packed together with the instance flags.
    * NOTE(review): only user_data->custom_instance is copied in here today;
    * confirm whether the flags half is expected to be zero. */
   uint32_t custom_instance_and_flags;
   /* Size of the referenced BLAS's metadata; lets the RRA tool translate the
    * rebased BLAS address back to the start of the acceleration structure. */
   uint32_t blas_metadata_size;
   uint32_t padding;
   /* Object-to-world transform of the instance. */
   mat3x4 otw_matrix;
};
/* Validates a GFX12 BVH node for an RRA dump.
 *
 * Currently this only guards against runaway recursion depth; per-field
 * checks for GFX12 nodes are not implemented yet (the gfx10_3 path performs
 * more validation), which is why most parameters are unused here.
 *
 * Returns true if validation failed. */
bool
rra_validate_node_gfx12(struct hash_table_u64 *accel_struct_vas, uint8_t *data, void *node, uint32_t geometry_count,
                        uint32_t size, bool is_bottom_level, uint32_t depth)
{
   struct rra_validation_context ctx = {0};

   /* Record the node's location up front so that any failure message —
    * including the depth check below — identifies the offending node instead
    * of reporting an empty location. */
   uint32_t cur_offset = (uint8_t *)node - data;
   snprintf(ctx.location, sizeof(ctx.location), "internal node (offset=%u)", cur_offset);

   if (depth > 1024) {
      rra_validation_fail(&ctx, "depth > 1024");
      return true;
   }

   return ctx.failed;
}
/* Returns the geometry index stored in a GFX12 leaf node.
 * Instance nodes carry no geometry index, so 0 is returned for them.
 * For primitive nodes, a 28-bit geometry index sits right below the
 * indices midpoint, whose bit position is itself stored in bits 42..51
 * of the node. */
static uint32_t
get_geometry_id(const void *node, uint32_t node_type)
{
   if (node_type != radv_bvh_node_instance) {
      const uint32_t indices_midpoint = BITSET_EXTRACT(node, 42, 10);
      return BITSET_EXTRACT(node, indices_midpoint - 28, 28);
   }

   return 0;
}
/* Recursively walks the GFX12 BVH rooted at node_id and accumulates sizing
 * information into dst: total internal-node and leaf-node byte sizes, the
 * sideband-data size contributed by instances, and per-geometry primitive
 * counts. node_id encodes the node type in its low 3 bits and the node
 * offset (in 8-byte units) in the remaining bits. */
void
rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_bvh_info *dst)
{
   const uint32_t node_type = node_id & 7;
   const void *node = bvh + ((node_id & (~7u)) << 3);

   if (node_type == radv_bvh_node_box32) {
      dst->internal_nodes_size += sizeof(struct radv_gfx12_box_node);

      const struct radv_gfx12_box_node *box = node;
      /* Bits 28..31 of child_count_exponents hold the valid child count
       * minus one. */
      const uint32_t child_count = (box->child_count_exponents >> 28) + 1;
      /* Children are laid out contiguously, internal and primitive nodes in
       * separate runs; walk both runs in parallel. */
      uint32_t next_internal_id = box->internal_base_id;
      uint32_t next_primitive_id = box->primitive_base_id;

      for (uint32_t child = 0; child < child_count; child++) {
         const uint32_t child_type = (box->children[child].dword2 >> 24) & 0xf;
         /* Child size in 128-byte node units (bits 28..31 of dword2). */
         const uint32_t child_size = box->children[child].dword2 >> 28;
         uint32_t child_id;

         if (child_type == radv_bvh_node_box32) {
            child_id = next_internal_id | child_type;
            next_internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
         } else {
            child_id = next_primitive_id | child_type;
            next_primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
         }

         rra_gather_bvh_info_gfx12(bvh, child_id, dst);
      }
   } else {
      if (node_type == radv_bvh_node_instance) {
         dst->leaf_nodes_size += sizeof(struct radv_gfx12_instance_node);
         dst->instance_sideband_data_size += sizeof(struct rra_instance_sideband_data);
      } else if (node_type == radv_bvh_node_triangle) {
         dst->leaf_nodes_size += sizeof(struct radv_gfx12_primitive_node);
      } else {
         unreachable("Invalid node type");
      }

      dst->geometry_infos[get_geometry_id(node, node_type)].primitive_count++;
   }
}
/* Transcodes one GFX12 box (BVH8 internal) node from the driver BVH into the
 * RRA capture layout at dst_offset, then recursively transcodes its children.
 *
 * The destination keeps internal nodes and leaf nodes in two separate
 * contiguous regions (ctx->dst_internal_offset / ctx->dst_leaf_offset), so
 * this node's children are first counted by kind and their slots reserved
 * before any recursion happens — the recursive calls bump those same context
 * offsets for their own children. */
static void
rra_transcode_box8_node(struct rra_transcoding_context *ctx, const struct radv_gfx12_box_node *src, uint32_t parent_id,
                        uint32_t dst_offset)
{
   struct radv_gfx12_box_node *dst = (struct radv_gfx12_box_node *)(ctx->dst + dst_offset);
   memcpy(dst, src, sizeof(struct radv_gfx12_box_node));

   /* Rebase the child base IDs onto the destination layout; node IDs are in
    * 8-byte units, hence the >> 3. */
   dst->internal_base_id = ctx->dst_internal_offset >> 3;
   dst->primitive_base_id = ctx->dst_leaf_offset >> 3;
   /* The otherwise-unused dword is repurposed to store the parent ID for the
    * RRA tooling. */
   dst->unused = parent_id;

   /* Bits 28..31 of child_count_exponents hold the valid child count minus one. */
   uint32_t valid_child_count_minus_one = dst->child_count_exponents >> 28;

   /* First pass: count children of each kind so their destination slots can
    * be reserved before recursing (child type lives in bits 24..27 of dword2). */
   uint32_t internal_child_count = 0;
   uint32_t leaf_child_count = 0;
   for (uint32_t i = 0; i <= valid_child_count_minus_one; ++i) {
      uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf;
      if (child_type == radv_bvh_node_box32)
         internal_child_count++;
      else
         leaf_child_count++;
   }

   /* Reserve this node's child slots; the saved local offsets are handed out
    * to the children below while the context offsets already point past them. */
   uint32_t dst_internal_offset = ctx->dst_internal_offset;
   ctx->dst_internal_offset += internal_child_count * RADV_GFX12_BVH_NODE_SIZE;
   uint32_t dst_leaf_offset = ctx->dst_leaf_offset;
   ctx->dst_leaf_offset += leaf_child_count * RADV_GFX12_BVH_NODE_SIZE;

   /* Second pass: reconstruct each child's source node ID from the running
    * base IDs (children are stored contiguously per kind) and transcode it. */
   uint32_t internal_id = src->internal_base_id;
   uint32_t primitive_id = src->primitive_base_id;
   for (uint32_t i = 0; i <= valid_child_count_minus_one; ++i) {
      uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf;
      /* Child size in 128-byte node units (bits 28..31 of dword2). */
      uint32_t child_size = src->children[i].dword2 >> 28;
      uint32_t child_id;
      uint32_t child_dst_offset;
      if (child_type == radv_bvh_node_box32) {
         child_id = internal_id | child_type;
         internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
         child_dst_offset = dst_internal_offset;
         dst_internal_offset += RADV_GFX12_BVH_NODE_SIZE;
      } else {
         child_id = primitive_id | child_type;
         primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
         child_dst_offset = dst_leaf_offset;
         dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE;
      }

      rra_transcode_node_gfx12(ctx, radv_bvh_node_box32 | (dst_offset >> 3), child_id, child_dst_offset);

      /* Every transcoded child now occupies exactly one 128-byte node in the
       * destination, so force the child's size field (bits 28..31) to 1 while
       * keeping the type and offset bits. */
      dst->children[i].dword2 = (dst->children[i].dword2 & 0x0fffffff) | (1 << 28);
   }
}
/* Transcodes a single GFX12 BVH node (identified by src_id, type in the low
 * 3 bits, offset in 8-byte units in the rest) into the RRA layout at
 * dst_offset. Box nodes are delegated to rra_transcode_box8_node, which
 * recurses over the children; leaf nodes are copied verbatim, and instance
 * nodes additionally get their BLAS address rebased and their trailing user
 * data split out into an rra_instance_sideband_data record. */
void
rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, uint32_t dst_offset)
{
   uint32_t node_type = src_id & 7;
   uint32_t src_offset = (src_id & (~7u)) << 3;

   const void *src_child_node = ctx->src + src_offset;
   if (node_type == radv_bvh_node_box32) {
      rra_transcode_box8_node(ctx, src_child_node, parent_id, dst_offset);
   } else {
      memcpy(ctx->dst + dst_offset, src_child_node, RADV_GFX12_BVH_NODE_SIZE);

      if (node_type == radv_bvh_node_instance) {
         struct radv_gfx12_instance_node *dst = (void *)(ctx->dst + dst_offset);

         /* Sideband records are appended sequentially after the leaf nodes. */
         struct rra_instance_sideband_data *sideband_data = (void *)(ctx->dst + ctx->dst_instance_sideband_data_offset);
         ctx->dst_instance_sideband_data_offset += sizeof(struct rra_instance_sideband_data);

         /* The driver stores extra per-instance data directly behind the
          * instance node. */
         const struct radv_gfx12_instance_node_user_data *user_data =
            (const void *)((const uint8_t *)src_child_node + sizeof(struct radv_gfx12_instance_node));

         /* Rebase the BLAS address (stored in 8-byte units alongside pointer
          * flags): subtract the driver's BVH offset and add the RRA metadata
          * size, so the address points at the BLAS's data within the RRA
          * file layout. */
         dst->pointer_flags_bvh_addr = dst->pointer_flags_bvh_addr - (user_data->bvh_offset >> 3) +
                                       (sizeof(struct rra_accel_struct_metadata) >> 3);
         /* The otherwise-unused dword is repurposed to store the parent ID
          * for the RRA tooling. */
         dst->unused = parent_id;

         sideband_data->instance_index = user_data->instance_index;
         /* NOTE(review): only the custom instance index is copied; the flags
          * portion of custom_instance_and_flags stays zero — confirm that is
          * what the RRA format expects. */
         sideband_data->custom_instance_and_flags = user_data->custom_instance;
         sideband_data->blas_metadata_size = offsetof(struct rra_accel_struct_metadata, unused);
         sideband_data->otw_matrix = user_data->otw_matrix;
      }
   }
}