From 97f6287827371c9c208cd6e26419d65685111f46 Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Tue, 18 Mar 2025 13:41:02 +0100 Subject: [PATCH] radv: Use the BVH8 format on GFX12 Reviewed-by: Natalie Vock Part-of: --- docs/envvars.rst | 2 + src/amd/vulkan/bvh/build_helpers.h | 4 + src/amd/vulkan/bvh/build_interface.h | 10 + src/amd/vulkan/bvh/bvh.h | 58 +++ src/amd/vulkan/bvh/copy.comp | 6 +- src/amd/vulkan/bvh/copy_blas_addrs_gfx12.comp | 65 +++ src/amd/vulkan/bvh/encode.h | 269 +++++++++++ src/amd/vulkan/bvh/encode_gfx12.comp | 275 ++++++++++++ src/amd/vulkan/bvh/meson.build | 20 + src/amd/vulkan/bvh/update.comp | 4 +- src/amd/vulkan/bvh/update.h | 17 +- src/amd/vulkan/bvh/update_gfx12.comp | 213 +++++++++ src/amd/vulkan/meson.build | 1 + .../vulkan/nir/radv_nir_lower_ray_queries.c | 60 ++- src/amd/vulkan/nir/radv_nir_rt_common.c | 425 +++++++++++++++++- src/amd/vulkan/nir/radv_nir_rt_common.h | 15 +- src/amd/vulkan/nir/radv_nir_rt_shader.c | 39 +- src/amd/vulkan/radv_acceleration_structure.c | 364 +++++++++++---- src/amd/vulkan/radv_debug.h | 1 + src/amd/vulkan/radv_device.h | 2 + src/amd/vulkan/radv_instance.c | 1 + src/amd/vulkan/radv_physical_device.c | 8 + src/amd/vulkan/radv_physical_device.h | 3 + src/amd/vulkan/radv_rra.c | 96 ++-- src/amd/vulkan/radv_rra.h | 10 + src/amd/vulkan/radv_rra_gfx12.c | 184 ++++++++ 26 files changed, 1948 insertions(+), 204 deletions(-) create mode 100644 src/amd/vulkan/bvh/copy_blas_addrs_gfx12.comp create mode 100644 src/amd/vulkan/bvh/encode_gfx12.comp create mode 100644 src/amd/vulkan/bvh/update_gfx12.comp create mode 100644 src/amd/vulkan/radv_rra_gfx12.c diff --git a/docs/envvars.rst b/docs/envvars.rst index dcfc0111d18..02fe920ec80 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -1461,6 +1461,8 @@ RADV driver environment variables Dump backend IR (ACO or LLVM) for selected shader stages. ``asm`` Dump shader disassembly for selected shader stages. 
+ ``bvh4`` + Use bvh4 encoding on GPUs that support bvh8 encoding. .. envvar:: RADV_FORCE_FAMILY diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h index 910c3535324..14258005d7f 100644 --- a/src/amd/vulkan/bvh/build_helpers.h +++ b/src/amd/vulkan/bvh/build_helpers.h @@ -17,6 +17,10 @@ TYPE(radv_bvh_aabb_node, 4); TYPE(radv_bvh_instance_node, 8); TYPE(radv_bvh_box16_node, 4); TYPE(radv_bvh_box32_node, 4); +TYPE(radv_gfx12_box_node, 4); +TYPE(radv_gfx12_instance_node, 8); +TYPE(radv_gfx12_instance_node_user_data, 4); +TYPE(radv_gfx12_primitive_node, 4); uint32_t id_to_offset(uint32_t id) diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h index c0c06c98fed..8dd856c7c54 100644 --- a/src/amd/vulkan/bvh/build_interface.h +++ b/src/amd/vulkan/bvh/build_interface.h @@ -35,6 +35,16 @@ struct encode_args { uint32_t geometry_type; }; +struct encode_gfx12_args { + VOID_REF intermediate_bvh; + VOID_REF output_base; + REF(vk_ir_header) header; + uint32_t output_bvh_offset; + uint32_t leaf_node_offsets_offset; + uint32_t leaf_node_count; + uint32_t geometry_type; +}; + struct header_args { REF(vk_ir_header) src; REF(radv_accel_struct_header) dst; diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h index d15ad5f276b..162242f88a3 100644 --- a/src/amd/vulkan/bvh/bvh.h +++ b/src/amd/vulkan/bvh/bvh.h @@ -58,10 +58,12 @@ struct radv_accel_struct_header { uint64_t size; /* Everything after this gets updated/copied from the CPU. 
*/ + uint32_t geometry_type; uint32_t geometry_count; uint32_t primitive_base_indices_offset; uint64_t instance_offset; uint64_t instance_count; + uint32_t leaf_node_offsets_offset; uint32_t build_flags; }; @@ -114,4 +116,60 @@ struct radv_bvh_box32_node { #define RADV_BVH_ROOT_NODE radv_bvh_node_box32 #define RADV_BVH_INVALID_NODE 0xffffffffu +/* GFX12 */ + +#define RADV_GFX12_BVH_NODE_SIZE 128 + +struct radv_gfx12_box_child { + uint32_t dword0; + uint32_t dword1; + uint32_t dword2; +}; + +#ifndef VULKAN +typedef struct radv_gfx12_box_child radv_gfx12_box_child; +#endif + +struct radv_gfx12_box_node { + uint32_t internal_base_id; + uint32_t primitive_base_id; + uint32_t unused; + vec3 origin; + uint32_t child_count_exponents; + uint32_t obb_matrix_index; + radv_gfx12_box_child children[8]; +}; + +struct radv_gfx12_instance_node { + mat3x4 wto_matrix; + uint64_t pointer_flags_bvh_addr; + uint32_t unused; + uint32_t cull_mask_user_data; + vec3 origin; + uint32_t child_count_exponents; + radv_gfx12_box_child children[4]; +}; + +struct radv_gfx12_instance_node_user_data { + mat3x4 otw_matrix; + uint32_t custom_instance; + uint32_t instance_index; + uint32_t bvh_offset; + uint32_t padding; + uint64_t blas_addr; + uint32_t primitive_base_indices_offset; + uint32_t leaf_node_offsets_offset; + uint32_t unused[12]; +}; + +/* Size of the primitive header section in bits. */ +#define RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE 52 + +/* Size of a primitive pair description in bits. */ +#define RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE 29 + +struct radv_gfx12_primitive_node { + uint32_t dwords[32]; +}; + #endif /* BVH_H */ diff --git a/src/amd/vulkan/bvh/copy.comp b/src/amd/vulkan/bvh/copy.comp index 52fac4e0bd3..2df5285c19f 100644 --- a/src/amd/vulkan/bvh/copy.comp +++ b/src/amd/vulkan/bvh/copy.comp @@ -71,7 +71,10 @@ main(void) DEREF(REF(uvec4)(copy_src_addr + offset)); /* Do the adjustment inline in the same invocation that copies the data so that we don't have - * to synchronize. 
*/ + * to synchronize. This is only possible on pre-GFX12 HW because leaf nodes have a different + * order on GFX12. + */ +#if !GFX12 if (offset < node_end && offset >= node_offset && (offset - node_offset) % SIZEOF(radv_bvh_instance_node) == 0) { uint64_t idx = (offset - node_offset) / SIZEOF(radv_bvh_instance_node); @@ -85,5 +88,6 @@ main(void) DEREF(REF(radv_bvh_instance_node)(copy_dst_addr + offset)).bvh_ptr = addr_to_node(blas_addr + bvh_offset); } } +#endif } } diff --git a/src/amd/vulkan/bvh/copy_blas_addrs_gfx12.comp b/src/amd/vulkan/bvh/copy_blas_addrs_gfx12.comp new file mode 100644 index 00000000000..40e96256267 --- /dev/null +++ b/src/amd/vulkan/bvh/copy_blas_addrs_gfx12.comp @@ -0,0 +1,65 @@ +/* + * Copyright © 2022 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +#include "build_interface.h" + +layout(push_constant) uniform CONSTS +{ + copy_args args; +}; + +void +main(void) +{ + uint32_t global_id = gl_GlobalInvocationID.x; + uint32_t total_invocations = gl_NumWorkGroups.x * 64; + + uint64_t accel_struct_addr = args.mode == RADV_COPY_MODE_SERIALIZE ? args.src_addr : args.dst_addr; + uint64_t serialized_addr = args.mode == RADV_COPY_MODE_SERIALIZE ? 
args.dst_addr : args.src_addr; + + uint64_t blas_addrs = serialized_addr + SIZEOF(radv_accel_struct_serialization_header); + + radv_accel_struct_serialization_header serialization_header = + DEREF(REF(radv_accel_struct_serialization_header)(serialized_addr)); + + radv_accel_struct_header header = DEREF(REF(radv_accel_struct_header)(accel_struct_addr)); + + for (uint32_t i = global_id; i < serialization_header.instance_count; i += total_invocations) { + uint64_t instance_offset_addr = accel_struct_addr + (header.leaf_node_offsets_offset + i * 4); + uint64_t instance_addr = accel_struct_addr + (header.bvh_offset + DEREF(REF(uint32_t)(instance_offset_addr))); + REF(radv_gfx12_instance_node) instance_node = REF(radv_gfx12_instance_node)(instance_addr); + REF(radv_gfx12_instance_node_user_data) instance_data = + REF(radv_gfx12_instance_node_user_data)(instance_addr + SIZEOF(radv_gfx12_instance_node)); + + if (args.mode == RADV_COPY_MODE_SERIALIZE) { + DEREF(INDEX(uint64_t, blas_addrs, i)) = DEREF(instance_data).blas_addr; + } else { + uint32_t bvh_offset = DEREF(instance_data).bvh_offset; + + /* Replace the address while keeping the pointer flags. 
*/ + uint64_t pointer_flags_bvh_addr = DEREF(instance_node).pointer_flags_bvh_addr; + uint64_t blas_addr = DEREF(INDEX(uint64_t, blas_addrs, i)); + DEREF(instance_node).pointer_flags_bvh_addr = + (pointer_flags_bvh_addr & 0xFFC0000000000000ul) | addr_to_node(blas_addr + bvh_offset); + DEREF(instance_data).blas_addr = blas_addr; + } + } +} diff --git a/src/amd/vulkan/bvh/encode.h b/src/amd/vulkan/bvh/encode.h index c375c059deb..25abeb2ac8d 100644 --- a/src/amd/vulkan/bvh/encode.h +++ b/src/amd/vulkan/bvh/encode.h @@ -52,4 +52,275 @@ radv_encode_instance_gfx10_3(VOID_REF dst_addr, vk_ir_instance_node src) DEREF(dst).instance_id = src.instance_id; } +struct bit_writer { + uint64_t addr; + uint32_t offset; + uint32_t temp; + uint32_t count; + uint32_t total_count; +}; + +void +bit_writer_init(out bit_writer writer, uint64_t addr) +{ + writer.addr = addr; + writer.offset = 0; + writer.temp = 0; + writer.count = 0; + writer.total_count = 0; +} + +void +bit_writer_write(inout bit_writer writer, uint32_t data, uint32_t bit_size) +{ + writer.total_count += bit_size; + + if (writer.count + bit_size >= 32) { + writer.temp = writer.temp | (data << writer.count); + + REF(uint32_t) dst = REF(uint32_t)(writer.addr + writer.offset); + DEREF(dst) = writer.temp; + writer.offset += 4; + + bit_size = bit_size - (32 - writer.count); + if (writer.count == 0) + data = 0; + else + data = data >> (32 - writer.count); + + writer.temp = 0; + writer.count = 0; + } + + writer.temp = writer.temp | (data << writer.count); + writer.count += bit_size; +} + +void +bit_writer_skip_to(inout bit_writer writer, uint32_t target) +{ + /* Flush the remaining data. */ + if (writer.count > 0) { + REF(uint32_t) dst = REF(uint32_t)(writer.addr + writer.offset); + DEREF(dst) = writer.temp; + } + + writer.count = target % 32; + writer.total_count = target; + writer.offset = (target / 32) * 4; + /* The pending bits were flushed above; clear them so they do not leak into the dword at the new position. */ + writer.temp = 0; +} + +void +bit_writer_finish(inout bit_writer writer) +{ + /* Flush the remaining data. 
*/ + if (writer.count > 0) { + REF(uint32_t) dst = REF(uint32_t)(writer.addr + writer.offset); + DEREF(dst) = writer.temp; + } + + writer.temp = 0; + writer.count = 0; + writer.total_count = 0; +} + +void +radv_encode_triangle_gfx12(VOID_REF dst, vk_ir_triangle_node src) +{ + bit_writer child_writer; + bit_writer_init(child_writer, dst); + + bit_writer_write(child_writer, 31, 5); /* x_vertex_bits_minus_one */ + bit_writer_write(child_writer, 31, 5); /* y_vertex_bits_minus_one */ + bit_writer_write(child_writer, 31, 5); /* z_vertex_bits_minus_one */ + bit_writer_write(child_writer, 0, 5); /* trailing_zero_bits */ + bit_writer_write(child_writer, 14, 4); /* geometry_index_base_bits_div_2 */ + bit_writer_write(child_writer, 14, 4); /* geometry_index_bits_div_2 */ + bit_writer_write(child_writer, 0, 3); /* triangle_pair_count_minus_one */ + bit_writer_write(child_writer, 0, 1); /* vertex_type */ + bit_writer_write(child_writer, 28, 5); /* primitive_index_base_bits */ + bit_writer_write(child_writer, 28, 5); /* primitive_index_bits */ + /* header + 9 floats + geometry_id */ + bit_writer_write(child_writer, RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE + 9 * 32 + 28, 10); + + bit_writer_write(child_writer, floatBitsToUint(src.coords[0][0]), 32); + bit_writer_write(child_writer, floatBitsToUint(src.coords[0][1]), 32); + bit_writer_write(child_writer, floatBitsToUint(src.coords[0][2]), 32); + bit_writer_write(child_writer, floatBitsToUint(src.coords[1][0]), 32); + bit_writer_write(child_writer, floatBitsToUint(src.coords[1][1]), 32); + bit_writer_write(child_writer, floatBitsToUint(src.coords[1][2]), 32); + bit_writer_write(child_writer, floatBitsToUint(src.coords[2][0]), 32); + bit_writer_write(child_writer, floatBitsToUint(src.coords[2][1]), 32); + bit_writer_write(child_writer, floatBitsToUint(src.coords[2][2]), 32); + + bit_writer_write(child_writer, src.geometry_id_and_flags & 0xfffffff, 28); + bit_writer_write(child_writer, src.triangle_id, 28); + + 
bit_writer_skip_to(child_writer, 32 * 32 - RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE); + + uint32_t opaque = (src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0 ? 1 : 0; + + bit_writer_write(child_writer, 1, 1); /* prim_range_stop */ + bit_writer_write(child_writer, 0, 1); /* tri1_double_sided */ + bit_writer_write(child_writer, 0, 1); /* tri1_opaque */ + bit_writer_write(child_writer, 0, 4); /* tri1_v0_index */ + bit_writer_write(child_writer, 0, 4); /* tri1_v1_index */ + bit_writer_write(child_writer, 0, 4); /* tri1_v2_index */ + bit_writer_write(child_writer, 0, 1); /* tri0_double_sided */ + bit_writer_write(child_writer, opaque, 1); /* tri0_opaque */ + bit_writer_write(child_writer, 0, 4); /* tri0_v0_index */ + bit_writer_write(child_writer, 1, 4); /* tri0_v1_index */ + bit_writer_write(child_writer, 2, 4); /* tri0_v2_index */ + + bit_writer_finish(child_writer); +} + +void +radv_encode_aabb_gfx12(VOID_REF dst, vk_ir_aabb_node src) +{ + bit_writer child_writer; + bit_writer_init(child_writer, dst); + + bit_writer_write(child_writer, 0, 5); /* x_vertex_bits_minus_one */ + bit_writer_write(child_writer, 0, 5); /* y_vertex_bits_minus_one */ + bit_writer_write(child_writer, 0, 5); /* z_vertex_bits_minus_one */ + bit_writer_write(child_writer, 0, 5); /* trailing_zero_bits */ + bit_writer_write(child_writer, 14, 4); /* geometry_index_base_bits_div_2 */ + bit_writer_write(child_writer, 14, 4); /* geometry_index_bits_div_2 */ + bit_writer_write(child_writer, 0, 3); /* triangle_pair_count_minus_one */ + bit_writer_write(child_writer, 0, 1); /* vertex_type */ + bit_writer_write(child_writer, 28, 5); /* primitive_index_base_bits */ + bit_writer_write(child_writer, 28, 5); /* primitive_index_bits */ + /* header + 6 floats + geometry_id */ + bit_writer_write(child_writer, RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE + 6 * 32 + 28, 10); + + bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.min.x), 32); + bit_writer_write(child_writer, 
floatBitsToUint(src.base.aabb.min.y), 32); + bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.min.z), 32); + bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.max.x), 32); + bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.max.y), 32); + bit_writer_write(child_writer, floatBitsToUint(src.base.aabb.max.z), 32); + + bit_writer_write(child_writer, src.geometry_id_and_flags & 0xfffffff, 28); + bit_writer_write(child_writer, src.primitive_id, 28); + + bit_writer_skip_to(child_writer, 32 * 32 - RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE); + + uint32_t opaque = (src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0 ? 1 : 0; + + bit_writer_write(child_writer, 1, 1); /* prim_range_stop */ + bit_writer_write(child_writer, 0, 1); /* tri1_double_sided */ + bit_writer_write(child_writer, 0, 1); /* tri1_opaque */ + bit_writer_write(child_writer, 0, 4); /* tri1_v0_index */ + bit_writer_write(child_writer, 0, 4); /* tri1_v1_index */ + bit_writer_write(child_writer, 0, 4); /* tri1_v2_index */ + bit_writer_write(child_writer, 0, 1); /* tri0_double_sided */ + bit_writer_write(child_writer, opaque, 1); /* tri0_opaque */ + bit_writer_write(child_writer, 0xf, 4); /* tri0_v0_index */ + bit_writer_write(child_writer, 0xf, 4); /* tri0_v1_index */ + bit_writer_write(child_writer, 0, 4); /* tri0_v2_index */ + + bit_writer_finish(child_writer); +} + +/* Writes both the HW node and user data. 
*/ +void +radv_encode_instance_gfx12(VOID_REF dst, vk_ir_instance_node src) +{ + bit_writer child_writer; + bit_writer_init(child_writer, dst); + + radv_accel_struct_header blas_header = DEREF(REF(radv_accel_struct_header)(src.base_ptr)); + + mat4 transform = mat4(src.otw_matrix); + mat4 wto_matrix = transpose(inverse(transpose(transform))); + + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[0][0]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[0][1]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[0][2]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[0][3]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[1][0]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[1][1]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[1][2]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[1][3]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[2][0]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[2][1]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[2][2]), 32); + bit_writer_write(child_writer, floatBitsToUint(wto_matrix[2][3]), 32); + + uint32_t flags = src.sbt_offset_and_flags >> 24; + uint32_t instance_pointer_flags = 0; + if ((flags & VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR) != 0) + instance_pointer_flags |= 1; + if ((flags & VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR) != 0) + instance_pointer_flags |= 2; + if ((flags & VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR) != 0 || + blas_header.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) + instance_pointer_flags |= 4; + if ((flags & VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR) != 0) + instance_pointer_flags |= 8; + + if (blas_header.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) + instance_pointer_flags |= 512; + else + instance_pointer_flags |= 256; + + uint64_t bvh_addr = addr_to_node(src.base_ptr + 
blas_header.bvh_offset); + bit_writer_write(child_writer, uint32_t(bvh_addr & 0xffffffff), 32); + bit_writer_write(child_writer, uint32_t(bvh_addr >> 32) | (instance_pointer_flags << (54 - 32)), 32); + bit_writer_write(child_writer, src.custom_instance_and_mask & 0xffffff, 32); + bit_writer_write(child_writer, src.sbt_offset_and_flags & 0xffffff, 24); + bit_writer_write(child_writer, src.custom_instance_and_mask >> 24, 8); + + bit_writer_write(child_writer, floatBitsToUint(blas_header.aabb.min.x), 32); + bit_writer_write(child_writer, floatBitsToUint(blas_header.aabb.min.y), 32); + bit_writer_write(child_writer, floatBitsToUint(blas_header.aabb.min.z), 32); + + vec3 child_extent = blas_header.aabb.max - blas_header.aabb.min; + uvec3 child_extent_exponents = uvec3(ceil(clamp(log2(child_extent) + 127.0, vec3(0.0), vec3(255)))); + + bit_writer_write(child_writer, child_extent_exponents.x, 8); + bit_writer_write(child_writer, child_extent_exponents.y, 8); + bit_writer_write(child_writer, child_extent_exponents.z, 8); + bit_writer_write(child_writer, 0, 4); + bit_writer_write(child_writer, 0, 4); + + bit_writer_write(child_writer, 0, 12); + bit_writer_write(child_writer, 0, 12); + bit_writer_write(child_writer, 4, 8); + bit_writer_write(child_writer, 0, 12); + bit_writer_write(child_writer, 0xfff, 12); + bit_writer_write(child_writer, 0xff, 8); + bit_writer_write(child_writer, 0xfff, 12); + bit_writer_write(child_writer, 0xfff, 12); + bit_writer_write(child_writer, radv_bvh_node_box32, 4); + bit_writer_write(child_writer, 1, 4); + + for (uint32_t remaining_child_index = 0; remaining_child_index < 3; remaining_child_index++) { + bit_writer_write(child_writer, 0xfff, 12); + bit_writer_write(child_writer, 0xfff, 12); + bit_writer_write(child_writer, 0xff, 8); + bit_writer_write(child_writer, 0xfff, 12); + bit_writer_write(child_writer, 0, 12); + bit_writer_write(child_writer, 0, 8); + bit_writer_write(child_writer, 0, 12); + bit_writer_write(child_writer, 0, 12); + 
bit_writer_write(child_writer, 0, 8); + } + + bit_writer_finish(child_writer); + + REF(radv_gfx12_instance_node_user_data) user_data = + REF(radv_gfx12_instance_node_user_data)(dst + RADV_GFX12_BVH_NODE_SIZE); + DEREF(user_data).otw_matrix = src.otw_matrix; + DEREF(user_data).custom_instance = src.custom_instance_and_mask & 0xffffff; + DEREF(user_data).instance_index = src.instance_id; + DEREF(user_data).bvh_offset = blas_header.bvh_offset; + DEREF(user_data).blas_addr = src.base_ptr; + DEREF(user_data).primitive_base_indices_offset = blas_header.primitive_base_indices_offset; + DEREF(user_data).leaf_node_offsets_offset = blas_header.leaf_node_offsets_offset; +} + #endif diff --git a/src/amd/vulkan/bvh/encode_gfx12.comp b/src/amd/vulkan/bvh/encode_gfx12.comp new file mode 100644 index 00000000000..08fbf8bf97b --- /dev/null +++ b/src/amd/vulkan/bvh/encode_gfx12.comp @@ -0,0 +1,275 @@ +/* + * Copyright © 2022 Friedrich Vock + * Copyright © 2025 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_KHR_memory_scope_semantics : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +#define GFX12 + +#include "build_helpers.h" +#include "build_interface.h" +#include "encode.h" + +layout(push_constant) uniform CONSTS +{ + encode_gfx12_args args; +}; + +void +set_parent(uint32_t child, uint32_t parent) +{ + uint64_t addr = args.output_base + args.output_bvh_offset - child / 16 * 4 - 4; 
+ DEREF(REF(uint32_t)(addr)) = parent; +} + +void +main() +{ + if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count) + return; + + /* Revert the order so we start at the root */ + uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; + + uint32_t ir_leaf_node_size; + switch (args.geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { + ir_leaf_node_size = SIZEOF(vk_ir_triangle_node); + break; + } + case VK_GEOMETRY_TYPE_AABBS_KHR: { + ir_leaf_node_size = SIZEOF(vk_ir_aabb_node); + break; + } + default: + /* instances */ + ir_leaf_node_size = SIZEOF(vk_ir_instance_node); + break; + } + + uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * ir_leaf_node_size; + uint32_t dst_internal_offset = id_to_offset(RADV_BVH_ROOT_NODE); + + REF(vk_ir_box_node) intermediate_internal_nodes = + REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); + REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id); + vk_ir_box_node src = DEREF(src_node); + + bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1; + + for (;;) { + /* Make changes to the current node's BVH offset value visible. */ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + + uint32_t bvh_offset = is_root_node ? 
id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset; + if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) + continue; + + if (bvh_offset == VK_NULL_BVH_OFFSET) + break; + + REF(radv_gfx12_box_node) dst = REF(radv_gfx12_box_node)(args.output_base + (args.output_bvh_offset + bvh_offset)); + + uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32); + + uint32_t children[8]; + + uint32_t found_child_count = 0; + for (uint32_t i = 0; i < 2; i++) { + if (src.children[i] != RADV_BVH_INVALID_NODE) { + children[found_child_count] = src.children[i]; + found_child_count++; + } + } + + /* TODO: Collapse child nodes with high SAH values. */ + while (found_child_count < 8) { + bool progress = false; + for (int32_t i = 0; i < found_child_count; i++) { + uint32_t child_id = children[i]; + if (ir_id_to_type(child_id) != vk_ir_node_internal) + continue; + + progress = true; + + REF(vk_ir_box_node) child_node = + REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child_id)); + uint32_t grandchildren[2] = DEREF(child_node).children; + uint32_t valid_grandchild_count = 0; + + if (grandchildren[1] != RADV_BVH_INVALID_NODE) + valid_grandchild_count++; + + if (grandchildren[0] != RADV_BVH_INVALID_NODE) + valid_grandchild_count++; + else + grandchildren[0] = grandchildren[1]; + + if (valid_grandchild_count > 1) { + children[found_child_count] = grandchildren[1]; + found_child_count++; + } + + if (valid_grandchild_count > 0) { + children[i] = grandchildren[0]; + } else { + found_child_count--; + children[i] = children[found_child_count]; + } + + DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; + + if (found_child_count == 8) + break; + } + + if (!progress) + break; + } + + uint32_t child_leaf_nodes_size = 0; + uint32_t child_internal_nodes_size = 0; + for (uint32_t i = 0; i < found_child_count; i++) { + uint32_t type = ir_id_to_type(children[i]); + if (type == vk_ir_node_internal) + child_internal_nodes_size += RADV_GFX12_BVH_NODE_SIZE; + else if (type == 
vk_ir_node_instance) + child_leaf_nodes_size += 2 * RADV_GFX12_BVH_NODE_SIZE; + else + child_leaf_nodes_size += RADV_GFX12_BVH_NODE_SIZE; + } + + uint32_t dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size); + uint32_t dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size); + + vec3 origin = src.base.aabb.min; + vec3 extent = src.base.aabb.max - src.base.aabb.min; + + extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000); + uvec3 extent_exponents = floatBitsToUint(extent) >> 23; + + DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0); + DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0); + DEREF(dst).origin = origin; + DEREF(dst).child_count_exponents = + extent_exponents.x | (extent_exponents.y << 8) | (extent_exponents.z << 16) | ((found_child_count - 1) << 28); + DEREF(dst).obb_matrix_index = 0x7f; + + for (uint32_t i = 0; i < found_child_count; i++) { + uint32_t child_id = children[i]; + uint32_t type = ir_id_to_type(child_id); + uint32_t offset = ir_id_to_offset(child_id); + + uint32_t child_node_size_128b = 1; + uint32_t encoded_type = 0; + uint32_t dst_offset = 0; + uint32_t cull_mask = 0xff; + if (type == vk_ir_node_internal) { + encoded_type = 5; + dst_offset = dst_internal_offset; + + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset); + DEREF(child_node).bvh_offset = dst_internal_offset; + + dst_internal_offset += RADV_GFX12_BVH_NODE_SIZE; + } else { + dst_offset = dst_leaf_offset; + + /* Write leaf node offset. 
*/ + uint32_t child_index = offset / ir_leaf_node_size; + REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset); + child_dst_offset = INDEX(uint32_t, child_dst_offset, child_index); + DEREF(child_dst_offset) = dst_offset; + + VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_leaf_offset; + + switch (args.geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { + vk_ir_triangle_node src_node = DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, offset))); + radv_encode_triangle_gfx12(dst_leaf_addr, src_node); + dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE; + break; + } + case VK_GEOMETRY_TYPE_AABBS_KHR: { + vk_ir_aabb_node src_node = DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, offset))); + radv_encode_aabb_gfx12(dst_leaf_addr, src_node); + dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE; + break; + } + default: + /* instances */ + encoded_type = 6; + child_node_size_128b = 2; + + vk_ir_instance_node src_node = DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, offset))); + radv_encode_instance_gfx12(dst_leaf_addr, src_node); + + cull_mask = src_node.custom_instance_and_mask >> 24; + + dst_leaf_offset += 2 * RADV_GFX12_BVH_NODE_SIZE; + + break; + } + } + + vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, offset)).aabb; + + radv_gfx12_box_child child; + /* TODO: subtree flags culling */ + child.dword0 = min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) | + (min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12); + /* TODO: subtree mask culling */ + child.dword1 = + min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) | + (min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12) | + (cull_mask << 24); + child.dword2 = + min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) | 
+ (min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12) | + (encoded_type << 24) | (child_node_size_128b << 28); + DEREF(dst).children[i] = child; + + set_parent(pack_node_id(dst_offset, encoded_type), node_id); + } + + /* Set remaining children to invalid */ + for (uint32_t i = found_child_count; i < 8; i++) { + radv_gfx12_box_child null_child; + null_child.dword0 = 0xffffffff; + null_child.dword1 = 0xfff; + null_child.dword2 = 0; + DEREF(dst).children[i] = null_child; + } + + /* Make changes to the children's BVH offset value available to the other invocations. */ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + break; + } + + if (is_root_node) { + REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(args.output_base); + DEREF(header).aabb = src.base.aabb; + DEREF(header).bvh_offset = args.output_bvh_offset; + + set_parent(RADV_BVH_ROOT_NODE, RADV_BVH_INVALID_NODE); + } +} diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build index ec3d5bc5827..e3ab68cbd25 100644 --- a/src/amd/vulkan/bvh/meson.build +++ b/src/amd/vulkan/bvh/meson.build @@ -3,9 +3,24 @@ # source file, output name, defines bvh_shaders = [ + [ + 'copy_blas_addrs_gfx12.comp', + 'copy_blas_addrs_gfx12', + [], + ], [ 'copy.comp', 'copy', + ['GFX12=0'], + ], + [ + 'copy.comp', + 'copy_gfx12', + ['GFX12=1'], + ], + [ + 'encode_gfx12.comp', + 'encode_gfx12', [], ], [ @@ -28,6 +43,11 @@ bvh_shaders = [ 'update', [], ], + [ + 'update_gfx12.comp', + 'update_gfx12', + [], + ], [ 'leaf.comp', 'radv_leaf', diff --git a/src/amd/vulkan/bvh/update.comp b/src/amd/vulkan/bvh/update.comp index e37194ce772..71f5314b056 100644 --- a/src/amd/vulkan/bvh/update.comp +++ b/src/amd/vulkan/bvh/update.comp @@ -57,10 +57,10 @@ void main() { vk_aabb bounds; bool is_active; if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { - 
is_active = radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x); + is_active = radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x, false); } else { VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); - is_active = radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x); + is_active = radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x, false); } if (!is_active) diff --git a/src/amd/vulkan/bvh/update.h b/src/amd/vulkan/bvh/update.h index 7fabd1ed7a3..8012f022961 100644 --- a/src/amd/vulkan/bvh/update.h +++ b/src/amd/vulkan/bvh/update.h @@ -11,7 +11,8 @@ #include "encode.h" bool -radv_build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id) +radv_build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id, + bool gfx12) { bool is_valid = true; triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id); @@ -56,13 +57,17 @@ radv_build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data node.triangle_id = global_id; node.geometry_id_and_flags = geom_data.geometry_id; - radv_encode_triangle_gfx10_3(dst_ptr, node); + if (gfx12) + radv_encode_triangle_gfx12(dst_ptr, node); + else + radv_encode_triangle_gfx10_3(dst_ptr, node); return is_valid; } bool -radv_build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) +radv_build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id, + bool gfx12) { bool is_valid = true; @@ -87,10 +92,14 @@ radv_build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32 #endif vk_ir_aabb_node node; + node.base.aabb = bounds; node.primitive_id = global_id; node.geometry_id_and_flags = geometry_id; - radv_encode_aabb_gfx10_3(dst_ptr, node); + if 
(gfx12) + radv_encode_aabb_gfx12(dst_ptr, node); + else + radv_encode_aabb_gfx10_3(dst_ptr, node); return is_valid; }
diff --git a/src/amd/vulkan/bvh/update_gfx12.comp b/src/amd/vulkan/bvh/update_gfx12.comp
new file mode 100644
index 00000000000..7f1a1c71b2a
--- /dev/null
+++ b/src/amd/vulkan/bvh/update_gfx12.comp
@@ -0,0 +1,213 @@
+/*
+ * Copyright © 2025 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_KHR_memory_scope_semantics : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+#include "build_interface.h"
+#include "update.h"
+
+layout(push_constant) uniform CONSTS
+{
+   update_args args;
+};
+/* Returns the uint32_t at bvh - (node / 16) * 4 - 4, i.e. the parent link stored in front of the BVH. */
+uint32_t
+fetch_parent_node(VOID_REF bvh, uint32_t node)
+{
+   uint64_t addr = bvh - node / 16 * 4 - 4;
+   return DEREF(REF(uint32_t)(addr));
+}
+/* Each invocation re-encodes one leaf into dst, then follows parent links upwards refitting box nodes. */
+void
+main()
+{
+   uint32_t bvh_offset = DEREF(args.src).bvh_offset;
+
+   VOID_REF src_bvh = OFFSET(args.src, bvh_offset);
+   VOID_REF dst_bvh = OFFSET(args.dst, bvh_offset);
+
+   VOID_REF leaf_node_offsets = OFFSET(args.src, DEREF(args.src).leaf_node_offsets_offset);
+   /* Triangle and AABB leaves share radv_gfx12_primitive_node; anything else is an instance + user data. */
+   uint32_t leaf_node_size;
+   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
+      leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
+   else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR)
+      leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
+   else
+      leaf_node_size = SIZEOF(radv_gfx12_instance_node) + SIZEOF(radv_gfx12_instance_node_user_data);
+
+   uint32_t leaf_node_id = args.geom_data.first_id + gl_GlobalInvocationID.x;
+   uint32_t first_leaf_offset = id_to_offset(RADV_BVH_ROOT_NODE) + SIZEOF(radv_gfx12_box_node);
+   /* The byte offset of this leaf inside the BVH comes from the offset table stored in the source structure. */
+   uint32_t dst_offset = DEREF(INDEX(uint32_t, leaf_node_offsets, leaf_node_id));
+   VOID_REF dst_ptr = OFFSET(dst_bvh, dst_offset);
+   uint32_t src_offset = gl_GlobalInvocationID.x * args.geom_data.stride;
+
+   vk_aabb bounds;
+   bool is_active;
+   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
+      is_active = radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x, true);
+   } else {
+      VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
+      is_active = radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x, true);
+   }
+
+   if (!is_active)
+      return;
+   /* Publish the new leaf bounds, indexed by leaf position, before any parent is touched. */
+   DEREF(INDEX(vk_aabb, args.leaf_bounds, (dst_offset - first_leaf_offset) / leaf_node_size)) = bounds;
+   memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                 gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+   /* Box nodes other than the root are expected right behind leaf_node_count leaves of leaf_node_size. */
+   uint32_t node_id = pack_node_id(dst_offset, 0);
+   uint32_t parent_id = fetch_parent_node(src_bvh, node_id);
+   uint32_t internal_nodes_offset = first_leaf_offset + args.leaf_node_count * leaf_node_size;
+   while (parent_id != RADV_BVH_INVALID_NODE) {
+      uint32_t offset = id_to_offset(parent_id);
+      /* Slot 0 belongs to the root; every other box node uses its index among the internal nodes + 1. */
+      uint32_t parent_index = (offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1;
+      if (parent_id == RADV_BVH_ROOT_NODE)
+         parent_index = 0;
+
+      /* Make accesses to internal nodes in dst_bvh available and visible */
+      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+
+      REF(radv_gfx12_box_node) src_node = REF(radv_gfx12_box_node) OFFSET(src_bvh, offset);
+      REF(radv_gfx12_box_node) dst_node = REF(radv_gfx12_box_node) OFFSET(dst_bvh, offset);
+
+      uint32_t valid_child_count_minus_one = DEREF(src_node).child_count_exponents >> 28;
+
+      /* Check if all children have been processed. As this is an atomic the last path coming from
+       * a child will pass here, while earlier paths break.
+       */
+      uint32_t ready_child_count = atomicAdd(
+         DEREF(INDEX(uint32_t, args.internal_ready_count, parent_index)), 1, gl_ScopeDevice, gl_StorageSemanticsBuffer,
+         gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+
+      if (ready_child_count != valid_child_count_minus_one)
+         break;
+      /* Child ids are implicit: start at the two base ids and advance by the encoded size of each child. */
+      uint32_t child_internal_id = DEREF(src_node).internal_base_id;
+      uint32_t child_primitive_id = DEREF(src_node).primitive_base_id;
+
+      DEREF(dst_node).internal_base_id = child_internal_id;
+      DEREF(dst_node).primitive_base_id = child_primitive_id;
+      /* First pass: recover every child offset and accumulate the union of the child bounds. */
+      uint32_t child_offsets[8];
+      vk_aabb total_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY));
+      for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
+         radv_gfx12_box_child child = DEREF(src_node).children[i];
+         uint32_t child_type = (child.dword2 >> 24) & 0xf;
+         uint32_t child_size_id = (child.dword2 >> 28) * RADV_GFX12_BVH_NODE_SIZE / 8;
+         /* dword2 bits 24-27: child type; bits 28-31: size, scaled by RADV_GFX12_BVH_NODE_SIZE / 8 above. */
+         uint32_t child_id;
+         if (child_type == radv_bvh_node_box32) {
+            child_id = child_internal_id;
+            child_internal_id += child_size_id;
+         } else {
+            child_id = child_primitive_id;
+            child_primitive_id += child_size_id;
+         }
+
+         child_offsets[i] = id_to_offset(child_id);
+
+         uint32_t child_offset = child_offsets[i];
+         vk_aabb child_aabb;
+         if (child_offset == dst_offset) {
+            child_aabb = bounds;
+         } else {
+            uint32_t child_index;
+            if (child_offset >= internal_nodes_offset) {
+               child_index =
+                  (child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1 + args.leaf_node_count;
+            } else {
+               child_index = (child_offset - first_leaf_offset) / leaf_node_size;
+            }
+
+            child_aabb = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index));
+         }
+
+         total_bounds.min = min(total_bounds.min, child_aabb.min);
+         total_bounds.max = max(total_bounds.max, child_aabb.max);
+      }
+      vec3 origin = total_bounds.min;
+      vec3 extent = total_bounds.max - total_bounds.min;
+      /* Round each extent up to a power of two (bump exponent, clear mantissa) so only exponents are kept. */
+      extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
+      uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
+      /* Exponents go in bits 0-7, 8-15 and 16-23; the child count minus one goes in bits 28 and up. */
+      DEREF(dst_node).origin = origin;
+      DEREF(dst_node).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
+                                              (extent_exponents.z << 16) | (valid_child_count_minus_one << 28);
+      DEREF(dst_node).obb_matrix_index = 0x7f;
+      /* Second pass: quantize each child box to 12 bits per axis relative to origin and extent. */
+      for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
+         uint32_t child_offset = child_offsets[i];
+         vk_aabb child_aabb;
+         if (child_offset == dst_offset) {
+            child_aabb = bounds;
+         } else {
+            uint32_t child_index;
+            if (child_offset >= internal_nodes_offset) {
+               child_index =
+                  (child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node) + 1 + args.leaf_node_count;
+            } else {
+               child_index = (child_offset - first_leaf_offset) / leaf_node_size;
+            }
+
+            child_aabb = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index));
+         }
+
+         radv_gfx12_box_child child = DEREF(src_node).children[i];
+         /* The top byte of each dword is kept from src; min rounds down, max rounds up and subtracts one. */
+         radv_gfx12_box_child box_child;
+         box_child.dword0 =
+            (child.dword0 & 0xFF000000) |
+            min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
+            (min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
+         box_child.dword1 =
+            (child.dword1 & 0xFF000000) |
+            min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
+            (min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12);
+         box_child.dword2 =
+            (child.dword2 & 0xFF000000) |
+            min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
+            (min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12);
+         DEREF(dst_node).children[i] = box_child;
+      }
+      /* Remaining child slots get a fixed null encoding. */
+      for (uint32_t i = valid_child_count_minus_one + 1; i < 8; i++) {
+         radv_gfx12_box_child null_child;
+         null_child.dword0 = 0xffffffff;
+         null_child.dword1 = 0xfff;
+         null_child.dword2 = 0;
+         DEREF(dst_node).children[i] = null_child;
+      }
+
+      if (parent_id == RADV_BVH_ROOT_NODE)
+         DEREF(args.dst).aabb = total_bounds;
+      /* Record the refit bounds so that the invocation finishing the parent can read them. */
+      DEREF(INDEX(vk_aabb, args.leaf_bounds, parent_index + args.leaf_node_count)) = total_bounds;
+
+      parent_id = fetch_parent_node(src_bvh, parent_id);
+   }
+}
diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build index 4166280ed3c..0e6eca9683f 100644 --- a/src/amd/vulkan/meson.build +++ b/src/amd/vulkan/meson.build @@ -147,6 +147,7 @@ libradv_files = files( 'radv_rmv.c', 'radv_rmv.h', 'radv_rra_gfx10_3.c', + 'radv_rra_gfx12.c', 'radv_rra.c', 'radv_rra.h', 'radv_sampler.c', diff --git a/src/amd/vulkan/nir/radv_nir_lower_ray_queries.c b/src/amd/vulkan/nir/radv_nir_lower_ray_queries.c index cc6b4c832d6..2dc30e5a474 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_ray_queries.c +++ b/src/amd/vulkan/nir/radv_nir_lower_ray_queries.c @@ -241,8 +241,11 @@ enum rq_intersection_type { intersection_type_none, intersection_type_triangle, static void lower_rq_initialize(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query_vars *vars, nir_deref_instr *rq, - struct radv_instance *instance) + struct radv_device *device) { + const struct radv_physical_device *pdev = radv_device_physical(device); + struct radv_instance *instance = radv_physical_device_instance(pdev); + nir_deref_instr *closest = rq_deref(b, rq, closest); nir_deref_instr *candidate = rq_deref(b, rq, candidate); @@ -270,7 +273,7 @@ lower_rq_initialize(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query b, 1, 32, nir_iadd_imm(b, accel_struct, offsetof(struct radv_accel_struct_header, bvh_offset)), .access = ACCESS_NON_WRITEABLE); nir_def *bvh_base = nir_iadd(b, accel_struct, nir_u2u64(b, bvh_offset)); - bvh_base = build_addr_to_node(b, bvh_base); + bvh_base = build_addr_to_node(device, b, bvh_base, instr->src[2].ssa); rq_store(b, rq, root_bvh_base, bvh_base); rq_store(b, rq,
trav_bvh_base, bvh_base); @@ -320,44 +323,27 @@ lower_rq_load(struct radv_device *device, nir_builder *b, nir_intrinsic_instr *i return isec_load(b, intersection, frontface); case nir_ray_query_value_intersection_geometry_index: return nir_iand_imm(b, isec_load(b, intersection, geometry_id_and_flags), 0xFFFFFF); - case nir_ray_query_value_intersection_instance_custom_index: { - nir_def *instance_node_addr = isec_load(b, intersection, instance_addr); - return nir_iand_imm( - b, - nir_build_load_global( - b, 1, 32, - nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, custom_instance_and_mask))), - 0xFFFFFF); - } - case nir_ray_query_value_intersection_instance_id: { - nir_def *instance_node_addr = isec_load(b, intersection, instance_addr); - return nir_build_load_global( - b, 1, 32, nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, instance_id))); - } + case nir_ray_query_value_intersection_instance_custom_index: + return radv_load_custom_instance(device, b, isec_load(b, intersection, instance_addr)); + case nir_ray_query_value_intersection_instance_id: + return radv_load_instance_id(device, b, isec_load(b, intersection, instance_addr)); case nir_ray_query_value_intersection_instance_sbt_index: return nir_iand_imm(b, isec_load(b, intersection, sbt_offset_and_flags), 0xFFFFFF); case nir_ray_query_value_intersection_object_ray_direction: { - nir_def *instance_node_addr = isec_load(b, intersection, instance_addr); nir_def *wto_matrix[3]; - nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix); + radv_load_wto_matrix(device, b, isec_load(b, intersection, instance_addr), wto_matrix); return nir_build_vec3_mat_mult(b, rq_load(b, rq, direction), wto_matrix, false); } case nir_ray_query_value_intersection_object_ray_origin: { - nir_def *instance_node_addr = isec_load(b, intersection, instance_addr); nir_def *wto_matrix[3]; - nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix); + radv_load_wto_matrix(device, 
b, isec_load(b, intersection, instance_addr), wto_matrix); return nir_build_vec3_mat_mult(b, rq_load(b, rq, origin), wto_matrix, true); } case nir_ray_query_value_intersection_object_to_world: { - nir_def *instance_node_addr = isec_load(b, intersection, instance_addr); - nir_def *rows[3]; - for (unsigned r = 0; r < 3; ++r) - rows[r] = nir_build_load_global( - b, 4, 32, - nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, otw_matrix) + r * 16)); - - return nir_vec3(b, nir_channel(b, rows[0], column), nir_channel(b, rows[1], column), - nir_channel(b, rows[2], column)); + nir_def *otw_matrix[3]; + radv_load_otw_matrix(device, b, isec_load(b, intersection, instance_addr), otw_matrix); + return nir_vec3(b, nir_channel(b, otw_matrix[0], column), nir_channel(b, otw_matrix[1], column), + nir_channel(b, otw_matrix[2], column)); } case nir_ray_query_value_intersection_primitive_index: return isec_load(b, intersection, primitive_id); @@ -371,10 +357,8 @@ lower_rq_load(struct radv_device *device, nir_builder *b, nir_intrinsic_instr *i return intersection_type; } case nir_ray_query_value_intersection_world_to_object: { - nir_def *instance_node_addr = isec_load(b, intersection, instance_addr); - nir_def *wto_matrix[3]; - nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix); + radv_load_wto_matrix(device, b, isec_load(b, intersection, instance_addr), wto_matrix); nir_def *vals[3]; for (unsigned i = 0; i < 3; ++i) @@ -477,6 +461,8 @@ static nir_def * lower_rq_proceed(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query_vars *vars, nir_deref_instr *rq, struct radv_device *device) { + struct radv_physical_device *pdev = radv_device_physical(device); + nir_deref_instr *closest = rq_deref(b, rq, closest); nir_deref_instr *candidate = rq_deref(b, rq, candidate); @@ -543,7 +529,11 @@ lower_rq_proceed(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query_va nir_push_if(b, rq_load(b, rq, incomplete)); { - nir_def *incomplete = 
radv_build_ray_traversal(device, b, &args); + nir_def *incomplete; + if (radv_use_bvh8(pdev)) + incomplete = radv_build_ray_traversal_gfx12(device, b, &args); + else + incomplete = radv_build_ray_traversal(device, b, &args); rq_store(b, rq, incomplete, nir_iand(b, rq_load(b, rq, incomplete), incomplete)); } nir_pop_if(b, NULL); @@ -571,7 +561,7 @@ bool radv_nir_lower_ray_queries(struct nir_shader *shader, struct radv_device *device) { const struct radv_physical_device *pdev = radv_device_physical(device); - struct radv_instance *instance = radv_physical_device_instance(pdev); + bool progress = false; struct hash_table *query_ht = _mesa_pointer_hash_table_create(NULL); @@ -626,7 +616,7 @@ radv_nir_lower_ray_queries(struct nir_shader *shader, struct radv_device *device lower_rq_generate_intersection(&builder, intrinsic, rq); break; case nir_intrinsic_rq_initialize: - lower_rq_initialize(&builder, intrinsic, vars, rq, instance); + lower_rq_initialize(&builder, intrinsic, vars, rq, device); break; case nir_intrinsic_rq_load: new_dest = lower_rq_load(device, &builder, intrinsic, rq); diff --git a/src/amd/vulkan/nir/radv_nir_rt_common.c b/src/amd/vulkan/nir/radv_nir_rt_common.c index 837da29ac0c..c41cb625864 100644 --- a/src/amd/vulkan/nir/radv_nir_rt_common.c +++ b/src/amd/vulkan/nir/radv_nir_rt_common.c @@ -267,11 +267,27 @@ intersect_ray_amd_software_tri(struct radv_device *device, nir_builder *b, nir_d } nir_def * -build_addr_to_node(nir_builder *b, nir_def *addr) +build_addr_to_node(struct radv_device *device, nir_builder *b, nir_def *addr, nir_def *flags) { + const struct radv_physical_device *pdev = radv_device_physical(device); + const uint64_t bvh_size = 1ull << 42; nir_def *node = nir_ushr_imm(b, addr, 3); - return nir_iand_imm(b, node, (bvh_size - 1) << 3); + node = nir_iand_imm(b, node, (bvh_size - 1) << 3); + + if (radv_use_bvh8(pdev)) { + /* The HW ray flags are the same bits as the API flags. 
+ * - SpvRayFlagsTerminateOnFirstHitKHRMask, SpvRayFlagsSkipClosestHitShaderKHRMask are handled in shader code. + * - SpvRayFlagsSkipTrianglesKHRMask, SpvRayFlagsSkipAABBsKHRMask do not work. + */ + flags = nir_iand_imm(b, flags, + SpvRayFlagsOpaqueKHRMask | SpvRayFlagsNoOpaqueKHRMask | + SpvRayFlagsCullBackFacingTrianglesKHRMask | SpvRayFlagsCullFrontFacingTrianglesKHRMask | + SpvRayFlagsCullOpaqueKHRMask | SpvRayFlagsCullNoOpaqueKHRMask); + node = nir_ior(b, node, nir_ishl_imm(b, nir_u2u64(b, flags), 54)); + } + + return node; } static nir_def * @@ -302,20 +318,57 @@ nir_build_vec3_mat_mult(nir_builder *b, nir_def *vec, nir_def *matrix[], bool tr return nir_vec(b, result_components, 3); } -void -nir_build_wto_matrix_load(nir_builder *b, nir_def *instance_addr, nir_def **out) -{ - unsigned offset = offsetof(struct radv_bvh_instance_node, wto_matrix); - for (unsigned i = 0; i < 3; ++i) { - out[i] = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, instance_addr, offset + i * 16), .align_mul = 64, - .align_offset = offset + i * 16); - } -} - nir_def * radv_load_vertex_position(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def *geometry_id, nir_def *primitive_id, uint32_t index) { + const struct radv_physical_device *pdev = radv_device_physical(device); + + if (radv_use_bvh8(pdev)) { + nir_def *addr_offsets = + nir_build_load_global(b, 4, 32, + nir_iadd_imm(b, instance_addr, + sizeof(struct radv_gfx12_instance_node) + + offsetof(struct radv_gfx12_instance_node_user_data, blas_addr))); + nir_def *bvh_offset = + nir_build_load_global(b, 1, 32, + nir_iadd_imm(b, instance_addr, + sizeof(struct radv_gfx12_instance_node) + + offsetof(struct radv_gfx12_instance_node_user_data, bvh_offset))); + + nir_def *addr = nir_pack_64_2x32(b, nir_channels(b, addr_offsets, 0x3)); + + nir_def *base_index_offset = + nir_iadd(b, nir_channel(b, addr_offsets, 2), nir_imul_imm(b, geometry_id, sizeof(uint32_t))); + nir_def *base_index = nir_build_load_global(b, 1, 32, 
nir_iadd(b, addr, nir_u2u64(b, base_index_offset))); + + nir_def *offset_offset = nir_iadd(b, nir_channel(b, addr_offsets, 3), + nir_imul_imm(b, nir_iadd(b, base_index, primitive_id), sizeof(uint32_t))); + nir_def *offset = nir_build_load_global(b, 1, 32, nir_iadd(b, addr, nir_u2u64(b, offset_offset))); + offset = nir_iadd(b, offset, bvh_offset); + + /* Assume that vertices are uncompressed. */ + offset = nir_iadd_imm(b, offset, + ROUND_DOWN_TO(RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE / 8, 4) + index * 3 * sizeof(float)); + + nir_def *data[4]; + for (uint32_t i = 0; i < ARRAY_SIZE(data); i++) { + data[i] = nir_build_load_global(b, 1, 32, nir_iadd(b, addr, nir_u2u64(b, offset))); + offset = nir_iadd_imm(b, offset, 4); + } + + uint32_t subdword_offset = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE % 32; + + nir_def *vertices[3]; + for (uint32_t i = 0; i < ARRAY_SIZE(vertices); i++) { + nir_def *lo = nir_ubitfield_extract_imm(b, data[i], subdword_offset, 32 - subdword_offset); + nir_def *hi = nir_ubitfield_extract_imm(b, data[i + 1], 0, subdword_offset); + vertices[i] = nir_ior(b, lo, nir_ishl_imm(b, hi, 32 - subdword_offset)); + } + + return nir_vec3(b, vertices[0], vertices[1], vertices[2]); + } + nir_def *bvh_addr_id = nir_build_load_global(b, 1, 64, nir_iadd_imm(b, instance_addr, offsetof(struct radv_bvh_instance_node, bvh_ptr))); nir_def *bvh_addr = build_node_to_addr(device, b, bvh_addr_id, true); @@ -335,6 +388,74 @@ radv_load_vertex_position(struct radv_device *device, nir_builder *b, nir_def *i return nir_build_load_global(b, 3, 32, nir_iadd(b, bvh_addr, nir_u2u64(b, offset))); } +void +radv_load_wto_matrix(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def **out) +{ + const struct radv_physical_device *pdev = radv_device_physical(device); + + unsigned offset = offsetof(struct radv_bvh_instance_node, wto_matrix); + if (radv_use_bvh8(pdev)) + offset = offsetof(struct radv_gfx12_instance_node, wto_matrix); + + for (unsigned i = 0; i < 3; ++i) { + 
out[i] = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, instance_addr, offset + i * 16), .align_mul = 64,
+                                     .align_offset = (offset + i * 16) % 64);
+   }
+}
+/* Loads the three object-to-world rows (vec4 each); BVH8 keeps them in the user data behind the node. */
+void
+radv_load_otw_matrix(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def **out)
+{
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+
+   unsigned offset = offsetof(struct radv_bvh_instance_node, otw_matrix);
+   if (radv_use_bvh8(pdev))
+      offset =
+         sizeof(struct radv_gfx12_instance_node) + offsetof(struct radv_gfx12_instance_node_user_data, otw_matrix);
+
+   for (unsigned i = 0; i < 3; ++i) {
+      out[i] = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, instance_addr, offset + i * 16), .align_mul = 64,
+                                     .align_offset = (offset + i * 16) % 64);
+   }
+}
+/* Loads the custom instance index: a whole BVH8 user-data dword, else the low 24 bits of the old field. */
+nir_def *
+radv_load_custom_instance(struct radv_device *device, nir_builder *b, nir_def *instance_addr)
+{
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+
+   if (radv_use_bvh8(pdev)) {
+      return nir_build_load_global(
+         b, 1, 32,
+         nir_iadd_imm(b, instance_addr,
+                      sizeof(struct radv_gfx12_instance_node) +
+                         offsetof(struct radv_gfx12_instance_node_user_data, custom_instance)));
+   }
+
+   return nir_iand_imm(
+      b,
+      nir_build_load_global(
+         b, 1, 32, nir_iadd_imm(b, instance_addr, offsetof(struct radv_bvh_instance_node, custom_instance_and_mask))),
+      0xFFFFFF);
+}
+/* Loads the instance id: user-data instance_index on BVH8, radv_bvh_instance_node.instance_id otherwise. */
+nir_def *
+radv_load_instance_id(struct radv_device *device, nir_builder *b, nir_def *instance_addr)
+{
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+
+   if (radv_use_bvh8(pdev)) {
+      return nir_build_load_global(
+         b, 1, 32,
+         nir_iadd_imm(b, instance_addr,
+                      sizeof(struct radv_gfx12_instance_node) +
+                         offsetof(struct radv_gfx12_instance_node_user_data, instance_index)));
+   }
+
+   return nir_build_load_global(b, 1, 32,
+                                nir_iadd_imm(b, instance_addr, offsetof(struct radv_bvh_instance_node, instance_id)));
+}
+
 /* When a hit is opaque the any_hit shader is skipped for this hit and the hit
  * is
assumed to be an actual hit. */ static nir_def * @@ -356,11 +477,12 @@ create_bvh_descriptor(nir_builder *b, const struct radv_physical_device *pdev, s * instances at the cost of having to use 64-bit node ids. */ const uint64_t bvh_size = 1ull << 42; + const uint32_t sort_triangles_first = radv_use_bvh8(pdev) ? BITFIELD_BIT(52 - 32) : 0; const uint32_t box_sort_enable = BITFIELD_BIT(63 - 32); const uint32_t triangle_return_mode = BITFIELD_BIT(120 - 96); /* Return IJ for triangles */ uint32_t dword0 = 0; - nir_def *dword1 = nir_imm_intN_t(b, box_sort_enable, 32); + nir_def *dword1 = nir_imm_intN_t(b, sort_triangles_first | box_sort_enable, 32); uint32_t dword2 = (bvh_size - 1) & 0xFFFFFFFFu; uint32_t dword3 = ((bvh_size - 1) >> 32) | triangle_return_mode | (1u << 31); @@ -373,9 +495,20 @@ create_bvh_descriptor(nir_builder *b, const struct radv_physical_device *pdev, s /* Only use largest/midpoint sorting when all invocations have the same ray flags, otherwise * fall back to the default closest point. 
*/ dword1 = nir_bcsel(b, nir_vote_any(b, 1, ray_flags->terminate_on_first_hit), dword1, - nir_imm_int(b, (box_sort_midpoint << 21) | box_sort_enable)); + nir_imm_int(b, (box_sort_midpoint << 21) | sort_triangles_first | box_sort_enable)); dword1 = nir_bcsel(b, nir_vote_all(b, 1, ray_flags->terminate_on_first_hit), - nir_imm_int(b, (box_sort_largest << 21) | box_sort_enable), dword1); + nir_imm_int(b, (box_sort_largest << 21) | sort_triangles_first | box_sort_enable), dword1); + } + + if (radv_use_bvh8(pdev)) { + /* compressed_format_en */ + dword3 |= BITFIELD_BIT(115 - 96); + /* wide_sort_en */ + dword3 |= BITFIELD_BIT(117 - 96); + /* instance_en */ + dword3 |= BITFIELD_BIT(118 - 96); + /* pointer_flags */ + dword3 |= BITFIELD_BIT(119 - 96); } return nir_vec4(b, nir_imm_intN_t(b, dword0, 32), dword1, nir_imm_intN_t(b, dword2, 32), nir_imm_intN_t(b, dword3, 32)); @@ -439,6 +572,36 @@ insert_traversal_triangle_case(struct radv_device *device, nir_builder *b, const nir_pop_if(b, NULL); } +static void +insert_traversal_triangle_case_gfx12(struct radv_device *device, nir_builder *b, + const struct radv_ray_traversal_args *args, const struct radv_ray_flags *ray_flags, + nir_def *result, nir_def *bvh_node) +{ + if (!args->triangle_cb) + return; + + struct radv_triangle_intersection intersection; + intersection.t = nir_channel(b, result, 0); + + nir_push_if(b, nir_iand(b, nir_flt(b, intersection.t, nir_load_deref(b, args->vars.tmax)), + nir_flt(b, args->tmin, intersection.t))); + { + intersection.frontface = nir_inot(b, nir_test_mask(b, nir_channel(b, result, 3), 1)); + intersection.base.node_addr = build_node_to_addr(device, b, bvh_node, false); + intersection.base.primitive_id = nir_ishr_imm(b, nir_channel(b, result, 3), 1); + intersection.base.geometry_id_and_flags = nir_ishr_imm(b, nir_channel(b, result, 8), 2); + intersection.base.opaque = nir_inot(b, nir_test_mask(b, nir_channel(b, result, 2), 1u << 31)); + intersection.barycentrics = nir_fabs(b, nir_channels(b, 
result, 0x3 << 1)); + + nir_push_if(b, nir_bcsel(b, intersection.base.opaque, ray_flags->no_cull_opaque, ray_flags->no_cull_no_opaque)); + { + args->triangle_cb(b, &intersection, args, ray_flags); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); +} + static void insert_traversal_aabb_case(struct radv_device *device, nir_builder *b, const struct radv_ray_traversal_args *args, const struct radv_ray_flags *ray_flags, nir_def *bvh_node) @@ -466,11 +629,31 @@ insert_traversal_aabb_case(struct radv_device *device, nir_builder *b, const str nir_pop_if(b, NULL); } -static nir_def * -fetch_parent_node(nir_builder *b, nir_def *bvh, nir_def *node) +static void +insert_traversal_aabb_case_gfx12(struct radv_device *device, nir_builder *b, const struct radv_ray_traversal_args *args, + const struct radv_ray_flags *ray_flags, nir_def *result, nir_def *bvh_node) { - nir_def *offset = nir_iadd_imm(b, nir_imul_imm(b, nir_udiv_imm(b, node, 8), 4), 4); + if (!args->aabb_cb) + return; + struct radv_leaf_intersection intersection; + intersection.node_addr = build_node_to_addr(device, b, bvh_node, false); + intersection.primitive_id = nir_ishr_imm(b, nir_channel(b, result, 3), 1); + intersection.geometry_id_and_flags = nir_ishr_imm(b, nir_channel(b, result, 8), 2); + intersection.opaque = nir_inot(b, nir_test_mask(b, nir_channel(b, result, 2), 1u << 31)); + + nir_push_if(b, nir_bcsel(b, intersection.opaque, ray_flags->no_cull_opaque, ray_flags->no_cull_no_opaque)); + { + args->aabb_cb(b, &intersection, args); + } + nir_pop_if(b, NULL); +} + +static nir_def * +fetch_parent_node(struct radv_device *device, nir_builder *b, nir_def *bvh, nir_def *node) +{ + const struct radv_physical_device *pdev = radv_device_physical(device); + nir_def *offset = nir_iadd_imm(b, nir_imul_imm(b, nir_udiv_imm(b, node, radv_use_bvh8(pdev) ? 
16 : 8), 4), 4); return nir_build_load_global(b, 1, 32, nir_isub(b, bvh, nir_u2u64(b, offset)), .align_mul = 4); } @@ -547,7 +730,7 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc nir_def *prev = nir_load_deref(b, args->vars.previous_node); nir_def *bvh_addr = build_node_to_addr(device, b, nir_load_deref(b, args->vars.bvh_base), true); - nir_def *parent = fetch_parent_node(b, bvh_addr, prev); + nir_def *parent = fetch_parent_node(device, b, bvh_addr, prev); nir_push_if(b, nir_ieq_imm(b, parent, RADV_BVH_INVALID_NODE)); { nir_store_var(b, incomplete, nir_imm_false(b), 0x1); @@ -615,7 +798,7 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc nir_build_load_global(b, 4, 32, instance_node_addr, .align_mul = 64, .align_offset = 0); nir_def *wto_matrix[3]; - nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix); + radv_load_wto_matrix(device, b, instance_node_addr, wto_matrix); nir_store_deref(b, args->vars.sbt_offset_and_flags, nir_channel(b, instance_data, 3), 1); @@ -718,3 +901,205 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc return nir_load_var(b, incomplete); }
+/* BVH8 variant of the traversal loop; returns the incomplete flag, which is cleared at both loop breaks. */
+nir_def *
+radv_build_ray_traversal_gfx12(struct radv_device *device, nir_builder *b, const struct radv_ray_traversal_args *args)
+{
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+
+   nir_variable *incomplete = nir_local_variable_create(b->impl, glsl_bool_type(), "incomplete");
+   nir_store_var(b, incomplete, nir_imm_true(b), 0x1);
+
+   struct radv_ray_flags ray_flags = {
+      .force_opaque = radv_test_flag(b, args, SpvRayFlagsOpaqueKHRMask, true),
+      .force_not_opaque = radv_test_flag(b, args, SpvRayFlagsNoOpaqueKHRMask, true),
+      .terminate_on_first_hit = radv_test_flag(b, args, SpvRayFlagsTerminateOnFirstHitKHRMask, true),
+      .no_cull_front = radv_test_flag(b, args, SpvRayFlagsCullFrontFacingTrianglesKHRMask, false),
+      .no_cull_back = radv_test_flag(b, args, SpvRayFlagsCullBackFacingTrianglesKHRMask, false),
+      .no_cull_opaque = radv_test_flag(b, args, SpvRayFlagsCullOpaqueKHRMask, false),
+      .no_cull_no_opaque = radv_test_flag(b, args, SpvRayFlagsCullNoOpaqueKHRMask, false),
+      .no_skip_triangles = radv_test_flag(b, args, SpvRayFlagsSkipTrianglesKHRMask, false),
+      .no_skip_aabbs = radv_test_flag(b, args, SpvRayFlagsSkipAABBsKHRMask, false),
+   };
+
+   nir_def *desc = create_bvh_descriptor(b, pdev, &ray_flags);
+
+   nir_push_loop(b);
+   {
+      nir_push_if(b, nir_ieq_imm(b, nir_load_deref(b, args->vars.current_node), RADV_BVH_INVALID_NODE));
+      {
+         /* Early exit if we never overflowed the stack, to avoid having to backtrack to
+          * the root for no reason. */
+         nir_push_if(b, nir_ilt_imm(b, nir_load_deref(b, args->vars.stack), args->stack_base + args->stack_stride));
+         {
+            nir_store_var(b, incomplete, nir_imm_false(b), 0x1);
+            nir_jump(b, nir_jump_break);
+         }
+         nir_pop_if(b, NULL);
+
+         nir_def *stack_instance_exit =
+            nir_ige(b, nir_load_deref(b, args->vars.top_stack), nir_load_deref(b, args->vars.stack));
+         nir_def *root_instance_exit =
+            nir_ieq(b, nir_load_deref(b, args->vars.previous_node), nir_load_deref(b, args->vars.instance_bottom_node));
+         nir_if *instance_exit = nir_push_if(b, nir_ior(b, stack_instance_exit, root_instance_exit));
+         instance_exit->control = nir_selection_control_dont_flatten;
+         {
+            nir_store_deref(b, args->vars.top_stack, nir_imm_int(b, -1), 1);
+            nir_store_deref(b, args->vars.previous_node, nir_load_deref(b, args->vars.instance_top_node), 1);
+            nir_store_deref(b, args->vars.instance_bottom_node, nir_imm_int(b, RADV_BVH_NO_INSTANCE_ROOT), 1);
+
+            nir_store_deref(b, args->vars.bvh_base, args->root_bvh_base, 1);
+            nir_store_deref(b, args->vars.origin, args->origin, 7);
+            nir_store_deref(b, args->vars.dir, args->dir, 7);
+         }
+         nir_pop_if(b, NULL);
+
+         nir_push_if(
+            b, nir_ige(b, nir_load_deref(b, args->vars.stack_low_watermark), nir_load_deref(b, args->vars.stack)));
+         {
+            nir_def *prev = nir_load_deref(b, args->vars.previous_node);
+            nir_def *bvh_addr = build_node_to_addr(device, b, nir_load_deref(b, args->vars.bvh_base), true);
+
+            nir_def *parent = fetch_parent_node(device, b, bvh_addr, prev);
+            nir_push_if(b, nir_ieq_imm(b, parent, RADV_BVH_INVALID_NODE));
+            {
+               nir_store_var(b, incomplete, nir_imm_false(b), 0x1);
+               nir_jump(b, nir_jump_break);
+            }
+            nir_pop_if(b, NULL);
+            nir_store_deref(b, args->vars.current_node, parent, 0x1);
+         }
+         nir_push_else(b, NULL);
+         {
+            nir_store_deref(b, args->vars.stack,
+                            nir_iadd_imm(b, nir_load_deref(b, args->vars.stack), -args->stack_stride), 1);
+
+            nir_def *stack_ptr =
+               nir_umod_imm(b, nir_load_deref(b, args->vars.stack), args->stack_stride * args->stack_entries);
+            nir_def *bvh_node = args->stack_load_cb(b, stack_ptr, args);
+            nir_store_deref(b, args->vars.current_node, bvh_node, 0x1);
+            nir_store_deref(b, args->vars.previous_node, nir_imm_int(b, RADV_BVH_INVALID_NODE), 0x1);
+         }
+         nir_pop_if(b, NULL);
+      }
+      nir_push_else(b, NULL);
+      {
+         nir_store_deref(b, args->vars.previous_node, nir_imm_int(b, RADV_BVH_INVALID_NODE), 0x1);
+      }
+      nir_pop_if(b, NULL);
+      /* Consume current_node: remember it as previous_node and reset current_node to invalid. */
+      nir_def *bvh_node = nir_load_deref(b, args->vars.current_node);
+
+      nir_def *prev_node = nir_load_deref(b, args->vars.previous_node);
+      nir_store_deref(b, args->vars.previous_node, bvh_node, 0x1);
+      nir_store_deref(b, args->vars.current_node, nir_imm_int(b, RADV_BVH_INVALID_NODE), 0x1);
+
+      nir_def *global_bvh_node = nir_iadd(b, nir_load_deref(b, args->vars.bvh_base), nir_u2u64(b, bvh_node));
+      /* NOTE(review): result channel layout is defined by nir_bvh8_intersect_ray_amd; confirm it there. */
+      nir_def *result =
+         nir_bvh8_intersect_ray_amd(b, 32, desc, nir_unpack_64_2x32(b, nir_load_deref(b, args->vars.bvh_base)),
+                                    nir_ishr_imm(b, args->cull_mask, 24), nir_load_deref(b, args->vars.tmax),
+                                    nir_load_deref(b, args->vars.origin), nir_load_deref(b, args->vars.dir), bvh_node);
+      /* Dispatch on the node id: box16 bit set -> instance or box node, clear -> AABB or triangle leaf. */
+      nir_push_if(b, nir_test_mask(b, bvh_node, BITFIELD64_BIT(ffs(radv_bvh_node_box16) - 1)));
+      {
+         nir_push_if(b, nir_test_mask(b, bvh_node, BITFIELD64_BIT(ffs(radv_bvh_node_instance) - 1)));
+         {
+            if (args->vars.iteration_instance_count) {
+               nir_def *iteration_instance_count = nir_load_deref(b, args->vars.iteration_instance_count);
+               iteration_instance_count = nir_iadd_imm(b, iteration_instance_count, 1 << 16);
+               nir_store_deref(b, args->vars.iteration_instance_count, iteration_instance_count, 0x1);
+            }
+            /* A low byte of 0xff in channel 7 restarts the loop without entering the instance. */
+            nir_def *next_node = nir_iand_imm(b, nir_channel(b, result, 7), 0xff);
+            nir_push_if(b, nir_ieq_imm(b, next_node, 0xff));
+            nir_jump(b, nir_jump_continue);
+            nir_pop_if(b, NULL);
+
+            /* instance */
+            nir_def *instance_node_addr = build_node_to_addr(device, b, global_bvh_node, false);
+            nir_store_deref(b, args->vars.instance_addr, instance_node_addr, 1);
+
+            nir_store_deref(b, args->vars.sbt_offset_and_flags, nir_channel(b, result, 6), 1);
+
+            nir_store_deref(b, args->vars.origin, nir_channels(b, result, 0x7 << 10), 0x7);
+            nir_store_deref(b, args->vars.dir, nir_channels(b, result, 0x7 << 13), 0x7);
+
+            nir_store_deref(b, args->vars.top_stack, nir_load_deref(b, args->vars.stack), 1);
+            nir_store_deref(b, args->vars.bvh_base, nir_pack_64_2x32(b, nir_channels(b, result, 0x3 << 2)), 1);
+
+            /* Descend into the instance: its root node becomes the current node */
+            nir_store_deref(b, args->vars.current_node, next_node, 0x1);
+            nir_store_deref(b, args->vars.instance_bottom_node, next_node, 1);
+            nir_store_deref(b, args->vars.instance_top_node, bvh_node, 1);
+         }
+         nir_push_else(b, NULL);
+         {
+            /* box */
+            nir_push_if(b, nir_ieq_imm(b, prev_node, RADV_BVH_INVALID_NODE));
+            {
+               nir_def *new_nodes[8];
+               for (unsigned i = 0; i < 8; ++i)
+                  new_nodes[i] = nir_channel(b, result, i);
+               /* Guards for children 1..7 nest, child 1 outermost; the next loop closes them innermost-first. */
+               for (unsigned i = 1; i < 8; ++i)
+                  nir_push_if(b, nir_ine_imm(b, new_nodes[i], RADV_BVH_INVALID_NODE));
+
+               for (unsigned i = 8; i-- > 1;) {
+                  nir_def *stack = nir_load_deref(b, args->vars.stack);
+                  nir_def *stack_ptr = nir_umod_imm(b, stack, args->stack_entries * args->stack_stride);
+                  args->stack_store_cb(b, stack_ptr, new_nodes[i], args);
+                  nir_store_deref(b, args->vars.stack, nir_iadd_imm(b, stack, args->stack_stride), 1);
+
+                  if (i == 1) {
+                     nir_def *new_watermark =
+                        nir_iadd_imm(b, nir_load_deref(b, args->vars.stack), -args->stack_entries * args->stack_stride);
+                     new_watermark = nir_imax(b, nir_load_deref(b, args->vars.stack_low_watermark), new_watermark);
+                     nir_store_deref(b, args->vars.stack_low_watermark, new_watermark, 0x1);
+                  }
+
+                  nir_pop_if(b, NULL);
+               }
+               nir_store_deref(b, args->vars.current_node, new_nodes[0], 0x1);
+            }
+            nir_push_else(b, NULL);
+            {
+               nir_def *next = nir_imm_int(b, RADV_BVH_INVALID_NODE);
+               for (unsigned i = 0; i < 7; ++i) {
+                  next = nir_bcsel(b, nir_ieq(b, prev_node, nir_channel(b, result, i)), nir_channel(b, result, i + 1),
+                                   next);
+               }
+               nir_store_deref(b, args->vars.current_node, next, 0x1);
+            }
+            nir_pop_if(b, NULL);
+         }
+         nir_pop_if(b, NULL);
+      }
+      nir_push_else(b, NULL);
+      {
+         nir_push_if(b, nir_test_mask(b, nir_channel(b, result, 1), 1u << 31));
+         {
+            nir_push_if(b, ray_flags.no_skip_aabbs);
+            insert_traversal_aabb_case_gfx12(device, b, args, &ray_flags, result, global_bvh_node);
+            nir_pop_if(b, NULL);
+         }
+         nir_push_else(b, NULL);
+         {
+            nir_push_if(b, ray_flags.no_skip_triangles);
+            insert_traversal_triangle_case_gfx12(device, b, args, &ray_flags, result, global_bvh_node);
+            nir_pop_if(b, NULL);
+         }
+         nir_pop_if(b, NULL);
+      }
+      nir_pop_if(b, NULL);
+      /* Every loop iteration adds 1 here; the instance branch above adds 1 << 16. */
+      if (args->vars.iteration_instance_count) {
+         nir_def *iteration_instance_count = nir_load_deref(b, args->vars.iteration_instance_count);
+         iteration_instance_count = nir_iadd_imm(b, iteration_instance_count, 1);
+         nir_store_deref(b, args->vars.iteration_instance_count, iteration_instance_count, 0x1);
+      }
+   }
+   nir_pop_loop(b, NULL);
+
+   return nir_load_var(b, incomplete);
+}
diff --git a/src/amd/vulkan/nir/radv_nir_rt_common.h b/src/amd/vulkan/nir/radv_nir_rt_common.h index 66021a9d0ba..aaef6ca72cd 100644 --- a/src/amd/vulkan/nir/radv_nir_rt_common.h +++ b/src/amd/vulkan/nir/radv_nir_rt_common.h @@ -14,15 +14,21 @@ struct radv_device; -nir_def
*build_addr_to_node(nir_builder *b, nir_def *addr); +nir_def *build_addr_to_node(struct radv_device *device, nir_builder *b, nir_def *addr, nir_def *flags); nir_def *nir_build_vec3_mat_mult(nir_builder *b, nir_def *vec, nir_def *matrix[], bool translation); -void nir_build_wto_matrix_load(nir_builder *b, nir_def *instance_addr, nir_def **out); - nir_def *radv_load_vertex_position(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def *geometry_id, nir_def *primitive_id, uint32_t index); +void radv_load_wto_matrix(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def **out); + +void radv_load_otw_matrix(struct radv_device *device, nir_builder *b, nir_def *instance_addr, nir_def **out); + +nir_def *radv_load_custom_instance(struct radv_device *device, nir_builder *b, nir_def *instance_addr); + +nir_def *radv_load_instance_id(struct radv_device *device, nir_builder *b, nir_def *instance_addr); + struct radv_ray_traversal_args; struct radv_ray_flags { @@ -146,4 +152,7 @@ struct radv_ray_traversal_args { nir_def *radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struct radv_ray_traversal_args *args); +nir_def *radv_build_ray_traversal_gfx12(struct radv_device *device, nir_builder *b, + const struct radv_ray_traversal_args *args); + #endif /* RADV_NIR_RT_COMMON_H */ diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c index d066665e671..086726052ec 100644 --- a/src/amd/vulkan/nir/radv_nir_rt_shader.c +++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c @@ -530,11 +530,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) break; } case nir_intrinsic_load_ray_instance_custom_index: { - nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr); - nir_def *custom_instance_and_mask = nir_build_load_global( - b, 1, 32, - nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, custom_instance_and_mask))); - ret = 
nir_iand_imm(b, custom_instance_and_mask, 0xFFFFFF); + ret = radv_load_custom_instance(vars->device, b, nir_load_var(b, vars->instance_addr)); break; } case nir_intrinsic_load_primitive_id: { @@ -547,9 +543,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) break; } case nir_intrinsic_load_instance_id: { - nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr); - ret = nir_build_load_global( - b, 1, 32, nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, instance_id))); + ret = radv_load_instance_id(vars->device, b, nir_load_var(b, vars->instance_addr)); break; } case nir_intrinsic_load_ray_flags: { @@ -564,7 +558,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) unsigned c = nir_intrinsic_column(intr); nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr); nir_def *wto_matrix[3]; - nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix); + radv_load_wto_matrix(vars->device, b, instance_node_addr, wto_matrix); nir_def *vals[3]; for (unsigned i = 0; i < 3; ++i) @@ -575,26 +569,21 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) } case nir_intrinsic_load_ray_object_to_world: { unsigned c = nir_intrinsic_column(intr); - nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr); - nir_def *rows[3]; - for (unsigned r = 0; r < 3; ++r) - rows[r] = nir_build_load_global( - b, 4, 32, - nir_iadd_imm(b, instance_node_addr, offsetof(struct radv_bvh_instance_node, otw_matrix) + r * 16)); - ret = nir_vec3(b, nir_channel(b, rows[0], c), nir_channel(b, rows[1], c), nir_channel(b, rows[2], c)); + nir_def *otw_matrix[3]; + radv_load_otw_matrix(vars->device, b, nir_load_var(b, vars->instance_addr), otw_matrix); + ret = nir_vec3(b, nir_channel(b, otw_matrix[0], c), nir_channel(b, otw_matrix[1], c), + nir_channel(b, otw_matrix[2], c)); break; } case nir_intrinsic_load_ray_object_origin: { - nir_def *instance_node_addr = nir_load_var(b, 
vars->instance_addr); nir_def *wto_matrix[3]; - nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix); + radv_load_wto_matrix(vars->device, b, nir_load_var(b, vars->instance_addr), wto_matrix); ret = nir_build_vec3_mat_mult(b, nir_load_var(b, vars->origin), wto_matrix, true); break; } case nir_intrinsic_load_ray_object_direction: { - nir_def *instance_node_addr = nir_load_var(b, vars->instance_addr); nir_def *wto_matrix[3]; - nir_build_wto_matrix_load(b, instance_node_addr, wto_matrix); + radv_load_wto_matrix(vars->device, b, nir_load_var(b, vars->instance_addr), wto_matrix); ret = nir_build_vec3_mat_mult(b, nir_load_var(b, vars->direction), wto_matrix, false); break; } @@ -1526,6 +1515,8 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin struct rt_traversal_vars trav_vars = init_traversal_vars(b); + nir_def *cull_mask_and_flags = nir_load_var(b, vars->cull_mask_and_flags); + nir_store_var(b, trav_vars.hit, nir_imm_false(b), 1); nir_def *accel_struct = nir_load_var(b, vars->accel_struct); @@ -1533,7 +1524,7 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin b, 1, 32, nir_iadd_imm(b, accel_struct, offsetof(struct radv_accel_struct_header, bvh_offset)), .access = ACCESS_NON_WRITEABLE); nir_def *root_bvh_base = nir_iadd(b, accel_struct, nir_u2u64(b, bvh_offset)); - root_bvh_base = build_addr_to_node(b, root_bvh_base); + root_bvh_base = build_addr_to_node(device, b, root_bvh_base, cull_mask_and_flags); nir_store_var(b, trav_vars.bvh_base, root_bvh_base, 1); @@ -1589,7 +1580,6 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin .pipeline = pipeline, }; - nir_def *cull_mask_and_flags = nir_load_var(b, vars->cull_mask_and_flags); struct radv_ray_traversal_args args = { .root_bvh_base = root_bvh_base, .flags = cull_mask_and_flags, @@ -1617,7 +1607,10 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin nir_def *original_tmax = 
nir_load_var(b, vars->tmax); - radv_build_ray_traversal(device, b, &args); + if (radv_use_bvh8(pdev)) + radv_build_ray_traversal_gfx12(device, b, &args); + else + radv_build_ray_traversal(device, b, &args); if (vars->device->rra_trace.ray_history_addr) radv_build_end_trace_token(b, vars, original_tmax, nir_load_var(b, trav_vars.hit), diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index 10a372da17c..b882f8461d6 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -16,10 +16,18 @@ #include "vk_acceleration_structure.h" #include "vk_common_entrypoints.h" +static const uint32_t copy_blas_addrs_gfx12_spv[] = { +#include "bvh/copy_blas_addrs_gfx12.spv.h" +}; + static const uint32_t copy_spv[] = { #include "bvh/copy.spv.h" }; +static const uint32_t copy_gfx12_spv[] = { +#include "bvh/copy_gfx12.spv.h" +}; + static const uint32_t encode_spv[] = { #include "bvh/encode.spv.h" }; @@ -28,6 +36,10 @@ static const uint32_t encode_compact_spv[] = { #include "bvh/encode_compact.spv.h" }; +static const uint32_t encode_gfx12_spv[] = { +#include "bvh/encode_gfx12.spv.h" +}; + static const uint32_t header_spv[] = { #include "bvh/header.spv.h" }; @@ -36,6 +48,10 @@ static const uint32_t update_spv[] = { #include "bvh/update.spv.h" }; +static const uint32_t update_gfx12_spv[] = { +#include "bvh/update_gfx12.spv.h" +}; + static const uint32_t leaf_spv[] = { #include "bvh/radv_leaf.spv.h" }; @@ -47,6 +63,7 @@ static const uint32_t leaf_always_active_spv[] = { struct acceleration_structure_layout { uint32_t geometry_info_offset; uint32_t primitive_base_indices_offset; + uint32_t leaf_node_offsets_offset; uint32_t bvh_offset; uint32_t leaf_nodes_offset; uint32_t internal_nodes_offset; @@ -68,26 +85,50 @@ radv_get_acceleration_structure_layout(struct radv_device *device, uint32_t leaf const VkAccelerationStructureBuildGeometryInfoKHR *build_info, struct 
acceleration_structure_layout *accel_struct) { + const struct radv_physical_device *pdev = radv_device_physical(device); + uint32_t internal_count = MAX2(leaf_count, 2) - 1; VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); uint32_t bvh_leaf_size; - switch (geometry_type) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - bvh_leaf_size = sizeof(struct radv_bvh_triangle_node); - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - bvh_leaf_size = sizeof(struct radv_bvh_aabb_node); - break; - case VK_GEOMETRY_TYPE_INSTANCES_KHR: - bvh_leaf_size = sizeof(struct radv_bvh_instance_node); - break; - default: - unreachable("Unknown VkGeometryTypeKHR"); + uint32_t bvh_node_size_gcd; + if (radv_use_bvh8(pdev)) { + switch (geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + bvh_leaf_size = sizeof(struct radv_gfx12_primitive_node); + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + bvh_leaf_size = sizeof(struct radv_gfx12_primitive_node); + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + bvh_leaf_size = sizeof(struct radv_gfx12_instance_node) + sizeof(struct radv_gfx12_instance_node_user_data); + break; + default: + unreachable("Unknown VkGeometryTypeKHR"); + } + bvh_node_size_gcd = RADV_GFX12_BVH_NODE_SIZE; + } else { + switch (geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + bvh_leaf_size = sizeof(struct radv_bvh_triangle_node); + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + bvh_leaf_size = sizeof(struct radv_bvh_aabb_node); + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + bvh_leaf_size = sizeof(struct radv_bvh_instance_node); + break; + default: + unreachable("Unknown VkGeometryTypeKHR"); + } + bvh_node_size_gcd = 64; } - uint64_t bvh_size = bvh_leaf_size * leaf_count + sizeof(struct radv_bvh_box32_node) * internal_count; + uint32_t internal_node_size = + radv_use_bvh8(pdev) ? 
sizeof(struct radv_gfx12_box_node) : sizeof(struct radv_bvh_box32_node); + + uint64_t bvh_size = bvh_leaf_size * leaf_count + internal_node_size * internal_count; uint32_t offset = 0; offset += sizeof(struct radv_accel_struct_header); @@ -101,23 +142,30 @@ radv_get_acceleration_structure_layout(struct radv_device *device, uint32_t leaf offset += sizeof(uint32_t) * build_info->geometryCount; } + /* On GFX12, we need additional space for leaf node offsets since they do not have the same + * order as the application provided data. + */ + accel_struct->leaf_node_offsets_offset = offset; + if (radv_use_bvh8(pdev)) + offset += leaf_count * 4; + /* Parent links, which have to go directly before bvh_offset as we index them using negative * offsets from there. */ - offset += bvh_size / 64 * 4; + offset += bvh_size / bvh_node_size_gcd * 4; /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. */ offset = ALIGN(offset, 64); accel_struct->bvh_offset = offset; /* root node */ - offset += sizeof(struct radv_bvh_box32_node); + offset += internal_node_size; accel_struct->leaf_nodes_offset = offset; offset += bvh_leaf_size * leaf_count; accel_struct->internal_nodes_offset = offset; /* Factor out the root node. 
*/ - offset += sizeof(struct radv_bvh_box32_node) * (internal_count - 1); + offset += internal_node_size * (internal_count - 1); accel_struct->size = offset; } @@ -134,7 +182,7 @@ radv_get_scratch_layout(struct radv_device *device, uint32_t leaf_count, struct uint32_t update_offset = 0; - update_offset += sizeof(vk_aabb) * leaf_count; + update_offset += sizeof(vk_aabb) * (leaf_count + internal_count); scratch->internal_ready_count_offset = update_offset; update_offset += sizeof(uint32_t) * internal_count; @@ -154,6 +202,10 @@ radv_GetAccelerationStructureBuildSizesKHR(VkDevice _device, VkAccelerationStruc STATIC_ASSERT(sizeof(struct radv_bvh_instance_node) == 128); STATIC_ASSERT(sizeof(struct radv_bvh_box16_node) == 64); STATIC_ASSERT(sizeof(struct radv_bvh_box32_node) == 128); + STATIC_ASSERT(sizeof(struct radv_gfx12_box_node) == RADV_GFX12_BVH_NODE_SIZE); + STATIC_ASSERT(sizeof(struct radv_gfx12_primitive_node) == RADV_GFX12_BVH_NODE_SIZE); + STATIC_ASSERT(sizeof(struct radv_gfx12_instance_node) == RADV_GFX12_BVH_NODE_SIZE); + STATIC_ASSERT(sizeof(struct radv_gfx12_instance_node_user_data) == RADV_GFX12_BVH_NODE_SIZE); if (radv_device_init_accel_struct_build_state(device) != VK_SUCCESS) return; @@ -170,6 +222,7 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device) struct vk_device_dispatch_table *dispatch = &device->vk.dispatch_table; dispatch->DestroyPipeline(_device, state->accel_struct_build.copy_pipeline, &state->alloc); + dispatch->DestroyPipeline(_device, state->accel_struct_build.copy_blas_addrs_gfx12_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_compact_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.header_pipeline, &state->alloc); @@ -257,7 +310,11 @@ radv_device_init_null_accel_struct(struct radv_device *device) VkDevice _device = 
radv_device_to_handle(device); uint32_t bvh_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64); - uint32_t size = bvh_offset + sizeof(struct radv_bvh_box32_node); + uint32_t size = bvh_offset; + if (radv_use_bvh8(pdev)) + size += sizeof(struct radv_gfx12_box_node); + else + size += sizeof(struct radv_bvh_box32_node); VkResult result; @@ -321,28 +378,44 @@ radv_device_init_null_accel_struct(struct radv_device *device) }; memcpy(data, &header, sizeof(struct radv_accel_struct_header)); - struct radv_bvh_box32_node root = { - .children = - { - RADV_BVH_INVALID_NODE, - RADV_BVH_INVALID_NODE, - RADV_BVH_INVALID_NODE, - RADV_BVH_INVALID_NODE, - }, - }; - - for (uint32_t child = 0; child < 4; child++) { - root.coords[child] = (vk_aabb){ - .min.x = NAN, - .min.y = NAN, - .min.z = NAN, - .max.x = NAN, - .max.y = NAN, - .max.z = NAN, + if (radv_use_bvh8(pdev)) { + struct radv_gfx12_box_node root = { + .obb_matrix_index = 0x7f, }; - } - memcpy((uint8_t *)data + bvh_offset, &root, sizeof(struct radv_bvh_box32_node)); + for (uint32_t child = 0; child < 8; child++) { + root.children[child] = (struct radv_gfx12_box_child){ + .dword0 = 0xffffffff, + .dword1 = 0xfff, + .dword2 = 0, + }; + } + + memcpy((uint8_t *)data + bvh_offset, &root, sizeof(struct radv_gfx12_box_node)); + } else { + struct radv_bvh_box32_node root = { + .children = + { + RADV_BVH_INVALID_NODE, + RADV_BVH_INVALID_NODE, + RADV_BVH_INVALID_NODE, + RADV_BVH_INVALID_NODE, + }, + }; + + for (uint32_t child = 0; child < 4; child++) { + root.coords[child] = (vk_aabb){ + .min.x = NAN, + .min.y = NAN, + .min.z = NAN, + .max.x = NAN, + .max.y = NAN, + .max.z = NAN, + }; + } + + memcpy((uint8_t *)data + bvh_offset, &root, sizeof(struct radv_bvh_box32_node)); + } vk_common_UnmapMemory(_device, memory); @@ -385,9 +458,15 @@ radv_get_update_scratch_size(struct vk_device *vk_device, uint32_t leaf_count) } static uint32_t -radv_get_encode_key(struct vk_device *device, VkAccelerationStructureTypeKHR type, 
+radv_get_encode_key(struct vk_device *vk_device, VkAccelerationStructureTypeKHR type, VkBuildAccelerationStructureFlagBitsKHR flags) { + struct radv_device *device = container_of(vk_device, struct radv_device, vk); + struct radv_physical_device *pdev = radv_device_physical(device); + + if (radv_use_bvh8(pdev)) + return RADV_ENCODE_KEY_COMPACT; + if (flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) return RADV_ENCODE_KEY_COMPACT; @@ -401,9 +480,10 @@ radv_encode_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); bool compact = key & RADV_ENCODE_KEY_COMPACT; - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - compact ? device->meta_state.accel_struct_build.encode_compact_pipeline - : device->meta_state.accel_struct_build.encode_pipeline); + VkPipeline pipeline = compact ? device->meta_state.accel_struct_build.encode_compact_pipeline + : device->meta_state.accel_struct_build.encode_pipeline; + + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); return VK_SUCCESS; } @@ -448,6 +528,47 @@ radv_encode_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuild radv_compute_dispatch(cmd_buffer, &dispatch); } +static void +radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, VkDeviceAddress intermediate_header_addr, + uint32_t leaf_count, uint32_t key, struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + uint32_t dst_internal_nodes_offset = layout.internal_nodes_offset 
- layout.bvh_offset; + uint32_t dst_leaf_nodes_offset = layout.leaf_nodes_offset - layout.bvh_offset; + uint32_t offsets[2] = {dst_internal_nodes_offset, dst_leaf_nodes_offset}; + radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, dst_node_offset), offsets, + sizeof(offsets)); + if (radv_device_physical(device)->info.cp_sdma_ge_use_system_memory_scope) + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2; + + const struct encode_gfx12_args args = { + .intermediate_bvh = intermediate_as_addr, + .output_base = vk_acceleration_structure_get_va(dst), + .header = intermediate_header_addr, + .output_bvh_offset = layout.bvh_offset, + .leaf_node_offsets_offset = layout.leaf_node_offsets_offset, + .leaf_node_count = leaf_count, + .geometry_type = vk_get_as_geometry_type(build_info), + }; + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); + + struct radv_dispatch_info dispatch = { + .unaligned = true, + .ordered = true, + .blocks = {MAX2(leaf_count, 1), 1, 1}, + }; + + radv_compute_dispatch(cmd_buffer, &dispatch); +} + static VkResult radv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) { @@ -487,7 +608,7 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); if (key & RADV_ENCODE_KEY_COMPACT) { - base = offsetof(struct radv_accel_struct_header, geometry_count); + base = offsetof(struct radv_accel_struct_header, geometry_type); struct header_args args = { .src = intermediate_header_addr, @@ -506,6 +627,7 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui header.instance_offset = layout.bvh_offset + sizeof(struct radv_bvh_box32_node); header.instance_count = instance_count; + header.leaf_node_offsets_offset = layout.leaf_node_offsets_offset; header.compacted_size = 
layout.size; header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64); @@ -520,6 +642,7 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui sizeof(uint64_t) * header.instance_count; header.build_flags = build_info->flags; + header.geometry_type = vk_get_as_geometry_type(build_info); header.geometry_count = build_info->geometryCount; header.primitive_base_indices_offset = layout.primitive_base_indices_offset; @@ -674,26 +797,6 @@ static const struct radix_sort_vk_target_config radix_sort_config = { .scatter.block_rows = 14, }; -static const struct vk_acceleration_structure_build_ops build_ops = { - .begin_debug_marker = vk_accel_struct_cmd_begin_debug_marker, - .end_debug_marker = vk_accel_struct_cmd_end_debug_marker, - .get_as_size = radv_get_as_size, - .get_update_scratch_size = radv_get_update_scratch_size, - .get_encode_key[0] = radv_get_encode_key, - .get_encode_key[1] = radv_get_encode_key, - .encode_bind_pipeline[0] = radv_encode_bind_pipeline, - .encode_bind_pipeline[1] = radv_init_header_bind_pipeline, - .encode_as[0] = radv_encode_as, - .encode_as[1] = radv_init_header, - .init_update_scratch = radv_init_update_scratch, - .update_bind_pipeline[0] = radv_update_bind_pipeline, - .update_as[0] = radv_update_as, - .leaf_spirv_override = leaf_spv, - .leaf_spirv_override_size = sizeof(leaf_spv), - .leaf_always_active_spirv_override = leaf_always_active_spv, - .leaf_always_active_spirv_override_size = sizeof(leaf_always_active_spv), -}; - static void radv_write_buffer_cp(VkCommandBuffer commandBuffer, VkDeviceAddress addr, void *data, uint32_t size) { @@ -729,24 +832,49 @@ radv_cmd_fill_buffer_addr(VkCommandBuffer commandBuffer, VkDeviceAddress addr, V VkResult radv_device_init_accel_struct_build_state(struct radv_device *device) { + const struct radv_physical_device *pdev = radv_device_physical(device); + VkResult result = VK_SUCCESS; mtx_lock(&device->meta_state.mtx); if 
(device->meta_state.accel_struct_build.radix_sort) goto exit; - result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args), - &device->meta_state.accel_struct_build.encode_pipeline, - &device->meta_state.accel_struct_build.encode_p_layout); - if (result != VK_SUCCESS) - goto exit; + if (radv_use_bvh8(pdev)) { + result = + create_build_pipeline_spv(device, encode_gfx12_spv, sizeof(encode_gfx12_spv), sizeof(struct encode_gfx12_args), + &device->meta_state.accel_struct_build.encode_compact_pipeline, + &device->meta_state.accel_struct_build.encode_p_layout); + if (result != VK_SUCCESS) + goto exit; - result = - create_build_pipeline_spv(device, encode_compact_spv, sizeof(encode_compact_spv), sizeof(struct encode_args), - &device->meta_state.accel_struct_build.encode_compact_pipeline, - &device->meta_state.accel_struct_build.encode_p_layout); - if (result != VK_SUCCESS) - goto exit; + result = create_build_pipeline_spv(device, update_gfx12_spv, sizeof(update_gfx12_spv), sizeof(struct update_args), + &device->meta_state.accel_struct_build.update_pipeline, + &device->meta_state.accel_struct_build.update_p_layout); + + if (result != VK_SUCCESS) + goto exit; + } else { + result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args), + &device->meta_state.accel_struct_build.encode_pipeline, + &device->meta_state.accel_struct_build.encode_p_layout); + if (result != VK_SUCCESS) + goto exit; + + result = + create_build_pipeline_spv(device, encode_compact_spv, sizeof(encode_compact_spv), sizeof(struct encode_args), + &device->meta_state.accel_struct_build.encode_compact_pipeline, + &device->meta_state.accel_struct_build.encode_p_layout); + if (result != VK_SUCCESS) + goto exit; + + result = create_build_pipeline_spv(device, update_spv, sizeof(update_spv), sizeof(struct update_args), + &device->meta_state.accel_struct_build.update_pipeline, + &device->meta_state.accel_struct_build.update_p_layout); 
+ + if (result != VK_SUCCESS) + goto exit; + } result = create_build_pipeline_spv(device, header_spv, sizeof(header_spv), sizeof(struct header_args), &device->meta_state.accel_struct_build.header_pipeline, @@ -754,16 +882,36 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) if (result != VK_SUCCESS) goto exit; - result = create_build_pipeline_spv(device, update_spv, sizeof(update_spv), sizeof(struct update_args), - &device->meta_state.accel_struct_build.update_pipeline, - &device->meta_state.accel_struct_build.update_p_layout); - if (result != VK_SUCCESS) - goto exit; - device->meta_state.accel_struct_build.radix_sort = vk_create_radix_sort_u64( radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache, radix_sort_config); - device->vk.as_build_ops = &build_ops; + device->meta_state.accel_struct_build.build_ops = (struct vk_acceleration_structure_build_ops){ + .begin_debug_marker = vk_accel_struct_cmd_begin_debug_marker, + .end_debug_marker = vk_accel_struct_cmd_end_debug_marker, + .get_as_size = radv_get_as_size, + .get_update_scratch_size = radv_get_update_scratch_size, + .get_encode_key[0] = radv_get_encode_key, + .get_encode_key[1] = radv_get_encode_key, + .encode_bind_pipeline[0] = radv_encode_bind_pipeline, + .encode_bind_pipeline[1] = radv_init_header_bind_pipeline, + .encode_as[1] = radv_init_header, + .init_update_scratch = radv_init_update_scratch, + .update_bind_pipeline[0] = radv_update_bind_pipeline, + .update_as[0] = radv_update_as, + }; + + if (radv_use_bvh8(pdev)) { + device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as_gfx12; + } else { + device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as; + device->meta_state.accel_struct_build.build_ops.leaf_spirv_override = leaf_spv; + device->meta_state.accel_struct_build.build_ops.leaf_spirv_override_size = sizeof(leaf_spv); + device->meta_state.accel_struct_build.build_ops.leaf_always_active_spirv_override = 
leaf_always_active_spv; + device->meta_state.accel_struct_build.build_ops.leaf_always_active_spirv_override_size = + sizeof(leaf_always_active_spv); + } + + device->vk.as_build_ops = &device->meta_state.accel_struct_build.build_ops; device->vk.write_buffer_cp = radv_write_buffer_cp; device->vk.flush_buffer_write_cp = radv_flush_buffer_write_cp; device->vk.cmd_dispatch_unaligned = radv_cmd_dispatch_unaligned; @@ -783,12 +931,30 @@ exit: static VkResult radv_device_init_accel_struct_copy_state(struct radv_device *device) { + const struct radv_physical_device *pdev = radv_device_physical(device); + VkResult result; + mtx_lock(&device->meta_state.mtx); - VkResult result = create_build_pipeline_spv(device, copy_spv, sizeof(copy_spv), sizeof(struct copy_args), - &device->meta_state.accel_struct_build.copy_pipeline, - &device->meta_state.accel_struct_build.copy_p_layout); + if (radv_use_bvh8(pdev)) { + result = create_build_pipeline_spv(device, copy_gfx12_spv, sizeof(copy_gfx12_spv), sizeof(struct copy_args), + &device->meta_state.accel_struct_build.copy_pipeline, + &device->meta_state.accel_struct_build.copy_p_layout); + if (result != VK_SUCCESS) + goto exit; + + result = create_build_pipeline_spv(device, copy_blas_addrs_gfx12_spv, sizeof(copy_blas_addrs_gfx12_spv), + sizeof(struct copy_args), + &device->meta_state.accel_struct_build.copy_blas_addrs_gfx12_pipeline, + &device->meta_state.accel_struct_build.copy_p_layout); + } else { + result = create_build_pipeline_spv(device, copy_spv, sizeof(copy_spv), sizeof(struct copy_args), + &device->meta_state.accel_struct_build.copy_pipeline, + &device->meta_state.accel_struct_build.copy_p_layout); + } + +exit: mtx_unlock(&device->meta_state.mtx); return result; } @@ -879,6 +1045,7 @@ radv_CmdCopyMemoryToAccelerationStructureKHR(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); struct radv_device *device = 
radv_cmd_buffer_device(cmd_buffer); + const struct radv_physical_device *pdev = radv_device_physical(device); struct radv_meta_saved_state saved_state; VkResult result = radv_device_init_accel_struct_copy_state(device); @@ -904,6 +1071,21 @@ radv_CmdCopyMemoryToAccelerationStructureKHR(VkCommandBuffer commandBuffer, sizeof(consts), &consts); vk_common_CmdDispatch(commandBuffer, 512, 1, 1); + + if (radv_use_bvh8(pdev)) { + /* Wait for the main copy dispatch to finish. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_WRITE_BIT, 0, NULL, NULL) | + radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, 0, NULL, NULL); + + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.accel_struct_build.copy_blas_addrs_gfx12_pipeline); + + vk_common_CmdDispatch(commandBuffer, 256, 1, 1); + } + radv_meta_restore(&saved_state, cmd_buffer); } @@ -945,6 +1127,20 @@ radv_CmdCopyAccelerationStructureToMemoryKHR(VkCommandBuffer commandBuffer, radv_CmdDispatchIndirect(commandBuffer, vk_buffer_to_handle(src->buffer), src->offset + offsetof(struct radv_accel_struct_header, copy_dispatch_size)); + if (radv_use_bvh8(pdev)) { + /* Wait for the main copy dispatch to finish. 
*/ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_WRITE_BIT, 0, NULL, NULL) | + radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, 0, NULL, NULL); + + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.accel_struct_build.copy_blas_addrs_gfx12_pipeline); + + vk_common_CmdDispatch(commandBuffer, 256, 1, 1); + } + radv_meta_restore(&saved_state, cmd_buffer); /* Set the header of the serialized data. */ diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h index 2379a1fc364..31388a5d21d 100644 --- a/src/amd/vulkan/radv_debug.h +++ b/src/amd/vulkan/radv_debug.h @@ -71,6 +71,7 @@ enum { RADV_DEBUG_DUMP_ASM = 1ull << 56, RADV_DEBUG_DUMP_BACKEND_IR = 1ull << 57, RADV_DEBUG_PSO_HISTORY = 1ull << 58, + RADV_DEBUG_BVH4 = 1ull << 59, RADV_DEBUG_DUMP_SHADERS = RADV_DEBUG_DUMP_VS | RADV_DEBUG_DUMP_TCS | RADV_DEBUG_DUMP_TES | RADV_DEBUG_DUMP_GS | RADV_DEBUG_DUMP_PS | RADV_DEBUG_DUMP_TASK | RADV_DEBUG_DUMP_MESH | RADV_DEBUG_DUMP_CS | RADV_DEBUG_DUMP_NIR | RADV_DEBUG_DUMP_ASM | RADV_DEBUG_DUMP_BACKEND_IR, diff --git a/src/amd/vulkan/radv_device.h b/src/amd/vulkan/radv_device.h index a718f8d7e70..e9a9a65a496 100644 --- a/src/amd/vulkan/radv_device.h +++ b/src/amd/vulkan/radv_device.h @@ -100,8 +100,10 @@ struct radv_meta_state { VkPipeline update_pipeline; VkPipelineLayout copy_p_layout; VkPipeline copy_pipeline; + VkPipeline copy_blas_addrs_gfx12_pipeline; struct radix_sort_vk *radix_sort; + struct vk_acceleration_structure_build_ops build_ops; struct vk_acceleration_structure_build_args build_args; struct { diff --git a/src/amd/vulkan/radv_instance.c b/src/amd/vulkan/radv_instance.c index 7aebf073dfd..4946a5aebd0 100644 --- a/src/amd/vulkan/radv_instance.c +++ b/src/amd/vulkan/radv_instance.c @@ -86,6 +86,7 @@ static const struct 
debug_control radv_debug_options[] = {{"nofastclears", RADV_ {"asm", RADV_DEBUG_DUMP_ASM}, {"ir", RADV_DEBUG_DUMP_BACKEND_IR}, {"pso_history", RADV_DEBUG_PSO_HISTORY}, + {"bvh4", RADV_DEBUG_BVH4}, {NULL, 0}}; const char * diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c index a81a63f36c2..f325520fdf9 100644 --- a/src/amd/vulkan/radv_physical_device.c +++ b/src/amd/vulkan/radv_physical_device.c @@ -157,6 +157,13 @@ radv_emulate_rt(const struct radv_physical_device *pdev) return !pdev->info.has_image_bvh_intersect_ray && instance->drirc.emulate_rt; } +bool +radv_use_bvh8(const struct radv_physical_device *pdev) +{ + const struct radv_instance *instance = radv_physical_device_instance(pdev); + return pdev->info.gfx_level >= GFX12 && !radv_emulate_rt(pdev) && !(instance->debug_flags & RADV_DEBUG_BVH4); +} + static void parse_hex(char *out, const char *in, unsigned length) { @@ -186,6 +193,7 @@ radv_physical_device_init_cache_key(struct radv_physical_device *pdev) key->disable_sinking_load_input_fs = instance->drirc.disable_sinking_load_input_fs; key->disable_trunc_coord = instance->drirc.disable_trunc_coord; key->emulate_rt = radv_emulate_rt(pdev); + key->bvh8 = radv_use_bvh8(pdev); key->ge_wave32 = pdev->ge_wave_size == 32; key->invariant_geom = !!(instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM); key->no_fmask = !!(instance->debug_flags & RADV_DEBUG_NO_FMASK); diff --git a/src/amd/vulkan/radv_physical_device.h b/src/amd/vulkan/radv_physical_device.h index f53307ece06..0c079b294d1 100644 --- a/src/amd/vulkan/radv_physical_device.h +++ b/src/amd/vulkan/radv_physical_device.h @@ -48,6 +48,7 @@ struct radv_physical_device_cache_key { uint32_t disable_sinking_load_input_fs : 1; uint32_t disable_trunc_coord : 1; uint32_t emulate_rt : 1; + uint32_t bvh8 : 1; uint32_t ge_wave32 : 1; uint32_t invariant_geom : 1; uint32_t no_fmask : 1; @@ -258,6 +259,8 @@ bool radv_enable_rt(const struct radv_physical_device *pdev); bool 
radv_emulate_rt(const struct radv_physical_device *pdev); +bool radv_use_bvh8(const struct radv_physical_device *pdev); + uint32_t radv_find_memory_index(const struct radv_physical_device *pdev, VkMemoryPropertyFlags flags); VkResult create_null_physical_device(struct vk_instance *vk_instance); diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c index 516415bbb0a..6b05390b3b2 100644 --- a/src/amd/vulkan/radv_rra.c +++ b/src/amd/vulkan/radv_rra.c @@ -184,8 +184,8 @@ rra_dump_asic_info(const struct radeon_info *gpu_info, FILE *output) } static struct rra_accel_struct_header -rra_fill_accel_struct_header_common(struct radv_accel_struct_header *header, size_t parent_id_table_size, - size_t leaf_node_data_size, size_t internal_node_data_size, +rra_fill_accel_struct_header_common(const struct radv_physical_device *pdev, struct radv_accel_struct_header *header, + size_t parent_id_table_size, struct rra_bvh_info *bvh_info, uint64_t primitive_count) { struct rra_accel_struct_header result = { @@ -199,32 +199,39 @@ rra_fill_accel_struct_header_common(struct radv_accel_struct_header *header, siz /* TODO: calculate active primitives */ .active_primitive_count = primitive_count, .geometry_description_count = header->geometry_count, - .interior_fp32_node_count = internal_node_data_size / sizeof(struct radv_bvh_box32_node), + .interior_fp32_node_count = bvh_info->internal_nodes_size / sizeof(struct radv_bvh_box32_node), .leaf_node_count = primitive_count, .rt_driver_interface_version = 8 << 16, .rt_ip_version = pdev->info.rt_ip_version, }; + if (!radv_use_bvh8(pdev)) + result.rt_ip_version = MIN2(result.rt_ip_version, RT_1_1); + result.metadata_size = sizeof(struct rra_accel_struct_metadata) + parent_id_table_size; - result.file_size = - result.metadata_size + sizeof(struct rra_accel_struct_header) + internal_node_data_size + leaf_node_data_size; + result.file_size = result.metadata_size + sizeof(struct rra_accel_struct_header) + bvh_info->internal_nodes_size + + 
bvh_info->leaf_nodes_size; result.internal_nodes_offset = sizeof(struct rra_accel_struct_metadata); - result.leaf_nodes_offset = result.internal_nodes_offset + internal_node_data_size; - result.geometry_infos_offset = result.leaf_nodes_offset + leaf_node_data_size; + result.leaf_nodes_offset = result.internal_nodes_offset + bvh_info->internal_nodes_size; + result.geometry_infos_offset = result.leaf_nodes_offset + bvh_info->leaf_nodes_size; result.leaf_ids_offset = result.geometry_infos_offset; - if (!header->instance_count) + if (header->instance_count) { + if (radv_use_bvh8(pdev)) + result.leaf_ids_offset += bvh_info->instance_sideband_data_size; + } else { result.leaf_ids_offset += header->geometry_count * sizeof(struct rra_geometry_info); + } return result; } static void -rra_dump_tlas_header(struct radv_accel_struct_header *header, size_t parent_id_table_size, size_t leaf_node_data_size, - size_t internal_node_data_size, uint64_t primitive_count, FILE *output) +rra_dump_tlas_header(const struct radv_physical_device *pdev, struct radv_accel_struct_header *header, + size_t parent_id_table_size, struct rra_bvh_info *bvh_info, uint64_t primitive_count, FILE *output) { - struct rra_accel_struct_header file_header = rra_fill_accel_struct_header_common( - header, parent_id_table_size, leaf_node_data_size, internal_node_data_size, primitive_count); + struct rra_accel_struct_header file_header = + rra_fill_accel_struct_header_common(pdev, header, parent_id_table_size, bvh_info, primitive_count); file_header.post_build_info.bvh_type = RRA_BVH_TYPE_TLAS; file_header.geometry_type = VK_GEOMETRY_TYPE_INSTANCES_KHR; @@ -232,12 +239,12 @@ rra_dump_tlas_header(struct radv_accel_struct_header *header, size_t parent_id_t } static void -rra_dump_blas_header(struct radv_accel_struct_header *header, size_t parent_id_table_size, - struct radv_accel_struct_geometry_info *geometry_infos, size_t leaf_node_data_size, - size_t internal_node_data_size, uint64_t primitive_count, FILE 
*output) +rra_dump_blas_header(const struct radv_physical_device *pdev, struct radv_accel_struct_header *header, + size_t parent_id_table_size, struct radv_accel_struct_geometry_info *geometry_infos, + struct rra_bvh_info *bvh_info, uint64_t primitive_count, FILE *output) { - struct rra_accel_struct_header file_header = rra_fill_accel_struct_header_common( - header, parent_id_table_size, leaf_node_data_size, internal_node_data_size, primitive_count); + struct rra_accel_struct_header file_header = + rra_fill_accel_struct_header_common(pdev, header, parent_id_table_size, bvh_info, primitive_count); file_header.post_build_info.bvh_type = RRA_BVH_TYPE_BLAS; file_header.geometry_type = header->geometry_count ? geometry_infos->type : VK_GEOMETRY_TYPE_TRIANGLES_KHR; @@ -281,7 +288,8 @@ rra_validate_header(struct radv_rra_accel_struct_data *accel_struct, const struc } static VkResult -rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, uint8_t *data, +rra_dump_acceleration_structure(const struct radv_physical_device *pdev, + struct radv_rra_accel_struct_data *accel_struct, uint8_t *data, struct hash_table_u64 *accel_struct_vas, bool should_validate, FILE *output) { struct radv_accel_struct_header *header = (struct radv_accel_struct_header *)data; @@ -297,10 +305,18 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, if (rra_validate_header(accel_struct, header)) { return VK_ERROR_VALIDATION_FAILED_EXT; } - if (rra_validate_node_gfx10_3(accel_struct_vas, data + header->bvh_offset, - data + header->bvh_offset + src_root_offset, header->geometry_count, - accel_struct->size, !is_tlas, 0)) { - return VK_ERROR_VALIDATION_FAILED_EXT; + if (radv_use_bvh8(pdev)) { + if (rra_validate_node_gfx12(accel_struct_vas, data + header->bvh_offset, + data + header->bvh_offset + src_root_offset, header->geometry_count, + accel_struct->size, !is_tlas, 0)) { + return VK_ERROR_VALIDATION_FAILED_EXT; + } + } else { + if 
(rra_validate_node_gfx10_3(accel_struct_vas, data + header->bvh_offset, + data + header->bvh_offset + src_root_offset, header->geometry_count, + accel_struct->size, !is_tlas, 0)) { + return VK_ERROR_VALIDATION_FAILED_EXT; + } } } @@ -321,7 +337,10 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, struct rra_bvh_info bvh_info = { .geometry_infos = rra_geometry_infos, }; - rra_gather_bvh_info_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, &bvh_info); + if (radv_use_bvh8(pdev)) + rra_gather_bvh_info_gfx12(data + header->bvh_offset, RADV_BVH_ROOT_NODE, &bvh_info); + else + rra_gather_bvh_info_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, &bvh_info); leaf_indices = calloc(header->geometry_count, sizeof(struct rra_geometry_info)); if (!leaf_indices) { @@ -343,6 +362,8 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, uint32_t node_parent_table_size = ((bvh_info.leaf_nodes_size + bvh_info.internal_nodes_size) / 64) * sizeof(uint32_t); + if (radv_use_bvh8(pdev)) + node_parent_table_size = 0; node_parent_table = calloc(node_parent_table_size, 1); if (!node_parent_table) { @@ -355,7 +376,9 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, result = VK_ERROR_OUT_OF_HOST_MEMORY; goto exit; } - dst_structure_data = calloc(RRA_ROOT_NODE_OFFSET + bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size, 1); + dst_structure_data = calloc(RRA_ROOT_NODE_OFFSET + bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size + + bvh_info.instance_sideband_data_size, + 1); if (!dst_structure_data) { result = VK_ERROR_OUT_OF_HOST_MEMORY; goto exit; @@ -366,13 +389,20 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, .dst = dst_structure_data, .dst_leaf_offset = RRA_ROOT_NODE_OFFSET + bvh_info.internal_nodes_size, .dst_internal_offset = RRA_ROOT_NODE_OFFSET, + .dst_instance_sideband_data_offset = + RRA_ROOT_NODE_OFFSET + 
bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size, .parent_id_table = node_parent_table, .parent_id_table_size = node_parent_table_size, .leaf_node_ids = leaf_node_ids, .leaf_indices = leaf_indices, }; - rra_transcode_node_gfx10_3(&ctx, 0xFFFFFFFF, RADV_BVH_ROOT_NODE, header->aabb); + if (radv_use_bvh8(pdev)) { + ctx.dst_internal_offset += sizeof(struct radv_gfx12_box_node); + rra_transcode_node_gfx12(&ctx, 0xFFFFFFFF, RADV_BVH_ROOT_NODE, RRA_ROOT_NODE_OFFSET); + } else { + rra_transcode_node_gfx10_3(&ctx, 0xFFFFFFFF, RADV_BVH_ROOT_NODE, header->aabb); + } struct rra_accel_struct_chunk_header chunk_header = { .metadata_offset = 0, @@ -395,8 +425,12 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, * the top bits are masked away. * In order to make sure BLASes can be found in the hashmap, we have * to replicate that mask here. + * On GFX12, we mask away the top 16 bits because the instance BLAS addresses + * use pointer flags. */ uint64_t va = accel_struct->va & 0x1FFFFFFFFFFFFFF; + if (radv_use_bvh8(pdev)) + va &= 0xFFFFFFFFFFFF; memcpy(chunk_header.virtual_address, &va, sizeof(uint64_t)); struct rra_accel_struct_metadata rra_metadata = { @@ -411,15 +445,13 @@ rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, fwrite(node_parent_table, 1, node_parent_table_size, output); if (is_tlas) - rra_dump_tlas_header(header, node_parent_table_size, bvh_info.leaf_nodes_size, bvh_info.internal_nodes_size, - primitive_count, output); + rra_dump_tlas_header(pdev, header, node_parent_table_size, &bvh_info, primitive_count, output); else - rra_dump_blas_header(header, node_parent_table_size, geometry_infos, bvh_info.leaf_nodes_size, - bvh_info.internal_nodes_size, primitive_count, output); + rra_dump_blas_header(pdev, header, node_parent_table_size, geometry_infos, &bvh_info, primitive_count, output); /* Write acceleration structure data */ - fwrite(dst_structure_data + RRA_ROOT_NODE_OFFSET, 1, 
bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size, - output); + fwrite(dst_structure_data + RRA_ROOT_NODE_OFFSET, 1, + bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size + bvh_info.instance_sideband_data_size, output); if (!is_tlas) fwrite(rra_geometry_infos, sizeof(struct rra_geometry_info), header->geometry_count, output); @@ -974,7 +1006,7 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename) continue; accel_struct_offsets[written_accel_struct_count] = (uint64_t)ftell(file); - result = rra_dump_acceleration_structure(data, mapped_data, device->rra_trace.accel_struct_vas, + result = rra_dump_acceleration_structure(pdev, data, mapped_data, device->rra_trace.accel_struct_vas, device->rra_trace.validate_as, file); rra_unmap_accel_struct_data(&copy_ctx, i); diff --git a/src/amd/vulkan/radv_rra.h b/src/amd/vulkan/radv_rra.h index 089066d27f2..c5af1d8f735 100644 --- a/src/amd/vulkan/radv_rra.h +++ b/src/amd/vulkan/radv_rra.h @@ -285,6 +285,7 @@ radv_node_to_addr(uint64_t node) struct rra_bvh_info { uint32_t leaf_nodes_size; uint32_t internal_nodes_size; + uint32_t instance_sideband_data_size; struct rra_geometry_info *geometry_infos; }; @@ -293,6 +294,7 @@ struct rra_transcoding_context { uint8_t *dst; uint32_t dst_leaf_offset; uint32_t dst_internal_offset; + uint32_t dst_instance_sideband_data_offset; uint32_t *parent_id_table; uint32_t parent_id_table_size; uint32_t *leaf_node_ids; @@ -307,4 +309,12 @@ void rra_gather_bvh_info_gfx10_3(const uint8_t *bvh, uint32_t node_id, struct rr uint32_t rra_transcode_node_gfx10_3(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, vk_aabb bounds); +bool rra_validate_node_gfx12(struct hash_table_u64 *accel_struct_vas, uint8_t *data, void *node, + uint32_t geometry_count, uint32_t size, bool is_bottom_level, uint32_t depth); + +void rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_bvh_info *dst); + +void rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t 
parent_id, uint32_t src_id, + uint32_t dst_offset); + #endif /* RADV_RRA_H */ diff --git a/src/amd/vulkan/radv_rra_gfx12.c b/src/amd/vulkan/radv_rra_gfx12.c new file mode 100644 index 00000000000..4cad241a38b --- /dev/null +++ b/src/amd/vulkan/radv_rra_gfx12.c @@ -0,0 +1,184 @@ +/* + * Copyright © 2025 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +/* GFX12 specific code for RRA. */ + +#include "bvh/bvh.h" +#include "radv_rra.h" + +#include "util/bitset.h" + +struct rra_instance_sideband_data { + uint32_t instance_index; + uint32_t custom_instance_and_flags; + uint32_t blas_metadata_size; + uint32_t padding; + mat3x4 otw_matrix; +}; + +bool +rra_validate_node_gfx12(struct hash_table_u64 *accel_struct_vas, uint8_t *data, void *node, uint32_t geometry_count, + uint32_t size, bool is_bottom_level, uint32_t depth) +{ + struct rra_validation_context ctx = {0}; + + if (depth > 1024) { + rra_validation_fail(&ctx, "depth > 1024"); + return true; + } + + uint32_t cur_offset = (uint8_t *)node - data; + snprintf(ctx.location, sizeof(ctx.location), "internal node (offset=%u)", cur_offset); + + return ctx.failed; +} + +static uint32_t +get_geometry_id(const void *node, uint32_t node_type) +{ + if (node_type == radv_bvh_node_instance) + return 0; + + uint32_t indices_midpoint = BITSET_EXTRACT(node, 42, 10); + return BITSET_EXTRACT(node, indices_midpoint - 28, 28); +} + +void +rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_bvh_info *dst) +{ + uint32_t node_type = node_id & 7; + + switch (node_type) { + case radv_bvh_node_box32: + dst->internal_nodes_size += sizeof(struct radv_gfx12_box_node); + break; + case radv_bvh_node_instance: + dst->leaf_nodes_size += sizeof(struct radv_gfx12_instance_node); + dst->instance_sideband_data_size += sizeof(struct rra_instance_sideband_data); + break; + case radv_bvh_node_triangle: + dst->leaf_nodes_size += sizeof(struct radv_gfx12_primitive_node); + break; + default: + unreachable("Invalid node 
type"); + break; + } + + const void *node = bvh + ((node_id & (~7u)) << 3); + if (node_type == radv_bvh_node_box32) { + const struct radv_gfx12_box_node *src = node; + + uint32_t valid_child_count_minus_one = src->child_count_exponents >> 28; + + uint32_t internal_id = src->internal_base_id; + uint32_t primitive_id = src->primitive_base_id; + for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) { + uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf; + uint32_t child_size = src->children[i].dword2 >> 28; + + uint32_t child_id; + if (child_type == radv_bvh_node_box32) { + child_id = internal_id | child_type; + internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3; + } else { + child_id = primitive_id | child_type; + primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3; + } + + rra_gather_bvh_info_gfx12(bvh, child_id, dst); + } + } else { + dst->geometry_infos[get_geometry_id(node, node_type)].primitive_count++; + } +} + +static void +rra_transcode_box8_node(struct rra_transcoding_context *ctx, const struct radv_gfx12_box_node *src, uint32_t parent_id, + uint32_t dst_offset) +{ + struct radv_gfx12_box_node *dst = (struct radv_gfx12_box_node *)(ctx->dst + dst_offset); + + memcpy(dst, src, sizeof(struct radv_gfx12_box_node)); + dst->internal_base_id = ctx->dst_internal_offset >> 3; + dst->primitive_base_id = ctx->dst_leaf_offset >> 3; + dst->unused = parent_id; + + uint32_t valid_child_count_minus_one = dst->child_count_exponents >> 28; + + uint32_t internal_child_count = 0; + uint32_t leaf_child_count = 0; + for (uint32_t i = 0; i <= valid_child_count_minus_one; ++i) { + uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf; + if (child_type == radv_bvh_node_box32) + internal_child_count++; + else + leaf_child_count++; + } + + uint32_t dst_internal_offset = ctx->dst_internal_offset; + ctx->dst_internal_offset += internal_child_count * RADV_GFX12_BVH_NODE_SIZE; + + uint32_t dst_leaf_offset = ctx->dst_leaf_offset; + 
ctx->dst_leaf_offset += leaf_child_count * RADV_GFX12_BVH_NODE_SIZE; + + uint32_t internal_id = src->internal_base_id; + uint32_t primitive_id = src->primitive_base_id; + for (uint32_t i = 0; i <= valid_child_count_minus_one; ++i) { + uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf; + uint32_t child_size = src->children[i].dword2 >> 28; + + uint32_t child_id; + uint32_t child_dst_offset; + if (child_type == radv_bvh_node_box32) { + child_id = internal_id | child_type; + internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3; + child_dst_offset = dst_internal_offset; + dst_internal_offset += RADV_GFX12_BVH_NODE_SIZE; + } else { + child_id = primitive_id | child_type; + primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3; + child_dst_offset = dst_leaf_offset; + dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE; + } + + rra_transcode_node_gfx12(ctx, radv_bvh_node_box32 | (dst_offset >> 3), child_id, child_dst_offset); + + dst->children[i].dword2 = (dst->children[i].dword2 & 0x0fffffff) | (1 << 28); + } +} + +void +rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, uint32_t dst_offset) +{ + uint32_t node_type = src_id & 7; + uint32_t src_offset = (src_id & (~7u)) << 3; + + const void *src_child_node = ctx->src + src_offset; + if (node_type == radv_bvh_node_box32) { + rra_transcode_box8_node(ctx, src_child_node, parent_id, dst_offset); + } else { + memcpy(ctx->dst + dst_offset, src_child_node, RADV_GFX12_BVH_NODE_SIZE); + + if (node_type == radv_bvh_node_instance) { + struct radv_gfx12_instance_node *dst = (void *)(ctx->dst + dst_offset); + + struct rra_instance_sideband_data *sideband_data = (void *)(ctx->dst + ctx->dst_instance_sideband_data_offset); + ctx->dst_instance_sideband_data_offset += sizeof(struct rra_instance_sideband_data); + + const struct radv_gfx12_instance_node_user_data *user_data = + (const void *)((const uint8_t *)src_child_node + sizeof(struct radv_gfx12_instance_node)); + + 
dst->pointer_flags_bvh_addr = dst->pointer_flags_bvh_addr - (user_data->bvh_offset >> 3) + + (sizeof(struct rra_accel_struct_metadata) >> 3); + dst->unused = parent_id; + + sideband_data->instance_index = user_data->instance_index; + sideband_data->custom_instance_and_flags = user_data->custom_instance; + sideband_data->blas_metadata_size = offsetof(struct rra_accel_struct_metadata, unused); + sideband_data->otw_matrix = user_data->otw_matrix; + } + } +}