diff --git a/docs/features.txt b/docs/features.txt index 56699bc3fb2..4ed1ccdeb45 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -529,7 +529,7 @@ Vulkan 1.4 -- all DONE: anv, lvp, nvk, radv/gfx8+, tu/a7xx+ Khronos extensions that are not part of any Vulkan version: - VK_KHR_acceleration_structure DONE (anv/gfx12.5+, lvp, radv/gfx10.3+) + VK_KHR_acceleration_structure DONE (anv/gfx12.5+, lvp, radv/gfx10.3+, tu/a740+) VK_KHR_android_surface not started VK_KHR_calibrated_timestamps DONE (anv, nvk, radv, tu/a750+) VK_KHR_compute_shader_derivatives DONE (anv, nvk, radv, tu/a7xx+) diff --git a/meson.build b/meson.build index 7cbf7a7770e..1edc7746993 100644 --- a/meson.build +++ b/meson.build @@ -642,7 +642,7 @@ endif prog_glslang = find_program( 'glslangValidator', native : true, - required : with_vulkan_overlay_layer or with_aco_tests or with_amd_vk or with_intel_vk or with_swrast_vk + required : with_vulkan_overlay_layer or with_aco_tests or with_amd_vk or with_intel_vk or with_swrast_vk or with_freedreno_vk ) if prog_glslang.found() diff --git a/src/freedreno/vulkan/bvh/copy.comp b/src/freedreno/vulkan/bvh/copy.comp new file mode 100644 index 00000000000..c36c4cfde12 --- /dev/null +++ b/src/freedreno/vulkan/bvh/copy.comp @@ -0,0 +1,106 @@ +/* + * Copyright © 2022 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require + +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +#include "tu_build_interface.h" + +layout(push_constant) uniform CONSTS { + copy_args args; +}; + +void +main(void) +{ + uint32_t global_id = gl_GlobalInvocationID.x; + uint32_t lanes = gl_NumWorkGroups.x * 128; + uint32_t increment = lanes * 16; + + uint64_t copy_src_addr = args.src_addr; + uint64_t copy_dst_addr = args.dst_addr; + + if (args.mode == TU_COPY_MODE_DESERIALIZE) { + copy_src_addr += SIZEOF(vk_accel_struct_serialization_header) + + DEREF(REF(vk_accel_struct_serialization_header)(args.src_addr)).instance_count * SIZEOF(uint64_t); + } + + REF(tu_accel_struct_header) header = REF(tu_accel_struct_header)(copy_src_addr); + + uint64_t instance_base = args.src_addr + SIZEOF(vk_accel_struct_serialization_header); + uint64_t instance_offset = SIZEOF(tu_accel_struct_header); + uint64_t instance_end = DEREF(header).instance_count * SIZEOF(tu_instance_descriptor); + if (instance_end > 0) + instance_end += instance_offset; + + if (args.mode == TU_COPY_MODE_SERIALIZE) { + copy_dst_addr += SIZEOF(vk_accel_struct_serialization_header) + + DEREF(REF(tu_accel_struct_header)(args.src_addr)).instance_count * SIZEOF(uint64_t); + + if (global_id == 0) { + REF(vk_accel_struct_serialization_header) ser_header = + REF(vk_accel_struct_serialization_header)(args.dst_addr); + DEREF(ser_header).serialization_size = DEREF(header).serialization_size; + DEREF(ser_header).deserialization_size = DEREF(header).compacted_size; + DEREF(ser_header).instance_count = DEREF(header).instance_count; + } + + instance_base = args.dst_addr + SIZEOF(vk_accel_struct_serialization_header); + } else if (args.mode == TU_COPY_MODE_COPY) { + instance_end = 0; + } + + uint64_t size = DEREF(header).compacted_size; + for (uint64_t offset = global_id * 16; offset < size; offset += increment) { + DEREF(REF(uvec4)(copy_dst_addr + offset)) = + DEREF(REF(uvec4)(copy_src_addr + offset)); + + /* Do the adjustment inline in the same invocation that copies the data so that we don't have + * to synchronize. 
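+       *
+       * Both SIZEOF(tu_accel_struct_header) and SIZEOF(tu_instance_descriptor)
+       * are multiples of 16, so the start of every instance descriptor lines
+       * up with one of the 16-byte chunks copied above and the modulo test
+       * below fires exactly once per descriptor.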
*/ + if (offset < instance_end && offset >= instance_offset && + (offset - instance_offset) % SIZEOF(tu_instance_descriptor) == 0) { + uint64_t idx = (offset - instance_offset) / SIZEOF(tu_instance_descriptor); + + uint32_t bvh_offset = DEREF(REF(tu_instance_descriptor)(copy_src_addr + offset)).bvh_offset; + if (args.mode == TU_COPY_MODE_SERIALIZE) { + DEREF(INDEX(uint64_t, instance_base, idx)) = + DEREF(REF(tu_instance_descriptor)(copy_src_addr + offset)).bvh_ptr - bvh_offset; + } else { /* TU_COPY_MODE_DESERIALIZE */ + uint64_t blas_addr = DEREF(INDEX(uint64_t, instance_base, idx)); + DEREF(REF(tu_instance_descriptor)(copy_dst_addr + offset)).bvh_ptr = blas_addr + bvh_offset; + } + } + } +} diff --git a/src/freedreno/vulkan/bvh/encode.comp b/src/freedreno/vulkan/bvh/encode.comp new file mode 100644 index 00000000000..3cc3176503e --- /dev/null +++ b/src/freedreno/vulkan/bvh/encode.comp @@ -0,0 +1,502 @@ +/* + * Copyright © 2022 Friedrich Vock + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_KHR_memory_scope_semantics : require + +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +#include "tu_build_helpers.h" +#include "tu_build_interface.h" + +layout(push_constant) uniform CONSTS { + encode_args args; +}; + +void set_parent(uint32_t child, uint32_t parent) +{ + uint64_t addr = args.output_bvh - child * 4 - 4; + DEREF(REF(uint32_t)(addr)) = parent; +} + +/* This encoder struct is designed to encode a compressed node without keeping + * all the data live at once, making sure register pressure isn't too high. 
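+ *
+ * Illustrative use only (nothing in this series instantiates it yet);
+ * packing a 24-bit id followed by an 8-bit flags field would look like:
+ *
+ *    tu_encoder enc;
+ *    encode_init(enc, node);
+ *    encode(enc, id, 24);    // occupies bits 0..23 of word 0
+ *    encode(enc, flags, 8);  // fills bits 24..31 and flushes word 0
+ *    encode_finalize(enc);   // stores the partially filled final word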
+ */ + +struct tu_encoder { + uint32_t cur_value; + uint word_offset; + uint bit_offset; + REF(tu_compressed_node) node; +}; + +void encode_init(out tu_encoder encoder, REF(tu_compressed_node) node) +{ + encoder.cur_value = 0; + encoder.word_offset = 0; + encoder.bit_offset = 0; + encoder.node = node; +} + +void encode(inout tu_encoder encoder, uint32_t val, uint bits) +{ + encoder.cur_value |= val << encoder.bit_offset; + if (encoder.bit_offset + bits >= 32) { + DEREF(encoder.node).data[encoder.word_offset] = encoder.cur_value; + encoder.cur_value = val >> (32 - encoder.bit_offset); + encoder.word_offset++; + encoder.bit_offset = encoder.bit_offset + bits - 32; + } else { + encoder.bit_offset += bits; + } +} + +void encode_skip(inout tu_encoder encoder, uint bits) +{ + if (encoder.bit_offset + bits >= 32) { + DEREF(encoder.node).data[encoder.word_offset] = encoder.cur_value; + encoder.word_offset++; + encoder.bit_offset = encoder.bit_offset + bits - 32; + } else { + encoder.bit_offset += bits; + } +} + +void encode_finalize(tu_encoder encoder) +{ + DEREF(encoder.node).data[encoder.word_offset] = encoder.cur_value; +} + +void +encode_leaf_node(uint32_t type, uint64_t src_node, uint64_t dst_node, uint64_t dst_instances, REF(tu_accel_struct_header) dst_header) +{ + float coords[3][3]; + uint32_t id; + uint32_t geometry_id; + uint32_t type_flags = TU_NODE_TYPE_LEAF; + + switch (type) { + case vk_ir_node_triangle: { + vk_ir_triangle_node src = DEREF(REF(vk_ir_triangle_node)(src_node)); + + coords = src.coords; + uint32_t geometry_id_and_flags = src.geometry_id_and_flags; + if ((geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) { + atomicAnd(DEREF(dst_header).instance_flags, ~TU_INSTANCE_ALL_NONOPAQUE); + } else { + type_flags |= TU_NODE_TYPE_NONOPAQUE; + atomicAnd(DEREF(dst_header).instance_flags, ~TU_INSTANCE_ALL_OPAQUE); + } + geometry_id = geometry_id_and_flags & 0xffffff; + id = src.triangle_id; + break; + } + case vk_ir_node_aabb: { + vk_ir_aabb_node src = DEREF(REF(vk_ir_aabb_node)(src_node)); + vk_aabb aabb = src.base.aabb; + coords[0][0] = aabb.min[0]; + coords[0][1] = aabb.min[1]; + coords[0][2] = aabb.min[2]; + coords[1][0] = aabb.max[0]; + coords[1][1] = aabb.max[1]; + coords[1][2] = aabb.max[2]; + + type_flags |= TU_NODE_TYPE_AABB; + + if ((src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) { + atomicAnd(DEREF(dst_header).instance_flags, ~TU_INSTANCE_ALL_NONOPAQUE); + } else { + type_flags |= TU_NODE_TYPE_NONOPAQUE; + atomicAnd(DEREF(dst_header).instance_flags, ~TU_INSTANCE_ALL_OPAQUE); + } + geometry_id = src.geometry_id_and_flags & 0xffffff; + id = src.primitive_id; + break; + } + case vk_ir_node_instance: { + vk_ir_instance_node src = DEREF(REF(vk_ir_instance_node)(src_node)); + + id = src.instance_id; + geometry_id = 0; + REF(tu_instance_descriptor) dst_instance = REF(tu_instance_descriptor)(dst_instances + SIZEOF(tu_instance_descriptor) * id); + + REF(tu_accel_struct_header) blas_header = REF(tu_accel_struct_header)(src.base_ptr); + uint64_t bvh_ptr = DEREF(blas_header).bvh_ptr; + uint32_t bvh_offset = uint32_t(bvh_ptr - src.base_ptr); + + uint32_t sbt_offset_and_flags = src.sbt_offset_and_flags; + uint32_t custom_instance_and_mask = src.custom_instance_and_mask; + DEREF(dst_instance).bvh_ptr = bvh_ptr; + DEREF(dst_instance).custom_instance_index = custom_instance_and_mask & 0xffffffu; + DEREF(dst_instance).sbt_offset_and_flags = sbt_offset_and_flags; + DEREF(dst_instance).bvh_offset = bvh_offset; + + mat4 transform = mat4(src.otw_matrix); + + mat4 inv_transform = 
transpose(inverse(transpose(transform))); + DEREF(dst_instance).wto_matrix = mat3x4(inv_transform); + DEREF(dst_instance).otw_matrix = mat3x4(transform); + + vk_aabb aabb = src.base.aabb; + coords[0][0] = aabb.min[0]; + coords[0][1] = aabb.min[1]; + coords[0][2] = aabb.min[2]; + coords[1][0] = aabb.max[0]; + coords[1][1] = aabb.max[1]; + coords[1][2] = aabb.max[2]; + + type_flags |= TU_NODE_TYPE_TLAS; + + uint32_t instance_flags = DEREF(blas_header).instance_flags; + + /* Apply VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR and + * VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR to correct the + * ALL_OPAQUE/ALL_NONOPAQUE flags. + */ + if (((sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR | + VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR)) != 0) { + instance_flags &= ~(VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR | + VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR); + instance_flags |= (sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR | + VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR); + } + uint32_t cull_mask_and_flags = ((custom_instance_and_mask >> 16) & 0xff00) | instance_flags; + + coords[2][0] = uintBitsToFloat(cull_mask_and_flags); + break; + } + } + + REF(tu_leaf_node) dst = REF(tu_leaf_node)(dst_node); + DEREF(dst).coords = coords; + DEREF(dst).id = id; + DEREF(dst).geometry_id = geometry_id; + DEREF(dst).type_flags = type_flags; +} + +/* Truncate to bfloat16 while rounding down. bfloat16 is used to store the bases. + */ + +u16vec3 to_bfloat_round_down(vec3 coord) +{ + u32vec3 icoord = floatBitsToUint(coord); + return u16vec3(mix(icoord >> 16, (icoord + 0xffff) >> 16, notEqual(icoord & u32vec3(0x80000000), u32vec3(0)))); +} + +/* Approximate subtraction while rounding up. Return a result greater than or + * equal to the infinitely-precise result. This just uses the native + * subtraction and then shifts one ULP towards infinity. Because the result is + * further rounded, it should usually be good enough while being faster than + * emulated floating-point math. + * + * We assume here that the result is always nonnegative, because it's only used + * to subtract away the base. 
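+ *
+ * Round-to-nearest can return a value up to half an ULP below the exact
+ * difference, so stepping one ULP towards infinity restores the upper bound
+ * at the cost of at most about 1.5 ULP of slack, which the 8-bit child
+ * quantization below absorbs anyway.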
+ */ + +vec3 subtract_round_up_approx(vec3 a, vec3 b) +{ + vec3 f = a - b; + u32vec3 i = floatBitsToUint(f); + + i++; + + /* Handle infinity/zero special cases */ + i = mix(i, floatBitsToUint(f), isinf(f)); + i = mix(i, floatBitsToUint(f), equal(f, vec3(0))); + + return uintBitsToFloat(i); +} + +vec3 subtract_round_down_approx(vec3 a, vec3 b) +{ + vec3 f = a - b; + u32vec3 i = floatBitsToUint(f); + + i--; + + /* Handle infinity/zero special cases */ + i = mix(i, floatBitsToUint(f), isinf(f)); + i = mix(i, floatBitsToUint(f), equal(f, vec3(0))); + + return uintBitsToFloat(i); +} + +u32vec3 extract_mantissa(vec3 f) +{ + return mix((floatBitsToUint(f) & 0x7fffff) | 0x800000, u32vec3(0), equal(f, vec3(0))); +} + +void +encode_internal_node(uint32_t children[8], uint32_t children_offset, uint child_count, + vec3 min_offset, vec3 max_offset, uint32_t bvh_offset) +{ + REF(tu_internal_node) dst_node = REF(tu_internal_node)(OFFSET(args.output_bvh, SIZEOF(tu_internal_node) * bvh_offset)); + + DEREF(dst_node).id = children_offset; + + u16vec3 base_bfloat = to_bfloat_round_down(min_offset); + vec3 base_float = uintBitsToFloat(u32vec3(base_bfloat) << 16); + DEREF(dst_node).bases[0] = base_bfloat.x; + DEREF(dst_node).bases[1] = base_bfloat.y; + DEREF(dst_node).bases[2] = base_bfloat.z; + + vec3 children_max = subtract_round_up_approx(max_offset, base_float); + + /* The largest child offset will be encoded in 8 bits, including the + * explicit leading 1. We need to downcast to this precision while rounding + * up to catch cases where the exponent is increased by rounding up, then + * extract the exponent. Because children_max is always nonnegative, we can + * do the downcast with "(floatBitsToUint(children_max) + 0xffff) >> 16", + * and then we further shift to get the rounded exponent. + */ + u16vec3 exponents = u16vec3((floatBitsToUint(children_max) + 0xffff) >> 23); + u8vec3 exponents_u8 = u8vec3(exponents); + DEREF(dst_node).exponents[0] = exponents_u8.x; + DEREF(dst_node).exponents[1] = exponents_u8.y; + DEREF(dst_node).exponents[2] = exponents_u8.z; + + for (uint32_t i = 0; i < child_count; i++) { + uint32_t offset = ir_id_to_offset(children[i]); + + vk_aabb child_aabb = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; + + /* Note: because we subtract from the minimum, we should never have a + * negative value here. 
+ */ + vec3 child_min = subtract_round_down_approx(child_aabb.min, base_float); + vec3 child_max = subtract_round_up_approx(child_aabb.max, base_float); + + u16vec3 child_min_exponents = u16vec3(floatBitsToUint(child_min) >> 23); + u16vec3 child_max_exponents = u16vec3(floatBitsToUint(child_max) >> 23); + + u16vec3 child_min_shift = u16vec3(16) + exponents - child_min_exponents; + /* Divide the mantissa by 2**child_min_shift, rounding down */ + u8vec3 child_min_mantissas = + mix(u8vec3(extract_mantissa(child_min) >> child_min_shift), u8vec3(0), + greaterThanEqual(child_min_shift, u16vec3(32))); + u16vec3 child_max_shift = u16vec3(16) + exponents - child_max_exponents; + /* Divide the mantissa by 2**child_max_shift, rounding up */ + u8vec3 child_max_mantissas = + mix(u8vec3((extract_mantissa(child_max) + ((u32vec3(1u) << u32vec3(child_max_shift)) - 1)) >> child_max_shift), + u8vec3(notEqual(extract_mantissa(child_max), u32vec3(0))), + greaterThanEqual(child_max_shift, u16vec3(32))); + + DEREF(dst_node).mantissas[i][0][0] = child_min_mantissas.x; + DEREF(dst_node).mantissas[i][0][1] = child_min_mantissas.y; + DEREF(dst_node).mantissas[i][0][2] = child_min_mantissas.z; + DEREF(dst_node).mantissas[i][1][0] = child_max_mantissas.x; + DEREF(dst_node).mantissas[i][1][1] = child_max_mantissas.y; + DEREF(dst_node).mantissas[i][1][2] = child_max_mantissas.z; + } + + for (uint32_t i = child_count; i < 8; i++) { + DEREF(dst_node).mantissas[i][0][0] = uint8_t(0xff); + DEREF(dst_node).mantissas[i][0][1] = uint8_t(0xff); + DEREF(dst_node).mantissas[i][0][2] = uint8_t(0xff); + DEREF(dst_node).mantissas[i][1][0] = uint8_t(0); + DEREF(dst_node).mantissas[i][1][1] = uint8_t(0); + DEREF(dst_node).mantissas[i][1][2] = uint8_t(0); + } + + DEREF(dst_node).child_count = uint8_t(child_count); + DEREF(dst_node).type_flags = uint16_t(args.geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ? (TU_NODE_TYPE_TLAS >> 16) : 0); +} + +void +main() +{ + /* Revert the order so we start at the root */ + uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; + + uint32_t intermediate_leaf_node_size; + switch (args.geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + intermediate_leaf_node_size = SIZEOF(vk_ir_triangle_node); + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + intermediate_leaf_node_size = SIZEOF(vk_ir_aabb_node); + break; + default: /* instances */ + intermediate_leaf_node_size = SIZEOF(vk_ir_instance_node); + break; + } + + uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * intermediate_leaf_node_size; + + REF(vk_ir_box_node) intermediate_internal_nodes = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); + REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id); + vk_ir_box_node src = DEREF(src_node); + + uint64_t dst_instances = args.output_bvh - args.output_bvh_offset + SIZEOF(tu_accel_struct_header); + + bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1; + + REF(tu_accel_struct_header) header = REF(tu_accel_struct_header)(args.output_bvh - args.output_bvh_offset); + + if (is_root_node) { + DEREF(header).instance_flags = + (args.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR ? TU_INSTANCE_ALL_AABB : 0) | + /* These will be removed when processing leaf nodes */ + TU_INSTANCE_ALL_NONOPAQUE | TU_INSTANCE_ALL_OPAQUE; + DEREF(args.header).dst_node_offset = 1; + } + + for (;;) { + /* Make changes to the current node's BVH offset value visible. 
*/ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + + uint32_t bvh_offset = is_root_node ? 0 : DEREF(src_node).bvh_offset; + if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) + continue; + + if (bvh_offset == VK_NULL_BVH_OFFSET) + break; + + uint32_t found_child_count = 0; + uint32_t children[8] = {VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, + VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, + VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, + VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE}; + + for (uint32_t i = 0; i < 2; ++i) + if (src.children[i] != VK_BVH_INVALID_NODE) + children[found_child_count++] = src.children[i]; + + while (found_child_count < 8) { + int32_t collapsed_child_index = -1; + float largest_surface_area = -INFINITY; + + for (int32_t i = 0; i < found_child_count; ++i) { + if (ir_id_to_type(children[i]) != vk_ir_node_internal) + continue; + + vk_aabb bounds = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, + ir_id_to_offset(children[i]))).aabb; + + float surface_area = aabb_surface_area(bounds); + if (surface_area > largest_surface_area) { + largest_surface_area = surface_area; + collapsed_child_index = i; + } + } + + if (collapsed_child_index != -1) { + REF(vk_ir_box_node) child_node = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, + ir_id_to_offset(children[collapsed_child_index])); + uint32_t grandchildren[2] = DEREF(child_node).children; + uint32_t valid_grandchild_count = 0; + + if (grandchildren[1] != VK_BVH_INVALID_NODE) + ++valid_grandchild_count; + + if (grandchildren[0] != VK_BVH_INVALID_NODE) + ++valid_grandchild_count; + else + grandchildren[0] = grandchildren[1]; + + if (valid_grandchild_count > 1) + children[found_child_count++] = grandchildren[1]; + + if (valid_grandchild_count > 0) + children[collapsed_child_index] = grandchildren[0]; + else { + found_child_count--; + children[collapsed_child_index] = children[found_child_count]; + } + + DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; + } else + break; + } + + /* If there is only one child, collapse the current node by setting the + * child's offset to this node's offset. Otherwise, use an atomic to + * allocate contiguous space for all of the children. + */ + uint32_t children_offset = bvh_offset; + if (found_child_count > 1) { + children_offset = atomicAdd(DEREF(args.header).dst_node_offset, found_child_count); + } + + vec3 min_offset = vec3(INFINITY); + vec3 max_offset = vec3(-INFINITY); + for (uint32_t i = 0; i < found_child_count; ++i) { + uint32_t type = ir_id_to_type(children[i]); + uint32_t offset = ir_id_to_offset(children[i]); + uint32_t dst_offset; + + dst_offset = children_offset + i; + + if (type == vk_ir_node_internal) { + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset); + DEREF(child_node).bvh_offset = dst_offset; + } else { + encode_leaf_node(type, args.intermediate_bvh + offset, + args.output_bvh + SIZEOF(tu_internal_node) * dst_offset, dst_instances, + header); + } + + vk_aabb child_aabb = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; + + min_offset = min(min_offset, child_aabb.min); + max_offset = max(max_offset, child_aabb.max); + + if (found_child_count > 1) { + set_parent(dst_offset, bvh_offset); + } + } + + /* Make changes to the children's BVH offset value available to the other invocations. 
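+       *
+       * This is the release half of the protocol: child invocations are
+       * spinning at the top of this loop waiting for the bvh_offset values
+       * written just above.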
*/ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + + if (found_child_count > 1 || found_child_count == 0) + encode_internal_node(children, children_offset, found_child_count, min_offset, max_offset, bvh_offset); + + break; + } + + if (is_root_node) { + DEREF(header).aabb = src.base.aabb; + DEREF(header).bvh_ptr = args.output_bvh; + + set_parent(0, VK_BVH_INVALID_NODE); + } +} diff --git a/src/freedreno/vulkan/bvh/header.comp b/src/freedreno/vulkan/bvh/header.comp new file mode 100644 index 00000000000..d2c1f96f50f --- /dev/null +++ b/src/freedreno/vulkan/bvh/header.comp @@ -0,0 +1,65 @@ +/* + * Copyright © 2024 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */

+#version 460

+#extension GL_GOOGLE_include_directive : require

+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require

+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

+#include "tu_build_interface.h"

+layout(push_constant) uniform CONSTS
+{
+   header_args args;
+};

+void
+main(void)
+{
+   uint32_t compacted_size = args.bvh_offset + DEREF(args.src).dst_node_offset * SIZEOF(tu_internal_node);

+   uint32_t serialization_size = compacted_size + SIZEOF(vk_accel_struct_serialization_header) +
+                                 SIZEOF(uint64_t) * args.instance_count;

+   uint32_t size = serialization_size - SIZEOF(vk_accel_struct_serialization_header) -
+                   SIZEOF(uint64_t) * args.instance_count;

+   DEREF(args.dst).compacted_size = compacted_size;

+   DEREF(args.dst).copy_dispatch_size[0] = DIV_ROUND_UP(compacted_size, 16 * 128);
+   DEREF(args.dst).copy_dispatch_size[1] = 1;
+   DEREF(args.dst).copy_dispatch_size[2] = 1;

+   DEREF(args.dst).serialization_size = serialization_size;

+   DEREF(args.dst).size = size;
+}
diff --git a/src/freedreno/vulkan/bvh/meson.build b/src/freedreno/vulkan/bvh/meson.build
new file mode 100644
index 00000000000..02ec2a78e7b
--- /dev/null
+++ b/src/freedreno/vulkan/bvh/meson.build
@@ -0,0 +1,66 @@
+# Copyright © 2022 Konstantin Seurer

+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:

+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.

+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE. 
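+
+# Each compute shader below is compiled to SPIR-V by glslangValidator and
+# emitted as a C header of uint32_t words ('-x'). The driver then embeds the
+# words directly, as tu_acceleration_structure.cc does:
+#
+#    static const uint32_t encode_spv[] = {
+#    #include "bvh/encode.spv.h"
+#    };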
+ +# source file, output name, defines +bvh_shaders = [ + [ + 'encode.comp', + 'encode', + [], + ], + [ + 'header.comp', + 'header', + [], + ], + [ + 'copy.comp', + 'copy', + [] + ], +] + +tu_bvh_include_dir = dir_source_root + '/src/freedreno/vulkan/bvh' + +tu_bvh_includes = files( + 'tu_build_helpers.h', + 'tu_build_interface.h', + 'tu_bvh.h', +) + +bvh_spv = [] +foreach s : bvh_shaders + command = [ + prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + tu_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + ] + command += glslang_quiet + + foreach define : s[2] + command += '-D' + define + endforeach + + bvh_spv += custom_target( + s[1] + '.spv.h', + input : s[0], + output : s[1] + '.spv.h', + command : command, + depend_files: [vk_bvh_includes, tu_bvh_includes], + ) +endforeach diff --git a/src/freedreno/vulkan/bvh/tu_build_helpers.h b/src/freedreno/vulkan/bvh/tu_build_helpers.h new file mode 100644 index 00000000000..e33863fa129 --- /dev/null +++ b/src/freedreno/vulkan/bvh/tu_build_helpers.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2024 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef TU_BVH_BUILD_HELPERS_H +#define TU_BVH_BUILD_HELPERS_H + +#include "vk_build_helpers.h" +#include "tu_bvh.h" + +TYPE(tu_accel_struct_header, 8); +TYPE(tu_leaf_node, 4); +TYPE(tu_internal_node, 4); +TYPE(tu_compressed_node, 4); +TYPE(tu_instance_descriptor, 8); + +#endif + diff --git a/src/freedreno/vulkan/bvh/tu_build_interface.h b/src/freedreno/vulkan/bvh/tu_build_interface.h new file mode 100644 index 00000000000..4cf05ef597a --- /dev/null +++ b/src/freedreno/vulkan/bvh/tu_build_interface.h @@ -0,0 +1,64 @@ +/* + * Copyright © 2022 Konstantin Seurer + * Copyright © 2024 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */

+#ifndef TU_BVH_BUILD_INTERFACE_H
+#define TU_BVH_BUILD_INTERFACE_H

+#ifdef VULKAN
+#include "tu_build_helpers.h"
+#else
+#include <stdint.h>
+#include "tu_bvh.h"
+#define REF(type) uint64_t
+#define VOID_REF uint64_t
+#endif

+struct encode_args {
+   VOID_REF intermediate_bvh;
+   VOID_REF output_bvh;
+   REF(vk_ir_header) header;
+   uint32_t output_bvh_offset;
+   uint32_t leaf_node_count;
+   uint32_t geometry_type;
+};

+struct header_args {
+   REF(vk_ir_header) src;
+   REF(tu_accel_struct_header) dst;
+   uint32_t bvh_offset;
+   uint32_t instance_count;
+};

+#define TU_COPY_MODE_COPY 0
+#define TU_COPY_MODE_SERIALIZE 1
+#define TU_COPY_MODE_DESERIALIZE 2

+struct copy_args {
+   VOID_REF src_addr;
+   VOID_REF dst_addr;
+   uint32_t mode;
+};

+#endif
+
diff --git a/src/freedreno/vulkan/bvh/tu_bvh.h b/src/freedreno/vulkan/bvh/tu_bvh.h
new file mode 100644
index 00000000000..f3f98e83867
--- /dev/null
+++ b/src/freedreno/vulkan/bvh/tu_bvh.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright © 2021 Bas Nieuwenhuizen
+ * Copyright © 2024 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */

+#ifndef TU_BVH_H
+#define TU_BVH_H

+#ifdef VULKAN
+#define VK_UUID_SIZE 16
+#else
+#include <vulkan/vulkan.h>
+#endif

+#include "vk_bvh.h"

+/* The size in bytes of each record in the D3D-style UAV descriptor for
+ * acceleration structures. The first record is the acceleration struct header
+ * and the rest are the instances.
+ */
+#define AS_RECORD_SIZE 128

+/* The size of a BVH node as defined by the HW. */
+#define AS_NODE_SIZE 64

+struct tu_accel_struct_header {
+   vk_aabb aabb;

+   uint64_t bvh_ptr;

+   /* This word contains flags that should be set in the leaf nodes for
+    * instances pointing to this BLAS. TU_INSTANCE_ALL_{OPAQUE,NONOPAQUE} may
+    * be modified by the FORCE_OPAQUE and FORCE_NON_OPAQUE instance flags.
+    */
+   uint32_t instance_flags;

+   /* Everything after this gets either updated/copied from the CPU or written by header.comp. 
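+    *
+    * When the build allows compaction, header.comp fills the sizes and the
+    * copy dispatch parameters on the GPU after encoding; otherwise
+    * tu_acceleration_structure.cc precomputes them on the CPU and emits them
+    * with CP_MEM_WRITE.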
*/
+   uint32_t copy_dispatch_size[3];

+   uint64_t compacted_size;
+   uint64_t serialization_size;
+   uint64_t size;

+   /* Everything after this gets updated/copied from the CPU. */
+   uint64_t instance_count;

+   uint64_t self_ptr;

+   uint32_t padding[10];
+};

+/* See
+ * https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/a7xx-ray-tracing
+ * for details of the encoding.
+ */

+#define TU_NODE_TYPE_TLAS (1u << 24)
+#define TU_NODE_TYPE_LEAF (1u << 25)
+#define TU_NODE_TYPE_NONOPAQUE (1u << 26)
+#define TU_NODE_TYPE_AABB (1u << 27)

+#define TU_INTERSECTION_TYPE_TLAS (1u << 8)
+#define TU_INTERSECTION_TYPE_LEAF (1u << 9)
+#define TU_INTERSECTION_TYPE_NONOPAQUE (1u << 10)
+#define TU_INTERSECTION_TYPE_AABB (1u << 11)
+#define TU_INTERSECTION_BACK_FACE (1u << 12)

+#define TU_INSTANCE_ALL_OPAQUE (1u << 2)
+#define TU_INSTANCE_ALL_NONOPAQUE (1u << 3)
+#define TU_INSTANCE_ALL_AABB (1u << 6)

+struct tu_leaf_node {
+   uint32_t id;
+   float coords[3][3];
+   uint32_t geometry_id; /* Ignored by HW, we use it to stash the geometry ID */
+   uint32_t padding[4];
+   uint32_t type_flags;
+};

+struct tu_internal_node {
+   uint32_t id;
+   uint16_t bases[3];
+   uint8_t mantissas[8][2][3];
+   uint8_t exponents[3];
+   uint8_t child_count;
+   uint16_t type_flags;
+};

+struct tu_compressed_node {
+   uint32_t id;
+   uint32_t bases[3];
+   uint32_t data[12];
+};

+struct tu_instance_descriptor {
+   uint64_t bvh_ptr;

+   uint32_t custom_instance_index;

+   /* lower 24 bits are the sbt offset, upper 8 bits are the
+    * VkGeometryInstanceFlagsKHR
+    */
+   uint32_t sbt_offset_and_flags;

+   mat3x4 wto_matrix;

+   uint32_t bvh_offset;

+   /* Pad to make the size a power of 2 so that addressing math is
+    * simplified.
+    */
+   uint32_t reserved[3];

+   /* Object to world matrix transposed from the initial transform. 
*/ + mat3x4 otw_matrix; +}; + +#endif + diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build index 16ab9357e36..532a82f7ba6 100644 --- a/src/freedreno/vulkan/meson.build +++ b/src/freedreno/vulkan/meson.build @@ -19,7 +19,10 @@ tu_entrypoints = custom_target( libtu_files = files( + 'bvh/tu_bvh.h', + 'bvh/tu_build_interface.h', 'layers/tu_rmv_layer.cc', + 'tu_acceleration_structure.cc', 'tu_autotune.cc', 'tu_buffer.cc', 'tu_buffer_view.cc', @@ -47,6 +50,8 @@ libtu_files = files( 'tu_util.cc', ) +subdir('bvh') + libtu_includes = [ inc_include, inc_src, @@ -160,7 +165,7 @@ endif libvulkan_freedreno = shared_library( 'vulkan_freedreno', - [libtu_files, tu_entrypoints, tu_tracepoints, freedreno_xml_header_files, sha1_h, u_format_pack_h], + [libtu_files, tu_entrypoints, tu_tracepoints, freedreno_xml_header_files, sha1_h, u_format_pack_h, bvh_spv], include_directories : libtu_includes, link_with : [ libfreedreno_ir3, diff --git a/src/freedreno/vulkan/tu_acceleration_structure.cc b/src/freedreno/vulkan/tu_acceleration_structure.cc new file mode 100644 index 00000000000..f87337e899a --- /dev/null +++ b/src/freedreno/vulkan/tu_acceleration_structure.cc @@ -0,0 +1,763 @@ +/* + * Copyright © 2021 Bas Nieuwenhuizen + * Copyright © 2024 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "tu_buffer.h" +#include "tu_device.h" +#include "tu_cmd_buffer.h" + +#include "vk_acceleration_structure.h" +#include "tu_acceleration_structure.h" +#include "radix_sort/radix_sort_u64.h" + +#include "util/u_hexdump.h" + +#include "bvh/tu_build_interface.h" + +static const uint32_t encode_spv[] = { +#include "bvh/encode.spv.h" +}; + +static const uint32_t header_spv[] = { +#include "bvh/header.spv.h" +}; + +static const uint32_t copy_spv[] = { +#include "bvh/copy.spv.h" +}; + +static_assert(sizeof(struct tu_instance_descriptor) == AS_RECORD_SIZE); +static_assert(sizeof(struct tu_accel_struct_header) == AS_RECORD_SIZE); +static_assert(sizeof(struct tu_internal_node) == AS_NODE_SIZE); +static_assert(sizeof(struct tu_leaf_node) == AS_NODE_SIZE); + +static VkResult +get_pipeline_spv(struct tu_device *device, + const char *name, const uint32_t *spv, uint32_t spv_size, + unsigned push_constant_size, + VkPipeline *pipeline, VkPipelineLayout *layout) +{ + size_t key_size = strlen(name); + + const VkPushConstantRange pc_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = push_constant_size, + }; + + VkResult result = vk_meta_get_pipeline_layout(&device->vk, + &device->meta, NULL, + &pc_range, name, key_size, + layout); + + if (result != VK_SUCCESS) + return result; + + VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta, name, key_size); + if (pipeline_from_cache != VK_NULL_HANDLE) { + *pipeline = pipeline_from_cache; + return VK_SUCCESS; + } + + VkShaderModuleCreateInfo module_info = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = NULL, + .flags = 0, + .codeSize = spv_size, + .pCode = spv, + }; + + VkPipelineShaderStageCreateInfo shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &module_info, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + .pSpecializationInfo = NULL, + }; + + VkComputePipelineCreateInfo pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .flags = 0, + .stage = shader_stage, + .layout = *layout, + }; + + return vk_meta_create_compute_pipeline(&device->vk, &device->meta, &pipeline_info, + name, key_size, pipeline); +} + +struct bvh_layout { + uint64_t bvh_offset; + uint64_t size; +}; + +static void +get_bvh_layout(VkGeometryTypeKHR geometry_type, + uint32_t leaf_count, + struct bvh_layout *layout) +{ + uint32_t internal_count = MAX2(leaf_count, 2) - 1; + + uint64_t offset = sizeof(struct tu_accel_struct_header); + + /* Instance descriptors, one per instance. */ + if (geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR) { + offset += leaf_count * sizeof(struct tu_instance_descriptor); + } + + /* Parent links, which have to go directly before bvh_offset as we index + * them using negative offsets from there. + */ + offset += (internal_count + leaf_count) * sizeof(uint32_t); + + /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. 
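+    *
+    * As a worked example, a TLAS with leaf_count = 4 has internal_count = 3,
+    * so: 128-byte header + 4 * 128-byte instance descriptors + (3 + 4) * 4
+    * bytes of parent links = 668 bytes, giving bvh_offset = ALIGN(668, 64) =
+    * 704 and layout->size = 704 + 3 * 64 + 4 * 64 = 1152.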
*/ + offset = ALIGN(offset, 64); + layout->bvh_offset = offset; + + offset += internal_count * sizeof(struct tu_internal_node) + + leaf_count * sizeof(struct tu_leaf_node); + + layout->size = offset; +} + +VkDeviceSize get_bvh_size(VkDevice device, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + uint32_t leaf_count) +{ + struct bvh_layout layout; + get_bvh_layout(vk_get_as_geometry_type(pBuildInfo), leaf_count, &layout); + return layout.size; +} + +static uint32_t +encode_key(VkAccelerationStructureTypeKHR type, + VkBuildAccelerationStructureFlagBitsKHR flags) +{ + return 0; +} + + +static VkResult +encode_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + struct tu_device *device = cmdbuf->device; + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(device, "encode", encode_spv, sizeof(encode_spv), + sizeof(encode_args), &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + return VK_SUCCESS; +} + +static void +encode(VkCommandBuffer commandBuffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, + uint32_t leaf_count, + uint32_t key, + struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + struct tu_device *device = cmdbuf->device; + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); + + VkPipeline pipeline; + VkPipelineLayout layout; + get_pipeline_spv(device, "encode", encode_spv, sizeof(encode_spv), + sizeof(encode_args), &pipeline, &layout); + + struct bvh_layout bvh_layout; + get_bvh_layout(geometry_type, leaf_count, &bvh_layout); + + const struct encode_args args = { + .intermediate_bvh = intermediate_as_addr, + .output_bvh = vk_acceleration_structure_get_va(dst) + bvh_layout.bvh_offset, + .header = intermediate_header_addr, + .output_bvh_offset = bvh_layout.bvh_offset, + .leaf_node_count = leaf_count, + .geometry_type = geometry_type, + }; + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), + &args); + + tu_dispatch_unaligned_indirect(commandBuffer, + intermediate_header_addr + + offsetof(struct vk_ir_header, ir_internal_node_count)); + + *(VkDeviceSize *) + util_sparse_array_get(&device->accel_struct_ranges, + vk_acceleration_structure_get_va(dst)) = dst->size; + +} + +/* Don't bother copying over the compacted size using a compute shader if + * compaction is never going to happen. + */ +enum tu_header_key { + HEADER_NO_DISPATCH, + HEADER_USE_DISPATCH +}; + +static uint32_t +header_key(VkAccelerationStructureTypeKHR type, + VkBuildAccelerationStructureFlagBitsKHR flags) +{ + return (flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) ? 
+ HEADER_USE_DISPATCH : HEADER_NO_DISPATCH; +} + +static VkResult +header_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + struct tu_device *device = cmdbuf->device; + + if (key == HEADER_USE_DISPATCH) { + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(device, "header", header_spv, sizeof(header_spv), + sizeof(header_args), &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + static const VkMemoryBarrier mb = { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, + }; + + vk_common_CmdPipelineBarrier(commandBuffer, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, 1, &mb, 0, NULL, 0, NULL); + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + } + + return VK_SUCCESS; +} + +static void +header(VkCommandBuffer commandBuffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, + uint32_t leaf_count, + uint32_t key, + struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + struct tu_device *device = cmdbuf->device; + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); + + struct bvh_layout bvh_layout; + get_bvh_layout(geometry_type, leaf_count, &bvh_layout); + + VkDeviceAddress header_addr = vk_acceleration_structure_get_va(dst); + + size_t base = offsetof(struct tu_accel_struct_header, copy_dispatch_size); + + uint32_t instance_count = + geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ? 
leaf_count : 0; + + if (key == HEADER_USE_DISPATCH) { + base = offsetof(struct tu_accel_struct_header, instance_count); + VkPipeline pipeline; + VkPipelineLayout layout; + get_pipeline_spv(device, "header", header_spv, sizeof(header_spv), + sizeof(header_args), &pipeline, &layout); + + struct header_args args = { + .src = intermediate_header_addr, + .dst = vk_acceleration_structure_get_va(dst), + .bvh_offset = bvh_layout.bvh_offset, + .instance_count = instance_count, + }; + + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), + &args); + + vk_common_CmdDispatch(commandBuffer, 1, 1, 1); + } + + struct tu_accel_struct_header header = {}; + + header.instance_count = instance_count; + header.self_ptr = header_addr; + header.compacted_size = bvh_layout.size; + + header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 128); + header.copy_dispatch_size[1] = 1; + header.copy_dispatch_size[2] = 1; + + header.serialization_size = + header.compacted_size + + sizeof(struct vk_accel_struct_serialization_header) + sizeof(uint64_t) * header.instance_count; + + header.size = header.serialization_size - sizeof(struct vk_accel_struct_serialization_header) - + sizeof(uint64_t) * header.instance_count; + + struct tu_cs *cs = &cmdbuf->cs; + + size_t header_size = sizeof(struct tu_accel_struct_header) - base; + assert(base % sizeof(uint32_t) == 0); + assert(header_size % sizeof(uint32_t) == 0); + uint32_t *header_ptr = (uint32_t *)((char *)&header + base); + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + header_size / sizeof(uint32_t)); + tu_cs_emit_qw(cs, header_addr + base); + tu_cs_emit_array(cs, header_ptr, header_size / sizeof(uint32_t)); +} + +const struct vk_acceleration_structure_build_ops tu_as_build_ops = { + .get_as_size = get_bvh_size, + .get_encode_key = { encode_key, header_key }, + .encode_bind_pipeline = { encode_bind_pipeline, header_bind_pipeline }, + .encode_as = { encode, header }, +}; + +struct radix_sort_vk_target_config tu_radix_sort_config = { + .keyval_dwords = 2, + .init = { .workgroup_size_log2 = 8, }, + .fill = { .workgroup_size_log2 = 8, .block_rows = 8 }, + .histogram = { + .workgroup_size_log2 = 8, + .subgroup_size_log2 = 7, + .block_rows = 14, /* TODO tune this */ + }, + .prefix = { + .workgroup_size_log2 = 8, + .subgroup_size_log2 = 7, + }, + .scatter = { + .workgroup_size_log2 = 8, + .subgroup_size_log2 = 7, + .block_rows = 14, /* TODO tune this */ + }, + .nonsequential_dispatch = false, +}; + +static VkResult +init_radix_sort(struct tu_device *device) +{ + if (!device->radix_sort) { + mtx_lock(&device->radix_sort_mutex); + if (!device->radix_sort) { + device->radix_sort = + vk_create_radix_sort_u64(tu_device_to_handle(device), + &device->vk.alloc, + VK_NULL_HANDLE, tu_radix_sort_config); + if (!device->radix_sort) { + /* TODO plumb through the error here */ + mtx_unlock(&device->radix_sort_mutex); + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + } + mtx_unlock(&device->radix_sort_mutex); + } + + return VK_SUCCESS; +} + +struct tu_saved_compute_state { + uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; + struct tu_shader *compute_shader; +}; + +static void +tu_save_compute_state(struct tu_cmd_buffer *cmd, + struct tu_saved_compute_state *state) +{ + memcpy(state->push_constants, cmd->push_constants, sizeof(cmd->push_constants)); + state->compute_shader = cmd->state.shaders[MESA_SHADER_COMPUTE]; +} + +static void +tu_restore_compute_state(struct tu_cmd_buffer *cmd, + struct tu_saved_compute_state *state) +{ + 
cmd->state.shaders[MESA_SHADER_COMPUTE] = state->compute_shader; + if (state->compute_shader) { + tu_cs_emit_state_ib(&cmd->cs, state->compute_shader->state); + } + memcpy(cmd->push_constants, state->push_constants, sizeof(cmd->push_constants)); + cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS; +} + +VKAPI_ATTR void VKAPI_CALL +tu_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + struct tu_device *device = cmd->device; + struct tu_saved_compute_state state; + + VkResult result = init_radix_sort(device); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + tu_save_compute_state(cmd, &state); + + struct vk_acceleration_structure_build_args args = { + .subgroup_size = 128, + .bvh_bounds_offset = offsetof(tu_accel_struct_header, aabb), + .emit_markers = false, + .radix_sort = device->radix_sort, + }; + + vk_cmd_build_acceleration_structures(commandBuffer, + &device->vk, + &device->meta, + infoCount, + pInfos, + ppBuildRangeInfos, + &args); + + tu_restore_compute_state(cmd, &state); +} + +VKAPI_ATTR void VKAPI_CALL +tu_CmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer, const VkCopyAccelerationStructureInfoKHR *pInfo) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); + VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); + struct tu_saved_compute_state state; + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(cmd->device, "copy", copy_spv, sizeof(copy_spv), + sizeof(copy_args), &pipeline, &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + tu_save_compute_state(cmd, &state); + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + struct copy_args consts = { + .src_addr = vk_acceleration_structure_get_va(src), + .dst_addr = vk_acceleration_structure_get_va(dst), + .mode = TU_COPY_MODE_COPY, + }; + + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), + &consts); + + TU_CALLX(cmd->device, tu_CmdDispatchIndirect)( + commandBuffer, src->buffer, + src->offset + offsetof(struct tu_accel_struct_header, copy_dispatch_size)); + + tu_restore_compute_state(cmd, &state); +} + +VKAPI_ATTR void VKAPI_CALL +tu_CmdCopyMemoryToAccelerationStructureKHR(VkCommandBuffer commandBuffer, + const VkCopyMemoryToAccelerationStructureInfoKHR *pInfo) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); + struct tu_saved_compute_state state; + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(cmd->device, "copy", copy_spv, sizeof(copy_spv), + sizeof(copy_args), &pipeline, &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + tu_save_compute_state(cmd, &state); + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + const struct copy_args consts = { + .src_addr = pInfo->src.deviceAddress, + .dst_addr = vk_acceleration_structure_get_va(dst), + .mode = TU_COPY_MODE_DESERIALIZE, + }; + + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), + &consts); + + 
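+   /* copy.comp walks the data with a grid-stride loop (offset advances by
+    * lanes * 16 per iteration), so any workgroup count is functionally
+    * correct. A fixed count is used here because the real size is only
+    * available inside the app-provided serialized blob, which the CP cannot
+    * cheaply read for an indirect dispatch; 256 groups is presumably just a
+    * throughput compromise.
+    */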
vk_common_CmdDispatch(commandBuffer, 256, 1, 1); + + tu_restore_compute_state(cmd, &state); +} + +VKAPI_ATTR void VKAPI_CALL +tu_CmdCopyAccelerationStructureToMemoryKHR(VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureToMemoryInfoKHR *pInfo) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); + struct tu_saved_compute_state state; + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(cmd->device, "copy", copy_spv, sizeof(copy_spv), + sizeof(copy_args), &pipeline, &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + tu_save_compute_state(cmd, &state); + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + const struct copy_args consts = { + .src_addr = vk_acceleration_structure_get_va(src), + .dst_addr = pInfo->dst.deviceAddress, + .mode = TU_COPY_MODE_SERIALIZE, + }; + + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), + &consts); + + TU_CALLX(cmd->device, tu_CmdDispatchIndirect)( + commandBuffer, src->buffer, + src->offset + offsetof(struct tu_accel_struct_header, copy_dispatch_size)); + + tu_restore_compute_state(cmd, &state); + + /* Set the header of the serialized data. */ + uint32_t header_data[2 * VK_UUID_SIZE / 4]; + memcpy(header_data, cmd->device->physical_device->driver_uuid, VK_UUID_SIZE); + memcpy(header_data + VK_UUID_SIZE / 4, cmd->device->physical_device->cache_uuid, VK_UUID_SIZE); + + struct tu_cs *cs = &cmd->cs; + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + ARRAY_SIZE(header_data)); + tu_cs_emit_qw(cs, pInfo->dst.deviceAddress); + tu_cs_emit_array(cs, header_data, ARRAY_SIZE(header_data)); +} + +VKAPI_ATTR void VKAPI_CALL +tu_GetAccelerationStructureBuildSizesKHR(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + const uint32_t *pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo) +{ + VK_FROM_HANDLE(tu_device, device, _device); + + init_radix_sort(device); + + struct vk_acceleration_structure_build_args args = { + .subgroup_size = 128, + .radix_sort = device->radix_sort, + }; + + vk_get_as_build_sizes(_device, buildType, pBuildInfo, pMaxPrimitiveCounts, + pSizeInfo, &args); +} + +VKAPI_ATTR void VKAPI_CALL +tu_GetDeviceAccelerationStructureCompatibilityKHR(VkDevice _device, + const VkAccelerationStructureVersionInfoKHR *pVersionInfo, + VkAccelerationStructureCompatibilityKHR *pCompatibility) +{ + VK_FROM_HANDLE(tu_device, device, _device); + bool compat = + memcmp(pVersionInfo->pVersionData, device->physical_device->driver_uuid, VK_UUID_SIZE) == 0 && + memcmp(pVersionInfo->pVersionData + VK_UUID_SIZE, device->physical_device->cache_uuid, VK_UUID_SIZE) == 0; + *pCompatibility = compat ? 
VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR + : VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR; +} + +VkResult +tu_init_null_accel_struct(struct tu_device *device) +{ + VkResult result = tu_bo_init_new(device, NULL, + &device->null_accel_struct_bo, + sizeof(tu_accel_struct_header) + + sizeof(tu_internal_node), + TU_BO_ALLOC_NO_FLAGS, "null AS"); + if (result != VK_SUCCESS) { + return result; + } + + result = tu_bo_map(device, device->null_accel_struct_bo, NULL); + if (result != VK_SUCCESS) { + tu_bo_finish(device, device->null_accel_struct_bo); + return result; + } + + struct tu_accel_struct_header header = { + .bvh_ptr = device->null_accel_struct_bo->iova + + sizeof(tu_accel_struct_header), + .self_ptr = device->null_accel_struct_bo->iova, + }; + + struct tu_internal_node node = { + .child_count = 0, + .type_flags = 0, + }; + + for (unsigned i = 0; i < 8; i++) { + node.mantissas[i][0][0] = 0xff; + node.mantissas[i][0][1] = 0xff; + node.mantissas[i][0][2] = 0xff; + } + + memcpy(device->null_accel_struct_bo->map, (void *)&header, sizeof(header)); + memcpy((char *)device->null_accel_struct_bo->map + sizeof(header), + (void *)&node, sizeof(node)); + return VK_SUCCESS; +} + +struct tu_node { + uint32_t data[16]; +}; + +static void +dump_leaf(struct tu_leaf_node *node) +{ + fprintf(stderr, "\tID: %d\n", node->id); + fprintf(stderr, "\tgeometry ID: %d\n", node->geometry_id); + bool aabb = node->type_flags & TU_NODE_TYPE_AABB; + for (unsigned i = 0; i < (aabb ? 2 : 3); i++) { + fprintf(stderr, "\t("); + for (unsigned j = 0; j < 3; j++) { + if (j != 0) + fprintf(stderr, ", "); + fprintf(stderr, "%f", node->coords[i][j]); + } + fprintf(stderr, ")\n"); + } +} + +static void +dump_internal(struct tu_internal_node *node, uint32_t *max_child) +{ + *max_child = MAX2(*max_child, node->id + node->child_count); + float base[3]; + unsigned exponents[3]; + for (unsigned i = 0; i < 3; i++) { + base[i] = uif(node->bases[i] << 16); + exponents[i] = node->exponents[i] - 134; + } + + for (unsigned i = 0; i < node->child_count; i++) { + fprintf(stderr, "\tchild %d\n", node->id + i); + for (unsigned vert = 0; vert < 2; vert++) { + fprintf(stderr, "\t\t("); + for (unsigned coord = 0; coord < 3; coord++) { + unsigned mantissa = node->mantissas[i][vert][coord]; + if (coord != 0) + fprintf(stderr, ", "); + fprintf(stderr, "%f", base[coord] + ldexp((float)mantissa, + exponents[coord])); + } + fprintf(stderr, ")\n"); + } + } +} + +static void +dump_as(struct vk_acceleration_structure *as) +{ + VK_FROM_HANDLE(tu_buffer, buf, as->buffer); + + struct tu_accel_struct_header *hdr = + (struct tu_accel_struct_header *)((char *)buf->bo->map + as->offset); + + fprintf(stderr, "dumping AS at %" PRIx64 "\n", buf->iova + as->offset); + u_hexdump(stderr, (uint8_t *)hdr, sizeof(*hdr), false); + + char *base = ((char *)buf->bo->map + (hdr->bvh_ptr - buf->iova)); + struct tu_node *node = (struct tu_node *)base; + + fprintf(stderr, "dumping nodes at %" PRIx64 "\n", hdr->bvh_ptr); + + uint32_t max_child = 1; + for (unsigned i = 0; i < max_child; i++) { + uint32_t *parent_ptr = (uint32_t*)(base - (4 + 4 * i)); + uint32_t parent = *parent_ptr; + fprintf(stderr, "node %d parent %d\n", i, parent); + u_hexdump(stderr, (uint8_t *)node, sizeof(*node), false); + if (node->data[15] & TU_NODE_TYPE_LEAF) { + /* TODO compressed leaves */ + dump_leaf((struct tu_leaf_node *)node); + } else { + dump_internal((struct tu_internal_node *)node, &max_child); + } + + node++; + } +} + +static bool +as_finished(struct tu_device *dev, struct 
+{
+   VK_FROM_HANDLE(tu_buffer, buf, as->buffer);
+   tu_bo_map(dev, buf->bo, NULL);
+
+   struct tu_accel_struct_header *hdr =
+      (struct tu_accel_struct_header *)((char *)buf->bo->map + as->offset);
+   return hdr->self_ptr == buf->iova + as->offset;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+tu_DestroyAccelerationStructureKHR(VkDevice _device,
+                                   VkAccelerationStructureKHR accelerationStructure,
+                                   const VkAllocationCallbacks *pAllocator)
+{
+   VK_FROM_HANDLE(tu_device, device, _device);
+   if (TU_DEBUG(DUMPAS)) {
+      VK_FROM_HANDLE(vk_acceleration_structure, as, accelerationStructure);
+      if (as_finished(device, as))
+         dump_as(as);
+   }
+
+   vk_common_DestroyAccelerationStructureKHR(_device, accelerationStructure,
+                                             pAllocator);
+}
diff --git a/src/freedreno/vulkan/tu_acceleration_structure.h b/src/freedreno/vulkan/tu_acceleration_structure.h
new file mode 100644
index 00000000000..08a88c61ddf
--- /dev/null
+++ b/src/freedreno/vulkan/tu_acceleration_structure.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright © 2024 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef TU_ACCELERATION_STRUCT_H
+#define TU_ACCELERATION_STRUCT_H
+
+#include "tu_common.h"
+
+VkResult tu_init_null_accel_struct(struct tu_device *device);
+
+extern const vk_acceleration_structure_build_ops tu_as_build_ops;
+
+#endif
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 001f6d2e068..d606486adcc 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -3958,13 +3958,26 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
                        VK_ACCESS_2_SHADER_READ_BIT |
                        VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
                        VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                       VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR,
+                       VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR |
+                       VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
                        VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
                        VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
                        VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
+                       VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
+                       VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
                        SHADER_STAGES))
       mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ;
 
+   /* Reading the AS for copying involves doing CmdDispatchIndirect with the
+    * copy size as a parameter, so it's read by the CP as well as a shader.
+    */
+   if (gfx_read_access(flags, stages,
+                       VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
+                       VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
+                       VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR))
+      mask |= TU_ACCESS_SYSMEM_READ;
+
    if (gfx_read_access(flags, stages,
                        VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT,
                        SHADER_STAGES))
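These vk2tu_access() additions (the read mapping above and the write mapping
in the next hunk) are what an ordinary synchronization2 barrier around an AS
build exercises. A minimal app-side sketch using core VK_KHR_synchronization2
types (illustrative, not part of the patch):

   /* Make an acceleration-structure build visible to a following
    * copy/compaction pass; pNext and unused fields zero-initialized. */
   VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR,
      .srcAccessMask = VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR,
      .dstStageMask = VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR,
      .dstAccessMask = VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   vkCmdPipelineBarrier2(cmd, &dep);

On the turnip side, the read half of this barrier resolves to
TU_ACCESS_SYSMEM_READ plus the UCHE/CCHE read bits, and the write half to
TU_ACCESS_UCHE_WRITE | TU_ACCESS_CP_WRITE.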
@@ -3985,6 +3998,11 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
                         SHADER_STAGES))
       mask |= TU_ACCESS_UCHE_WRITE;
 
+   if (gfx_write_access(flags, stages,
+                        VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR,
+                        VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR))
+      mask |= TU_ACCESS_UCHE_WRITE | TU_ACCESS_CP_WRITE;
+
    /* When using GMEM, the CCU is always flushed automatically to GMEM, and
     * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
     * previous writes in sysmem mode when transitioning to GMEM. Therefore we
diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc
index 80ffdf66627..e90a44e2662 100644
--- a/src/freedreno/vulkan/tu_descriptor_set.cc
+++ b/src/freedreno/vulkan/tu_descriptor_set.cc
@@ -26,6 +26,7 @@
 #include "util/mesa-sha1.h"
 #include "vk_descriptors.h"
 #include "vk_util.h"
+#include "vk_acceleration_structure.h"
 
 #include "tu_buffer.h"
 #include "tu_buffer_view.h"
@@ -33,6 +34,7 @@
 #include "tu_image.h"
 #include "tu_formats.h"
 #include "tu_rmv.h"
+#include "bvh/tu_build_interface.h"
 
 static inline uint8_t *
 pool_base(struct tu_descriptor_pool *pool)
@@ -67,6 +69,7 @@ descriptor_size(struct tu_device *dev,
                    COND(dev->physical_device->info->a7xx.storage_8bit, 1));
    case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
       return binding->descriptorCount;
+   case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
    default:
       return A6XX_TEX_CONST_DWORDS * 4;
    }
@@ -1156,6 +1159,27 @@ write_sampler_descriptor(uint32_t *dst, VkSampler _sampler)
    memcpy(dst, sampler->descriptor, sizeof(sampler->descriptor));
 }
 
+static void
+write_accel_struct(uint32_t *dst, uint64_t va, uint64_t size)
+{
+   dst[0] = A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | A6XX_TEX_CONST_0_FMT(FMT6_32_UINT);
+
+   /* The overall range of the entire AS may be more than the max range, but
+    * the SSBO is only used to access the instance descriptors and header.
+    * Make sure that we don't specify a too-large range.
+    */
+   dst[1] = MIN2(DIV_ROUND_UP(size, AS_RECORD_SIZE), MAX_TEXEL_ELEMENTS);
+   dst[2] =
+      A6XX_TEX_CONST_2_STRUCTSIZETEXELS(AS_RECORD_SIZE / 4) |
+      A6XX_TEX_CONST_2_STARTOFFSETTEXELS(0) |
+      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER);
+   dst[3] = 0;
+   dst[4] = A6XX_TEX_CONST_4_BASE_LO(va);
+   dst[5] = A6XX_TEX_CONST_5_BASE_HI(va >> 32);
+   for (int j = 6; j < A6XX_TEX_CONST_DWORDS; j++)
+      dst[j] = 0;
+}
+
 /* note: this is used with immutable samplers in push descriptors */
 static void
 write_sampler_push(uint32_t *dst, const struct tu_sampler *sampler)
@@ -1203,6 +1227,18 @@ tu_GetDescriptorEXT(
    case VK_DESCRIPTOR_TYPE_SAMPLER:
       write_sampler_descriptor(dest, *pDescriptorInfo->data.pSampler);
       break;
+   case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
+      if (pDescriptorInfo->data.accelerationStructure == 0) {
+         write_accel_struct(dest, device->null_accel_struct_bo->iova,
+                            device->null_accel_struct_bo->size);
+      } else {
+         VkDeviceSize size = *(VkDeviceSize *)
+            util_sparse_array_get(&device->accel_struct_ranges,
+                                  pDescriptorInfo->data.accelerationStructure);
+         write_accel_struct(dest, pDescriptorInfo->data.accelerationStructure, size);
+      }
+      break;
+   }
    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
       write_image_descriptor(dest, VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT,
                              pDescriptorInfo->data.pInputAttachmentImage);
@@ -1236,6 +1272,8 @@ tu_update_descriptor_sets(const struct tu_device *device,
          ptr += binding_layout->offset / 4;
       }
 
+      const VkWriteDescriptorSetAccelerationStructureKHR *accel_structs = NULL;
+
       /* for immutable samplers with push descriptors: */
       const bool copy_immutable_samplers =
         dstSetOverride && binding_layout->immutable_samplers_offset;
@@ -1279,6 +1317,9 @@ tu_update_descriptor_sets(const struct tu_device *device,
          } while (remaining > 0);
 
         continue;
+      } else if (writeset->descriptorType ==
+                 VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR) {
+         accel_structs = vk_find_struct_const(writeset->pNext, WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR);
      }
 
      ptr += binding_layout->size / 4 * writeset->dstArrayElement;
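For a concrete feel for the range clamp in write_accel_struct() above, here
is the same computation as a stand-alone sketch; the real AS_RECORD_SIZE and
MAX_TEXEL_ELEMENTS come from the driver's headers, so the values below are
placeholders, not the real ones:

   #include <stdint.h>

   #define AS_RECORD_SIZE     64u               /* placeholder value */
   #define MAX_TEXEL_ELEMENTS ((1u << 23) - 1)  /* placeholder value */

   /* Record count the descriptor advertises for an AS of 'size' bytes:
    * round up to whole records, then clamp to the HW texel-count limit.
    * Only the header and instance descriptors at the start of the AS are
    * ever read through this SSBO, so the clamp is safe. */
   static uint32_t
   accel_struct_record_count(uint64_t size)
   {
      uint64_t records = (size + AS_RECORD_SIZE - 1) / AS_RECORD_SIZE;
      return records < MAX_TEXEL_ELEMENTS ? (uint32_t)records
                                          : MAX_TEXEL_ELEMENTS;
   }

@@ -1316,6 +1357,18 @@ tu_update_descriptor_sets(const struct tu_device *device,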
          else if (copy_immutable_samplers)
             write_sampler_push(ptr, &samplers[writeset->dstArrayElement + j]);
          break;
+      case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
+         VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, accel_structs->pAccelerationStructures[j]);
+         if (accel_struct) {
+            write_accel_struct(ptr,
+                               vk_acceleration_structure_get_va(accel_struct),
+                               accel_struct->size);
+         } else {
+            write_accel_struct(ptr, device->null_accel_struct_bo->iova,
+                               device->null_accel_struct_bo->size);
+         }
+         break;
+      }
       default:
          unreachable("unimplemented descriptor type");
          break;
@@ -1646,6 +1699,18 @@ tu_update_descriptor_set_with_template(
          else if (samplers)
            write_sampler_push(ptr, &samplers[j]);
          break;
+      case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
+         VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, *(const VkAccelerationStructureKHR *)src);
+         if (accel_struct) {
+            write_accel_struct(ptr,
+                               vk_acceleration_structure_get_va(accel_struct),
+                               accel_struct->size);
+         } else {
+            write_accel_struct(ptr, device->null_accel_struct_bo->iova,
+                               device->null_accel_struct_bo->size);
+         }
+         break;
+      }
       default:
          unreachable("unimplemented descriptor type");
          break;
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index df94a96a870..10bfa99b5d0 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -30,6 +30,7 @@
 #include "freedreno/common/freedreno_uuid.h"
 #include "freedreno/common/freedreno_stompable_regs.h"
 
+#include "tu_acceleration_structure.h"
 #include "tu_clear_blit.h"
 #include "tu_cmd_buffer.h"
 #include "tu_cs.h"
@@ -143,9 +144,19 @@ static void
 get_device_extensions(const struct tu_physical_device *device,
                       struct vk_device_extension_table *ext)
 {
+   /* device->has_raytracing contains the value of the SW fuse. If the
+    * device doesn't have a fuse (e.g. a740), we have to ignore it because
+    * kgsl returns false. If it does have a fuse, enable raytracing if the
+    * fuse is set and we have ray_intersection.
+    */
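In table form, the expression just below enables the extension as follows
(read directly off the two conditions):

   has_ray_intersection   SW fuse present   fuse set   KHR_acceleration_structure
   no                     -                 -          off
   yes                    no (a740)         ignored    on
   yes                    yes               yes        on
   yes                    yes               no         off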
+   bool has_raytracing =
+      device->info->a7xx.has_ray_intersection &&
+      (!device->info->a7xx.has_sw_fuse || device->has_raytracing);
+
    *ext = (struct vk_device_extension_table) { .table = {
       .KHR_8bit_storage = device->info->a7xx.storage_8bit,
       .KHR_16bit_storage = device->info->a6xx.storage_16bit,
+      .KHR_acceleration_structure = has_raytracing,
       .KHR_bind_memory2 = true,
       .KHR_buffer_device_address = true,
       .KHR_calibrated_timestamps = device->info->a7xx.has_persistent_counter,
@@ -153,6 +164,7 @@ get_device_extensions(const struct tu_physical_device *device,
       .KHR_copy_commands2 = true,
       .KHR_create_renderpass2 = true,
       .KHR_dedicated_allocation = true,
+      .KHR_deferred_host_operations = true,
       .KHR_depth_stencil_resolve = true,
       .KHR_descriptor_update_template = true,
       .KHR_device_group = true,
@@ -466,6 +478,11 @@ tu_get_features(struct tu_physical_device *pdevice,
    /* Vulkan 1.4 */
    features->pushDescriptor = true;
 
+   /* VK_KHR_acceleration_structure */
+   features->accelerationStructure = true;
+   features->accelerationStructureCaptureReplay = pdevice->has_set_iova;
+   features->descriptorBindingAccelerationStructureUpdateAfterBind = true;
+
    /* VK_KHR_compute_shader_derivatives */
    features->computeDerivativeGroupQuads = pdevice->info->chip >= 7;
    features->computeDerivativeGroupLinear = pdevice->info->chip >= 7;
@@ -1219,6 +1236,7 @@ tu_get_properties(struct tu_physical_device *pdevice,
               COND(pdevice->info->a7xx.storage_8bit, 1));
       props->robustStorageBufferDescriptorSize =
          props->storageBufferDescriptorSize;
+      props->accelerationStructureDescriptorSize = 4 * A6XX_TEX_CONST_DWORDS;
       props->inputAttachmentDescriptorSize = A6XX_TEX_CONST_DWORDS * 4;
       props->maxSamplerDescriptorBufferRange = ~0ull;
       props->maxResourceDescriptorBufferRange = ~0ull;
@@ -1304,6 +1322,16 @@
       memcpy(props->optimalTilingLayoutUUID, sha1, VK_UUID_SIZE);
    }
 
+
+   /* VK_KHR_acceleration_structure */
+   props->maxGeometryCount = (1 << 24) - 1;
+   props->maxInstanceCount = (1 << 24) - 1;
+   props->maxPrimitiveCount = (1 << 29) - 1;
+   props->maxPerStageDescriptorAccelerationStructures = max_descriptor_set_size;
+   props->maxPerStageDescriptorUpdateAfterBindAccelerationStructures = max_descriptor_set_size;
+   props->maxDescriptorSetAccelerationStructures = max_descriptor_set_size;
+   props->maxDescriptorSetUpdateAfterBindAccelerationStructures = max_descriptor_set_size;
+   props->minAccelerationStructureScratchOffsetAlignment = 128;
 }
 
 static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {
@@ -2416,6 +2444,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    }
 
    device->vk.command_buffer_ops = &tu_cmd_buffer_ops;
+   device->vk.as_build_ops = &tu_as_build_ops;
    device->vk.check_status = tu_device_check_status;
    device->vk.get_timestamp = tu_device_get_timestamp;
 
@@ -2479,6 +2508,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
       goto fail_queues;
 
+   util_sparse_array_init(&device->accel_struct_ranges, sizeof(VkDeviceSize), 256);
+
+   mtx_init(&device->radix_sort_mutex, mtx_plain);
+
    {
       struct ir3_compiler_options ir3_options = {
          .push_ubo_with_preamble = true,
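Two pieces of device state for the AS path are created here.
accel_struct_ranges backs the descriptor-buffer case seen earlier:
tu_GetDescriptorEXT only receives a raw AS device address, so the driver
records each structure's size in a sparse array keyed by that address (one
VkDeviceSize per element, allocated in blocks of 256). radix_sort_mutex
guards the radix_sort instance added to struct tu_device further down, which
is presumably created lazily by the BVH build path under this lock.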
@@ -2555,6 +2588,15 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    device->global_bo_map = global;
 
    tu_init_clear_blit_shaders(device);
+   if (device->vk.enabled_features.accelerationStructure &&
+       device->vk.enabled_features.nullDescriptor) {
+      result = tu_init_null_accel_struct(device);
+      if (result != VK_SUCCESS) {
+         vk_startup_errorf(device->instance, result, "null acceleration struct");
+         goto fail_null_accel_struct;
+      }
+   }
+
    result = tu_init_empty_shaders(device);
    if (result != VK_SUCCESS) {
       vk_startup_errorf(device->instance, result, "empty shaders");
@@ -2738,6 +2780,9 @@ fail_pipeline_cache:
 fail_dynamic_rendering:
    tu_destroy_empty_shaders(device);
 fail_empty_shaders:
+   if (device->null_accel_struct_bo)
+      tu_bo_finish(device, device->null_accel_struct_bo);
+fail_null_accel_struct:
    tu_destroy_clear_blit_shaders(device);
 fail_global_bo_map:
    TU_RMV(resource_destroy, device, device->global_bo);
@@ -2752,6 +2797,7 @@ fail_free_zombie_vma:
    u_vector_finish(&device->zombie_vmas);
    ir3_compiler_destroy(device->compiler);
 fail_compiler:
+   util_sparse_array_finish(&device->accel_struct_ranges);
    vk_meta_device_finish(&device->vk, &device->meta);
 fail_queues:
    for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
@@ -2804,6 +2850,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    vk_meta_device_finish(&device->vk, &device->meta);
 
+   util_sparse_array_finish(&device->accel_struct_ranges);
+
    ir3_compiler_destroy(device->compiler);
 
    vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc);
@@ -2832,6 +2880,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    tu_bo_finish(device, device->global_bo);
 
+   if (device->null_accel_struct_bo)
+      tu_bo_finish(device, device->null_accel_struct_bo);
+
    for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
       for (unsigned q = 0; q < device->queue_count[i]; q++)
          tu_queue_finish(&device->queues[i][q]);
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index d736080727a..00a9ce9d339 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -22,6 +22,8 @@
 #include "tu_suballoc.h"
 #include "tu_util.h"
 
+#include "radix_sort/radix_sort_vk.h"
+
 #include "common/freedreno_rd_output.h"
 #include "util/vma.h"
 #include "util/u_vector.h"
@@ -290,6 +292,11 @@ struct tu_device
 
    struct vk_meta_device meta;
 
+   radix_sort_vk_t *radix_sort;
+   mtx_t radix_sort_mutex;
+
+   struct util_sparse_array accel_struct_ranges;
+
 #define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
 
    /* Currently the kernel driver uses a 32-bit GPU address space, but it
@@ -306,6 +313,8 @@ struct tu_device
    struct tu_bo *global_bo;
    struct tu6_global *global_bo_map;
 
+   struct tu_bo *null_accel_struct_bo;
+
    uint32_t implicit_sync_bo_count;
 
    /* Device-global BO suballocator for reducing BO management overhead for
diff --git a/src/freedreno/vulkan/tu_formats.cc b/src/freedreno/vulkan/tu_formats.cc
index fb5d8c15bd4..cb134a979c6 100644
--- a/src/freedreno/vulkan/tu_formats.cc
+++ b/src/freedreno/vulkan/tu_formats.cc
@@ -12,6 +12,7 @@
 #include "vk_android.h"
 #include "vk_enum_defines.h"
 #include "vk_util.h"
+#include "vk_acceleration_structure.h"
 
 #include "drm-uapi/drm_fourcc.h"
 
 #include "tu_android.h"
@@ -270,6 +271,9 @@ tu_physical_device_get_format_properties(
    if (vk_format == VK_FORMAT_R8_UINT)
       optimal |= VK_FORMAT_FEATURE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
 
+   if (vk_acceleration_struct_vtx_format_supported(vk_format))
+      buffer |= VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
+
 end:
    out_properties->linearTilingFeatures = linear;
    out_properties->optimalTilingFeatures = optimal;
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index 14553e198ac..213024dc411 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -79,6 +79,7 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
    case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
    case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+   case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
       /* IBO-backed resources only need one packet for all graphics stages */
       if (stage_count)
          count += 1;
@@ -174,7 +175,8 @@ tu6_emit_load_state(struct tu_device *device,
          FALLTHROUGH;
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
       case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
-      case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
+      case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+      case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
         unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
         /* IBO-backed resources only need one packet for all graphics stages */
         if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc
index 824e4cd3635..ff9463bc136 100644
--- a/src/freedreno/vulkan/tu_query_pool.cc
+++ b/src/freedreno/vulkan/tu_query_pool.cc
@@ -13,9 +13,11 @@
 #include "nir/nir_builder.h"
 #include "util/os_time.h"
 
+#include "vk_acceleration_structure.h"
 #include "vk_util.h"
 
 #include "tu_buffer.h"
+#include "bvh/tu_build_interface.h"
 #include "tu_cmd_buffer.h"
 #include "tu_cs.h"
 #include "tu_device.h"
@@ -92,6 +94,11 @@ struct PACKED primitives_generated_query_slot {
    uint64_t end;
 };
 
+struct PACKED accel_struct_slot {
+   struct query_slot common;
+   uint64_t result;
+};
+
 /* Returns the IOVA or mapped address of a given uint64_t field
  * in a given slot of a query pool. */
 #define query_iova(type, pool, query, field)                         \
@@ -251,6 +258,12 @@ tu_CreateQueryPool(VkDevice _device,
       pool_size += sizeof(struct tu_perf_query_data) *
                    perf_query_info->counterIndexCount;
       break;
    }
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+      slot_size = sizeof(struct accel_struct_slot);
+      break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
       slot_size = sizeof(struct pipeline_stat_query_slot);
@@ -366,6 +379,10 @@ get_result_count(struct tu_query_pool *pool)
    case VK_QUERY_TYPE_OCCLUSION:
    case VK_QUERY_TYPE_TIMESTAMP:
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
       return 1;
    /* Transform feedback queries write two integer values */
    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
@@ -586,6 +603,10 @@ tu_GetQueryPoolResults(VkDevice _device,
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
       return get_query_pool_results(device, pool, firstQuery, queryCount,
                                     dataSize, pData, stride, flags);
    default:
@@ -735,6 +756,10 @@ tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
       return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                           queryCount, buffer, dstOffset,
                                           stride, flags);
@@ -802,6 +827,10 @@ tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
       emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
       break;
    default:
@@ -1710,6 +1739,56 @@ tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
    handle_multiview_queries(cmd, pool, query);
 }
 
+VKAPI_ATTR void VKAPI_CALL
+tu_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer,
+                                               uint32_t accelerationStructureCount,
+                                               const VkAccelerationStructureKHR *pAccelerationStructures,
+                                               VkQueryType queryType,
+                                               VkQueryPool queryPool,
+                                               uint32_t firstQuery)
+{
+   VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+   VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
+
+   struct tu_cs *cs = &cmd->cs;
+
+   /* Flush any AS builds */
+   tu_emit_cache_flush(cmd);
+
+   for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
+      uint32_t query = i + firstQuery;
+
+      VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pAccelerationStructures[i]);
+      uint64_t va = vk_acceleration_structure_get_va(accel_struct);
+
+      switch (queryType) {
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+         va += offsetof(struct tu_accel_struct_header, compacted_size);
+         break;
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+         va += offsetof(struct tu_accel_struct_header, serialization_size);
+         break;
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+         va += offsetof(struct tu_accel_struct_header, instance_count);
+         break;
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+         va += offsetof(struct tu_accel_struct_header, size);
+         break;
+      default:
+         unreachable("Unhandled accel struct query type.");
+      }
+
+      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
+      tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE);
+      tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
+      tu_cs_emit_qw(cs, va);
+
+      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
+      tu_cs_emit_qw(cs, query_available_iova(pool, query));
+      tu_cs_emit_qw(cs, 0x1);
+   }
+}
+
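Driver-side, each query is thus one CP_MEM_TO_MEM copying a 64-bit header
field into the slot's result, followed by marking the slot available. The
app-side flow this serves looks like the following sketch (core
VK_KHR_acceleration_structure API; error handling omitted, and the AS must
have been built with ALLOW_COMPACTION for the compacted-size query):

   VkQueryPoolCreateInfo pool_info = {
      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
      .queryType = VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR,
      .queryCount = 1,
   };
   VkQueryPool query_pool;
   vkCreateQueryPool(device, &pool_info, NULL, &query_pool);

   /* In a command buffer, after the AS build has been made visible: */
   vkCmdResetQueryPool(cmd, query_pool, 0, 1);
   vkCmdWriteAccelerationStructuresPropertiesKHR(
      cmd, 1, &as, VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR,
      query_pool, 0);

   /* After submitting and waiting: */
   VkDeviceSize compacted_size;
   vkGetQueryPoolResults(device, query_pool, 0, 1, sizeof(compacted_size),
                         &compacted_size, sizeof(compacted_size),
                         VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);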
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
    VkPhysicalDevice physicalDevice,
diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc
index b96de287990..aa405f61edd 100644
--- a/src/freedreno/vulkan/tu_util.cc
+++ b/src/freedreno/vulkan/tu_util.cc
@@ -45,6 +45,7 @@ static const struct debug_control tu_debug_options[] = {
    { "hiprio", TU_DEBUG_HIPRIO },
    { "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES },
    { "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
+   { "dumpas", TU_DEBUG_DUMPAS },
    { NULL, 0 }
 };
 
diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h
index 0910acc183e..3bd037ac1f9 100644
--- a/src/freedreno/vulkan/tu_util.h
+++ b/src/freedreno/vulkan/tu_util.h
@@ -51,6 +51,7 @@ enum tu_debug_flags
    TU_DEBUG_HIPRIO = 1 << 26,
    TU_DEBUG_NO_CONCURRENT_RESOLVES = 1 << 27,
    TU_DEBUG_NO_CONCURRENT_UNRESOLVES = 1 << 28,
+   TU_DEBUG_DUMPAS = 1 << 29,
 };
 
 struct tu_env {
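A note on exercising the new flag: tu_debug_options is parsed from the
TU_DEBUG environment variable (turnip's usual debug_control path), so,
assuming the standard comma-separated TU_DEBUG parsing, a finished
acceleration structure gets dumped at destruction time with e.g.:

   TU_DEBUG=dumpas <vulkan-application>

dump_as() then prints the AS header followed by each BVH node (a hexdump plus
the decoded bounds), which is what the dump helpers earlier in this patch
implement.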