diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h index 4f9d418319b..a63a534d349 100644 --- a/src/amd/vulkan/bvh/build_helpers.h +++ b/src/amd/vulkan/bvh/build_helpers.h @@ -21,6 +21,7 @@ TYPE(radv_gfx12_box_node, 4); TYPE(radv_gfx12_instance_node, 8); TYPE(radv_gfx12_instance_node_user_data, 4); TYPE(radv_gfx12_primitive_node, 4); +TYPE(radv_triangle_encode_task, 4); uint32_t id_to_offset(uint32_t id) diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h index 22e76c3f181..861d59eb0bb 100644 --- a/src/amd/vulkan/bvh/build_interface.h +++ b/src/amd/vulkan/bvh/build_interface.h @@ -18,13 +18,15 @@ #define VOID_REF uint64_t #endif -#define RADV_BUILD_FLAG_COMPACT (1u << (VK_BUILD_FLAG_COUNT + 0)) -#define RADV_BUILD_FLAG_BVH8 (1u << (VK_BUILD_FLAG_COUNT + 1)) -#define RADV_BUILD_FLAG_UPDATE_IN_PLACE (1u << (VK_BUILD_FLAG_COUNT + 2)) -#define RADV_BUILD_FLAG_NO_INFS (1u << (VK_BUILD_FLAG_COUNT + 3)) -#define RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS (1u << (VK_BUILD_FLAG_COUNT + 4)) -#define RADV_BUILD_FLAG_UPDATE_SINGLE_GEOMETRY (1u << (VK_BUILD_FLAG_COUNT + 5)) -#define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6)) +#define RADV_BUILD_FLAG_COMPACT (1u << (VK_BUILD_FLAG_COUNT + 0)) +#define RADV_BUILD_FLAG_BVH8 (1u << (VK_BUILD_FLAG_COUNT + 1)) +#define RADV_BUILD_FLAG_UPDATE_IN_PLACE (1u << (VK_BUILD_FLAG_COUNT + 2)) +#define RADV_BUILD_FLAG_NO_INFS (1u << (VK_BUILD_FLAG_COUNT + 3)) +#define RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS (1u << (VK_BUILD_FLAG_COUNT + 4)) +#define RADV_BUILD_FLAG_UPDATE_SINGLE_GEOMETRY (1u << (VK_BUILD_FLAG_COUNT + 5)) +#define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6)) +#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 7)) +#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 8)) #define RADV_COPY_MODE_COPY 0 #define RADV_COPY_MODE_SERIALIZE 1 @@ -55,6 +57,15 @@ struct encode_gfx12_args { uint32_t geometry_type; }; +struct encode_triangles_gfx12_args { + VOID_REF intermediate_bvh; + VOID_REF output_base; + REF(vk_ir_header) header; + uint32_t output_bvh_offset; + uint32_t leaf_node_offsets_offset; + uint32_t batches_size; +}; + struct header_args { REF(vk_ir_header) src; REF(radv_accel_struct_header) dst; @@ -84,4 +95,11 @@ struct update_gfx12_args { vk_bvh_geometry_data geom_data0; }; +#define RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_X 0 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Y 1 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Z 2 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X 3 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Y 4 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Z 5 + #endif /* BUILD_INTERFACE_H */ diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h index 54e4d424a91..f6e867df6bb 100644 --- a/src/amd/vulkan/bvh/bvh.h +++ b/src/amd/vulkan/bvh/bvh.h @@ -195,4 +195,13 @@ struct radv_gfx12_primitive_node { uint32_t dwords[32]; }; +#define RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT 16 +#define RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT 8 + +struct radv_triangle_encode_task { + uint32_t parent_offset; + /* The pair index is stored in the 4 high bits and the node index is stored in the low bits. 
*/ + uint32_t pair_index_node_index[RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT]; +}; + #endif /* BVH_H */ diff --git a/src/amd/vulkan/bvh/encode_gfx12.comp b/src/amd/vulkan/bvh/encode_gfx12.comp index c59548d0647..0ee77dd9545 100644 --- a/src/amd/vulkan/bvh/encode_gfx12.comp +++ b/src/amd/vulkan/bvh/encode_gfx12.comp @@ -91,7 +91,8 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern right = RADV_BVH_INVALID_NODE; } else if (right != RADV_BVH_INVALID_NODE && ir_id_to_type(left) == vk_ir_node_triangle && ir_id_to_type(right) == vk_ir_node_triangle && - VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES)) { + (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) || + VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES))) { second_child = right; right = RADV_BVH_INVALID_NODE; } @@ -100,7 +101,8 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern } right = radv_read_invocation(cluster, collapse_index, right); - if (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES)) { + if (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) || + VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) { bool is_valid_triangle = child != RADV_BVH_INVALID_NODE && ir_id_to_type(child) == vk_ir_node_triangle; uint32_t right_pair_mask = radv_ballot(cluster, is_valid_triangle && second_child == RADV_BVH_INVALID_NODE && @@ -142,7 +144,8 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern uint32_t dst_leaf_offset; uint32_t dst_internal_offset; if (cluster.invocation_index == 0) { - dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size); + if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) + dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size); dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size); } dst_leaf_offset = radv_read_invocation(cluster, 0, dst_leaf_offset); @@ -170,7 +173,17 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000); uvec3 extent_exponents = floatBitsToUint(extent) >> 23; - uint32_t valid_child_count = child_leaf_node_count + child_internal_node_count; + uint32_t valid_child_count = child_internal_node_count; + + uint32_t output_valid_child_count = valid_child_count; + /* Do not include triangle nodes if RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES because + * the count can only be computed by the encode pass. 
+ */ + if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) + output_valid_child_count += child_leaf_node_count; + + valid_child_count += child_leaf_node_count; + if (cluster.invocation_index == 0) { DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0); DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0); @@ -178,7 +191,7 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern DEREF(dst).parent_id = RADV_BVH_INVALID_NODE; DEREF(dst).origin = origin; DEREF(dst).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) | - (extent_exponents.z << 16) | ((valid_child_count - 1) << 28); + (extent_exponents.z << 16) | ((output_valid_child_count - 1) << 28); DEREF(dst).obb_matrix_index = 0x7f; } @@ -199,6 +212,39 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern REF(radv_gfx12_box_node) child_box = REF(radv_gfx12_box_node)(dst_child_addr); DEREF(child_box).parent_id = node_id; + } else if (VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) { + /* We try to encode 16 (RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT) triangles into a single node. */ + uint32_t batch_aligned_triangle_index; + if (cluster.invocation_index == radv_first_active_invocation(cluster)) { + /* Each invocation will encode a triangle pair. */ + batch_aligned_triangle_index = + atomicAdd(DEREF(args.header).driver_internal[0], RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + } + batch_aligned_triangle_index = + radv_read_invocation(cluster, radv_first_active_invocation(cluster), batch_aligned_triangle_index); + + VOID_REF triangle_tasks = OFFSET(args.header, SIZEOF(vk_ir_header)); + REF(radv_triangle_encode_task) task = + INDEX(radv_triangle_encode_task, triangle_tasks, + batch_aligned_triangle_index / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + if (cluster.invocation_index == radv_first_active_invocation(cluster)) + DEREF(task).parent_offset = bvh_offset; + + uint32_t triangle_pair_index = child_index - child_internal_node_count; + + DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 0] = + (child_index << 28) | (ir_id_to_offset(child) / ir_leaf_node_size); + if (second_child != RADV_BVH_INVALID_NODE) { + DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 1] = + (child_index << 28) | (ir_id_to_offset(second_child) / ir_leaf_node_size); + } else { + DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 1] = RADV_BVH_INVALID_NODE; + } + + if (child_leaf_node_count < RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT && + cluster.invocation_index == radv_first_active_invocation(cluster)) + DEREF(task).pair_index_node_index[child_leaf_node_count * 2] = RADV_BVH_INVALID_NODE; } else { if (VK_BUILD_FLAG(RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS)) { /* Write leaf node offset. 
*/ diff --git a/src/amd/vulkan/bvh/encode_triangles_gfx12.comp b/src/amd/vulkan/bvh/encode_triangles_gfx12.comp new file mode 100644 index 00000000000..b28169ff7d0 --- /dev/null +++ b/src/amd/vulkan/bvh/encode_triangles_gfx12.comp @@ -0,0 +1,692 @@ +/* + * Copyright © 2025 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_KHR_memory_scope_semantics : require +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_clustered : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +#define GFX12 +#define USE_GLOBAL_SYNC + +#include "vk_debug.h" + +#include "build_helpers.h" +#include "build_interface.h" +#include "encode.h" +#include "invocation_cluster.h" + +layout(push_constant) uniform CONSTS +{ + encode_triangles_gfx12_args args; +}; + +#define UNASSIGNED_VERTEX_INDICES 0xfffffffffffful + +void +main() +{ + bool is_retry = VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY); + + uint32_t global_id = gl_GlobalInvocationID.x; + + /* Each invocation cluster handles one task. */ + radv_invocation_cluster cluster; + radv_invocation_cluster_init(cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + uint32_t task_index = global_id / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT; + if (is_retry) { + VOID_REF retry_indices = OFFSET(args.header, SIZEOF(vk_ir_header) + args.batches_size); + task_index = DEREF(INDEX(uint32_t, retry_indices, task_index)); + } + + VOID_REF triangle_tasks = OFFSET(args.header, SIZEOF(vk_ir_header)); + REF(radv_triangle_encode_task) task = INDEX(radv_triangle_encode_task, triangle_tasks, task_index); + uint32_t pair_index_node_index0 = DEREF(task).pair_index_node_index[cluster.invocation_index * 2]; + uint32_t pair_index_node_index1 = DEREF(task).pair_index_node_index[cluster.invocation_index * 2 + 1]; + + uint32_t total_pair_count = min(findLSB(radv_ballot(cluster, pair_index_node_index0 == RADV_BVH_INVALID_NODE)), 8u); + + if (cluster.invocation_index >= total_pair_count) + return; + + uint32_t leaf_index0 = pair_index_node_index0 & 0x0fffffff; + vk_ir_triangle_node node0 = DEREF(INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index0)); + + uint32_t triangle_id0 = node0.triangle_id; + uint32_t geometry_id0 = node0.geometry_id_and_flags & 0xffffff; + bool opaque0 = (node0.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0; + uint32_t triangle_id1 = triangle_id0; + uint32_t geometry_id1 = geometry_id0; + bool opaque1 = false; + + vec3 vertices[6]; + vertices[0] = vec3(node0.coords[0][0], node0.coords[0][1], node0.coords[0][2]); + vertices[1] = vec3(node0.coords[1][0], node0.coords[1][1], node0.coords[1][2]); + vertices[2] = vec3(node0.coords[2][0], node0.coords[2][1], node0.coords[2][2]); + + uint32_t pair_vertex_indices = 0x210; + + uint32_t pair_size = 1; + if (pair_index_node_index1 != RADV_BVH_INVALID_NODE) { + pair_size = 2; + 
+      uint32_t leaf_index1 = pair_index_node_index1 & 0x0fffffff;
+      vk_ir_triangle_node node1 = DEREF(INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index1));
+
+      triangle_id1 = node1.triangle_id;
+      geometry_id1 = node1.geometry_id_and_flags & 0xffffff;
+      opaque1 = (node1.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0;
+
+      vertices[3] = vec3(node1.coords[0][0], node1.coords[0][1], node1.coords[0][2]);
+      vertices[4] = vec3(node1.coords[1][0], node1.coords[1][1], node1.coords[1][2]);
+      vertices[5] = vec3(node1.coords[2][0], node1.coords[2][1], node1.coords[2][2]);
+
+      pair_vertex_indices = 0x543210;
+
+      /* Deduplicate vertices here so it does not have to be done during the compression loop. */
+      for (uint32_t i = 0; i < 3; i++) {
+         for (uint32_t j = 0; j < 3; j++) {
+            if (vertices[3 + i] == vertices[j]) {
+               uint32_t bit_offset = (i + 3) * 4;
+               uint32_t clear_mask = ~(0xf << bit_offset);
+               pair_vertex_indices = (pair_vertex_indices & clear_mask) | (j << bit_offset);
+               break;
+            }
+         }
+      }
+   }
+
+   /* Encode inside a loop. Every active invocation tries to compress with the previously chosen
+    * nodes. The invocation with the smallest node size is chosen. TODO: Are there better heuristics?
+    * If there are no new candidates because the node would be too large, encode the previously chosen nodes
+    * and break out of the loop. In this case the first active invocation is chosen.
+    */
+
+   /* Each vertex is described by 8 bits. The highest 4 contain the invocation index and the low 4 bits contain the
+    * array index.
+    */
+   uint64_t vertex_indices = UNASSIGNED_VERTEX_INDICES;
+
+   bool vertex_used[6] = {false, false, false, false, false, false};
+
+   uint32_t hw_node_index = 0;
+   uvec3 encode_vertex_payload_bit_size;
+   uint32_t encode_trailing_zero_bits;
+   uint32_t encode_geometry_id_base_bit_size;
+   uint32_t encode_geometry_id_payload_bit_size;
+   uint32_t encode_triangle_id_base_bit_size;
+   uint32_t encode_triangle_id_payload_bit_size;
+   uint32_t encode_indices_midpoint;
+
+   uint32_t invocation_vertex_count = pair_index_node_index1 != RADV_BVH_INVALID_NODE ? 6 : 3;
+
+   while (true) {
+      /* assigned is true for every invocation whose triangles are already part of the node. */
+      bool assigned = vertex_indices != UNASSIGNED_VERTEX_INDICES;
+      uint32_t assigned_mask = radv_ballot(cluster, assigned);
+      uint32_t first_assigned_invocation = findLSB(assigned_mask);
+      uint32_t last_assigned_invocation = assigned_mask != 0 ? findMSB(assigned_mask) : 0;
+
+      if (!assigned)
+         vertex_indices = 0;
+
+      bool found[6] = {false, false, false, false, false, false};
+
+      /* At this point vertex_used is only set for assigned invocations since the rejected candidate invocations are
+       * reset.
+ */ + uint32_t vertex_count = 0; + for (uint32_t i = 0; i < 6; i++) + vertex_count += bitCount(radv_ballot(cluster, vertex_used[i])); + + for (uint32_t target_invocation = first_assigned_invocation; target_invocation <= last_assigned_invocation; + target_invocation++) { + + if (((assigned_mask >> target_invocation) & 1) == 0) + continue; + + vec3 target_vertices[6]; + bool target_vertex_used[6]; + for (uint32_t i = 0; i < 6; i++) { + target_vertices[i] = radv_read_invocation(cluster, target_invocation, vertices[i]); + target_vertex_used[i] = radv_read_invocation(cluster, target_invocation, vertex_used[i]); + } + + uint32_t target_vertex_count = radv_read_invocation(cluster, target_invocation, invocation_vertex_count); + + if (!assigned) { + for (uint32_t candidate_vertex_index = 0; candidate_vertex_index < invocation_vertex_count; + candidate_vertex_index++) { + if (found[candidate_vertex_index]) + continue; + + uint32_t assign_index = 0; + + for (uint32_t target_vertex_index = 0; target_vertex_index < target_vertex_count; + target_vertex_index++) { + if (target_vertex_used[target_vertex_index] && + target_vertices[target_vertex_index] == vertices[candidate_vertex_index]) { + found[candidate_vertex_index] = true; + assign_index = target_vertex_index; + } + } + + if (found[candidate_vertex_index]) + vertex_indices |= uint64_t((target_invocation << 4) + assign_index) + << uint64_t(candidate_vertex_index * 8); + } + } + } + + /* Handle the remaining vertices that are not already present in the assigned invocations. */ + if (!assigned) { + for (uint32_t i = 0; i < invocation_vertex_count; i++) { + if (found[i]) + continue; + + uint32_t pair_vertex_index = (pair_vertex_indices >> (i * 4)) & 0xf; + if (pair_vertex_index == i) { + vertex_indices |= uint64_t((cluster.invocation_index << 4) + i) << uint64_t(i * 8); + vertex_used[i] = true; + vertex_count++; + } else { + uint64_t vertex_index = (vertex_indices >> uint64_t(pair_vertex_index * 8)) & 0xff; + vertex_indices |= vertex_index << uint64_t(i * 8); + } + } + } + + /* Compute the node layout and size. For assigned invocations, the values contain information about the node with + * only the assigned triangles and for !assigned invocations, the current invocation is included. + */ + + uint32_t triangle_id_base_bit_size; + uint32_t triangle_id_payload_bit_size; + uint32_t geometry_id_base_bit_size; + uint32_t geometry_id_payload_bit_size; + for (uint32_t i = 0; i <= first_assigned_invocation; i++) { + /* Determine the number of bits required to represent the node ids in the hw's encoding format. + * Base and "offset" are masked and OR'd together, so look at the highest-ordered differing bit. + */ + uint32_t triangle_id_base = radv_read_invocation(cluster, i, triangle_id0); + triangle_id_base_bit_size = findMSB(triangle_id_base) + 1; + uint32_t invoc_triangle_id_payload_bit_size = + max(findMSB(triangle_id0 ^ triangle_id_base), findMSB(triangle_id1 ^ triangle_id_base)) + 1; + triangle_id_payload_bit_size = subgroupClusteredMax(assigned ? invoc_triangle_id_payload_bit_size : 0, + RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + uint32_t geometry_id_base = radv_read_invocation(cluster, i, geometry_id0); + geometry_id_base_bit_size = align(findMSB(geometry_id_base) + 1, 2); + uint32_t invoc_geometry_id_payload_bit_size = + max(findMSB(geometry_id0 ^ geometry_id_base), findMSB(geometry_id1 ^ geometry_id_base)) + 1; + geometry_id_payload_bit_size = subgroupClusteredMax(assigned ? 
invoc_geometry_id_payload_bit_size : 0, + RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + if (!assigned) { + triangle_id_payload_bit_size = max(triangle_id_payload_bit_size, invoc_triangle_id_payload_bit_size); + geometry_id_payload_bit_size = max(geometry_id_payload_bit_size, invoc_geometry_id_payload_bit_size); + } + + if (cluster.invocation_index <= i) + break; + } + + geometry_id_payload_bit_size = align(geometry_id_payload_bit_size, 2); + + /* vertex_used[0] is guaranteed to be true for at least one invocation. */ + uvec3 vertex_prefix = first_assigned_invocation == 0xffffffff + ? floatBitsToUint(vertices[0]) + : radv_read_invocation(cluster, first_assigned_invocation, floatBitsToUint(vertices[0])); + uvec3 vertex_payload_mask = uvec3(0); + uint32_t vertex_non_zero_mask = 0; + for (uint32_t i = 0; i < invocation_vertex_count; i++) { + vertex_payload_mask |= vertex_prefix ^ floatBitsToUint(vertices[i]); + vertex_non_zero_mask |= + floatBitsToUint(vertices[i].x) | floatBitsToUint(vertices[i].y) | floatBitsToUint(vertices[i].z); + } + + uint32_t trailing_zero_bits = min(findLSB(vertex_non_zero_mask), 32u); + uvec3 vertex_payload_bit_size = min(findMSB(vertex_payload_mask), 31u) + 1; + + if (!assigned) { + trailing_zero_bits = 32; + vertex_payload_bit_size = uvec3(0); + } + + trailing_zero_bits = subgroupClusteredMin(trailing_zero_bits, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + vertex_payload_bit_size = + subgroupClusteredMax(vertex_payload_bit_size, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + if (!assigned) { + trailing_zero_bits = min(trailing_zero_bits, min(findLSB(vertex_non_zero_mask), 32u)); + vertex_payload_bit_size = max(vertex_payload_bit_size, min(findMSB(vertex_payload_mask), 31u) + 1); + } + + vertex_payload_bit_size.x = + vertex_payload_bit_size.x > trailing_zero_bits ? vertex_payload_bit_size.x - trailing_zero_bits : 1; + vertex_payload_bit_size.y = + vertex_payload_bit_size.y > trailing_zero_bits ? vertex_payload_bit_size.y - trailing_zero_bits : 1; + vertex_payload_bit_size.z = + vertex_payload_bit_size.z > trailing_zero_bits ? vertex_payload_bit_size.z - trailing_zero_bits : 1; + + uvec3 vertex_base_bit_size = uvec3(32 - trailing_zero_bits) - vertex_payload_bit_size; + + uint32_t required_bit_size = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE; + + required_bit_size += vertex_base_bit_size.x + vertex_base_bit_size.y + vertex_base_bit_size.z; + required_bit_size += + vertex_count * (vertex_payload_bit_size.x + vertex_payload_bit_size.y + vertex_payload_bit_size.z); + + uint32_t pair_count = bitCount(assigned_mask); + if (!assigned) + pair_count++; + + required_bit_size += geometry_id_base_bit_size + (pair_count * 2 - 1) * geometry_id_payload_bit_size; + uint32_t indices_midpoint = required_bit_size; + required_bit_size += triangle_id_base_bit_size + (pair_count * 2 - 1) * triangle_id_payload_bit_size; + + uint32_t triangle_pair_descs_size = RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE * pair_count; + required_bit_size += triangle_pair_descs_size; + + if (vertex_count > 15) + required_bit_size = RADV_GFX12_BVH_NODE_SIZE * 8 + 1; + + /* This is only relevant for unassigned invocations. If every invocation is assigned, the 0xffffffff will force a + * final flush. + */ + uint32_t min_required_bit_size = + subgroupClusteredMin(assigned ? 0xffffffff : required_bit_size, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + /* The last iteration always needs to write the remaining triangles. 
*/ + if (min_required_bit_size > RADV_GFX12_BVH_NODE_SIZE * 8) { + if (assigned) { + encode_vertex_payload_bit_size = vertex_payload_bit_size; + encode_trailing_zero_bits = trailing_zero_bits; + encode_geometry_id_base_bit_size = geometry_id_base_bit_size; + encode_geometry_id_payload_bit_size = geometry_id_payload_bit_size; + encode_triangle_id_base_bit_size = triangle_id_base_bit_size; + encode_triangle_id_payload_bit_size = triangle_id_payload_bit_size; + encode_indices_midpoint = indices_midpoint; + break; + } else { + hw_node_index++; + + vertex_indices = UNASSIGNED_VERTEX_INDICES; + for (uint32_t i = 0; i < 6; i++) + vertex_used[i] = false; + } + } else { + uint32_t chosen_invocation = + findMSB(radv_ballot(cluster, !assigned && required_bit_size == min_required_bit_size)); + if (cluster.invocation_index != chosen_invocation && !assigned) { + vertex_indices = UNASSIGNED_VERTEX_INDICES; + for (uint32_t i = 0; i < 6; i++) + vertex_used[i] = false; + } + } + } + + uint32_t hw_node_count = subgroupClusteredMax(hw_node_index, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) + 1; + + uint32_t pair_index; + uint32_t pair_base_index = 0; + uint32_t pair_count; + uint32_t first_active_in_node; + uint32_t node_mask; + uint32_t node_invocations; + for (uint32_t i = 0; i < hw_node_count; i++) { + uint32_t current_node_mask = radv_ballot(cluster, hw_node_index == i); + if (hw_node_index == i) { + node_mask = current_node_mask; + pair_count = bitCount(node_mask); + first_active_in_node = findLSB(node_mask); + pair_index = bitCount(node_mask & ((1u << cluster.invocation_index) - 1)); + node_invocations = subgroupClusteredOr(cluster.invocation_index << (pair_index * 4), + RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + break; + } + pair_base_index += bitCount(current_node_mask); + } + + bool is_single_prim_node = pair_count == 1 && pair_index_node_index1 == RADV_BVH_INVALID_NODE; + + /* If there is a node that contains only one primitive, abort this encoding attempt and retry during a second pass + * which will pair such nodes. This needs a separate pass so that the allocated nodes of two batches can be + * guaranteed to be close enough since primitive nodes can only have small relative offsets. The retry pass has + * RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (is_retry) set. + */ + uint32_t single_prim_node_invoc = findLSB(radv_ballot(cluster, is_single_prim_node)); + bool has_single_prim_node = radv_ballot(cluster, is_single_prim_node) != 0; + if (!is_retry && has_single_prim_node) { + if (cluster.invocation_index == 0) { + uint32_t retry_base_invocation = + atomicAdd(DEREF(args.header).driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X], + RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + uint32_t retry_batch_index_index = retry_base_invocation / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT; + VOID_REF retry_indices = OFFSET(args.header, SIZEOF(vk_ir_header) + args.batches_size); + DEREF(INDEX(uint32_t, retry_indices, retry_batch_index_index)) = task_index; + } + + return; + } + + if (is_retry) { + /* Move the single primitive node to the end since it needs to offset into the next batch. 
*/
+      uint32_t single_prim_pair_base_index = radv_read_invocation(cluster, single_prim_node_invoc, pair_base_index);
+
+      if (pair_base_index > single_prim_pair_base_index)
+         pair_base_index--;
+      if (is_single_prim_node)
+         pair_base_index = total_pair_count - 1;
+   }
+
+   REF(radv_gfx12_box_node) parent_node =
+      REF(radv_gfx12_box_node)(args.output_base + args.output_bvh_offset + DEREF(task).parent_offset);
+   uint32_t first_leaf_child_index = (DEREF(parent_node).child_count_exponents >> 28) + 1;
+   if (first_leaf_child_index == 0x10)
+      first_leaf_child_index = 0;
+
+   /* Two batches are always combined into one during the retry pass if there is a second batch. The goal is to merge
+    * all primitive nodes with just one triangle (except one if there is an odd number of such nodes). Since the
+    * compression loop above can always merge at least two nodes, the following assumptions should hold:
+    *
+    * - There is at most one primitive node with only one triangle in a batch.
+    * - This primitive has the max hw_node_index in this batch.
+    *
+    * If there is a second batch, the first batch will allocate one less primitive node. This is the triangle that will
+    * be merged into the second batch which we know has the highest hw_node_index/dst_offset. The second batch starts in
+    * dst memory where the primitive that was removed from the first batch should have been. The merged triangle can be
+    * referenced in two different ways:
+    *
+    * - If the batch contains only one triangle, the primitive_base_id is changed to point at the merged node.
+    * - Otherwise the node size inside the child info before the moved triangle child info is set to skip ahead to the
+    *   merged primitive node in the second batch.
+    */
+
+   if (is_retry) {
+      assert(bitCount(radv_ballot(cluster, is_single_prim_node)) == 1,
+             "radv: encode_triangles_gfx12: There must be exactly one node with only one triangle.\n");
+      assert(!is_single_prim_node || hw_node_index == hw_node_count - 1,
+             "radv: encode_triangles_gfx12: The single triangle primitive node must be last.\n");
+   }
+
+   radv_invocation_cluster alloc_cluster = cluster;
+   uint32_t alloc_hw_node_count = hw_node_count;
+   bool has_second_batch = false;
+   bool jump_to_second_batch = false;
+   uint32_t single_prim_node_invocs[2];
+   if (is_retry) {
+      radv_invocation_cluster_init(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT * 2);
+
+      has_second_batch = (radv_ballot(alloc_cluster, true) >> RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) != 0;
+
+      single_prim_node_invocs[0] = radv_read_invocation(alloc_cluster, 0, single_prim_node_invoc);
+      single_prim_node_invocs[1] =
+         radv_read_invocation(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT, single_prim_node_invoc) +
+         RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
+
+      if (has_second_batch) {
+         alloc_hw_node_count =
+            radv_read_invocation(alloc_cluster, 0, hw_node_count) +
+            radv_read_invocation(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT, hw_node_count) - 1;
+
+         jump_to_second_batch = alloc_cluster.invocation_index < RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
+         if (is_single_prim_node) {
+            encode_vertex_payload_bit_size = uvec3(32);
+            encode_trailing_zero_bits = 0;
+            encode_geometry_id_base_bit_size = 24;
+            encode_geometry_id_payload_bit_size = 24;
+            encode_triangle_id_base_bit_size = 24;
+            encode_triangle_id_payload_bit_size = 24;
+            encode_indices_midpoint = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE + 32 * 9 * 2 + 24 * 2;
+
+            vertex_indices = 0;
+            for (uint32_t i = 0; i < 6; i++) {
+               vertex_used[i] = true;
+
vertex_indices |= uint64_t((cluster.invocation_index << 4) + i) << uint64_t(i * 8); + } + + vertices[3] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[0]); + vertices[4] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[1]); + vertices[5] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[2]); + + triangle_id1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], triangle_id0); + geometry_id1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], geometry_id0); + opaque1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], opaque0); + + /* Indicate that there is a second node. The actual value of pair_index_node_index1 is not used. */ + pair_index_node_index1 = 0; + } + } + } + + /* Allocate space for the primitive node. */ + uint32_t dst_offset; + if (cluster.invocation_index == 0) { + if (alloc_cluster.invocation_index == 0) + dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, alloc_hw_node_count * RADV_GFX12_BVH_NODE_SIZE); + dst_offset = radv_read_invocation(alloc_cluster, 0, dst_offset); + + if (alloc_cluster.invocation_index == RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) { + dst_offset += + radv_read_invocation(alloc_cluster, 0, hw_node_count) * RADV_GFX12_BVH_NODE_SIZE - RADV_GFX12_BVH_NODE_SIZE; + } + + DEREF(parent_node).primitive_base_id = pack_node_id(dst_offset, 0); + DEREF(parent_node).child_count_exponents = (DEREF(parent_node).child_count_exponents & 0x0fffffff) | + ((first_leaf_child_index + total_pair_count - 1) << 28); + } + dst_offset = radv_read_invocation(cluster, 0, dst_offset) + hw_node_index * RADV_GFX12_BVH_NODE_SIZE; + + uint32_t second_dst_offset = radv_read_invocation(alloc_cluster, single_prim_node_invocs[1], dst_offset); + bool rewrite_primitive_base_id = jump_to_second_batch && total_pair_count == 1; + if (rewrite_primitive_base_id) + DEREF(parent_node).primitive_base_id = pack_node_id(second_dst_offset, 0); + + radv_gfx12_box_child child = DEREF(parent_node).children[first_leaf_child_index + cluster.invocation_index]; + + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + + if (pair_index < pair_count - 1) + child.dword2 = child.dword2 & 0xffffff; + + uint32_t jump_size = radv_read_invocation(alloc_cluster, single_prim_node_invocs[1], dst_offset) - dst_offset; + if (jump_to_second_batch && !rewrite_primitive_base_id && pair_base_index + pair_index == (total_pair_count - 1) - 1) + child.dword2 = (child.dword2 & 0xffffff) | ((jump_size / RADV_GFX12_BVH_NODE_SIZE) << 28); + + /* Update the node type because it encodes the pair index which cannot be known in advance. + * The BVH8 encoding uses 4 bits for the type. The high bit is used to reference up to 8 pairs. + */ + child.dword2 |= ((pair_index & 0x3) << 24); + if (pair_index >= 4) + child.dword2 |= (8 << 24); + + DEREF(parent_node).children[first_leaf_child_index + pair_base_index + pair_index] = child; + + /* Return because the triangle is written by the second batch. 
*/ + if (is_single_prim_node && jump_to_second_batch) + return; + + VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_offset; + + bit_writer writer; + bit_writer_init(writer, dst_leaf_addr); + + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, encode_vertex_payload_bit_size.x - 1, 5); /* x_vertex_bits_minus_one */ + bit_writer_write(writer, encode_vertex_payload_bit_size.y - 1, 5); /* y_vertex_bits_minus_one */ + bit_writer_write(writer, encode_vertex_payload_bit_size.z - 1, 5); /* z_vertex_bits_minus_one */ + bit_writer_write(writer, encode_trailing_zero_bits, 5); /* trailing_zero_bits */ + bit_writer_write(writer, encode_geometry_id_base_bit_size / 2, 4); /* geometry_index_base_bits_div_2 */ + bit_writer_write(writer, encode_geometry_id_payload_bit_size / 2, 4); /* geometry_index_bits_div_2 */ + bit_writer_write(writer, pair_count - 1, 3); /* triangle_pair_count_minus_one */ + bit_writer_write(writer, 0, 1); /* vertex_type */ + bit_writer_write(writer, encode_triangle_id_base_bit_size, 5); /* primitive_index_base_bits */ + bit_writer_write(writer, encode_triangle_id_payload_bit_size, 5); /* primitive_index_bits */ + bit_writer_write(writer, encode_indices_midpoint, 10); + + uvec3 vertex_prefix = floatBitsToUint(vertices[0]); + uvec3 vertex_base_bit_size = uvec3(32 - encode_trailing_zero_bits) - encode_vertex_payload_bit_size; + if (vertex_base_bit_size.x > 0) { + bit_writer_write(writer, vertex_prefix.x >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.x), + vertex_base_bit_size.x); + } + if (vertex_base_bit_size.y > 0) { + bit_writer_write(writer, vertex_prefix.y >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.y), + vertex_base_bit_size.y); + } + if (vertex_base_bit_size.z > 0) { + bit_writer_write(writer, vertex_prefix.z >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.z), + vertex_base_bit_size.z); + } + } + + uint32_t vertex_used_mask[6]; + for (uint32_t processed_node_index = 0; processed_node_index < hw_node_count; processed_node_index++) { + if (processed_node_index != hw_node_index) + continue; + + for (uint32_t vertex_index = 0; vertex_index < 6; vertex_index++) + vertex_used_mask[vertex_index] = radv_ballot(cluster, vertex_used[vertex_index]); + } + + for (uint32_t vertex_index = 0; vertex_index < 6; vertex_index++) { + uvec3 vertex = floatBitsToUint(vertices[vertex_index]) >> encode_trailing_zero_bits; + vertex = vertex & uvec3((1ul << uint64_t(encode_vertex_payload_bit_size.x)) - 1, + (1ul << uint64_t(encode_vertex_payload_bit_size.y)) - 1, + (1ul << uint64_t(encode_vertex_payload_bit_size.z)) - 1); + + for (uint32_t i = 0; i < pair_count; i++) { + uint32_t invocation = (node_invocations >> (i * 4)) & 0xf; + if ((vertex_used_mask[vertex_index] & (1u << invocation)) == 0) + continue; + + uvec3 current_vertex = radv_read_invocation(cluster, invocation, vertex); + + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, current_vertex.x, encode_vertex_payload_bit_size.x); + bit_writer_write(writer, current_vertex.y, encode_vertex_payload_bit_size.y); + bit_writer_write(writer, current_vertex.z, encode_vertex_payload_bit_size.z); + } + } + } + + if (encode_geometry_id_payload_bit_size > 0) { + uint32_t geometry_id_payload_mask = + (encode_geometry_id_payload_bit_size == 32) ? 
0xffffffff : ((1u << encode_geometry_id_payload_bit_size) - 1); + uint32_t geometry_id_payloads[2] = { + geometry_id0 & geometry_id_payload_mask, + geometry_id1 & geometry_id_payload_mask, + }; + + for (int32_t i = int32_t(pair_count) - 1; i >= 0; i--) { + uint32_t invocation = (node_invocations >> (uint32_t(i) * 4)) & 0xf; + + uint32_t payload0 = radv_read_invocation(cluster, invocation, geometry_id_payloads[0]); + uint32_t payload1 = radv_read_invocation(cluster, invocation, geometry_id_payloads[1]); + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, payload1, encode_geometry_id_payload_bit_size); + if (invocation != first_active_in_node) + bit_writer_write(writer, payload0, encode_geometry_id_payload_bit_size); + } + } + } + + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, geometry_id0, encode_geometry_id_base_bit_size); + bit_writer_write(writer, triangle_id0, encode_triangle_id_base_bit_size); + } + + if (encode_triangle_id_payload_bit_size > 0) { + uint32_t triangle_id_payload_mask = + (encode_triangle_id_payload_bit_size == 32) ? 0xffffffff : ((1u << encode_triangle_id_payload_bit_size) - 1); + uint32_t triangle_id_payloads[2] = { + triangle_id0 & triangle_id_payload_mask, + triangle_id1 & triangle_id_payload_mask, + }; + + for (uint32_t i = 0; i < pair_count; i++) { + uint32_t invocation = (node_invocations >> (i * 4)) & 0xf; + + uint32_t payload0 = radv_read_invocation(cluster, invocation, triangle_id_payloads[0]); + uint32_t payload1 = radv_read_invocation(cluster, invocation, triangle_id_payloads[1]); + if (cluster.invocation_index == first_active_in_node) { + if (invocation != first_active_in_node) + bit_writer_write(writer, payload0, encode_triangle_id_payload_bit_size); + bit_writer_write(writer, payload1, encode_triangle_id_payload_bit_size); + } + } + } + + if (cluster.invocation_index == first_active_in_node) { + uint32_t triangle_pair_descs_size = RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE * pair_count; + uint32_t target = 32 * 32 - triangle_pair_descs_size; + uint32_t skip_count = target - writer.total_count; + if (skip_count <= 32) + bit_writer_write(writer, 0, skip_count); + else + bit_writer_skip_to(writer, target); + } + + uint32_t encoded_vertex_indices = 0; + for (uint32_t i = 0; i < 6; i++) { + uint32_t vertex_index = uint32_t((vertex_indices >> (i * 8)) & 0xff); + uint32_t invocation = vertex_index >> 4; + uint32_t array_index = vertex_index & 0xf; + + uint32_t encoded_index = bitCount(vertex_used_mask[array_index] & ((1u << invocation) - 1)); + for (uint32_t j = 0; j < 5; j++) { + if (array_index > j) { + encoded_index += bitCount(vertex_used_mask[j]); + } + } + + encoded_vertex_indices |= (encoded_index << (i * 4)); + } + + for (int32_t i = int32_t(pair_count) - 1; i >= 0; i--) { + uint32_t invocation = (node_invocations >> (uint32_t(i) * 4)) & 0xf; + + bool has_second_triangle = + radv_read_invocation(cluster, invocation, pair_index_node_index1 != RADV_BVH_INVALID_NODE); + bool current_opaque0 = radv_read_invocation(cluster, invocation, opaque0); + bool current_opaque1 = radv_read_invocation(cluster, invocation, opaque1); + uint32_t current_encoded_vertex_indices = radv_read_invocation(cluster, invocation, encoded_vertex_indices); + + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, 1, 1); /* prim_range_stop */ + bit_writer_write(writer, 0, 1); /* tri1_double_sided */ + bit_writer_write(writer, (has_second_triangle && current_opaque1) ? 
1 : 0, 1); /* tri1_opaque */ + bit_writer_write(writer, has_second_triangle ? (current_encoded_vertex_indices >> 12) : 0, + 12); /* tri1_v0_index, tri1_v1_index, tri1_v2_index */ + bit_writer_write(writer, 0, 1); /* tri0_double_sided */ + bit_writer_write(writer, current_opaque0 ? 1 : 0, 1); /* tri0_opaque */ + bit_writer_write(writer, current_encoded_vertex_indices & 0xfff, + 12); /* tri0_v0_index, tri0_v1_index, tri0_v2_index */ + } + } + + if (cluster.invocation_index == first_active_in_node) + bit_writer_finish(writer); +} diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build index 654692cc31c..3320ef67428 100644 --- a/src/amd/vulkan/bvh/meson.build +++ b/src/amd/vulkan/bvh/meson.build @@ -14,6 +14,9 @@ bvh_shaders = [ [ 'encode_gfx12.comp', 'encode_gfx12', + ],[ + 'encode_triangles_gfx12.comp', + 'encode_triangles_gfx12', ], [ 'encode.comp', diff --git a/src/amd/vulkan/meta/radv_meta.h b/src/amd/vulkan/meta/radv_meta.h index 4401ff72c6c..715d31e87e6 100644 --- a/src/amd/vulkan/meta/radv_meta.h +++ b/src/amd/vulkan/meta/radv_meta.h @@ -120,6 +120,7 @@ enum radv_meta_object_key_type { RADV_META_OBJECT_KEY_BVH_COPY, RADV_META_OBJECT_KEY_BVH_COPY_BLAS_ADDRS_GFX12, RADV_META_OBJECT_KEY_BVH_ENCODE, + RADV_META_OBJECT_KEY_BVH_ENCODE_TRIANGLES_GFX12, RADV_META_OBJECT_KEY_BVH_UPDATE, RADV_META_OBJECT_KEY_BVH_HEADER, }; diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index 4f5cde9491f..3d8734e5068 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -33,6 +33,10 @@ static const uint32_t encode_gfx12_spv[] = { #include "bvh/encode_gfx12.spv.h" }; +static const uint32_t encode_triangles_gfx12_spv[] = { +#include "bvh/encode_triangles_gfx12.spv.h" +}; + static const uint32_t header_spv[] = { #include "bvh/header.spv.h" }; @@ -71,6 +75,7 @@ enum radv_encode_key_bits { RADV_ENCODE_KEY_COMPACT = (1 << 0), RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 1), RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 2), + RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 3), }; static void @@ -148,12 +153,16 @@ radv_get_acceleration_structure_layout(struct radv_device *device, /* root node */ offset += internal_node_size; - accel_struct->leaf_nodes_offset = offset; - offset += bvh_leaf_size * state->leaf_node_count; + if (!(state->config.encode_key[0] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)) { + accel_struct->leaf_nodes_offset = offset; + offset += bvh_leaf_size * state->leaf_node_count; + } accel_struct->internal_nodes_offset = offset; /* Factor out the root node. 
*/ offset += internal_node_size * (internal_count - 1); + if (state->config.encode_key[0] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) + offset += bvh_leaf_size * state->leaf_node_count; accel_struct->size = offset; } @@ -230,6 +239,23 @@ radv_get_as_size(VkDevice _device, const struct vk_acceleration_structure_build_ return accel_struct.size; } +static uint32_t +radv_get_triangle_batches_size(const struct vk_acceleration_structure_build_state *state) +{ + return state->leaf_node_count * sizeof(struct radv_triangle_encode_task); +} + +static VkDeviceSize +radv_get_encode_scratch_size(VkDevice _device, const struct vk_acceleration_structure_build_state *state) +{ + if (state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) { + uint32_t retry_batch_indices_size = state->leaf_node_count * sizeof(uint32_t); + return radv_get_triangle_batches_size(state) + retry_batch_indices_size; + } + + return 0; +} + static VkDeviceSize radv_get_update_scratch_size(VkDevice _device, const struct vk_acceleration_structure_build_state *state) { @@ -267,7 +293,7 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s if (!(state->build_info->flags & (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR | VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) && geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) - encode_key |= RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12; + encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; } if (state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) @@ -275,6 +301,7 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s state->config.encode_key[0] = encode_key; state->config.encode_key[1] = encode_key; + state->config.encode_key[2] = encode_key; uint32_t update_key = 0; if (state->build_info->srcAccelerationStructure == state->build_info->dstAccelerationStructure) @@ -351,6 +378,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key) flags |= RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS; if (key & RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12) flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES; + if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) + flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES; return flags; } @@ -438,11 +467,22 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const struct vk_acceleration }, .dst_node_offset = layout.internal_nodes_offset - layout.bvh_offset, .dst_leaf_node_offset = layout.leaf_nodes_offset - layout.bvh_offset, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_X] = 0, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Y] = 1, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Z] = 1, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X] = 0, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Y] = 1, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Z] = 1, }; + uint32_t header_update_size = + offsetof(struct vk_ir_header, driver_internal) - offsetof(struct vk_ir_header, sync_data); + if (state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) + header_update_size = sizeof(struct vk_ir_header) - offsetof(struct vk_ir_header, sync_data); + const uint8_t *update_data = ((const uint8_t *)&header + offsetof(struct vk_ir_header, sync_data)); radv_update_memory_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, sync_data), update_data, - sizeof(struct vk_ir_header) - offsetof(struct vk_ir_header, sync_data)); + header_update_size); if 
(radv_device_physical(device)->info.cp_sdma_ge_use_system_memory_scope) cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2; @@ -467,6 +507,118 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const struct vk_acceleration radv_compute_dispatch(cmd_buffer, &dispatch); } +static VkResult +radv_encode_triangles_bind_pipeline_gfx12(VkCommandBuffer commandBuffer, + const struct vk_acceleration_structure_build_state *state) +{ + bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + if (!compress_triangles) + return VK_SUCCESS; + + /* Wait for internal encoding to finish. */ + vk_barrier_compute_w_to_compute_r(commandBuffer); + + radv_bvh_build_bind_pipeline(commandBuffer, RADV_META_OBJECT_KEY_BVH_ENCODE_TRIANGLES_GFX12, + encode_triangles_gfx12_spv, sizeof(encode_triangles_gfx12_spv), + sizeof(struct encode_triangles_gfx12_args), 0); + + return VK_SUCCESS; +} + +static void +radv_encode_triangles_gfx12(VkCommandBuffer commandBuffer, const struct vk_acceleration_structure_build_state *state) +{ + bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + if (!compress_triangles) + return; + + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, dst, state->build_info->dstAccelerationStructure); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + uint64_t intermediate_header_addr = state->build_info->scratchData.deviceAddress + state->scratch.header_offset; + uint64_t intermediate_bvh_addr = state->build_info->scratchData.deviceAddress + state->scratch.ir_offset; + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, state, &layout); + + const struct encode_triangles_gfx12_args args = { + .intermediate_bvh = intermediate_bvh_addr, + .output_base = vk_acceleration_structure_get_va(dst), + .header = intermediate_header_addr, + .output_bvh_offset = layout.bvh_offset, + .leaf_node_offsets_offset = layout.leaf_node_offsets_offset, + .batches_size = radv_get_triangle_batches_size(state), + }; + radv_bvh_build_set_args(commandBuffer, &args, sizeof(args)); + + struct radv_dispatch_info dispatch = { + .unaligned = true, + .indirect_va = intermediate_header_addr + + offsetof(struct vk_ir_header, driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_X]), + }; + + radv_compute_dispatch(cmd_buffer, &dispatch); +} + +static VkResult +radv_encode_triangles_retry_bind_pipeline_gfx12(VkCommandBuffer commandBuffer, + const struct vk_acceleration_structure_build_state *state) +{ + bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + if (!compress_triangles) + return VK_SUCCESS; + + /* Wait for the first triangle compression pass to finish. 
*/ + vk_barrier_compute_w_to_compute_r(commandBuffer); + vk_barrier_compute_w_to_indirect_compute_r(commandBuffer); + + radv_bvh_build_bind_pipeline(commandBuffer, RADV_META_OBJECT_KEY_BVH_ENCODE_TRIANGLES_GFX12, + encode_triangles_gfx12_spv, sizeof(encode_triangles_gfx12_spv), + sizeof(struct encode_triangles_gfx12_args), + RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY); + + return VK_SUCCESS; +} + +static void +radv_encode_triangles_retry_gfx12(VkCommandBuffer commandBuffer, + const struct vk_acceleration_structure_build_state *state) +{ + bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + if (!compress_triangles) + return; + + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, dst, state->build_info->dstAccelerationStructure); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + uint64_t intermediate_header_addr = state->build_info->scratchData.deviceAddress + state->scratch.header_offset; + uint64_t intermediate_bvh_addr = state->build_info->scratchData.deviceAddress + state->scratch.ir_offset; + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, state, &layout); + + const struct encode_triangles_gfx12_args args = { + .intermediate_bvh = intermediate_bvh_addr, + .output_base = vk_acceleration_structure_get_va(dst), + .header = intermediate_header_addr, + .output_bvh_offset = layout.bvh_offset, + .leaf_node_offsets_offset = layout.leaf_node_offsets_offset, + .batches_size = radv_get_triangle_batches_size(state), + }; + radv_bvh_build_set_args(commandBuffer, &args, sizeof(args)); + + struct radv_dispatch_info dispatch = { + .unaligned = true, + .indirect_va = + intermediate_header_addr + + offsetof(struct vk_ir_header, driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X]), + }; + + radv_compute_dispatch(cmd_buffer, &dispatch); +} + static VkResult radv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, const struct vk_acceleration_structure_build_state *state) { @@ -806,20 +958,29 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) .get_build_config = radv_get_build_config, .get_as_size = radv_get_as_size, .get_update_scratch_size = radv_get_update_scratch_size, - .encode_bind_pipeline[1] = radv_init_header_bind_pipeline, - .encode_as[1] = radv_init_header, .init_update_scratch = radv_init_update_scratch, .update_bind_pipeline[0] = radv_update_bind_pipeline, }; if (radv_use_bvh8(pdev)) { device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as_gfx12; + device->meta_state.accel_struct_build.build_ops.get_encode_scratch_size = radv_get_encode_scratch_size; device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline_gfx12; device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[1] = + radv_encode_triangles_bind_pipeline_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_as[1] = radv_encode_triangles_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[2] = + radv_encode_triangles_retry_bind_pipeline_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_as[2] = radv_encode_triangles_retry_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[3] = radv_init_header_bind_pipeline; + device->meta_state.accel_struct_build.build_ops.encode_as[3] = radv_init_header; } 
else {
      device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as;
      device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline;
      device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as;
+     device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[1] = radv_init_header_bind_pipeline;
+     device->meta_state.accel_struct_build.build_ops.encode_as[1] = radv_init_header;
      device->meta_state.accel_struct_build.build_ops.leaf_spirv_override = leaf_spv;
      device->meta_state.accel_struct_build.build_ops.leaf_spirv_override_size = sizeof(leaf_spv);
   }
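
Reviewer note, not part of the patch: encode_gfx12.comp packs each radv_triangle_encode_task entry as the child/pair index within the parent box node in the high 4 bits and the IR triangle leaf index in the low 28 bits, and a partially filled task is terminated with RADV_BVH_INVALID_NODE, which encode_triangles_gfx12.comp detects via the findLSB(radv_ballot(...)) at the top of main(). Below is a minimal C sketch of that decoding, assuming RADV_BVH_INVALID_NODE is the all-ones sentinel from bvh.h; decode_task_entry and task_entry are hypothetical names used only for illustration.

   /* Illustrative sketch only: how one packed pair_index_node_index entry is interpreted. */
   #include <stdbool.h>
   #include <stdint.h>

   #define RADV_BVH_INVALID_NODE 0xffffffffu /* assumed all-ones sentinel, as in bvh.h */

   struct task_entry {
      bool valid;           /* false once the terminator entry is reached */
      uint32_t child_index; /* high 4 bits: child/pair index within the parent box node */
      uint32_t leaf_index;  /* low 28 bits: index of the vk_ir_triangle_node in the intermediate BVH */
   };

   static struct task_entry
   decode_task_entry(uint32_t packed)
   {
      struct task_entry e;
      e.valid = packed != RADV_BVH_INVALID_NODE;
      e.child_index = packed >> 28;
      e.leaf_index = packed & 0x0fffffff;
      return e;
   }

The producer stores ir_id_to_offset(child) / ir_leaf_node_size in the low bits, so the consumer can index the IR triangle array directly with leaf_index, as the INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index0) lookup in the shader does.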