mesa/src/amd/vulkan/bvh/encode_gfx12.comp
Konstantin Seurer d423554e9e radv/bvh: Pair compress triangles in more cases
Reviewed-by: Natalie Vock <natalie.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36965>
2025-10-21 19:32:55 +00:00

405 lines
18 KiB
Text

/*
* Copyright © 2022 Friedrich Vock
* Copyright © 2025 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#version 460
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
#define GFX12
#define USE_GLOBAL_SYNC
#include "build_helpers.h"
#include "build_interface.h"
#include "encode.h"
#include "invocation_cluster.h"
/* Push constants for this dispatch. The encode_gfx12_args type is declared outside this file
 * (presumably in one of the included headers). Fields read in this shader: header,
 * intermediate_bvh, output_base, output_bvh_offset, leaf_node_offsets_offset, leaf_node_count
 * and geometry_type.
 */
layout(push_constant) uniform CONSTS
{
encode_gfx12_args args;
};
/*
 * Encodes one IR internal node as a radv_gfx12_box_node and emits (or schedules) its leaf children.
 *
 * ir_leaf_node_size:           byte size of one IR leaf node; used to convert a leaf's byte offset
 *                              in the intermediate BVH into a leaf index.
 * intermediate_internal_nodes: start of the IR internal node array.
 * node_index:                  index of the IR internal node handled by the calling cluster.
 *
 * The node's output location is read from its bvh_offset field, which a parent's cluster writes
 * further down in this function. The function spins while that field is VK_UNKNOWN_BVH_OFFSET
 * (presumably the initial value set by an earlier pass) and emits no box node when it is
 * VK_NULL_BVH_OFFSET, which this function stores into children it collapses into their parent.
 * For the root node, lane 0 additionally fills in the radv_accel_struct_header at output_base.
 */
void
encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_internal_nodes, uint32_t node_index)
{
/* Each invocation cluster encodes one internal node. */
/* The cluster size of 8 matches the "8 invocations per node" mapping used by main() and the
 * cluster size passed to subgroupClusteredMax() below.
 */
radv_invocation_cluster cluster;
radv_invocation_cluster_init(cluster, 8);
REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, node_index);
/* Local snapshot of the IR node; bvh_offset is re-read from memory inside the loop instead. */
vk_ir_box_node src = DEREF(src_node);
/* The root is the last IR internal node. */
bool is_root_node = node_index == DEREF(args.header).ir_internal_node_count - 1;
/* Spin-wait loop: every path that does not `continue` ends in a `break`. */
for (;;) {
/* Make changes to the current node's BVH offset value visible. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
/* Only lane 0 loads the offset; it is then broadcast so the whole cluster takes the same path. */
uint32_t bvh_offset;
if (cluster.invocation_index == 0) {
bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
}
bvh_offset = radv_read_invocation(cluster, 0, bvh_offset);
/* The parent has not published this node's offset yet: retry. */
if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
continue;
/* This node was collapsed into its parent: there is nothing to encode. */
if (bvh_offset == VK_NULL_BVH_OFFSET)
break;
REF(radv_gfx12_box_node) dst = REF(radv_gfx12_box_node)(args.output_base + (args.output_bvh_offset + bvh_offset));
uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
/* Each lane holds at most one child slot. Lanes 0 and 1 start with the two IR children, all
 * other lanes start empty.
 */
uint32_t child = RADV_BVH_INVALID_NODE;
if (cluster.invocation_index < 2)
child = src.children[cluster.invocation_index];
/* Optional second triangle that shares this lane's slot with `child` (triangle pairing). */
uint32_t second_child = RADV_BVH_INVALID_NODE;
/* Collapse loop: repeatedly replace the internal child with the largest surface area by its
 * own two children, widening the node until no further collapse is possible.
 */
while (true) {
uint32_t valid_children = radv_ballot(cluster, child != RADV_BVH_INVALID_NODE);
/* Stop when ballot bit 7 is set (presumably the last lane of the 8-wide cluster holds a
 * child) or when there is no valid child at all.
 */
if ((valid_children & 0x80) != 0 || valid_children == 0)
break;
/* -1.0 keeps lanes without an internal child from being selected below. */
float surface_area = -1.0;
bool is_valid_internal = child != RADV_BVH_INVALID_NODE && ir_id_to_type(child) == vk_ir_node_internal;
if (is_valid_internal) {
vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child))).aabb;
surface_area = aabb_surface_area(child_aabb);
}
float max_surface_area = subgroupClusteredMax(surface_area, 8);
/* Lowest lane among the internal children that have the maximum surface area. */
uint32_t collapse_index = findLSB(radv_ballot(cluster, is_valid_internal && surface_area == max_surface_area));
/* findLSB() yields -1 for an empty ballot, i.e. there is no internal child left to collapse. */
if (collapse_index == 0xffffffff)
break;
/* `right` is only assigned on the collapsing lane; the other lanes receive it through the
 * broadcast after this branch.
 */
uint32_t right;
if (cluster.invocation_index == collapse_index) {
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child));
/* Tell the cluster responsible for the collapsed node that it must not emit a node. */
DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
uint32_t left = DEREF(child_node).children[0];
right = DEREF(child_node).children[1];
if (left == RADV_BVH_INVALID_NODE) {
/* Only the second child may be valid: move it into `left` so this lane keeps it. */
left = right;
right = RADV_BVH_INVALID_NODE;
} else if (right != RADV_BVH_INVALID_NODE && ir_id_to_type(left) == vk_ir_node_triangle &&
ir_id_to_type(right) == vk_ir_node_triangle &&
(VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) ||
VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES))) {
/* Both grandchildren are triangles and compression is enabled: keep them together in
 * this lane as a pair instead of spending a second slot.
 */
second_child = right;
right = RADV_BVH_INVALID_NODE;
}
child = left;
}
right = radv_read_invocation(cluster, collapse_index, right);
if (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) ||
VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) {
/* If `right` is a triangle, try to attach it to a lane that holds a single, still unpaired
 * triangle rather than giving it a slot of its own.
 */
bool is_valid_triangle = child != RADV_BVH_INVALID_NODE && ir_id_to_type(child) == vk_ir_node_triangle;
uint32_t right_pair_mask =
radv_ballot(cluster, is_valid_triangle && second_child == RADV_BVH_INVALID_NODE &&
right != RADV_BVH_INVALID_NODE && ir_id_to_type(right) == vk_ir_node_triangle);
if (right_pair_mask != 0) {
if (cluster.invocation_index == findLSB(right_pair_mask))
second_child = right;
/* `right` has been placed; skip the new-slot assignment below. */
continue;
}
}
/* Put `right` into the lane just above the highest occupied one. valid_children was sampled
 * at the top of this iteration, before the collapse.
 */
if (cluster.invocation_index == findMSB(valid_children) + 1)
child = right;
}
/* After widening: an internal child that is still present and whose two children are both
 * triangles is replaced by that triangle pair, and the internal node is marked as collapsed.
 */
if ((VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) ||
VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) &&
child != RADV_BVH_INVALID_NODE && ir_id_to_type(child) == vk_ir_node_internal &&
second_child == RADV_BVH_INVALID_NODE) {
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child));
uint32_t left = DEREF(child_node).children[0];
uint32_t right = DEREF(child_node).children[1];
if (ir_id_to_type(left) == vk_ir_node_triangle && ir_id_to_type(right) == vk_ir_node_triangle) {
child = left;
second_child = right;
DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
}
}
/* Classify this lane's final child and count both kinds across the cluster. */
bool is_valid = child != RADV_BVH_INVALID_NODE;
bool is_valid_primitive = is_valid && ir_id_to_type(child) != vk_ir_node_internal;
bool is_valid_internal = is_valid && ir_id_to_type(child) == vk_ir_node_internal;
uint32_t child_leaf_node_count = bitCount(radv_ballot(cluster, is_valid_primitive));
uint32_t child_internal_node_count = bitCount(radv_ballot(cluster, is_valid_internal));
/* Output size of one leaf: instance leaves take two node-sized slots, all others one. */
uint32_t leaf_node_size;
switch (args.geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
case VK_GEOMETRY_TYPE_AABBS_KHR:
leaf_node_size = RADV_GFX12_BVH_NODE_SIZE;
break;
default:
/* instances */
leaf_node_size = 2 * RADV_GFX12_BVH_NODE_SIZE;
break;
}
uint32_t child_leaf_nodes_size = child_leaf_node_count * leaf_node_size;
uint32_t child_internal_nodes_size = child_internal_node_count * RADV_GFX12_BVH_NODE_SIZE;
/* Lane 0 reserves output space for all children with atomic adds on the header counters and
 * broadcasts the resulting base offsets.
 * NOTE(review): with RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES dst_leaf_offset is never
 * assigned, yet it is still broadcast and stored in primitive_base_id below - presumably the
 * later triangle encode pass overwrites it; confirm.
 */
uint32_t dst_leaf_offset;
uint32_t dst_internal_offset;
if (cluster.invocation_index == 0) {
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES))
dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size);
dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size);
}
dst_leaf_offset = radv_read_invocation(cluster, 0, dst_leaf_offset);
dst_internal_offset = radv_read_invocation(cluster, 0, dst_internal_offset);
/* child_index is the slot in dst.children[]; dst_offset is where the child itself is written.
 * The ballots below run inside divergent branches; presumably only the lanes that took the
 * branch contribute, which makes the bit count this lane's rank among children of its kind.
 */
uint32_t child_index = 0;
uint32_t dst_offset = 0;
if (is_valid_internal) {
child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
dst_offset = dst_internal_offset + child_index * RADV_GFX12_BVH_NODE_SIZE;
uint32_t offset = ir_id_to_offset(child);
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
/* Publish the child's output offset; the cluster encoding that child spins on this value. */
DEREF(child_node).bvh_offset = dst_offset;
}
if (is_valid_primitive) {
child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
dst_offset = dst_leaf_offset + child_index * leaf_node_size;
/* Primitive children are stored after all internal children. */
child_index += child_internal_node_count;
}
/* Child boxes are quantized relative to the node's AABB. Adding 0x7fffff and masking with
 * 0x7f800000 clears the mantissa and bumps the exponent when the mantissa was non-zero, i.e.
 * each extent component is rounded up to a power of two; only its exponent is stored.
 */
vec3 origin = src.base.aabb.min;
vec3 extent = src.base.aabb.max - src.base.aabb.min;
extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
/* valid_child_count covers every occupied slot and positions the null children below;
 * output_valid_child_count is the count stored in the node header.
 */
uint32_t valid_child_count = child_internal_node_count;
uint32_t output_valid_child_count = valid_child_count;
/* Do not include triangle nodes if RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES because
* the count can only be computed by the encode pass.
*/
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES))
output_valid_child_count += child_leaf_node_count;
valid_child_count += child_leaf_node_count;
/* Lane 0 writes the fields shared by the whole node. */
if (cluster.invocation_index == 0) {
DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0);
DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0);
/* Non-root nodes get parent_id from their parent's cluster (see the internal-child case below). */
if (is_root_node)
DEREF(dst).parent_id = RADV_BVH_INVALID_NODE;
DEREF(dst).origin = origin;
/* Packing: x/y/z exponents at bits 0, 8 and 16; (child count - 1) starting at bit 28. */
DEREF(dst).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
(extent_exponents.z << 16) | ((output_valid_child_count - 1) << 28);
DEREF(dst).obb_matrix_index = 0x7f;
}
if (is_valid) {
/* This lane owns a child: encode the child itself if it is a leaf, then its box_child entry. */
uint32_t type = ir_id_to_type(child);
uint32_t offset = ir_id_to_offset(child);
VOID_REF dst_child_addr = args.output_base + args.output_bvh_offset + dst_offset;
/* Defaults; the internal and instance cases below override encoded_type and, for instances,
 * the size and cull mask.
 */
uint32_t child_node_size_128b = 1;
uint32_t encoded_type = 0;
uint32_t cull_mask = 0xff;
uint32_t cull_flags = 0;
if (type == vk_ir_node_internal) {
encoded_type = 5;
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
cull_flags = DEREF(child_node).flags & 0x3;
/* Write the child's parent link here; the rest of the child box node is written by the
 * cluster that encodes that child.
 */
REF(radv_gfx12_box_node) child_box = REF(radv_gfx12_box_node)(dst_child_addr);
DEREF(child_box).parent_id = node_id;
} else if (VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) {
/* We try to encode 16 (RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT) triangles into a single node. */
/* Triangles are not encoded here. The first active lane reserves one task via an atomic
 * add on driver_internal[0] and every triangle lane records its entries in that task.
 */
uint32_t batch_aligned_triangle_index;
if (cluster.invocation_index == radv_first_active_invocation(cluster)) {
/* Each invocation will encode a triangle pair. */
batch_aligned_triangle_index =
atomicAdd(DEREF(args.header).driver_internal[0], RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
}
batch_aligned_triangle_index =
radv_read_invocation(cluster, radv_first_active_invocation(cluster), batch_aligned_triangle_index);
/* The task array is located directly behind the vk_ir_header. */
VOID_REF triangle_tasks = OFFSET(args.header, SIZEOF(vk_ir_header));
REF(radv_triangle_encode_task) task =
INDEX(radv_triangle_encode_task, triangle_tasks,
batch_aligned_triangle_index / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
if (cluster.invocation_index == radv_first_active_invocation(cluster))
DEREF(task).parent_offset = bvh_offset;
/* Two entries per pair: the child slot index in the top 4 bits, the IR leaf index below. */
uint32_t triangle_pair_index = child_index - child_internal_node_count;
DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 0] =
(child_index << 28) | (ir_id_to_offset(child) / ir_leaf_node_size);
if (second_child != RADV_BVH_INVALID_NODE) {
DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 1] =
(child_index << 28) | (ir_id_to_offset(second_child) / ir_leaf_node_size);
} else {
DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 1] = RADV_BVH_INVALID_NODE;
}
/* Mark the entry after the last used pair as invalid when the task is not full. */
if (child_leaf_node_count < RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT &&
cluster.invocation_index == radv_first_active_invocation(cluster))
DEREF(task).pair_index_node_index[child_leaf_node_count * 2] = RADV_BVH_INVALID_NODE;
} else {
/* Leaf child without batch compression: encode it directly at dst_child_addr. */
if (VK_BUILD_FLAG(RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS)) {
/* Write leaf node offset. */
{
uint32_t leaf_index = offset / ir_leaf_node_size;
REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset);
child_dst_offset = INDEX(uint32_t, child_dst_offset, leaf_index);
DEREF(child_dst_offset) = dst_offset;
}
/* A paired second triangle lives in the same output node, so it gets the same offset. */
if (second_child != RADV_BVH_INVALID_NODE) {
uint32_t leaf_index = ir_id_to_offset(second_child) / ir_leaf_node_size;
REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset);
child_dst_offset = INDEX(uint32_t, child_dst_offset, leaf_index);
DEREF(child_dst_offset) = dst_offset;
}
}
switch (args.geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
vk_ir_triangle_node src_node0 = DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, offset)));
bool opaque = (src_node0.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0;
cull_flags = opaque ? VK_BVH_BOX_FLAG_ONLY_OPAQUE : VK_BVH_BOX_FLAG_NO_OPAQUE;
if (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) && second_child != RADV_BVH_INVALID_NODE) {
vk_ir_triangle_node src_node1 =
DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, ir_id_to_offset(second_child))));
opaque = (src_node1.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0;
/* The AND keeps only the flag bits that both triangles of the pair have in common. */
cull_flags &= opaque ? VK_BVH_BOX_FLAG_ONLY_OPAQUE : VK_BVH_BOX_FLAG_NO_OPAQUE;
radv_encode_triangle_gfx12(dst_child_addr, src_node0, src_node1);
} else {
radv_encode_triangle_gfx12(dst_child_addr, src_node0);
}
break;
}
case VK_GEOMETRY_TYPE_AABBS_KHR: {
vk_ir_aabb_node src_node = DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, offset)));
radv_encode_aabb_gfx12(dst_child_addr, src_node);
bool opaque = (src_node.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0;
cull_flags = opaque ? VK_BVH_BOX_FLAG_ONLY_OPAQUE : VK_BVH_BOX_FLAG_NO_OPAQUE;
break;
}
default:
/* instances */
encoded_type = 6;
child_node_size_128b = 2;
vk_ir_instance_node src_node = DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, offset)));
radv_encode_instance_gfx12(dst_child_addr, src_node, node_id);
/* The cull mask is taken from the top byte of custom_instance_and_mask. */
cull_mask = src_node.custom_instance_and_mask >> 24;
cull_flags = src_node.root_flags & 0x3;
break;
}
}
/* Bounds of this slot: the child's AABB, merged with the second triangle's AABB if paired. */
vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, offset)).aabb;
if (second_child != RADV_BVH_INVALID_NODE) {
vk_aabb second_child_aabb =
DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(second_child))).aabb;
child_aabb.min = min(child_aabb.min, second_child_aabb.min);
child_aabb.max = max(child_aabb.max, second_child_aabb.max);
}
/* Quantize the bounds to 12 bits per component relative to origin/extent. Minima use
 * floor() and maxima use ceil() - 1, each clamped to 0xfff.
 * dword0: min.x | min.y << 12 | cull_flags << 24
 * dword1: min.z | max.x << 12 | cull_mask << 24
 * dword2: max.y | max.z << 12 | encoded_type << 24 | child_node_size_128b << 28
 */
radv_gfx12_box_child box_child;
box_child.dword0 =
min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
(min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12) |
cull_flags << 24;
/* TODO: subtree mask culling */
box_child.dword1 =
min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
(min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12) |
(cull_mask << 24);
box_child.dword2 =
min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
(min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12) |
(encoded_type << 24) | (child_node_size_128b << 28);
DEREF(dst).children[child_index] = box_child;
} else {
/* Lanes without a child fill the slots after the valid children with a null entry. */
child_index =
bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1)) + valid_child_count;
radv_gfx12_box_child null_child;
null_child.dword0 = 0xffffffff;
null_child.dword1 = 0xfff;
null_child.dword2 = 0;
DEREF(dst).children[child_index] = null_child;
}
/* Make changes to the children's BVH offset value available to the other invocations. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
/* The node is encoded: leave the spin-wait loop. */
break;
}
/* Lane 0 of the root's cluster also fills in the acceleration structure header. */
if (is_root_node && cluster.invocation_index == 0) {
vk_aabb aabb = src.base.aabb;
/* Without any active leaf the reported bounds are set to NaN. */
if (DEREF(args.header).active_leaf_count == 0)
aabb = vk_aabb(vec3(NAN), vec3(NAN));
REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(args.output_base);
DEREF(header).aabb = aabb;
DEREF(header).bvh_offset = args.output_bvh_offset;
}
}
/*
 * Shader entry point.
 *
 * Eight consecutive invocations are assigned to each IR internal node. Nodes are visited in
 * reverse index order so that the root (the last IR internal node) is handled by the first
 * invocations.
 */
void
main()
{
   uint32_t internal_count = DEREF(args.header).ir_internal_node_count;
   uint32_t invocation = gl_GlobalInvocationID.x;

   /* Invocations beyond 8 per internal node have no work. */
   if (invocation >= internal_count * 8)
      return;

   /* Byte size of one IR leaf node for the geometry type being built. */
   uint32_t ir_leaf_node_size;
   if (args.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
      ir_leaf_node_size = SIZEOF(vk_ir_triangle_node);
   } else if (args.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) {
      ir_leaf_node_size = SIZEOF(vk_ir_aabb_node);
   } else {
      /* instances */
      ir_leaf_node_size = SIZEOF(vk_ir_instance_node);
   }

   /* The internal node array starts right behind leaf_node_count leaf nodes. */
   REF(vk_ir_box_node) intermediate_internal_nodes =
      REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, args.leaf_node_count * ir_leaf_node_size);

   /* Revert the order so we start at the root */
   encode_gfx12(ir_leaf_node_size, intermediate_internal_nodes, internal_count - 1 - invocation / 8);
}