mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-29 21:00:16 +01:00
radv: Optimize the gfx12 encode shader
Reviewed-by: Natalie Vock <natalie.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34273>
This commit is contained in:
parent
97f6287827
commit
76031ba53d
3 changed files with 297 additions and 218 deletions
|
|
@ -18,14 +18,20 @@
|
|||
#extension GL_EXT_buffer_reference : require
|
||||
#extension GL_EXT_buffer_reference2 : require
|
||||
#extension GL_KHR_memory_scope_semantics : require
|
||||
#extension GL_KHR_shader_subgroup_basic : require
|
||||
#extension GL_KHR_shader_subgroup_shuffle : require
|
||||
#extension GL_KHR_shader_subgroup_ballot : require
|
||||
#extension GL_KHR_shader_subgroup_clustered : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
#define GFX12
|
||||
#define USE_GLOBAL_SYNC
|
||||
|
||||
#include "build_helpers.h"
|
||||
#include "build_interface.h"
|
||||
#include "encode.h"
|
||||
#include "invocation_cluster.h"
|
||||
|
||||
layout(push_constant) uniform CONSTS
|
||||
{
|
||||
|
|
@ -39,15 +45,234 @@ set_parent(uint32_t child, uint32_t parent)
|
|||
DEREF(REF(uint32_t)(addr)) = parent;
|
||||
}
|
||||
|
||||
/*
 * Encodes one IR box node into a radv_gfx12_box_node with up to eight children.
 *
 * ir_leaf_node_size:          size of one IR leaf node; used to convert a leaf's IR offset into its
 *                             index in the leaf node offset table.
 * intermediate_internal_nodes: reference to the first IR internal (box) node.
 * node_index:                 index of the IR internal node to encode.
 *
 * The work is shared by a cluster of 8 subgroup invocations: every invocation of the cluster holds
 * at most one child of the node and encodes it.
 */
void
|
||||
encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_internal_nodes, uint32_t node_index)
|
||||
{
|
||||
/* Each invocation cluster encodes one internal node. */
|
||||
radv_invocation_cluster cluster;
|
||||
radv_invocation_cluster_init(cluster, 8);
|
||||
|
||||
REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, node_index);
|
||||
vk_ir_box_node src = DEREF(src_node);
|
||||
/* The last IR internal node is treated as the root. */
bool is_root_node = node_index == DEREF(args.header).ir_internal_node_count - 1;
|
||||
|
||||
/* Runs until the node was either encoded or found to be collapsed into its parent. */
for (;;) {
|
||||
/* Make changes to the current node's BVH offset value visible. */
|
||||
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
|
||||
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
|
||||
|
||||
/* Only invocation 0 loads the offset; it is then broadcast so that the whole cluster takes the
 * same branch below. The root always uses the fixed root node offset. */
uint32_t bvh_offset;
|
||||
if (cluster.invocation_index == 0) {
|
||||
bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
|
||||
}
|
||||
bvh_offset = radv_read_invocation(cluster, 0, bvh_offset);
|
||||
|
||||
/* No offset stored yet: retry. The offset of an internal child is stored further down by the
 * cluster that encodes its parent. */
if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
|
||||
continue;
|
||||
|
||||
/* This node was collapsed into its parent (see the collapse loop below): nothing to encode. */
if (bvh_offset == VK_NULL_BVH_OFFSET)
|
||||
break;
|
||||
|
||||
REF(radv_gfx12_box_node) dst = REF(radv_gfx12_box_node)(args.output_base + (args.output_bvh_offset + bvh_offset));
|
||||
|
||||
uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
|
||||
|
||||
/* Invocations 0 and 1 start with the two children of the IR node, all others start empty. */
uint32_t child = RADV_BVH_INVALID_NODE;
|
||||
if (cluster.invocation_index < 2)
|
||||
child = src.children[cluster.invocation_index];
|
||||
|
||||
/* Collapse loop: replace one internal child per iteration by its own two children. */
while (true) {
|
||||
uint32_t valid_children = radv_ballot(cluster, child != RADV_BVH_INVALID_NODE);
|
||||
/* Stop when invocation 7 already holds a child (bit 0x80) or when there are no children. */
if ((valid_children & 0x80) != 0 || valid_children == 0)
|
||||
break;
|
||||
|
||||
/* -1.0 is used for invocations that do not hold an internal child so that they never win the
 * clustered maximum against a real surface area. */
float surface_area = -1.0;
|
||||
bool is_valid_internal = child != RADV_BVH_INVALID_NODE && ir_id_to_type(child) == vk_ir_node_internal;
|
||||
if (is_valid_internal) {
|
||||
vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child))).aabb;
|
||||
surface_area = aabb_surface_area(child_aabb);
|
||||
}
|
||||
|
||||
float max_surface_area = subgroupClusteredMax(surface_area, 8);
|
||||
|
||||
/* Pick the internal child with the largest surface area (lowest invocation on ties).
 * findLSB(0) returns -1, i.e. 0xffffffff, when no internal child is left. */
uint32_t collapse_index = findLSB(radv_ballot(cluster, is_valid_internal && surface_area == max_surface_area));
|
||||
if (collapse_index == 0xffffffff)
|
||||
break;
|
||||
|
||||
/* Only written by the collapsing invocation; every other invocation receives it through
 * radv_read_invocation below. */
uint32_t right;
|
||||
if (cluster.invocation_index == collapse_index) {
|
||||
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child));
|
||||
/* Mark the collapsed node so that the cluster assigned to it exits without encoding it. */
DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
|
||||
|
||||
uint32_t left = DEREF(child_node).children[0];
|
||||
right = DEREF(child_node).children[1];
|
||||
|
||||
/* If only the right grandchild exists, keep it in this invocation instead. */
if (left == RADV_BVH_INVALID_NODE) {
|
||||
left = right;
|
||||
right = RADV_BVH_INVALID_NODE;
|
||||
}
|
||||
|
||||
child = left;
|
||||
}
|
||||
right = radv_read_invocation(cluster, collapse_index, right);
|
||||
|
||||
/* The invocation right above the highest occupied one (as of the start of this iteration)
 * takes over the right grandchild. */
if (cluster.invocation_index == findMSB(valid_children) + 1)
|
||||
child = right;
|
||||
}
|
||||
|
||||
bool is_valid = child != RADV_BVH_INVALID_NODE;
|
||||
bool is_valid_primitive = is_valid && ir_id_to_type(child) != vk_ir_node_internal;
|
||||
bool is_valid_internal = is_valid && ir_id_to_type(child) == vk_ir_node_internal;
|
||||
|
||||
uint32_t child_leaf_node_count = bitCount(radv_ballot(cluster, is_valid_primitive));
|
||||
uint32_t child_internal_node_count = bitCount(radv_ballot(cluster, is_valid_internal));
|
||||
|
||||
/* Output size of one leaf: instances occupy two node slots, triangles and AABBs one. */
uint32_t leaf_node_size;
|
||||
switch (args.geometry_type) {
|
||||
case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
|
||||
case VK_GEOMETRY_TYPE_AABBS_KHR:
|
||||
leaf_node_size = RADV_GFX12_BVH_NODE_SIZE;
|
||||
break;
|
||||
default:
|
||||
/* instances */
|
||||
leaf_node_size = 2 * RADV_GFX12_BVH_NODE_SIZE;
|
||||
break;
|
||||
}
|
||||
|
||||
uint32_t child_leaf_nodes_size = child_leaf_node_count * leaf_node_size;
|
||||
uint32_t child_internal_nodes_size = child_internal_node_count * RADV_GFX12_BVH_NODE_SIZE;
|
||||
|
||||
/* Invocation 0 reserves output space for all children of this node with one atomic per node
 * kind and broadcasts the two base offsets to the cluster. */
uint32_t dst_leaf_offset;
|
||||
uint32_t dst_internal_offset;
|
||||
if (cluster.invocation_index == 0) {
|
||||
dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size);
|
||||
dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size);
|
||||
}
|
||||
dst_leaf_offset = radv_read_invocation(cluster, 0, dst_leaf_offset);
|
||||
dst_internal_offset = radv_read_invocation(cluster, 0, dst_internal_offset);
|
||||
|
||||
uint32_t child_index = 0;
|
||||
uint32_t dst_offset = 0;
|
||||
/* NOTE(review): radv_ballot(cluster, true) is executed inside the branch, so the masked bit
 * count appears to be this invocation's rank among the invocations that took the same branch.
 * This assumes invocations that skipped the branch contribute 0 to the ballot - confirm. */
if (is_valid_internal) {
|
||||
child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
|
||||
dst_offset = dst_internal_offset + child_index * RADV_GFX12_BVH_NODE_SIZE;
|
||||
|
||||
uint32_t offset = ir_id_to_offset(child);
|
||||
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
|
||||
/* Store the output offset of the internal child; the loop header of the invocations encoding
 * that child reads this value. */
DEREF(child_node).bvh_offset = dst_offset;
|
||||
}
|
||||
if (is_valid_primitive) {
|
||||
child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
|
||||
dst_offset = dst_leaf_offset + child_index * leaf_node_size;
|
||||
/* In the child array, leaf children come after all internal children. */
child_index += child_internal_node_count;
|
||||
}
|
||||
|
||||
vec3 origin = src.base.aabb.min;
|
||||
vec3 extent = src.base.aabb.max - src.base.aabb.min;
|
||||
|
||||
/* Round every extent component up to a power of two: adding the mantissa mask carries into the
 * exponent when the mantissa is non-zero, and the AND keeps only the exponent bits. */
extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
|
||||
uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
|
||||
|
||||
uint32_t valid_child_count = child_leaf_node_count + child_internal_node_count;
|
||||
/* The per-node fields are written once, by invocation 0. child_count_exponents packs the x/y/z
 * exponents into bits 0-7, 8-15 and 16-23 and (valid_child_count - 1) starting at bit 28. */
if (cluster.invocation_index == 0) {
|
||||
DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0);
|
||||
DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0);
|
||||
DEREF(dst).origin = origin;
|
||||
DEREF(dst).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
|
||||
(extent_exponents.z << 16) | ((valid_child_count - 1) << 28);
|
||||
DEREF(dst).obb_matrix_index = 0x7f;
|
||||
}
|
||||
|
||||
/* Every invocation that holds a child encodes it; the others write a null child. */
if (is_valid) {
|
||||
uint32_t type = ir_id_to_type(child);
|
||||
uint32_t offset = ir_id_to_offset(child);
|
||||
|
||||
uint32_t child_node_size_128b = 1;
|
||||
uint32_t encoded_type = 0;
|
||||
uint32_t cull_mask = 0xff;
|
||||
/* NOTE(review): 5 and 6 are presumably the hardware node type values for box and instance
 * nodes - confirm against the node type definitions. */
if (type == vk_ir_node_internal) {
|
||||
encoded_type = 5;
|
||||
} else {
|
||||
/* Write leaf node offset. */
|
||||
uint32_t leaf_index = offset / ir_leaf_node_size;
|
||||
REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset);
|
||||
child_dst_offset = INDEX(uint32_t, child_dst_offset, leaf_index);
|
||||
DEREF(child_dst_offset) = dst_offset;
|
||||
|
||||
VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_offset;
|
||||
|
||||
/* Encode the leaf itself; the src_node locals below shadow the function-level src_node. */
switch (args.geometry_type) {
|
||||
case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
|
||||
vk_ir_triangle_node src_node = DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, offset)));
|
||||
radv_encode_triangle_gfx12(dst_leaf_addr, src_node);
|
||||
break;
|
||||
}
|
||||
case VK_GEOMETRY_TYPE_AABBS_KHR: {
|
||||
vk_ir_aabb_node src_node = DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, offset)));
|
||||
radv_encode_aabb_gfx12(dst_leaf_addr, src_node);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
/* instances */
|
||||
encoded_type = 6;
|
||||
child_node_size_128b = 2;
|
||||
|
||||
vk_ir_instance_node src_node = DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, offset)));
|
||||
radv_encode_instance_gfx12(dst_leaf_addr, src_node);
|
||||
|
||||
/* The top 8 bits of custom_instance_and_mask are used as the child's cull mask. */
cull_mask = src_node.custom_instance_and_mask >> 24;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, offset)).aabb;
|
||||
|
||||
/* Quantize the child bounds relative to origin/extent with 0x1000 steps per axis: minima are
 * rounded down, maxima are rounded up and decremented by one, and both are clamped to 0xfff. */
radv_gfx12_box_child box_child;
|
||||
/* TODO: subtree flags culling */
|
||||
box_child.dword0 =
|
||||
min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
|
||||
(min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
|
||||
/* TODO: subtree mask culling */
|
||||
box_child.dword1 =
|
||||
min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
|
||||
(min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12) |
|
||||
(cull_mask << 24);
|
||||
box_child.dword2 =
|
||||
min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
|
||||
(min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12) |
|
||||
(encoded_type << 24) | (child_node_size_128b << 28);
|
||||
DEREF(dst).children[child_index] = box_child;
|
||||
|
||||
set_parent(pack_node_id(dst_offset, encoded_type), node_id);
|
||||
} else {
|
||||
/* Null children go into the slots after the valid children; the bit count ranks this
 * invocation among the invocations that took this branch (same assumption as above). */
child_index =
|
||||
bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1)) + valid_child_count;
|
||||
radv_gfx12_box_child null_child;
|
||||
null_child.dword0 = 0xffffffff;
|
||||
null_child.dword1 = 0xfff;
|
||||
null_child.dword2 = 0;
|
||||
DEREF(dst).children[child_index] = null_child;
|
||||
}
|
||||
|
||||
/* Make changes to the children's BVH offset value available to the other invocations. */
|
||||
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
|
||||
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
|
||||
break;
|
||||
}
|
||||
|
||||
/* For the root node, invocation 0 additionally writes the bounds and BVH offset into the
 * acceleration structure header and sets the root's parent to RADV_BVH_INVALID_NODE. */
if (is_root_node && cluster.invocation_index == 0) {
|
||||
REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(args.output_base);
|
||||
DEREF(header).aabb = src.base.aabb;
|
||||
DEREF(header).bvh_offset = args.output_bvh_offset;
|
||||
|
||||
set_parent(RADV_BVH_ROOT_NODE, RADV_BVH_INVALID_NODE);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
main()
|
||||
{
|
||||
if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count)
|
||||
return;
|
||||
|
||||
/* Revert the order so we start at the root */
|
||||
uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x;
|
||||
|
||||
uint32_t ir_leaf_node_size;
|
||||
switch (args.geometry_type) {
|
||||
case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
|
||||
|
|
@ -65,211 +290,17 @@ main()
|
|||
}
|
||||
|
||||
uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * ir_leaf_node_size;
|
||||
uint32_t dst_internal_offset = id_to_offset(RADV_BVH_ROOT_NODE);
|
||||
|
||||
REF(vk_ir_box_node) intermediate_internal_nodes =
|
||||
REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size);
|
||||
REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id);
|
||||
vk_ir_box_node src = DEREF(src_node);
|
||||
|
||||
bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1;
|
||||
uint32_t ir_internal_node_count = DEREF(args.header).ir_internal_node_count;
|
||||
uint32_t encode_invocation_count = ir_internal_node_count * 8;
|
||||
|
||||
for (;;) {
|
||||
/* Make changes to the current node's BVH offset value visible. */
|
||||
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
|
||||
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
|
||||
uint32_t global_id = gl_GlobalInvocationID.x;
|
||||
if (global_id >= encode_invocation_count)
|
||||
return;
|
||||
|
||||
uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
|
||||
if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
|
||||
continue;
|
||||
|
||||
if (bvh_offset == VK_NULL_BVH_OFFSET)
|
||||
break;
|
||||
|
||||
REF(radv_gfx12_box_node) dst = REF(radv_gfx12_box_node)(args.output_base + (args.output_bvh_offset + bvh_offset));
|
||||
|
||||
uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
|
||||
|
||||
uint32_t children[8];
|
||||
|
||||
uint32_t found_child_count = 0;
|
||||
for (uint32_t i = 0; i < 2; i++) {
|
||||
if (src.children[i] != RADV_BVH_INVALID_NODE) {
|
||||
children[found_child_count] = src.children[i];
|
||||
found_child_count++;
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: Collapse child nodes with high SAH values. */
|
||||
while (found_child_count < 8) {
|
||||
bool progress = false;
|
||||
for (int32_t i = 0; i < found_child_count; i++) {
|
||||
uint32_t child_id = children[i];
|
||||
if (ir_id_to_type(child_id) != vk_ir_node_internal)
|
||||
continue;
|
||||
|
||||
progress = true;
|
||||
|
||||
REF(vk_ir_box_node) child_node =
|
||||
REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child_id));
|
||||
uint32_t grandchildren[2] = DEREF(child_node).children;
|
||||
uint32_t valid_grandchild_count = 0;
|
||||
|
||||
if (grandchildren[1] != RADV_BVH_INVALID_NODE)
|
||||
valid_grandchild_count++;
|
||||
|
||||
if (grandchildren[0] != RADV_BVH_INVALID_NODE)
|
||||
valid_grandchild_count++;
|
||||
else
|
||||
grandchildren[0] = grandchildren[1];
|
||||
|
||||
if (valid_grandchild_count > 1) {
|
||||
children[found_child_count] = grandchildren[1];
|
||||
found_child_count++;
|
||||
}
|
||||
|
||||
if (valid_grandchild_count > 0) {
|
||||
children[i] = grandchildren[0];
|
||||
} else {
|
||||
found_child_count--;
|
||||
children[i] = children[found_child_count];
|
||||
}
|
||||
|
||||
DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
|
||||
|
||||
if (found_child_count == 8)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!progress)
|
||||
break;
|
||||
}
|
||||
|
||||
uint32_t child_leaf_nodes_size = 0;
|
||||
uint32_t child_internal_nodes_size = 0;
|
||||
for (uint32_t i = 0; i < found_child_count; i++) {
|
||||
uint32_t type = ir_id_to_type(children[i]);
|
||||
if (type == vk_ir_node_internal)
|
||||
child_internal_nodes_size += RADV_GFX12_BVH_NODE_SIZE;
|
||||
else if (type == vk_ir_node_instance)
|
||||
child_leaf_nodes_size += 2 * RADV_GFX12_BVH_NODE_SIZE;
|
||||
else
|
||||
child_leaf_nodes_size += RADV_GFX12_BVH_NODE_SIZE;
|
||||
}
|
||||
|
||||
uint32_t dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size);
|
||||
uint32_t dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size);
|
||||
|
||||
vec3 origin = src.base.aabb.min;
|
||||
vec3 extent = src.base.aabb.max - src.base.aabb.min;
|
||||
|
||||
extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
|
||||
uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
|
||||
|
||||
DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0);
|
||||
DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0);
|
||||
DEREF(dst).origin = origin;
|
||||
DEREF(dst).child_count_exponents =
|
||||
extent_exponents.x | (extent_exponents.y << 8) | (extent_exponents.z << 16) | ((found_child_count - 1) << 28);
|
||||
DEREF(dst).obb_matrix_index = 0x7f;
|
||||
|
||||
for (uint32_t i = 0; i < found_child_count; i++) {
|
||||
uint32_t child_id = children[i];
|
||||
uint32_t type = ir_id_to_type(child_id);
|
||||
uint32_t offset = ir_id_to_offset(child_id);
|
||||
|
||||
uint32_t child_node_size_128b = 1;
|
||||
uint32_t encoded_type = 0;
|
||||
uint32_t dst_offset = 0;
|
||||
uint32_t cull_mask = 0xff;
|
||||
if (type == vk_ir_node_internal) {
|
||||
encoded_type = 5;
|
||||
dst_offset = dst_internal_offset;
|
||||
|
||||
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
|
||||
DEREF(child_node).bvh_offset = dst_internal_offset;
|
||||
|
||||
dst_internal_offset += RADV_GFX12_BVH_NODE_SIZE;
|
||||
} else {
|
||||
dst_offset = dst_leaf_offset;
|
||||
|
||||
/* Write leaf node offset. */
|
||||
uint32_t child_index = offset / ir_leaf_node_size;
|
||||
REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset);
|
||||
child_dst_offset = INDEX(uint32_t, child_dst_offset, child_index);
|
||||
DEREF(child_dst_offset) = dst_offset;
|
||||
|
||||
VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_leaf_offset;
|
||||
|
||||
switch (args.geometry_type) {
|
||||
case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
|
||||
vk_ir_triangle_node src_node = DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, offset)));
|
||||
radv_encode_triangle_gfx12(dst_leaf_addr, src_node);
|
||||
dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE;
|
||||
break;
|
||||
}
|
||||
case VK_GEOMETRY_TYPE_AABBS_KHR: {
|
||||
vk_ir_aabb_node src_node = DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, offset)));
|
||||
radv_encode_aabb_gfx12(dst_leaf_addr, src_node);
|
||||
dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
/* instances */
|
||||
encoded_type = 6;
|
||||
child_node_size_128b = 2;
|
||||
|
||||
vk_ir_instance_node src_node = DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, offset)));
|
||||
radv_encode_instance_gfx12(dst_leaf_addr, src_node);
|
||||
|
||||
cull_mask = src_node.custom_instance_and_mask >> 24;
|
||||
|
||||
dst_leaf_offset += 2 * RADV_GFX12_BVH_NODE_SIZE;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, offset)).aabb;
|
||||
|
||||
radv_gfx12_box_child child;
|
||||
/* TODO: subtree flags culling */
|
||||
child.dword0 = min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
|
||||
(min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
|
||||
/* TODO: subtree mask culling */
|
||||
child.dword1 =
|
||||
min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
|
||||
(min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12) |
|
||||
(cull_mask << 24);
|
||||
child.dword2 =
|
||||
min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
|
||||
(min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12) |
|
||||
(encoded_type << 24) | (child_node_size_128b << 28);
|
||||
DEREF(dst).children[i] = child;
|
||||
|
||||
set_parent(pack_node_id(dst_offset, encoded_type), node_id);
|
||||
}
|
||||
|
||||
/* Set remaining children to invalid */
|
||||
for (uint32_t i = found_child_count; i < 8; i++) {
|
||||
radv_gfx12_box_child null_child;
|
||||
null_child.dword0 = 0xffffffff;
|
||||
null_child.dword1 = 0xfff;
|
||||
null_child.dword2 = 0;
|
||||
DEREF(dst).children[i] = null_child;
|
||||
}
|
||||
|
||||
/* Make changes to the children's BVH offset value available to the other invocations. */
|
||||
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
|
||||
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
|
||||
break;
|
||||
}
|
||||
|
||||
if (is_root_node) {
|
||||
REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(args.output_base);
|
||||
DEREF(header).aabb = src.base.aabb;
|
||||
DEREF(header).bvh_offset = args.output_bvh_offset;
|
||||
|
||||
set_parent(RADV_BVH_ROOT_NODE, RADV_BVH_INVALID_NODE);
|
||||
}
|
||||
/* Revert the order so we start at the root */
|
||||
uint32_t node_index = ir_internal_node_count - 1 - global_id / 8;
|
||||
encode_gfx12(ir_leaf_node_size, intermediate_internal_nodes, node_index);
|
||||
}
|
||||
|
|
|
|||
39
src/amd/vulkan/bvh/invocation_cluster.h
Normal file
39
src/amd/vulkan/bvh/invocation_cluster.h
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
* Copyright © 2025 Valve Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
/* Helpers for splitting a subgroup into fixed-size invocation clusters and exchanging data within a cluster. */
|
||||
|
||||
#ifndef RADV_BVH_INVOCATION_CLUSTER_H
|
||||
#define RADV_BVH_INVOCATION_CLUSTER_H
|
||||
|
||||
/* Position of the current invocation within a cluster of cluster_size consecutive subgroup
 * invocations. Filled in by radv_invocation_cluster_init. */
struct radv_invocation_cluster {
|
||||
/* gl_SubgroupInvocationID & (cluster_size - 1): index of this invocation inside its cluster. */
uint32_t invocation_index;
|
||||
/* gl_SubgroupInvocationID / cluster_size: index of the cluster inside the subgroup. */
uint32_t cluster_index;
|
||||
/* Number of invocations that form one cluster. */
uint32_t cluster_size;
|
||||
};
|
||||
|
||||
/*
 * Fills `cluster` with the position of the calling invocation for clusters of `cluster_size`
 * consecutive subgroup invocations.
 *
 * cluster_size has to be a power of two and <32.
 */
void
radv_invocation_cluster_init(out radv_invocation_cluster cluster, uint32_t cluster_size)
{
   const uint32_t lane = gl_SubgroupInvocationID;
   const uint32_t lane_in_cluster_mask = cluster_size - 1;

   cluster.cluster_size = cluster_size;
   cluster.cluster_index = lane / cluster_size;
   cluster.invocation_index = lane & lane_in_cluster_mask;
}
|
||||
|
||||
/*
 * Reads `value` from the invocation with index `index` inside the caller's cluster.
 *
 * The first invocation of the cluster is obtained by clearing the low bits of
 * gl_SubgroupInvocationID (cluster_size is a power of two). All macro arguments are parenthesized
 * so that expression arguments (e.g. `a | b` as index) keep their intended precedence.
 */
#define radv_read_invocation(cluster, index, value)                                                                  \
   subgroupShuffle((value), (gl_SubgroupInvocationID & (~((cluster).cluster_size - 1))) + (index))
|
||||
|
||||
/*
 * Cluster-local ballot: returns a mask in which bit i is set when invocation i of the caller's
 * cluster passed value == true to the underlying subgroupBallot.
 *
 * Only the first 64 subgroup invocations (ballot components x and y) are considered.
 */
uint32_t
radv_ballot(radv_invocation_cluster cluster, bool value)
{
   const uvec4 votes = subgroupBallot(value);

   /* Subgroup index of the first invocation of this cluster. */
   const uint32_t first_lane = gl_SubgroupInvocationID & (~(cluster.cluster_size - 1));
   /* One bit per invocation of the cluster. */
   const uint32_t cluster_mask = (1u << cluster.cluster_size) - 1;

   const uint64_t votes64 = (uint64_t(votes.y) << 32ul) | uint64_t(votes.x);

   return uint32_t((votes64 >> first_lane) & cluster_mask);
}
|
||||
|
||||
#endif
|
||||
|
|
@ -540,11 +540,20 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructur
|
|||
struct acceleration_structure_layout layout;
|
||||
radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout);
|
||||
|
||||
uint32_t dst_internal_nodes_offset = layout.internal_nodes_offset - layout.bvh_offset;
|
||||
uint32_t dst_leaf_nodes_offset = layout.leaf_nodes_offset - layout.bvh_offset;
|
||||
uint32_t offsets[2] = {dst_internal_nodes_offset, dst_leaf_nodes_offset};
|
||||
radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, dst_node_offset), offsets,
|
||||
sizeof(offsets));
|
||||
struct vk_ir_header header = {
|
||||
.sync_data =
|
||||
{
|
||||
.current_phase_end_counter = TASK_INDEX_INVALID,
|
||||
/* Will be updated by the first PLOC shader invocation */
|
||||
.task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID},
|
||||
},
|
||||
.dst_node_offset = layout.internal_nodes_offset - layout.bvh_offset,
|
||||
.dst_leaf_node_offset = layout.leaf_nodes_offset - layout.bvh_offset,
|
||||
};
|
||||
|
||||
const uint8_t *update_data = ((const uint8_t *)&header + offsetof(struct vk_ir_header, sync_data));
|
||||
radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, sync_data), update_data,
|
||||
sizeof(struct vk_ir_header) - offsetof(struct vk_ir_header, sync_data));
|
||||
if (radv_device_physical(device)->info.cp_sdma_ge_use_system_memory_scope)
|
||||
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2;
|
||||
|
||||
|
|
@ -560,10 +569,11 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructur
|
|||
vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args);
|
||||
|
||||
uint32_t internal_count = MAX2(leaf_count, 2) - 1;
|
||||
|
||||
struct radv_dispatch_info dispatch = {
|
||||
.unaligned = true,
|
||||
.ordered = true,
|
||||
.blocks = {MAX2(leaf_count, 1), 1, 1},
|
||||
.blocks = {DIV_ROUND_UP(internal_count * 8, 64), 1, 1},
|
||||
};
|
||||
|
||||
radv_compute_dispatch(cmd_buffer, &dispatch);
|
||||
|
|
@ -664,9 +674,8 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui
|
|||
geometry_infos[i].primitive_count = build_range_infos[i].primitiveCount;
|
||||
}
|
||||
|
||||
radv_CmdUpdateBuffer(commandBuffer, vk_buffer_to_handle(dst->buffer),
|
||||
dst->offset + layout.geometry_info_offset, geometry_infos_size,
|
||||
geometry_infos);
|
||||
radv_CmdUpdateBuffer(commandBuffer, vk_buffer_to_handle(dst->buffer), dst->offset + layout.geometry_info_offset,
|
||||
geometry_infos_size, geometry_infos);
|
||||
|
||||
free(geometry_infos);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue