radv: Optimize the gfx12 encode shader

Reviewed-by: Natalie Vock <natalie.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34273>
This commit is contained in:
Konstantin Seurer 2025-04-01 15:48:03 +02:00 committed by Marge Bot
parent 97f6287827
commit 76031ba53d
3 changed files with 297 additions and 218 deletions

View file

@ -18,14 +18,20 @@
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_shuffle : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_clustered : require
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
#define GFX12
#define USE_GLOBAL_SYNC
#include "build_helpers.h"
#include "build_interface.h"
#include "encode.h"
#include "invocation_cluster.h"
layout(push_constant) uniform CONSTS
{
@ -39,15 +45,234 @@ set_parent(uint32_t child, uint32_t parent)
DEREF(REF(uint32_t)(addr)) = parent;
}
/*
 * Encodes one IR internal (box) node into a radv_gfx12_box_node with up to
 * eight children. The work is done cooperatively by a cluster of 8
 * invocations: after the collapse loop below, each invocation of the cluster
 * owns at most one child of the output node.
 *
 * ir_leaf_node_size:            divisor used to turn a leaf's IR offset into a
 *                               leaf index (offset / ir_leaf_node_size).
 * intermediate_internal_nodes:  base reference of the IR internal node array.
 * node_index:                   index into that array of the node handled by
 *                               this cluster. The node with index
 *                               ir_internal_node_count - 1 is treated as root.
 */
void
encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_internal_nodes, uint32_t node_index)
{
/* Each invocation cluster encodes one internal node. */
radv_invocation_cluster cluster;
radv_invocation_cluster_init(cluster, 8);
REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, node_index);
/* Local copy of the node; only bvh_offset is re-read from memory inside the loop. */
vk_ir_box_node src = DEREF(src_node);
bool is_root_node = node_index == DEREF(args.header).ir_internal_node_count - 1;
/*
 * Wait loop: iterates (via continue) while this node's bvh_offset is still
 * VK_UNKNOWN_BVH_OFFSET. The offset is written by whichever cluster encodes
 * the parent (see the bvh_offset stores further down). The body runs to
 * completion exactly once and then breaks.
 */
for (;;) {
/* Make changes to the current node's BVH offset value visible. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
/* Only the first invocation of the cluster loads the offset; it is then broadcast to the whole cluster. */
uint32_t bvh_offset;
if (cluster.invocation_index == 0) {
/* The root has a fixed location and does not wait for a parent. */
bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
}
bvh_offset = radv_read_invocation(cluster, 0, bvh_offset);
/* Not assigned yet: try again. */
if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
continue;
/* This node was collapsed into its parent (see the VK_NULL_BVH_OFFSET store below): nothing to encode. */
if (bvh_offset == VK_NULL_BVH_OFFSET)
break;
REF(radv_gfx12_box_node) dst = REF(radv_gfx12_box_node)(args.output_base + (args.output_bvh_offset + bvh_offset));
uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
/* Invocations 0 and 1 start with the two IR children; the other six start without a child. */
uint32_t child = RADV_BVH_INVALID_NODE;
if (cluster.invocation_index < 2)
child = src.children[cluster.invocation_index];
/*
 * Collapse loop: each iteration replaces one internal child by its own two
 * children, so that the output node gets more than two children. It stops
 * when invocation 7 holds a child, when no valid child is left, or when no
 * internal child remains.
 */
while (true) {
uint32_t valid_children = radv_ballot(cluster, child != RADV_BVH_INVALID_NODE);
/* 0x80 is the ballot bit of invocation 7 of the cluster. */
if ((valid_children & 0x80) != 0 || valid_children == 0)
break;
/* Invocations without a valid internal child keep -1.0 and are additionally excluded from the ballot below. */
float surface_area = -1.0;
bool is_valid_internal = child != RADV_BVH_INVALID_NODE && ir_id_to_type(child) == vk_ir_node_internal;
if (is_valid_internal) {
vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child))).aabb;
surface_area = aabb_surface_area(child_aabb);
}
/* Pick the internal child with the largest surface area; ties go to the lowest invocation index. */
float max_surface_area = subgroupClusteredMax(surface_area, 8);
uint32_t collapse_index = findLSB(radv_ballot(cluster, is_valid_internal && surface_area == max_surface_area));
/* findLSB of an empty ballot is -1, i.e. 0xffffffff once stored as unsigned: no internal child left. */
if (collapse_index == 0xffffffff)
break;
uint32_t right;
if (cluster.invocation_index == collapse_index) {
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child));
/* The collapsed node gets no output node; the cluster waiting on it sees this value and breaks out. */
DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
uint32_t left = DEREF(child_node).children[0];
right = DEREF(child_node).children[1];
/* If only the second grandchild exists, move it into the left slot. */
if (left == RADV_BVH_INVALID_NODE) {
left = right;
right = RADV_BVH_INVALID_NODE;
}
/* The collapsing invocation keeps the left grandchild. */
child = left;
}
right = radv_read_invocation(cluster, collapse_index, right);
/* The right grandchild goes to the invocation just above the highest one that held a child at the start of this iteration. */
if (cluster.invocation_index == findMSB(valid_children) + 1)
child = right;
}
bool is_valid = child != RADV_BVH_INVALID_NODE;
bool is_valid_primitive = is_valid && ir_id_to_type(child) != vk_ir_node_internal;
bool is_valid_internal = is_valid && ir_id_to_type(child) == vk_ir_node_internal;
uint32_t child_leaf_node_count = bitCount(radv_ballot(cluster, is_valid_primitive));
uint32_t child_internal_node_count = bitCount(radv_ballot(cluster, is_valid_internal));
/* Output size of one leaf: instances occupy two RADV_GFX12_BVH_NODE_SIZE units, triangles and AABBs one. */
uint32_t leaf_node_size;
switch (args.geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
case VK_GEOMETRY_TYPE_AABBS_KHR:
leaf_node_size = RADV_GFX12_BVH_NODE_SIZE;
break;
default:
/* instances */
leaf_node_size = 2 * RADV_GFX12_BVH_NODE_SIZE;
break;
}
uint32_t child_leaf_nodes_size = child_leaf_node_count * leaf_node_size;
uint32_t child_internal_nodes_size = child_internal_node_count * RADV_GFX12_BVH_NODE_SIZE;
/* One invocation reserves output space for all children of this node at once and broadcasts the base offsets. */
uint32_t dst_leaf_offset;
uint32_t dst_internal_offset;
if (cluster.invocation_index == 0) {
dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size);
dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size);
}
dst_leaf_offset = radv_read_invocation(cluster, 0, dst_leaf_offset);
dst_internal_offset = radv_read_invocation(cluster, 0, dst_internal_offset);
uint32_t child_index = 0;
uint32_t dst_offset = 0;
if (is_valid_internal) {
/*
 * The ballot is evaluated inside the branch, so only invocations that took
 * this branch are expected to contribute (inactive invocations vote 0).
 * Masking with the lower invocation bits gives this invocation's rank among
 * the internal children. NOTE(review): relies on the other invocations of
 * the cluster being inactive here - confirm against subgroup semantics.
 */
child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
dst_offset = dst_internal_offset + child_index * RADV_GFX12_BVH_NODE_SIZE;
uint32_t offset = ir_id_to_offset(child);
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
/* Publish the child's output location; this is the value the child's own cluster waits for at the top of the loop. */
DEREF(child_node).bvh_offset = dst_offset;
}
if (is_valid_primitive) {
/* Same ranking scheme as above, this time among the leaf children. */
child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
dst_offset = dst_leaf_offset + child_index * leaf_node_size;
/* In the children array, leaves are stored after all internal children. */
child_index += child_internal_node_count;
}
vec3 origin = src.base.aabb.min;
vec3 extent = src.base.aabb.max - src.base.aabb.min;
/* Adding the full mantissa mask and keeping only the exponent field rounds each component up to a power of two. */
extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
/* Biased float exponents of the rounded extent. */
uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
uint32_t valid_child_count = child_leaf_node_count + child_internal_node_count;
/* The per-node fields are identical for the whole cluster, so a single invocation writes them. */
if (cluster.invocation_index == 0) {
DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0);
DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0);
DEREF(dst).origin = origin;
/* x/y/z exponents at bits 0, 8 and 16; (child count - 1) starting at bit 28. */
DEREF(dst).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
(extent_exponents.z << 16) | ((valid_child_count - 1) << 28);
/* NOTE(review): 0x7f presumably means "no OBB matrix" - confirm against the hardware documentation. */
DEREF(dst).obb_matrix_index = 0x7f;
}
if (is_valid) {
uint32_t type = ir_id_to_type(child);
uint32_t offset = ir_id_to_offset(child);
/* Defaults apply to triangle and AABB leaves; internal and instance children override them below. */
uint32_t child_node_size_128b = 1;
uint32_t encoded_type = 0;
uint32_t cull_mask = 0xff;
if (type == vk_ir_node_internal) {
encoded_type = 5;
} else {
/* Write leaf node offset. */
uint32_t leaf_index = offset / ir_leaf_node_size;
REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset);
child_dst_offset = INDEX(uint32_t, child_dst_offset, leaf_index);
DEREF(child_dst_offset) = dst_offset;
/* Encode the leaf itself at the offset reserved for it above. */
VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_offset;
switch (args.geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
vk_ir_triangle_node src_node = DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, offset)));
radv_encode_triangle_gfx12(dst_leaf_addr, src_node);
break;
}
case VK_GEOMETRY_TYPE_AABBS_KHR: {
vk_ir_aabb_node src_node = DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, offset)));
radv_encode_aabb_gfx12(dst_leaf_addr, src_node);
break;
}
default:
/* instances */
encoded_type = 6;
child_node_size_128b = 2;
vk_ir_instance_node src_node = DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, offset)));
radv_encode_instance_gfx12(dst_leaf_addr, src_node);
/* The cull mask is taken from the top 8 bits of custom_instance_and_mask. */
cull_mask = src_node.custom_instance_and_mask >> 24;
break;
}
}
vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, offset)).aabb;
/*
 * Quantize the child bounds to 12 bits per axis relative to the parent's
 * origin and power-of-two extent (0x1000 steps per axis). Minimums are
 * rounded down, maximums are stored as ceil(...) - 1, and every value is
 * clamped to 0xfff.
 */
radv_gfx12_box_child box_child;
/* TODO: subtree flags culling */
box_child.dword0 =
min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
(min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
/* TODO: subtree mask culling */
box_child.dword1 =
min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
(min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12) |
(cull_mask << 24);
box_child.dword2 =
min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
(min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12) |
(encoded_type << 24) | (child_node_size_128b << 28);
DEREF(dst).children[child_index] = box_child;
set_parent(pack_node_id(dst_offset, encoded_type), node_id);
} else {
/* Invocations without a child fill the slots that follow the valid children, ranked among themselves the same way. */
child_index =
bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1)) + valid_child_count;
/* With the packing used for valid children, this sets every minimum field to 0xfff and every maximum field to 0. */
radv_gfx12_box_child null_child;
null_child.dword0 = 0xffffffff;
null_child.dword1 = 0xfff;
null_child.dword2 = 0;
DEREF(dst).children[child_index] = null_child;
}
/* Make changes to the children's BVH offset value available to the other invocations. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
break;
}
/* The cluster handling the root also fills in the acceleration structure header; one invocation is enough. */
if (is_root_node && cluster.invocation_index == 0) {
REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(args.output_base);
DEREF(header).aabb = src.base.aabb;
DEREF(header).bvh_offset = args.output_bvh_offset;
set_parent(RADV_BVH_ROOT_NODE, RADV_BVH_INVALID_NODE);
}
}
void
main()
{
if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count)
return;
/* Revert the order so we start at the root */
uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x;
uint32_t ir_leaf_node_size;
switch (args.geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
@ -65,211 +290,17 @@ main()
}
uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * ir_leaf_node_size;
uint32_t dst_internal_offset = id_to_offset(RADV_BVH_ROOT_NODE);
REF(vk_ir_box_node) intermediate_internal_nodes =
REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size);
REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id);
vk_ir_box_node src = DEREF(src_node);
bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1;
uint32_t ir_internal_node_count = DEREF(args.header).ir_internal_node_count;
uint32_t encode_invocation_count = ir_internal_node_count * 8;
for (;;) {
/* Make changes to the current node's BVH offset value visible. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
uint32_t global_id = gl_GlobalInvocationID.x;
if (global_id >= encode_invocation_count)
return;
uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
continue;
if (bvh_offset == VK_NULL_BVH_OFFSET)
break;
REF(radv_gfx12_box_node) dst = REF(radv_gfx12_box_node)(args.output_base + (args.output_bvh_offset + bvh_offset));
uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
uint32_t children[8];
uint32_t found_child_count = 0;
for (uint32_t i = 0; i < 2; i++) {
if (src.children[i] != RADV_BVH_INVALID_NODE) {
children[found_child_count] = src.children[i];
found_child_count++;
}
}
/* TODO: Collapse child nodes with high SAH values. */
while (found_child_count < 8) {
bool progress = false;
for (int32_t i = 0; i < found_child_count; i++) {
uint32_t child_id = children[i];
if (ir_id_to_type(child_id) != vk_ir_node_internal)
continue;
progress = true;
REF(vk_ir_box_node) child_node =
REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, ir_id_to_offset(child_id));
uint32_t grandchildren[2] = DEREF(child_node).children;
uint32_t valid_grandchild_count = 0;
if (grandchildren[1] != RADV_BVH_INVALID_NODE)
valid_grandchild_count++;
if (grandchildren[0] != RADV_BVH_INVALID_NODE)
valid_grandchild_count++;
else
grandchildren[0] = grandchildren[1];
if (valid_grandchild_count > 1) {
children[found_child_count] = grandchildren[1];
found_child_count++;
}
if (valid_grandchild_count > 0) {
children[i] = grandchildren[0];
} else {
found_child_count--;
children[i] = children[found_child_count];
}
DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
if (found_child_count == 8)
break;
}
if (!progress)
break;
}
uint32_t child_leaf_nodes_size = 0;
uint32_t child_internal_nodes_size = 0;
for (uint32_t i = 0; i < found_child_count; i++) {
uint32_t type = ir_id_to_type(children[i]);
if (type == vk_ir_node_internal)
child_internal_nodes_size += RADV_GFX12_BVH_NODE_SIZE;
else if (type == vk_ir_node_instance)
child_leaf_nodes_size += 2 * RADV_GFX12_BVH_NODE_SIZE;
else
child_leaf_nodes_size += RADV_GFX12_BVH_NODE_SIZE;
}
uint32_t dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size);
uint32_t dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size);
vec3 origin = src.base.aabb.min;
vec3 extent = src.base.aabb.max - src.base.aabb.min;
extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0);
DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0);
DEREF(dst).origin = origin;
DEREF(dst).child_count_exponents =
extent_exponents.x | (extent_exponents.y << 8) | (extent_exponents.z << 16) | ((found_child_count - 1) << 28);
DEREF(dst).obb_matrix_index = 0x7f;
for (uint32_t i = 0; i < found_child_count; i++) {
uint32_t child_id = children[i];
uint32_t type = ir_id_to_type(child_id);
uint32_t offset = ir_id_to_offset(child_id);
uint32_t child_node_size_128b = 1;
uint32_t encoded_type = 0;
uint32_t dst_offset = 0;
uint32_t cull_mask = 0xff;
if (type == vk_ir_node_internal) {
encoded_type = 5;
dst_offset = dst_internal_offset;
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
DEREF(child_node).bvh_offset = dst_internal_offset;
dst_internal_offset += RADV_GFX12_BVH_NODE_SIZE;
} else {
dst_offset = dst_leaf_offset;
/* Write leaf node offset. */
uint32_t child_index = offset / ir_leaf_node_size;
REF(uint32_t) child_dst_offset = REF(uint32_t)(args.output_base + args.leaf_node_offsets_offset);
child_dst_offset = INDEX(uint32_t, child_dst_offset, child_index);
DEREF(child_dst_offset) = dst_offset;
VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_leaf_offset;
switch (args.geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
vk_ir_triangle_node src_node = DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, offset)));
radv_encode_triangle_gfx12(dst_leaf_addr, src_node);
dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE;
break;
}
case VK_GEOMETRY_TYPE_AABBS_KHR: {
vk_ir_aabb_node src_node = DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, offset)));
radv_encode_aabb_gfx12(dst_leaf_addr, src_node);
dst_leaf_offset += RADV_GFX12_BVH_NODE_SIZE;
break;
}
default:
/* instances */
encoded_type = 6;
child_node_size_128b = 2;
vk_ir_instance_node src_node = DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, offset)));
radv_encode_instance_gfx12(dst_leaf_addr, src_node);
cull_mask = src_node.custom_instance_and_mask >> 24;
dst_leaf_offset += 2 * RADV_GFX12_BVH_NODE_SIZE;
break;
}
}
vk_aabb child_aabb = DEREF(REF(vk_ir_node) OFFSET(args.intermediate_bvh, offset)).aabb;
radv_gfx12_box_child child;
/* TODO: subtree flags culling */
child.dword0 = min(uint32_t(floor((child_aabb.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
(min(uint32_t(floor((child_aabb.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
/* TODO: subtree mask culling */
child.dword1 =
min(uint32_t(floor((child_aabb.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
(min(uint32_t(ceil((child_aabb.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12) |
(cull_mask << 24);
child.dword2 =
min(uint32_t(ceil((child_aabb.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
(min(uint32_t(ceil((child_aabb.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12) |
(encoded_type << 24) | (child_node_size_128b << 28);
DEREF(dst).children[i] = child;
set_parent(pack_node_id(dst_offset, encoded_type), node_id);
}
/* Set remaining children to invalid */
for (uint32_t i = found_child_count; i < 8; i++) {
radv_gfx12_box_child null_child;
null_child.dword0 = 0xffffffff;
null_child.dword1 = 0xfff;
null_child.dword2 = 0;
DEREF(dst).children[i] = null_child;
}
/* Make changes to the children's BVH offset value available to the other invocations. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
break;
}
if (is_root_node) {
REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(args.output_base);
DEREF(header).aabb = src.base.aabb;
DEREF(header).bvh_offset = args.output_bvh_offset;
set_parent(RADV_BVH_ROOT_NODE, RADV_BVH_INVALID_NODE);
}
/* Revert the order so we start at the root */
uint32_t node_index = ir_internal_node_count - 1 - global_id / 8;
encode_gfx12(ir_leaf_node_size, intermediate_internal_nodes, node_index);
}

View file

@ -0,0 +1,39 @@
/*
* Copyright © 2025 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
/* Helpers for splitting a subgroup into fixed-size clusters of invocations that cooperate on one task. */
#ifndef RADV_BVH_INVOCATION_CLUSTER_H
#define RADV_BVH_INVOCATION_CLUSTER_H
/* Position of the calling invocation within a cluster of consecutive subgroup invocations. */
struct radv_invocation_cluster {
/* Index of the invocation inside its cluster: gl_SubgroupInvocationID & (cluster_size - 1). */
uint32_t invocation_index;
/* Index of the cluster inside the subgroup: gl_SubgroupInvocationID / cluster_size. */
uint32_t cluster_index;
/* Number of invocations per cluster. */
uint32_t cluster_size;
};
/*
 * Fills `cluster` with the calling invocation's position for clusters of
 * `cluster_size` consecutive subgroup invocations.
 *
 * cluster_size has to be a power of two and <32.
 */
void
radv_invocation_cluster_init(out radv_invocation_cluster cluster, uint32_t cluster_size)
{
   /* For a power-of-two size, size - 1 selects the bits that give the position inside the cluster. */
   uint32_t lane_mask = cluster_size - 1;

   cluster.cluster_size = cluster_size;
   cluster.cluster_index = gl_SubgroupInvocationID / cluster_size;
   cluster.invocation_index = gl_SubgroupInvocationID & lane_mask;
}
/*
 * Reads `value` from the invocation at position `index` inside the caller's
 * cluster. The first invocation of the cluster is found by clearing the low
 * bits of gl_SubgroupInvocationID, which requires cluster_size to be a power
 * of two.
 *
 * All macro arguments are parenthesized so that expressions such as
 * `a << 1` or `c ? x : y` can be passed as `index` without changing the
 * lane computation.
 */
#define radv_read_invocation(cluster, index, value) \
   subgroupShuffle((value), (gl_SubgroupInvocationID & (~((cluster).cluster_size - 1))) + (index))
/*
 * Ballot restricted to the caller's cluster: bit i of the result is the vote
 * of the invocation at position i of the cluster. Only the first two
 * components of the subgroup ballot (64 invocations) are considered.
 */
uint32_t
radv_ballot(radv_invocation_cluster cluster, bool value)
{
   /* One bit per invocation of the cluster. */
   uint32_t cluster_mask = (1u << cluster.cluster_size) - 1;
   /* Subgroup invocation index of the first invocation of this cluster. */
   uint32_t first_lane = gl_SubgroupInvocationID & (~(cluster.cluster_size - 1));

   uvec4 votes = subgroupBallot(value);
   uint64_t votes64 = (uint64_t(votes.y) << 32ul) | uint64_t(votes.x);

   return uint32_t((votes64 >> first_lane) & cluster_mask);
}
#endif

View file

@ -540,11 +540,20 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructur
struct acceleration_structure_layout layout;
radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout);
uint32_t dst_internal_nodes_offset = layout.internal_nodes_offset - layout.bvh_offset;
uint32_t dst_leaf_nodes_offset = layout.leaf_nodes_offset - layout.bvh_offset;
uint32_t offsets[2] = {dst_internal_nodes_offset, dst_leaf_nodes_offset};
radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, dst_node_offset), offsets,
sizeof(offsets));
struct vk_ir_header header = {
.sync_data =
{
.current_phase_end_counter = TASK_INDEX_INVALID,
/* Will be updated by the first PLOC shader invocation */
.task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID},
},
.dst_node_offset = layout.internal_nodes_offset - layout.bvh_offset,
.dst_leaf_node_offset = layout.leaf_nodes_offset - layout.bvh_offset,
};
const uint8_t *update_data = ((const uint8_t *)&header + offsetof(struct vk_ir_header, sync_data));
radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, sync_data), update_data,
sizeof(struct vk_ir_header) - offsetof(struct vk_ir_header, sync_data));
if (radv_device_physical(device)->info.cp_sdma_ge_use_system_memory_scope)
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2;
@ -560,10 +569,11 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const VkAccelerationStructur
vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args);
uint32_t internal_count = MAX2(leaf_count, 2) - 1;
struct radv_dispatch_info dispatch = {
.unaligned = true,
.ordered = true,
.blocks = {MAX2(leaf_count, 1), 1, 1},
.blocks = {DIV_ROUND_UP(internal_count * 8, 64), 1, 1},
};
radv_compute_dispatch(cmd_buffer, &dispatch);
@ -664,9 +674,8 @@ radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBui
geometry_infos[i].primitive_count = build_range_infos[i].primitiveCount;
}
radv_CmdUpdateBuffer(commandBuffer, vk_buffer_to_handle(dst->buffer),
dst->offset + layout.geometry_info_offset, geometry_infos_size,
geometry_infos);
radv_CmdUpdateBuffer(commandBuffer, vk_buffer_to_handle(dst->buffer), dst->offset + layout.geometry_info_offset,
geometry_infos_size, geometry_infos);
free(geometry_infos);
}