radv: Add an option for dumping BVH stats

The option reuses the BVH dumping already implemented for RRA to gather
statistics about BVHs on the CPU and writes them to a CSV file. This CSV
file can then be compared using a tool similar to report-fossils to
judge the impact of changes to the BVH build code.

Reviewed-by: Natalie Vock <natalie.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38463>
This commit is contained in:
Konstantin Seurer 2025-11-15 09:57:37 +01:00 committed by Marge Bot
parent 356d88457a
commit 8c10eab1f3
7 changed files with 397 additions and 3 deletions

View file

@ -374,7 +374,16 @@ rra_QueueSubmit2KHR(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *p
struct radv_device *device = radv_queue_device(queue);
VkResult result = device->layer_dispatch.rra.QueueSubmit2KHR(_queue, submitCount, pSubmits, _fence);
if (result != VK_SUCCESS || !device->rra_trace.triggered)
if (result != VK_SUCCESS)
return result;
if (radv_bvh_stats_file()) {
result = radv_dump_bvh_stats(_queue);
if (result != VK_SUCCESS)
return result;
}
if (!device->rra_trace.triggered)
return result;
uint32_t total_trace_count = 0;

View file

@ -716,7 +716,7 @@ radv_device_init_tools(struct radv_device *device)
if (result != VK_SUCCESS)
return result;
if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev)) {
if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev)) {
result = radv_rra_trace_init(device);
if (result != VK_SUCCESS)
return result;
@ -808,7 +808,7 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd
if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE);
if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev))
if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev))
add_entrypoints(&b, &rra_device_entrypoints, RADV_RRA_DISPATCH_TABLE);
#ifndef _WIN32

View file

@ -116,4 +116,17 @@ const char *radv_get_perftest_option_name(int id);
bool radv_is_rt_wave64_enabled(const struct radv_instance *instance);
/*
 * Returns the value of the RADV_BVH_STATS_FILE option as reported by
 * os_get_option_secure(). Callers treat a NULL result as "BVH stats dumping
 * is disabled" and a non-NULL result as the path of the stats file.
 *
 * Declared inline (with a real prototype) because it lives in a header:
 * a plain static function would trigger unused-function warnings in every
 * translation unit that includes the header without calling it.
 */
static inline const char *
radv_bvh_stats_file(void)
{
   return os_get_option_secure("RADV_BVH_STATS_FILE");
}
/*
 * Returns true when the BVH dumping infrastructure has to be set up: either
 * RRA tracing is enabled in the instance trace mode, or a BVH stats file has
 * been requested through RADV_BVH_STATS_FILE.
 *
 * Declared inline because it is defined in a header; a plain static function
 * would be reported as unused in translation units that never call it.
 */
static inline bool
radv_bvh_dumping_enabled(const struct radv_instance *instance)
{
   /* Gathering bvh stats uses a large part of the rra code for dumping bvhs. */
   if (instance->vk.trace_mode & RADV_TRACE_MODE_RRA)
      return true;

   return radv_bvh_stats_file() != NULL;
}
#endif /* RADV_INSTANCE_H */

View file

@ -489,6 +489,10 @@ radv_rra_trace_init(struct radv_device *device)
device->rra_trace.ray_history = UTIL_DYNARRAY_INIT;
/* BVH stats dumping does not need ray history. */
if (!(radv_physical_device_instance(pdev)->vk.trace_mode & RADV_TRACE_MODE_RRA))
return VK_SUCCESS;
device->rra_trace.ray_history_buffer_size = debug_get_num_option("RADV_RRA_TRACE_HISTORY_SIZE", 100 * 1024 * 1024);
if (device->rra_trace.ray_history_buffer_size <
sizeof(struct radv_ray_history_header) + sizeof(struct radv_packed_end_trace_token))
@ -625,6 +629,9 @@ radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data)
simple_mtx_destroy(&data->data_mtx);
_mesa_hash_table_destroy(data->accel_structs, NULL);
_mesa_hash_table_u64_destroy(data->accel_struct_vas);
if (data->stats_file)
fclose(data->stats_file);
}
void
@ -1298,3 +1305,168 @@ cleanup:
free(accel_struct_offsets);
return result;
}
static void
dump_bvh_stats(struct radv_device *device, struct vk_acceleration_structure *accel_struct,
struct radv_rra_accel_struct_data *accel_struct_data, uint8_t *data, struct hash_table_u64 *blas_sah,
bool tlas_pass)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_instance *instance = radv_physical_device_instance(pdev);
struct radv_accel_struct_header *header = (struct radv_accel_struct_header *)data;
bool is_tlas = header->instance_count > 0;
if (is_tlas != tlas_pass)
return;
/* convert root node id to offset */
uint32_t src_root_offset = (RADV_BVH_ROOT_NODE & ~7) << 3;
if (rra_validate_header(accel_struct_data, header)) {
return;
}
if (radv_use_bvh8(pdev)) {
if (rra_validate_node_gfx12(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
data + header->bvh_offset + src_root_offset, header->geometry_count,
accel_struct_data->size, !is_tlas, 0)) {
return;
}
} else {
if (rra_validate_node_gfx10_3(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
data + header->bvh_offset + src_root_offset, header->geometry_count,
accel_struct_data->size, !is_tlas, 0)) {
return;
}
}
if (!device->rra_trace.stats_file) {
device->rra_trace.stats_file = fopen(radv_bvh_stats_file(), "w");
fprintf(device->rra_trace.stats_file, "app,name,type,allocated_size,compacted_size");
if (radv_use_bvh8(pdev)) {
fprintf(device->rra_trace.stats_file, ",max_depth,box_node_count,primitive_node_count,instance_node_count");
} else {
fprintf(device->rra_trace.stats_file, ",max_depth,box16_node_count,box32_node_count,triangle_node_count,"
"instance_node_count,procedural_node_count");
}
fprintf(device->rra_trace.stats_file, ",sah,scene_sah\n");
}
fprintf(device->rra_trace.stats_file, "\"%s\",%s,%s,%" PRIu64 ",%" PRIu64, instance->vk.app_info.app_name,
vk_object_base_name(&accel_struct->base), is_tlas ? "tlas" : "blas", accel_struct_data->size,
header->compacted_size);
float extent[3] = {
header->aabb.max.x - header->aabb.min.x,
header->aabb.max.y - header->aabb.min.y,
header->aabb.max.z - header->aabb.min.z,
};
float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
float sah;
float instance_sah;
if (radv_use_bvh8(pdev)) {
struct radv_bvh_stats_gfx12 stats = {};
radv_gather_bvh_stats_gfx12(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats);
sah = stats.sah;
instance_sah = stats.instance_sah;
fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u", stats.max_depth, stats.box_node_count,
stats.primitive_node_count, stats.instance_node_count);
} else {
struct radv_bvh_stats_gfx10_3 stats = {};
radv_gather_bvh_stats_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats);
sah = stats.sah;
instance_sah = stats.instance_sah;
fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u,%u,%u", stats.max_depth, stats.box16_node_count,
stats.box32_node_count, stats.triangle_node_count, stats.instance_node_count,
stats.procedural_node_count);
}
fprintf(device->rra_trace.stats_file, ",%u", (uint32_t)(sah / surface_area * 1000000));
if (is_tlas) {
fprintf(device->rra_trace.stats_file, ",%u\n", (uint32_t)((sah + instance_sah) / surface_area * 1000000));
} else {
fprintf(device->rra_trace.stats_file, ",0\n");
float *sah_ptr = ralloc(blas_sah, float);
*sah_ptr = sah / surface_area;
_mesa_hash_table_u64_insert(blas_sah, vk_acceleration_structure_get_va(accel_struct), sah_ptr);
}
fflush(device->rra_trace.stats_file);
}
/*
 * Gathers statistics for every acceleration structure tracked in
 * device->rra_trace.accel_structs and appends them to the BVH stats file.
 *
 * The device is idled first. Structures are then processed in two passes:
 * the first pass handles BLASes and records their SAH in a VA-keyed table,
 * the second pass handles the remaining structures (TLASes), which look up
 * the SAH of the BLASes they reference.
 *
 * Returns VK_SUCCESS on success (including when there is nothing to dump),
 * the error from vk_common_DeviceWaitIdle() or rra_copy_context_init(), or
 * VK_ERROR_OUT_OF_HOST_MEMORY if a host allocation fails.
 */
VkResult
radv_dump_bvh_stats(VkQueue vk_queue)
{
   VK_FROM_HANDLE(radv_queue, queue, vk_queue);
   struct radv_device *device = radv_queue_device(queue);
   VkDevice vk_device = radv_device_to_handle(device);

   VkResult result = vk_common_DeviceWaitIdle(vk_device);
   if (result != VK_SUCCESS)
      return result;

   /* Nothing to dump. Returning here also avoids malloc(0), which may
    * legitimately return NULL and would otherwise be reported as an
    * out-of-memory error to the caller.
    */
   uint32_t struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs);
   if (struct_count == 0)
      return VK_SUCCESS;

   struct hash_entry **hash_entries = NULL;
   struct hash_table_u64 *blas_sah = NULL;

   hash_entries = malloc(sizeof(*hash_entries) * struct_count);
   if (!hash_entries) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto cleanup;
   }

   struct hash_entry *last_entry = NULL;
   for (unsigned i = 0; (last_entry = _mesa_hash_table_next_entry(device->rra_trace.accel_structs, last_entry)); ++i)
      hash_entries[i] = last_entry;

   qsort(hash_entries, struct_count, sizeof(*hash_entries), accel_struct_entry_cmp);

   /* Created before the copy context so that a failure needs no extra
    * teardown beyond the common cleanup path.
    */
   blas_sah = _mesa_hash_table_u64_create(NULL);
   if (!blas_sah) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto cleanup;
   }

   struct rra_copy_context copy_ctx = {
      .device = vk_device,
      .queue = vk_queue,
      .entries = hash_entries,
      .family_index = queue->vk.queue_family_index,
      .min_size = device->rra_trace.ray_history_buffer_size,
   };

   result = rra_copy_context_init(&copy_ctx);
   if (result != VK_SUCCESS)
      goto cleanup;

   /* Pass 1: BLASes. Each dumped BLAS registers its SAH in blas_sah. */
   for (unsigned i = 0; i < struct_count; i++) {
      void *mapped_data = rra_map_accel_struct_data(&copy_ctx, i);
      if (!mapped_data)
         continue;

      dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, false);
      rra_unmap_accel_struct_data(&copy_ctx, i);
   }

   /* Pass 2: TLASes. Anything already present in blas_sah was handled above. */
   for (unsigned i = 0; i < struct_count; i++) {
      if (_mesa_hash_table_u64_search(blas_sah, vk_acceleration_structure_get_va(hash_entries[i]->key)))
         continue;

      void *mapped_data = rra_map_accel_struct_data(&copy_ctx, i);
      if (!mapped_data)
         continue;

      dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, true);
      rra_unmap_accel_struct_data(&copy_ctx, i);
   }

   rra_copy_context_finish(&copy_ctx);

   result = VK_SUCCESS;

cleanup:
   _mesa_hash_table_u64_destroy(blas_sah);
   free(hash_entries);
   return result;
}

View file

@ -107,6 +107,7 @@ struct radv_rra_trace_data {
struct hash_table *accel_structs;
struct hash_table_u64 *accel_struct_vas;
simple_mtx_t data_mtx;
FILE *stats_file;
bool validate_as;
bool copy_after_build;
bool triggered;
@ -322,4 +323,32 @@ void rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_
void rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id,
uint32_t dst_offset);
/* Statistics accumulated by radv_gather_bvh_stats_gfx10_3() while walking a BVH. */
struct radv_bvh_stats_gfx10_3 {
/* Largest depth value seen for any visited node. */
uint32_t max_depth;
/* Sum over all visited nodes of (per-node-type cost factor * node surface area). */
float sah;
/* Sum over instance nodes of (SAH stored for the referenced BLAS * instance surface area). */
float instance_sah;
/* Number of visited nodes of each type. */
uint32_t box16_node_count;
uint32_t box32_node_count;
uint32_t triangle_node_count;
uint32_t instance_node_count;
uint32_t procedural_node_count;
};
/* Statistics accumulated by radv_gather_bvh_stats_gfx12() while walking a BVH. */
struct radv_bvh_stats_gfx12 {
/* Largest depth value seen for any visited node. */
uint32_t max_depth;
/* Sum over all visited nodes of (per-node-type cost factor * node surface area). */
float sah;
/* Sum over instance nodes of (SAH stored for the referenced BLAS * instance surface area). */
float instance_sah;
/* Number of visited box, triangle and instance nodes respectively. */
uint32_t box_node_count;
uint32_t primitive_node_count;
uint32_t instance_node_count;
};
/*
 * Recursively accumulates statistics for the node `node_id` of `bvh` and all
 * of its descendants into `stats`.
 *
 * depth    - depth of this node; children are visited with depth + 1.
 * p        - surface area of this node's bounding box, used to weight its cost.
 * blas_sah - table searched by BLAS address for a float SAH value when an
 *            instance node is visited.
 */
void radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats);
/*
 * Recursively accumulates statistics for the node `node_id` of `bvh` and all
 * of its descendants into `stats`.
 *
 * depth    - depth of this node; children are visited with depth + 1.
 * p        - surface area of this node's bounding box, used to weight its cost.
 * blas_sah - table searched by BLAS address for a float SAH value when an
 *            instance node is visited.
 */
void radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats);
/*
 * Waits for the device to become idle, then gathers statistics for the
 * tracked acceleration structures and writes them to the BVH stats file.
 * Returns VK_SUCCESS or the error that interrupted the dump.
 */
VkResult radv_dump_bvh_stats(VkQueue vk_queue);
#endif /* RADV_RRA_H */

View file

@ -357,3 +357,78 @@ rra_transcode_node_gfx10_3(struct rra_transcoding_context *ctx, uint32_t parent_
return dst_id;
}
/* Surface area of an axis-aligned box with edge lengths dx, dy and dz. */
static float
bvh_stats_box_area_gfx10_3(float dx, float dy, float dz)
{
   return 2 * (dx * dy + dx * dz + dy * dz);
}

/*
 * Walks the subtree rooted at `node_id` and accumulates node counts, the
 * maximum depth and a surface-area-weighted cost into `stats`.
 *
 * The low three bits of a node id select the node type; the remaining bits,
 * shifted left by three, give the node's byte offset from `bvh`. `p` is the
 * surface area of this node's bounding box as computed by the caller.
 * Instance nodes look up the SAH of the referenced BLAS in `blas_sah`; a
 * missing entry is reported on stderr.
 */
void
radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
                              struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats)
{
   const uint32_t type = node_id & 7;
   const void *node = bvh + ((node_id & (~7u)) << 3);

   if (stats->max_depth < depth)
      stats->max_depth = depth;

   /* Leaf node types: only a cost contribution and a counter. */
   if (type == radv_bvh_node_triangle) {
      stats->sah += 2.0 * p;
      stats->triangle_node_count++;
      return;
   }

   if (type == radv_bvh_node_aabb) {
      stats->sah += 4.0 * p;
      stats->procedural_node_count++;
      return;
   }

   if (type == radv_bvh_node_instance) {
      const struct radv_bvh_instance_node *instance = node;

      stats->sah += 2.0 * p;
      stats->instance_node_count++;

      /* Address of the BLAS header, which is the key used by blas_sah. */
      uint64_t blas_va = radv_node_to_addr(instance->bvh_ptr) - instance->bvh_offset;
      float *blas_cost = _mesa_hash_table_u64_search(blas_sah, blas_va);
      if (!blas_cost) {
         fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%" PRIx64 "\n", blas_va);
         return;
      }

      stats->instance_sah += *blas_cost * p;
      return;
   }

   /* Box nodes: account for the node itself, then recurse into each valid
    * child using the surface area of the child's bounds stored in the parent.
    */
   if (type == radv_bvh_node_box16) {
      const struct radv_bvh_box16_node *box16 = node;

      stats->sah += 1.0 * p;
      stats->box16_node_count++;

      for (uint32_t slot = 0; slot < 4; slot++) {
         if (box16->children[slot] == 0xffffffff)
            continue;

         float dx = _mesa_half_to_float(box16->coords[slot].max_x) - _mesa_half_to_float(box16->coords[slot].min_x);
         float dy = _mesa_half_to_float(box16->coords[slot].max_y) - _mesa_half_to_float(box16->coords[slot].min_y);
         float dz = _mesa_half_to_float(box16->coords[slot].max_z) - _mesa_half_to_float(box16->coords[slot].min_z);

         radv_gather_bvh_stats_gfx10_3(bvh, box16->children[slot], depth + 1,
                                       bvh_stats_box_area_gfx10_3(dx, dy, dz), blas_sah, stats);
      }
   } else if (type == radv_bvh_node_box32) {
      const struct radv_bvh_box32_node *box32 = node;

      stats->sah += 1.5 * p;
      stats->box32_node_count++;

      for (uint32_t slot = 0; slot < 4; slot++) {
         if (box32->children[slot] == 0xffffffff)
            continue;

         float dx = box32->coords[slot].max.x - box32->coords[slot].min.x;
         float dy = box32->coords[slot].max.y - box32->coords[slot].min.y;
         float dz = box32->coords[slot].max.z - box32->coords[slot].min.z;

         radv_gather_bvh_stats_gfx10_3(bvh, box32->children[slot], depth + 1,
                                       bvh_stats_box_area_gfx10_3(dx, dy, dz), blas_sah, stats);
      }
   }

   /* Any other node type contributes nothing beyond max_depth. */
}

View file

@ -10,6 +10,7 @@
#include "radv_rra.h"
#include "util/bitset.h"
#include "util/compiler.h"
struct rra_instance_sideband_data {
uint32_t instance_index;
@ -307,3 +308,98 @@ rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id
}
}
}
void
radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float surface_area,
struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats)
{
uint32_t node_type = node_id & 0xf;
const void *node = bvh + ((node_id & (~0xf)) << 3);
stats->max_depth = MAX2(stats->max_depth, depth);
switch (node_type) {
case radv_bvh_node_box32: {
stats->box_node_count++;
stats->sah += 0.5 * surface_area;
const struct radv_gfx12_box_node *src = node;
uint32_t valid_child_count_minus_one = src->child_count_exponents >> 28;
if (valid_child_count_minus_one != 0xf) {
uint32_t internal_id = src->internal_base_id;
uint32_t primitive_id = src->primitive_base_id;
uint32_t exponents[3] = {
src->child_count_exponents & 0xff,
(src->child_count_exponents >> 8) & 0xff,
(src->child_count_exponents >> 16) & 0xff,
};
float extent[3] = {
uif(exponents[0] << 23),
uif(exponents[1] << 23),
uif(exponents[2] << 23),
};
for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf;
uint32_t child_size = src->children[i].dword2 >> 28;
uint32_t child_id;
if (child_type == radv_bvh_node_box32) {
child_id = internal_id | child_type;
internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
} else {
child_id = primitive_id | child_type;
primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
}
float min[3] = {
(float)(src->children[i].dword0 & 0xfff) / 0x1000 * extent[0],
(float)((src->children[i].dword0 >> 12) & 0xfff) / 0x1000 * extent[1],
(float)(src->children[i].dword1 & 0xfff) / 0x1000 * extent[2],
};
float max[3] = {
(float)(((src->children[i].dword1 >> 12) & 0xfff) + 1) / 0x1000 * extent[0],
(float)((src->children[i].dword2 & 0xfff) + 1) / 0x1000 * extent[1],
(float)(((src->children[i].dword2 >> 12) & 0xfff) + 1) / 0x1000 * extent[2],
};
float child_extent[3] = {
max[0] - min[0],
max[1] - min[1],
max[2] - min[2],
};
float child_surface_area = 2 * (child_extent[0] * child_extent[1] + child_extent[0] * child_extent[2] +
child_extent[1] * child_extent[2]);
radv_gather_bvh_stats_gfx12(bvh, child_id, depth + 1, child_surface_area, blas_sah, stats);
}
}
break;
}
case radv_bvh_node_instance: {
stats->instance_node_count++;
stats->sah += 0.7 * surface_area;
struct radv_gfx12_instance_node *instance = (struct radv_gfx12_instance_node *)(node);
const struct radv_gfx12_instance_node_user_data *user_data =
(const void *)((const uint8_t *)node + sizeof(struct radv_gfx12_instance_node));
uint64_t blas_va = radv_node_to_addr(instance->pointer_flags_bvh_addr) - user_data->bvh_offset;
float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va);
if (sah)
stats->instance_sah += *sah * surface_area;
else
fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%" PRIx64 "\n", blas_va);
break;
}
case radv_bvh_node_triangle:
stats->primitive_node_count++;
FALLTHROUGH;
default:
stats->sah += 1.0 * surface_area;
break;
}
}