diff --git a/src/amd/vulkan/layers/radv_rra_layer.c b/src/amd/vulkan/layers/radv_rra_layer.c
index b95a1331d45..df0573cd3e0 100644
--- a/src/amd/vulkan/layers/radv_rra_layer.c
+++ b/src/amd/vulkan/layers/radv_rra_layer.c
@@ -374,7 +374,16 @@ rra_QueueSubmit2KHR(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *p
    struct radv_device *device = radv_queue_device(queue);
    VkResult result = device->layer_dispatch.rra.QueueSubmit2KHR(_queue, submitCount, pSubmits, _fence);
 
-   if (result != VK_SUCCESS || !device->rra_trace.triggered)
+   if (result != VK_SUCCESS)
+      return result;
+
+   if (radv_bvh_stats_file()) {
+      result = radv_dump_bvh_stats(_queue);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   if (!device->rra_trace.triggered)
       return result;
 
    uint32_t total_trace_count = 0;
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 8b1097fdf83..fa9cfdee063 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -716,7 +716,7 @@ radv_device_init_tools(struct radv_device *device)
    if (result != VK_SUCCESS)
       return result;
 
-   if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev)) {
+   if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev)) {
       result = radv_rra_trace_init(device);
       if (result != VK_SUCCESS)
          return result;
@@ -808,7 +808,7 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd
    if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
       add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE);
 
-   if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev))
+   if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev))
       add_entrypoints(&b, &rra_device_entrypoints, RADV_RRA_DISPATCH_TABLE);
 
 #ifndef _WIN32
diff --git a/src/amd/vulkan/radv_instance.h b/src/amd/vulkan/radv_instance.h
index 4df6002c12c..e4ed715e495 100644
--- a/src/amd/vulkan/radv_instance.h
+++ b/src/amd/vulkan/radv_instance.h
@@ -116,4 +116,17 @@ const char *radv_get_perftest_option_name(int id);
 
 bool radv_is_rt_wave64_enabled(const struct radv_instance *instance);
 
+static inline const char *
+radv_bvh_stats_file(void)
+{
+   return os_get_option_secure("RADV_BVH_STATS_FILE");
+}
+
+static inline bool
+radv_bvh_dumping_enabled(const struct radv_instance *instance)
+{
+   /* Gathering bvh stats uses a large part of the rra code for dumping bvhs. */
+   return (instance->vk.trace_mode & RADV_TRACE_MODE_RRA) || radv_bvh_stats_file();
+}
+
 #endif /* RADV_INSTANCE_H */
diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c
index bd116a4c93f..bbe267a3226 100644
--- a/src/amd/vulkan/radv_rra.c
+++ b/src/amd/vulkan/radv_rra.c
@@ -489,6 +489,10 @@ radv_rra_trace_init(struct radv_device *device)
 
    device->rra_trace.ray_history = UTIL_DYNARRAY_INIT;
 
+   /* BVH stats dumping does not need ray history. */
+   if (!(radv_physical_device_instance(pdev)->vk.trace_mode & RADV_TRACE_MODE_RRA))
+      return VK_SUCCESS;
+
    device->rra_trace.ray_history_buffer_size = debug_get_num_option("RADV_RRA_TRACE_HISTORY_SIZE", 100 * 1024 * 1024);
    if (device->rra_trace.ray_history_buffer_size <
        sizeof(struct radv_ray_history_header) + sizeof(struct radv_packed_end_trace_token))
@@ -625,6 +629,9 @@ radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data)
    simple_mtx_destroy(&data->data_mtx);
    _mesa_hash_table_destroy(data->accel_structs, NULL);
    _mesa_hash_table_u64_destroy(data->accel_struct_vas);
+
+   if (data->stats_file)
+      fclose(data->stats_file);
 }
 
 void
@@ -1298,3 +1305,174 @@ cleanup:
    free(accel_struct_offsets);
    return result;
 }
+
+static void
+dump_bvh_stats(struct radv_device *device, struct vk_acceleration_structure *accel_struct,
+               struct radv_rra_accel_struct_data *accel_struct_data, uint8_t *data, struct hash_table_u64 *blas_sah,
+               bool tlas_pass)
+{
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+   const struct radv_instance *instance = radv_physical_device_instance(pdev);
+
+   struct radv_accel_struct_header *header = (struct radv_accel_struct_header *)data;
+
+   bool is_tlas = header->instance_count > 0;
+   if (is_tlas != tlas_pass)
+      return;
+
+   /* convert root node id to offset */
+   uint32_t src_root_offset = (RADV_BVH_ROOT_NODE & ~7) << 3;
+
+   if (rra_validate_header(accel_struct_data, header)) {
+      return;
+   }
+   if (radv_use_bvh8(pdev)) {
+      if (rra_validate_node_gfx12(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
+                                  data + header->bvh_offset + src_root_offset, header->geometry_count,
+                                  accel_struct_data->size, !is_tlas, 0)) {
+         return;
+      }
+   } else {
+      if (rra_validate_node_gfx10_3(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
+                                    data + header->bvh_offset + src_root_offset, header->geometry_count,
+                                    accel_struct_data->size, !is_tlas, 0)) {
+         return;
+      }
+   }
+
+   if (!device->rra_trace.stats_file) {
+      device->rra_trace.stats_file = fopen(radv_bvh_stats_file(), "w");
+      /* Bail out instead of crashing in fprintf if the file could not be created. */
+      if (!device->rra_trace.stats_file)
+         return;
+      fprintf(device->rra_trace.stats_file, "app,name,type,allocated_size,compacted_size");
+      if (radv_use_bvh8(pdev)) {
+         fprintf(device->rra_trace.stats_file, ",max_depth,box_node_count,primitive_node_count,instance_node_count");
+      } else {
+         fprintf(device->rra_trace.stats_file, ",max_depth,box16_node_count,box32_node_count,triangle_node_count,"
+                                               "instance_node_count,procedural_node_count");
+      }
+      fprintf(device->rra_trace.stats_file, ",sah,scene_sah\n");
+   }
+
+   fprintf(device->rra_trace.stats_file, "\"%s\",%s,%s,%" PRIu64 ",%" PRIu64, instance->vk.app_info.app_name,
+           vk_object_base_name(&accel_struct->base), is_tlas ? "tlas" : "blas", accel_struct_data->size,
+           header->compacted_size);
+
+   float extent[3] = {
+      header->aabb.max.x - header->aabb.min.x,
+      header->aabb.max.y - header->aabb.min.y,
+      header->aabb.max.z - header->aabb.min.z,
+   };
+   float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
+
+   float sah;
+   float instance_sah;
+   if (radv_use_bvh8(pdev)) {
+      struct radv_bvh_stats_gfx12 stats = {};
+      radv_gather_bvh_stats_gfx12(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats);
+      sah = stats.sah;
+      instance_sah = stats.instance_sah;
+      fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u", stats.max_depth, stats.box_node_count,
+              stats.primitive_node_count, stats.instance_node_count);
+   } else {
+      struct radv_bvh_stats_gfx10_3 stats = {};
+      radv_gather_bvh_stats_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats);
+      sah = stats.sah;
+      instance_sah = stats.instance_sah;
+      fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u,%u,%u", stats.max_depth, stats.box16_node_count,
+              stats.box32_node_count, stats.triangle_node_count, stats.instance_node_count,
+              stats.procedural_node_count);
+   }
+
+   fprintf(device->rra_trace.stats_file, ",%u", (uint32_t)(sah / surface_area * 1000000));
+
+   if (is_tlas) {
+      fprintf(device->rra_trace.stats_file, ",%u\n", (uint32_t)((sah + instance_sah) / surface_area * 1000000));
+   } else {
+      fprintf(device->rra_trace.stats_file, ",0\n");
+
+      float *sah_ptr = ralloc(blas_sah, float);
+      *sah_ptr = sah / surface_area;
+      _mesa_hash_table_u64_insert(blas_sah, vk_acceleration_structure_get_va(accel_struct), sah_ptr);
+   }
+
+   fflush(device->rra_trace.stats_file);
+}
+
+VkResult
+radv_dump_bvh_stats(VkQueue vk_queue)
+{
+   VK_FROM_HANDLE(radv_queue, queue, vk_queue);
+   struct radv_device *device = radv_queue_device(queue);
+   VkDevice vk_device = radv_device_to_handle(device);
+
+   VkResult result = vk_common_DeviceWaitIdle(vk_device);
+   if (result != VK_SUCCESS)
+      return result;
+
+   struct hash_entry **hash_entries = NULL;
+   struct hash_table_u64 *blas_sah = NULL;
+
+   uint32_t struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs);
+   /* Nothing to dump; also avoids malloc(0) being misreported as out-of-memory. */
+   if (!struct_count)
+      return VK_SUCCESS;
+
+   hash_entries = malloc(sizeof(*hash_entries) * struct_count);
+   if (!hash_entries) {
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
+      goto cleanup;
+   }
+
+   struct hash_entry *last_entry = NULL;
+   for (unsigned i = 0; (last_entry = _mesa_hash_table_next_entry(device->rra_trace.accel_structs, last_entry)); ++i)
+      hash_entries[i] = last_entry;
+
+   qsort(hash_entries, struct_count, sizeof(*hash_entries), accel_struct_entry_cmp);
+
+   struct rra_copy_context copy_ctx = {
+      .device = vk_device,
+      .queue = vk_queue,
+      .entries = hash_entries,
+      .family_index = queue->vk.queue_family_index,
+      .min_size = device->rra_trace.ray_history_buffer_size,
+   };
+
+   result = rra_copy_context_init(&copy_ctx);
+   if (result != VK_SUCCESS)
+      goto cleanup;
+
+   blas_sah = _mesa_hash_table_u64_create(NULL);
+
+   for (unsigned i = 0; i < struct_count; i++) {
+      void *mapped_data = rra_map_accel_struct_data(&copy_ctx, i);
+      if (!mapped_data)
+         continue;
+
+      dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, false);
+
+      rra_unmap_accel_struct_data(&copy_ctx, i);
+   }
+
+   for (unsigned i = 0; i < struct_count; i++) {
+      if (_mesa_hash_table_u64_search(blas_sah, vk_acceleration_structure_get_va(hash_entries[i]->key)))
+         continue;
+
+      void *mapped_data = rra_map_accel_struct_data(&copy_ctx, i);
+      if (!mapped_data)
+         continue;
+
+      dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, true);
+
+      rra_unmap_accel_struct_data(&copy_ctx, i);
+   }
+
+   rra_copy_context_finish(&copy_ctx);
+
+   result = VK_SUCCESS;
+cleanup:
+   _mesa_hash_table_u64_destroy(blas_sah);
+   free(hash_entries);
+   return result;
+}
diff --git a/src/amd/vulkan/radv_rra.h b/src/amd/vulkan/radv_rra.h
index b8a6911a933..5396cb40bff 100644
--- a/src/amd/vulkan/radv_rra.h
+++ b/src/amd/vulkan/radv_rra.h
@@ -107,6 +107,7 @@ struct radv_rra_trace_data {
    struct hash_table *accel_structs;
    struct hash_table_u64 *accel_struct_vas;
    simple_mtx_t data_mtx;
+   FILE *stats_file;
    bool validate_as;
    bool copy_after_build;
    bool triggered;
@@ -322,4 +323,32 @@ void rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_
 void rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id,
                               uint32_t dst_offset);
 
+struct radv_bvh_stats_gfx10_3 {
+   uint32_t max_depth;
+   float sah;
+   float instance_sah;
+   uint32_t box16_node_count;
+   uint32_t box32_node_count;
+   uint32_t triangle_node_count;
+   uint32_t instance_node_count;
+   uint32_t procedural_node_count;
+};
+
+struct radv_bvh_stats_gfx12 {
+   uint32_t max_depth;
+   float sah;
+   float instance_sah;
+   uint32_t box_node_count;
+   uint32_t primitive_node_count;
+   uint32_t instance_node_count;
+};
+
+void radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
+                                   struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats);
+
+void radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
+                                 struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats);
+
+VkResult radv_dump_bvh_stats(VkQueue vk_queue);
+
 #endif /* RADV_RRA_H */
diff --git a/src/amd/vulkan/radv_rra_gfx10_3.c b/src/amd/vulkan/radv_rra_gfx10_3.c
index 7fb4efd0ed5..c46d7a30e50 100644
--- a/src/amd/vulkan/radv_rra_gfx10_3.c
+++ b/src/amd/vulkan/radv_rra_gfx10_3.c
@@ -357,3 +357,78 @@ rra_transcode_node_gfx10_3(struct rra_transcoding_context *ctx, uint32_t parent_
 
    return dst_id;
 }
+
+void
+radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
+                              struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats)
+{
+   uint32_t node_type = node_id & 7;
+   const void *node = bvh + ((node_id & (~7u)) << 3);
+
+   stats->max_depth = MAX2(stats->max_depth, depth);
+
+   switch (node_type) {
+   case radv_bvh_node_box16: {
+      stats->sah += 1.0 * p;
+      stats->box16_node_count++;
+
+      const struct radv_bvh_box16_node *box16 = node;
+      for (uint32_t i = 0; i < 4; i++) {
+         if (box16->children[i] != 0xffffffff) {
+            float extent[3] = {
+               _mesa_half_to_float(box16->coords[i].max_x) - _mesa_half_to_float(box16->coords[i].min_x),
+               _mesa_half_to_float(box16->coords[i].max_y) - _mesa_half_to_float(box16->coords[i].min_y),
+               _mesa_half_to_float(box16->coords[i].max_z) - _mesa_half_to_float(box16->coords[i].min_z),
+            };
+            float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
+            radv_gather_bvh_stats_gfx10_3(bvh, box16->children[i], depth + 1, surface_area, blas_sah, stats);
+         }
+      }
+
+      break;
+   }
+   case radv_bvh_node_box32: {
+      stats->sah += 1.5 * p;
+      stats->box32_node_count++;
+
+      const struct radv_bvh_box32_node *box32 = node;
+      for (uint32_t i = 0; i < 4; i++) {
+         if (box32->children[i] != 0xffffffff) {
+            float extent[3] = {
+               box32->coords[i].max.x - box32->coords[i].min.x,
+               box32->coords[i].max.y - box32->coords[i].min.y,
+               box32->coords[i].max.z - box32->coords[i].min.z,
+            };
+            float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
+            radv_gather_bvh_stats_gfx10_3(bvh, box32->children[i], depth + 1, surface_area, blas_sah, stats);
+         }
+      }
+
+      break;
+   }
+   case radv_bvh_node_instance: {
+      stats->sah += 2.0 * p;
+      stats->instance_node_count++;
+
+      const struct radv_bvh_instance_node *instance = node;
+      uint64_t blas_va = radv_node_to_addr(instance->bvh_ptr) - instance->bvh_offset;
+      float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va);
+      if (sah)
+         stats->instance_sah += *sah * p;
+      else
+         fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%" PRIx64 "\n", blas_va);
+
+      break;
+   }
+   case radv_bvh_node_triangle:
+      stats->sah += 2.0 * p;
+      stats->triangle_node_count++;
+      break;
+   case radv_bvh_node_aabb:
+      stats->sah += 4.0 * p;
+      stats->procedural_node_count++;
+      break;
+   default:
+      break;
+   }
+}
diff --git a/src/amd/vulkan/radv_rra_gfx12.c b/src/amd/vulkan/radv_rra_gfx12.c
index f4963e06dcc..ccdce18d58a 100644
--- a/src/amd/vulkan/radv_rra_gfx12.c
+++ b/src/amd/vulkan/radv_rra_gfx12.c
@@ -10,6 +10,7 @@
 #include "radv_rra.h"
 
 #include "util/bitset.h"
+#include "util/compiler.h"
 
 struct rra_instance_sideband_data {
    uint32_t instance_index;
@@ -307,3 +308,98 @@ rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id
       }
    }
 }
+
+void
+radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float surface_area,
+                            struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats)
+{
+   uint32_t node_type = node_id & 0xf;
+   const void *node = bvh + ((node_id & (~0xf)) << 3);
+
+   stats->max_depth = MAX2(stats->max_depth, depth);
+
+   switch (node_type) {
+   case radv_bvh_node_box32: {
+      stats->box_node_count++;
+      stats->sah += 0.5 * surface_area;
+
+      const struct radv_gfx12_box_node *src = node;
+
+      uint32_t valid_child_count_minus_one = src->child_count_exponents >> 28;
+
+      if (valid_child_count_minus_one != 0xf) {
+         uint32_t internal_id = src->internal_base_id;
+         uint32_t primitive_id = src->primitive_base_id;
+
+         uint32_t exponents[3] = {
+            src->child_count_exponents & 0xff,
+            (src->child_count_exponents >> 8) & 0xff,
+            (src->child_count_exponents >> 16) & 0xff,
+         };
+         float extent[3] = {
+            uif(exponents[0] << 23),
+            uif(exponents[1] << 23),
+            uif(exponents[2] << 23),
+         };
+
+         for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
+            uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf;
+            uint32_t child_size = src->children[i].dword2 >> 28;
+
+            uint32_t child_id;
+            if (child_type == radv_bvh_node_box32) {
+               child_id = internal_id | child_type;
+               internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
+            } else {
+               child_id = primitive_id | child_type;
+               primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
+            }
+
+            float min[3] = {
+               (float)(src->children[i].dword0 & 0xfff) / 0x1000 * extent[0],
+               (float)((src->children[i].dword0 >> 12) & 0xfff) / 0x1000 * extent[1],
+               (float)(src->children[i].dword1 & 0xfff) / 0x1000 * extent[2],
+            };
+            float max[3] = {
+               (float)(((src->children[i].dword1 >> 12) & 0xfff) + 1) / 0x1000 * extent[0],
+               (float)((src->children[i].dword2 & 0xfff) + 1) / 0x1000 * extent[1],
+               (float)(((src->children[i].dword2 >> 12) & 0xfff) + 1) / 0x1000 * extent[2],
+            };
+            float child_extent[3] = {
+               max[0] - min[0],
+               max[1] - min[1],
+               max[2] - min[2],
+            };
+            float child_surface_area = 2 * (child_extent[0] * child_extent[1] + child_extent[0] * child_extent[2] +
+                                            child_extent[1] * child_extent[2]);
+
+            radv_gather_bvh_stats_gfx12(bvh, child_id, depth + 1, child_surface_area, blas_sah, stats);
+         }
+      }
+
+      break;
+   }
+   case radv_bvh_node_instance: {
+      stats->instance_node_count++;
+      stats->sah += 0.7 * surface_area;
+
+      struct radv_gfx12_instance_node *instance = (struct radv_gfx12_instance_node *)(node);
+      const struct radv_gfx12_instance_node_user_data *user_data =
+         (const void *)((const uint8_t *)node + sizeof(struct radv_gfx12_instance_node));
+      uint64_t blas_va = radv_node_to_addr(instance->pointer_flags_bvh_addr) - user_data->bvh_offset;
+      float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va);
+      if (sah)
+         stats->instance_sah += *sah * surface_area;
+      else
+         fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%" PRIx64 "\n", blas_va);
+
+      break;
+   }
+   case radv_bvh_node_triangle:
+      stats->primitive_node_count++;
+      FALLTHROUGH;
+   default:
+      stats->sah += 1.0 * surface_area;
+      break;
+   }
+}