From a4b71dea85ddc1fed26687e7894d4681d061319b Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Sat, 15 Nov 2025 09:57:37 +0100 Subject: [PATCH 1/7] radv: Add an option for dumping BVH stats The option uses the dumping already implemented for rra to gather statistics about BVHs on the CPU and write them to a csv file. This csv file can then be compared using a tool similar to report-fossils to judge the impact of changes to the bvh build code. --- src/amd/vulkan/layers/radv_rra_layer.c | 11 +- src/amd/vulkan/radv_device.c | 4 +- src/amd/vulkan/radv_instance.h | 13 ++ src/amd/vulkan/radv_rra.c | 173 ++++++++++++++++++++++++- src/amd/vulkan/radv_rra.h | 29 +++++ src/amd/vulkan/radv_rra_gfx10_3.c | 75 +++++++++++ src/amd/vulkan/radv_rra_gfx12.c | 96 ++++++++++++++ 7 files changed, 397 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/layers/radv_rra_layer.c b/src/amd/vulkan/layers/radv_rra_layer.c index b95a1331d45..df0573cd3e0 100644 --- a/src/amd/vulkan/layers/radv_rra_layer.c +++ b/src/amd/vulkan/layers/radv_rra_layer.c @@ -374,7 +374,16 @@ rra_QueueSubmit2KHR(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *p struct radv_device *device = radv_queue_device(queue); VkResult result = device->layer_dispatch.rra.QueueSubmit2KHR(_queue, submitCount, pSubmits, _fence); - if (result != VK_SUCCESS || !device->rra_trace.triggered) + if (result != VK_SUCCESS) + return result; + + if (radv_bvh_stats_file()) { + result = radv_dump_bvh_stats(_queue); + if (result != VK_SUCCESS) + return result; + } + + if (!device->rra_trace.triggered) return result; uint32_t total_trace_count = 0; diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 9d688a488b6..11793cc38ca 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -722,7 +722,7 @@ radv_device_init_tools(struct radv_device *device) if (result != VK_SUCCESS) return result; - if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev)) { + if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev)) { result = radv_rra_trace_init(device); if (result != VK_SUCCESS) return result; @@ -798,7 +798,7 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP) add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE); - if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev)) + if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev)) add_entrypoints(&b, &rra_device_entrypoints, RADV_RRA_DISPATCH_TABLE); #ifndef _WIN32 diff --git a/src/amd/vulkan/radv_instance.h b/src/amd/vulkan/radv_instance.h index 84a4d88cb32..4fe5b723621 100644 --- a/src/amd/vulkan/radv_instance.h +++ b/src/amd/vulkan/radv_instance.h @@ -115,4 +115,17 @@ const char *radv_get_perftest_option_name(int id); bool radv_is_rt_wave64_enabled(const struct radv_instance *instance); +static const char * +radv_bvh_stats_file() +{ + return os_get_option("RADV_BVH_STATS_FILE"); +} + +static bool +radv_bvh_dumping_enabled(const struct radv_instance *instance) +{ + /* Gathering bvh stats uses a large part of the rra code for dumping bvhs. 
*/ + return (instance->vk.trace_mode & RADV_TRACE_MODE_RRA) || radv_bvh_stats_file(); +} + #endif /* RADV_INSTANCE_H */ diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c index bba3e87943b..a307b535c7f 100644 --- a/src/amd/vulkan/radv_rra.c +++ b/src/amd/vulkan/radv_rra.c @@ -488,6 +488,10 @@ radv_rra_trace_init(struct radv_device *device) device->rra_trace.ray_history = UTIL_DYNARRAY_INIT; + /* BVH stats dumping does not need ray history. */ + if (!(radv_physical_device_instance(pdev)->vk.trace_mode & RADV_TRACE_MODE_RRA)) + return VK_SUCCESS; + device->rra_trace.ray_history_buffer_size = debug_get_num_option("RADV_RRA_TRACE_HISTORY_SIZE", 100 * 1024 * 1024); if (device->rra_trace.ray_history_buffer_size < sizeof(struct radv_ray_history_header) + sizeof(struct radv_packed_end_trace_token)) @@ -624,6 +628,9 @@ radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data) simple_mtx_destroy(&data->data_mtx); _mesa_hash_table_destroy(data->accel_structs, NULL); _mesa_hash_table_u64_destroy(data->accel_struct_vas); + + if (data->stats_file) + fclose(data->stats_file); } void @@ -789,7 +796,7 @@ rra_map_accel_struct_data(struct rra_copy_context *ctx, uint32_t i) if (radv_GetEventStatus(ctx->device, data->build_event) != VK_EVENT_SET) return NULL; - if (data->buffer->memory) { + if (data->buffer && data->buffer->memory) { VkMemoryMapInfo memory_map_info = { .sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO, .memory = data->buffer->memory, @@ -1297,3 +1304,167 @@ cleanup: free(accel_struct_offsets); return result; } + +static void +dump_bvh_stats(struct radv_device *device, struct vk_acceleration_structure *accel_struct, + struct radv_rra_accel_struct_data *accel_struct_data, uint8_t *data, struct hash_table_u64 *blas_sah, + bool tlas_pass) +{ + const struct radv_physical_device *pdev = radv_device_physical(device); + const struct radv_instance *instance = radv_physical_device_instance(pdev); + + struct radv_accel_struct_header *header = (struct radv_accel_struct_header *)data; + + bool is_tlas = header->instance_count > 0; + if (is_tlas != tlas_pass) + return; + + /* convert root node id to offset */ + uint32_t src_root_offset = (RADV_BVH_ROOT_NODE & ~7) << 3; + + if (rra_validate_header(accel_struct_data, header)) { + return; + } + if (radv_use_bvh8(pdev)) { + if (rra_validate_node_gfx12(device->rra_trace.accel_struct_vas, data + header->bvh_offset, + data + header->bvh_offset + src_root_offset, header->geometry_count, + accel_struct_data->size, !is_tlas, 0)) { + return; + } + } else { + if (rra_validate_node_gfx10_3(device->rra_trace.accel_struct_vas, data + header->bvh_offset, + data + header->bvh_offset + src_root_offset, header->geometry_count, + accel_struct_data->size, !is_tlas, 0)) { + return; + } + } + + if (!device->rra_trace.stats_file) { + device->rra_trace.stats_file = fopen(radv_bvh_stats_file(), "w"); + fprintf(device->rra_trace.stats_file, "app,name,type,allocated_size,compacted_size"); + if (radv_use_bvh8(pdev)) { + fprintf(device->rra_trace.stats_file, ",max_depth,box_node_count,primitive_node_count,instance_node_count"); + } else { + fprintf(device->rra_trace.stats_file, ",max_depth,box16_node_count,box32_node_count,triangle_node_count," + "instance_node_count,procedual_node_count"); + } + fprintf(device->rra_trace.stats_file, ",sah,scene_sah\n"); + } + + fprintf(device->rra_trace.stats_file, "\"%s\",%s,%s,%" PRIu64 ",%" PRIu64, instance->vk.app_info.app_name, + vk_object_base_name(&accel_struct->base), is_tlas ? 
"tlas" : "blas", accel_struct_data->size, + header->compacted_size); + + float extent[3] = { + header->aabb.max.x - header->aabb.min.x, + header->aabb.max.y - header->aabb.min.y, + header->aabb.max.z - header->aabb.min.z, + }; + float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]); + + float sah; + float instance_sah; + if (radv_use_bvh8(pdev)) { + struct radv_bvh_stats_gfx12 stats = {}; + radv_gather_bvh_stats_gfx12(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats); + sah = stats.sah; + instance_sah = stats.instance_sah; + fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u", stats.max_depth, stats.box_node_count, + stats.primitive_node_count, stats.instance_node_count); + } else { + struct radv_bvh_stats_gfx10_3 stats = {}; + radv_gather_bvh_stats_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats); + sah = stats.sah; + instance_sah = stats.instance_sah; + fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u,%u,%u", stats.max_depth, stats.box16_node_count, + stats.box32_node_count, stats.triangle_node_count, stats.instance_node_count, stats.procedual_node_count); + } + + fprintf(device->rra_trace.stats_file, ",%u", (uint32_t)(sah / surface_area * 1000000)); + + if (is_tlas) { + fprintf(device->rra_trace.stats_file, ",%u\n", (uint32_t)((sah + instance_sah) / surface_area * 1000000)); + } else { + fprintf(device->rra_trace.stats_file, ",0\n"); + + float *sah_ptr = ralloc(blas_sah, float); + *sah_ptr = sah / surface_area; + _mesa_hash_table_u64_insert(blas_sah, vk_acceleration_structure_get_va(accel_struct), sah_ptr); + } + + fflush(device->rra_trace.stats_file); +} + +VkResult +radv_dump_bvh_stats(VkQueue vk_queue) +{ + VK_FROM_HANDLE(radv_queue, queue, vk_queue); + struct radv_device *device = radv_queue_device(queue); + VkDevice vk_device = radv_device_to_handle(device); + + VkResult result = vk_common_DeviceWaitIdle(vk_device); + if (result != VK_SUCCESS) + return result; + + struct hash_entry **hash_entries = NULL; + struct hash_table_u64 *blas_sah = NULL; + + uint32_t struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs); + + hash_entries = malloc(sizeof(*hash_entries) * struct_count); + if (!hash_entries) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto cleanup; + } + + struct hash_entry *last_entry = NULL; + for (unsigned i = 0; (last_entry = _mesa_hash_table_next_entry(device->rra_trace.accel_structs, last_entry)); ++i) + hash_entries[i] = last_entry; + + qsort(hash_entries, struct_count, sizeof(*hash_entries), accel_struct_entry_cmp); + + struct rra_copy_context copy_ctx = { + .device = vk_device, + .queue = vk_queue, + .entries = hash_entries, + .family_index = queue->vk.queue_family_index, + .min_size = device->rra_trace.ray_history_buffer_size, + }; + + result = rra_copy_context_init(©_ctx); + if (result != VK_SUCCESS) + goto cleanup; + + blas_sah = _mesa_hash_table_u64_create(NULL); + + for (unsigned i = 0; i < struct_count; i++) { + void *mapped_data = rra_map_accel_struct_data(©_ctx, i); + if (!mapped_data) + continue; + + dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, false); + + rra_unmap_accel_struct_data(©_ctx, i); + } + + for (unsigned i = 0; i < struct_count; i++) { + if (_mesa_hash_table_u64_search(blas_sah, vk_acceleration_structure_get_va(hash_entries[i]->key))) + continue; + + void *mapped_data = rra_map_accel_struct_data(©_ctx, i); + if (!mapped_data) + continue; + + 
dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, true); + + rra_unmap_accel_struct_data(©_ctx, i); + } + + rra_copy_context_finish(©_ctx); + + result = VK_SUCCESS; +cleanup: + _mesa_hash_table_u64_destroy(blas_sah); + free(hash_entries); + return result; +} diff --git a/src/amd/vulkan/radv_rra.h b/src/amd/vulkan/radv_rra.h index c2c86f15d9f..5964ee643bf 100644 --- a/src/amd/vulkan/radv_rra.h +++ b/src/amd/vulkan/radv_rra.h @@ -107,6 +107,7 @@ struct radv_rra_trace_data { struct hash_table *accel_structs; struct hash_table_u64 *accel_struct_vas; simple_mtx_t data_mtx; + FILE *stats_file; bool validate_as; bool copy_after_build; bool triggered; @@ -320,4 +321,32 @@ void rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_ void rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, uint32_t dst_offset); +struct radv_bvh_stats_gfx10_3 { + uint32_t max_depth; + float sah; + float instance_sah; + uint32_t box16_node_count; + uint32_t box32_node_count; + uint32_t triangle_node_count; + uint32_t instance_node_count; + uint32_t procedual_node_count; +}; + +struct radv_bvh_stats_gfx12 { + uint32_t max_depth; + float sah; + float instance_sah; + uint32_t box_node_count; + uint32_t primitive_node_count; + uint32_t instance_node_count; +}; + +void radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p, + struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats); + +void radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p, + struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats); + +VkResult radv_dump_bvh_stats(VkQueue vk_queue); + #endif /* RADV_RRA_H */ diff --git a/src/amd/vulkan/radv_rra_gfx10_3.c b/src/amd/vulkan/radv_rra_gfx10_3.c index 8ff1f01aa9d..b268048befe 100644 --- a/src/amd/vulkan/radv_rra_gfx10_3.c +++ b/src/amd/vulkan/radv_rra_gfx10_3.c @@ -355,3 +355,78 @@ rra_transcode_node_gfx10_3(struct rra_transcoding_context *ctx, uint32_t parent_ return dst_id; } + +void +radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p, + struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats) +{ + uint32_t node_type = node_id & 7; + const void *node = bvh + ((node_id & (~7u)) << 3); + + stats->max_depth = MAX2(stats->max_depth, depth); + + switch (node_type) { + case radv_bvh_node_box16: { + stats->sah += 1.0 * p; + stats->box16_node_count++; + + const struct radv_bvh_box16_node *box16 = node; + for (uint32_t i = 0; i < 4; i++) { + if (box16->children[i] != 0xffffffff) { + float extent[3] = { + _mesa_half_to_float(box16->coords[i][1][0]) - _mesa_half_to_float(box16->coords[i][0][0]), + _mesa_half_to_float(box16->coords[i][1][1]) - _mesa_half_to_float(box16->coords[i][0][1]), + _mesa_half_to_float(box16->coords[i][1][2]) - _mesa_half_to_float(box16->coords[i][0][2]), + }; + float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]); + radv_gather_bvh_stats_gfx10_3(bvh, box16->children[i], depth + 1, surface_area, blas_sah, stats); + } + } + + break; + } + case radv_bvh_node_box32: { + stats->sah += 1.5 * p; + stats->box32_node_count++; + + const struct radv_bvh_box32_node *box32 = node; + for (uint32_t i = 0; i < 4; i++) { + if (box32->children[i] != 0xffffffff) { + float extent[3] = { + box32->coords[i].max.x - box32->coords[i].min.x, + box32->coords[i].max.y - box32->coords[i].min.y, + 
box32->coords[i].max.z - box32->coords[i].min.z, + }; + float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]); + radv_gather_bvh_stats_gfx10_3(bvh, box32->children[i], depth + 1, surface_area, blas_sah, stats); + } + } + + break; + } + case radv_bvh_node_instance: { + stats->sah += 2.0 * p; + stats->instance_node_count++; + + const struct radv_bvh_instance_node *instance = node; + uint64_t blas_va = radv_node_to_addr(instance->bvh_ptr) - instance->bvh_offset; + float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va); + if (sah) + stats->instance_sah += *sah * p; + else + fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%lx\n", blas_va); + + break; + } + case radv_bvh_node_triangle: + stats->sah += 2.0 * p; + stats->triangle_node_count++; + break; + case radv_bvh_node_aabb: + stats->sah += 4.0 * p; + stats->procedual_node_count++; + break; + default: + break; + } +} diff --git a/src/amd/vulkan/radv_rra_gfx12.c b/src/amd/vulkan/radv_rra_gfx12.c index d0029d8be42..2c116450639 100644 --- a/src/amd/vulkan/radv_rra_gfx12.c +++ b/src/amd/vulkan/radv_rra_gfx12.c @@ -10,6 +10,7 @@ #include "radv_rra.h" #include "util/bitset.h" +#include "util/compiler.h" struct rra_instance_sideband_data { uint32_t instance_index; @@ -306,3 +307,98 @@ rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id } } } + +void +radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float surface_area, + struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats) +{ + uint32_t node_type = node_id & 0xf; + const void *node = bvh + ((node_id & (~0xf)) << 3); + + stats->max_depth = MAX2(stats->max_depth, depth); + + switch (node_type) { + case radv_bvh_node_box32: { + stats->box_node_count++; + stats->sah += 0.5 * surface_area; + + const struct radv_gfx12_box_node *src = node; + + uint32_t valid_child_count_minus_one = src->child_count_exponents >> 28; + + if (valid_child_count_minus_one != 0xf) { + uint32_t internal_id = src->internal_base_id; + uint32_t primitive_id = src->primitive_base_id; + + uint32_t exponents[3] = { + src->child_count_exponents & 0xff, + (src->child_count_exponents >> 8) & 0xff, + (src->child_count_exponents >> 16) & 0xff, + }; + float extent[3] = { + uif(exponents[0] << 23), + uif(exponents[1] << 23), + uif(exponents[2] << 23), + }; + + for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) { + uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf; + uint32_t child_size = src->children[i].dword2 >> 28; + + uint32_t child_id; + if (child_type == radv_bvh_node_box32) { + child_id = internal_id | child_type; + internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3; + } else { + child_id = primitive_id | child_type; + primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3; + } + + float min[3] = { + (float)(src->children[i].dword0 & 0xfff) / 0x1000 * extent[0], + (float)((src->children[i].dword0 >> 12) & 0xfff) / 0x1000 * extent[1], + (float)(src->children[i].dword1 & 0xfff) / 0x1000 * extent[2], + }; + float max[3] = { + (float)(((src->children[i].dword1 >> 12) & 0xfff) + 1) / 0x1000 * extent[0], + (float)((src->children[i].dword2 & 0xfff) + 1) / 0x1000 * extent[1], + (float)(((src->children[i].dword2 >> 12) & 0xfff) + 1) / 0x1000 * extent[2], + }; + float child_extent[3] = { + max[0] - min[0], + max[1] - min[1], + max[2] - min[2], + }; + float child_surface_area = 2 * (child_extent[0] * child_extent[1] + child_extent[0] * child_extent[2] + + child_extent[1] * 
child_extent[2]); + + radv_gather_bvh_stats_gfx12(bvh, child_id, depth + 1, child_surface_area, blas_sah, stats); + } + } + + break; + } + case radv_bvh_node_instance: { + stats->instance_node_count++; + stats->sah += 0.7 * surface_area; + + struct radv_gfx12_instance_node *instance = (struct radv_gfx12_instance_node *)(node); + const struct radv_gfx12_instance_node_user_data *user_data = + (const void *)((const uint8_t *)node + sizeof(struct radv_gfx12_instance_node)); + uint64_t blas_va = radv_node_to_addr(instance->pointer_flags_bvh_addr) - user_data->bvh_offset; + float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va); + if (sah) + stats->instance_sah += *sah * surface_area; + else + fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%lx\n", blas_va); + + break; + } + case radv_bvh_node_triangle: + stats->primitive_node_count++; + FALLTHROUGH; + default: + stats->sah += 1.0 * surface_area; + break; + } +} From 666fd022d20020a1857cc37d04b75594bd7fe6c8 Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Tue, 4 Nov 2025 21:06:04 +0100 Subject: [PATCH 2/7] nir: Add f2f16_ru/rd opcodes Those are variants of f2f16 that always round up/down. Constant folding requires nextafter that supports half floats (util_nextafter). --- src/compiler/nir/nir_opcodes.py | 18 +++++++++++++++- src/util/half_float.c | 38 +++++++++++++++++++++++++++++++++ src/util/half_float.h | 3 +++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 6c9aa7e5117..723d2691101 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -257,7 +257,7 @@ for src_t in [tint, tuint, tfloat, tbool]: for dst_t in dst_types: for dst_bit_size in type_sizes(dst_t): if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat: - rnd_modes = ['_rtne', '_rtz', ''] + rnd_modes = ['_rtne', '_rtz', '_ru', '_rd', ''] for rnd_mode in rnd_modes: if rnd_mode == '_rtne': conv_expr = """ @@ -279,6 +279,22 @@ for src_t in [tint, tuint, tfloat, tbool]: dst = src0; } """ + elif rnd_mode == '_ru': + conv_expr = """ + if (bit_size > 16) { + dst = _mesa_half_to_float(_mesa_float_to_float16_ru(src0)); + } else { + dst = src0; + } + """ + elif rnd_mode == '_rd': + conv_expr = """ + if (bit_size > 16) { + dst = _mesa_half_to_float(_mesa_float_to_float16_rd(src0)); + } else { + dst = src0; + } + """ else: conv_expr = """ if (bit_size > 32) { diff --git a/src/util/half_float.c b/src/util/half_float.c index 0eacf06c5a8..6734842df1e 100644 --- a/src/util/half_float.c +++ b/src/util/half_float.c @@ -211,3 +211,41 @@ uint16_t _mesa_uint16_div_64k_to_half(uint16_t v) return (e << 10) | m; } + +static uint16_t +util_nextafter16(uint16_t x, bool up) +{ + uint16_t sign_mask = 1ull << 15; + uint16_t min_abs = 1; + + float f = _mesa_half_to_float(x); + if (isnan(f) || (f == INFINITY && up) || (f == -INFINITY && !up)) + return x; + + /* beware of: +/-0.0 - 1 == NaN */ + uint16_t xn = f == 0 ? (sign_mask | min_abs) : x - 1; + + /* beware of -0.0 + 1 == -0x1p-149 */ + uint16_t xp = f == 0 ? min_abs : x + 1; + + /* nextafter can be implemented by just +/- 1 on the int value */ + return (up ^ (f < 0)) ? 
xp : xn; +} + +uint16_t +_mesa_float_to_float16_ru(float val) +{ + uint16_t half = _mesa_float_to_half(val); + if (_mesa_half_to_float(half) < val) + return util_nextafter16(half, true); + return half; +} + +uint16_t +_mesa_float_to_float16_rd(float val) +{ + uint16_t half = _mesa_float_to_half(val); + if (_mesa_half_to_float(half) > val) + return util_nextafter16(half, false); + return half; +} diff --git a/src/util/half_float.h b/src/util/half_float.h index f184323bd60..6961e1ed618 100644 --- a/src/util/half_float.h +++ b/src/util/half_float.h @@ -113,6 +113,9 @@ _mesa_float_to_float16_rtz(float val) return _mesa_float_to_float16_rtz_slow(val); } +uint16_t _mesa_float_to_float16_ru(float val); +uint16_t _mesa_float_to_float16_rd(float val); + static inline uint16_t _mesa_float_to_float16_rtne(float val) { From acd748dba76d89194c9f4f7c30fb9e75c4dee4fb Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Thu, 6 Nov 2025 08:08:25 +0100 Subject: [PATCH 3/7] spirv: Add internal f2f16 opcodes The OpFConvert+FPRoundingModeRTP/FPRoundingModeRTN cannot be used because GL_EXT_spirv_intrinsics does not allow decorations. Instead, we need opcodes that encode the rounding mode so that they can be used in glsl code. --- src/compiler/spirv/spirv_internal_exts.h | 13 ++++++++++++ src/compiler/spirv/spirv_to_nir.c | 25 ++++++++++++++++++++++++ src/compiler/spirv/vtn_private.h | 1 + 3 files changed, 39 insertions(+) create mode 100644 src/compiler/spirv/spirv_internal_exts.h diff --git a/src/compiler/spirv/spirv_internal_exts.h b/src/compiler/spirv/spirv_internal_exts.h new file mode 100644 index 00000000000..74c3b9fd79b --- /dev/null +++ b/src/compiler/spirv/spirv_internal_exts.h @@ -0,0 +1,13 @@ +/* + * Copyright © 2025 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#ifndef SPIRV_INTERNAL_EXTS_H +#define SPIRV_INTERNAL_EXTS_H + +#define SpvOpFConvertRUMesa 0 +#define SpvOpFConvertRDMesa 1 + +#endif diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c index 7d4b9a7d21f..00e7d0d58f1 100644 --- a/src/compiler/spirv/spirv_to_nir.c +++ b/src/compiler/spirv/spirv_to_nir.c @@ -923,6 +923,29 @@ vtn_handle_non_semantic_debug_info(struct vtn_builder *b, SpvOp ext_opcode, return true; } +static bool +vtn_handle_mesa_internal(struct vtn_builder *b, SpvOp ext_opcode, + const uint32_t *w, unsigned count) +{ + uint32_t instr = w[4]; + + switch (instr) { + case SpvOpFConvertRUMesa: { + struct vtn_ssa_value *arg = vtn_ssa_value(b, w[5]); + vtn_push_nir_ssa(b, w[2], nir_f2f16_ru(&b->nb, arg->def)); + break; + } + case SpvOpFConvertRDMesa: { + struct vtn_ssa_value *arg = vtn_ssa_value(b, w[5]); + vtn_push_nir_ssa(b, w[2], nir_f2f16_rd(&b->nb, arg->def)); + break; + } + } + + return true; +} + + static void vtn_handle_extension(struct vtn_builder *b, SpvOp opcode, const uint32_t *w, unsigned count) @@ -958,6 +981,8 @@ vtn_handle_extension(struct vtn_builder *b, SpvOp opcode, val->ext_handler = vtn_handle_debug_printf; } else if (strstr(ext, "NonSemantic.") == ext) { val->ext_handler = vtn_handle_non_semantic_instruction; + } else if (strstr(ext, "MesaInternal") == ext) { + val->ext_handler = vtn_handle_mesa_internal; } else { vtn_fail("Unsupported extension: %s", ext); } diff --git a/src/compiler/spirv/vtn_private.h b/src/compiler/spirv/vtn_private.h index 5d601f95c86..9f6009ed8ea 100644 --- a/src/compiler/spirv/vtn_private.h +++ b/src/compiler/spirv/vtn_private.h @@ -33,6 +33,7 @@ #include "spirv.h" #include "spirv_info.h" #include "vtn_generator_ids.h" +#include 
"spirv_internal_exts.h" extern uint32_t mesa_spirv_debug; From 430e435d32585652a81070bbb50f6ddb653ea39f Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Tue, 14 Oct 2025 20:48:03 +0200 Subject: [PATCH 4/7] aco: Add support to f2f16 with rtpi/rtni Those rounding modes are needed when computing 16-bit bounding boxes since the bounding box must not get smaller. --- src/amd/compiler/aco_insert_fp_mode.cpp | 7 +++++++ src/amd/compiler/aco_ir.cpp | 2 ++ src/amd/compiler/aco_opcodes.py | 2 ++ src/amd/compiler/instruction_selection/aco_isel_setup.cpp | 4 +++- .../compiler/instruction_selection/aco_select_nir_alu.cpp | 7 +++++++ 5 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_insert_fp_mode.cpp b/src/amd/compiler/aco_insert_fp_mode.cpp index 1f116564294..e74f2334b29 100644 --- a/src/amd/compiler/aco_insert_fp_mode.cpp +++ b/src/amd/compiler/aco_insert_fp_mode.cpp @@ -279,6 +279,13 @@ emit_set_mode_block(fp_mode_ctx* ctx, Block* block) instr->opcode = aco_opcode::v_cvt_f16_f32; else instr->opcode = aco_opcode::s_cvt_f16_f32; + } else if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtpi || + instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtni) { + set_mode |= fp_state.require(mode_round16_64, instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtpi ? fp_round_pi : fp_round_ni); + set_mode |= fp_state.require(mode_fp16_ovfl, default_state.fields[mode_fp16_ovfl]); + set_mode |= fp_state.require(mode_denorm16_64, default_state.fields[mode_denorm16_64]); + set_mode |= fp_state.require(mode_denorm32, default_state.fields[mode_denorm32]); + instr->opcode = aco_opcode::v_cvt_f16_f32; } else if (instr->opcode == aco_opcode::p_v_cvt_pk_fp8_f32_ovfl) { set_mode |= fp_state.require(mode_fp16_ovfl, 1); instr->opcode = aco_opcode::v_cvt_pk_fp8_f32; diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index c7611d6df47..aa364f1a9de 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -718,6 +718,8 @@ instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op) /* VOP1 */ case aco_opcode::v_cvt_f16_f32: case aco_opcode::p_v_cvt_f16_f32_rtne: + case aco_opcode::p_v_cvt_f16_f32_rtpi: + case aco_opcode::p_v_cvt_f16_f32_rtni: case aco_opcode::v_cvt_f16_u16: case aco_opcode::v_cvt_f16_i16: case aco_opcode::v_rcp_f16: diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index dfb457c3eaf..5ca1abe6a01 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -1029,6 +1029,8 @@ VOP1 = { ("v_cvt_i32_f32", dst(U32), src(F32), op(0x08)), ("v_cvt_f16_f32", dst(F16), src(F32), op(0x0a)), ("p_v_cvt_f16_f32_rtne", dst(F16), src(F32), op(-1)), + ("p_v_cvt_f16_f32_rtpi", dst(F16), src(F32), op(-1)), + ("p_v_cvt_f16_f32_rtni", dst(F16), src(F32), op(-1)), ("v_cvt_f32_f16", dst(F32), src(F16), op(0x0b)), ("v_cvt_rpi_i32_f32", dst(U32), src(F32), op(0x0c)), #v_cvt_nearest_i32_f32 in GFX11 ("v_cvt_flr_i32_f32", dst(U32), src(F32), op(0x0d)),#v_cvt_floor_i32_f32 in GFX11 diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp index 4d559b15833..2eea24b6500 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp @@ -453,7 +453,9 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_op_sdot_2x16_iadd_sat: case nir_op_bfdot2_bfadd: case nir_op_byte_perm_amd: - case nir_op_alignbyte_amd: type = RegType::vgpr; break; + case nir_op_alignbyte_amd: + case 
nir_op_f2f16_ru: + case nir_op_f2f16_rd: type = RegType::vgpr; break; case nir_op_fmul: case nir_op_ffma: case nir_op_fadd: diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp index 43bc6f16dd6..7beab6b1fe5 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp @@ -2615,6 +2615,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } break; } + case nir_op_f2f16_ru: + case nir_op_f2f16_rd: + ctx->program->needs_fp_mode_insertion = true; + bld.vop1(instr->op == nir_op_f2f16_ru ? aco_opcode::p_v_cvt_f16_f32_rtpi + : aco_opcode::p_v_cvt_f16_f32_rtni, + Definition(dst), Operand(get_alu_src(ctx, instr->src[0]))); + break; case nir_op_f2f32: { if (dst.regClass() == s1) { assert(instr->src[0].src.ssa->bit_size == 16); From 9a4e00a9255aabde8205432391d74bfb61fdef23 Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Tue, 14 Oct 2025 20:52:50 +0200 Subject: [PATCH 5/7] radv/rra: Count box16 nodes properly Otherwise rra won't allocate memory when loading the capture. --- src/amd/vulkan/radv_rra.c | 3 ++- src/amd/vulkan/radv_rra.h | 2 ++ src/amd/vulkan/radv_rra_gfx10_3.c | 14 ++++++++------ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c index a307b535c7f..e31a5ba858c 100644 --- a/src/amd/vulkan/radv_rra.c +++ b/src/amd/vulkan/radv_rra.c @@ -198,7 +198,8 @@ rra_fill_accel_struct_header_common(const struct radv_physical_device *pdev, str /* TODO: calculate active primitives */ .active_primitive_count = primitive_count, .geometry_description_count = header->geometry_count, - .interior_fp32_node_count = bvh_info->internal_nodes_size / sizeof(struct radv_bvh_box32_node), + .interior_fp32_node_count = bvh_info->box32_count, + .interior_fp16_node_count = bvh_info->box16_count, .leaf_node_count = primitive_count, .rt_driver_interface_version = 8 << 16, .rt_ip_version = pdev->info.rt_ip_version, diff --git a/src/amd/vulkan/radv_rra.h b/src/amd/vulkan/radv_rra.h index 5964ee643bf..730e4c45683 100644 --- a/src/amd/vulkan/radv_rra.h +++ b/src/amd/vulkan/radv_rra.h @@ -289,6 +289,8 @@ struct rra_bvh_info { uint32_t leaf_nodes_size; uint32_t internal_nodes_size; uint32_t instance_sideband_data_size; + uint32_t box32_count; + uint32_t box16_count; struct rra_geometry_info *geometry_infos; }; diff --git a/src/amd/vulkan/radv_rra_gfx10_3.c b/src/amd/vulkan/radv_rra_gfx10_3.c index b268048befe..54d258d0349 100644 --- a/src/amd/vulkan/radv_rra_gfx10_3.c +++ b/src/amd/vulkan/radv_rra_gfx10_3.c @@ -177,9 +177,11 @@ rra_gather_bvh_info_gfx10_3(const uint8_t *bvh, uint32_t node_id, struct rra_bvh switch (node_type) { case radv_bvh_node_box16: dst->internal_nodes_size += sizeof(struct rra_box16_node); + dst->box16_count++; break; case radv_bvh_node_box32: dst->internal_nodes_size += sizeof(struct rra_box32_node); + dst->box32_count++; break; case radv_bvh_node_instance: dst->leaf_nodes_size += sizeof(struct rra_instance_node); @@ -283,15 +285,15 @@ rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_ vk_aabb bounds = { .min = { - _mesa_half_to_float(src->coords[i][0][0]), - _mesa_half_to_float(src->coords[i][0][1]), - _mesa_half_to_float(src->coords[i][0][2]), + _mesa_half_to_float(src->coords[i].min_x), + _mesa_half_to_float(src->coords[i].min_y), + _mesa_half_to_float(src->coords[i].min_z), }, .max = { - 
_mesa_half_to_float(src->coords[i][1][0]), - _mesa_half_to_float(src->coords[i][1][1]), - _mesa_half_to_float(src->coords[i][1][2]), + _mesa_half_to_float(src->coords[i].max_x), + _mesa_half_to_float(src->coords[i].max_y), + _mesa_half_to_float(src->coords[i].max_z), }, }; From 4950f6e23de9f4c75e5694821e68a3b2564ecef5 Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Tue, 14 Oct 2025 20:53:45 +0200 Subject: [PATCH 6/7] radv/bvh: Add radv_aabb16 and use it for box16 nodes --- src/amd/vulkan/bvh/bvh.h | 12 +++++++++++- src/amd/vulkan/radv_rra_gfx10_3.c | 6 +++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h index f6e867df6bb..b86cefbe1ba 100644 --- a/src/amd/vulkan/bvh/bvh.h +++ b/src/amd/vulkan/bvh/bvh.h @@ -34,6 +34,7 @@ #else #include typedef uint16_t float16_t; +typedef struct radv_aabb16 radv_aabb16; #endif struct radv_accel_struct_serialization_header { @@ -112,9 +113,18 @@ struct radv_bvh_instance_node { mat3x4 otw_matrix; }; +struct radv_aabb16 { + float16_t min_x; + float16_t min_y; + float16_t min_z; + float16_t max_x; + float16_t max_y; + float16_t max_z; +}; + struct radv_bvh_box16_node { uint32_t children[4]; - float16_t coords[4][2][3]; + radv_aabb16 coords[4]; }; struct radv_bvh_box32_node { diff --git a/src/amd/vulkan/radv_rra_gfx10_3.c b/src/amd/vulkan/radv_rra_gfx10_3.c index 54d258d0349..6d3bfa706df 100644 --- a/src/amd/vulkan/radv_rra_gfx10_3.c +++ b/src/amd/vulkan/radv_rra_gfx10_3.c @@ -376,9 +376,9 @@ radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t dep for (uint32_t i = 0; i < 4; i++) { if (box16->children[i] != 0xffffffff) { float extent[3] = { - _mesa_half_to_float(box16->coords[i][1][0]) - _mesa_half_to_float(box16->coords[i][0][0]), - _mesa_half_to_float(box16->coords[i][1][1]) - _mesa_half_to_float(box16->coords[i][0][1]), - _mesa_half_to_float(box16->coords[i][1][2]) - _mesa_half_to_float(box16->coords[i][0][2]), + _mesa_half_to_float(box16->coords[i].max_x) - _mesa_half_to_float(box16->coords[i].min_x), + _mesa_half_to_float(box16->coords[i].max_y) - _mesa_half_to_float(box16->coords[i].min_y), + _mesa_half_to_float(box16->coords[i].max_z) - _mesa_half_to_float(box16->coords[i].min_z), }; float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]); radv_gather_bvh_stats_gfx10_3(bvh, box16->children[i], depth + 1, surface_area, blas_sah, stats); From 19a4c1c4b59f02ca9d679faf6e762ef42e8e301c Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Wed, 15 Oct 2025 12:30:36 +0200 Subject: [PATCH 7/7] radv/bvh: Use box16 nodes when bvh8 is not used Using box16 nodes trades bvh quality for memory bandwidth which seems to be roughly equal in performance. 
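For context when reading the numbers below: the sah and scene_sah columns come from the gather helpers added in patch 1, which accumulate a fixed per-node-type traversal cost weighted by each node's bounding-box surface area and normalize the sum to the root's surface area. A rough standalone C sketch of that metric; the node_info struct and sah_csv_metric helper are illustrative only and not code from this series:

#include <stdint.h>

struct node_info {
   float cost;         /* per-node-type constant: 1.0 box16, 1.5 box32,
                        * 2.0 triangle/instance, 4.0 aabb on gfx10.3 */
   float surface_area; /* surface area of the node's bounds */
};

static uint32_t
sah_csv_metric(const struct node_info *nodes, unsigned count, float root_surface_area)
{
   float sah = 0.0f;
   for (unsigned i = 0; i < count; i++)
      sah += nodes[i].cost * nodes[i].surface_area;
   /* Same normalization as dump_bvh_stats(): relative to the root AABB, scaled by 1e6. */
   return (uint32_t)(sah / root_surface_area * 1000000);
}

Lower is better; scene_sah is only reported for TLAS rows and additionally folds in the SAH of the referenced BLASes, weighted by each instance's surface area.
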
Stats assuming box16 nodes are as expensive as box32 nodes: Totals from 7668 (79.68% of 9624) affected BVHs: compacted_size: 951666944 -> 742347648 (-22.00%) max_depth: 57606 -> 57615 (+0.02%) sah: 129114796242 -> 129998517775 (+0.68%); split: -0.00%, +0.68% scene_sah: 188564162 -> 192063633 (+1.86%); split: -0.02%, +1.88% box16_node_count: 0 -> 3270600 (+inf%) box32_node_count: 3365707 -> 95100 (-97.17%) --- src/amd/vulkan/bvh/build_helpers.h | 4 + src/amd/vulkan/bvh/build_interface.h | 1 + src/amd/vulkan/bvh/encode.comp | 126 +++++++++++++------ src/amd/vulkan/bvh/meson.build | 2 +- src/amd/vulkan/radv_acceleration_structure.c | 9 +- src/vulkan/runtime/bvh/meson.build | 5 +- src/vulkan/runtime/bvh/vk_build_helpers.h | 1 + 7 files changed, 109 insertions(+), 39 deletions(-) diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h index a63a534d349..895e1606d6d 100644 --- a/src/amd/vulkan/bvh/build_helpers.h +++ b/src/amd/vulkan/bvh/build_helpers.h @@ -8,6 +8,7 @@ #define BVH_BUILD_HELPERS_H #include "bvh.h" +#include "spirv_internal_exts.h" #include "vk_build_helpers.h" TYPE(radv_accel_struct_serialization_header, 8); @@ -110,4 +111,7 @@ radv_encode_blas_pointer_flags(uint32_t flags, uint32_t geometry_type) return ptr_flags; } +spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRUMesa) float16_t radv_f32_to_f16_pos_inf(float f); +spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRDMesa) float16_t radv_f32_to_f16_neg_inf(float f); + #endif /* BUILD_HELPERS_H */ diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h index 15a7a2aaf5e..d3b726d296b 100644 --- a/src/amd/vulkan/bvh/build_interface.h +++ b/src/amd/vulkan/bvh/build_interface.h @@ -26,6 +26,7 @@ #define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 5)) #define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6)) #define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 7)) +#define RADV_BUILD_FLAG_USE_BOX16 (1u << (VK_BUILD_FLAG_COUNT + 8)) #define RADV_COPY_MODE_COPY 0 #define RADV_COPY_MODE_SERIALIZE 1 diff --git a/src/amd/vulkan/bvh/encode.comp b/src/amd/vulkan/bvh/encode.comp index 53c6f853d2c..1fb4dc5d728 100644 --- a/src/amd/vulkan/bvh/encode.comp +++ b/src/amd/vulkan/bvh/encode.comp @@ -22,6 +22,32 @@ void set_parent(uint32_t child, uint32_t parent) DEREF(REF(uint32_t)(addr)) = parent; } +radv_aabb16 +radv_aabb_f32_to_f16(vk_aabb aabb) +{ + radv_aabb16 aabb16; + aabb16.min_x = radv_f32_to_f16_neg_inf(aabb.min.x); + aabb16.min_y = radv_f32_to_f16_neg_inf(aabb.min.y); + aabb16.min_z = radv_f32_to_f16_neg_inf(aabb.min.z); + aabb16.max_x = radv_f32_to_f16_pos_inf(aabb.max.x); + aabb16.max_y = radv_f32_to_f16_pos_inf(aabb.max.y); + aabb16.max_z = radv_f32_to_f16_pos_inf(aabb.max.z); + return aabb16; +} + +vk_aabb +radv_aabb_f16_to_f32(radv_aabb16 aabb16) +{ + vk_aabb aabb; + aabb.min.x = float(aabb16.min_x); + aabb.min.y = float(aabb16.min_y); + aabb.min.z = float(aabb16.min_z); + aabb.max.x = float(aabb16.max_x); + aabb.max.y = float(aabb16.max_y); + aabb.max.z = float(aabb16.max_z); + return aabb; +} + void main() { @@ -89,18 +115,15 @@ main() memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset; - if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) + uint32_t node_id = is_root_node ? 
RADV_BVH_ROOT_NODE : DEREF(src_node).bvh_offset; + if (node_id == VK_UNKNOWN_BVH_OFFSET) continue; - if (bvh_offset == VK_NULL_BVH_OFFSET) + if (node_id == VK_NULL_BVH_OFFSET) break; uint32_t flags = 0; - REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset)); - uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32); - uint32_t found_child_count = 0; uint32_t children[4] = {RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE}; @@ -158,20 +181,33 @@ main() break; } + REF(radv_bvh_box16_node) dst_node_f16 = REF(radv_bvh_box16_node)(OFFSET(args.output_bvh, id_to_offset(node_id))); + REF(radv_bvh_box32_node) dst_node_f32 = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, id_to_offset(node_id))); + bool is_box16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && id_to_type(node_id) == radv_bvh_node_box16; + for (uint32_t i = 0; i < found_child_count; ++i) { uint32_t type = ir_id_to_type(children[i]); uint32_t offset = ir_id_to_offset(children[i]); - uint32_t dst_offset; + uint32_t child_node_id; + + vk_aabb child_aabb = DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; if (type == vk_ir_node_internal) { - dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node)); + radv_aabb16 child_aabb16 = radv_aabb_f32_to_f16(child_aabb); + float surface_area_f16 = aabb_surface_area(radv_aabb_f16_to_f32(child_aabb16)); + float surface_area_f32 = aabb_surface_area(child_aabb); + bool child_use_f16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && surface_area_f16 < surface_area_f32 * 1.5; - REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset); - DEREF(child_node).bvh_offset = dst_offset; + uint32_t dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, + child_use_f16 ? SIZEOF(radv_bvh_box16_node) : SIZEOF(radv_bvh_box32_node)); + child_node_id = pack_node_id(dst_offset, child_use_f16 ? radv_bvh_node_box16 : radv_bvh_node_box32); + + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset); + DEREF(child_node).bvh_offset = child_node_id; flags |= (DEREF(child_node).flags & 0x3) << i * 8; } else { uint32_t child_index = offset / ir_leaf_node_size; - dst_offset = dst_leaf_offset + child_index * output_leaf_node_size; + uint32_t dst_offset = dst_leaf_offset + child_index * output_leaf_node_size; if (type == vk_ir_node_instance) { vk_ir_instance_node src_node = @@ -182,47 +218,65 @@ main() uint32_t child_flags = fetch_child_flags(args.intermediate_bvh, children[i]); flags |= (child_flags & 0x3) << i * 8; } + + child_node_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type)); } - vk_aabb child_aabb = - DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; - - /* On gfx11, infinities in AABB coords can cause garbage child nodes to be - * returned by box intersection tests with non-default box sorting modes. - * Subtract 1 from the integer representation of inf/-inf to turn it into - * the maximum/minimum representable floating-point value as a workaround. 
- */ - if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) { - for (uint32_t i = 0; i < 3; ++i) { - if (isinf(child_aabb.min[i])) - child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1); - if (isinf(child_aabb.max[i])) - child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1); + if (is_box16) { + DEREF(dst_node_f16).coords[i] = radv_aabb_f32_to_f16(child_aabb); + } else { + /* On gfx11, infinities in AABB coords can cause garbage child nodes to be + * returned by box intersection tests with non-default box sorting modes. + * Subtract 1 from the integer representation of inf/-inf to turn it into + * the maximum/minimum representable floating-point value as a workaround. + */ + if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) { + for (uint32_t i = 0; i < 3; ++i) { + if (isinf(child_aabb.min[i])) + child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1); + if (isinf(child_aabb.max[i])) + child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1); + } } + + DEREF(dst_node_f32).coords[i] = child_aabb; } - DEREF(dst_node).coords[i] = child_aabb; - - uint32_t child_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type)); - children[i] = child_id; - set_parent(child_id, node_id); + children[i] = child_node_id; + set_parent(child_node_id, node_id); } - for (uint i = found_child_count; i < 4; ++i) { + if (is_box16) { + radv_aabb16 null_aabb; + null_aabb.min_x = NAN_F16; + null_aabb.min_y = NAN_F16; + null_aabb.min_z = NAN_F16; + null_aabb.max_x = NAN_F16; + null_aabb.max_y = NAN_F16; + null_aabb.max_z = NAN_F16; + for (uint i = found_child_count; i < 4; ++i) + DEREF(dst_node_f16).coords[i] = null_aabb; + } else { + for (uint i = found_child_count; i < 4; ++i) { for (uint comp = 0; comp < 3; ++comp) { - DEREF(dst_node).coords[i].min[comp] = NAN; - DEREF(dst_node).coords[i].max[comp] = NAN; + DEREF(dst_node_f32).coords[i].min[comp] = NAN; + DEREF(dst_node_f32).coords[i].max[comp] = NAN; } + } } /* Make changes to the children's BVH offset value available to the other invocations. 
*/ memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - DEREF(dst_node).children = children; + if (is_box16) { + DEREF(dst_node_f16).children = children; + } else { + DEREF(dst_node_f32).children = children; - if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS)) - DEREF(dst_node).flags = flags; + if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS)) + DEREF(dst_node_f32).flags = flags; + } break; } diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build index 3320ef67428..c0328db82c7 100644 --- a/src/amd/vulkan/bvh/meson.build +++ b/src/amd/vulkan/bvh/meson.build @@ -56,7 +56,7 @@ bvh_includes = files( bvh_spv = [] foreach s : bvh_shaders command = [ - prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', + prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet, ] command += vk_glsl_shader_preamble diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index 4271a0be143..607d29866dd 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -75,6 +75,7 @@ enum radv_encode_key_bits { RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 0), RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 1), RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 2), + RADV_ENCODE_KEY_USE_BOX16 = (1 << 3), }; static void @@ -287,6 +288,8 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s VK_FROM_HANDLE(radv_device, device, _device); struct radv_physical_device *pdev = radv_device_physical(device); + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info); + uint32_t encode_key = 0; if (radv_use_bvh8(pdev)) { /* @@ -302,11 +305,13 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s state->build_info->type != VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) encode_key |= RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS; - VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info); if (!(state->build_info->flags & (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR | VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) && geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + } else if (!radv_emulate_rt(pdev)) { + if (!(state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR)) + encode_key |= RADV_ENCODE_KEY_USE_BOX16; } state->config.encode_key[0] = encode_key; @@ -391,6 +396,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key) flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES; if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES; + if (key & RADV_ENCODE_KEY_USE_BOX16) + flags |= RADV_BUILD_FLAG_USE_BOX16; return flags; } diff --git a/src/vulkan/runtime/bvh/meson.build b/src/vulkan/runtime/bvh/meson.build index 02b2afb4163..add1590b70f 100644 --- a/src/vulkan/runtime/bvh/meson.build +++ b/src/vulkan/runtime/bvh/meson.build @@ -42,6 +42,7 @@ bvh_shaders = [ ], ] +spirv_include_dir = dir_source_root + '/src/compiler/spirv' vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh' vk_bvh_includes = files( @@ -50,6 +51,7 @@ vk_bvh_includes = files( 'vk_build_interface.h', 'vk_bvh.h', 'vk_debug.h', + 
spirv_include_dir + '/spirv_internal_exts.h', ) vk_glsl_shader_extensions = [ @@ -69,6 +71,7 @@ vk_glsl_shader_extensions = [ 'GL_KHR_shader_subgroup_ballot', 'GL_KHR_shader_subgroup_clustered', 'GL_EXT_shader_atomic_int64', + 'GL_EXT_spirv_intrinsics', ] vk_glsl_shader_preamble = [] @@ -79,7 +82,7 @@ endforeach bvh_spv = [] foreach s : bvh_shaders command = [ - prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' ] + (with_mesa_debug ? ['-g'] : []) command += glslang_quiet command += vk_glsl_shader_preamble diff --git a/src/vulkan/runtime/bvh/vk_build_helpers.h b/src/vulkan/runtime/bvh/vk_build_helpers.h index 01acb4db715..dd5795855b2 100644 --- a/src/vulkan/runtime/bvh/vk_build_helpers.h +++ b/src/vulkan/runtime/bvh/vk_build_helpers.h @@ -180,6 +180,7 @@ #define INFINITY (1.0 / 0.0) #define NAN (0.0 / 0.0) +#define NAN_F16 (0.0hf / 0.0hf) #define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))
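
As a closing illustration of the rounding guarantee patches 2-4 and 7 build on (patch 4 notes that the 16-bit bounding box must never get smaller than the 32-bit one): a minimal host-side sketch using the helpers declared in util/half_float.h by patch 2. The to_conservative_half helper and the assert-based check are illustrative only and assume finite inputs; they are not part of the series.

#include <assert.h>
#include <stdint.h>
#include "util/half_float.h"

/* Quantize a float32 interval to float16 so that it can only widen:
 * round the lower bound down and the upper bound up, mirroring what
 * radv_aabb_f32_to_f16 does per component in encode.comp. */
static void
to_conservative_half(float lo, float hi, uint16_t *lo16, uint16_t *hi16)
{
   *lo16 = _mesa_float_to_float16_rd(lo);
   *hi16 = _mesa_float_to_float16_ru(hi);

   /* The decoded 16-bit interval must always enclose the original one. */
   assert(_mesa_half_to_float(*lo16) <= lo);
   assert(_mesa_half_to_float(*hi16) >= hi);
}

On the GPU side the same directed rounding is expressed through the new f2f16_ru/f2f16_rd NIR opcodes, which ACO lowers to v_cvt_f16_f32 with the round-up/round-down FP mode set.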