diff --git a/src/amd/compiler/aco_insert_fp_mode.cpp b/src/amd/compiler/aco_insert_fp_mode.cpp
index 1f116564294..e74f2334b29 100644
--- a/src/amd/compiler/aco_insert_fp_mode.cpp
+++ b/src/amd/compiler/aco_insert_fp_mode.cpp
@@ -279,6 +279,13 @@ emit_set_mode_block(fp_mode_ctx* ctx, Block* block)
          instr->opcode = aco_opcode::v_cvt_f16_f32;
       else
          instr->opcode = aco_opcode::s_cvt_f16_f32;
+   } else if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtpi ||
+              instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtni) {
+      set_mode |= fp_state.require(mode_round16_64, instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtpi ? fp_round_pi : fp_round_ni);
+      set_mode |= fp_state.require(mode_fp16_ovfl, default_state.fields[mode_fp16_ovfl]);
+      set_mode |= fp_state.require(mode_denorm16_64, default_state.fields[mode_denorm16_64]);
+      set_mode |= fp_state.require(mode_denorm32, default_state.fields[mode_denorm32]);
+      instr->opcode = aco_opcode::v_cvt_f16_f32;
    } else if (instr->opcode == aco_opcode::p_v_cvt_pk_fp8_f32_ovfl) {
       set_mode |= fp_state.require(mode_fp16_ovfl, 1);
       instr->opcode = aco_opcode::v_cvt_pk_fp8_f32;
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index c7611d6df47..aa364f1a9de 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -718,6 +718,8 @@ instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
    /* VOP1 */
    case aco_opcode::v_cvt_f16_f32:
    case aco_opcode::p_v_cvt_f16_f32_rtne:
+   case aco_opcode::p_v_cvt_f16_f32_rtpi:
+   case aco_opcode::p_v_cvt_f16_f32_rtni:
    case aco_opcode::v_cvt_f16_u16:
    case aco_opcode::v_cvt_f16_i16:
    case aco_opcode::v_rcp_f16:
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index dfb457c3eaf..5ca1abe6a01 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -1029,6 +1029,8 @@ VOP1 = {
    ("v_cvt_i32_f32", dst(U32), src(F32), op(0x08)),
    ("v_cvt_f16_f32", dst(F16), src(F32), op(0x0a)),
    ("p_v_cvt_f16_f32_rtne", dst(F16), src(F32), op(-1)),
+   ("p_v_cvt_f16_f32_rtpi", dst(F16), src(F32), op(-1)),
+   ("p_v_cvt_f16_f32_rtni", dst(F16), src(F32), op(-1)),
    ("v_cvt_f32_f16", dst(F32), src(F16), op(0x0b)),
    ("v_cvt_rpi_i32_f32", dst(U32), src(F32), op(0x0c)), #v_cvt_nearest_i32_f32 in GFX11
    ("v_cvt_flr_i32_f32", dst(U32), src(F32), op(0x0d)),#v_cvt_floor_i32_f32 in GFX11
diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
index 4d559b15833..2eea24b6500 100644
--- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
+++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
@@ -453,7 +453,9 @@ init_context(isel_context* ctx, nir_shader* shader)
             case nir_op_sdot_2x16_iadd_sat:
             case nir_op_bfdot2_bfadd:
             case nir_op_byte_perm_amd:
-            case nir_op_alignbyte_amd: type = RegType::vgpr; break;
+            case nir_op_alignbyte_amd:
+            case nir_op_f2f16_ru:
+            case nir_op_f2f16_rd: type = RegType::vgpr; break;
             case nir_op_fmul:
             case nir_op_ffma:
             case nir_op_fadd:
diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
index 43bc6f16dd6..7beab6b1fe5 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
@@ -2615,6 +2615,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
       }
       break;
    }
+   case nir_op_f2f16_ru:
+   case nir_op_f2f16_rd:
+      ctx->program->needs_fp_mode_insertion = true;
+      bld.vop1(instr->op == nir_op_f2f16_ru ? aco_opcode::p_v_cvt_f16_f32_rtpi
+                                            : aco_opcode::p_v_cvt_f16_f32_rtni,
+               Definition(dst), Operand(get_alu_src(ctx, instr->src[0])));
+      break;
    case nir_op_f2f32: {
       if (dst.regClass() == s1) {
         assert(instr->src[0].src.ssa->bit_size == 16);
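
For reference (illustration, not part of the patch): once the opcodes exist, NIR's generated
builders expose them directly, so a pass can bracket a 32-bit value with the two directed
conversions. A minimal sketch, assuming a valid nir_builder cursor:

   #include "nir.h"
   #include "nir_builder.h"

   /* Compute a conservative f16 interval [lo, hi] that contains the f32 value x.
    * The builder names follow from the opcode names added in nir_opcodes.py. */
   static void
   emit_f16_interval(nir_builder *b, nir_def *x, nir_def **lo, nir_def **hi)
   {
      *lo = nir_f2f16_rd(b, x); /* round toward -infinity */
      *hi = nir_f2f16_ru(b, x); /* round toward +infinity */
   }
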
diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h
index a63a534d349..895e1606d6d 100644
--- a/src/amd/vulkan/bvh/build_helpers.h
+++ b/src/amd/vulkan/bvh/build_helpers.h
@@ -8,6 +8,7 @@
 #define BVH_BUILD_HELPERS_H
 
 #include "bvh.h"
+#include "spirv_internal_exts.h"
 #include "vk_build_helpers.h"
 
 TYPE(radv_accel_struct_serialization_header, 8);
@@ -110,4 +111,7 @@ radv_encode_blas_pointer_flags(uint32_t flags, uint32_t geometry_type)
    return ptr_flags;
 }
 
+spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRUMesa) float16_t radv_f32_to_f16_pos_inf(float f);
+spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRDMesa) float16_t radv_f32_to_f16_neg_inf(float f);
+
 #endif /* BUILD_HELPERS_H */
diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h
index 15a7a2aaf5e..d3b726d296b 100644
--- a/src/amd/vulkan/bvh/build_interface.h
+++ b/src/amd/vulkan/bvh/build_interface.h
@@ -26,6 +26,7 @@
 #define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 5))
 #define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6))
 #define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 7))
+#define RADV_BUILD_FLAG_USE_BOX16 (1u << (VK_BUILD_FLAG_COUNT + 8))
 
 #define RADV_COPY_MODE_COPY 0
 #define RADV_COPY_MODE_SERIALIZE 1
diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h
index f6e867df6bb..b86cefbe1ba 100644
--- a/src/amd/vulkan/bvh/bvh.h
+++ b/src/amd/vulkan/bvh/bvh.h
@@ -34,6 +34,7 @@
 #else
 #include <stdint.h>
 typedef uint16_t float16_t;
+typedef struct radv_aabb16 radv_aabb16;
 #endif
 
 struct radv_accel_struct_serialization_header {
@@ -112,9 +113,18 @@ struct radv_bvh_instance_node {
    mat3x4 otw_matrix;
 };
 
+struct radv_aabb16 {
+   float16_t min_x;
+   float16_t min_y;
+   float16_t min_z;
+   float16_t max_x;
+   float16_t max_y;
+   float16_t max_z;
+};
+
 struct radv_bvh_box16_node {
    uint32_t children[4];
-   float16_t coords[4][2][3];
+   radv_aabb16 coords[4];
 };
 
 struct radv_bvh_box32_node {
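
A quick sanity check on the new layout (a sketch, not part of the patch): radv_aabb16 packs six
float16_t values into 12 bytes, so radv_bvh_box16_node keeps the same 64-byte footprint as the
old float16_t coords[4][2][3] array:

   #include <assert.h>
   #include <stdint.h>

   typedef uint16_t float16_t;

   struct radv_aabb16 {
      float16_t min_x, min_y, min_z;
      float16_t max_x, max_y, max_z;
   };

   struct radv_bvh_box16_node {
      uint32_t children[4];
      struct radv_aabb16 coords[4];
   };

   /* 16 bytes of child ids + 4 * 12 bytes of bounds = 64 bytes. */
   static_assert(sizeof(struct radv_aabb16) == 12, "aabb16 must stay tightly packed");
   static_assert(sizeof(struct radv_bvh_box16_node) == 64, "box16 node layout changed");
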
diff --git a/src/amd/vulkan/bvh/encode.comp b/src/amd/vulkan/bvh/encode.comp
index 53c6f853d2c..1fb4dc5d728 100644
--- a/src/amd/vulkan/bvh/encode.comp
+++ b/src/amd/vulkan/bvh/encode.comp
@@ -22,6 +22,32 @@ void set_parent(uint32_t child, uint32_t parent)
    DEREF(REF(uint32_t)(addr)) = parent;
 }
 
+radv_aabb16
+radv_aabb_f32_to_f16(vk_aabb aabb)
+{
+   radv_aabb16 aabb16;
+   aabb16.min_x = radv_f32_to_f16_neg_inf(aabb.min.x);
+   aabb16.min_y = radv_f32_to_f16_neg_inf(aabb.min.y);
+   aabb16.min_z = radv_f32_to_f16_neg_inf(aabb.min.z);
+   aabb16.max_x = radv_f32_to_f16_pos_inf(aabb.max.x);
+   aabb16.max_y = radv_f32_to_f16_pos_inf(aabb.max.y);
+   aabb16.max_z = radv_f32_to_f16_pos_inf(aabb.max.z);
+   return aabb16;
+}
+
+vk_aabb
+radv_aabb_f16_to_f32(radv_aabb16 aabb16)
+{
+   vk_aabb aabb;
+   aabb.min.x = float(aabb16.min_x);
+   aabb.min.y = float(aabb16.min_y);
+   aabb.min.z = float(aabb16.min_z);
+   aabb.max.x = float(aabb16.max_x);
+   aabb.max.y = float(aabb16.max_y);
+   aabb.max.z = float(aabb16.max_z);
+   return aabb;
+}
+
 void
 main()
 {
@@ -89,18 +115,15 @@
       memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                     gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
 
-      uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
-      if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
+      uint32_t node_id = is_root_node ? RADV_BVH_ROOT_NODE : DEREF(src_node).bvh_offset;
+      if (node_id == VK_UNKNOWN_BVH_OFFSET)
          continue;
-      if (bvh_offset == VK_NULL_BVH_OFFSET)
+      if (node_id == VK_NULL_BVH_OFFSET)
          break;
 
       uint32_t flags = 0;
 
-      REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset));
-      uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
-
       uint32_t found_child_count = 0;
       uint32_t children[4] = {RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE,
                               RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE};
@@ -158,20 +181,33 @@
          break;
       }
 
+      REF(radv_bvh_box16_node) dst_node_f16 = REF(radv_bvh_box16_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
+      REF(radv_bvh_box32_node) dst_node_f32 = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
+      bool is_box16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && id_to_type(node_id) == radv_bvh_node_box16;
+
       for (uint32_t i = 0; i < found_child_count; ++i) {
          uint32_t type = ir_id_to_type(children[i]);
         uint32_t offset = ir_id_to_offset(children[i]);
-         uint32_t dst_offset;
+         uint32_t child_node_id;
+
+         vk_aabb child_aabb = DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
 
          if (type == vk_ir_node_internal) {
-            dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node));
+            radv_aabb16 child_aabb16 = radv_aabb_f32_to_f16(child_aabb);
+            float surface_area_f16 = aabb_surface_area(radv_aabb_f16_to_f32(child_aabb16));
+            float surface_area_f32 = aabb_surface_area(child_aabb);
+            bool child_use_f16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && surface_area_f16 < surface_area_f32 * 1.5;
 
-            REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset);
-            DEREF(child_node).bvh_offset = dst_offset;
+            uint32_t dst_offset = atomicAdd(DEREF(args.header).dst_node_offset,
+                                            child_use_f16 ? SIZEOF(radv_bvh_box16_node) : SIZEOF(radv_bvh_box32_node));
+            child_node_id = pack_node_id(dst_offset, child_use_f16 ? radv_bvh_node_box16 : radv_bvh_node_box32);
+
+            REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
+            DEREF(child_node).bvh_offset = child_node_id;
             flags |= (DEREF(child_node).flags & 0x3) << i * 8;
          } else {
            uint32_t child_index = offset / ir_leaf_node_size;
-            dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
+            uint32_t dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
 
            if (type == vk_ir_node_instance) {
               vk_ir_instance_node src_node =
@@ -182,47 +218,65 @@
               uint32_t child_flags = fetch_child_flags(args.intermediate_bvh, children[i]);
               flags |= (child_flags & 0x3) << i * 8;
            }
+
+            child_node_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
         }
 
-         vk_aabb child_aabb =
-            DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
-
-         /* On gfx11, infinities in AABB coords can cause garbage child nodes to be
-          * returned by box intersection tests with non-default box sorting modes.
-          * Subtract 1 from the integer representation of inf/-inf to turn it into
-          * the maximum/minimum representable floating-point value as a workaround.
-          */
-         if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
-            for (uint32_t i = 0; i < 3; ++i) {
-               if (isinf(child_aabb.min[i]))
-                  child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
-               if (isinf(child_aabb.max[i]))
-                  child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
+         if (is_box16) {
+            DEREF(dst_node_f16).coords[i] = radv_aabb_f32_to_f16(child_aabb);
+         } else {
+            /* On gfx11, infinities in AABB coords can cause garbage child nodes to be
+             * returned by box intersection tests with non-default box sorting modes.
+             * Subtract 1 from the integer representation of inf/-inf to turn it into
+             * the maximum/minimum representable floating-point value as a workaround.
+             */
+            if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
+               for (uint32_t i = 0; i < 3; ++i) {
+                  if (isinf(child_aabb.min[i]))
+                     child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
+                  if (isinf(child_aabb.max[i]))
+                     child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
+               }
            }
+
+            DEREF(dst_node_f32).coords[i] = child_aabb;
         }
 
-         DEREF(dst_node).coords[i] = child_aabb;
-
-         uint32_t child_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
-         children[i] = child_id;
-         set_parent(child_id, node_id);
+         children[i] = child_node_id;
+         set_parent(child_node_id, node_id);
      }
 
-      for (uint i = found_child_count; i < 4; ++i) {
+      if (is_box16) {
+         radv_aabb16 null_aabb;
+         null_aabb.min_x = NAN_F16;
+         null_aabb.min_y = NAN_F16;
+         null_aabb.min_z = NAN_F16;
+         null_aabb.max_x = NAN_F16;
+         null_aabb.max_y = NAN_F16;
+         null_aabb.max_z = NAN_F16;
+         for (uint i = found_child_count; i < 4; ++i)
+            DEREF(dst_node_f16).coords[i] = null_aabb;
+      } else {
+         for (uint i = found_child_count; i < 4; ++i) {
            for (uint comp = 0; comp < 3; ++comp) {
-              DEREF(dst_node).coords[i].min[comp] = NAN;
-              DEREF(dst_node).coords[i].max[comp] = NAN;
+              DEREF(dst_node_f32).coords[i].min[comp] = NAN;
+              DEREF(dst_node_f32).coords[i].max[comp] = NAN;
            }
+         }
      }
 
       /* Make changes to the children's BVH offset value available to the other invocations.
        */
       memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                     gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
 
-      DEREF(dst_node).children = children;
+      if (is_box16) {
+         DEREF(dst_node_f16).children = children;
+      } else {
+         DEREF(dst_node_f32).children = children;
 
-      if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
-         DEREF(dst_node).flags = flags;
+         if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
+            DEREF(dst_node_f32).flags = flags;
+      }
 
       break;
    }
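
For intuition on the surface_area_f16 < surface_area_f32 * 1.5 test above, a standalone worked
example (numbers mine, not from the patch): rounding min down and max up in f16 can badly inflate
a small box far from the origin, in which case the encoder keeps full-precision box32 bounds for
that child.

   #include <stdio.h>

   /* Surface area of a box with the given extents. */
   static float
   surface_area(float x, float y, float z)
   {
      return 2.0f * (x * y + x * z + y * z);
   }

   int
   main(void)
   {
      /* A box spanning [1000.2, 1000.3] on each axis: f16 has a ulp of 0.5 in
       * [512, 1024), so directed rounding widens each axis to [1000.0, 1000.5]. */
      float sa_f32 = surface_area(0.1f, 0.1f, 0.1f);
      float sa_f16 = surface_area(0.5f, 0.5f, 0.5f);
      printf("growth: %.0fx\n", sa_f16 / sa_f32); /* 25x, far above 1.5x: keep box32 */
      return 0;
   }
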
diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build
index 3320ef67428..c0328db82c7 100644
--- a/src/amd/vulkan/bvh/meson.build
+++ b/src/amd/vulkan/bvh/meson.build
@@ -56,7 +56,7 @@ bvh_includes = files(
 bvh_spv = []
 foreach s : bvh_shaders
   command = [
-    prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5',
+    prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5',
     '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet,
   ]
   command += vk_glsl_shader_preamble
diff --git a/src/amd/vulkan/layers/radv_rra_layer.c b/src/amd/vulkan/layers/radv_rra_layer.c
index b95a1331d45..df0573cd3e0 100644
--- a/src/amd/vulkan/layers/radv_rra_layer.c
+++ b/src/amd/vulkan/layers/radv_rra_layer.c
@@ -374,7 +374,16 @@ rra_QueueSubmit2KHR(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *p
    struct radv_device *device = radv_queue_device(queue);
    VkResult result = device->layer_dispatch.rra.QueueSubmit2KHR(_queue, submitCount, pSubmits, _fence);
 
-   if (result != VK_SUCCESS || !device->rra_trace.triggered)
+   if (result != VK_SUCCESS)
+      return result;
+
+   if (radv_bvh_stats_file()) {
+      result = radv_dump_bvh_stats(_queue);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   if (!device->rra_trace.triggered)
       return result;
 
    uint32_t total_trace_count = 0;
diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c
index 27c696b8a55..60c0d743f54 100644
--- a/src/amd/vulkan/radv_acceleration_structure.c
+++ b/src/amd/vulkan/radv_acceleration_structure.c
@@ -75,6 +75,7 @@ enum radv_encode_key_bits {
    RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 0),
    RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 1),
    RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 2),
+   RADV_ENCODE_KEY_USE_BOX16 = (1 << 3),
 };
 
 static void
@@ -287,6 +288,8 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
    VK_FROM_HANDLE(radv_device, device, _device);
    struct radv_physical_device *pdev = radv_device_physical(device);
 
+   VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
+
    uint32_t encode_key = 0;
    if (radv_use_bvh8(pdev)) {
      /*
@@ -302,11 +305,13 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
          state->build_info->type != VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR)
         encode_key |= RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS;
 
-      VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
       if (!(state->build_info->flags &
             (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR | VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) &&
           geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
         encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
+   } else if (!radv_emulate_rt(pdev)) {
+      if (!(state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR))
+         encode_key |= RADV_ENCODE_KEY_USE_BOX16;
    }
 
    state->config.encode_key[0] = encode_key;
@@ -391,6 +396,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key)
       flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES;
    if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)
       flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES;
+   if (key & RADV_ENCODE_KEY_USE_BOX16)
+      flags |= RADV_BUILD_FLAG_USE_BOX16;
 
    return flags;
 }
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 170f47506a1..41c97cdaa08 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -722,7 +722,7 @@ radv_device_init_tools(struct radv_device *device)
    if (result != VK_SUCCESS)
       return result;
 
-   if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev)) {
+   if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev)) {
      result = radv_rra_trace_init(device);
      if (result != VK_SUCCESS)
         return result;
@@ -798,7 +798,7 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd
    if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
       add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE);
 
-   if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev))
+   if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev))
      add_entrypoints(&b, &rra_device_entrypoints, RADV_RRA_DISPATCH_TABLE);
 
 #ifndef _WIN32
diff --git a/src/amd/vulkan/radv_instance.h b/src/amd/vulkan/radv_instance.h
index 84a4d88cb32..4fe5b723621 100644
--- a/src/amd/vulkan/radv_instance.h
+++ b/src/amd/vulkan/radv_instance.h
@@ -115,4 +115,17 @@ const char *radv_get_perftest_option_name(int id);
 
 bool radv_is_rt_wave64_enabled(const struct radv_instance *instance);
 
+static const char *
+radv_bvh_stats_file(void)
+{
+   return os_get_option("RADV_BVH_STATS_FILE");
+}
+
+static bool
+radv_bvh_dumping_enabled(const struct radv_instance *instance)
+{
+   /* Gathering BVH stats reuses a large part of the RRA code for dumping BVHs. */
+   return (instance->vk.trace_mode & RADV_TRACE_MODE_RRA) || radv_bvh_stats_file();
+}
+
 #endif /* RADV_INSTANCE_H */
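
Usage note (illustrative, not from the patch): with the gating above, BVH statistics can be
collected without a full RRA capture; pointing the environment variable at a file is enough,
e.g. RADV_BVH_STATS_FILE=/tmp/bvh_stats.csv ./my_vulkan_app, where my_vulkan_app stands in for
any Vulkan application. Rows are appended to the CSV as queue submissions complete.
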
diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c
index bba3e87943b..e31a5ba858c 100644
--- a/src/amd/vulkan/radv_rra.c
+++ b/src/amd/vulkan/radv_rra.c
@@ -198,7 +198,8 @@ rra_fill_accel_struct_header_common(const struct radv_physical_device *pdev, str
       /* TODO: calculate active primitives */
       .active_primitive_count = primitive_count,
       .geometry_description_count = header->geometry_count,
-      .interior_fp32_node_count = bvh_info->internal_nodes_size / sizeof(struct radv_bvh_box32_node),
+      .interior_fp32_node_count = bvh_info->box32_count,
+      .interior_fp16_node_count = bvh_info->box16_count,
       .leaf_node_count = primitive_count,
       .rt_driver_interface_version = 8 << 16,
       .rt_ip_version = pdev->info.rt_ip_version,
@@ -488,6 +489,10 @@ radv_rra_trace_init(struct radv_device *device)
 
    device->rra_trace.ray_history = UTIL_DYNARRAY_INIT;
 
+   /* BVH stats dumping does not need ray history. */
+   if (!(radv_physical_device_instance(pdev)->vk.trace_mode & RADV_TRACE_MODE_RRA))
+      return VK_SUCCESS;
+
    device->rra_trace.ray_history_buffer_size = debug_get_num_option("RADV_RRA_TRACE_HISTORY_SIZE", 100 * 1024 * 1024);
    if (device->rra_trace.ray_history_buffer_size <
        sizeof(struct radv_ray_history_header) + sizeof(struct radv_packed_end_trace_token))
@@ -624,6 +629,9 @@ radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data)
    simple_mtx_destroy(&data->data_mtx);
    _mesa_hash_table_destroy(data->accel_structs, NULL);
    _mesa_hash_table_u64_destroy(data->accel_struct_vas);
+
+   if (data->stats_file)
+      fclose(data->stats_file);
 }
 
 void
@@ -789,7 +797,7 @@ rra_map_accel_struct_data(struct rra_copy_context *ctx, uint32_t i)
    if (radv_GetEventStatus(ctx->device, data->build_event) != VK_EVENT_SET)
       return NULL;
 
-   if (data->buffer->memory) {
+   if (data->buffer && data->buffer->memory) {
      VkMemoryMapInfo memory_map_info = {
         .sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO,
         .memory = data->buffer->memory,
@@ -1297,3 +1305,167 @@ cleanup:
    free(accel_struct_offsets);
    return result;
 }
+
+static void
+dump_bvh_stats(struct radv_device *device, struct vk_acceleration_structure *accel_struct,
+               struct radv_rra_accel_struct_data *accel_struct_data, uint8_t *data, struct hash_table_u64 *blas_sah,
+               bool tlas_pass)
+{
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+   const struct radv_instance *instance = radv_physical_device_instance(pdev);
+
+   struct radv_accel_struct_header *header = (struct radv_accel_struct_header *)data;
+
+   bool is_tlas = header->instance_count > 0;
+   if (is_tlas != tlas_pass)
+      return;
+
+   /* Convert the root node id to an offset. */
+   uint32_t src_root_offset = (RADV_BVH_ROOT_NODE & ~7) << 3;
+
+   if (rra_validate_header(accel_struct_data, header)) {
+      return;
+   }
+   if (radv_use_bvh8(pdev)) {
+      if (rra_validate_node_gfx12(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
+                                  data + header->bvh_offset + src_root_offset, header->geometry_count,
+                                  accel_struct_data->size, !is_tlas, 0)) {
+         return;
+      }
+   } else {
+      if (rra_validate_node_gfx10_3(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
+                                    data + header->bvh_offset + src_root_offset, header->geometry_count,
+                                    accel_struct_data->size, !is_tlas, 0)) {
+         return;
+      }
+   }
+
+   if (!device->rra_trace.stats_file) {
+      device->rra_trace.stats_file = fopen(radv_bvh_stats_file(), "w");
+      fprintf(device->rra_trace.stats_file, "app,name,type,allocated_size,compacted_size");
+      if (radv_use_bvh8(pdev)) {
+         fprintf(device->rra_trace.stats_file, ",max_depth,box_node_count,primitive_node_count,instance_node_count");
+      } else {
+         fprintf(device->rra_trace.stats_file, ",max_depth,box16_node_count,box32_node_count,triangle_node_count,"
+                                               "instance_node_count,procedural_node_count");
+      }
+      fprintf(device->rra_trace.stats_file, ",sah,scene_sah\n");
+   }
+
+   fprintf(device->rra_trace.stats_file, "\"%s\",%s,%s,%" PRIu64 ",%" PRIu64, instance->vk.app_info.app_name,
"tlas" : "blas", accel_struct_data->size, + header->compacted_size); + + float extent[3] = { + header->aabb.max.x - header->aabb.min.x, + header->aabb.max.y - header->aabb.min.y, + header->aabb.max.z - header->aabb.min.z, + }; + float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]); + + float sah; + float instance_sah; + if (radv_use_bvh8(pdev)) { + struct radv_bvh_stats_gfx12 stats = {}; + radv_gather_bvh_stats_gfx12(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats); + sah = stats.sah; + instance_sah = stats.instance_sah; + fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u", stats.max_depth, stats.box_node_count, + stats.primitive_node_count, stats.instance_node_count); + } else { + struct radv_bvh_stats_gfx10_3 stats = {}; + radv_gather_bvh_stats_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats); + sah = stats.sah; + instance_sah = stats.instance_sah; + fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u,%u,%u", stats.max_depth, stats.box16_node_count, + stats.box32_node_count, stats.triangle_node_count, stats.instance_node_count, stats.procedual_node_count); + } + + fprintf(device->rra_trace.stats_file, ",%u", (uint32_t)(sah / surface_area * 1000000)); + + if (is_tlas) { + fprintf(device->rra_trace.stats_file, ",%u\n", (uint32_t)((sah + instance_sah) / surface_area * 1000000)); + } else { + fprintf(device->rra_trace.stats_file, ",0\n"); + + float *sah_ptr = ralloc(blas_sah, float); + *sah_ptr = sah / surface_area; + _mesa_hash_table_u64_insert(blas_sah, vk_acceleration_structure_get_va(accel_struct), sah_ptr); + } + + fflush(device->rra_trace.stats_file); +} + +VkResult +radv_dump_bvh_stats(VkQueue vk_queue) +{ + VK_FROM_HANDLE(radv_queue, queue, vk_queue); + struct radv_device *device = radv_queue_device(queue); + VkDevice vk_device = radv_device_to_handle(device); + + VkResult result = vk_common_DeviceWaitIdle(vk_device); + if (result != VK_SUCCESS) + return result; + + struct hash_entry **hash_entries = NULL; + struct hash_table_u64 *blas_sah = NULL; + + uint32_t struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs); + + hash_entries = malloc(sizeof(*hash_entries) * struct_count); + if (!hash_entries) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto cleanup; + } + + struct hash_entry *last_entry = NULL; + for (unsigned i = 0; (last_entry = _mesa_hash_table_next_entry(device->rra_trace.accel_structs, last_entry)); ++i) + hash_entries[i] = last_entry; + + qsort(hash_entries, struct_count, sizeof(*hash_entries), accel_struct_entry_cmp); + + struct rra_copy_context copy_ctx = { + .device = vk_device, + .queue = vk_queue, + .entries = hash_entries, + .family_index = queue->vk.queue_family_index, + .min_size = device->rra_trace.ray_history_buffer_size, + }; + + result = rra_copy_context_init(©_ctx); + if (result != VK_SUCCESS) + goto cleanup; + + blas_sah = _mesa_hash_table_u64_create(NULL); + + for (unsigned i = 0; i < struct_count; i++) { + void *mapped_data = rra_map_accel_struct_data(©_ctx, i); + if (!mapped_data) + continue; + + dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, false); + + rra_unmap_accel_struct_data(©_ctx, i); + } + + for (unsigned i = 0; i < struct_count; i++) { + if (_mesa_hash_table_u64_search(blas_sah, vk_acceleration_structure_get_va(hash_entries[i]->key))) + continue; + + void *mapped_data = rra_map_accel_struct_data(©_ctx, i); + if (!mapped_data) + continue; + + 
diff --git a/src/amd/vulkan/radv_rra.h b/src/amd/vulkan/radv_rra.h
index c2c86f15d9f..730e4c45683 100644
--- a/src/amd/vulkan/radv_rra.h
+++ b/src/amd/vulkan/radv_rra.h
@@ -107,6 +107,7 @@ struct radv_rra_trace_data {
    struct hash_table *accel_structs;
    struct hash_table_u64 *accel_struct_vas;
    simple_mtx_t data_mtx;
+   FILE *stats_file;
    bool validate_as;
    bool copy_after_build;
    bool triggered;
@@ -288,6 +289,8 @@ struct rra_bvh_info {
    uint32_t leaf_nodes_size;
    uint32_t internal_nodes_size;
    uint32_t instance_sideband_data_size;
+   uint32_t box32_count;
+   uint32_t box16_count;
    struct rra_geometry_info *geometry_infos;
 };
@@ -320,4 +323,32 @@ void rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_
 void rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id,
                               uint32_t dst_offset);
 
+struct radv_bvh_stats_gfx10_3 {
+   uint32_t max_depth;
+   float sah;
+   float instance_sah;
+   uint32_t box16_node_count;
+   uint32_t box32_node_count;
+   uint32_t triangle_node_count;
+   uint32_t instance_node_count;
+   uint32_t procedural_node_count;
+};
+
+struct radv_bvh_stats_gfx12 {
+   uint32_t max_depth;
+   float sah;
+   float instance_sah;
+   uint32_t box_node_count;
+   uint32_t primitive_node_count;
+   uint32_t instance_node_count;
+};
+
+void radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
+                                   struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats);
+
+void radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
+                                 struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats);
+
+VkResult radv_dump_bvh_stats(VkQueue vk_queue);
+
 #endif /* RADV_RRA_H */
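
Reading guide for the two gather functions implemented next: each recursive call receives the
surface area of the node's own box (the parameter p above), and accumulates a classic
surface-area-heuristic cost with per-type weights c(n) (on gfx10.3: 1.0 for box16, 1.5 for box32,
2.0 for triangles and instances, 4.0 for procedural AABBs). The sah column written earlier is
therefore, in fixed point,

   \mathrm{sah} = \Big\lfloor 10^{6} \cdot \frac{\sum_{n} c(n)\, SA(n)}{SA(\mathrm{root})} \Big\rfloor
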
diff --git a/src/amd/vulkan/radv_rra_gfx10_3.c b/src/amd/vulkan/radv_rra_gfx10_3.c
index 8ff1f01aa9d..6d3bfa706df 100644
--- a/src/amd/vulkan/radv_rra_gfx10_3.c
+++ b/src/amd/vulkan/radv_rra_gfx10_3.c
@@ -177,9 +177,11 @@ rra_gather_bvh_info_gfx10_3(const uint8_t *bvh, uint32_t node_id, struct rra_bvh
    switch (node_type) {
    case radv_bvh_node_box16:
       dst->internal_nodes_size += sizeof(struct rra_box16_node);
+      dst->box16_count++;
       break;
    case radv_bvh_node_box32:
       dst->internal_nodes_size += sizeof(struct rra_box32_node);
+      dst->box32_count++;
       break;
    case radv_bvh_node_instance:
       dst->leaf_nodes_size += sizeof(struct rra_instance_node);
@@ -283,15 +285,15 @@ rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_
       vk_aabb bounds = {
          .min =
             {
-               _mesa_half_to_float(src->coords[i][0][0]),
-               _mesa_half_to_float(src->coords[i][0][1]),
-               _mesa_half_to_float(src->coords[i][0][2]),
+               _mesa_half_to_float(src->coords[i].min_x),
+               _mesa_half_to_float(src->coords[i].min_y),
+               _mesa_half_to_float(src->coords[i].min_z),
            },
          .max =
            {
-               _mesa_half_to_float(src->coords[i][1][0]),
-               _mesa_half_to_float(src->coords[i][1][1]),
-               _mesa_half_to_float(src->coords[i][1][2]),
+               _mesa_half_to_float(src->coords[i].max_x),
+               _mesa_half_to_float(src->coords[i].max_y),
+               _mesa_half_to_float(src->coords[i].max_z),
            },
       };
 
@@ -355,3 +357,78 @@ rra_transcode_node_gfx10_3(struct rra_transcoding_context *ctx, uint32_t parent_
 
    return dst_id;
 }
+
+void
+radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
+                              struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats)
+{
+   uint32_t node_type = node_id & 7;
+   const void *node = bvh + ((node_id & (~7u)) << 3);
+
+   stats->max_depth = MAX2(stats->max_depth, depth);
+
+   switch (node_type) {
+   case radv_bvh_node_box16: {
+      stats->sah += 1.0 * p;
+      stats->box16_node_count++;
+
+      const struct radv_bvh_box16_node *box16 = node;
+      for (uint32_t i = 0; i < 4; i++) {
+         if (box16->children[i] != 0xffffffff) {
+            float extent[3] = {
+               _mesa_half_to_float(box16->coords[i].max_x) - _mesa_half_to_float(box16->coords[i].min_x),
+               _mesa_half_to_float(box16->coords[i].max_y) - _mesa_half_to_float(box16->coords[i].min_y),
+               _mesa_half_to_float(box16->coords[i].max_z) - _mesa_half_to_float(box16->coords[i].min_z),
+            };
+            float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
+            radv_gather_bvh_stats_gfx10_3(bvh, box16->children[i], depth + 1, surface_area, blas_sah, stats);
+         }
+      }
+
+      break;
+   }
+   case radv_bvh_node_box32: {
+      stats->sah += 1.5 * p;
+      stats->box32_node_count++;
+
+      const struct radv_bvh_box32_node *box32 = node;
+      for (uint32_t i = 0; i < 4; i++) {
+         if (box32->children[i] != 0xffffffff) {
+            float extent[3] = {
+               box32->coords[i].max.x - box32->coords[i].min.x,
+               box32->coords[i].max.y - box32->coords[i].min.y,
+               box32->coords[i].max.z - box32->coords[i].min.z,
+            };
+            float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
+            radv_gather_bvh_stats_gfx10_3(bvh, box32->children[i], depth + 1, surface_area, blas_sah, stats);
+         }
+      }
+
+      break;
+   }
+   case radv_bvh_node_instance: {
+      stats->sah += 2.0 * p;
+      stats->instance_node_count++;
+
+      const struct radv_bvh_instance_node *instance = node;
+      uint64_t blas_va = radv_node_to_addr(instance->bvh_ptr) - instance->bvh_offset;
+      float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va);
+      if (sah)
+         stats->instance_sah += *sah * p;
+      else
+         fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%" PRIx64 "\n", blas_va);
+
+      break;
+   }
+   case radv_bvh_node_triangle:
+      stats->sah += 2.0 * p;
+      stats->triangle_node_count++;
+      break;
+   case radv_bvh_node_aabb:
+      stats->sah += 4.0 * p;
+      stats->procedural_node_count++;
+      break;
+   default:
+      break;
+   }
+}
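
As context for the "& 7" / "<< 3" arithmetic in the traversal above: gfx10.3 node ids keep the
node type in the low three bits and the node offset in units of 8 bytes in the rest. A standalone
sketch of the corresponding pack/unpack (signatures mine, mirroring the shader-side helpers):

   #include <stdint.h>

   /* Node id layout: bits [2:0] = node type, bits [31:3] = byte offset >> 3.
    * Nodes are 64-byte aligned, so the low bits of offset >> 3 are free. */
   static inline uint32_t
   pack_node_id(uint32_t offset, uint32_t type)
   {
      return (offset >> 3) | type;
   }

   static inline uint32_t
   node_id_to_offset(uint32_t node_id)
   {
      return (node_id & ~7u) << 3;
   }
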
diff --git a/src/amd/vulkan/radv_rra_gfx12.c b/src/amd/vulkan/radv_rra_gfx12.c
index 3260664db52..ce233bb0d82 100644
--- a/src/amd/vulkan/radv_rra_gfx12.c
+++ b/src/amd/vulkan/radv_rra_gfx12.c
@@ -10,6 +10,7 @@
 #include "radv_rra.h"
 
 #include "util/bitset.h"
+#include "util/compiler.h"
 
 struct rra_instance_sideband_data {
    uint32_t instance_index;
@@ -306,3 +307,98 @@ rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id
       }
    }
 }
+
+void
+radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float surface_area,
+                            struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats)
+{
+   uint32_t node_type = node_id & 0xf;
+   const void *node = bvh + ((node_id & (~0xf)) << 3);
+
+   stats->max_depth = MAX2(stats->max_depth, depth);
+
+   switch (node_type) {
+   case radv_bvh_node_box32: {
+      stats->box_node_count++;
+      stats->sah += 0.5 * surface_area;
+
+      const struct radv_gfx12_box_node *src = node;
+
+      uint32_t valid_child_count_minus_one = src->child_count_exponents >> 28;
+
+      if (valid_child_count_minus_one != 0xf) {
+         uint32_t internal_id = src->internal_base_id;
+         uint32_t primitive_id = src->primitive_base_id;
+
+         uint32_t exponents[3] = {
+            src->child_count_exponents & 0xff,
+            (src->child_count_exponents >> 8) & 0xff,
+            (src->child_count_exponents >> 16) & 0xff,
+         };
+         float extent[3] = {
+            uif(exponents[0] << 23),
+            uif(exponents[1] << 23),
+            uif(exponents[2] << 23),
+         };
+
+         for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
+            uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf;
+            uint32_t child_size = src->children[i].dword2 >> 28;
+
+            uint32_t child_id;
+            if (child_type == radv_bvh_node_box32) {
+               child_id = internal_id | child_type;
+               internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
+            } else {
+               child_id = primitive_id | child_type;
+               primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
+            }
+
+            float min[3] = {
+               (float)(src->children[i].dword0 & 0xfff) / 0x1000 * extent[0],
+               (float)((src->children[i].dword0 >> 12) & 0xfff) / 0x1000 * extent[1],
+               (float)(src->children[i].dword1 & 0xfff) / 0x1000 * extent[2],
+            };
+            float max[3] = {
+               (float)(((src->children[i].dword1 >> 12) & 0xfff) + 1) / 0x1000 * extent[0],
+               (float)((src->children[i].dword2 & 0xfff) + 1) / 0x1000 * extent[1],
+               (float)(((src->children[i].dword2 >> 12) & 0xfff) + 1) / 0x1000 * extent[2],
+            };
+            float child_extent[3] = {
+               max[0] - min[0],
+               max[1] - min[1],
+               max[2] - min[2],
+            };
+            float child_surface_area = 2 * (child_extent[0] * child_extent[1] + child_extent[0] * child_extent[2] +
+                                            child_extent[1] * child_extent[2]);
+
+            radv_gather_bvh_stats_gfx12(bvh, child_id, depth + 1, child_surface_area, blas_sah, stats);
+         }
+      }
+
+      break;
+   }
+   case radv_bvh_node_instance: {
+      stats->instance_node_count++;
+      stats->sah += 0.7 * surface_area;
+
+      struct radv_gfx12_instance_node *instance = (struct radv_gfx12_instance_node *)(node);
+      const struct radv_gfx12_instance_node_user_data *user_data =
+         (const void *)((const uint8_t *)node + sizeof(struct radv_gfx12_instance_node));
+      uint64_t blas_va = radv_node_to_addr(instance->pointer_flags_bvh_addr) - user_data->bvh_offset;
+      float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va);
+      if (sah)
+         stats->instance_sah += *sah * surface_area;
+      else
+         fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%" PRIx64 "\n", blas_va);
+
+      break;
+   }
+   case radv_bvh_node_triangle:
+      stats->primitive_node_count++;
+      FALLTHROUGH;
+   default:
+      stats->sah += 1.0 * surface_area;
+      break;
+   }
+}
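
The 12-bit child bounds decoded above work against a per-axis power-of-two extent: each exponent
byte is placed straight into a float's exponent field (uif(exp << 23)), and the quantized min/max
scale by extent/4096, with the max biased by one grid cell so the decoded box stays conservative.
A standalone sketch of one axis (function names mine):

   #include <stdint.h>
   #include <string.h>

   /* Reinterpret a bit pattern as a float, like Mesa's uif(). */
   static float
   uif(uint32_t bits)
   {
      float f;
      memcpy(&f, &bits, sizeof(f));
      return f;
   }

   /* Decode one axis of a gfx12 quantized child AABB, relative to the parent
    * origin: a 12-bit grid laid over a power-of-two extent. */
   static void
   decode_axis(uint32_t exponent_byte, uint32_t qmin, uint32_t qmax,
               float *out_min, float *out_max)
   {
      float extent = uif(exponent_byte << 23);        /* 2^(exponent_byte - 127) */
      *out_min = (float)qmin / 0x1000 * extent;
      *out_max = (float)(qmax + 1) / 0x1000 * extent; /* +1: upper edge of the cell */
   }
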
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 6c9aa7e5117..723d2691101 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -257,7 +257,7 @@ for src_t in [tint, tuint, tfloat, tbool]:
    for dst_t in dst_types:
        for dst_bit_size in type_sizes(dst_t):
            if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
-               rnd_modes = ['_rtne', '_rtz', '']
+               rnd_modes = ['_rtne', '_rtz', '_ru', '_rd', '']
                for rnd_mode in rnd_modes:
                    if rnd_mode == '_rtne':
                        conv_expr = """
@@ -279,6 +279,22 @@ for src_t in [tint, tuint, tfloat, tbool]:
                           dst = src0;
                        }
                        """
+                   elif rnd_mode == '_ru':
+                       conv_expr = """
+                       if (bit_size > 16) {
+                          dst = _mesa_half_to_float(_mesa_float_to_float16_ru(src0));
+                       } else {
+                          dst = src0;
+                       }
+                       """
+                   elif rnd_mode == '_rd':
+                       conv_expr = """
+                       if (bit_size > 16) {
+                          dst = _mesa_half_to_float(_mesa_float_to_float16_rd(src0));
+                       } else {
+                          dst = src0;
+                       }
+                       """
                    else:
                        conv_expr = """
                        if (bit_size > 32) {
diff --git a/src/compiler/spirv/spirv_internal_exts.h b/src/compiler/spirv/spirv_internal_exts.h
new file mode 100644
index 00000000000..74c3b9fd79b
--- /dev/null
+++ b/src/compiler/spirv/spirv_internal_exts.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright © 2025 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef SPIRV_INTERNAL_EXTS_H
+#define SPIRV_INTERNAL_EXTS_H
+
+#define SpvOpFConvertRUMesa 0
+#define SpvOpFConvertRDMesa 1
+
+#endif
diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c
index 7d4b9a7d21f..00e7d0d58f1 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -923,6 +923,29 @@ vtn_handle_non_semantic_debug_info(struct vtn_builder *b, SpvOp ext_opcode,
    return true;
 }
 
+static bool
+vtn_handle_mesa_internal(struct vtn_builder *b, SpvOp ext_opcode,
+                         const uint32_t *w, unsigned count)
+{
+   uint32_t instr = w[4];
+
+   switch (instr) {
+   case SpvOpFConvertRUMesa: {
+      struct vtn_ssa_value *arg = vtn_ssa_value(b, w[5]);
+      vtn_push_nir_ssa(b, w[2], nir_f2f16_ru(&b->nb, arg->def));
+      break;
+   }
+   case SpvOpFConvertRDMesa: {
+      struct vtn_ssa_value *arg = vtn_ssa_value(b, w[5]);
+      vtn_push_nir_ssa(b, w[2], nir_f2f16_rd(&b->nb, arg->def));
+      break;
+   }
+   }
+
+   return true;
+}
+
 static void
 vtn_handle_extension(struct vtn_builder *b, SpvOp opcode, const uint32_t *w,
                      unsigned count)
@@ -958,6 +981,8 @@ vtn_handle_extension(struct vtn_builder *b, SpvOp opcode,
          val->ext_handler = vtn_handle_debug_printf;
       } else if (strstr(ext, "NonSemantic.") == ext) {
          val->ext_handler = vtn_handle_non_semantic_instruction;
+      } else if (strstr(ext, "MesaInternal") == ext) {
+         val->ext_handler = vtn_handle_mesa_internal;
       } else {
          vtn_fail("Unsupported extension: %s", ext);
       }
diff --git a/src/compiler/spirv/vtn_private.h b/src/compiler/spirv/vtn_private.h
index 5d601f95c86..9f6009ed8ea 100644
--- a/src/compiler/spirv/vtn_private.h
+++ b/src/compiler/spirv/vtn_private.h
@@ -33,6 +33,7 @@
 #include "spirv.h"
 #include "spirv_info.h"
 #include "vtn_generator_ids.h"
+#include "spirv_internal_exts.h"
 
 extern uint32_t mesa_spirv_debug;
diff --git a/src/util/half_float.c b/src/util/half_float.c
index 0eacf06c5a8..6734842df1e 100644
--- a/src/util/half_float.c
+++ b/src/util/half_float.c
@@ -211,3 +211,41 @@ uint16_t _mesa_uint16_div_64k_to_half(uint16_t v)
 
    return (e << 10) | m;
 }
+
+static uint16_t
+util_nextafter16(uint16_t x, bool up)
+{
+   uint16_t sign_mask = 1ull << 15;
+   uint16_t min_abs = 1;
+
+   float f = _mesa_half_to_float(x);
+   if (isnan(f) || (f == INFINITY && up) || (f == -INFINITY && !up))
+      return x;
+
+   /* Beware of +/-0.0: 0x0000 - 1 and 0x8000 - 1 both wrap to NaN encodings. */
+   uint16_t xn = f == 0 ? (sign_mask | min_abs) : x - 1;
+
+   /* Beware of -0.0: 0x8000 + 1 is -0x1p-24, the smallest negative subnormal. */
+   uint16_t xp = f == 0 ? min_abs : x + 1;
+
+   /* Away from zero, nextafter is just +/- 1 on the integer representation. */
+   return (up ^ (f < 0)) ? xp : xn;
+}
+
+uint16_t
+_mesa_float_to_float16_ru(float val)
+{
+   uint16_t half = _mesa_float_to_half(val);
+   if (_mesa_half_to_float(half) < val)
+      return util_nextafter16(half, true);
+   return half;
+}
+
+uint16_t
+_mesa_float_to_float16_rd(float val)
+{
+   uint16_t half = _mesa_float_to_half(val);
+   if (_mesa_half_to_float(half) > val)
+      return util_nextafter16(half, false);
+   return half;
+}
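
A quick self-check of the directed conversions (hypothetical test harness, not part of the
patch): for any finite input, converting back must give rd(f) <= f <= ru(f), with the two ends at
most one f16 ulp apart.

   #include <assert.h>
   #include <stdio.h>
   #include "util/half_float.h"

   int
   main(void)
   {
      const float inputs[] = {0.1f, 1000.25f, -3.3f, 65504.0f /* FLT16_MAX */};
      for (unsigned i = 0; i < 4; i++) {
         float f = inputs[i];
         float lo = _mesa_half_to_float(_mesa_float_to_float16_rd(f));
         float hi = _mesa_half_to_float(_mesa_float_to_float16_ru(f));
         assert(lo <= f && f <= hi);
         printf("%f is contained in [%f, %f]\n", f, lo, hi);
      }
      return 0;
   }
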
diff --git a/src/util/half_float.h b/src/util/half_float.h
index f184323bd60..6961e1ed618 100644
--- a/src/util/half_float.h
+++ b/src/util/half_float.h
@@ -113,6 +113,9 @@ _mesa_float_to_float16_rtz(float val)
    return _mesa_float_to_float16_rtz_slow(val);
 }
 
+uint16_t _mesa_float_to_float16_ru(float val);
+uint16_t _mesa_float_to_float16_rd(float val);
+
 static inline uint16_t
 _mesa_float_to_float16_rtne(float val)
 {
diff --git a/src/vulkan/runtime/bvh/meson.build b/src/vulkan/runtime/bvh/meson.build
index 02b2afb4163..add1590b70f 100644
--- a/src/vulkan/runtime/bvh/meson.build
+++ b/src/vulkan/runtime/bvh/meson.build
@@ -42,6 +42,7 @@ bvh_shaders = [
   ],
 ]
 
+spirv_include_dir = dir_source_root + '/src/compiler/spirv'
 vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh'
 
 vk_bvh_includes = files(
@@ -50,6 +51,7 @@ vk_bvh_includes = files(
   'vk_build_interface.h',
   'vk_bvh.h',
   'vk_debug.h',
+  spirv_include_dir + '/spirv_internal_exts.h',
 )
 
 vk_glsl_shader_extensions = [
@@ -69,6 +71,7 @@ vk_glsl_shader_extensions = [
   'GL_KHR_shader_subgroup_ballot',
   'GL_KHR_shader_subgroup_clustered',
   'GL_EXT_shader_atomic_int64',
+  'GL_EXT_spirv_intrinsics',
 ]
 
 vk_glsl_shader_preamble = []
@@ -79,7 +82,7 @@ endforeach
 bvh_spv = []
 foreach s : bvh_shaders
   command = [
-    prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
+    prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
   ] + (with_mesa_debug ? ['-g'] : [])
   command += glslang_quiet
   command += vk_glsl_shader_preamble
diff --git a/src/vulkan/runtime/bvh/vk_build_helpers.h b/src/vulkan/runtime/bvh/vk_build_helpers.h
index 01acb4db715..dd5795855b2 100644
--- a/src/vulkan/runtime/bvh/vk_build_helpers.h
+++ b/src/vulkan/runtime/bvh/vk_build_helpers.h
@@ -180,6 +180,7 @@
 
 #define INFINITY (1.0 / 0.0)
 #define NAN (0.0 / 0.0)
+#define NAN_F16 (0.0hf / 0.0hf)
 
 #define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))