diff --git a/docs/features.txt b/docs/features.txt index 56699bc3fb2..4ed1ccdeb45 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -529,7 +529,7 @@ Vulkan 1.4 -- all DONE: anv, lvp, nvk, radv/gfx8+, tu/a7xx+ Khronos extensions that are not part of any Vulkan version: - VK_KHR_acceleration_structure DONE (anv/gfx12.5+, lvp, radv/gfx10.3+) + VK_KHR_acceleration_structure DONE (anv/gfx12.5+, lvp, radv/gfx10.3+, tu/a740+) VK_KHR_android_surface not started VK_KHR_calibrated_timestamps DONE (anv, nvk, radv, tu/a750+) VK_KHR_compute_shader_derivatives DONE (anv, nvk, radv, tu/a7xx+) diff --git a/meson.build b/meson.build index 7cbf7a7770e..1edc7746993 100644 --- a/meson.build +++ b/meson.build @@ -642,7 +642,7 @@ endif prog_glslang = find_program( 'glslangValidator', native : true, - required : with_vulkan_overlay_layer or with_aco_tests or with_amd_vk or with_intel_vk or with_swrast_vk + required : with_vulkan_overlay_layer or with_aco_tests or with_amd_vk or with_intel_vk or with_swrast_vk or with_freedreno_vk ) if prog_glslang.found() diff --git a/src/freedreno/vulkan/bvh/copy.comp b/src/freedreno/vulkan/bvh/copy.comp new file mode 100644 index 00000000000..c36c4cfde12 --- /dev/null +++ b/src/freedreno/vulkan/bvh/copy.comp @@ -0,0 +1,106 @@ +/* + * Copyright © 2022 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require + +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +#include "tu_build_interface.h" + +layout(push_constant) uniform CONSTS { + copy_args args; +}; + +void +main(void) +{ + uint32_t global_id = gl_GlobalInvocationID.x; + uint32_t lanes = gl_NumWorkGroups.x * 128; + uint32_t increment = lanes * 16; + + uint64_t copy_src_addr = args.src_addr; + uint64_t copy_dst_addr = args.dst_addr; + + if (args.mode == TU_COPY_MODE_DESERIALIZE) { + copy_src_addr += SIZEOF(vk_accel_struct_serialization_header) + + DEREF(REF(vk_accel_struct_serialization_header)(args.src_addr)).instance_count * SIZEOF(uint64_t); + } + + REF(tu_accel_struct_header) header = REF(tu_accel_struct_header)(copy_src_addr); + + uint64_t instance_base = args.src_addr + SIZEOF(vk_accel_struct_serialization_header); + uint64_t instance_offset = SIZEOF(tu_accel_struct_header); + uint64_t instance_end = DEREF(header).instance_count * SIZEOF(tu_instance_descriptor); + if (instance_end > 0) + instance_end += instance_offset; + + if (args.mode == TU_COPY_MODE_SERIALIZE) { + copy_dst_addr += SIZEOF(vk_accel_struct_serialization_header) + + DEREF(REF(tu_accel_struct_header)(args.src_addr)).instance_count * SIZEOF(uint64_t); + + if (global_id == 0) { + REF(vk_accel_struct_serialization_header) ser_header = + REF(vk_accel_struct_serialization_header)(args.dst_addr); + DEREF(ser_header).serialization_size = DEREF(header).serialization_size; + DEREF(ser_header).deserialization_size = DEREF(header).compacted_size; + DEREF(ser_header).instance_count = DEREF(header).instance_count; + } + + instance_base = args.dst_addr + SIZEOF(vk_accel_struct_serialization_header); + } else if (args.mode == TU_COPY_MODE_COPY) { + instance_end = 0; + } + + uint64_t size = DEREF(header).compacted_size; + for (uint64_t offset = global_id * 16; offset < size; offset += increment) { + DEREF(REF(uvec4)(copy_dst_addr + offset)) = + DEREF(REF(uvec4)(copy_src_addr + offset)); + + /* Do the adjustment inline in the same invocation that copies the data so that we don't have + * to synchronize. 
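+       *
+       * Both SIZEOF(tu_accel_struct_header) and SIZEOF(tu_instance_descriptor)
+       * are multiples of 16, so the start of every instance descriptor lines
+       * up with one of the 16-byte chunks copied above and the modulo test
+       * below fires exactly once per descriptor.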
*/ + if (offset < instance_end && offset >= instance_offset && + (offset - instance_offset) % SIZEOF(tu_instance_descriptor) == 0) { + uint64_t idx = (offset - instance_offset) / SIZEOF(tu_instance_descriptor); + + uint32_t bvh_offset = DEREF(REF(tu_instance_descriptor)(copy_src_addr + offset)).bvh_offset; + if (args.mode == TU_COPY_MODE_SERIALIZE) { + DEREF(INDEX(uint64_t, instance_base, idx)) = + DEREF(REF(tu_instance_descriptor)(copy_src_addr + offset)).bvh_ptr - bvh_offset; + } else { /* TU_COPY_MODE_DESERIALIZE */ + uint64_t blas_addr = DEREF(INDEX(uint64_t, instance_base, idx)); + DEREF(REF(tu_instance_descriptor)(copy_dst_addr + offset)).bvh_ptr = blas_addr + bvh_offset; + } + } + } +} diff --git a/src/freedreno/vulkan/bvh/encode.comp b/src/freedreno/vulkan/bvh/encode.comp new file mode 100644 index 00000000000..3cc3176503e --- /dev/null +++ b/src/freedreno/vulkan/bvh/encode.comp @@ -0,0 +1,502 @@ +/* + * Copyright © 2022 Friedrich Vock + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_KHR_memory_scope_semantics : require + +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +#include "tu_build_helpers.h" +#include "tu_build_interface.h" + +layout(push_constant) uniform CONSTS { + encode_args args; +}; + +void set_parent(uint32_t child, uint32_t parent) +{ + uint64_t addr = args.output_bvh - child * 4 - 4; + DEREF(REF(uint32_t)(addr)) = parent; +} + +/* This encoder struct is designed to encode a compressed node without keeping + * all the data live at once, making sure register pressure isn't too high. 
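+ *
+ * Illustrative use only (nothing in this series instantiates it yet);
+ * packing a 24-bit id followed by an 8-bit flags field would look like:
+ *
+ *    tu_encoder enc;
+ *    encode_init(enc, node);
+ *    encode(enc, id, 24);    // occupies bits 0..23 of word 0
+ *    encode(enc, flags, 8);  // fills bits 24..31 and flushes word 0
+ *    encode_finalize(enc);   // stores the partially filled final word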
+ */ + +struct tu_encoder { + uint32_t cur_value; + uint word_offset; + uint bit_offset; + REF(tu_compressed_node) node; +}; + +void encode_init(out tu_encoder encoder, REF(tu_compressed_node) node) +{ + encoder.cur_value = 0; + encoder.word_offset = 0; + encoder.bit_offset = 0; + encoder.node = node; +} + +void encode(inout tu_encoder encoder, uint32_t val, uint bits) +{ + encoder.cur_value |= val << encoder.bit_offset; + if (encoder.bit_offset + bits >= 32) { + DEREF(encoder.node).data[encoder.word_offset] = encoder.cur_value; + encoder.cur_value = val >> (32 - encoder.bit_offset); + encoder.word_offset++; + encoder.bit_offset = encoder.bit_offset + bits - 32; + } else { + encoder.bit_offset += bits; + } +} + +void encode_skip(inout tu_encoder encoder, uint bits) +{ + if (encoder.bit_offset + bits >= 32) { + DEREF(encoder.node).data[encoder.word_offset] = encoder.cur_value; + encoder.word_offset++; + encoder.bit_offset = encoder.bit_offset + bits - 32; + } else { + encoder.bit_offset += bits; + } +} + +void encode_finalize(tu_encoder encoder) +{ + DEREF(encoder.node).data[encoder.word_offset] = encoder.cur_value; +} + +void +encode_leaf_node(uint32_t type, uint64_t src_node, uint64_t dst_node, uint64_t dst_instances, REF(tu_accel_struct_header) dst_header) +{ + float coords[3][3]; + uint32_t id; + uint32_t geometry_id; + uint32_t type_flags = TU_NODE_TYPE_LEAF; + + switch (type) { + case vk_ir_node_triangle: { + vk_ir_triangle_node src = DEREF(REF(vk_ir_triangle_node)(src_node)); + + coords = src.coords; + uint32_t geometry_id_and_flags = src.geometry_id_and_flags; + if ((geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) { + atomicAnd(DEREF(dst_header).instance_flags, ~TU_INSTANCE_ALL_NONOPAQUE); + } else { + type_flags |= TU_NODE_TYPE_NONOPAQUE; + atomicAnd(DEREF(dst_header).instance_flags, ~TU_INSTANCE_ALL_OPAQUE); + } + geometry_id = geometry_id_and_flags & 0xffffff; + id = src.triangle_id; + break; + } + case vk_ir_node_aabb: { + vk_ir_aabb_node src = DEREF(REF(vk_ir_aabb_node)(src_node)); + vk_aabb aabb = src.base.aabb; + coords[0][0] = aabb.min[0]; + coords[0][1] = aabb.min[1]; + coords[0][2] = aabb.min[2]; + coords[1][0] = aabb.max[0]; + coords[1][1] = aabb.max[1]; + coords[1][2] = aabb.max[2]; + + type_flags |= TU_NODE_TYPE_AABB; + + if ((src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) { + atomicAnd(DEREF(dst_header).instance_flags, ~TU_INSTANCE_ALL_NONOPAQUE); + } else { + type_flags |= TU_NODE_TYPE_NONOPAQUE; + atomicAnd(DEREF(dst_header).instance_flags, ~TU_INSTANCE_ALL_OPAQUE); + } + geometry_id = src.geometry_id_and_flags & 0xffffff; + id = src.primitive_id; + break; + } + case vk_ir_node_instance: { + vk_ir_instance_node src = DEREF(REF(vk_ir_instance_node)(src_node)); + + id = src.instance_id; + geometry_id = 0; + REF(tu_instance_descriptor) dst_instance = REF(tu_instance_descriptor)(dst_instances + SIZEOF(tu_instance_descriptor) * id); + + REF(tu_accel_struct_header) blas_header = REF(tu_accel_struct_header)(src.base_ptr); + uint64_t bvh_ptr = DEREF(blas_header).bvh_ptr; + uint32_t bvh_offset = uint32_t(bvh_ptr - src.base_ptr); + + uint32_t sbt_offset_and_flags = src.sbt_offset_and_flags; + uint32_t custom_instance_and_mask = src.custom_instance_and_mask; + DEREF(dst_instance).bvh_ptr = bvh_ptr; + DEREF(dst_instance).custom_instance_index = custom_instance_and_mask & 0xffffffu; + DEREF(dst_instance).sbt_offset_and_flags = sbt_offset_and_flags; + DEREF(dst_instance).bvh_offset = bvh_offset; + + mat4 transform = mat4(src.otw_matrix); + + mat4 inv_transform = 
transpose(inverse(transpose(transform))); + DEREF(dst_instance).wto_matrix = mat3x4(inv_transform); + DEREF(dst_instance).otw_matrix = mat3x4(transform); + + vk_aabb aabb = src.base.aabb; + coords[0][0] = aabb.min[0]; + coords[0][1] = aabb.min[1]; + coords[0][2] = aabb.min[2]; + coords[1][0] = aabb.max[0]; + coords[1][1] = aabb.max[1]; + coords[1][2] = aabb.max[2]; + + type_flags |= TU_NODE_TYPE_TLAS; + + uint32_t instance_flags = DEREF(blas_header).instance_flags; + + /* Apply VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR and + * VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR to correct the + * ALL_OPAQUE/ALL_NONOPAQUE flags. + */ + if (((sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR | + VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR)) != 0) { + instance_flags &= ~(VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR | + VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR); + instance_flags |= (sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR | + VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR); + } + uint32_t cull_mask_and_flags = ((custom_instance_and_mask >> 16) & 0xff00) | instance_flags; + + coords[2][0] = uintBitsToFloat(cull_mask_and_flags); + break; + } + } + + REF(tu_leaf_node) dst = REF(tu_leaf_node)(dst_node); + DEREF(dst).coords = coords; + DEREF(dst).id = id; + DEREF(dst).geometry_id = geometry_id; + DEREF(dst).type_flags = type_flags; +} + +/* Truncate to bfloat16 while rounding down. bfloat16 is used to store the bases. + */ + +u16vec3 to_bfloat_round_down(vec3 coord) +{ + u32vec3 icoord = floatBitsToUint(coord); + return u16vec3(mix(icoord >> 16, (icoord + 0xffff) >> 16, notEqual(icoord & u32vec3(0x80000000), u32vec3(0)))); +} + +/* Approximate subtraction while rounding up. Return a result greater than or + * equal to the infinitely-precise result. This just uses the native + * subtraction and then shifts one ULP towards infinity. Because the result is + * further rounded, it should usually be good enough while being faster than + * emulated floating-point math. + * + * We assume here that the result is always nonnegative, because it's only used + * to subtract away the base. 
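+ *
+ * Round-to-nearest can return a value up to half an ULP below the exact
+ * difference, so stepping one ULP towards infinity restores the upper bound
+ * at the cost of at most about 1.5 ULP of slack, which the 8-bit child
+ * quantization below absorbs anyway.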
+ */ + +vec3 subtract_round_up_approx(vec3 a, vec3 b) +{ + vec3 f = a - b; + u32vec3 i = floatBitsToUint(f); + + i++; + + /* Handle infinity/zero special cases */ + i = mix(i, floatBitsToUint(f), isinf(f)); + i = mix(i, floatBitsToUint(f), equal(f, vec3(0))); + + return uintBitsToFloat(i); +} + +vec3 subtract_round_down_approx(vec3 a, vec3 b) +{ + vec3 f = a - b; + u32vec3 i = floatBitsToUint(f); + + i--; + + /* Handle infinity/zero special cases */ + i = mix(i, floatBitsToUint(f), isinf(f)); + i = mix(i, floatBitsToUint(f), equal(f, vec3(0))); + + return uintBitsToFloat(i); +} + +u32vec3 extract_mantissa(vec3 f) +{ + return mix((floatBitsToUint(f) & 0x7fffff) | 0x800000, u32vec3(0), equal(f, vec3(0))); +} + +void +encode_internal_node(uint32_t children[8], uint32_t children_offset, uint child_count, + vec3 min_offset, vec3 max_offset, uint32_t bvh_offset) +{ + REF(tu_internal_node) dst_node = REF(tu_internal_node)(OFFSET(args.output_bvh, SIZEOF(tu_internal_node) * bvh_offset)); + + DEREF(dst_node).id = children_offset; + + u16vec3 base_bfloat = to_bfloat_round_down(min_offset); + vec3 base_float = uintBitsToFloat(u32vec3(base_bfloat) << 16); + DEREF(dst_node).bases[0] = base_bfloat.x; + DEREF(dst_node).bases[1] = base_bfloat.y; + DEREF(dst_node).bases[2] = base_bfloat.z; + + vec3 children_max = subtract_round_up_approx(max_offset, base_float); + + /* The largest child offset will be encoded in 8 bits, including the + * explicit leading 1. We need to downcast to this precision while rounding + * up to catch cases where the exponent is increased by rounding up, then + * extract the exponent. Because children_max is always nonnegative, we can + * do the downcast with "(floatBitsToUint(children_max) + 0xffff) >> 16", + * and then we further shift to get the rounded exponent. + */ + u16vec3 exponents = u16vec3((floatBitsToUint(children_max) + 0xffff) >> 23); + u8vec3 exponents_u8 = u8vec3(exponents); + DEREF(dst_node).exponents[0] = exponents_u8.x; + DEREF(dst_node).exponents[1] = exponents_u8.y; + DEREF(dst_node).exponents[2] = exponents_u8.z; + + for (uint32_t i = 0; i < child_count; i++) { + uint32_t offset = ir_id_to_offset(children[i]); + + vk_aabb child_aabb = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; + + /* Note: because we subtract from the minimum, we should never have a + * negative value here. 
+ */ + vec3 child_min = subtract_round_down_approx(child_aabb.min, base_float); + vec3 child_max = subtract_round_up_approx(child_aabb.max, base_float); + + u16vec3 child_min_exponents = u16vec3(floatBitsToUint(child_min) >> 23); + u16vec3 child_max_exponents = u16vec3(floatBitsToUint(child_max) >> 23); + + u16vec3 child_min_shift = u16vec3(16) + exponents - child_min_exponents; + /* Divide the mantissa by 2**child_min_shift, rounding down */ + u8vec3 child_min_mantissas = + mix(u8vec3(extract_mantissa(child_min) >> child_min_shift), u8vec3(0), + greaterThanEqual(child_min_shift, u16vec3(32))); + u16vec3 child_max_shift = u16vec3(16) + exponents - child_max_exponents; + /* Divide the mantissa by 2**child_max_shift, rounding up */ + u8vec3 child_max_mantissas = + mix(u8vec3((extract_mantissa(child_max) + ((u32vec3(1u) << u32vec3(child_max_shift)) - 1)) >> child_max_shift), + u8vec3(notEqual(extract_mantissa(child_max), u32vec3(0))), + greaterThanEqual(child_max_shift, u16vec3(32))); + + DEREF(dst_node).mantissas[i][0][0] = child_min_mantissas.x; + DEREF(dst_node).mantissas[i][0][1] = child_min_mantissas.y; + DEREF(dst_node).mantissas[i][0][2] = child_min_mantissas.z; + DEREF(dst_node).mantissas[i][1][0] = child_max_mantissas.x; + DEREF(dst_node).mantissas[i][1][1] = child_max_mantissas.y; + DEREF(dst_node).mantissas[i][1][2] = child_max_mantissas.z; + } + + for (uint32_t i = child_count; i < 8; i++) { + DEREF(dst_node).mantissas[i][0][0] = uint8_t(0xff); + DEREF(dst_node).mantissas[i][0][1] = uint8_t(0xff); + DEREF(dst_node).mantissas[i][0][2] = uint8_t(0xff); + DEREF(dst_node).mantissas[i][1][0] = uint8_t(0); + DEREF(dst_node).mantissas[i][1][1] = uint8_t(0); + DEREF(dst_node).mantissas[i][1][2] = uint8_t(0); + } + + DEREF(dst_node).child_count = uint8_t(child_count); + DEREF(dst_node).type_flags = uint16_t(args.geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ? (TU_NODE_TYPE_TLAS >> 16) : 0); +} + +void +main() +{ + /* Revert the order so we start at the root */ + uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; + + uint32_t intermediate_leaf_node_size; + switch (args.geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + intermediate_leaf_node_size = SIZEOF(vk_ir_triangle_node); + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + intermediate_leaf_node_size = SIZEOF(vk_ir_aabb_node); + break; + default: /* instances */ + intermediate_leaf_node_size = SIZEOF(vk_ir_instance_node); + break; + } + + uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * intermediate_leaf_node_size; + + REF(vk_ir_box_node) intermediate_internal_nodes = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); + REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id); + vk_ir_box_node src = DEREF(src_node); + + uint64_t dst_instances = args.output_bvh - args.output_bvh_offset + SIZEOF(tu_accel_struct_header); + + bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1; + + REF(tu_accel_struct_header) header = REF(tu_accel_struct_header)(args.output_bvh - args.output_bvh_offset); + + if (is_root_node) { + DEREF(header).instance_flags = + (args.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR ? TU_INSTANCE_ALL_AABB : 0) | + /* These will be removed when processing leaf nodes */ + TU_INSTANCE_ALL_NONOPAQUE | TU_INSTANCE_ALL_OPAQUE; + DEREF(args.header).dst_node_offset = 1; + } + + for (;;) { + /* Make changes to the current node's BVH offset value visible. 
*/ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + + uint32_t bvh_offset = is_root_node ? 0 : DEREF(src_node).bvh_offset; + if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) + continue; + + if (bvh_offset == VK_NULL_BVH_OFFSET) + break; + + uint32_t found_child_count = 0; + uint32_t children[8] = {VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, + VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, + VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, + VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE}; + + for (uint32_t i = 0; i < 2; ++i) + if (src.children[i] != VK_BVH_INVALID_NODE) + children[found_child_count++] = src.children[i]; + + while (found_child_count < 8) { + int32_t collapsed_child_index = -1; + float largest_surface_area = -INFINITY; + + for (int32_t i = 0; i < found_child_count; ++i) { + if (ir_id_to_type(children[i]) != vk_ir_node_internal) + continue; + + vk_aabb bounds = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, + ir_id_to_offset(children[i]))).aabb; + + float surface_area = aabb_surface_area(bounds); + if (surface_area > largest_surface_area) { + largest_surface_area = surface_area; + collapsed_child_index = i; + } + } + + if (collapsed_child_index != -1) { + REF(vk_ir_box_node) child_node = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, + ir_id_to_offset(children[collapsed_child_index])); + uint32_t grandchildren[2] = DEREF(child_node).children; + uint32_t valid_grandchild_count = 0; + + if (grandchildren[1] != VK_BVH_INVALID_NODE) + ++valid_grandchild_count; + + if (grandchildren[0] != VK_BVH_INVALID_NODE) + ++valid_grandchild_count; + else + grandchildren[0] = grandchildren[1]; + + if (valid_grandchild_count > 1) + children[found_child_count++] = grandchildren[1]; + + if (valid_grandchild_count > 0) + children[collapsed_child_index] = grandchildren[0]; + else { + found_child_count--; + children[collapsed_child_index] = children[found_child_count]; + } + + DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; + } else + break; + } + + /* If there is only one child, collapse the current node by setting the + * child's offset to this node's offset. Otherwise, use an atomic to + * allocate contiguous space for all of the children. + */ + uint32_t children_offset = bvh_offset; + if (found_child_count > 1) { + children_offset = atomicAdd(DEREF(args.header).dst_node_offset, found_child_count); + } + + vec3 min_offset = vec3(INFINITY); + vec3 max_offset = vec3(-INFINITY); + for (uint32_t i = 0; i < found_child_count; ++i) { + uint32_t type = ir_id_to_type(children[i]); + uint32_t offset = ir_id_to_offset(children[i]); + uint32_t dst_offset; + + dst_offset = children_offset + i; + + if (type == vk_ir_node_internal) { + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset); + DEREF(child_node).bvh_offset = dst_offset; + } else { + encode_leaf_node(type, args.intermediate_bvh + offset, + args.output_bvh + SIZEOF(tu_internal_node) * dst_offset, dst_instances, + header); + } + + vk_aabb child_aabb = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; + + min_offset = min(min_offset, child_aabb.min); + max_offset = max(max_offset, child_aabb.max); + + if (found_child_count > 1) { + set_parent(dst_offset, bvh_offset); + } + } + + /* Make changes to the children's BVH offset value available to the other invocations. 
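+       *
+       * This is the release half of the protocol: child invocations are
+       * spinning at the top of this loop waiting for the bvh_offset values
+       * written just above.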
*/ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + + if (found_child_count > 1 || found_child_count == 0) + encode_internal_node(children, children_offset, found_child_count, min_offset, max_offset, bvh_offset); + + break; + } + + if (is_root_node) { + DEREF(header).aabb = src.base.aabb; + DEREF(header).bvh_ptr = args.output_bvh; + + set_parent(0, VK_BVH_INVALID_NODE); + } +} diff --git a/src/freedreno/vulkan/bvh/header.comp b/src/freedreno/vulkan/bvh/header.comp new file mode 100644 index 00000000000..d2c1f96f50f --- /dev/null +++ b/src/freedreno/vulkan/bvh/header.comp @@ -0,0 +1,65 @@ +/* + * Copyright © 2024 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */

+#version 460

+#extension GL_GOOGLE_include_directive : require

+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require

+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

+#include "tu_build_interface.h"

+layout(push_constant) uniform CONSTS
+{
+   header_args args;
+};

+void
+main(void)
+{
+   uint32_t compacted_size = args.bvh_offset + DEREF(args.src).dst_node_offset * SIZEOF(tu_internal_node);

+   uint32_t serialization_size = compacted_size + SIZEOF(vk_accel_struct_serialization_header) +
+                                 SIZEOF(uint64_t) * args.instance_count;

+   uint32_t size = serialization_size - SIZEOF(vk_accel_struct_serialization_header) -
+                   SIZEOF(uint64_t) * args.instance_count;

+   DEREF(args.dst).compacted_size = compacted_size;

+   DEREF(args.dst).copy_dispatch_size[0] = DIV_ROUND_UP(compacted_size, 16 * 128);
+   DEREF(args.dst).copy_dispatch_size[1] = 1;
+   DEREF(args.dst).copy_dispatch_size[2] = 1;

+   DEREF(args.dst).serialization_size = serialization_size;

+   DEREF(args.dst).size = size;
+}
diff --git a/src/freedreno/vulkan/bvh/meson.build b/src/freedreno/vulkan/bvh/meson.build
new file mode 100644
index 00000000000..02ec2a78e7b
--- /dev/null
+++ b/src/freedreno/vulkan/bvh/meson.build
@@ -0,0 +1,66 @@
+# Copyright © 2022 Konstantin Seurer

+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:

+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.

+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE. 
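+
+# Each compute shader below is compiled to SPIR-V by glslangValidator and
+# emitted as a C header of uint32_t words ('-x'). The driver then embeds the
+# words directly, as tu_acceleration_structure.cc does:
+#
+#    static const uint32_t encode_spv[] = {
+#    #include "bvh/encode.spv.h"
+#    };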
+ +# source file, output name, defines +bvh_shaders = [ + [ + 'encode.comp', + 'encode', + [], + ], + [ + 'header.comp', + 'header', + [], + ], + [ + 'copy.comp', + 'copy', + [] + ], +] + +tu_bvh_include_dir = dir_source_root + '/src/freedreno/vulkan/bvh' + +tu_bvh_includes = files( + 'tu_build_helpers.h', + 'tu_build_interface.h', + 'tu_bvh.h', +) + +bvh_spv = [] +foreach s : bvh_shaders + command = [ + prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + tu_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + ] + command += glslang_quiet + + foreach define : s[2] + command += '-D' + define + endforeach + + bvh_spv += custom_target( + s[1] + '.spv.h', + input : s[0], + output : s[1] + '.spv.h', + command : command, + depend_files: [vk_bvh_includes, tu_bvh_includes], + ) +endforeach diff --git a/src/freedreno/vulkan/bvh/tu_build_helpers.h b/src/freedreno/vulkan/bvh/tu_build_helpers.h new file mode 100644 index 00000000000..e33863fa129 --- /dev/null +++ b/src/freedreno/vulkan/bvh/tu_build_helpers.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2024 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef TU_BVH_BUILD_HELPERS_H +#define TU_BVH_BUILD_HELPERS_H + +#include "vk_build_helpers.h" +#include "tu_bvh.h" + +TYPE(tu_accel_struct_header, 8); +TYPE(tu_leaf_node, 4); +TYPE(tu_internal_node, 4); +TYPE(tu_compressed_node, 4); +TYPE(tu_instance_descriptor, 8); + +#endif + diff --git a/src/freedreno/vulkan/bvh/tu_build_interface.h b/src/freedreno/vulkan/bvh/tu_build_interface.h new file mode 100644 index 00000000000..4cf05ef597a --- /dev/null +++ b/src/freedreno/vulkan/bvh/tu_build_interface.h @@ -0,0 +1,64 @@ +/* + * Copyright © 2022 Konstantin Seurer + * Copyright © 2024 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */

+#ifndef TU_BVH_BUILD_INTERFACE_H
+#define TU_BVH_BUILD_INTERFACE_H

+#ifdef VULKAN
+#include "tu_build_helpers.h"
+#else
+#include <stdint.h>
+#include "tu_bvh.h"
+#define REF(type) uint64_t
+#define VOID_REF uint64_t
+#endif

+struct encode_args {
+   VOID_REF intermediate_bvh;
+   VOID_REF output_bvh;
+   REF(vk_ir_header) header;
+   uint32_t output_bvh_offset;
+   uint32_t leaf_node_count;
+   uint32_t geometry_type;
+};

+struct header_args {
+   REF(vk_ir_header) src;
+   REF(tu_accel_struct_header) dst;
+   uint32_t bvh_offset;
+   uint32_t instance_count;
+};

+#define TU_COPY_MODE_COPY 0
+#define TU_COPY_MODE_SERIALIZE 1
+#define TU_COPY_MODE_DESERIALIZE 2

+struct copy_args {
+   VOID_REF src_addr;
+   VOID_REF dst_addr;
+   uint32_t mode;
+};

+#endif
+
diff --git a/src/freedreno/vulkan/bvh/tu_bvh.h b/src/freedreno/vulkan/bvh/tu_bvh.h
new file mode 100644
index 00000000000..f3f98e83867
--- /dev/null
+++ b/src/freedreno/vulkan/bvh/tu_bvh.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright © 2021 Bas Nieuwenhuizen
+ * Copyright © 2024 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */

+#ifndef TU_BVH_H
+#define TU_BVH_H

+#ifdef VULKAN
+#define VK_UUID_SIZE 16
+#else
+#include <vulkan/vulkan.h>
+#endif

+#include "vk_bvh.h"

+/* The size in bytes of each record in the D3D-style UAV descriptor for
+ * acceleration structures. The first record is the acceleration struct header
+ * and the rest are the instances.
+ */
+#define AS_RECORD_SIZE 128

+/* The size of a BVH node as defined by the HW. */
+#define AS_NODE_SIZE 64

+struct tu_accel_struct_header {
+   vk_aabb aabb;

+   uint64_t bvh_ptr;

+   /* This word contains flags that should be set in the leaf nodes for
+    * instances pointing to this BLAS. TU_INSTANCE_ALL_{OPAQUE,NONOPAQUE} may
+    * be modified by the FORCE_OPAQUE and FORCE_NON_OPAQUE instance flags.
+    */
+   uint32_t instance_flags;

+   /* Everything after this gets either updated/copied from the CPU or written by header.comp. 
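+    *
+    * When the build allows compaction, header.comp fills the sizes and the
+    * copy dispatch parameters on the GPU after encoding; otherwise
+    * tu_acceleration_structure.cc precomputes them on the CPU and emits them
+    * with CP_MEM_WRITE.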
*/
+   uint32_t copy_dispatch_size[3];

+   uint64_t compacted_size;
+   uint64_t serialization_size;
+   uint64_t size;

+   /* Everything after this gets updated/copied from the CPU. */
+   uint64_t instance_count;

+   uint64_t self_ptr;

+   uint32_t padding[10];
+};

+/* See
+ * https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/a7xx-ray-tracing
+ * for details of the encoding.
+ */

+#define TU_NODE_TYPE_TLAS (1u << 24)
+#define TU_NODE_TYPE_LEAF (1u << 25)
+#define TU_NODE_TYPE_NONOPAQUE (1u << 26)
+#define TU_NODE_TYPE_AABB (1u << 27)

+#define TU_INTERSECTION_TYPE_TLAS (1u << 8)
+#define TU_INTERSECTION_TYPE_LEAF (1u << 9)
+#define TU_INTERSECTION_TYPE_NONOPAQUE (1u << 10)
+#define TU_INTERSECTION_TYPE_AABB (1u << 11)
+#define TU_INTERSECTION_BACK_FACE (1u << 12)

+#define TU_INSTANCE_ALL_OPAQUE (1u << 2)
+#define TU_INSTANCE_ALL_NONOPAQUE (1u << 3)
+#define TU_INSTANCE_ALL_AABB (1u << 6)

+struct tu_leaf_node {
+   uint32_t id;
+   float coords[3][3];
+   uint32_t geometry_id; /* Ignored by HW, we use it to stash the geometry ID */
+   uint32_t padding[4];
+   uint32_t type_flags;
+};

+struct tu_internal_node {
+   uint32_t id;
+   uint16_t bases[3];
+   uint8_t mantissas[8][2][3];
+   uint8_t exponents[3];
+   uint8_t child_count;
+   uint16_t type_flags;
+};

+struct tu_compressed_node {
+   uint32_t id;
+   uint32_t bases[3];
+   uint32_t data[12];
+};

+struct tu_instance_descriptor {
+   uint64_t bvh_ptr;

+   uint32_t custom_instance_index;

+   /* lower 24 bits are the sbt offset, upper 8 bits are the
+    * VkGeometryInstanceFlagsKHR
+    */
+   uint32_t sbt_offset_and_flags;

+   mat3x4 wto_matrix;

+   uint32_t bvh_offset;

+   /* Pad to make the size a power of 2 so that addressing math is
+    * simplified.
+    */
+   uint32_t reserved[3];

+   /* Object to world matrix transposed from the initial transform. 
*/ + mat3x4 otw_matrix; +}; + +#endif + diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build index 16ab9357e36..532a82f7ba6 100644 --- a/src/freedreno/vulkan/meson.build +++ b/src/freedreno/vulkan/meson.build @@ -19,7 +19,10 @@ tu_entrypoints = custom_target( libtu_files = files( + 'bvh/tu_bvh.h', + 'bvh/tu_build_interface.h', 'layers/tu_rmv_layer.cc', + 'tu_acceleration_structure.cc', 'tu_autotune.cc', 'tu_buffer.cc', 'tu_buffer_view.cc', @@ -47,6 +50,8 @@ libtu_files = files( 'tu_util.cc', ) +subdir('bvh') + libtu_includes = [ inc_include, inc_src, @@ -160,7 +165,7 @@ endif libvulkan_freedreno = shared_library( 'vulkan_freedreno', - [libtu_files, tu_entrypoints, tu_tracepoints, freedreno_xml_header_files, sha1_h, u_format_pack_h], + [libtu_files, tu_entrypoints, tu_tracepoints, freedreno_xml_header_files, sha1_h, u_format_pack_h, bvh_spv], include_directories : libtu_includes, link_with : [ libfreedreno_ir3, diff --git a/src/freedreno/vulkan/tu_acceleration_structure.cc b/src/freedreno/vulkan/tu_acceleration_structure.cc new file mode 100644 index 00000000000..f87337e899a --- /dev/null +++ b/src/freedreno/vulkan/tu_acceleration_structure.cc @@ -0,0 +1,763 @@ +/* + * Copyright © 2021 Bas Nieuwenhuizen + * Copyright © 2024 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "tu_buffer.h" +#include "tu_device.h" +#include "tu_cmd_buffer.h" + +#include "vk_acceleration_structure.h" +#include "tu_acceleration_structure.h" +#include "radix_sort/radix_sort_u64.h" + +#include "util/u_hexdump.h" + +#include "bvh/tu_build_interface.h" + +static const uint32_t encode_spv[] = { +#include "bvh/encode.spv.h" +}; + +static const uint32_t header_spv[] = { +#include "bvh/header.spv.h" +}; + +static const uint32_t copy_spv[] = { +#include "bvh/copy.spv.h" +}; + +static_assert(sizeof(struct tu_instance_descriptor) == AS_RECORD_SIZE); +static_assert(sizeof(struct tu_accel_struct_header) == AS_RECORD_SIZE); +static_assert(sizeof(struct tu_internal_node) == AS_NODE_SIZE); +static_assert(sizeof(struct tu_leaf_node) == AS_NODE_SIZE); + +static VkResult +get_pipeline_spv(struct tu_device *device, + const char *name, const uint32_t *spv, uint32_t spv_size, + unsigned push_constant_size, + VkPipeline *pipeline, VkPipelineLayout *layout) +{ + size_t key_size = strlen(name); + + const VkPushConstantRange pc_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = push_constant_size, + }; + + VkResult result = vk_meta_get_pipeline_layout(&device->vk, + &device->meta, NULL, + &pc_range, name, key_size, + layout); + + if (result != VK_SUCCESS) + return result; + + VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta, name, key_size); + if (pipeline_from_cache != VK_NULL_HANDLE) { + *pipeline = pipeline_from_cache; + return VK_SUCCESS; + } + + VkShaderModuleCreateInfo module_info = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = NULL, + .flags = 0, + .codeSize = spv_size, + .pCode = spv, + }; + + VkPipelineShaderStageCreateInfo shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &module_info, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + .pSpecializationInfo = NULL, + }; + + VkComputePipelineCreateInfo pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .flags = 0, + .stage = shader_stage, + .layout = *layout, + }; + + return vk_meta_create_compute_pipeline(&device->vk, &device->meta, &pipeline_info, + name, key_size, pipeline); +} + +struct bvh_layout { + uint64_t bvh_offset; + uint64_t size; +}; + +static void +get_bvh_layout(VkGeometryTypeKHR geometry_type, + uint32_t leaf_count, + struct bvh_layout *layout) +{ + uint32_t internal_count = MAX2(leaf_count, 2) - 1; + + uint64_t offset = sizeof(struct tu_accel_struct_header); + + /* Instance descriptors, one per instance. */ + if (geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR) { + offset += leaf_count * sizeof(struct tu_instance_descriptor); + } + + /* Parent links, which have to go directly before bvh_offset as we index + * them using negative offsets from there. + */ + offset += (internal_count + leaf_count) * sizeof(uint32_t); + + /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. 
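+    *
+    * As a worked example, a TLAS with leaf_count = 4 has internal_count = 3,
+    * so: 128-byte header + 4 * 128-byte instance descriptors + (3 + 4) * 4
+    * bytes of parent links = 668 bytes, giving bvh_offset = ALIGN(668, 64) =
+    * 704 and layout->size = 704 + 3 * 64 + 4 * 64 = 1152.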
*/ + offset = ALIGN(offset, 64); + layout->bvh_offset = offset; + + offset += internal_count * sizeof(struct tu_internal_node) + + leaf_count * sizeof(struct tu_leaf_node); + + layout->size = offset; +} + +VkDeviceSize get_bvh_size(VkDevice device, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + uint32_t leaf_count) +{ + struct bvh_layout layout; + get_bvh_layout(vk_get_as_geometry_type(pBuildInfo), leaf_count, &layout); + return layout.size; +} + +static uint32_t +encode_key(VkAccelerationStructureTypeKHR type, + VkBuildAccelerationStructureFlagBitsKHR flags) +{ + return 0; +} + + +static VkResult +encode_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + struct tu_device *device = cmdbuf->device; + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(device, "encode", encode_spv, sizeof(encode_spv), + sizeof(encode_args), &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + return VK_SUCCESS; +} + +static void +encode(VkCommandBuffer commandBuffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, + uint32_t leaf_count, + uint32_t key, + struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + struct tu_device *device = cmdbuf->device; + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); + + VkPipeline pipeline; + VkPipelineLayout layout; + get_pipeline_spv(device, "encode", encode_spv, sizeof(encode_spv), + sizeof(encode_args), &pipeline, &layout); + + struct bvh_layout bvh_layout; + get_bvh_layout(geometry_type, leaf_count, &bvh_layout); + + const struct encode_args args = { + .intermediate_bvh = intermediate_as_addr, + .output_bvh = vk_acceleration_structure_get_va(dst) + bvh_layout.bvh_offset, + .header = intermediate_header_addr, + .output_bvh_offset = bvh_layout.bvh_offset, + .leaf_node_count = leaf_count, + .geometry_type = geometry_type, + }; + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), + &args); + + tu_dispatch_unaligned_indirect(commandBuffer, + intermediate_header_addr + + offsetof(struct vk_ir_header, ir_internal_node_count)); + + *(VkDeviceSize *) + util_sparse_array_get(&device->accel_struct_ranges, + vk_acceleration_structure_get_va(dst)) = dst->size; + +} + +/* Don't bother copying over the compacted size using a compute shader if + * compaction is never going to happen. + */ +enum tu_header_key { + HEADER_NO_DISPATCH, + HEADER_USE_DISPATCH +}; + +static uint32_t +header_key(VkAccelerationStructureTypeKHR type, + VkBuildAccelerationStructureFlagBitsKHR flags) +{ + return (flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) ? 
+ HEADER_USE_DISPATCH : HEADER_NO_DISPATCH; +} + +static VkResult +header_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + struct tu_device *device = cmdbuf->device; + + if (key == HEADER_USE_DISPATCH) { + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(device, "header", header_spv, sizeof(header_spv), + sizeof(header_args), &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + static const VkMemoryBarrier mb = { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, + }; + + vk_common_CmdPipelineBarrier(commandBuffer, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, 1, &mb, 0, NULL, 0, NULL); + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + } + + return VK_SUCCESS; +} + +static void +header(VkCommandBuffer commandBuffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, + uint32_t leaf_count, + uint32_t key, + struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + struct tu_device *device = cmdbuf->device; + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); + + struct bvh_layout bvh_layout; + get_bvh_layout(geometry_type, leaf_count, &bvh_layout); + + VkDeviceAddress header_addr = vk_acceleration_structure_get_va(dst); + + size_t base = offsetof(struct tu_accel_struct_header, copy_dispatch_size); + + uint32_t instance_count = + geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ? 
leaf_count : 0; + + if (key == HEADER_USE_DISPATCH) { + base = offsetof(struct tu_accel_struct_header, instance_count); + VkPipeline pipeline; + VkPipelineLayout layout; + get_pipeline_spv(device, "header", header_spv, sizeof(header_spv), + sizeof(header_args), &pipeline, &layout); + + struct header_args args = { + .src = intermediate_header_addr, + .dst = vk_acceleration_structure_get_va(dst), + .bvh_offset = bvh_layout.bvh_offset, + .instance_count = instance_count, + }; + + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), + &args); + + vk_common_CmdDispatch(commandBuffer, 1, 1, 1); + } + + struct tu_accel_struct_header header = {}; + + header.instance_count = instance_count; + header.self_ptr = header_addr; + header.compacted_size = bvh_layout.size; + + header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 128); + header.copy_dispatch_size[1] = 1; + header.copy_dispatch_size[2] = 1; + + header.serialization_size = + header.compacted_size + + sizeof(struct vk_accel_struct_serialization_header) + sizeof(uint64_t) * header.instance_count; + + header.size = header.serialization_size - sizeof(struct vk_accel_struct_serialization_header) - + sizeof(uint64_t) * header.instance_count; + + struct tu_cs *cs = &cmdbuf->cs; + + size_t header_size = sizeof(struct tu_accel_struct_header) - base; + assert(base % sizeof(uint32_t) == 0); + assert(header_size % sizeof(uint32_t) == 0); + uint32_t *header_ptr = (uint32_t *)((char *)&header + base); + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + header_size / sizeof(uint32_t)); + tu_cs_emit_qw(cs, header_addr + base); + tu_cs_emit_array(cs, header_ptr, header_size / sizeof(uint32_t)); +} + +const struct vk_acceleration_structure_build_ops tu_as_build_ops = { + .get_as_size = get_bvh_size, + .get_encode_key = { encode_key, header_key }, + .encode_bind_pipeline = { encode_bind_pipeline, header_bind_pipeline }, + .encode_as = { encode, header }, +}; + +struct radix_sort_vk_target_config tu_radix_sort_config = { + .keyval_dwords = 2, + .init = { .workgroup_size_log2 = 8, }, + .fill = { .workgroup_size_log2 = 8, .block_rows = 8 }, + .histogram = { + .workgroup_size_log2 = 8, + .subgroup_size_log2 = 7, + .block_rows = 14, /* TODO tune this */ + }, + .prefix = { + .workgroup_size_log2 = 8, + .subgroup_size_log2 = 7, + }, + .scatter = { + .workgroup_size_log2 = 8, + .subgroup_size_log2 = 7, + .block_rows = 14, /* TODO tune this */ + }, + .nonsequential_dispatch = false, +}; + +static VkResult +init_radix_sort(struct tu_device *device) +{ + if (!device->radix_sort) { + mtx_lock(&device->radix_sort_mutex); + if (!device->radix_sort) { + device->radix_sort = + vk_create_radix_sort_u64(tu_device_to_handle(device), + &device->vk.alloc, + VK_NULL_HANDLE, tu_radix_sort_config); + if (!device->radix_sort) { + /* TODO plumb through the error here */ + mtx_unlock(&device->radix_sort_mutex); + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + } + mtx_unlock(&device->radix_sort_mutex); + } + + return VK_SUCCESS; +} + +struct tu_saved_compute_state { + uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; + struct tu_shader *compute_shader; +}; + +static void +tu_save_compute_state(struct tu_cmd_buffer *cmd, + struct tu_saved_compute_state *state) +{ + memcpy(state->push_constants, cmd->push_constants, sizeof(cmd->push_constants)); + state->compute_shader = cmd->state.shaders[MESA_SHADER_COMPUTE]; +} + +static void +tu_restore_compute_state(struct tu_cmd_buffer *cmd, + struct tu_saved_compute_state *state) +{ + 
cmd->state.shaders[MESA_SHADER_COMPUTE] = state->compute_shader; + if (state->compute_shader) { + tu_cs_emit_state_ib(&cmd->cs, state->compute_shader->state); + } + memcpy(cmd->push_constants, state->push_constants, sizeof(cmd->push_constants)); + cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS; +} + +VKAPI_ATTR void VKAPI_CALL +tu_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + struct tu_device *device = cmd->device; + struct tu_saved_compute_state state; + + VkResult result = init_radix_sort(device); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + tu_save_compute_state(cmd, &state); + + struct vk_acceleration_structure_build_args args = { + .subgroup_size = 128, + .bvh_bounds_offset = offsetof(tu_accel_struct_header, aabb), + .emit_markers = false, + .radix_sort = device->radix_sort, + }; + + vk_cmd_build_acceleration_structures(commandBuffer, + &device->vk, + &device->meta, + infoCount, + pInfos, + ppBuildRangeInfos, + &args); + + tu_restore_compute_state(cmd, &state); +} + +VKAPI_ATTR void VKAPI_CALL +tu_CmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer, const VkCopyAccelerationStructureInfoKHR *pInfo) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); + VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); + struct tu_saved_compute_state state; + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(cmd->device, "copy", copy_spv, sizeof(copy_spv), + sizeof(copy_args), &pipeline, &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + tu_save_compute_state(cmd, &state); + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + struct copy_args consts = { + .src_addr = vk_acceleration_structure_get_va(src), + .dst_addr = vk_acceleration_structure_get_va(dst), + .mode = TU_COPY_MODE_COPY, + }; + + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), + &consts); + + TU_CALLX(cmd->device, tu_CmdDispatchIndirect)( + commandBuffer, src->buffer, + src->offset + offsetof(struct tu_accel_struct_header, copy_dispatch_size)); + + tu_restore_compute_state(cmd, &state); +} + +VKAPI_ATTR void VKAPI_CALL +tu_CmdCopyMemoryToAccelerationStructureKHR(VkCommandBuffer commandBuffer, + const VkCopyMemoryToAccelerationStructureInfoKHR *pInfo) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); + struct tu_saved_compute_state state; + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(cmd->device, "copy", copy_spv, sizeof(copy_spv), + sizeof(copy_args), &pipeline, &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + tu_save_compute_state(cmd, &state); + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + const struct copy_args consts = { + .src_addr = pInfo->src.deviceAddress, + .dst_addr = vk_acceleration_structure_get_va(dst), + .mode = TU_COPY_MODE_DESERIALIZE, + }; + + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), + &consts); + + 
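+   /* copy.comp walks the data with a grid-stride loop (offset advances by
+    * lanes * 16 per iteration), so any workgroup count is functionally
+    * correct. A fixed count is used here because the real size is only
+    * available inside the app-provided serialized blob, which the CP cannot
+    * cheaply read for an indirect dispatch; 256 groups is presumably just a
+    * throughput compromise.
+    */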
vk_common_CmdDispatch(commandBuffer, 256, 1, 1); + + tu_restore_compute_state(cmd, &state); +} + +VKAPI_ATTR void VKAPI_CALL +tu_CmdCopyAccelerationStructureToMemoryKHR(VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureToMemoryInfoKHR *pInfo) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); + struct tu_saved_compute_state state; + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = + get_pipeline_spv(cmd->device, "copy", copy_spv, sizeof(copy_spv), + sizeof(copy_args), &pipeline, &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + tu_save_compute_state(cmd, &state); + + tu_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + const struct copy_args consts = { + .src_addr = vk_acceleration_structure_get_va(src), + .dst_addr = pInfo->dst.deviceAddress, + .mode = TU_COPY_MODE_SERIALIZE, + }; + + vk_common_CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), + &consts); + + TU_CALLX(cmd->device, tu_CmdDispatchIndirect)( + commandBuffer, src->buffer, + src->offset + offsetof(struct tu_accel_struct_header, copy_dispatch_size)); + + tu_restore_compute_state(cmd, &state); + + /* Set the header of the serialized data. */ + uint32_t header_data[2 * VK_UUID_SIZE / 4]; + memcpy(header_data, cmd->device->physical_device->driver_uuid, VK_UUID_SIZE); + memcpy(header_data + VK_UUID_SIZE / 4, cmd->device->physical_device->cache_uuid, VK_UUID_SIZE); + + struct tu_cs *cs = &cmd->cs; + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + ARRAY_SIZE(header_data)); + tu_cs_emit_qw(cs, pInfo->dst.deviceAddress); + tu_cs_emit_array(cs, header_data, ARRAY_SIZE(header_data)); +} + +VKAPI_ATTR void VKAPI_CALL +tu_GetAccelerationStructureBuildSizesKHR(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + const uint32_t *pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo) +{ + VK_FROM_HANDLE(tu_device, device, _device); + + init_radix_sort(device); + + struct vk_acceleration_structure_build_args args = { + .subgroup_size = 128, + .radix_sort = device->radix_sort, + }; + + vk_get_as_build_sizes(_device, buildType, pBuildInfo, pMaxPrimitiveCounts, + pSizeInfo, &args); +} + +VKAPI_ATTR void VKAPI_CALL +tu_GetDeviceAccelerationStructureCompatibilityKHR(VkDevice _device, + const VkAccelerationStructureVersionInfoKHR *pVersionInfo, + VkAccelerationStructureCompatibilityKHR *pCompatibility) +{ + VK_FROM_HANDLE(tu_device, device, _device); + bool compat = + memcmp(pVersionInfo->pVersionData, device->physical_device->driver_uuid, VK_UUID_SIZE) == 0 && + memcmp(pVersionInfo->pVersionData + VK_UUID_SIZE, device->physical_device->cache_uuid, VK_UUID_SIZE) == 0; + *pCompatibility = compat ? 
VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR + : VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR; +} + +VkResult +tu_init_null_accel_struct(struct tu_device *device) +{ + VkResult result = tu_bo_init_new(device, NULL, + &device->null_accel_struct_bo, + sizeof(tu_accel_struct_header) + + sizeof(tu_internal_node), + TU_BO_ALLOC_NO_FLAGS, "null AS"); + if (result != VK_SUCCESS) { + return result; + } + + result = tu_bo_map(device, device->null_accel_struct_bo, NULL); + if (result != VK_SUCCESS) { + tu_bo_finish(device, device->null_accel_struct_bo); + return result; + } + + struct tu_accel_struct_header header = { + .bvh_ptr = device->null_accel_struct_bo->iova + + sizeof(tu_accel_struct_header), + .self_ptr = device->null_accel_struct_bo->iova, + }; + + struct tu_internal_node node = { + .child_count = 0, + .type_flags = 0, + }; + + for (unsigned i = 0; i < 8; i++) { + node.mantissas[i][0][0] = 0xff; + node.mantissas[i][0][1] = 0xff; + node.mantissas[i][0][2] = 0xff; + } + + memcpy(device->null_accel_struct_bo->map, (void *)&header, sizeof(header)); + memcpy((char *)device->null_accel_struct_bo->map + sizeof(header), + (void *)&node, sizeof(node)); + return VK_SUCCESS; +} + +struct tu_node { + uint32_t data[16]; +}; + +static void +dump_leaf(struct tu_leaf_node *node) +{ + fprintf(stderr, "\tID: %d\n", node->id); + fprintf(stderr, "\tgeometry ID: %d\n", node->geometry_id); + bool aabb = node->type_flags & TU_NODE_TYPE_AABB; + for (unsigned i = 0; i < (aabb ? 2 : 3); i++) { + fprintf(stderr, "\t("); + for (unsigned j = 0; j < 3; j++) { + if (j != 0) + fprintf(stderr, ", "); + fprintf(stderr, "%f", node->coords[i][j]); + } + fprintf(stderr, ")\n"); + } +} + +static void +dump_internal(struct tu_internal_node *node, uint32_t *max_child) +{ + *max_child = MAX2(*max_child, node->id + node->child_count); + float base[3]; + unsigned exponents[3]; + for (unsigned i = 0; i < 3; i++) { + base[i] = uif(node->bases[i] << 16); + exponents[i] = node->exponents[i] - 134; + } + + for (unsigned i = 0; i < node->child_count; i++) { + fprintf(stderr, "\tchild %d\n", node->id + i); + for (unsigned vert = 0; vert < 2; vert++) { + fprintf(stderr, "\t\t("); + for (unsigned coord = 0; coord < 3; coord++) { + unsigned mantissa = node->mantissas[i][vert][coord]; + if (coord != 0) + fprintf(stderr, ", "); + fprintf(stderr, "%f", base[coord] + ldexp((float)mantissa, + exponents[coord])); + } + fprintf(stderr, ")\n"); + } + } +} + +static void +dump_as(struct vk_acceleration_structure *as) +{ + VK_FROM_HANDLE(tu_buffer, buf, as->buffer); + + struct tu_accel_struct_header *hdr = + (struct tu_accel_struct_header *)((char *)buf->bo->map + as->offset); + + fprintf(stderr, "dumping AS at %" PRIx64 "\n", buf->iova + as->offset); + u_hexdump(stderr, (uint8_t *)hdr, sizeof(*hdr), false); + + char *base = ((char *)buf->bo->map + (hdr->bvh_ptr - buf->iova)); + struct tu_node *node = (struct tu_node *)base; + + fprintf(stderr, "dumping nodes at %" PRIx64 "\n", hdr->bvh_ptr); + + uint32_t max_child = 1; + for (unsigned i = 0; i < max_child; i++) { + uint32_t *parent_ptr = (uint32_t*)(base - (4 + 4 * i)); + uint32_t parent = *parent_ptr; + fprintf(stderr, "node %d parent %d\n", i, parent); + u_hexdump(stderr, (uint8_t *)node, sizeof(*node), false); + if (node->data[15] & TU_NODE_TYPE_LEAF) { + /* TODO compressed leaves */ + dump_leaf((struct tu_leaf_node *)node); + } else { + dump_internal((struct tu_internal_node *)node, &max_child); + } + + node++; + } +} + +static bool +as_finished(struct tu_device *dev, struct 
+{
+   VK_FROM_HANDLE(tu_buffer, buf, as->buffer);
+   tu_bo_map(dev, buf->bo, NULL);
+
+   struct tu_accel_struct_header *hdr =
+      (struct tu_accel_struct_header *)((char *)buf->bo->map + as->offset);
+   return hdr->self_ptr == buf->iova + as->offset;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+tu_DestroyAccelerationStructureKHR(VkDevice _device,
+                                   VkAccelerationStructureKHR accelerationStructure,
+                                   const VkAllocationCallbacks *pAllocator)
+{
+   VK_FROM_HANDLE(tu_device, device, _device);
+   if (TU_DEBUG(DUMPAS)) {
+      VK_FROM_HANDLE(vk_acceleration_structure, as, accelerationStructure);
+      if (as_finished(device, as))
+         dump_as(as);
+   }
+
+   vk_common_DestroyAccelerationStructureKHR(_device, accelerationStructure,
+                                             pAllocator);
+}
diff --git a/src/freedreno/vulkan/tu_acceleration_structure.h b/src/freedreno/vulkan/tu_acceleration_structure.h
new file mode 100644
index 00000000000..08a88c61ddf
--- /dev/null
+++ b/src/freedreno/vulkan/tu_acceleration_structure.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright © 2024 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef TU_ACCELERATION_STRUCT_H
+#define TU_ACCELERATION_STRUCT_H
+
+#include "tu_common.h"
+
+VkResult tu_init_null_accel_struct(struct tu_device *device);
+
+extern const vk_acceleration_structure_build_ops tu_as_build_ops;
+
+#endif
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 001f6d2e068..d606486adcc 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -3958,13 +3958,26 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
                        VK_ACCESS_2_SHADER_READ_BIT |
                        VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
                        VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                       VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR,
+                       VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR |
+                       VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
                        VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
                        VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
                        VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
+                       VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
+                       VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
                        SHADER_STAGES))
       mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ;
 
+   /* Reading the AS for copying involves doing CmdDispatchIndirect with the
+    * copy size as a parameter, so it's read by the CP as well as a shader.
+    */
+   if (gfx_read_access(flags, stages,
+                       VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
+                       VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
+                       VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR))
+      mask |= TU_ACCESS_SYSMEM_READ;
+
    if (gfx_read_access(flags, stages,
                        VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT,
                        SHADER_STAGES))
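These vk2tu_access() additions (the read mapping above and the write mapping
in the next hunk) are what an ordinary synchronization2 barrier around an AS
build exercises. A minimal app-side sketch using core VK_KHR_synchronization2
types (illustrative, not part of the patch):

   /* Make an acceleration-structure build visible to a following
    * copy/compaction pass; pNext and unused fields zero-initialized. */
   VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR,
      .srcAccessMask = VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR,
      .dstStageMask = VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR,
      .dstAccessMask = VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   vkCmdPipelineBarrier2(cmd, &dep);

On the turnip side, the read half of this barrier resolves to
TU_ACCESS_SYSMEM_READ plus the UCHE/CCHE read bits, and the write half to
TU_ACCESS_UCHE_WRITE | TU_ACCESS_CP_WRITE.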
@@ -3985,6 +3998,11 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
                         SHADER_STAGES))
       mask |= TU_ACCESS_UCHE_WRITE;
 
+   if (gfx_write_access(flags, stages,
+                        VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR,
+                        VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR))
+      mask |= TU_ACCESS_UCHE_WRITE | TU_ACCESS_CP_WRITE;
+
    /* When using GMEM, the CCU is always flushed automatically to GMEM, and
     * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
     * previous writes in sysmem mode when transitioning to GMEM. Therefore we
diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc
index 80ffdf66627..e90a44e2662 100644
--- a/src/freedreno/vulkan/tu_descriptor_set.cc
+++ b/src/freedreno/vulkan/tu_descriptor_set.cc
@@ -26,6 +26,7 @@
 #include "util/mesa-sha1.h"
 #include "vk_descriptors.h"
 #include "vk_util.h"
+#include "vk_acceleration_structure.h"
 
 #include "tu_buffer.h"
 #include "tu_buffer_view.h"
@@ -33,6 +34,7 @@
 #include "tu_image.h"
 #include "tu_formats.h"
 #include "tu_rmv.h"
+#include "bvh/tu_build_interface.h"
 
 static inline uint8_t *
 pool_base(struct tu_descriptor_pool *pool)
@@ -67,6 +69,7 @@ descriptor_size(struct tu_device *dev,
                    COND(dev->physical_device->info->a7xx.storage_8bit, 1));
    case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
       return binding->descriptorCount;
+   case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
    default:
       return A6XX_TEX_CONST_DWORDS * 4;
    }
@@ -1156,6 +1159,27 @@ write_sampler_descriptor(uint32_t *dst, VkSampler _sampler)
    memcpy(dst, sampler->descriptor, sizeof(sampler->descriptor));
 }
 
+static void
+write_accel_struct(uint32_t *dst, uint64_t va, uint64_t size)
+{
+   dst[0] = A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | A6XX_TEX_CONST_0_FMT(FMT6_32_UINT);
+
+   /* The overall range of the entire AS may be more than the max range, but
+    * the SSBO is only used to access the instance descriptors and header.
+    * Make sure that we don't specify a too-large range.
+    */
+   dst[1] = MIN2(DIV_ROUND_UP(size, AS_RECORD_SIZE), MAX_TEXEL_ELEMENTS);
+   dst[2] =
+      A6XX_TEX_CONST_2_STRUCTSIZETEXELS(AS_RECORD_SIZE / 4) |
+      A6XX_TEX_CONST_2_STARTOFFSETTEXELS(0) |
+      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER);
+   dst[3] = 0;
+   dst[4] = A6XX_TEX_CONST_4_BASE_LO(va);
+   dst[5] = A6XX_TEX_CONST_5_BASE_HI(va >> 32);
+   for (int j = 6; j < A6XX_TEX_CONST_DWORDS; j++)
+      dst[j] = 0;
+}
+
 /* note: this is used with immutable samplers in push descriptors */
 static void
 write_sampler_push(uint32_t *dst, const struct tu_sampler *sampler)
@@ -1203,6 +1227,18 @@ tu_GetDescriptorEXT(
    case VK_DESCRIPTOR_TYPE_SAMPLER:
       write_sampler_descriptor(dest, *pDescriptorInfo->data.pSampler);
       break;
+   case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
+      if (pDescriptorInfo->data.accelerationStructure == 0) {
+         write_accel_struct(dest, device->null_accel_struct_bo->iova,
+                            device->null_accel_struct_bo->size);
+      } else {
+         VkDeviceSize size = *(VkDeviceSize *)
+            util_sparse_array_get(&device->accel_struct_ranges,
+                                  pDescriptorInfo->data.accelerationStructure);
+         write_accel_struct(dest, pDescriptorInfo->data.accelerationStructure, size);
+      }
+      break;
+   }
    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
       write_image_descriptor(dest, VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT,
                              pDescriptorInfo->data.pInputAttachmentImage);
@@ -1236,6 +1272,8 @@ tu_update_descriptor_sets(const struct tu_device *device,
          ptr += binding_layout->offset / 4;
       }
 
+      const VkWriteDescriptorSetAccelerationStructureKHR *accel_structs = NULL;
+
       /* for immutable samplers with push descriptors: */
       const bool copy_immutable_samplers =
         dstSetOverride && binding_layout->immutable_samplers_offset;
@@ -1279,6 +1317,9 @@ tu_update_descriptor_sets(const struct tu_device *device,
          } while (remaining > 0);
 
         continue;
+      } else if (writeset->descriptorType ==
+                 VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR) {
+         accel_structs = vk_find_struct_const(writeset->pNext, WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR);
      }
 
      ptr += binding_layout->size / 4 * writeset->dstArrayElement;
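For a concrete feel for the range clamp in write_accel_struct() above, here
is the same computation as a stand-alone sketch; the real AS_RECORD_SIZE and
MAX_TEXEL_ELEMENTS come from the driver's headers, so the values below are
placeholders, not the real ones:

   #include <stdint.h>

   #define AS_RECORD_SIZE     64u               /* placeholder value */
   #define MAX_TEXEL_ELEMENTS ((1u << 23) - 1)  /* placeholder value */

   /* Record count the descriptor advertises for an AS of 'size' bytes:
    * round up to whole records, then clamp to the HW texel-count limit.
    * Only the header and instance descriptors at the start of the AS are
    * ever read through this SSBO, so the clamp is safe. */
   static uint32_t
   accel_struct_record_count(uint64_t size)
   {
      uint64_t records = (size + AS_RECORD_SIZE - 1) / AS_RECORD_SIZE;
      return records < MAX_TEXEL_ELEMENTS ? (uint32_t)records
                                          : MAX_TEXEL_ELEMENTS;
   }

@@ -1316,6 +1357,18 @@ tu_update_descriptor_sets(const struct tu_device *device,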
          else if (copy_immutable_samplers)
             write_sampler_push(ptr, &samplers[writeset->dstArrayElement + j]);
          break;
+      case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
+         VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, accel_structs->pAccelerationStructures[j]);
+         if (accel_struct) {
+            write_accel_struct(ptr,
+                               vk_acceleration_structure_get_va(accel_struct),
+                               accel_struct->size);
+         } else {
+            write_accel_struct(ptr, device->null_accel_struct_bo->iova,
+                               device->null_accel_struct_bo->size);
+         }
+         break;
+      }
       default:
          unreachable("unimplemented descriptor type");
          break;
@@ -1646,6 +1699,18 @@ tu_update_descriptor_set_with_template(
          else if (samplers)
            write_sampler_push(ptr, &samplers[j]);
          break;
+      case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
+         VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, *(const VkAccelerationStructureKHR *)src);
+         if (accel_struct) {
+            write_accel_struct(ptr,
+                               vk_acceleration_structure_get_va(accel_struct),
+                               accel_struct->size);
+         } else {
+            write_accel_struct(ptr, device->null_accel_struct_bo->iova,
+                               device->null_accel_struct_bo->size);
+         }
+         break;
+      }
       default:
          unreachable("unimplemented descriptor type");
          break;
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index df94a96a870..10bfa99b5d0 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -30,6 +30,7 @@
 #include "freedreno/common/freedreno_uuid.h"
 #include "freedreno/common/freedreno_stompable_regs.h"
 
+#include "tu_acceleration_structure.h"
 #include "tu_clear_blit.h"
 #include "tu_cmd_buffer.h"
 #include "tu_cs.h"
@@ -143,9 +144,19 @@ static void
 get_device_extensions(const struct tu_physical_device *device,
                       struct vk_device_extension_table *ext)
 {
+   /* device->has_raytracing contains the value of the SW fuse. If the
+    * device doesn't have a fuse (e.g. a740), we have to ignore it because
+    * kgsl returns false. If it does have a fuse, enable raytracing if the
+    * fuse is set and we have ray_intersection.
+    */
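In table form, the expression just below enables the extension as follows
(read directly off the two conditions):

   has_ray_intersection   SW fuse present   fuse set   KHR_acceleration_structure
   no                     -                 -          off
   yes                    no (a740)         ignored    on
   yes                    yes               yes        on
   yes                    yes               no         off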
+   bool has_raytracing =
+      device->info->a7xx.has_ray_intersection &&
+      (!device->info->a7xx.has_sw_fuse || device->has_raytracing);
+
    *ext = (struct vk_device_extension_table) { .table = {
       .KHR_8bit_storage = device->info->a7xx.storage_8bit,
       .KHR_16bit_storage = device->info->a6xx.storage_16bit,
+      .KHR_acceleration_structure = has_raytracing,
       .KHR_bind_memory2 = true,
       .KHR_buffer_device_address = true,
       .KHR_calibrated_timestamps = device->info->a7xx.has_persistent_counter,
@@ -153,6 +164,7 @@ get_device_extensions(const struct tu_physical_device *device,
       .KHR_copy_commands2 = true,
       .KHR_create_renderpass2 = true,
       .KHR_dedicated_allocation = true,
+      .KHR_deferred_host_operations = true,
       .KHR_depth_stencil_resolve = true,
       .KHR_descriptor_update_template = true,
       .KHR_device_group = true,
@@ -466,6 +478,11 @@ tu_get_features(struct tu_physical_device *pdevice,
    /* Vulkan 1.4 */
    features->pushDescriptor = true;
 
+   /* VK_KHR_acceleration_structure */
+   features->accelerationStructure = true;
+   features->accelerationStructureCaptureReplay = pdevice->has_set_iova;
+   features->descriptorBindingAccelerationStructureUpdateAfterBind = true;
+
    /* VK_KHR_compute_shader_derivatives */
    features->computeDerivativeGroupQuads = pdevice->info->chip >= 7;
    features->computeDerivativeGroupLinear = pdevice->info->chip >= 7;
@@ -1219,6 +1236,7 @@ tu_get_properties(struct tu_physical_device *pdevice,
               COND(pdevice->info->a7xx.storage_8bit, 1));
       props->robustStorageBufferDescriptorSize =
          props->storageBufferDescriptorSize;
+      props->accelerationStructureDescriptorSize = 4 * A6XX_TEX_CONST_DWORDS;
       props->inputAttachmentDescriptorSize = A6XX_TEX_CONST_DWORDS * 4;
       props->maxSamplerDescriptorBufferRange = ~0ull;
       props->maxResourceDescriptorBufferRange = ~0ull;
@@ -1304,6 +1322,16 @@
       memcpy(props->optimalTilingLayoutUUID, sha1, VK_UUID_SIZE);
    }
 
+
+   /* VK_KHR_acceleration_structure */
+   props->maxGeometryCount = (1 << 24) - 1;
+   props->maxInstanceCount = (1 << 24) - 1;
+   props->maxPrimitiveCount = (1 << 29) - 1;
+   props->maxPerStageDescriptorAccelerationStructures = max_descriptor_set_size;
+   props->maxPerStageDescriptorUpdateAfterBindAccelerationStructures = max_descriptor_set_size;
+   props->maxDescriptorSetAccelerationStructures = max_descriptor_set_size;
+   props->maxDescriptorSetUpdateAfterBindAccelerationStructures = max_descriptor_set_size;
+   props->minAccelerationStructureScratchOffsetAlignment = 128;
 }
 
 static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {
@@ -2416,6 +2444,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    }
 
    device->vk.command_buffer_ops = &tu_cmd_buffer_ops;
+   device->vk.as_build_ops = &tu_as_build_ops;
    device->vk.check_status = tu_device_check_status;
    device->vk.get_timestamp = tu_device_get_timestamp;
 
@@ -2479,6 +2508,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
       goto fail_queues;
 
+   util_sparse_array_init(&device->accel_struct_ranges, sizeof(VkDeviceSize), 256);
+
+   mtx_init(&device->radix_sort_mutex, mtx_plain);
+
    {
       struct ir3_compiler_options ir3_options = {
          .push_ubo_with_preamble = true,
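Two pieces of device state for the AS path are created here.
accel_struct_ranges backs the descriptor-buffer case seen earlier:
tu_GetDescriptorEXT only receives a raw AS device address, so the driver
records each structure's size in a sparse array keyed by that address (one
VkDeviceSize per element, allocated in blocks of 256). radix_sort_mutex
guards the radix_sort instance added to struct tu_device further down, which
is presumably created lazily by the BVH build path under this lock.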
@@ -2555,6 +2588,15 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    device->global_bo_map = global;
 
    tu_init_clear_blit_shaders(device);
+   if (device->vk.enabled_features.accelerationStructure &&
+       device->vk.enabled_features.nullDescriptor) {
+      result = tu_init_null_accel_struct(device);
+      if (result != VK_SUCCESS) {
+         vk_startup_errorf(device->instance, result, "null acceleration struct");
+         goto fail_null_accel_struct;
+      }
+   }
+
    result = tu_init_empty_shaders(device);
    if (result != VK_SUCCESS) {
       vk_startup_errorf(device->instance, result, "empty shaders");
@@ -2738,6 +2780,9 @@ fail_pipeline_cache:
 fail_dynamic_rendering:
    tu_destroy_empty_shaders(device);
 fail_empty_shaders:
+   if (device->null_accel_struct_bo)
+      tu_bo_finish(device, device->null_accel_struct_bo);
+fail_null_accel_struct:
    tu_destroy_clear_blit_shaders(device);
 fail_global_bo_map:
    TU_RMV(resource_destroy, device, device->global_bo);
@@ -2752,6 +2797,7 @@ fail_free_zombie_vma:
    u_vector_finish(&device->zombie_vmas);
    ir3_compiler_destroy(device->compiler);
 fail_compiler:
+   util_sparse_array_finish(&device->accel_struct_ranges);
    vk_meta_device_finish(&device->vk, &device->meta);
 fail_queues:
    for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
@@ -2804,6 +2850,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    vk_meta_device_finish(&device->vk, &device->meta);
 
+   util_sparse_array_finish(&device->accel_struct_ranges);
+
    ir3_compiler_destroy(device->compiler);
 
    vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc);
@@ -2832,6 +2880,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    tu_bo_finish(device, device->global_bo);
 
+   if (device->null_accel_struct_bo)
+      tu_bo_finish(device, device->null_accel_struct_bo);
+
    for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
       for (unsigned q = 0; q < device->queue_count[i]; q++)
          tu_queue_finish(&device->queues[i][q]);
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index d736080727a..00a9ce9d339 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -22,6 +22,8 @@
 #include "tu_suballoc.h"
 #include "tu_util.h"
 
+#include "radix_sort/radix_sort_vk.h"
+
 #include "common/freedreno_rd_output.h"
 #include "util/vma.h"
 #include "util/u_vector.h"
@@ -290,6 +292,11 @@ struct tu_device
 
    struct vk_meta_device meta;
 
+   radix_sort_vk_t *radix_sort;
+   mtx_t radix_sort_mutex;
+
+   struct util_sparse_array accel_struct_ranges;
+
 #define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
 
    /* Currently the kernel driver uses a 32-bit GPU address space, but it
@@ -306,6 +313,8 @@ struct tu_device
    struct tu_bo *global_bo;
    struct tu6_global *global_bo_map;
 
+   struct tu_bo *null_accel_struct_bo;
+
    uint32_t implicit_sync_bo_count;
 
    /* Device-global BO suballocator for reducing BO management overhead for
diff --git a/src/freedreno/vulkan/tu_formats.cc b/src/freedreno/vulkan/tu_formats.cc
index fb5d8c15bd4..cb134a979c6 100644
--- a/src/freedreno/vulkan/tu_formats.cc
+++ b/src/freedreno/vulkan/tu_formats.cc
@@ -12,6 +12,7 @@
 #include "vk_android.h"
 #include "vk_enum_defines.h"
 #include "vk_util.h"
+#include "vk_acceleration_structure.h"
 
 #include "drm-uapi/drm_fourcc.h"
 
 #include "tu_android.h"
@@ -270,6 +271,9 @@ tu_physical_device_get_format_properties(
    if (vk_format == VK_FORMAT_R8_UINT)
       optimal |= VK_FORMAT_FEATURE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
 
+   if (vk_acceleration_struct_vtx_format_supported(vk_format))
+      buffer |= VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
+
 end:
    out_properties->linearTilingFeatures = linear;
    out_properties->optimalTilingFeatures = optimal;
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index 14553e198ac..213024dc411 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -79,6 +79,7 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
    case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
    case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+   case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
       /* IBO-backed resources only need one packet for all graphics stages */
       if (stage_count)
          count += 1;
@@ -174,7 +175,8 @@ tu6_emit_load_state(struct tu_device *device,
          FALLTHROUGH;
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
       case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
-      case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
+      case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+      case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
         unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
         /* IBO-backed resources only need one packet for all graphics stages */
         if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc
index 824e4cd3635..ff9463bc136 100644
--- a/src/freedreno/vulkan/tu_query_pool.cc
+++ b/src/freedreno/vulkan/tu_query_pool.cc
@@ -13,9 +13,11 @@
 #include "nir/nir_builder.h"
 #include "util/os_time.h"
 
+#include "vk_acceleration_structure.h"
 #include "vk_util.h"
 
 #include "tu_buffer.h"
+#include "bvh/tu_build_interface.h"
 #include "tu_cmd_buffer.h"
 #include "tu_cs.h"
 #include "tu_device.h"
@@ -92,6 +94,11 @@ struct PACKED primitives_generated_query_slot {
    uint64_t end;
 };
 
+struct PACKED accel_struct_slot {
+   struct query_slot common;
+   uint64_t result;
+};
+
 /* Returns the IOVA or mapped address of a given uint64_t field
  * in a given slot of a query pool. */
 #define query_iova(type, pool, query, field)                         \
@@ -251,6 +258,12 @@ tu_CreateQueryPool(VkDevice _device,
       pool_size += sizeof(struct tu_perf_query_data) *
                    perf_query_info->counterIndexCount;
       break;
    }
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+      slot_size = sizeof(struct accel_struct_slot);
+      break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
       slot_size = sizeof(struct pipeline_stat_query_slot);
@@ -366,6 +379,10 @@ get_result_count(struct tu_query_pool *pool)
    case VK_QUERY_TYPE_OCCLUSION:
    case VK_QUERY_TYPE_TIMESTAMP:
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
       return 1;
    /* Transform feedback queries write two integer values */
    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
@@ -586,6 +603,10 @@ tu_GetQueryPoolResults(VkDevice _device,
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
       return get_query_pool_results(device, pool, firstQuery, queryCount,
                                     dataSize, pData, stride, flags);
    default:
@@ -735,6 +756,10 @@ tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
       return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                           queryCount, buffer, dstOffset,
                                           stride, flags);
@@ -802,6 +827,10 @@ tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
       emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
       break;
    default:
@@ -1710,6 +1739,56 @@ tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
    handle_multiview_queries(cmd, pool, query);
 }
 
+VKAPI_ATTR void VKAPI_CALL
+tu_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer,
+                                               uint32_t accelerationStructureCount,
+                                               const VkAccelerationStructureKHR *pAccelerationStructures,
+                                               VkQueryType queryType,
+                                               VkQueryPool queryPool,
+                                               uint32_t firstQuery)
+{
+   VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+   VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
+
+   struct tu_cs *cs = &cmd->cs;
+
+   /* Flush any AS builds */
+   tu_emit_cache_flush(cmd);
+
+   for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
+      uint32_t query = i + firstQuery;
+
+      VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pAccelerationStructures[i]);
+      uint64_t va = vk_acceleration_structure_get_va(accel_struct);
+
+      switch (queryType) {
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+         va += offsetof(struct tu_accel_struct_header, compacted_size);
+         break;
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+         va += offsetof(struct tu_accel_struct_header, serialization_size);
+         break;
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+         va += offsetof(struct tu_accel_struct_header, instance_count);
+         break;
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+         va += offsetof(struct tu_accel_struct_header, size);
+         break;
+      default:
+         unreachable("Unhandled accel struct query type.");
+      }
+
+      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
+      tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE);
+      tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
+      tu_cs_emit_qw(cs, va);
+
+      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
+      tu_cs_emit_qw(cs, query_available_iova(pool, query));
+      tu_cs_emit_qw(cs, 0x1);
+   }
+}
+
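Driver-side, each query is thus one CP_MEM_TO_MEM copying a 64-bit header
field into the slot's result, followed by marking the slot available. The
app-side flow this serves looks like the following sketch (core
VK_KHR_acceleration_structure API; error handling omitted, and the AS must
have been built with ALLOW_COMPACTION for the compacted-size query):

   VkQueryPoolCreateInfo pool_info = {
      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
      .queryType = VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR,
      .queryCount = 1,
   };
   VkQueryPool query_pool;
   vkCreateQueryPool(device, &pool_info, NULL, &query_pool);

   /* In a command buffer, after the AS build has been made visible: */
   vkCmdResetQueryPool(cmd, query_pool, 0, 1);
   vkCmdWriteAccelerationStructuresPropertiesKHR(
      cmd, 1, &as, VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR,
      query_pool, 0);

   /* After submitting and waiting: */
   VkDeviceSize compacted_size;
   vkGetQueryPoolResults(device, query_pool, 0, 1, sizeof(compacted_size),
                         &compacted_size, sizeof(compacted_size),
                         VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);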
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
    VkPhysicalDevice physicalDevice,
diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc
index b96de287990..aa405f61edd 100644
--- a/src/freedreno/vulkan/tu_util.cc
+++ b/src/freedreno/vulkan/tu_util.cc
@@ -45,6 +45,7 @@ static const struct debug_control tu_debug_options[] = {
    { "hiprio", TU_DEBUG_HIPRIO },
    { "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES },
    { "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
+   { "dumpas", TU_DEBUG_DUMPAS },
    { NULL, 0 }
 };
 
diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h
index 0910acc183e..3bd037ac1f9 100644
--- a/src/freedreno/vulkan/tu_util.h
+++ b/src/freedreno/vulkan/tu_util.h
@@ -51,6 +51,7 @@ enum tu_debug_flags
    TU_DEBUG_HIPRIO = 1 << 26,
    TU_DEBUG_NO_CONCURRENT_RESOLVES = 1 << 27,
    TU_DEBUG_NO_CONCURRENT_UNRESOLVES = 1 << 28,
+   TU_DEBUG_DUMPAS = 1 << 29,
 };
 
 struct tu_env {
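A note on exercising the new flag: tu_debug_options is parsed from the
TU_DEBUG environment variable (turnip's usual debug_control path), so,
assuming the standard comma-separated TU_DEBUG parsing, a finished
acceleration structure gets dumped at destruction time with e.g.:

   TU_DEBUG=dumpas <vulkan-application>

dump_as() then prints the AS header followed by each BVH node (a hexdump plus
the decoded bounds), which is what the dump helpers earlier in this patch
implement.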