mirror of https://gitlab.freedesktop.org/mesa/mesa.git
vulkan/runtime,radv: Add shared BVH building framework
This is mostly adapted from radv's BVH building. It defines a common "IR" for
BVH trees, two algorithms for constructing it, and a callback that the driver
implements for encoding. The framework takes care of parallelizing the
different passes, so the driver just has to split the encoding process into
"stages" and implement one part for each stage.

The runtime changes are:
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>

The radv changes are:
Reviewed-by: Friedrich Vock <friedrich.vock@gmx.de>

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31433>
This commit is contained in:
parent d19af11e49
commit f8b584d6a5

59 changed files with 4500 additions and 3530 deletions
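Where the message says the driver "just has to split the encoding process into stages", the contract looks roughly like the following C sketch. Every name here is a hypothetical stand-in for illustration; the real types and entry points live under src/vulkan/runtime and are not reproduced in this view.

#include <stdint.h>

typedef struct {
   uint64_t intermediate_bvh; /* common IR written by the shared builder */
   uint64_t output_bvh;       /* driver-specific layout */
} encode_ctx;

/* One callback per encoding stage; the framework dispatches every stage
 * over all nodes and synchronizes between stages. */
typedef void (*encode_stage_fn)(encode_ctx *ctx, uint32_t node_index);

static void run_encode(encode_ctx *ctx, const encode_stage_fn *stages,
                       uint32_t stage_count, uint32_t node_count)
{
   for (uint32_t s = 0; s < stage_count; s++)
      for (uint32_t n = 0; n < node_count; n++) /* parallel on the GPU */
         stages[s](ctx, n);
}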
@@ -8,210 +8,7 @@
#define BVH_BUILD_HELPERS_H

#include "bvh.h"

#define VK_FORMAT_UNDEFINED 0
#define VK_FORMAT_R4G4_UNORM_PACK8 1
#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2
#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3
#define VK_FORMAT_R5G6B5_UNORM_PACK16 4
#define VK_FORMAT_B5G6R5_UNORM_PACK16 5
#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6
#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7
#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8
#define VK_FORMAT_R8_UNORM 9
#define VK_FORMAT_R8_SNORM 10
#define VK_FORMAT_R8_USCALED 11
#define VK_FORMAT_R8_SSCALED 12
#define VK_FORMAT_R8_UINT 13
#define VK_FORMAT_R8_SINT 14
#define VK_FORMAT_R8_SRGB 15
#define VK_FORMAT_R8G8_UNORM 16
#define VK_FORMAT_R8G8_SNORM 17
#define VK_FORMAT_R8G8_USCALED 18
#define VK_FORMAT_R8G8_SSCALED 19
#define VK_FORMAT_R8G8_UINT 20
#define VK_FORMAT_R8G8_SINT 21
#define VK_FORMAT_R8G8_SRGB 22
#define VK_FORMAT_R8G8B8_UNORM 23
#define VK_FORMAT_R8G8B8_SNORM 24
#define VK_FORMAT_R8G8B8_USCALED 25
#define VK_FORMAT_R8G8B8_SSCALED 26
#define VK_FORMAT_R8G8B8_UINT 27
#define VK_FORMAT_R8G8B8_SINT 28
#define VK_FORMAT_R8G8B8_SRGB 29
#define VK_FORMAT_B8G8R8_UNORM 30
#define VK_FORMAT_B8G8R8_SNORM 31
#define VK_FORMAT_B8G8R8_USCALED 32
#define VK_FORMAT_B8G8R8_SSCALED 33
#define VK_FORMAT_B8G8R8_UINT 34
#define VK_FORMAT_B8G8R8_SINT 35
#define VK_FORMAT_B8G8R8_SRGB 36
#define VK_FORMAT_R8G8B8A8_UNORM 37
#define VK_FORMAT_R8G8B8A8_SNORM 38
#define VK_FORMAT_R8G8B8A8_USCALED 39
#define VK_FORMAT_R8G8B8A8_SSCALED 40
#define VK_FORMAT_R8G8B8A8_UINT 41
#define VK_FORMAT_R8G8B8A8_SINT 42
#define VK_FORMAT_R8G8B8A8_SRGB 43
#define VK_FORMAT_B8G8R8A8_UNORM 44
#define VK_FORMAT_B8G8R8A8_SNORM 45
#define VK_FORMAT_B8G8R8A8_USCALED 46
#define VK_FORMAT_B8G8R8A8_SSCALED 47
#define VK_FORMAT_B8G8R8A8_UINT 48
#define VK_FORMAT_B8G8R8A8_SINT 49
#define VK_FORMAT_B8G8R8A8_SRGB 50
#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51
#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52
#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53
#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54
#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55
#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56
#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57
#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58
#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59
#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60
#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61
#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62
#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63
#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64
#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65
#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66
#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67
#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68
#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69
#define VK_FORMAT_R16_UNORM 70
#define VK_FORMAT_R16_SNORM 71
#define VK_FORMAT_R16_USCALED 72
#define VK_FORMAT_R16_SSCALED 73
#define VK_FORMAT_R16_UINT 74
#define VK_FORMAT_R16_SINT 75
#define VK_FORMAT_R16_SFLOAT 76
#define VK_FORMAT_R16G16_UNORM 77
#define VK_FORMAT_R16G16_SNORM 78
#define VK_FORMAT_R16G16_USCALED 79
#define VK_FORMAT_R16G16_SSCALED 80
#define VK_FORMAT_R16G16_UINT 81
#define VK_FORMAT_R16G16_SINT 82
#define VK_FORMAT_R16G16_SFLOAT 83
#define VK_FORMAT_R16G16B16_UNORM 84
#define VK_FORMAT_R16G16B16_SNORM 85
#define VK_FORMAT_R16G16B16_USCALED 86
#define VK_FORMAT_R16G16B16_SSCALED 87
#define VK_FORMAT_R16G16B16_UINT 88
#define VK_FORMAT_R16G16B16_SINT 89
#define VK_FORMAT_R16G16B16_SFLOAT 90
#define VK_FORMAT_R16G16B16A16_UNORM 91
#define VK_FORMAT_R16G16B16A16_SNORM 92
#define VK_FORMAT_R16G16B16A16_USCALED 93
#define VK_FORMAT_R16G16B16A16_SSCALED 94
#define VK_FORMAT_R16G16B16A16_UINT 95
#define VK_FORMAT_R16G16B16A16_SINT 96
#define VK_FORMAT_R16G16B16A16_SFLOAT 97
#define VK_FORMAT_R32_UINT 98
#define VK_FORMAT_R32_SINT 99
#define VK_FORMAT_R32_SFLOAT 100
#define VK_FORMAT_R32G32_UINT 101
#define VK_FORMAT_R32G32_SINT 102
#define VK_FORMAT_R32G32_SFLOAT 103
#define VK_FORMAT_R32G32B32_UINT 104
#define VK_FORMAT_R32G32B32_SINT 105
#define VK_FORMAT_R32G32B32_SFLOAT 106
#define VK_FORMAT_R32G32B32A32_UINT 107
#define VK_FORMAT_R32G32B32A32_SINT 108
#define VK_FORMAT_R32G32B32A32_SFLOAT 109
#define VK_FORMAT_R64_UINT 110
#define VK_FORMAT_R64_SINT 111
#define VK_FORMAT_R64_SFLOAT 112
#define VK_FORMAT_R64G64_UINT 113
#define VK_FORMAT_R64G64_SINT 114
#define VK_FORMAT_R64G64_SFLOAT 115
#define VK_FORMAT_R64G64B64_UINT 116
#define VK_FORMAT_R64G64B64_SINT 117
#define VK_FORMAT_R64G64B64_SFLOAT 118
#define VK_FORMAT_R64G64B64A64_UINT 119
#define VK_FORMAT_R64G64B64A64_SINT 120
#define VK_FORMAT_R64G64B64A64_SFLOAT 121

#define VK_INDEX_TYPE_UINT16 0
#define VK_INDEX_TYPE_UINT32 1
#define VK_INDEX_TYPE_NONE_KHR 1000165000
#define VK_INDEX_TYPE_UINT8_EXT 1000265000

#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0
#define VK_GEOMETRY_TYPE_AABBS_KHR 1
#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2

#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1
#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2
#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4
#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8

#define TYPE(type, align) \
   layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref \
   { \
      type value; \
   };

#define REF(type) type##_ref
#define VOID_REF uint64_t
#define NULL 0
#define DEREF(var) var.value

#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1))

#define OFFSET(ptr, offset) (uint64_t(ptr) + offset)

#define INFINITY (1.0 / 0.0)
#define NAN (0.0 / 0.0)

#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))

TYPE(int8_t, 1);
TYPE(uint8_t, 1);
TYPE(int16_t, 2);
TYPE(uint16_t, 2);
TYPE(int32_t, 4);
TYPE(uint32_t, 4);
TYPE(int64_t, 8);
TYPE(uint64_t, 8);

TYPE(float, 4);

TYPE(vec2, 4);
TYPE(vec3, 4);
TYPE(vec4, 4);

TYPE(uvec4, 16);

TYPE(VOID_REF, 8);
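The TYPE/REF/DEREF/OFFSET/INDEX macros above give GLSL a raw-pointer vocabulary on top of buffer_reference types; SIZEOF even derives a type's size via pointer arithmetic (a reference to address 0, incremented once). A plain-C analogue, as a sketch of what the macros emulate:

#include <stdint.h>

/* C already has pointers, so the GLSL macros collapse to ordinary
 * pointer arithmetic. */
#define C_REF(type)           type *
#define C_DEREF(var)          (*(var))
#define C_OFFSET(ptr, off)    ((uintptr_t)(ptr) + (off))
#define C_INDEX(type, ptr, i) ((type *)C_OFFSET(ptr, (i) * sizeof(type)))

static uint32_t read_third(uint32_t *buf)
{
   return C_DEREF(C_INDEX(uint32_t, buf, 2)); /* same as buf[2] */
}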
/* copied from u_math.h */
uint32_t
align(uint32_t value, uint32_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}
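A quick worked check of align() in C; the mask trick assumes alignment is a power of two:

#include <assert.h>
#include <stdint.h>

static uint32_t align_u32(uint32_t value, uint32_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}

int main(void)
{
   assert(align_u32(13, 8) == 16); /* rounds up to the next multiple */
   assert(align_u32(16, 8) == 16); /* already aligned: unchanged */
   return 0;
}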
int32_t
to_emulated_float(float f)
{
   int32_t bits = floatBitsToInt(f);
   return f < 0 ? -2147483648 - bits : bits;
}

float
from_emulated_float(int32_t bits)
{
   return intBitsToFloat(bits < 0 ? -2147483648 - bits : bits);
}
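to_emulated_float() remaps IEEE-754 bit patterns so that signed-integer comparison agrees with float comparison; this is what later lets the leaf shader merge bounds with atomicMin/atomicMax on int32 fields. A C check of the ordering property:

#include <assert.h>
#include <stdint.h>
#include <string.h>

static int32_t to_emulated_float(float f)
{
   int32_t bits;
   memcpy(&bits, &f, sizeof(bits)); /* C equivalent of floatBitsToInt */
   /* negative floats are sign-magnitude, so their int order is reversed;
    * INT32_MIN - bits flips them into a consistent total order */
   return f < 0 ? INT32_MIN - bits : bits;
}

int main(void)
{
   assert(to_emulated_float(-2.0f) < to_emulated_float(-1.0f));
   assert(to_emulated_float(-1.0f) < to_emulated_float(0.0f));
   assert(to_emulated_float(0.5f)  < to_emulated_float(2.0f));
   return 0;
}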
TYPE(radv_aabb, 4);

struct key_id_pair {
   uint32_t id;
   uint32_t key;
};
TYPE(key_id_pair, 4);
#include "vk_build_helpers.h"

TYPE(radv_accel_struct_serialization_header, 8);
TYPE(radv_accel_struct_header, 8);
@@ -221,12 +18,6 @@ TYPE(radv_bvh_instance_node, 8);
TYPE(radv_bvh_box16_node, 4);
TYPE(radv_bvh_box32_node, 4);

TYPE(radv_ir_header, 4);
TYPE(radv_ir_node, 4);
TYPE(radv_ir_box_node, 4);

TYPE(radv_global_sync_data, 4);

uint32_t
id_to_offset(uint32_t id)
{
@@ -259,178 +50,23 @@ addr_to_node(uint64_t addr)
   return (addr >> 3) & ((1ul << 45) - 1);
}

uint32_t
ir_id_to_offset(uint32_t id)
{
   return id & (~3u);
}

uint32_t
ir_id_to_type(uint32_t id)
{
   return id & 3u;
}

uint32_t
pack_ir_node_id(uint32_t offset, uint32_t type)
{
   return offset | type;
}
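pack_ir_node_id() relies on IR nodes being at least 4-byte aligned, so the low two bits of the byte offset are free to carry the 2-bit node type. A C round-trip check of the same bit layout:

#include <assert.h>
#include <stdint.h>

static uint32_t pack_ir_node_id(uint32_t offset, uint32_t type) { return offset | type; }
static uint32_t ir_id_to_offset(uint32_t id) { return id & ~3u; }
static uint32_t ir_id_to_type(uint32_t id)   { return id & 3u; }

int main(void)
{
   uint32_t id = pack_ir_node_id(128, 2); /* 2 = instance in the IR encoding */
   assert(ir_id_to_offset(id) == 128);
   assert(ir_id_to_type(id) == 2);
   return 0;
}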
uint32_t
ir_type_to_bvh_type(uint32_t type)
{
   switch (type) {
   case radv_ir_node_triangle:
   case vk_ir_node_triangle:
      return radv_bvh_node_triangle;
   case radv_ir_node_internal:
   case vk_ir_node_internal:
      return radv_bvh_node_box32;
   case radv_ir_node_instance:
   case vk_ir_node_instance:
      return radv_bvh_node_instance;
   case radv_ir_node_aabb:
   case vk_ir_node_aabb:
      return radv_bvh_node_aabb;
   }
   /* unreachable in valid nodes */
   return RADV_BVH_INVALID_NODE;
}

float
aabb_surface_area(radv_aabb aabb)
{
   vec3 diagonal = aabb.max - aabb.min;
   return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z;
}

/* Just a wrapper for 3 uints. */
struct triangle_indices {
   uint32_t index[3];
};

triangle_indices
load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id)
{
   triangle_indices result;

   uint32_t index_base = global_id * 3;

   switch (index_format) {
   case VK_INDEX_TYPE_UINT16: {
      result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2));
      break;
   }
   case VK_INDEX_TYPE_UINT32: {
      result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2));
      break;
   }
   case VK_INDEX_TYPE_NONE_KHR: {
      result.index[0] = index_base + 0;
      result.index[1] = index_base + 1;
      result.index[2] = index_base + 2;
      break;
   }
   case VK_INDEX_TYPE_UINT8_EXT: {
      result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2));
      break;
   }
   }

   return result;
}

/* Just a wrapper for 3 vec4s. */
struct triangle_vertices {
   vec4 vertex[3];
};

TYPE(float16_t, 2);

triangle_vertices
load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride)
{
   triangle_vertices result;

   for (uint32_t i = 0; i < 3; i++) {
      VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride);
      vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0);

      switch (vertex_format) {
      case VK_FORMAT_R32G32_SFLOAT:
         vertex.x = DEREF(INDEX(float, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float, vertex_ptr, 1));
         break;
      case VK_FORMAT_R32G32B32_SFLOAT:
      case VK_FORMAT_R32G32B32A32_SFLOAT:
         vertex.x = DEREF(INDEX(float, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float, vertex_ptr, 1));
         vertex.z = DEREF(INDEX(float, vertex_ptr, 2));
         break;
      case VK_FORMAT_R16G16_SFLOAT:
         vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1));
         break;
      case VK_FORMAT_R16G16B16_SFLOAT:
      case VK_FORMAT_R16G16B16A16_SFLOAT:
         vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1));
         vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2));
         break;
      case VK_FORMAT_R16G16_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF));
         vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF));
         break;
      case VK_FORMAT_R16G16B16A16_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF));
         vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF));
         vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF));
         break;
      case VK_FORMAT_R8G8_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F));
         vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F));
         break;
      case VK_FORMAT_R8G8B8A8_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F));
         vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F));
         vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F));
         break;
      case VK_FORMAT_R16G16_UNORM:
         vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF);
         vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF);
         break;
      case VK_FORMAT_R16G16B16A16_UNORM:
         vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF);
         vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF);
         vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF);
         break;
      case VK_FORMAT_R8G8_UNORM:
         vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF);
         vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF);
         break;
      case VK_FORMAT_R8G8B8A8_UNORM:
         vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF);
         vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF);
         vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF);
         break;
      case VK_FORMAT_A2B10G10R10_UNORM_PACK32: {
         uint32_t data = DEREF(REF(uint32_t)(vertex_ptr));
         vertex.x = float(data & 0x3FF) / 0x3FF;
         vertex.y = float((data >> 10) & 0x3FF) / 0x3FF;
         vertex.z = float((data >> 20) & 0x3FF) / 0x3FF;
         break;
      }
      }

      result.vertex[i] = vertex;
   }

   return result;
}
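The SNORM/UNORM cases above follow the standard Vulkan conversions: divide by the largest positive code, and clamp SNORM to -1.0 so that both encodings of the most negative value map to -1.0. A C check of the same arithmetic:

#include <assert.h>
#include <stdint.h>

static float snorm16_to_float(int16_t v)
{
   float f = v / (float)0x7FFF;
   return f < -1.0f ? -1.0f : f; /* the shader's max(-1.0, ...) clamp */
}

static float unorm8_to_float(uint8_t v)
{
   return v / (float)0xFF;
}

int main(void)
{
   assert(snorm16_to_float(0x7FFF) == 1.0f);
   assert(snorm16_to_float(-0x8000) == -1.0f); /* clamped */
   assert(unorm8_to_float(255) == 1.0f);
   return 0;
}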
/* A GLSL-adapted copy of VkAccelerationStructureInstanceKHR. */
struct AccelerationStructureInstance {
   mat3x4 transform;
@@ -441,7 +77,7 @@ struct AccelerationStructureInstance {
TYPE(AccelerationStructureInstance, 8);

bool
build_triangle(inout radv_aabb bounds, VOID_REF dst_ptr, radv_bvh_geometry_data geom_data, uint32_t global_id)
build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id)
{
   bool is_valid = true;
   triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id);

@@ -490,7 +126,7 @@ build_triangle(inout radv_aabb bounds, VOID_REF dst_ptr, radv_bvh_geometry_data
}

bool
build_aabb(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id)
build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id)
{
   bool is_valid = true;
   REF(radv_bvh_aabb_node) node = REF(radv_bvh_aabb_node)(dst_ptr);
@@ -521,10 +157,10 @@ build_aabb(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t
   return is_valid;
}

radv_aabb
vk_aabb
calculate_instance_node_bounds(radv_accel_struct_header header, mat3x4 otw_matrix)
{
   radv_aabb aabb;
   vk_aabb aabb;
   for (uint32_t comp = 0; comp < 3; ++comp) {
      aabb.min[comp] = otw_matrix[comp][3];
      aabb.max[comp] = otw_matrix[comp][3];

@@ -555,7 +191,7 @@ encode_sbt_offset_and_flags(uint32_t src)
}

bool
build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id)
build_instance(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id)
{
   REF(radv_bvh_instance_node) node = REF(radv_bvh_instance_node)(dst_ptr);
@@ -591,123 +227,4 @@ build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint3
/* From macros.h */
#define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B))

#ifdef USE_GLOBAL_SYNC

/* There might be more invocations available than tasks to do.
 * In that case, the fetched task index is greater than the
 * counter offset for the next phase. To avoid out-of-bounds
 * accesses, phases will be skipped until the task index is
 * in-bounds again. */
uint32_t num_tasks_to_skip = 0;
uint32_t phase_index = 0;
bool should_skip = false;
shared uint32_t global_task_index;

shared uint32_t shared_phase_index;

uint32_t
task_count(REF(radv_ir_header) header)
{
   uint32_t phase_index = DEREF(header).sync_data.phase_index;
   return DEREF(header).sync_data.task_counts[phase_index & 1];
}

/* Sets the task count for the next phase. */
void
set_next_task_count(REF(radv_ir_header) header, uint32_t new_count)
{
   uint32_t phase_index = DEREF(header).sync_data.phase_index;
   DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count;
}

/*
 * This function has two main objectives:
 * Firstly, it partitions pending work among free invocations.
 * Secondly, it guarantees global synchronization between different phases.
 *
 * After every call to fetch_task, a new task index is returned.
 * fetch_task will also set num_tasks_to_skip. Use should_execute_phase
 * to determine if the current phase should be executed or skipped.
 *
 * Since tasks are assigned per-workgroup, there is a possibility of the task index being
 * greater than the total task count.
 */
uint32_t
fetch_task(REF(radv_ir_header) header, bool did_work)
{
   /* Perform a memory + control barrier for all buffer writes for the entire workgroup.
    * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished
    * and their results are written to memory. */
   controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer,
                  gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
   if (gl_LocalInvocationIndex == 0) {
      if (did_work)
         atomicAdd(DEREF(header).sync_data.task_done_counter, 1);
      global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1);

      do {
         /* Perform a memory barrier to refresh the current phase's end counter, in case
          * another workgroup changed it. */
         memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                       gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

         /* The first invocation of the first workgroup in a new phase is responsible for initiating
          * the switch to a new phase. It is only possible to switch to a new phase if all tasks of the
          * previous phase have been completed. Switching to a new phase and incrementing the phase
          * end counter in turn notifies all invocations for that phase that it is safe to execute.
          */
         if (global_task_index == DEREF(header).sync_data.current_phase_end_counter &&
             DEREF(header).sync_data.task_done_counter == DEREF(header).sync_data.current_phase_end_counter) {
            if (DEREF(header).sync_data.next_phase_exit_flag != 0) {
               DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID;
               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
            } else {
               atomicAdd(DEREF(header).sync_data.phase_index, 1);
               DEREF(header).sync_data.current_phase_start_counter = DEREF(header).sync_data.current_phase_end_counter;
               /* Ensure the changes to the phase index and start/end counter are visible to other
                * workgroups waiting in the loop. */
               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
               atomicAdd(DEREF(header).sync_data.current_phase_end_counter,
                         DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x));
            }
            break;
         }

         /* If other invocations have finished all nodes, break out; there is no work to do */
         if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) {
            break;
         }
      } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter);

      shared_phase_index = DEREF(header).sync_data.phase_index;
   }

   barrier();
   if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID)
      return TASK_INDEX_INVALID;

   num_tasks_to_skip = shared_phase_index - phase_index;

   uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter;
   return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x;
}

bool
should_execute_phase()
{
   if (num_tasks_to_skip > 0) {
      /* Skip to next phase. */
      ++phase_index;
      --num_tasks_to_skip;
      return false;
   }
   return true;
}

#define PHASE(header) \
   for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true))
#endif

#endif /* BUILD_HELPERS_H */
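The counter scheme above (task_started_counter, task_done_counter, current_phase_start/end_counter, and the double-buffered task_counts[2]) is easier to see serially. Below is a single-threaded C model of just the counter logic; it deliberately ignores the memory-ordering work the barriers do, and as in the shader, bumping the done counter is left to callers that actually did work:

#include <stdint.h>

struct sync_data {
   uint32_t task_counts[2];         /* double-buffered: current and next phase */
   uint32_t started, done;          /* task_started_counter / task_done_counter */
   uint32_t phase_start, phase_end; /* current_phase_{start,end}_counter */
   uint32_t phase_index;
};

/* One workgroup-leader fetch: returns a phase-local task index. */
static uint32_t fetch_task_model(struct sync_data *s, uint32_t wg_size)
{
   uint32_t task = s->started++;
   /* Last started task of the phase, and every task done: open the next
    * phase and extend the end counter by its workgroup-granular task count. */
   if (task == s->phase_end && s->done == s->phase_end) {
      s->phase_index++;
      s->phase_start = s->phase_end;
      uint32_t tasks = s->task_counts[s->phase_index & 1];
      s->phase_end += (tasks + wg_size - 1) / wg_size; /* DIV_ROUND_UP */
   }
   return task - s->phase_start;
}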
@@ -16,49 +16,6 @@
#define VOID_REF uint64_t
#endif

struct leaf_args {
   VOID_REF ir;
   VOID_REF bvh;
   REF(radv_ir_header) header;
   REF(key_id_pair) ids;

   radv_bvh_geometry_data geom_data;
};

struct morton_args {
   VOID_REF bvh;
   REF(radv_ir_header) header;
   REF(key_id_pair) ids;
};

#define LBVH_RIGHT_CHILD_BIT_SHIFT 29
#define LBVH_RIGHT_CHILD_BIT (1 << LBVH_RIGHT_CHILD_BIT_SHIFT)

struct lbvh_node_info {
   /* Number of children that have been processed (or are invalid/leaves) in
    * the lbvh_generate_ir pass.
    */
   uint32_t path_count;

   uint32_t children[2];
   uint32_t parent;
};
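LBVH_RIGHT_CHILD_BIT suggests links that pack a node index into the low 29 bits and a "right child" flag into bit 29. Which field carries the flag is an assumption for illustration; only the constants come from this header:

#include <assert.h>
#include <stdint.h>

#define LBVH_RIGHT_CHILD_BIT_SHIFT 29
#define LBVH_RIGHT_CHILD_BIT (1u << LBVH_RIGHT_CHILD_BIT_SHIFT)

static uint32_t encode_link(uint32_t node, int is_right_child)
{
   return node | (is_right_child ? LBVH_RIGHT_CHILD_BIT : 0);
}

int main(void)
{
   uint32_t link = encode_link(1234, 1);
   assert((link & ~LBVH_RIGHT_CHILD_BIT) == 1234); /* index recovered */
   assert((link & LBVH_RIGHT_CHILD_BIT) != 0);     /* flag recovered */
   return 0;
}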
struct lbvh_main_args {
   VOID_REF bvh;
   REF(key_id_pair) src_ids;
   VOID_REF node_info;
   uint32_t id_count;
   uint32_t internal_node_base;
};

struct lbvh_generate_ir_args {
   VOID_REF bvh;
   VOID_REF node_info;
   VOID_REF header;
   uint32_t internal_node_base;
};

#define RADV_COPY_MODE_COPY 0
#define RADV_COPY_MODE_SERIALIZE 1
#define RADV_COPY_MODE_DESERIALIZE 2

@@ -72,30 +29,14 @@ struct copy_args {
struct encode_args {
   VOID_REF intermediate_bvh;
   VOID_REF output_bvh;
   REF(radv_ir_header) header;
   REF(vk_ir_header) header;
   uint32_t output_bvh_offset;
   uint32_t leaf_node_count;
   uint32_t geometry_type;
};

struct ploc_prefix_scan_partition {
   uint32_t aggregate;
   uint32_t inclusive_sum;
};

#define PLOC_WORKGROUP_SIZE 1024
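An aggregate/inclusive_sum pair per partition is the classic shape of a single-pass prefix scan with decoupled look-back; assuming that is what the PLOC pass does with these fields, the resolution step looks like this serial C sketch (has_sum stands in for whatever ready flag the shader actually uses):

#include <stdint.h>

struct partition {
   uint32_t aggregate;     /* local sum, published early */
   uint32_t inclusive_sum; /* global inclusive sum, published later */
   int has_sum;            /* hypothetical ready flag */
};

/* Exclusive offset of partition `self`: walk backwards, accumulating
 * aggregates until a predecessor has published its final inclusive sum. */
static uint32_t resolve_exclusive(const struct partition *parts, uint32_t self)
{
   uint32_t sum = 0;
   for (int32_t i = (int32_t)self - 1; i >= 0; i--) {
      if (parts[i].has_sum)
         return sum + parts[i].inclusive_sum;
      sum += parts[i].aggregate;
   }
   return sum;
}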
struct ploc_args {
   VOID_REF bvh;
   VOID_REF prefix_scan_partitions;
   REF(radv_ir_header) header;
   VOID_REF ids_0;
   VOID_REF ids_1;
   uint32_t internal_node_offset;
};

struct header_args {
   REF(radv_ir_header) src;
   REF(vk_ir_header) src;
   REF(radv_accel_struct_header) dst;
   uint32_t bvh_offset;
   uint32_t instance_count;

@@ -104,11 +45,11 @@ struct header_args {
struct update_args {
   REF(radv_accel_struct_header) src;
   REF(radv_accel_struct_header) dst;
   REF(radv_aabb) leaf_bounds;
   REF(vk_aabb) leaf_bounds;
   REF(uint32_t) internal_ready_count;
   uint32_t leaf_node_count;

   radv_bvh_geometry_data geom_data;
   vk_bvh_geometry_data geom_data;
};

#endif /* BUILD_INTERFACE_H */
@@ -7,17 +7,14 @@
#ifndef BVH_BVH_H
#define BVH_BVH_H

#include "vk_bvh.h"

#define radv_bvh_node_triangle 0
#define radv_bvh_node_box16 4
#define radv_bvh_node_box32 5
#define radv_bvh_node_instance 6
#define radv_bvh_node_aabb 7

#define radv_ir_node_triangle 0
#define radv_ir_node_internal 1
#define radv_ir_node_instance 2
#define radv_ir_node_aabb 3

#define RADV_GEOMETRY_OPAQUE (1u << 31)

#define RADV_INSTANCE_FORCE_OPAQUE (1u << 31)

@@ -29,31 +26,9 @@
#define VK_UUID_SIZE 16
#else
#include <vulkan/vulkan.h>
typedef struct radv_ir_node radv_ir_node;
typedef struct radv_global_sync_data radv_global_sync_data;
typedef struct radv_bvh_geometry_data radv_bvh_geometry_data;

typedef uint16_t float16_t;

typedef struct {
   float values[3][4];
} mat3x4;

typedef struct {
   float x;
   float y;
   float z;
} vec3;

typedef struct radv_aabb radv_aabb;

#endif

struct radv_aabb {
   vec3 min;
   vec3 max;
};

struct radv_accel_struct_serialization_header {
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t accel_struct_compat[VK_UUID_SIZE];

@@ -74,7 +49,7 @@ struct radv_accel_struct_geometry_info {
struct radv_accel_struct_header {
   uint32_t bvh_offset;
   uint32_t reserved;
   radv_aabb aabb;
   vk_aabb aabb;

   /* Everything after this gets either updated/copied from the CPU or written by header.comp. */
   uint64_t compacted_size;

@@ -89,45 +64,6 @@ struct radv_accel_struct_header {
   uint32_t build_flags;
};

struct radv_ir_node {
   radv_aabb aabb;
};

#define RADV_UNKNOWN_BVH_OFFSET 0xFFFFFFFF
#define RADV_NULL_BVH_OFFSET 0xFFFFFFFE

struct radv_ir_box_node {
   radv_ir_node base;
   uint32_t children[2];
   uint32_t bvh_offset;
};

struct radv_global_sync_data {
   uint32_t task_counts[2];
   uint32_t task_started_counter;
   uint32_t task_done_counter;
   uint32_t current_phase_start_counter;
   uint32_t current_phase_end_counter;
   uint32_t phase_index;
   /* If this flag is set, the shader should exit
    * instead of executing another phase */
   uint32_t next_phase_exit_flag;
};

struct radv_ir_header {
   int32_t min_bounds[3];
   int32_t max_bounds[3];
   uint32_t active_leaf_count;
   /* Indirect dispatch dimensions for the encoder.
    * ir_internal_node_count is the thread count in the X dimension,
    * while Y and Z are always set to 1. */
   uint32_t ir_internal_node_count;
   uint32_t dispatch_size_y;
   uint32_t dispatch_size_z;
   radv_global_sync_data sync_data;
   uint32_t dst_node_offset;
};

struct radv_bvh_triangle_node {
   float coords[3][3];
   uint32_t reserved[3];

@@ -170,28 +106,11 @@ struct radv_bvh_box16_node {

struct radv_bvh_box32_node {
   uint32_t children[4];
   radv_aabb coords[4];
   vk_aabb coords[4];
   uint32_t reserved[4];
};

#define RADV_BVH_ROOT_NODE radv_bvh_node_box32
#define RADV_BVH_INVALID_NODE 0xffffffffu

/* If the task index is set to this value, there is no
 * more work to do. */
#define TASK_INDEX_INVALID 0xFFFFFFFF

struct radv_bvh_geometry_data {
   uint64_t data;
   uint64_t indices;
   uint64_t transform;

   uint32_t geometry_id;
   uint32_t geometry_type;
   uint32_t first_id;
   uint32_t stride;
   uint32_t vertex_format;
   uint32_t index_format;
};

#endif /* BVH_H */
@@ -36,31 +36,85 @@ void set_parent(uint32_t child, uint32_t parent)
void
main()
{
   /* Revert the order so we start at the root */
   uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x;

   uint32_t output_leaf_node_size;
   switch (args.geometry_type) {
   case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
      output_leaf_node_size = SIZEOF(radv_bvh_triangle_node);
      break;
   case VK_GEOMETRY_TYPE_AABBS_KHR:
      output_leaf_node_size = SIZEOF(radv_bvh_aabb_node);
      break;
   default: /* instances */
      output_leaf_node_size = SIZEOF(radv_bvh_instance_node);
      break;
   }

   uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * SIZEOF(radv_ir_node);
   /* Encode leaf nodes. */
   uint32_t dst_leaf_offset =
      id_to_offset(RADV_BVH_ROOT_NODE) + SIZEOF(radv_bvh_box32_node);

   uint32_t ir_leaf_node_size;
   uint32_t output_leaf_node_size;
   switch (args.geometry_type) {
   case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
      ir_leaf_node_size = SIZEOF(vk_ir_triangle_node);
      output_leaf_node_size = SIZEOF(radv_bvh_triangle_node);

      vk_ir_triangle_node src_node =
         DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size)));
      REF(radv_bvh_triangle_node) dst_node =
         REF(radv_bvh_triangle_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size));

      DEREF(dst_node).coords = src_node.coords;
      DEREF(dst_node).triangle_id = src_node.triangle_id;
      DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags;
      DEREF(dst_node).id = 9;

      break;
   }
   case VK_GEOMETRY_TYPE_AABBS_KHR: {
      ir_leaf_node_size = SIZEOF(vk_ir_aabb_node);
      output_leaf_node_size = SIZEOF(radv_bvh_aabb_node);

      vk_ir_aabb_node src_node =
         DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size)));
      REF(radv_bvh_aabb_node) dst_node =
         REF(radv_bvh_aabb_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size));

      DEREF(dst_node).primitive_id = src_node.primitive_id;
      DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags;

      break;
   }
   default: {
      /* instances */
      ir_leaf_node_size = SIZEOF(vk_ir_instance_node);
      output_leaf_node_size = SIZEOF(radv_bvh_instance_node);

      vk_ir_instance_node src_node =
         DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size)));
      REF(radv_bvh_instance_node) dst_node =
         REF(radv_bvh_instance_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size));

      radv_accel_struct_header blas_header =
         DEREF(REF(radv_accel_struct_header)(src_node.base_ptr));

      DEREF(dst_node).bvh_ptr = addr_to_node(src_node.base_ptr + blas_header.bvh_offset);
      DEREF(dst_node).bvh_offset = blas_header.bvh_offset;

      mat4 transform = mat4(src_node.otw_matrix);
      mat4 inv_transform = transpose(inverse(transpose(transform)));
      DEREF(dst_node).wto_matrix = mat3x4(inv_transform);
      DEREF(dst_node).otw_matrix = mat3x4(transform);

      DEREF(dst_node).custom_instance_and_mask = src_node.custom_instance_and_mask;
      DEREF(dst_node).sbt_offset_and_flags = encode_sbt_offset_and_flags(src_node.sbt_offset_and_flags);
      DEREF(dst_node).instance_id = src_node.instance_id;

      break;
   }
   }

   if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count)
      return;

   /* Encode internal nodes. Revert the order so we start at the root */
   uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x;

   uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * ir_leaf_node_size;
   uint32_t dst_internal_offset = dst_leaf_offset + args.leaf_node_count * output_leaf_node_size;

   REF(radv_ir_box_node) intermediate_internal_nodes =
      REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size);
   REF(radv_ir_box_node) src_node = INDEX(radv_ir_box_node, intermediate_internal_nodes, global_id);
   radv_ir_box_node src = DEREF(src_node);
   REF(vk_ir_box_node) intermediate_internal_nodes =
      REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size);
   REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id);
   vk_ir_box_node src = DEREF(src_node);

   bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1;

@@ -70,10 +124,10 @@ main()
                     gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

      uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
      if (bvh_offset == RADV_UNKNOWN_BVH_OFFSET)
      if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
         continue;

      if (bvh_offset == RADV_NULL_BVH_OFFSET)
      if (bvh_offset == VK_NULL_BVH_OFFSET)
         break;

      REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset));

@@ -92,11 +146,11 @@ main()
         float largest_surface_area = -INFINITY;

         for (int32_t i = 0; i < found_child_count; ++i) {
            if (ir_id_to_type(children[i]) != radv_ir_node_internal)
            if (ir_id_to_type(children[i]) != vk_ir_node_internal)
               continue;

            radv_aabb bounds =
               DEREF(REF(radv_ir_node)OFFSET(args.intermediate_bvh,
            vk_aabb bounds =
               DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh,
                                           ir_id_to_offset(children[i]))).aabb;

            float surface_area = aabb_surface_area(bounds);

@@ -107,8 +161,8 @@ main()
         }

         if (collapsed_child_index != -1) {
            REF(radv_ir_box_node) child_node =
               REF(radv_ir_box_node)OFFSET(args.intermediate_bvh,
            REF(vk_ir_box_node) child_node =
               REF(vk_ir_box_node)OFFSET(args.intermediate_bvh,
                                         ir_id_to_offset(children[collapsed_child_index]));
            uint32_t grandchildren[2] = DEREF(child_node).children;
            uint32_t valid_grandchild_count = 0;

@@ -131,7 +185,7 @@ main()
               children[collapsed_child_index] = children[found_child_count];
            }

            DEREF(child_node).bvh_offset = RADV_NULL_BVH_OFFSET;
            DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
         } else
            break;
      }

@@ -141,24 +195,24 @@ main()
         uint32_t offset = ir_id_to_offset(children[i]);
         uint32_t dst_offset;

         if (type == radv_ir_node_internal) {
         if (type == vk_ir_node_internal) {
#if COMPACT
            dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node));
#else
            uint32_t offset_in_internal_nodes = offset - intermediate_leaf_nodes_size;
            uint32_t child_index = offset_in_internal_nodes / SIZEOF(radv_ir_box_node);
            uint32_t child_index = offset_in_internal_nodes / SIZEOF(vk_ir_box_node);
            dst_offset = dst_internal_offset + child_index * SIZEOF(radv_bvh_box32_node);
#endif

            REF(radv_ir_box_node) child_node = REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, offset);
            REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset);
            DEREF(child_node).bvh_offset = dst_offset;
         } else {
            uint32_t child_index = offset / SIZEOF(radv_ir_node);
            uint32_t child_index = offset / ir_leaf_node_size;
            dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
         }

         radv_aabb child_aabb =
            DEREF(REF(radv_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
         vk_aabb child_aabb =
            DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;

         DEREF(dst_node).coords[i] = child_aabb;
@@ -1,99 +0,0 @@
/*
 * Copyright © 2022 Konstantin Seurer
 *
 * SPDX-License-Identifier: MIT
 */

#version 460

#extension GL_GOOGLE_include_directive : require

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_KHR_shader_subgroup_ballot : require

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

#include "build_interface.h"

layout(push_constant) uniform CONSTS {
   leaf_args args;
};

void
main(void)
{
   uint32_t global_id = gl_GlobalInvocationID.x;
   uint32_t primitive_id = args.geom_data.first_id + global_id;

   REF(key_id_pair) id_ptr = INDEX(key_id_pair, args.ids, primitive_id);
   uint32_t src_offset = global_id * args.geom_data.stride;

   uint32_t dst_stride;
   uint32_t node_type;
   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
      dst_stride = SIZEOF(radv_bvh_triangle_node);
      node_type = radv_ir_node_triangle;
   } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) {
      dst_stride = SIZEOF(radv_bvh_aabb_node);
      node_type = radv_ir_node_aabb;
   } else {
      dst_stride = SIZEOF(radv_bvh_instance_node);
      node_type = radv_ir_node_instance;
   }

   uint32_t dst_offset = primitive_id * dst_stride;
   VOID_REF dst_ptr = OFFSET(args.bvh, dst_offset);

   radv_aabb bounds;
   bool is_active;
   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
      is_active = build_triangle(bounds, dst_ptr, args.geom_data, global_id);
   } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) {
      VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
      is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, global_id);
   } else {
      VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
      /* arrayOfPointers */
      if (args.geom_data.stride == 8) {
         src_ptr = DEREF(REF(VOID_REF)(src_ptr));
      }

      is_active = build_instance(bounds, src_ptr, dst_ptr, global_id);
   }

#if ALWAYS_ACTIVE
   if (!is_active && args.geom_data.geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) {
      bounds.min = vec3(0.0);
      bounds.max = vec3(0.0);
      is_active = true;
   }
#endif

   if (is_active) {
      REF(radv_ir_node) ir_node = INDEX(radv_ir_node, args.ir, primitive_id);
      DEREF(ir_node).aabb = bounds;
   }

   uint32_t ir_offset = primitive_id * SIZEOF(radv_ir_node);
   DEREF(id_ptr).id = is_active ? pack_ir_node_id(ir_offset, node_type) : RADV_BVH_INVALID_NODE;

   uvec4 ballot = subgroupBallot(is_active);
   if (subgroupElect())
      atomicAdd(DEREF(args.header).active_leaf_count, subgroupBallotBitCount(ballot));

   atomicMin(DEREF(args.header).min_bounds[0], to_emulated_float(bounds.min.x));
   atomicMin(DEREF(args.header).min_bounds[1], to_emulated_float(bounds.min.y));
   atomicMin(DEREF(args.header).min_bounds[2], to_emulated_float(bounds.min.z));
   atomicMax(DEREF(args.header).max_bounds[0], to_emulated_float(bounds.max.x));
   atomicMax(DEREF(args.header).max_bounds[1], to_emulated_float(bounds.max.y));
   atomicMax(DEREF(args.header).max_bounds[2], to_emulated_float(bounds.max.z));
}
@@ -23,36 +23,6 @@ bvh_shaders = [
    'header',
    [],
  ],
  [
    'lbvh_generate_ir.comp',
    'lbvh_generate_ir',
    [],
  ],
  [
    'lbvh_main.comp',
    'lbvh_main',
    [],
  ],
  [
    'leaf.comp',
    'leaf',
    ['ALWAYS_ACTIVE=0'],
  ],
  [
    'leaf.comp',
    'leaf_always_active',
    ['ALWAYS_ACTIVE=1'],
  ],
  [
    'morton.comp',
    'morton',
    [],
  ],
  [
    'ploc_internal.comp',
    'ploc_internal',
    [],
  ],
  [
    'update.comp',
    'update',

@@ -61,17 +31,20 @@ bvh_shaders = [
]

bvh_include_dir = dir_source_root + '/src/amd/vulkan/bvh'
vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh'

bvh_includes = files(
  'build_helpers.h',
  'build_interface.h',
  'bvh.h',
  vk_bvh_include_dir + '/vk_build_helpers.h',
  vk_bvh_include_dir + '/vk_bvh.h',
)

bvh_spv = []
foreach s : bvh_shaders
  command = [
    prog_glslang, '-V', '-I' + bvh_include_dir, '--target-env', 'spirv1.5',
    prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5',
    '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet,
  ]
@@ -53,7 +53,7 @@ void main() {
   VOID_REF dst_ptr = OFFSET(dst_bvh, dst_offset);
   uint32_t src_offset = gl_GlobalInvocationID.x * args.geom_data.stride;

   radv_aabb bounds;
   vk_aabb bounds;
   bool is_active;
   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
      is_active = build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x);

@@ -65,7 +65,7 @@ void main() {
   if (!is_active)
      return;

   DEREF(INDEX(radv_aabb, args.leaf_bounds, leaf_node_id)) = bounds;
   DEREF(INDEX(vk_aabb, args.leaf_bounds, leaf_node_id)) = bounds;
   memoryBarrier(gl_ScopeDevice,
                 gl_StorageSemanticsBuffer,
                 gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

@@ -112,11 +112,11 @@ void main() {

      for (uint32_t i = 0; i < valid_child_count; ++i) {
         uint32_t child_offset = id_to_offset(children[i]);
         radv_aabb child_bounds;
         vk_aabb child_bounds;
         if (child_offset == dst_offset)
            child_bounds = bounds;
         else if (child_offset >= internal_nodes_offset) {
            child_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY));
            child_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY));
            REF(radv_bvh_box32_node) child_node = REF(radv_bvh_box32_node)OFFSET(dst_bvh, child_offset);
            for (uint32_t j = 0; j < 4; ++j) {
               if (DEREF(child_node).children[j] == RADV_BVH_INVALID_NODE)

@@ -126,16 +126,16 @@ void main() {
            }
         } else {
            uint32_t child_index = (child_offset - first_leaf_offset) / leaf_node_size;
            child_bounds = DEREF(INDEX(radv_aabb, args.leaf_bounds, child_index));
            child_bounds = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index));
         }

         DEREF(dst_node).coords[i] = child_bounds;
      }

      if (parent_id == RADV_BVH_ROOT_NODE) {
         radv_aabb root_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY));
         vk_aabb root_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY));
         for (uint32_t i = 0; i < valid_child_count; ++i) {
            radv_aabb bounds = DEREF(dst_node).coords[i];
            vk_aabb bounds = DEREF(dst_node).coords[i];
            root_bounds.min = min(root_bounds.min, bounds.min);
            root_bounds.max = max(root_bounds.max, bounds.max);
         }
@@ -191,9 +191,6 @@ if amd_with_llvm
  )
endif

subdir('radix_sort')
libradv_files += radix_sort_files

subdir('bvh')

subdir('layers')
@@ -1,21 +0,0 @@
# Copyright © 2022 Konstantin Seurer
# SPDX-License-Identifier: MIT

subdir('shaders')

radix_sort_files = files(
  'common/vk/barrier.c',
  'common/vk/barrier.h',
  'common/macros.h',
  'common/util.c',
  'common/util.h',
  'shaders/push.h',
  'targets/u64/config.h',
  'radix_sort_vk_devaddr.h',
  'radix_sort_vk_ext.h',
  'radix_sort_vk.c',
  'radix_sort_vk.h',
  'radv_radix_sort.c',
  'radv_radix_sort.h',
  'target.h'
)
@@ -1,196 +0,0 @@
/*
 * Copyright © 2022 Konstantin Seurer
 *
 * SPDX-License-Identifier: MIT
 */

#include "radv_radix_sort.h"
#include "targets/u64/config.h"
#include "radv_cmd_buffer.h"
#include "target.h"

static const uint32_t init_spv[] = {
#include "radix_sort/shaders/init.comp.spv.h"
};

static const uint32_t fill_spv[] = {
#include "radix_sort/shaders/fill.comp.spv.h"
};

static const uint32_t histogram_spv[] = {
#include "radix_sort/shaders/histogram.comp.spv.h"
};

static const uint32_t prefix_spv[] = {
#include "radix_sort/shaders/prefix.comp.spv.h"
};

static const uint32_t scatter_0_even_spv[] = {
#include "radix_sort/shaders/scatter_0_even.comp.spv.h"
};

static const uint32_t scatter_0_odd_spv[] = {
#include "radix_sort/shaders/scatter_0_odd.comp.spv.h"
};

static const uint32_t scatter_1_even_spv[] = {
#include "radix_sort/shaders/scatter_1_even.comp.spv.h"
};

static const uint32_t scatter_1_odd_spv[] = {
#include "radix_sort/shaders/scatter_1_odd.comp.spv.h"
};

static const struct radix_sort_vk_target_config target_config = {
   .keyval_dwords = RS_KEYVAL_DWORDS,

   .histogram =
      {
         .workgroup_size_log2 = RS_HISTOGRAM_WORKGROUP_SIZE_LOG2,
         .subgroup_size_log2 = RS_HISTOGRAM_SUBGROUP_SIZE_LOG2,
         .block_rows = RS_HISTOGRAM_BLOCK_ROWS,
      },

   .prefix =
      {
         .workgroup_size_log2 = RS_PREFIX_WORKGROUP_SIZE_LOG2,
         .subgroup_size_log2 = RS_PREFIX_SUBGROUP_SIZE_LOG2,
      },

   .scatter =
      {
         .workgroup_size_log2 = RS_SCATTER_WORKGROUP_SIZE_LOG2,
         .subgroup_size_log2 = RS_SCATTER_SUBGROUP_SIZE_LOG2,
         .block_rows = RS_SCATTER_BLOCK_ROWS,
      },
};

radix_sort_vk_t *
radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc)
{
   const uint32_t *spv[8] = {
      init_spv, fill_spv, histogram_spv, prefix_spv,
      scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv,
   };
   const uint32_t spv_sizes[8] = {
      sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv), sizeof(prefix_spv),
      sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv), sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv),
   };
   return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, target_config);
}

VKAPI_ATTR VkResult VKAPI_CALL
vkCreateShaderModule(VkDevice _device, const VkShaderModuleCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   return device->vk.dispatch_table.CreateShaderModule(_device, pCreateInfo, pAllocator, pShaderModule);
}

VKAPI_ATTR void VKAPI_CALL
vkDestroyShaderModule(VkDevice _device, VkShaderModule shaderModule, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   device->vk.dispatch_table.DestroyShaderModule(_device, shaderModule, pAllocator);
}

VKAPI_ATTR VkResult VKAPI_CALL
vkCreatePipelineLayout(VkDevice _device, const VkPipelineLayoutCreateInfo *pCreateInfo,
                       const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   return device->vk.dispatch_table.CreatePipelineLayout(_device, pCreateInfo, pAllocator, pPipelineLayout);
}

VKAPI_ATTR void VKAPI_CALL
vkDestroyPipelineLayout(VkDevice _device, VkPipelineLayout pipelineLayout, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   device->vk.dispatch_table.DestroyPipelineLayout(_device, pipelineLayout, pAllocator);
}

VKAPI_ATTR VkResult VKAPI_CALL
vkCreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t createInfoCount,
                         const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator,
                         VkPipeline *pPipelines)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   return device->vk.dispatch_table.CreateComputePipelines(_device, pipelineCache, createInfoCount, pCreateInfos,
                                                           pAllocator, pPipelines);
}

VKAPI_ATTR void VKAPI_CALL
vkDestroyPipeline(VkDevice _device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   device->vk.dispatch_table.DestroyPipeline(_device, pipeline, pAllocator);
}

VKAPI_ATTR void VKAPI_CALL
vkCmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask,
                     VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags, uint32_t memoryBarrierCount,
                     const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount,
                     const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount,
                     const VkImageMemoryBarrier *pImageMemoryBarriers)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);

   device->vk.dispatch_table.CmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, dependencyFlags,
                                                memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
                                                pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers);
}

VKAPI_ATTR void VKAPI_CALL
vkCmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags,
                   uint32_t offset, uint32_t size, const void *pValues)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);

   device->vk.dispatch_table.CmdPushConstants(commandBuffer, layout, stageFlags, offset, size, pValues);
}

VKAPI_ATTR void VKAPI_CALL
vkCmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);

   device->vk.dispatch_table.CmdBindPipeline(commandBuffer, pipelineBindPoint, pipeline);
}

VKAPI_ATTR void VKAPI_CALL
vkCmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);

   device->vk.dispatch_table.CmdDispatch(commandBuffer, groupCountX, groupCountY, groupCountZ);
}

VKAPI_ATTR VkDeviceAddress VKAPI_CALL
vkGetBufferDeviceAddress(VkDevice _device, const VkBufferDeviceAddressInfo *pInfo)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   return device->vk.dispatch_table.GetBufferDeviceAddress(_device, pInfo);
}

VKAPI_ATTR void VKAPI_CALL
vkCmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize size,
                uint32_t data)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);

   device->vk.dispatch_table.CmdFillBuffer(commandBuffer, dstBuffer, dstOffset, size, data);
}

VKAPI_ATTR void VKAPI_CALL
vkCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);

   device->vk.dispatch_table.CmdDispatchIndirect(commandBuffer, buffer, offset);
}
@@ -1,14 +0,0 @@
/*
 * Copyright © 2022 Konstantin Seurer
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef RADV_RADIX_SORT_H
#define RADV_RADIX_SORT_H

#include "radix_sort_vk_devaddr.h"

radix_sort_vk_t *radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc);

#endif
@@ -1,40 +0,0 @@
# Copyright © 2022 Konstantin Seurer
# SPDX-License-Identifier: MIT

radix_sort_shaders = [
  'init.comp',
  'fill.comp',
  'histogram.comp',
  'prefix.comp',
  'scatter_0_even.comp',
  'scatter_0_odd.comp',
  'scatter_1_even.comp',
  'scatter_1_odd.comp'
]

shader_include_dir = dir_source_root + '/src/amd/vulkan/radix_sort/targets/u64'

shader_include_files = files(
  'bufref.h',
  'prefix_limits.h',
  'prefix.h',
  'push.h',
  'scatter.glsl',
  dir_source_root + '/src/amd/vulkan/radix_sort/targets/u64/config.h'
)

radix_sort_spv = []
foreach s : radix_sort_shaders
  _name = f'@s@.spv.h'
  radix_sort_spv += custom_target(
    _name,
    input : s,
    output : _name,
    command : [
      prog_glslang, '-V', '-I' + shader_include_dir, '--target-env', 'spirv1.3',
      '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_quiet, glslang_depfile,
    ],
    depfile : f'@_name@.d',
    depend_files : shader_include_files,
  )
endforeach
@@ -1,353 +0,0 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_

//
// Requires several defines
//
#ifndef RS_PREFIX_LIMITS
#error "Error: \"prefix_limits.h\" not loaded"
#endif

#ifndef RS_PREFIX_ARGS
#error "Error: RS_PREFIX_ARGS undefined"
#endif

#ifndef RS_PREFIX_LOAD
#error "Error: RS_PREFIX_LOAD undefined"
#endif

#ifndef RS_PREFIX_STORE
#error "Error: RS_PREFIX_STORE undefined"
#endif

#ifndef RS_SUBGROUP_SIZE
#error "Error: RS_SUBGROUP_SIZE undefined"
#endif

#ifndef RS_WORKGROUP_SIZE
#error "Error: RS_WORKGROUP_SIZE undefined"
#endif

#ifndef RS_WORKGROUP_SUBGROUPS
#error "Error: RS_WORKGROUP_SUBGROUPS undefined"
#endif

//
// Optional switches:
//
// * Disable holding original inclusively scanned histogram values in registers.
//
//   #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
//

//
// Compute exclusive prefix of uint32_t[256]
//
void
rs_prefix(RS_PREFIX_ARGS)
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
  //
  // Workgroup is a single subgroup so no shared memory is required.
  //

  //
  // Exclusive scan-add the histogram
  //
  const uint32_t h0     = RS_PREFIX_LOAD(0);
  const uint32_t h0_inc = subgroupInclusiveAdd(h0);
  RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);

  RS_PREFIX_STORE(0) = h0_inc - h0;  // exclusive

  //
  // Each iteration is dependent on the previous so no unrolling. The
  // compiler is free to hoist the loads upward though.
  //
  for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE;  //
       ii < RS_RADIX_SIZE;
       ii += RS_SUBGROUP_SIZE)
  {
    const uint32_t h     = RS_PREFIX_LOAD(ii);
    const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last;
    h_last               = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);

    RS_PREFIX_STORE(ii) = h_inc - h;  // exclusive
  }

#else
  //
  // Workgroup is multiple subgroups and uses shared memory to store
  // the scan's intermediate results.
  //
  // Assumes a power-of-two subgroup, workgroup and radix size.
  //
  // Downsweep: Repeatedly scan reductions until they fit in a single
  // subgroup.
  //
  // Upsweep: Then uniformly apply reductions to each subgroup.
  //
  //
  // Subgroup Size |  4 |  8 | 16 | 32 | 64 |
  // --------------+----+----+----+----+----+
  // Sweep 0       | 64 | 32 | 16 |  8 |  4 | sweep_0[]
  // Sweep 1       | 16 |  4 |  - |  - |  - | sweep_1[]
  // Sweep 2       |  4 |  - |  - |  - |  - | sweep_2[]
  // --------------+----+----+----+----+----+
  // Total dwords  | 84 | 36 | 16 |  8 |  4 |
  // --------------+----+----+----+----+----+
  //
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
  uint32_t h_exc[RS_H_COMPONENTS];
#endif

  //
  // Downsweep 0
  //
  [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
  {
    const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);

    const uint32_t h_inc = subgroupInclusiveAdd(h);

    const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

    RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);

    //
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
    h_exc[ii] = h_inc - h;
#else
    RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h;
#endif
  }

  barrier();

  //
  // Skip generalizing these sweeps for all possible subgroups -- just
  // write them directly.
  //
#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16))

  //////////////////////////////////////////////////////////////////////
  //
  // Scan 0
  //
#if (RS_SWEEP_0_SIZE != RS_SUBGROUP_SIZE)
  if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE)  // subgroup has inactive invocations
#endif
  {
    const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
    const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

    RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
  }

#elif (RS_SUBGROUP_SIZE == 8)

#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)

  //////////////////////////////////////////////////////////////////////
  //
  // Scan 0 and Downsweep 1
  //
  if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE)  // 32 invocations
  {
    const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
    const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

    RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
    RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
  }

#else

  //////////////////////////////////////////////////////////////////////
  //
  // Scan 0 and Downsweep 1
  //
  [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++)  // 32 invocations
  {
    const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
    const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

    const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
    const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

    RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
    RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
  }

#endif

  barrier();

  //
  // Scan 1
  //
  if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE)  // 4 invocations
  {
    const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
    const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);

    RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
  }

#elif (RS_SUBGROUP_SIZE == 4)

  //////////////////////////////////////////////////////////////////////
  //
  // Scan 0 and Downsweep 1
  //
#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)

  if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE)  // 64 invocations
  {
    const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
    const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

    RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
    RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
  }

#else

  [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++)  // 64 invocations
  {
    const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
    const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

    const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
    const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

    RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
    RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
  }
#endif

  barrier();

  //
  // Scan 1 and Downsweep 2
  //
#if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE)
  if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE)  // 16 invocations
  {
    const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
    const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);

    RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
    RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
  }

#else

  [[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++)  // 16 invocations
  {
    const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
    const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

    const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1);
    const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);

    RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red;
    RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
  }

#endif

  barrier();

  //
  // Scan 2
  //
  // 4 invocations
  //
  if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE)
  {
    const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x);
    const uint32_t h2_inc = subgroupInclusiveAdd(h2_red);

    RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red;
  }

#else
#error "Error: Unsupported subgroup size"
#endif

  barrier();

  //////////////////////////////////////////////////////////////////////
  //
  // Final upsweep 0
  //
#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16))

  [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
  {
    const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

    // clang format issue
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
    RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0);
#else
    const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);

    RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0);
#endif
  }

#elif (RS_SUBGROUP_SIZE == 8)

  [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
  {
    const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
    const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;

#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
    RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
      h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
#else
    const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);

    RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
      h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
#endif
  }

#elif (RS_SUBGROUP_SIZE == 4)

  [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
  {
    const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
    const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;
    const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE;

#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
    RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
      h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
#else
    const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);

    RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
      h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
#endif
  }

#else
#error "Error: Unsupported subgroup size"
#endif

#endif
}

//
//
//

#endif  // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
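For readers tracing the sweep structure that the deleted prefix.h implements, a serial C model of the same downsweep/scan/upsweep idea follows (a sketch; the function name and the single scan level are illustrative, not part of the diff):

#include <stdint.h>

#define RADIX 256

/* Exclusive prefix sum over RADIX counters, processed in W-wide blocks
 * the way a W-lane subgroup would process them. */
static void model_rs_prefix(uint32_t h[RADIX], uint32_t W)
{
   uint32_t sweep0[RADIX];
   uint32_t nblocks = RADIX / W;

   /* Downsweep 0: exclusive-scan each block, keep each block's total. */
   for (uint32_t b = 0; b < nblocks; b++) {
      uint32_t sum = 0;
      for (uint32_t i = 0; i < W; i++) {
         uint32_t v = h[b * W + i];
         h[b * W + i] = sum; /* exclusive within the block */
         sum += v;
      }
      sweep0[b] = sum; /* block reduction */
   }

   /* Scan 0: exclusive-scan the block totals (the shader repeats this
    * step until the totals fit in a single subgroup). */
   uint32_t run = 0;
   for (uint32_t b = 0; b < nblocks; b++) {
      uint32_t v = sweep0[b];
      sweep0[b] = run;
      run += v;
   }

   /* Upsweep: apply each block's scanned offset uniformly. */
   for (uint32_t b = 0; b < nblocks; b++)
      for (uint32_t i = 0; i < W; i++)
         h[b * W + i] += sweep0[b];
}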
@@ -1,34 +0,0 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_

//
//
//

// clang-format off
#define RS_KEYVAL_DWORDS 2

#define RS_FILL_WORKGROUP_SIZE_LOG2 7
#define RS_FILL_BLOCK_ROWS 8

#define RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 8
#define RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 6
#define RS_HISTOGRAM_BLOCK_ROWS 14

#define RS_PREFIX_WORKGROUP_SIZE_LOG2 8
#define RS_PREFIX_SUBGROUP_SIZE_LOG2 6

#define RS_SCATTER_WORKGROUP_SIZE_LOG2 8
#define RS_SCATTER_SUBGROUP_SIZE_LOG2 6
#define RS_SCATTER_BLOCK_ROWS 14
// clang-format on

//
//
//

#endif  // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_
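The deleted target config is written in log2 form. A short C sketch of what those values expand to (the derived numbers follow directly from the defines; the variable names are illustrative):

#include <stdint.h>

static const uint32_t rs_keyval_bytes      = 2 * 4;          /* RS_KEYVAL_DWORDS * 4 = 8: u64 keyvals */
static const uint32_t rs_histogram_wg_size = 1u << 8;        /* 256 invocations per workgroup */
static const uint32_t rs_histogram_sg_size = 1u << 6;        /* 64-lane subgroups (GCN3 wave64) */
static const uint32_t rs_histogram_block   = (1u << 8) * 14; /* 256 * RS_HISTOGRAM_BLOCK_ROWS = 3584 keyvals */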
File diff suppressed because it is too large
@@ -24,7 +24,9 @@
#include "radv_rra.h"
#include "radv_shader.h"

#include "vk_acceleration_structure.h"
#include "vk_device.h"
#include "vk_meta.h"
#include "vk_texcompress_astc.h"
#include "vk_texcompress_etc2.h"

@@ -302,17 +304,6 @@ struct radv_meta_state {
   } dcc_retile;

   struct {
      VkPipelineLayout leaf_p_layout;
      VkPipeline leaf_pipeline;
      VkPipeline leaf_updateable_pipeline;
      VkPipelineLayout morton_p_layout;
      VkPipeline morton_pipeline;
      VkPipelineLayout lbvh_main_p_layout;
      VkPipeline lbvh_main_pipeline;
      VkPipelineLayout lbvh_generate_ir_p_layout;
      VkPipeline lbvh_generate_ir_pipeline;
      VkPipelineLayout ploc_p_layout;
      VkPipeline ploc_pipeline;
      VkPipelineLayout encode_p_layout;
      VkPipeline encode_pipeline;
      VkPipeline encode_compact_pipeline;

@@ -324,6 +315,7 @@ struct radv_meta_state {
      VkPipeline copy_pipeline;

      struct radix_sort_vk *radix_sort;
      struct vk_acceleration_structure_build_args build_args;

   struct {
      VkBuffer buffer;

@@ -340,6 +332,8 @@ struct radv_meta_state {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
   } dgc_prepare;

   struct vk_meta_device device;
};

struct radv_memory_trace_data {
@@ -542,7 +542,7 @@ rra_transcode_triangle_node(struct rra_transcoding_context *ctx, const struct ra
 }

 static void
-rra_transcode_aabb_node(struct rra_transcoding_context *ctx, const struct radv_bvh_aabb_node *src, radv_aabb bounds)
+rra_transcode_aabb_node(struct rra_transcoding_context *ctx, const struct radv_bvh_aabb_node *src, vk_aabb bounds)
 {
    struct rra_aabb_node *dst = (struct rra_aabb_node *)(ctx->dst + ctx->dst_leaf_offset);
    ctx->dst_leaf_offset += sizeof(struct rra_aabb_node);

@@ -580,7 +580,7 @@ rra_transcode_instance_node(struct rra_transcoding_context *ctx, const struct ra
 }

 static uint32_t rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id,
-                                   radv_aabb bounds);
+                                   vk_aabb bounds);

 static void
 rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_bvh_box16_node *src)

@@ -597,7 +597,7 @@ rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_
          continue;
       }

-      radv_aabb bounds = {
+      vk_aabb bounds = {
          .min =
             {
                _mesa_half_to_float(src->coords[i][0][0]),

@@ -653,7 +653,7 @@ get_geometry_id(const void *node, uint32_t node_type)
 }

 static uint32_t
-rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, radv_aabb bounds)
+rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, vk_aabb bounds)
 {
    uint32_t node_type = src_id & 7;
    uint32_t src_offset = (src_id & (~7u)) << 3;
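The decode at the end of this hunk ("node_type = src_id & 7", "src_offset = (src_id & (~7u)) << 3") implies the node-id packing used throughout the hardware BVH: the low 3 bits carry the node type, and the remaining bits carry the byte offset divided by 8, which requires 64-byte-aligned nodes. A sketch of the matching encode/decode pair (helper names here are illustrative):

#include <assert.h>
#include <stdint.h>

static inline uint32_t pack_node_id(uint32_t byte_offset, uint32_t type)
{
   /* Offsets must be 64-byte aligned so (offset >> 3) keeps its low 3 bits free. */
   assert((byte_offset & 63) == 0 && type < 8);
   return (byte_offset >> 3) | type;
}

static inline uint32_t node_id_to_offset(uint32_t id)
{
   return (id & ~7u) << 3; /* strip the type, scale back to bytes */
}

static inline uint32_t node_id_to_type(uint32_t id)
{
   return id & 7;
}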
@@ -1,7 +1,24 @@
 /*
  * Copyright © 2022 Bas Nieuwenhuizen
  *
- * SPDX-License-Identifier: MIT
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */

 #version 460

@@ -18,9 +35,9 @@
 #extension GL_EXT_buffer_reference2 : require
 #extension GL_KHR_memory_scope_semantics : require

-layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+#include "vk_build_interface.h"

-#include "build_interface.h"
+layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in;

 TYPE(lbvh_node_info, 4);

@@ -36,8 +53,8 @@ main(void)
    uint32_t idx = global_id;

-   uint32_t previous_id = RADV_BVH_INVALID_NODE;
-   radv_aabb previous_bounds;
+   uint32_t previous_id = VK_BVH_INVALID_NODE;
+   vk_aabb previous_bounds;
    previous_bounds.min = vec3(INFINITY);
    previous_bounds.max = vec3(-INFINITY);

@@ -58,13 +75,13 @@ main(void)
       * parents, which is a requirement of the encoder.
       */
      uint32_t dst_idx =
-        atomicAdd(DEREF(REF(radv_ir_header)(args.header)).ir_internal_node_count, 1);
+        atomicAdd(DEREF(REF(vk_ir_header)(args.header)).ir_internal_node_count, 1);

-     uint32_t current_offset = args.internal_node_base + dst_idx * SIZEOF(radv_ir_box_node);
-     uint32_t current_id = pack_ir_node_id(current_offset, radv_ir_node_internal);
+     uint32_t current_offset = args.internal_node_base + dst_idx * SIZEOF(vk_ir_box_node);
+     uint32_t current_id = pack_ir_node_id(current_offset, vk_ir_node_internal);

-     REF(radv_ir_box_node) node = REF(radv_ir_box_node)(OFFSET(args.bvh, current_offset));
-     radv_aabb bounds = previous_bounds;
+     REF(vk_ir_box_node) node = REF(vk_ir_box_node)(OFFSET(args.bvh, current_offset));
+     vk_aabb bounds = previous_bounds;

      lbvh_node_info info = DEREF(INDEX(lbvh_node_info, args.node_info, idx));

@@ -78,10 +95,10 @@ main(void)
         previous_child_index = 1;

      if (previous_child_index == -1) {
-        if (children[0] != RADV_BVH_INVALID_NODE) {
+        if (children[0] != VK_BVH_INVALID_NODE) {
            uint32_t child_offset = ir_id_to_offset(children[0]);
-           REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset));
-           radv_aabb child_bounds = DEREF(child).aabb;
+           REF(vk_ir_node) child = REF(vk_ir_node)(OFFSET(args.bvh, child_offset));
+           vk_aabb child_bounds = DEREF(child).aabb;
            bounds.min = min(bounds.min, child_bounds.min);
            bounds.max = max(bounds.max, child_bounds.max);
         }

@@ -89,23 +106,23 @@ main(void)
      }

      /* Fetch the non-cached child */
-     if (children[1 - previous_child_index] != RADV_BVH_INVALID_NODE) {
+     if (children[1 - previous_child_index] != VK_BVH_INVALID_NODE) {
        uint32_t child_offset = ir_id_to_offset(children[1 - previous_child_index]);
-       REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset));
-       radv_aabb child_bounds = DEREF(child).aabb;
+       REF(vk_ir_node) child = REF(vk_ir_node)(OFFSET(args.bvh, child_offset));
+       vk_aabb child_bounds = DEREF(child).aabb;
        bounds.min = min(bounds.min, child_bounds.min);
        bounds.max = max(bounds.max, child_bounds.max);
      }

-     radv_ir_box_node node_value;
+     vk_ir_box_node node_value;

      node_value.base.aabb = bounds;
-     node_value.bvh_offset = RADV_UNKNOWN_BVH_OFFSET;
+     node_value.bvh_offset = VK_UNKNOWN_BVH_OFFSET;
      node_value.children = children;

      DEREF(node) = node_value;

-     if (info.parent == RADV_BVH_INVALID_NODE)
+     if (info.parent == VK_BVH_INVALID_NODE)
         break;

      idx = info.parent & ~LBVH_RIGHT_CHILD_BIT;
@@ -1,7 +1,24 @@
 /*
  * Copyright © 2022 Bas Nieuwenhuizen
  *
- * SPDX-License-Identifier: MIT
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */

 #version 460

@@ -17,9 +34,9 @@
 #extension GL_EXT_buffer_reference : require
 #extension GL_EXT_buffer_reference2 : require

-layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+#include "vk_build_interface.h"

-#include "build_interface.h"
+layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in;

 TYPE(lbvh_node_info, 4);

@@ -74,11 +91,11 @@ main()
 {
    if (args.id_count <= 1) {
       REF(lbvh_node_info) dst = REF(lbvh_node_info)(args.node_info);
-      DEREF(dst).parent = RADV_BVH_INVALID_NODE;
+      DEREF(dst).parent = VK_BVH_INVALID_NODE;
       DEREF(dst).path_count = 2;
       DEREF(dst).children[0] =
-         args.id_count == 1 ? DEREF(INDEX(key_id_pair, args.src_ids, 0)).id : RADV_BVH_INVALID_NODE;
-      DEREF(dst).children[1] = RADV_BVH_INVALID_NODE;
+         args.id_count == 1 ? DEREF(INDEX(key_id_pair, args.src_ids, 0)).id : VK_BVH_INVALID_NODE;
+      DEREF(dst).children[1] = VK_BVH_INVALID_NODE;
       return;
    }

@@ -136,5 +153,5 @@ main()
    DEREF(dst).children[0] = DEREF(INDEX(key_id_pair, args.src_ids, left)).id;
    DEREF(dst).children[1] = DEREF(INDEX(key_id_pair, args.src_ids, right)).id;
    if (id == 0)
-      DEREF(dst).parent = RADV_BVH_INVALID_NODE;
+      DEREF(dst).parent = VK_BVH_INVALID_NODE;
 }
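The walk in lbvh_generate_ir strips a child-slot flag from its parent links ("info.parent & ~LBVH_RIGHT_CHILD_BIT"), so each node can recover both its parent's index and its own slot from a single 32-bit link. A sketch of the packing this implies, assuming the flag occupies the top bit (its actual value lives in vk_build_interface.h, which this diff does not show):

#include <stdint.h>

#define LBVH_RIGHT_CHILD_BIT (1u << 31) /* assumed position */

static inline uint32_t make_parent_link(uint32_t parent_index, int is_right_child)
{
   return parent_index | (is_right_child ? LBVH_RIGHT_CHILD_BIT : 0u);
}

static inline uint32_t link_parent_index(uint32_t link)
{
   return link & ~LBVH_RIGHT_CHILD_BIT;
}

static inline int link_child_slot(uint32_t link) /* 0 = left, 1 = right */
{
   return (link & LBVH_RIGHT_CHILD_BIT) ? 1 : 0;
}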
250
src/vulkan/runtime/bvh/leaf.comp
Normal file
@@ -0,0 +1,250 @@
/*
 * Copyright © 2022 Konstantin Seurer
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#version 460

#extension GL_GOOGLE_include_directive : require

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_KHR_shader_subgroup_ballot : require

#include "vk_build_interface.h"

layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in;

layout(push_constant) uniform CONSTS {
   leaf_args args;
};

/* A GLSL-adapted copy of VkAccelerationStructureInstanceKHR. */
struct AccelerationStructureInstance {
   mat3x4 transform;
   uint32_t custom_instance_and_mask;
   uint32_t sbt_offset_and_flags;
   uint64_t accelerationStructureReference;
};
TYPE(AccelerationStructureInstance, 8);

bool
build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id)
{
   bool is_valid = true;
   triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id);

   triangle_vertices vertices = load_vertices(geom_data.data, indices, geom_data.vertex_format, geom_data.stride);

   /* An inactive triangle is one for which the first (X) component of any vertex is NaN. If any
    * other vertex component is NaN, and the first is not, the behavior is undefined. If the vertex
    * format does not have a NaN representation, then all triangles are considered active.
    */
   if (isnan(vertices.vertex[0].x) || isnan(vertices.vertex[1].x) || isnan(vertices.vertex[2].x))
#if ALWAYS_ACTIVE
      is_valid = false;
#else
      return false;
#endif

   if (geom_data.transform != NULL) {
      mat4 transform = mat4(1.0);

      for (uint32_t col = 0; col < 4; col++)
         for (uint32_t row = 0; row < 3; row++)
            transform[col][row] = DEREF(INDEX(float, geom_data.transform, col + row * 4));

      for (uint32_t i = 0; i < 3; i++)
         vertices.vertex[i] = transform * vertices.vertex[i];
   }

   REF(vk_ir_triangle_node) node = REF(vk_ir_triangle_node)(dst_ptr);

   bounds.min = vec3(INFINITY);
   bounds.max = vec3(-INFINITY);

   for (uint32_t coord = 0; coord < 3; coord++)
      for (uint32_t comp = 0; comp < 3; comp++) {
         DEREF(node).coords[coord][comp] = vertices.vertex[coord][comp];
         bounds.min[comp] = min(bounds.min[comp], vertices.vertex[coord][comp]);
         bounds.max[comp] = max(bounds.max[comp], vertices.vertex[coord][comp]);
      }

   DEREF(node).base.aabb = bounds;
   DEREF(node).triangle_id = global_id;
   DEREF(node).geometry_id_and_flags = geom_data.geometry_id;
   DEREF(node).id = 9;

   return is_valid;
}

bool
build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id)
{
   bool is_valid = true;
   REF(vk_ir_aabb_node) node = REF(vk_ir_aabb_node)(dst_ptr);

   for (uint32_t vec = 0; vec < 2; vec++)
      for (uint32_t comp = 0; comp < 3; comp++) {
         float coord = DEREF(INDEX(float, src_ptr, comp + vec * 3));

         if (vec == 0)
            bounds.min[comp] = coord;
         else
            bounds.max[comp] = coord;
      }

   /* An inactive AABB is one for which the minimum X coordinate is NaN. If any other component is
    * NaN, and the first is not, the behavior is undefined.
    */
   if (isnan(bounds.min.x))
#if ALWAYS_ACTIVE
      is_valid = false;
#else
      return false;
#endif

   DEREF(node).base.aabb = bounds;
   DEREF(node).primitive_id = global_id;
   DEREF(node).geometry_id_and_flags = geometry_id;

   return is_valid;
}

vk_aabb
calculate_instance_node_bounds(uint64_t base_ptr, mat3x4 otw_matrix)
{
   vk_aabb aabb;

   vk_aabb blas_aabb = DEREF(REF(vk_aabb)(base_ptr + BVH_BOUNDS_OFFSET));

   for (uint32_t comp = 0; comp < 3; ++comp) {
      aabb.min[comp] = otw_matrix[comp][3];
      aabb.max[comp] = otw_matrix[comp][3];
      for (uint32_t col = 0; col < 3; ++col) {
         aabb.min[comp] +=
            min(otw_matrix[comp][col] * blas_aabb.min[col], otw_matrix[comp][col] * blas_aabb.max[col]);
         aabb.max[comp] +=
            max(otw_matrix[comp][col] * blas_aabb.min[col], otw_matrix[comp][col] * blas_aabb.max[col]);
      }
   }
   return aabb;
}

bool
build_instance(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id)
{
   REF(vk_ir_instance_node) node = REF(vk_ir_instance_node)(dst_ptr);

   AccelerationStructureInstance instance = DEREF(REF(AccelerationStructureInstance)(src_ptr));

   /* An inactive instance is one whose acceleration structure handle is VK_NULL_HANDLE. Since the active terminology is
    * only relevant for BVH updates, which we do not implement, we can also skip instances with mask == 0.
    */
   if (instance.accelerationStructureReference == 0 || instance.custom_instance_and_mask < (1u << 24u))
      return false;

   DEREF(node).base_ptr = instance.accelerationStructureReference;

   mat4 transform = mat4(instance.transform);
   DEREF(node).otw_matrix = mat3x4(transform);

   bounds = calculate_instance_node_bounds(instance.accelerationStructureReference, mat3x4(transform));

   DEREF(node).base.aabb = bounds;
   DEREF(node).custom_instance_and_mask = instance.custom_instance_and_mask;
   DEREF(node).sbt_offset_and_flags = instance.sbt_offset_and_flags;
   DEREF(node).instance_id = global_id;

   return true;
}

void
main(void)
{
   uint32_t global_id = gl_GlobalInvocationID.x;
   uint32_t primitive_id = args.geom_data.first_id + global_id;

   REF(key_id_pair) id_ptr = INDEX(key_id_pair, args.ids, primitive_id);
   uint32_t src_offset = global_id * args.geom_data.stride;

   uint32_t dst_stride;
   uint32_t node_type;
   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
      dst_stride = SIZEOF(vk_ir_triangle_node);
      node_type = vk_ir_node_triangle;
   } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) {
      dst_stride = SIZEOF(vk_ir_aabb_node);
      node_type = vk_ir_node_aabb;
   } else {
      dst_stride = SIZEOF(vk_ir_instance_node);
      node_type = vk_ir_node_instance;
   }

   uint32_t dst_offset = primitive_id * dst_stride;
   VOID_REF dst_ptr = OFFSET(args.bvh, dst_offset);

   vk_aabb bounds;
   bool is_active;
   if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
      is_active = build_triangle(bounds, dst_ptr, args.geom_data, global_id);
   } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) {
      VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
      is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, global_id);
   } else {
      VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
      /* arrayOfPointers */
      if (args.geom_data.stride == 8) {
         src_ptr = DEREF(REF(VOID_REF)(src_ptr));
      }

      is_active = build_instance(bounds, src_ptr, dst_ptr, global_id);
   }

#if ALWAYS_ACTIVE
   if (!is_active && args.geom_data.geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) {
      bounds.min = vec3(0.0);
      bounds.max = vec3(0.0);
      is_active = true;
   }
#endif

   DEREF(id_ptr).id = is_active ? pack_ir_node_id(dst_offset, node_type) : VK_BVH_INVALID_NODE;

   uvec4 ballot = subgroupBallot(is_active);
   if (subgroupElect())
      atomicAdd(DEREF(args.header).active_leaf_count, subgroupBallotBitCount(ballot));

   atomicMin(DEREF(args.header).min_bounds[0], to_emulated_float(bounds.min.x));
   atomicMin(DEREF(args.header).min_bounds[1], to_emulated_float(bounds.min.y));
   atomicMin(DEREF(args.header).min_bounds[2], to_emulated_float(bounds.min.z));
   atomicMax(DEREF(args.header).max_bounds[0], to_emulated_float(bounds.max.x));
   atomicMax(DEREF(args.header).max_bounds[1], to_emulated_float(bounds.max.y));
   atomicMax(DEREF(args.header).max_bounds[2], to_emulated_float(bounds.max.z));
}
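The atomicMin/atomicMax calls at the end of main() reduce floating-point scene bounds through integer atomics, which is what the to_emulated_float/from_emulated_float helpers are for. One common order-preserving mapping looks like the sketch below; whether vk_bvh.h uses exactly this variant is not shown in the diff:

#include <stdint.h>
#include <string.h>

/* Map a float to a uint32_t whose unsigned ordering matches the float
 * ordering: negative values get all bits flipped, non-negative values get
 * the sign bit set. Integer atomicMin/atomicMax then compute float min/max. */
static inline uint32_t float_to_ordered(float f)
{
   uint32_t u;
   memcpy(&u, &f, sizeof(u));
   return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
}

static inline float ordered_to_float(uint32_t u)
{
   u = (u & 0x80000000u) ? (u ^ 0x80000000u) : ~u;
   float f;
   memcpy(&f, &u, sizeof(f));
   return f;
}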
81
src/vulkan/runtime/bvh/meson.build
Normal file
@@ -0,0 +1,81 @@
# Copyright © 2022 Konstantin Seurer

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# source file, output name, defines
bvh_shaders = [
  [
    'lbvh_generate_ir.comp',
    'lbvh_generate_ir',
    [],
  ],
  [
    'lbvh_main.comp',
    'lbvh_main',
    [],
  ],
  [
    'leaf.comp',
    'leaf',
    ['ALWAYS_ACTIVE=0'],
  ],
  [
    'leaf.comp',
    'leaf_always_active',
    ['ALWAYS_ACTIVE=1'],
  ],
  [
    'morton.comp',
    'morton',
    [],
  ],
  [
    'ploc_internal.comp',
    'ploc_internal',
    [],
  ],
]

vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh'

vk_bvh_includes = files(
  'vk_build_helpers.h',
  'vk_build_interface.h',
  'vk_bvh.h',
)

bvh_spv = []
foreach s : bvh_shaders
  command = [
    prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
  ] + (with_mesa_debug ? ['-g'] : [])
  command += glslang_quiet

  foreach define : s[2]
    command += '-D' + define
  endforeach

  bvh_spv += custom_target(
    s[1] + '.spv.h',
    input : s[0],
    output : s[1] + '.spv.h',
    command : command,
    depend_files: vk_bvh_includes
  )
endforeach
@@ -1,7 +1,24 @@
 /*
  * Copyright © 2022 Konstantin Seurer
  *
- * SPDX-License-Identifier: MIT
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */

 #version 460

@@ -17,9 +34,9 @@
 #extension GL_EXT_buffer_reference : require
 #extension GL_EXT_buffer_reference2 : require

-layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+#include "vk_build_interface.h"

-#include "build_interface.h"
+layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in;

 layout(push_constant) uniform CONSTS {
    morton_args args;

@@ -56,11 +73,11 @@ main(void)
    uint32_t id = DEREF(key_id).id;

    uint32_t key;
-   if (id != RADV_BVH_INVALID_NODE) {
-      radv_aabb bounds = DEREF(REF(radv_ir_node)OFFSET(args.bvh, ir_id_to_offset(id))).aabb;
+   if (id != VK_BVH_INVALID_NODE) {
+      vk_aabb bounds = DEREF(REF(vk_ir_node)OFFSET(args.bvh, ir_id_to_offset(id))).aabb;
       vec3 center = (bounds.min + bounds.max) * 0.5;

-      radv_aabb bvh_bounds;
+      vk_aabb bvh_bounds;
       bvh_bounds.min.x = from_emulated_float(DEREF(args.header).min_bounds[0]);
       bvh_bounds.min.y = from_emulated_float(DEREF(args.header).min_bounds[1]);
       bvh_bounds.min.z = from_emulated_float(DEREF(args.header).min_bounds[2]);
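The hunks above only show how morton.comp fetches the leaf centroid and the emulated-float scene bounds; the key computation itself falls outside the diff context. For reference, the conventional 30-bit Morton encode of a centroid normalized into [0,1]^3 looks like this sketch (not necessarily the exact bit count or bit tricks this shader uses):

#include <stdint.h>

/* Spread the low 10 bits of v so consecutive bits land 3 positions apart. */
static uint32_t expand_bits_10(uint32_t v)
{
   v = (v * 0x00010001u) & 0xFF0000FFu;
   v = (v * 0x00000101u) & 0x0F00F00Fu;
   v = (v * 0x00000011u) & 0xC30C30C3u;
   v = (v * 0x00000005u) & 0x49249249u;
   return v;
}

static uint32_t morton3d(float x, float y, float z) /* inputs in [0,1] */
{
   uint32_t xi = (uint32_t)(x * 1023.0f);
   uint32_t yi = (uint32_t)(y * 1023.0f);
   uint32_t zi = (uint32_t)(z * 1023.0f);
   return (expand_bits_10(xi) << 2) | (expand_bits_10(yi) << 1) | expand_bits_10(zi);
}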
@@ -1,7 +1,24 @@
 /*
  * Copyright © 2022 Bas Nieuwenhuizen
  *
- * SPDX-License-Identifier: MIT
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */

 #version 460

@@ -24,7 +41,7 @@
 layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in;

 #define USE_GLOBAL_SYNC
-#include "build_interface.h"
+#include "vk_build_interface.h"

 TYPE(ploc_prefix_scan_partition, 4);

@@ -34,7 +51,8 @@ layout(push_constant) uniform CONSTS
 };

 shared uint32_t exclusive_prefix_sum;
-shared uint32_t aggregate_sums[PLOC_WORKGROUP_SIZE / 64];
+shared uint32_t aggregate_sums[PLOC_SUBGROUPS_PER_WORKGROUP];
+shared uint32_t aggregate_sums2[PLOC_SUBGROUPS_PER_WORKGROUP];

 /*
  * Global prefix scan over all workgroups to find out the index of the collapsed node to write.

@@ -45,8 +63,7 @@
 uint32_t
 prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t task_index)
 {
    if (gl_LocalInvocationIndex == 0) {
-      /* Temporary copy of exclusive_prefix_sum to avoid reading+writing LDS each addition */
-      uint32_t local_exclusive_prefix_sum = 0;
+      exclusive_prefix_sum = 0;
       if (task_index >= gl_WorkGroupSize.x) {
          REF(ploc_prefix_scan_partition) current_partition =
            REF(ploc_prefix_scan_partition)(INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x));

@@ -58,28 +75,55 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t
             if (atomicLoad(DEREF(previous_partition).inclusive_sum, gl_ScopeDevice,
                            gl_StorageSemanticsBuffer,
                            gl_SemanticsAcquire | gl_SemanticsMakeVisible) != 0xFFFFFFFF) {
-               local_exclusive_prefix_sum += DEREF(previous_partition).inclusive_sum;
+               atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).inclusive_sum);
               break;
            } else {
-              local_exclusive_prefix_sum += DEREF(previous_partition).aggregate;
+              atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).aggregate);
              previous_partition -= 1;
           }
        }
        /* Set the inclusive sum for the next workgroups */
        atomicStore(DEREF(current_partition).inclusive_sum,
-                   DEREF(current_partition).aggregate + local_exclusive_prefix_sum, gl_ScopeDevice,
+                   DEREF(current_partition).aggregate + exclusive_prefix_sum, gl_ScopeDevice,
                    gl_StorageSemanticsBuffer, gl_SemanticsRelease | gl_SemanticsMakeAvailable);
      }
-     exclusive_prefix_sum = local_exclusive_prefix_sum;
   }

   if (subgroupElect())
      aggregate_sums[gl_SubgroupID] = subgroupBallotBitCount(ballot);
   barrier();

-  if (gl_LocalInvocationID.x < PLOC_WORKGROUP_SIZE / 64) {
-     aggregate_sums[gl_LocalInvocationID.x] =
-        exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]);
+  if (PLOC_SUBGROUPS_PER_WORKGROUP <= SUBGROUP_SIZE) {
+     if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) {
+        aggregate_sums[gl_LocalInvocationID.x] =
+           exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]);
+     }
+  } else {
+     /* If the length of aggregate_sums[] is larger than SUBGROUP_SIZE,
+      * the prefix scan can't be done simply by subgroupExclusiveAdd.
+      */
+     if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP)
+        aggregate_sums2[gl_LocalInvocationID.x] = aggregate_sums[gl_LocalInvocationID.x];
+     barrier();
+
+     /* Hillis Steele inclusive scan on aggregate_sums2 */
+     for (uint32_t stride = 1; stride < PLOC_SUBGROUPS_PER_WORKGROUP; stride *= 2) {
+        uint32_t value = 0;
+        if (gl_LocalInvocationID.x >= stride && gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP)
+           value = aggregate_sums2[gl_LocalInvocationID.x - stride];
+        barrier();
+        if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP)
+           aggregate_sums2[gl_LocalInvocationID.x] += value;
+        barrier();
+     }
+
+     /* Adapt to exclusive and add the prefix_sum from previous workgroups */
+     if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) {
+        if (gl_LocalInvocationID.x == 0)
+           aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum;
+        else
+           aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum + aggregate_sums2[gl_LocalInvocationID.x - 1];
+     }
+  }
   barrier();

@@ -90,20 +134,20 @@
 #define BVH_LEVEL_COST 0.2

 uint32_t
-push_node(uint32_t children[2], radv_aabb bounds[2])
+push_node(uint32_t children[2], vk_aabb bounds[2])
 {
    uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1);
-   uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node);
-   uint32_t dst_id = pack_ir_node_id(dst_offset, radv_ir_node_internal);
-   REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset));
+   uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node);
+   uint32_t dst_id = pack_ir_node_id(dst_offset, vk_ir_node_internal);
+   REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset));

-   radv_aabb total_bounds;
+   vk_aabb total_bounds;
    total_bounds.min = vec3(INFINITY);
    total_bounds.max = vec3(-INFINITY);

    for (uint i = 0; i < 2; ++i) {
       VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i]));
-      REF(radv_ir_node) child = REF(radv_ir_node)(node);
+      REF(vk_ir_node) child = REF(vk_ir_node)(node);

       total_bounds.min = min(total_bounds.min, bounds[i].min);
       total_bounds.max = max(total_bounds.max, bounds[i].max);

@@ -112,7 +156,7 @@ push_node(uint32_t children[2], radv_aabb bounds[2])
    }

    DEREF(dst_node).base.aabb = total_bounds;
-   DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET;
+   DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET;
    return dst_id;
 }

@@ -136,7 +180,7 @@ decode_neighbour_offset(uint32_t encoded_offset)

 #define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD

-shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
+shared vk_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
 shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS];

 uint32_t

@@ -155,11 +199,11 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base,
    for (uint32_t i = task_index - 2 * neighbourhood_overlap; i < search_bound;
         i += gl_WorkGroupSize.x) {
       uint32_t id = load_id(ids, iter, i);
-      if (id == RADV_BVH_INVALID_NODE)
+      if (id == VK_BVH_INVALID_NODE)
          continue;

       VOID_REF addr = OFFSET(args.bvh, ir_id_to_offset(id));
-      REF(radv_ir_node) node = REF(radv_ir_node)(addr);
+      REF(vk_ir_node) node = REF(vk_ir_node)(addr);

       shared_bounds[i - lds_base] = DEREF(node).aabb;
    }

@@ -168,7 +212,7 @@
 float
 combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
 {
-   radv_aabb combined_bounds;
+   vk_aabb combined_bounds;
    combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min);
    combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max);
    return aabb_surface_area(combined_bounds);

@@ -187,10 +231,10 @@ main(void)
    if (DEREF(args.header).active_leaf_count <= 2) {
       if (gl_GlobalInvocationID.x == 0) {
          uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1);
-         uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node);
-         REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset));
+         uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node);
+         REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset));

-         radv_aabb total_bounds;
+         vk_aabb total_bounds;
          total_bounds.min = vec3(INFINITY);
          total_bounds.max = vec3(-INFINITY);

@@ -198,10 +242,10 @@ main(void)
          for (; i < DEREF(args.header).active_leaf_count; i++) {
             uint32_t child_id = DEREF(INDEX(key_id_pair, src_ids, i)).id;

-            if (child_id != RADV_BVH_INVALID_NODE) {
+            if (child_id != VK_BVH_INVALID_NODE) {
               VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(child_id));
-              REF(radv_ir_node) child = REF(radv_ir_node)(node);
-              radv_aabb bounds = DEREF(child).aabb;
+              REF(vk_ir_node) child = REF(vk_ir_node)(node);
+              vk_aabb bounds = DEREF(child).aabb;

               total_bounds.min = min(total_bounds.min, bounds.min);
               total_bounds.max = max(total_bounds.max, bounds.max);

@@ -210,10 +254,10 @@ main(void)
            DEREF(dst_node).children[i] = child_id;
         }
         for (; i < 2; i++)
-           DEREF(dst_node).children[i] = RADV_BVH_INVALID_NODE;
+           DEREF(dst_node).children[i] = VK_BVH_INVALID_NODE;

         DEREF(dst_node).base.aabb = total_bounds;
-        DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET;
+        DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET;
      }
      return;
   }

@@ -329,11 +373,11 @@ main(void)
      if (task_index < neighbour_index) {
         uint32_t neighbour_id = load_id(src_ids, iter, neighbour_index);
         uint32_t children[2] = {id, neighbour_id};
-        radv_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]};
+        vk_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]};

         DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = push_node(children, bounds);
         DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, neighbour_index))) =
-           RADV_BVH_INVALID_NODE;
+           VK_BVH_INVALID_NODE;
      } else {
         /* We still store in the other case so we don't destroy the node id needed to
          * create the internal node */

@@ -381,14 +425,14 @@ main(void)
      uint32_t id = task_index < current_task_count
                       ? DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index)))
-                      : RADV_BVH_INVALID_NODE;
-     uvec4 ballot = subgroupBallot(id != RADV_BVH_INVALID_NODE);
+                      : VK_BVH_INVALID_NODE;
+     uvec4 ballot = subgroupBallot(id != VK_BVH_INVALID_NODE);

      uint32_t new_offset = prefix_scan(ballot, partitions, task_index);
      if (task_index >= current_task_count)
         continue;

-     if (id != RADV_BVH_INVALID_NODE) {
+     if (id != VK_BVH_INVALID_NODE) {
        DEREF(REF(uint32_t)(INDEX(uint32_t, src_ids, new_offset))) = id;
        ++new_offset;
     }
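The fallback path added to prefix_scan() above is a textbook Hillis-Steele inclusive scan in LDS, with barrier() separating the read and write phases of each pass. A serial C model of the same access pattern (the double buffer here stands in for the barrier):

#include <stdint.h>
#include <string.h>

static void hillis_steele_inclusive(uint32_t *a, uint32_t n, uint32_t *tmp /* n entries */)
{
   for (uint32_t stride = 1; stride < n; stride *= 2) {
      /* Every element adds the value `stride` slots to its left, in lockstep. */
      for (uint32_t i = 0; i < n; i++)
         tmp[i] = (i >= stride) ? a[i] + a[i - stride] : a[i];
      memcpy(a, tmp, n * sizeof(*a));
   }
}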
522
src/vulkan/runtime/bvh/vk_build_helpers.h
Normal file
@ -0,0 +1,522 @@
|
|||
/*
|
||||
* Copyright © 2022 Konstantin Seurer
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef VK_BVH_BUILD_HELPERS_H
|
||||
#define VK_BVH_BUILD_HELPERS_H
|
#include "vk_bvh.h"

#define VK_FORMAT_UNDEFINED 0
#define VK_FORMAT_R4G4_UNORM_PACK8 1
#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2
#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3
#define VK_FORMAT_R5G6B5_UNORM_PACK16 4
#define VK_FORMAT_B5G6R5_UNORM_PACK16 5
#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6
#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7
#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8
#define VK_FORMAT_R8_UNORM 9
#define VK_FORMAT_R8_SNORM 10
#define VK_FORMAT_R8_USCALED 11
#define VK_FORMAT_R8_SSCALED 12
#define VK_FORMAT_R8_UINT 13
#define VK_FORMAT_R8_SINT 14
#define VK_FORMAT_R8_SRGB 15
#define VK_FORMAT_R8G8_UNORM 16
#define VK_FORMAT_R8G8_SNORM 17
#define VK_FORMAT_R8G8_USCALED 18
#define VK_FORMAT_R8G8_SSCALED 19
#define VK_FORMAT_R8G8_UINT 20
#define VK_FORMAT_R8G8_SINT 21
#define VK_FORMAT_R8G8_SRGB 22
#define VK_FORMAT_R8G8B8_UNORM 23
#define VK_FORMAT_R8G8B8_SNORM 24
#define VK_FORMAT_R8G8B8_USCALED 25
#define VK_FORMAT_R8G8B8_SSCALED 26
#define VK_FORMAT_R8G8B8_UINT 27
#define VK_FORMAT_R8G8B8_SINT 28
#define VK_FORMAT_R8G8B8_SRGB 29
#define VK_FORMAT_B8G8R8_UNORM 30
#define VK_FORMAT_B8G8R8_SNORM 31
#define VK_FORMAT_B8G8R8_USCALED 32
#define VK_FORMAT_B8G8R8_SSCALED 33
#define VK_FORMAT_B8G8R8_UINT 34
#define VK_FORMAT_B8G8R8_SINT 35
#define VK_FORMAT_B8G8R8_SRGB 36
#define VK_FORMAT_R8G8B8A8_UNORM 37
#define VK_FORMAT_R8G8B8A8_SNORM 38
#define VK_FORMAT_R8G8B8A8_USCALED 39
#define VK_FORMAT_R8G8B8A8_SSCALED 40
#define VK_FORMAT_R8G8B8A8_UINT 41
#define VK_FORMAT_R8G8B8A8_SINT 42
#define VK_FORMAT_R8G8B8A8_SRGB 43
#define VK_FORMAT_B8G8R8A8_UNORM 44
#define VK_FORMAT_B8G8R8A8_SNORM 45
#define VK_FORMAT_B8G8R8A8_USCALED 46
#define VK_FORMAT_B8G8R8A8_SSCALED 47
#define VK_FORMAT_B8G8R8A8_UINT 48
#define VK_FORMAT_B8G8R8A8_SINT 49
#define VK_FORMAT_B8G8R8A8_SRGB 50
#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51
#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52
#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53
#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54
#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55
#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56
#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57
#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58
#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59
#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60
#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61
#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62
#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63
#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64
#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65
#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66
#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67
#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68
#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69
#define VK_FORMAT_R16_UNORM 70
#define VK_FORMAT_R16_SNORM 71
#define VK_FORMAT_R16_USCALED 72
#define VK_FORMAT_R16_SSCALED 73
#define VK_FORMAT_R16_UINT 74
#define VK_FORMAT_R16_SINT 75
#define VK_FORMAT_R16_SFLOAT 76
#define VK_FORMAT_R16G16_UNORM 77
#define VK_FORMAT_R16G16_SNORM 78
#define VK_FORMAT_R16G16_USCALED 79
#define VK_FORMAT_R16G16_SSCALED 80
#define VK_FORMAT_R16G16_UINT 81
#define VK_FORMAT_R16G16_SINT 82
#define VK_FORMAT_R16G16_SFLOAT 83
#define VK_FORMAT_R16G16B16_UNORM 84
#define VK_FORMAT_R16G16B16_SNORM 85
#define VK_FORMAT_R16G16B16_USCALED 86
#define VK_FORMAT_R16G16B16_SSCALED 87
#define VK_FORMAT_R16G16B16_UINT 88
#define VK_FORMAT_R16G16B16_SINT 89
#define VK_FORMAT_R16G16B16_SFLOAT 90
#define VK_FORMAT_R16G16B16A16_UNORM 91
#define VK_FORMAT_R16G16B16A16_SNORM 92
#define VK_FORMAT_R16G16B16A16_USCALED 93
#define VK_FORMAT_R16G16B16A16_SSCALED 94
#define VK_FORMAT_R16G16B16A16_UINT 95
#define VK_FORMAT_R16G16B16A16_SINT 96
#define VK_FORMAT_R16G16B16A16_SFLOAT 97
#define VK_FORMAT_R32_UINT 98
#define VK_FORMAT_R32_SINT 99
#define VK_FORMAT_R32_SFLOAT 100
#define VK_FORMAT_R32G32_UINT 101
#define VK_FORMAT_R32G32_SINT 102
#define VK_FORMAT_R32G32_SFLOAT 103
#define VK_FORMAT_R32G32B32_UINT 104
#define VK_FORMAT_R32G32B32_SINT 105
#define VK_FORMAT_R32G32B32_SFLOAT 106
#define VK_FORMAT_R32G32B32A32_UINT 107
#define VK_FORMAT_R32G32B32A32_SINT 108
#define VK_FORMAT_R32G32B32A32_SFLOAT 109
#define VK_FORMAT_R64_UINT 110
#define VK_FORMAT_R64_SINT 111
#define VK_FORMAT_R64_SFLOAT 112
#define VK_FORMAT_R64G64_UINT 113
#define VK_FORMAT_R64G64_SINT 114
#define VK_FORMAT_R64G64_SFLOAT 115
#define VK_FORMAT_R64G64B64_UINT 116
#define VK_FORMAT_R64G64B64_SINT 117
#define VK_FORMAT_R64G64B64_SFLOAT 118
#define VK_FORMAT_R64G64B64A64_UINT 119
#define VK_FORMAT_R64G64B64A64_SINT 120
#define VK_FORMAT_R64G64B64A64_SFLOAT 121

#define VK_INDEX_TYPE_UINT16 0
#define VK_INDEX_TYPE_UINT32 1
#define VK_INDEX_TYPE_NONE_KHR 1000165000
#define VK_INDEX_TYPE_UINT8_EXT 1000265000

#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0
#define VK_GEOMETRY_TYPE_AABBS_KHR 1
#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2

#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1
#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2
#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4
#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8

#define TYPE(type, align) \
   layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref \
   { \
      type value; \
   };

#define REF(type) type##_ref
#define VOID_REF uint64_t
#define NULL 0
#define DEREF(var) var.value

#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1))

#define OFFSET(ptr, offset) (uint64_t(ptr) + offset)

#define INFINITY (1.0 / 0.0)
#define NAN (0.0 / 0.0)

#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))
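/* These macros emulate typed pointers on top of GL_EXT_buffer_reference:
 * REF(T) is a buffer reference to a single T, DEREF loads/stores through
 * it, and INDEX(T, ptr, i) forms a reference to the i-th T at a raw
 * 64-bit address. Illustrative use (not from this file):
 *    uint32_t key = DEREF(INDEX(uint32_t, keys_addr, gl_GlobalInvocationID.x));
 */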

TYPE(int8_t, 1);
TYPE(uint8_t, 1);
TYPE(int16_t, 2);
TYPE(uint16_t, 2);
TYPE(int32_t, 4);
TYPE(uint32_t, 4);
TYPE(int64_t, 8);
TYPE(uint64_t, 8);

TYPE(float, 4);

TYPE(vec2, 4);
TYPE(vec3, 4);
TYPE(vec4, 4);

TYPE(uvec4, 16);

TYPE(VOID_REF, 8);

/* copied from u_math.h */
uint32_t
align(uint32_t value, uint32_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}
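/* align() rounds up to the next multiple of a power-of-two alignment,
 * e.g. align(13, 8) == 16 and align(16, 8) == 16. */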

int32_t
to_emulated_float(float f)
{
   int32_t bits = floatBitsToInt(f);
   return f < 0 ? -2147483648 - bits : bits;
}

float
from_emulated_float(int32_t bits)
{
   return intBitsToFloat(bits < 0 ? -2147483648 - bits : bits);
}
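/* to_emulated_float() maps a float to an int32 whose signed-integer
 * ordering matches the float ordering: non-negative floats keep their
 * (already monotonic) bit pattern, while negative floats are reflected
 * via -2147483648 - bits. Bounds encoded this way can be reduced with
 * plain integer atomicMin/atomicMax; this is presumably why vk_ir_header
 * stores min_bounds/max_bounds as int32_t. from_emulated_float() inverts
 * the mapping. */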

TYPE(vk_aabb, 4);

struct key_id_pair {
   uint32_t id;
   uint32_t key;
};
TYPE(key_id_pair, 4);

TYPE(vk_accel_struct_serialization_header, 8);

TYPE(vk_ir_header, 4);
TYPE(vk_ir_node, 4);
TYPE(vk_ir_box_node, 4);
TYPE(vk_ir_triangle_node, 4);
TYPE(vk_ir_aabb_node, 4);
TYPE(vk_ir_instance_node, 8);

TYPE(vk_global_sync_data, 4);

uint32_t
ir_id_to_offset(uint32_t id)
{
   return id & (~3u);
}

uint32_t
ir_id_to_type(uint32_t id)
{
   return id & 3u;
}

uint32_t
pack_ir_node_id(uint32_t offset, uint32_t type)
{
   return offset | type;
}
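/* IR node ids pack a 4-byte-aligned node offset with a 2-bit type tag
 * (vk_ir_node_triangle/internal/instance/aabb from vk_bvh.h) in the low
 * bits, e.g. pack_ir_node_id(64, vk_ir_node_internal) == 65. */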

float
aabb_surface_area(vk_aabb aabb)
{
   vec3 diagonal = aabb.max - aabb.min;
   return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z;
}
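/* 2 * (dx*dy + dy*dz + dx*dz) is the total area of the box's six faces,
 * i.e. the quantity that surface-area-heuristic style builders minimize. */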

/* Just a wrapper for 3 uints. */
struct triangle_indices {
   uint32_t index[3];
};

triangle_indices
load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id)
{
   triangle_indices result;

   uint32_t index_base = global_id * 3;

   switch (index_format) {
   case VK_INDEX_TYPE_UINT16: {
      result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2));
      break;
   }
   case VK_INDEX_TYPE_UINT32: {
      result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2));
      break;
   }
   case VK_INDEX_TYPE_NONE_KHR: {
      result.index[0] = index_base + 0;
      result.index[1] = index_base + 1;
      result.index[2] = index_base + 2;
      break;
   }
   case VK_INDEX_TYPE_UINT8_EXT: {
      result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2));
      break;
   }
   }

   return result;
}
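/* Note: VK_INDEX_TYPE_NONE_KHR above means the geometry is non-indexed,
 * so consecutive vertices (index_base + 0/1/2) form each triangle. */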

/* Just a wrapper for 3 vec4s. */
struct triangle_vertices {
   vec4 vertex[3];
};

TYPE(float16_t, 2);

triangle_vertices
load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride)
{
   triangle_vertices result;

   for (uint32_t i = 0; i < 3; i++) {
      VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride);
      vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0);

      switch (vertex_format) {
      case VK_FORMAT_R32G32_SFLOAT:
         vertex.x = DEREF(INDEX(float, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float, vertex_ptr, 1));
         break;
      case VK_FORMAT_R32G32B32_SFLOAT:
      case VK_FORMAT_R32G32B32A32_SFLOAT:
         vertex.x = DEREF(INDEX(float, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float, vertex_ptr, 1));
         vertex.z = DEREF(INDEX(float, vertex_ptr, 2));
         break;
      case VK_FORMAT_R16G16_SFLOAT:
         vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1));
         break;
      case VK_FORMAT_R16G16B16_SFLOAT:
      case VK_FORMAT_R16G16B16A16_SFLOAT:
         vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1));
         vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2));
         break;
      case VK_FORMAT_R16G16_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF));
         vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF));
         break;
      case VK_FORMAT_R16G16B16A16_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF));
         vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF));
         vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF));
         break;
      case VK_FORMAT_R8G8_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F));
         vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F));
         break;
      case VK_FORMAT_R8G8B8A8_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F));
         vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F));
         vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F));
         break;
      case VK_FORMAT_R16G16_UNORM:
         vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF);
         vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF);
         break;
      case VK_FORMAT_R16G16B16A16_UNORM:
         vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF);
         vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF);
         vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF);
         break;
      case VK_FORMAT_R8G8_UNORM:
         vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF);
         vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF);
         break;
      case VK_FORMAT_R8G8B8A8_UNORM:
         vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF);
         vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF);
         vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF);
         break;
      case VK_FORMAT_A2B10G10R10_UNORM_PACK32: {
         uint32_t data = DEREF(REF(uint32_t)(vertex_ptr));
         vertex.x = float(data & 0x3FF) / 0x3FF;
         vertex.y = float((data >> 10) & 0x3FF) / 0x3FF;
         vertex.z = float((data >> 20) & 0x3FF) / 0x3FF;
         break;
      }
      }

      result.vertex[i] = vertex;
   }

   return result;
}
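/* Missing components default to vec4(0, 0, 0, 1). SNORM values follow the
 * Vulkan conversion rule: divide by the type's maximum and clamp to -1.0,
 * so that e.g. the int16 minimum -32768 still maps to -1.0. */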

/** Compute ceiling of integer quotient of A divided by B.
 * From macros.h */
#define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B))
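/* e.g. DIV_ROUND_UP(10, 4) == 3, DIV_ROUND_UP(8, 4) == 2. */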

#ifdef USE_GLOBAL_SYNC

/* There might be more invocations available than tasks to do.
 * In that case, the fetched task index is greater than the
 * counter offset for the next phase. To avoid out-of-bounds
 * accesses, phases will be skipped until the task index is
 * in-bounds again. */
uint32_t num_tasks_to_skip = 0;
uint32_t phase_index = 0;
bool should_skip = false;
shared uint32_t global_task_index;

shared uint32_t shared_phase_index;

uint32_t
task_count(REF(vk_ir_header) header)
{
   uint32_t phase_index = DEREF(header).sync_data.phase_index;
   return DEREF(header).sync_data.task_counts[phase_index & 1];
}

/* Sets the task count for the next phase. */
void
set_next_task_count(REF(vk_ir_header) header, uint32_t new_count)
{
   uint32_t phase_index = DEREF(header).sync_data.phase_index;
   DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count;
}

/*
 * This function has two main objectives:
 * Firstly, it partitions pending work among free invocations.
 * Secondly, it guarantees global synchronization between different phases.
 *
 * After every call to fetch_task, a new task index is returned.
 * fetch_task will also set num_tasks_to_skip. Use should_execute_phase
 * to determine if the current phase should be executed or skipped.
 *
 * Since tasks are assigned per-workgroup, the task index may be greater
 * than the total task count.
 */
uint32_t
fetch_task(REF(vk_ir_header) header, bool did_work)
{
   /* Perform a memory + control barrier for all buffer writes for the entire workgroup.
    * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished
    * and their results are written to memory. */
   controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer,
                  gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
   if (gl_LocalInvocationIndex == 0) {
      if (did_work)
         atomicAdd(DEREF(header).sync_data.task_done_counter, 1);
      global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1);

      do {
         /* Perform a memory barrier to refresh the current phase's end counter, in case
          * another workgroup changed it. */
         memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                       gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

         /* The first invocation of the first workgroup in a new phase is responsible for initiating
          * the switch to a new phase. It is only possible to switch to a new phase if all tasks of
          * the previous phase have been completed. Switching to a new phase and incrementing the
          * phase end counter in turn notifies all invocations for that phase that it is safe to
          * execute.
          */
         if (global_task_index == DEREF(header).sync_data.current_phase_end_counter &&
             DEREF(header).sync_data.task_done_counter == DEREF(header).sync_data.current_phase_end_counter) {
            if (DEREF(header).sync_data.next_phase_exit_flag != 0) {
               DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID;
               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
            } else {
               atomicAdd(DEREF(header).sync_data.phase_index, 1);
               DEREF(header).sync_data.current_phase_start_counter = DEREF(header).sync_data.current_phase_end_counter;
               /* Ensure the changes to the phase index and start/end counter are visible to other
                * workgroups waiting in the loop. */
               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
               atomicAdd(DEREF(header).sync_data.current_phase_end_counter,
                         DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x));
            }
            break;
         }

         /* If other invocations have finished all nodes, break out; there is no work to do. */
         if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) {
            break;
         }
      } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter);

      shared_phase_index = DEREF(header).sync_data.phase_index;
   }

   barrier();
   if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID)
      return TASK_INDEX_INVALID;

   num_tasks_to_skip = shared_phase_index - phase_index;

   uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter;
   return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x;
}

bool
should_execute_phase()
{
   if (num_tasks_to_skip > 0) {
      /* Skip to next phase. */
      ++phase_index;
      --num_tasks_to_skip;
      return false;
   }
   return true;
}

#define PHASE(header) \
   for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true))
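/* Illustrative usage (a sketch, not code from this header): a builder
 * shader declares a `task_index` variable, seeds it with fetch_task(),
 * and lists its phases as consecutive PHASE() blocks; fetch_task() then
 * moves all invocations through the phases in lock-step.
 *
 *    uint32_t task_index = fetch_task(header, false);
 *    PHASE(header) { ... first pass, one task per iteration ... }
 *    PHASE(header) { ... second pass ... }
 */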
#endif

#endif

103 src/vulkan/runtime/bvh/vk_build_interface.h Normal file
@ -0,0 +1,103 @@
/*
 * Copyright © 2022 Konstantin Seurer
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef VK_BVH_BUILD_INTERFACE_H
#define VK_BVH_BUILD_INTERFACE_H

#ifdef VULKAN
#include "vk_build_helpers.h"
#else
#include <stdint.h>
#include "vk_bvh.h"
#define REF(type) uint64_t
#define VOID_REF uint64_t
#endif

#define SUBGROUP_SIZE_ID 0
#define BVH_BOUNDS_OFFSET_ID 1
#ifdef VULKAN
layout (constant_id = SUBGROUP_SIZE_ID) const int SUBGROUP_SIZE = 64;
layout (constant_id = BVH_BOUNDS_OFFSET_ID) const int BVH_BOUNDS_OFFSET = 0;
#endif

struct leaf_args {
   VOID_REF bvh;
   REF(vk_ir_header) header;
   REF(key_id_pair) ids;

   vk_bvh_geometry_data geom_data;
};

struct morton_args {
   VOID_REF bvh;
   REF(vk_ir_header) header;
   REF(key_id_pair) ids;
};

#define LBVH_RIGHT_CHILD_BIT_SHIFT 29
#define LBVH_RIGHT_CHILD_BIT (1 << LBVH_RIGHT_CHILD_BIT_SHIFT)
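/* Links in the LBVH passes reserve bit 29 as a right-child marker,
 * leaving the low 29 bits for the node index itself. */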

struct lbvh_node_info {
   /* Number of children that have been processed (or are invalid/leaves) in
    * the lbvh_generate_ir pass.
    */
   uint32_t path_count;

   uint32_t children[2];
   uint32_t parent;
};

struct lbvh_main_args {
   VOID_REF bvh;
   REF(key_id_pair) src_ids;
   VOID_REF node_info;
   uint32_t id_count;
   uint32_t internal_node_base;
};

struct lbvh_generate_ir_args {
   VOID_REF bvh;
   VOID_REF node_info;
   VOID_REF header;
   uint32_t internal_node_base;
};

struct ploc_prefix_scan_partition {
   uint32_t aggregate;
   uint32_t inclusive_sum;
};

#define PLOC_WORKGROUP_SIZE 1024
#define PLOC_SUBGROUPS_PER_WORKGROUP \
   (DIV_ROUND_UP(PLOC_WORKGROUP_SIZE, SUBGROUP_SIZE))
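/* With the default SUBGROUP_SIZE of 64 this is 1024 / 64 = 16 subgroups
 * per PLOC workgroup. */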

struct ploc_args {
   VOID_REF bvh;
   VOID_REF prefix_scan_partitions;
   REF(vk_ir_header) header;
   VOID_REF ids_0;
   VOID_REF ids_1;
   uint32_t internal_node_offset;
};

#endif

156 src/vulkan/runtime/bvh/vk_bvh.h Normal file
@ -0,0 +1,156 @@
/*
 * Copyright © 2021 Bas Nieuwenhuizen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BVH_VK_BVH_H
#define BVH_VK_BVH_H

#define vk_ir_node_triangle 0
#define vk_ir_node_internal 1
#define vk_ir_node_instance 2
#define vk_ir_node_aabb 3

#define VK_GEOMETRY_OPAQUE (1u << 31)

#ifdef VULKAN
#define VK_UUID_SIZE 16
#else
#include <vulkan/vulkan.h>
typedef struct vk_ir_node vk_ir_node;
typedef struct vk_global_sync_data vk_global_sync_data;
typedef struct vk_bvh_geometry_data vk_bvh_geometry_data;

typedef struct {
   float values[3][4];
} mat3x4;

typedef struct {
   float x;
   float y;
   float z;
} vec3;

typedef struct vk_aabb vk_aabb;
#endif

struct vk_aabb {
   vec3 min;
   vec3 max;
};

/* This is the header structure for serialized acceleration structures, as
 * defined by the Vulkan spec.
 */
struct vk_accel_struct_serialization_header {
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t accel_struct_compat[VK_UUID_SIZE];
   uint64_t serialization_size;
   uint64_t deserialization_size;
   uint64_t instance_count;
#ifndef VULKAN
   uint64_t instances[];
#endif
};

struct vk_global_sync_data {
   uint32_t task_counts[2];
   uint32_t task_started_counter;
   uint32_t task_done_counter;
   uint32_t current_phase_start_counter;
   uint32_t current_phase_end_counter;
   uint32_t phase_index;
   /* If this flag is set, the shader should exit
    * instead of executing another phase */
   uint32_t next_phase_exit_flag;
};
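/* task_counts[] is double-buffered: task_count() reads the entry for the
 * current phase (phase_index & 1) while set_next_task_count() writes the
 * other entry for the next phase (see vk_build_helpers.h). */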

struct vk_ir_header {
   int32_t min_bounds[3];
   int32_t max_bounds[3];
   uint32_t active_leaf_count;
   /* Indirect dispatch dimensions for the encoder.
    * ir_internal_node_count is the thread count in the X dimension,
    * while Y and Z are always set to 1. */
   uint32_t ir_internal_node_count;
   uint32_t dispatch_size_y;
   uint32_t dispatch_size_z;
   vk_global_sync_data sync_data;
   uint32_t dst_node_offset;
};
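/* min_bounds/max_bounds are stored in the to_emulated_float() encoding
 * (see vk_build_helpers.h), presumably so they can be updated with integer
 * atomicMin/atomicMax while leaves are processed. */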

struct vk_ir_node {
   vk_aabb aabb;
};

#define VK_UNKNOWN_BVH_OFFSET 0xFFFFFFFF
#define VK_NULL_BVH_OFFSET 0xFFFFFFFE

struct vk_ir_box_node {
   vk_ir_node base;
   uint32_t children[2];
   uint32_t bvh_offset;
};

struct vk_ir_aabb_node {
   vk_ir_node base;
   uint32_t primitive_id;
   uint32_t geometry_id_and_flags;
};

struct vk_ir_triangle_node {
   vk_ir_node base;
   float coords[3][3];
   uint32_t triangle_id;
   uint32_t id;
   uint32_t geometry_id_and_flags;
};

struct vk_ir_instance_node {
   vk_ir_node base;
   /* See radv_bvh_instance_node */
   uint64_t base_ptr;
   uint32_t custom_instance_and_mask;
   uint32_t sbt_offset_and_flags;
   mat3x4 otw_matrix;
   uint32_t instance_id;
};

#define VK_BVH_INVALID_NODE 0xFFFFFFFF

/* If the task index is set to this value, there is no
 * more work to do. */
#define TASK_INDEX_INVALID 0xFFFFFFFF

struct vk_bvh_geometry_data {
   uint64_t data;
   uint64_t indices;
   uint64_t transform;

   uint32_t geometry_id;
   uint32_t geometry_type;
   uint32_t first_id;
   uint32_t stride;
   uint32_t vertex_format;
   uint32_t index_format;
};
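/* Interpretation sketch (not spelled out in this header): `data` points at
 * the geometry's source buffer, vertices for VK_GEOMETRY_TYPE_TRIANGLES_KHR
 * or AABBs for VK_GEOMETRY_TYPE_AABBS_KHR, while stride/vertex_format/
 * index_format feed load_vertices() and load_indices() in
 * vk_build_helpers.h. */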

#endif

@ -7,7 +7,6 @@
vulkan_lite_runtime_files = files(
  'rmv/vk_rmv_common.c',
  'rmv/vk_rmv_exporter.c',
  'vk_acceleration_structure.c',
  'vk_blend.c',
  'vk_buffer.c',
  'vk_buffer_view.c',

@ -277,6 +276,8 @@ vulkan_runtime_deps = [
]

if prog_glslang.found()
  subdir('radix_sort')
  subdir('bvh')
  vulkan_runtime_files += files('vk_texcompress_astc.c')
  vulkan_runtime_files += custom_target(
    'astc_spv.h',

@ -288,6 +289,10 @@ if prog_glslang.found()
    ],
    depfile : 'astc_spv.h.d',
  )
  vulkan_runtime_files += files('vk_acceleration_structure.c')
  vulkan_runtime_files += radix_sort_files
  vulkan_runtime_files += bvh_spv
  vulkan_runtime_files += radix_sort_spv
endif

libvulkan_runtime = static_library(

@ -320,7 +325,10 @@ else
  )
endif

idep_vulkan_runtime_headers = idep_vulkan_lite_runtime_headers
idep_vulkan_runtime_headers = [idep_vulkan_lite_runtime_headers]
idep_vulkan_runtime_headers += declare_dependency(
  include_directories : include_directories('bvh'),
)

idep_vulkan_runtime = declare_dependency(
  dependencies : [

@ -7,6 +7,8 @@
//

#include "barrier.h"
#include "vulkan/runtime/vk_device.h"
#include "vulkan/runtime/vk_command_buffer.h"

//
//

@ -15,6 +17,10 @@
void
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  static VkMemoryBarrier const mb = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,

@ -23,7 +29,7 @@ vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb)
    .dstAccessMask = VK_ACCESS_SHADER_READ_BIT
  };

  vkCmdPipelineBarrier(cb,
  disp->CmdPipelineBarrier(cb,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       0,

@ -42,6 +48,10 @@ vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb)
void
vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  static VkMemoryBarrier const mb = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,

@ -50,7 +60,7 @@ vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb)
    .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT
  };

  vkCmdPipelineBarrier(cb,
  disp->CmdPipelineBarrier(cb,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       VK_PIPELINE_STAGE_TRANSFER_BIT,
                       0,

@ -69,6 +79,10 @@ vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb)
void
vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  static VkMemoryBarrier const mb = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,

@ -77,7 +91,7 @@ vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb)
    .dstAccessMask = VK_ACCESS_SHADER_READ_BIT
  };

  vkCmdPipelineBarrier(cb,
  disp->CmdPipelineBarrier(cb,
                       VK_PIPELINE_STAGE_TRANSFER_BIT,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       0,

@ -96,6 +110,10 @@ vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb)
void
vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  static VkMemoryBarrier const mb = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,

@ -104,7 +122,7 @@ vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb)
    .dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT
  };

  vkCmdPipelineBarrier(cb,
  disp->CmdPipelineBarrier(cb,
                       VK_PIPELINE_STAGE_TRANSFER_BIT,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       0,

@ -123,6 +141,10 @@ vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb)
void
vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  static VkMemoryBarrier const mb = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,

@ -132,7 +154,7 @@ vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb)
                     VK_ACCESS_SHADER_READ_BIT
  };

  vkCmdPipelineBarrier(cb,
  disp->CmdPipelineBarrier(cb,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       0,

@ -151,6 +173,10 @@ vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb)
void
vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  static VkMemoryBarrier const mb = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,

@ -160,7 +186,7 @@ vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb)
    .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT
  };

  vkCmdPipelineBarrier(cb,
  disp->CmdPipelineBarrier(cb,
                       VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       VK_PIPELINE_STAGE_TRANSFER_BIT,
                       0,

@ -179,6 +205,10 @@ vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb)
void
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  static VkMemoryBarrier const mb = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,

@ -187,7 +217,7 @@ vk_barrier_compute_w_to_host_r(VkCommandBuffer cb)
    .dstAccessMask = VK_ACCESS_HOST_READ_BIT
  };

  vkCmdPipelineBarrier(cb,
  disp->CmdPipelineBarrier(cb,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       VK_PIPELINE_STAGE_HOST_BIT,
                       0,

@ -206,6 +236,10 @@ vk_barrier_compute_w_to_host_r(VkCommandBuffer cb)
void
vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  static VkMemoryBarrier const mb = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,

@ -214,7 +248,7 @@ vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb)
    .dstAccessMask = VK_ACCESS_HOST_READ_BIT
  };

  vkCmdPipelineBarrier(cb,
  disp->CmdPipelineBarrier(cb,
                       VK_PIPELINE_STAGE_TRANSFER_BIT,
                       VK_PIPELINE_STAGE_HOST_BIT,
                       0,

@ -237,12 +271,16 @@ vk_memory_barrier(VkCommandBuffer cb,
                  VkPipelineStageFlags dst_stage,
                  VkAccessFlags dst_mask)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
                               .pNext = NULL,
                               .srcAccessMask = src_mask,
                               .dstAccessMask = dst_mask };

  vkCmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL);
  disp->CmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL);
}

//

@ -252,6 +290,10 @@ vk_memory_barrier(VkCommandBuffer cb,
void
vk_barrier_debug(VkCommandBuffer cb)
{
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  static VkMemoryBarrier const mb = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,

@ -288,7 +330,7 @@ vk_barrier_debug(VkCommandBuffer cb)
                     VK_ACCESS_HOST_WRITE_BIT
  };

  vkCmdPipelineBarrier(cb,
  disp->CmdPipelineBarrier(cb,
                       VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                       VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                       0,

37 src/vulkan/runtime/radix_sort/meson.build Normal file
@ -0,0 +1,37 @@
# Copyright © 2022 Konstantin Seurer

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

subdir('shaders')

radix_sort_files = files(
  'common/vk/barrier.c',
  'common/vk/barrier.h',
  'common/macros.h',
  'common/util.c',
  'common/util.h',
  'shaders/push.h',
  'radix_sort_u64.c',
  'radix_sort_u64.h',
  'radix_sort_vk_devaddr.h',
  'radix_sort_vk_ext.h',
  'radix_sort_vk.c',
  'radix_sort_vk.h',
  'target.h'
)

59 src/vulkan/runtime/radix_sort/radix_sort_u64.c Normal file
@ -0,0 +1,59 @@
/*
 * Copyright © 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "radix_sort_u64.h"
#include <assert.h>

static const uint32_t init_spv[] = {
#include "radix_sort/shaders/init.comp.spv.h"
};

static const uint32_t fill_spv[] = {
#include "radix_sort/shaders/fill.comp.spv.h"
};

static const uint32_t histogram_spv[] = {
#include "radix_sort/shaders/histogram.comp.spv.h"
};

static const uint32_t prefix_spv[] = {
#include "radix_sort/shaders/prefix.comp.spv.h"
};

static const uint32_t scatter_0_even_spv[] = {
#include "radix_sort/shaders/scatter_0_even.comp.spv.h"
};

static const uint32_t scatter_0_odd_spv[] = {
#include "radix_sort/shaders/scatter_0_odd.comp.spv.h"
};

static const uint32_t scatter_1_even_spv[] = {
#include "radix_sort/shaders/scatter_1_even.comp.spv.h"
};

static const uint32_t scatter_1_odd_spv[] = {
#include "radix_sort/shaders/scatter_1_odd.comp.spv.h"
};

radix_sort_vk_t *
vk_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac,
                         VkPipelineCache pc,
                         struct radix_sort_vk_target_config config)
{
   assert(config.keyval_dwords == 2);

   const uint32_t *spv[8] = {
      init_spv, fill_spv, histogram_spv, prefix_spv,
      scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv,
   };
   const uint32_t spv_sizes[8] = {
      sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv), sizeof(prefix_spv),
      sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv), sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv),
   };
   return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, config);
}
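/* Note: the blobs are passed positionally, so the array order above has to
 * match the pipeline order the radix sort expects: init, fill, histogram,
 * prefix, then the four scatter variants. */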

24 src/vulkan/runtime/radix_sort/radix_sort_u64.h Normal file
@ -0,0 +1,24 @@
/*
 * Copyright © 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#ifndef VK_RADIX_SORT_U64
#define VK_RADIX_SORT_U64

#include "radix_sort_vk.h"

#ifdef __cplusplus
extern "C" {
#endif

radix_sort_vk_t *
vk_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac,
                         VkPipelineCache pc,
                         struct radix_sort_vk_target_config config);

#ifdef __cplusplus
}
#endif

#endif

@ -11,6 +11,10 @@
#include "common/vk/barrier.h"
#include "radix_sort_vk_devaddr.h"
#include "shaders/push.h"
#include "shaders/config.h"

#include "vk_command_buffer.h"
#include "vk_device.h"

//
//

@ -100,14 +104,41 @@ radix_sort_vk_get_memory_requirements(radix_sort_vk_t const * rs,
  // NOTE: Assumes .histograms are before .partitions.
  //
  // Last scatter workgroup skips writing to a partition.
  // Each RS_RADIX_LOG2 (8) bit pass has a zero-initialized histogram. This
  // is one RS_RADIX_SIZE histogram per keyval byte.
  //
  // One histogram per (keyval byte + partitions)
  // The last scatter workgroup skips writing to a partition so it doesn't
  // need to be allocated.
  //
  uint32_t const partitions = scatter_blocks - 1;
  // If the device doesn't support "sequential dispatch" of workgroups, then
  // we need a zero-initialized dword counter per radix pass in the keyval
  // to atomically acquire a virtual workgroup id. On sequentially
  // dispatched devices, this is simply `gl_WorkGroupID.x`.
  //
  // The "internal" memory map looks like this:
  //
  //   +---------------------------------+ <-- 0
  //   | histograms[keyval_size]         |
  //   +---------------------------------+ <-- keyval_size * histo_size
  //   | partitions[scatter_blocks_ru-1] |
  //   +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size
  //   | workgroup_ids[keyval_size]      |
  //   +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + workgroup_ids_size
  //
  // The `.workgroup_ids[]` are located after the last partition.
  //
  VkDeviceSize const histo_size = RS_RADIX_SIZE * sizeof(uint32_t);

  mr->internal_size = (mr->keyval_size + partitions) * (RS_RADIX_SIZE * sizeof(uint32_t));
  mr->internal_size = (mr->keyval_size + scatter_blocks - 1) * histo_size;
  mr->internal_alignment = internal_sg_size * sizeof(uint32_t);

  //
  // Support for nonsequential dispatch can be disabled.
  //
  VkDeviceSize const workgroup_ids_size = mr->keyval_size * sizeof(uint32_t);

  mr->internal_size += workgroup_ids_size;
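  //
  // Worked example: with 8-byte keyvals (keyval_size = 8) and RS_RADIX_SIZE
  // = 256, histo_size is 256 * 4 = 1024 bytes, so the internal buffer holds
  // 8 histograms, scatter_blocks - 1 partitions of the same size, and 8
  // dwords of workgroup ids.
  //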
|
||||
|
||||
//
|
||||
// Indirect
|
||||
//
|
||||
|
|
@ -185,13 +216,17 @@ rs_pipeline_count(struct radix_sort_vk const * rs)
|
|||
}
|
||||
|
||||
radix_sort_vk_t *
|
||||
radix_sort_vk_create(VkDevice device,
|
||||
radix_sort_vk_create(VkDevice _device,
|
||||
VkAllocationCallbacks const * ac,
|
||||
VkPipelineCache pc,
|
||||
const uint32_t* const* spv,
|
||||
const uint32_t* spv_sizes,
|
||||
struct radix_sort_vk_target_config config)
|
||||
{
|
||||
VK_FROM_HANDLE(vk_device, device, _device);
|
||||
|
||||
const struct vk_device_dispatch_table *disp = &device->dispatch_table;
|
||||
|
||||
//
|
||||
// Allocate radix_sort_vk
|
||||
//
|
||||
|
|
@ -244,6 +279,38 @@ radix_sort_vk_create(VkDevice device,
|
|||
.size = sizeof(struct rs_push_scatter) }, // scatter_1_odd
|
||||
};
|
||||
|
||||
uint32_t spec_constants[] = {
|
||||
[RS_FILL_WORKGROUP_SIZE] = 1u << config.fill.workgroup_size_log2,
|
||||
[RS_FILL_BLOCK_ROWS] = config.fill.block_rows,
|
||||
[RS_HISTOGRAM_WORKGROUP_SIZE] = 1u << config.histogram.workgroup_size_log2,
|
||||
[RS_HISTOGRAM_SUBGROUP_SIZE_LOG2] = config.histogram.subgroup_size_log2,
|
||||
[RS_HISTOGRAM_BLOCK_ROWS] = config.histogram.block_rows,
|
||||
[RS_PREFIX_WORKGROUP_SIZE] = 1u << config.prefix.workgroup_size_log2,
|
||||
[RS_PREFIX_SUBGROUP_SIZE_LOG2] = config.prefix.subgroup_size_log2,
|
||||
[RS_SCATTER_WORKGROUP_SIZE] = 1u << config.scatter.workgroup_size_log2,
|
||||
[RS_SCATTER_SUBGROUP_SIZE_LOG2] = config.scatter.subgroup_size_log2,
|
||||
[RS_SCATTER_BLOCK_ROWS] = config.scatter.block_rows,
|
||||
[RS_SCATTER_NONSEQUENTIAL_DISPATCH] = config.nonsequential_dispatch,
|
||||
};
|
||||
|
||||
VkSpecializationMapEntry spec_map[ARRAY_LENGTH_MACRO(spec_constants)];
|
||||
|
||||
for (uint32_t ii = 0; ii < ARRAY_LENGTH_MACRO(spec_constants); ii++)
|
||||
{
|
||||
spec_map[ii] = (VkSpecializationMapEntry) {
|
||||
.constantID = ii,
|
||||
.offset = sizeof(uint32_t) * ii,
|
||||
.size = sizeof(uint32_t),
|
||||
};
|
||||
}
|
||||
|
||||
VkSpecializationInfo spec_info = {
|
||||
.mapEntryCount = ARRAY_LENGTH_MACRO(spec_map),
|
||||
.pMapEntries = spec_map,
|
||||
.dataSize = sizeof(spec_constants),
|
||||
.pData = spec_constants,
|
||||
};
|
||||
|
||||
VkPipelineLayoutCreateInfo plci = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
|
||||
|
|
@ -259,7 +326,7 @@ radix_sort_vk_create(VkDevice device,
|
|||
{
|
||||
plci.pPushConstantRanges = pcr + ii;
|
||||
|
||||
if (vkCreatePipelineLayout(device, &plci, NULL, rs->pipeline_layouts.handles + ii) != VK_SUCCESS)
|
||||
if (disp->CreatePipelineLayout(_device, &plci, NULL, rs->pipeline_layouts.handles + ii) != VK_SUCCESS)
|
||||
goto fail_layout;
|
||||
}
|
||||
|
||||
|
|
@ -282,7 +349,7 @@ radix_sort_vk_create(VkDevice device,
|
|||
smci.codeSize = spv_sizes[ii];
|
||||
smci.pCode = spv[ii];
|
||||
|
||||
if (vkCreateShaderModule(device, &smci, ac, sms + ii) != VK_SUCCESS)
|
||||
if (disp->CreateShaderModule(_device, &smci, ac, sms + ii) != VK_SUCCESS)
|
||||
goto fail_shader;
|
||||
}
|
||||
|
||||
|
|
@ -323,11 +390,11 @@ radix_sort_vk_create(VkDevice device,
|
|||
.flags = 0, \
|
||||
.stage = { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, \
|
||||
.pNext = NULL, \
|
||||
.flags = 0, \
|
||||
.flags = VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT, \
|
||||
.stage = VK_SHADER_STAGE_COMPUTE_BIT, \
|
||||
.module = sms[idx_], \
|
||||
.pName = "main", \
|
||||
.pSpecializationInfo = NULL }, \
|
||||
.pSpecializationInfo = &spec_info }, \
|
||||
\
|
||||
.layout = rs->pipeline_layouts.handles[idx_], \
|
||||
.basePipelineHandle = VK_NULL_HANDLE, \
|
||||
|
|
@ -358,7 +425,7 @@ radix_sort_vk_create(VkDevice device,
|
|||
//
|
||||
// Create the compute pipelines
|
||||
//
|
||||
if (vkCreateComputePipelines(device, pc, pipeline_count, cpcis, ac, rs->pipelines.handles) != VK_SUCCESS)
|
||||
if (disp->CreateComputePipelines(_device, pc, pipeline_count, cpcis, ac, rs->pipelines.handles) != VK_SUCCESS)
|
||||
goto fail_pipeline;
|
||||
|
||||
//
|
||||
|
|
@ -366,7 +433,7 @@ radix_sort_vk_create(VkDevice device,
|
|||
//
|
||||
for (uint32_t ii = 0; ii < pipeline_count; ii++)
|
||||
{
|
||||
vkDestroyShaderModule(device, sms[ii], ac);
|
||||
disp->DestroyShaderModule(_device, sms[ii], ac);
|
||||
}
|
||||
|
||||
#ifdef RS_VK_ENABLE_DEBUG_UTILS
|
||||
|
|
@ -397,17 +464,17 @@ radix_sort_vk_create(VkDevice device,
|
|||
fail_pipeline:
|
||||
for (uint32_t ii = 0; ii < pipeline_count; ii++)
|
||||
{
|
||||
vkDestroyPipeline(device, rs->pipelines.handles[ii], ac);
|
||||
disp->DestroyPipeline(_device, rs->pipelines.handles[ii], ac);
|
||||
}
|
||||
fail_shader:
|
||||
for (uint32_t ii = 0; ii < pipeline_count; ii++)
|
||||
{
|
||||
vkDestroyShaderModule(device, sms[ii], ac);
|
||||
disp->DestroyShaderModule(_device, sms[ii], ac);
|
||||
}
|
||||
fail_layout:
|
||||
for (uint32_t ii = 0; ii < pipeline_count; ii++)
|
||||
{
|
||||
vkDestroyPipelineLayout(device, rs->pipeline_layouts.handles[ii], ac);
|
||||
disp->DestroyPipelineLayout(_device, rs->pipeline_layouts.handles[ii], ac);
|
||||
}
|
||||
|
||||
free(rs);
|
||||
|
|
@ -420,18 +487,22 @@ fail_layout:
|
|||
void
|
||||
radix_sort_vk_destroy(struct radix_sort_vk * rs, VkDevice d, VkAllocationCallbacks const * const ac)
|
||||
{
|
||||
VK_FROM_HANDLE(vk_device, device, d);
|
||||
|
||||
const struct vk_device_dispatch_table *disp = &device->dispatch_table;
|
||||
|
||||
uint32_t const pipeline_count = rs_pipeline_count(rs);
|
||||
|
||||
// destroy pipelines
|
||||
for (uint32_t ii = 0; ii < pipeline_count; ii++)
|
||||
{
|
||||
vkDestroyPipeline(d, rs->pipelines.handles[ii], ac);
|
||||
disp->DestroyPipeline(d, rs->pipelines.handles[ii], ac);
|
||||
}
|
||||
|
||||
// destroy pipeline layouts
|
||||
for (uint32_t ii = 0; ii < pipeline_count; ii++)
|
||||
{
|
||||
vkDestroyPipelineLayout(d, rs->pipeline_layouts.handles[ii], ac);
|
||||
disp->DestroyPipelineLayout(d, rs->pipeline_layouts.handles[ii], ac);
|
||||
}
|
||||
|
||||
free(rs);
|
||||
|
|
@ -441,8 +512,12 @@ radix_sort_vk_destroy(struct radix_sort_vk * rs, VkDevice d, VkAllocationCallbac
|
|||
//
|
||||
//
|
||||
static VkDeviceAddress
|
||||
rs_get_devaddr(VkDevice device, VkDescriptorBufferInfo const * dbi)
|
||||
rs_get_devaddr(VkDevice _device, VkDescriptorBufferInfo const * dbi)
|
||||
{
|
||||
VK_FROM_HANDLE(vk_device, device, _device);
|
||||
|
||||
const struct vk_device_dispatch_table *disp = &device->dispatch_table;
|
||||
|
||||
VkBufferDeviceAddressInfo const bdai = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO,
|
||||
|
|
@ -450,7 +525,7 @@ rs_get_devaddr(VkDevice device, VkDescriptorBufferInfo const * dbi)
|
|||
.buffer = dbi->buffer
|
||||
};
|
||||
|
||||
VkDeviceAddress const devaddr = vkGetBufferDeviceAddress(device, &bdai) + dbi->offset;
|
||||
VkDeviceAddress const devaddr = disp->GetBufferDeviceAddress(_device, &bdai) + dbi->offset;
|
||||
|
||||
return devaddr;
|
||||
}
|
||||
|
|
@ -465,13 +540,17 @@ rs_ext_cmd_write_timestamp(struct radix_sort_vk_ext_timestamps * ext_timestamps,
|
|||
VkCommandBuffer cb,
|
||||
VkPipelineStageFlagBits pipeline_stage)
|
||||
{
|
||||
VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
|
||||
const struct vk_device_dispatch_table *disp =
|
||||
&cmd_buffer->base.device->dispatch_table;
|
||||
|
||||
if ((ext_timestamps != NULL) &&
|
||||
(ext_timestamps->timestamps_set < ext_timestamps->timestamp_count))
|
||||
{
|
||||
vkCmdWriteTimestamp(cb,
|
||||
pipeline_stage,
|
||||
ext_timestamps->timestamps,
|
||||
ext_timestamps->timestamps_set++);
|
||||
disp->CmdWriteTimestamp(cb,
|
||||
pipeline_stage,
|
||||
ext_timestamps->timestamps,
|
||||
ext_timestamps->timestamps_set++);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -497,10 +576,14 @@ struct radix_sort_vk_ext_base
|
|||
void
|
||||
radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
||||
radix_sort_vk_sort_devaddr_info_t const * info,
|
||||
VkDevice device,
|
||||
VkDevice _device,
|
||||
VkCommandBuffer cb,
|
||||
VkDeviceAddress * keyvals_sorted)
|
||||
{
|
||||
VK_FROM_HANDLE(vk_device, device, _device);
|
||||
|
||||
const struct vk_device_dispatch_table *disp = &device->dispatch_table;
|
||||
|
||||
//
|
||||
// Anything to do?
|
||||
//
|
||||
|
|
@ -557,16 +640,13 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
|||
// Label the command buffer
|
||||
//
|
||||
#ifdef RS_VK_ENABLE_DEBUG_UTILS
|
||||
if (pfn_vkCmdBeginDebugUtilsLabelEXT != NULL)
|
||||
{
|
||||
VkDebugUtilsLabelEXT const label = {
|
||||
.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,
|
||||
.pNext = NULL,
|
||||
.pLabelName = "radix_sort_vk_sort",
|
||||
};
|
||||
VkDebugUtilsLabelEXT const label = {
|
||||
.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,
|
||||
.pNext = NULL,
|
||||
.pLabelName = "radix_sort_vk_sort",
|
||||
};
|
||||
|
||||
pfn_vkCmdBeginDebugUtilsLabelEXT(cb, &label);
|
||||
}
|
||||
disp->CmdBeginDebugUtilsLabelEXT(cb, &label);
|
||||
#endif
|
||||
|
||||
//
|
||||
|
|
@ -679,16 +759,16 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
|||
.passes = passes
|
||||
};
|
||||
|
||||
vkCmdPushConstants(cb,
|
||||
disp->CmdPushConstants(cb,
|
||||
rs->pipeline_layouts.named.histogram,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0,
|
||||
sizeof(push_histogram),
|
||||
&push_histogram);
|
||||
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram);
|
||||
disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram);
|
||||
|
||||
vkCmdDispatch(cb, histo_blocks, 1, 1);
|
||||
disp->CmdDispatch(cb, histo_blocks, 1, 1);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
|
|
@ -707,16 +787,16 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
|||
.devaddr_histograms = devaddr_histograms,
|
||||
};
|
||||
|
||||
vkCmdPushConstants(cb,
|
||||
disp->CmdPushConstants(cb,
|
||||
rs->pipeline_layouts.named.prefix,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0,
|
||||
sizeof(push_prefix),
|
||||
&push_prefix);
|
||||
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix);
|
||||
disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix);
|
||||
|
||||
vkCmdDispatch(cb, passes, 1, 1);
|
||||
disp->CmdDispatch(cb, passes, 1, 1);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
|
|
@ -746,14 +826,14 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
|||
{
|
||||
uint32_t const pass_dword = pass_idx / 4;
|
||||
|
||||
vkCmdPushConstants(cb,
|
||||
disp->CmdPushConstants(cb,
|
||||
rs->pipeline_layouts.named.scatter[pass_dword].even,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0,
|
||||
sizeof(push_scatter),
|
||||
&push_scatter);
|
||||
|
||||
vkCmdBindPipeline(cb,
|
||||
disp->CmdBindPipeline(cb,
|
||||
VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
rs->pipelines.named.scatter[pass_dword].even);
|
||||
}
|
||||
|
|
@ -762,7 +842,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
|||
|
||||
while (true)
|
||||
{
|
||||
vkCmdDispatch(cb, scatter_blocks, 1, 1);
|
||||
disp->CmdDispatch(cb, scatter_blocks, 1, 1);
|
||||
|
||||
//
|
||||
// Continue?
|
||||
|
|
@ -788,7 +868,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
|||
//
|
||||
VkPipelineLayout const pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even //
|
||||
: rs->pipeline_layouts.named.scatter[pass_dword].odd;
|
||||
vkCmdPushConstants(cb,
|
||||
disp->CmdPushConstants(cb,
|
||||
pl,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
OFFSETOF_MACRO(struct rs_push_scatter, devaddr_histograms),
|
||||
|
|
@ -801,7 +881,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
|||
VkPipeline const p = is_even ? rs->pipelines.named.scatter[pass_dword].even //
|
||||
: rs->pipelines.named.scatter[pass_dword].odd;
|
||||
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p);
|
||||
disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p);
|
||||
}
|
||||
|
||||
#ifdef RS_VK_ENABLE_EXTENSIONS
|
||||
|
|
@ -812,10 +892,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
|||
// End the label
|
||||
//
|
||||
#ifdef RS_VK_ENABLE_DEBUG_UTILS
|
||||
if (pfn_vkCmdEndDebugUtilsLabelEXT != NULL)
|
||||
{
|
||||
pfn_vkCmdEndDebugUtilsLabelEXT(cb);
|
||||
}
|
||||
disp->CmdEndDebugUtilsLabelEXT(cb);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -825,10 +902,14 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
|||
void
|
||||
radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * rs,
|
||||
radix_sort_vk_sort_indirect_devaddr_info_t const * info,
|
||||
VkDevice device,
|
||||
VkDevice _device,
|
||||
VkCommandBuffer cb,
|
||||
VkDeviceAddress * keyvals_sorted)
|
||||
{
|
||||
VK_FROM_HANDLE(vk_device, device, _device);
|
||||
|
||||
const struct vk_device_dispatch_table *disp = &device->dispatch_table;
|
||||
|
||||
//
|
||||
// Anything to do?
|
||||
//
|
||||
|
|
@ -886,16 +967,13 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
|
|||
// Label the command buffer
|
||||
//
|
||||
#ifdef RS_VK_ENABLE_DEBUG_UTILS
|
||||
if (pfn_vkCmdBeginDebugUtilsLabelEXT != NULL)
|
||||
{
|
||||
VkDebugUtilsLabelEXT const label = {
|
||||
.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,
|
||||
.pNext = NULL,
|
||||
.pLabelName = "radix_sort_vk_sort_indirect",
|
||||
};
|
||||
VkDebugUtilsLabelEXT const label = {
|
||||
.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,
|
||||
.pNext = NULL,
|
||||
.pLabelName = "radix_sort_vk_sort_indirect",
|
||||
};
|
||||
|
||||
pfn_vkCmdBeginDebugUtilsLabelEXT(cb, &label);
|
||||
}
|
||||
disp->CmdBeginDebugUtilsLabelEXT(cb, &label);
|
||||
#endif
|
||||
|
||||
//
|
||||
|
|
@ -938,16 +1016,16 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
|
|||
.passes = passes
|
||||
};
|
||||
|
||||
vkCmdPushConstants(cb,
|
||||
disp->CmdPushConstants(cb,
|
||||
rs->pipeline_layouts.named.init,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0,
|
||||
sizeof(push_init),
|
||||
&push_init);
|
||||
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.init);
|
||||
disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.init);
|
||||
|
||||
vkCmdDispatch(cb, 1, 1, 1);
|
||||
disp->CmdDispatch(cb, 1, 1, 1);
|
||||
}
|
||||
|
||||
#ifdef RS_VK_ENABLE_EXTENSIONS
|
||||
|
|
@ -967,14 +1045,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
|
|||
.dword = 0xFFFFFFFF
|
||||
};
|
||||
|
||||
vkCmdPushConstants(cb,
|
||||
disp->CmdPushConstants(cb,
|
||||
rs->pipeline_layouts.named.fill,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0,
|
||||
sizeof(push_pad),
|
||||
&push_pad);
|
||||
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill);
|
||||
disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill);
|
||||
|
||||
info->dispatch_indirect(cb, &info->indirect, offsetof(struct rs_indirect_info, dispatch.pad));
|
||||
}
|
||||
|
|
@ -992,14 +1070,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
|
|||
.dword = 0
|
||||
};
|
||||
|
||||
vkCmdPushConstants(cb,
|
||||
disp->CmdPushConstants(cb,
|
||||
rs->pipeline_layouts.named.fill,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0,
|
||||
sizeof(push_zero),
|
||||
&push_zero);
|
||||
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill);
|
||||
disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill);
|
||||
|
||||
info->dispatch_indirect(cb, &info->indirect, offsetof(struct rs_indirect_info, dispatch.zero));
|
||||
}
|
||||
|
|
@ -1021,14 +1099,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
|
|||
.passes = passes
|
||||
};
|
||||
|
||||
vkCmdPushConstants(cb,
|
||||
disp->CmdPushConstants(cb,
|
||||
rs->pipeline_layouts.named.histogram,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0,
|
||||
sizeof(push_histogram),
|
||||
&push_histogram);
|
||||
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram);
|
||||
disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram);
|
||||
|
||||
info->dispatch_indirect(cb,
|
||||
&info->indirect,
|
||||
|
@@ -1049,16 +1127,16 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
    .devaddr_histograms = devaddr_histograms,
  };

  vkCmdPushConstants(cb,
  disp->CmdPushConstants(cb,
                     rs->pipeline_layouts.named.prefix,
                     VK_SHADER_STAGE_COMPUTE_BIT,
                     0,
                     sizeof(push_prefix),
                     &push_prefix);

  vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix);
  disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix);

  vkCmdDispatch(cb, passes, 1, 1);
  disp->CmdDispatch(cb, passes, 1, 1);
}

#ifdef RS_VK_ENABLE_EXTENSIONS
@@ -1088,14 +1166,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
  {
    uint32_t const pass_dword = pass_idx / 4;

    vkCmdPushConstants(cb,
    disp->CmdPushConstants(cb,
                       rs->pipeline_layouts.named.scatter[pass_dword].even,
                       VK_SHADER_STAGE_COMPUTE_BIT,
                       0,
                       sizeof(push_scatter),
                       &push_scatter);

    vkCmdBindPipeline(cb,
    disp->CmdBindPipeline(cb,
                      VK_PIPELINE_BIND_POINT_COMPUTE,
                      rs->pipelines.named.scatter[pass_dword].even);
  }
@@ -1134,7 +1212,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
      VkPipelineLayout const pl = is_even
                                    ? rs->pipeline_layouts.named.scatter[pass_dword].even //
                                    : rs->pipeline_layouts.named.scatter[pass_dword].odd;
      vkCmdPushConstants(
      disp->CmdPushConstants(
        cb,
        pl,
        VK_SHADER_STAGE_COMPUTE_BIT,
@@ -1148,7 +1226,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
      VkPipeline const p = is_even ? rs->pipelines.named.scatter[pass_dword].even //
                                   : rs->pipelines.named.scatter[pass_dword].odd;

      vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p);
      disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p);
    }
  }
@@ -1160,10 +1238,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const *
  // End the label
  //
#ifdef RS_VK_ENABLE_DEBUG_UTILS
  if (pfn_vkCmdEndDebugUtilsLabelEXT != NULL)
    {
      pfn_vkCmdEndDebugUtilsLabelEXT(cb);
    }
  disp->CmdEndDebugUtilsLabelEXT(cb);
#endif
}
@@ -1177,7 +1252,11 @@ radix_sort_vk_fill_buffer(VkCommandBuffer cb,
                          VkDeviceSize size,
                          uint32_t data)
{
  vkCmdFillBuffer(cb, buffer_info->buffer, buffer_info->offset + offset, size, data);
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  disp->CmdFillBuffer(cb, buffer_info->buffer, buffer_info->offset + offset, size, data);
}

//
@@ -1221,7 +1300,11 @@ radix_sort_vk_dispatch_indirect(VkCommandBuffer cb,
                                radix_sort_vk_buffer_info_t const * buffer_info,
                                VkDeviceSize offset)
{
  vkCmdDispatchIndirect(cb, buffer_info->buffer, buffer_info->offset + offset);
  VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
  const struct vk_device_dispatch_table *disp =
    &cmd_buffer->base.device->dispatch_table;

  disp->CmdDispatchIndirect(cb, buffer_info->buffer, buffer_info->offset + offset);
}

//
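Each of these hooks repeats the same lookup before calling through the per-device dispatch table instead of a global vkCmd* entry point. A minimal sketch of the shared pattern; the helper name rs_get_device_dispatch is hypothetical and not part of the commit:

  /* Sketch only: resolve Mesa's per-device dispatch table from a
   * VkCommandBuffer handle. VK_FROM_HANDLE and vk_command_buffer are
   * the runtime helpers used in the hunks above. */
  static inline const struct vk_device_dispatch_table *
  rs_get_device_dispatch(VkCommandBuffer cb)
  {
    VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);

    /* Every vk_command_buffer points back at its vk_device, which owns
     * the dispatch table that replaces the global entry points. */
    return &cmd_buffer->base.device->dispatch_table;
  }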
33  src/vulkan/runtime/radix_sort/shaders/config.h  Normal file
@@ -0,0 +1,33 @@
// Copyright 2024 Valve Corporation
// SPDX-License-Identifier: MIT

#ifdef VULKAN
#define CONFIG(_name, _id, default_val) layout (constant_id = _id) const int _name = default_val;
#else
enum rs_config {
#define CONFIG(_name, _id, default_val) _name = _id,
#endif

#define RS_FILL_WORKGROUP_SIZE_ID 0
CONFIG(RS_FILL_WORKGROUP_SIZE, RS_FILL_WORKGROUP_SIZE_ID, 7)
CONFIG(RS_FILL_BLOCK_ROWS, 1, 8)

#define RS_HISTOGRAM_WORKGROUP_SIZE_ID 2
CONFIG(RS_HISTOGRAM_WORKGROUP_SIZE, RS_HISTOGRAM_WORKGROUP_SIZE_ID, 7)
CONFIG(RS_HISTOGRAM_SUBGROUP_SIZE_LOG2, 3, 7)
CONFIG(RS_HISTOGRAM_BLOCK_ROWS, 4, 8)

#define RS_PREFIX_WORKGROUP_SIZE_ID 5
CONFIG(RS_PREFIX_WORKGROUP_SIZE, RS_PREFIX_WORKGROUP_SIZE_ID, 8)
CONFIG(RS_PREFIX_SUBGROUP_SIZE_LOG2, 6, 6)

#define RS_SCATTER_WORKGROUP_SIZE_ID 7
CONFIG(RS_SCATTER_WORKGROUP_SIZE, RS_SCATTER_WORKGROUP_SIZE_ID, 8)
CONFIG(RS_SCATTER_SUBGROUP_SIZE_LOG2, 8, 6)
CONFIG(RS_SCATTER_BLOCK_ROWS, 9, 14)

CONFIG(RS_SCATTER_NONSEQUENTIAL_DISPATCH, 10, 0)

#ifndef VULKAN
};
#endif
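Because config.h is included from both GLSL and host C code, a single CONFIG() entry expands two ways. Roughly, for the first entry above:

  // Compiled as GLSL (VULKAN defined): a specialization constant.
  layout (constant_id = 0) const int RS_FILL_WORKGROUP_SIZE = 7;

  // Compiled as C (VULKAN undefined): an enumerator mapping the name to
  // its constant ID, usable as VkSpecializationMapEntry::constantID.
  enum rs_config {
    RS_FILL_WORKGROUP_SIZE = 0,
    /* ... remaining CONFIG() entries ... */
  };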
@@ -49,23 +49,11 @@ layout(push_constant) uniform block_push
//
RS_STRUCT_INDIRECT_INFO_FILL();

//
// Check all switches are defined
//
#ifndef RS_FILL_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_FILL_WORKGROUP_SIZE_LOG2"
#endif

//
#ifndef RS_FILL_BLOCK_ROWS
#error "Undefined: RS_FILL_BLOCK_ROWS"
#endif

//
// Local macros
//
// clang-format off
#define RS_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SIZE (RS_FILL_WORKGROUP_SIZE)
#define RS_BLOCK_DWORDS   (RS_FILL_BLOCK_ROWS * RS_WORKGROUP_SIZE)
#define RS_RADIX_MASK     ((1 << RS_RADIX_LOG2) - 1)
// clang-format on
@@ -73,7 +61,7 @@ RS_STRUCT_INDIRECT_INFO_FILL();
//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;
layout(local_size_x_id = RS_FILL_WORKGROUP_SIZE_ID) in;

//
//
@@ -61,26 +61,11 @@ layout(push_constant) uniform block_push
#error "Undefined: RS_KEYVAL_DWORDS"
#endif

//
#ifndef RS_HISTOGRAM_BLOCK_ROWS
#error "Undefined: RS_HISTOGRAM_BLOCK_ROWS"
#endif

//
#ifndef RS_HISTOGRAM_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_HISTOGRAM_WORKGROUP_SIZE_LOG2"
#endif

//
#ifndef RS_HISTOGRAM_SUBGROUP_SIZE_LOG2
#error "Undefined: RS_HISTOGRAM_SUBGROUP_SIZE_LOG2"
#endif

//
// Local macros
//
// clang-format off
#define RS_WORKGROUP_SIZE      (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SIZE      (RS_HISTOGRAM_WORKGROUP_SIZE)
#define RS_SUBGROUP_SIZE       (1 << RS_HISTOGRAM_SUBGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE)
#define RS_BLOCK_KEYVALS       (RS_HISTOGRAM_BLOCK_ROWS * RS_WORKGROUP_SIZE)
@@ -104,11 +89,8 @@ layout(push_constant) uniform block_push
//
#define RS_HISTOGRAM_BASE(pass_) ((RS_RADIX_SIZE * 4) * pass_)

#if (RS_WORKGROUP_SUBGROUPS == 1)
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_SubgroupInvocationID * 4)
#else
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_LocalInvocationID.x * 4)
#endif
#define RS_HISTOGRAM_OFFSET(pass_) \
  RS_HISTOGRAM_BASE(pass_) + (RS_WORKGROUP_SUBGROUPS == 1 ? gl_SubgroupInvocationID : gl_LocalInvocationID.x) * 4

//
// Assumes (RS_RADIX_LOG2 == 8)
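Since the workgroup size is now a specialization constant, RS_WORKGROUP_SUBGROUPS is no longer known to the preprocessor, so the old #if/#else selection cannot be resolved at GLSL compile time; the replacement folds the same choice into a ternary that the driver's shader compiler resolves once the constants are specialized. For example:

  // After specializing a single-subgroup configuration
  // (RS_WORKGROUP_SUBGROUPS == 1), the ternary constant-folds and
  // RS_HISTOGRAM_OFFSET(pass_) reduces to the old first branch:
  //
  //   RS_HISTOGRAM_BASE(pass_) + gl_SubgroupInvocationID * 4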
@@ -167,7 +149,7 @@ shared rs_histogram_smem smem;
//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;
layout(local_size_x_id = RS_HISTOGRAM_WORKGROUP_SIZE_ID) in;

//
//
@@ -196,41 +178,38 @@ rs_histogram_zero()
  //
  // Zero SMEM histogram
  //
#if (RS_WORKGROUP_SUBGROUPS == 1)

  const uint32_t smem_offset = gl_SubgroupInvocationID;

  [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
  {
    smem.histogram[smem_offset + ii] = 0;
  }

#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)

  const uint32_t smem_offset = gl_LocalInvocationID.x;

  [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
  {
    smem.histogram[smem_offset + ii] = 0;
  }

  const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);

  if (smem_idx < RS_RADIX_SIZE)
  {
    smem.histogram[smem_idx] = 0;
  }

#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)

#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
  if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
#endif
  {
    smem.histogram[gl_LocalInvocationID.x] = 0;
  }

#endif
  if (RS_WORKGROUP_SUBGROUPS == 1)
  {
    const uint32_t smem_offset = gl_SubgroupInvocationID;

    [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
    {
      smem.histogram[smem_offset + ii] = 0;
    }
  }
  else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
  {
    const uint32_t smem_offset = gl_LocalInvocationID.x;

    [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
    {
      smem.histogram[smem_offset + ii] = 0;
    }

    const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);

    if (smem_idx < RS_RADIX_SIZE)
    {
      smem.histogram[smem_idx] = 0;
    }
  }
  else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
  {
    if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE)
    {
      smem.histogram[gl_LocalInvocationID.x] = 0;
    }
  }
}

//
@@ -242,50 +221,47 @@ rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms)
  //
  // Store to GMEM
  //
#if (RS_WORKGROUP_SUBGROUPS == 1)

  const uint32_t smem_offset = gl_SubgroupInvocationID;

  [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
  {
    const uint32_t count = smem.histogram[smem_offset + ii];

    atomicAdd(rs_histograms.extent[ii], count);
  }

#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)

  const uint32_t smem_offset = gl_LocalInvocationID.x;

  [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
  {
    const uint32_t count = smem.histogram[smem_offset + ii];

    atomicAdd(rs_histograms.extent[ii], count);
  }

  const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);

  if (smem_idx < RS_RADIX_SIZE)
  {
    const uint32_t count = smem.histogram[smem_idx];

    atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)],
              count);
  }

#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)

#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
  if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
#endif
  {
    const uint32_t count = smem.histogram[gl_LocalInvocationID.x];

    atomicAdd(rs_histograms.extent[0], count);
  }

#endif
  if (RS_WORKGROUP_SUBGROUPS == 1)
  {
    const uint32_t smem_offset = gl_SubgroupInvocationID;

    [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
    {
      const uint32_t count = smem.histogram[smem_offset + ii];

      atomicAdd(rs_histograms.extent[ii], count);
    }
  }
  else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
  {
    const uint32_t smem_offset = gl_LocalInvocationID.x;

    [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
    {
      const uint32_t count = smem.histogram[smem_offset + ii];

      atomicAdd(rs_histograms.extent[ii], count);
    }

    const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);

    if (smem_idx < RS_RADIX_SIZE)
    {
      const uint32_t count = smem.histogram[smem_idx];

      atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)],
                count);
    }
  }
  else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
  {
    if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE)
    {
      const uint32_t count = smem.histogram[gl_LocalInvocationID.x];

      atomicAdd(rs_histograms.extent[0], count);
    }
  }
}

#endif
@@ -298,21 +274,19 @@ rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms)
void
rs_histogram_atomic_after_write()
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
  subgroupMemoryBarrierShared();
#else
  barrier();
#endif
  if (RS_WORKGROUP_SUBGROUPS == 1)
    subgroupMemoryBarrierShared();
  else
    barrier();
}

void
rs_histogram_read_after_atomic()
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
  subgroupMemoryBarrierShared();
#else
  barrier();
#endif
  if (RS_WORKGROUP_SUBGROUPS == 1)
    subgroupMemoryBarrierShared();
  else
    barrier();
}

#endif
@@ -53,9 +53,9 @@ RS_STRUCT_INDIRECT_INFO();
// Local macros
//
// clang-format off
#define RS_FILL_WORKGROUP_SIZE      (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
#define RS_SCATTER_WORKGROUP_SIZE   (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2)
#define RS_HISTOGRAM_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
#define RS_FILL_WORKGROUP_SIZE      (RS_FILL_WORKGROUP_SIZE)
#define RS_SCATTER_WORKGROUP_SIZE   (RS_SCATTER_WORKGROUP_SIZE)
#define RS_HISTOGRAM_WORKGROUP_SIZE (RS_HISTOGRAM_WORKGROUP_SIZE)

#define RS_FILL_BLOCK_DWORDS     (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE)
#define RS_SCATTER_BLOCK_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE)
@@ -150,12 +150,34 @@ main()
  // 256-dword partitions directly follow the 256-dword histograms, we
  // can dispatch just one FILL.
  //
  // The "internal" memory map looks like this:
  //
  //   +---------------------------------+ <-- 0
  //   | histograms[keyval_size]         |
  //   +---------------------------------+ <-- keyval_size * histo_dwords
  //   | partitions[scatter_blocks_ru-1] |
  //   +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords
  //   | workgroup_ids[keyval_size]      |
  //   +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords + keyval_size
  //
  // NOTE(allanmac): The `.block_offset` and `.dword_offset_min`
  // parameters are zeroes because the host can offset the buffer
  // device address since the number of passes is known by the host.
  // If we ever wanted to support an indirect number of "key" bits
  // in the sort, then this would need to change.
  //
  // NOTE(allanmac): The `.workgroup_ids[]` are only used if
  // nonsequential dispatch isn't supported by the device.
  //
  rs_indirect_info_fill zero;

  zero.block_offset               = 0;
  zero.dword_offset_min           = 0;
  zero.dword_offset_max_minus_min = (push.passes + scatter_ru_blocks - 1) * RS_RADIX_SIZE;

  if (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0)
    zero.dword_offset_max_minus_min += (RS_KEYVAL_DWORDS * 4); // one pass per byte

  const uint32_t zero_ru_blocks =
    RS_COUNT_RU_BLOCKS(zero.dword_offset_max_minus_min, RS_FILL_BLOCK_DWORDS);
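A worked sizing example with illustrative numbers, using RS_RADIX_SIZE = 256 (per the 256-dword histograms above) and RS_KEYVAL_DWORDS = 2 (as in the shader build below):

  // With push.passes = 4 and scatter_ru_blocks = 10 (illustrative):
  //   dword_offset_max_minus_min = (4 + 10 - 1) * 256 = 3328 dwords
  // With nonsequential dispatch enabled, one pass per keyval byte adds:
  //   3328 + (2 * 4) = 3336 dwords
  //   zero_ru_blocks = ceil(3336 / RS_FILL_BLOCK_DWORDS)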
53  src/vulkan/runtime/radix_sort/shaders/meson.build  Normal file
@@ -0,0 +1,53 @@
# Copyright © 2022 Konstantin Seurer

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

radix_sort_shaders = [
  'init.comp',
  'fill.comp',
  'histogram.comp',
  'prefix.comp',
  'scatter_0_even.comp',
  'scatter_0_odd.comp',
  'scatter_1_even.comp',
  'scatter_1_odd.comp'
]

shader_include_files = files(
  'bufref.h',
  'prefix_limits.h',
  'prefix.h',
  'push.h',
  'scatter.glsl',
  'config.h',
)

defines = ['-DRS_KEYVAL_DWORDS=2']

radix_sort_spv = []
foreach s : radix_sort_shaders
  radix_sort_spv += custom_target(
    s + '.spv.h',
    input : s,
    output : s + '.spv.h',
    command : [
      prog_glslang, '-V', '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
    ] + defines + glslang_quiet + (with_mesa_debug ? ['-g'] : []),
    depend_files: shader_include_files)
endforeach
@@ -46,41 +46,20 @@ layout(push_constant) uniform block_push
#define RS_SUBGROUP_UNIFORM
#endif

//
// Check all switches are defined
//
#ifndef RS_PREFIX_SUBGROUP_SIZE_LOG2
#error "Undefined: RS_PREFIX_SUBGROUP_SIZE_LOG2"
#endif

//
#ifndef RS_PREFIX_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_PREFIX_WORKGROUP_SIZE_LOG2"
#endif

//
// Local macros
//
// clang-format off
#define RS_KEYVAL_SIZE         (RS_KEYVAL_DWORDS * 4)
#define RS_WORKGROUP_SIZE      (1 << RS_PREFIX_WORKGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SIZE      (RS_PREFIX_WORKGROUP_SIZE)
#define RS_SUBGROUP_SIZE       (1 << RS_PREFIX_SUBGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE)
// clang-format on

//
// There is no purpose in having a workgroup size larger than the
// radix size.
//
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
#error "Error: (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)"
#endif

//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;
layout(local_size_x_id = RS_PREFIX_WORKGROUP_SIZE_ID) in;

//
// Histogram buffer reference
@@ -95,34 +74,23 @@ layout(buffer_reference, std430) buffer buffer_rs_histograms
//
#include "prefix_limits.h"

//
// If multi-subgroup then define shared memory
//
#if (RS_WORKGROUP_SUBGROUPS > 1)

//----------------------------------------
shared uint32_t smem_sweep0[RS_SWEEP_0_SIZE];

#define RS_PREFIX_SWEEP0(idx_) smem_sweep0[idx_]
//----------------------------------------

#if (RS_SWEEP_1_SIZE > 0)
//----------------------------------------
shared uint32_t smem_sweep1[RS_SWEEP_1_SIZE];

#define RS_PREFIX_SWEEP1(idx_) smem_sweep1[idx_]
//----------------------------------------
#endif

#if (RS_SWEEP_2_SIZE > 0)
//----------------------------------------
shared uint32_t smem_sweep2[RS_SWEEP_2_SIZE];

#define RS_PREFIX_SWEEP2(idx_) smem_sweep2[idx_]
//----------------------------------------
#endif

#endif

//
// Define function arguments
@@ -151,37 +119,21 @@ main()
  //
  // Define buffer reference to read histograms
  //
#if (RS_WORKGROUP_SUBGROUPS == 1)
  //
  // Define histograms bufref for single subgroup
  //
  // NOTE(allanmac): The histogram buffer reference could be adjusted
  // on the host to save a couple instructions at the cost of added
  // complexity.
  //
  const uint32_t invocation_id = RS_WORKGROUP_SUBGROUPS == 1 ? gl_SubgroupInvocationID : gl_LocalInvocationID.x;

  RS_SUBGROUP_UNIFORM
  const uint32_t histograms_base   = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE);
  const uint32_t histograms_offset = (histograms_base + gl_SubgroupInvocationID) * 4;
  const uint32_t histograms_offset = (histograms_base + invocation_id) * 4;

  RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,
                                    rs_histograms,
                                    push.devaddr_histograms,
                                    histograms_offset);

#else
  //
  // Define histograms bufref for workgroup
  //
  RS_SUBGROUP_UNIFORM
  const uint32_t histograms_base   = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE);
  const uint32_t histograms_offset = (histograms_base + gl_LocalInvocationID.x) * 4;

  RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,
                                    rs_histograms,
                                    push.devaddr_histograms,
                                    histograms_offset);

#endif

  //
  // Compute exclusive prefix of uint32_t[256]
356  src/vulkan/runtime/radix_sort/shaders/prefix.h  Normal file
@@ -0,0 +1,356 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_

//
// Requires several defines
//
#ifndef RS_PREFIX_LIMITS
#error "Error: \"prefix_limits.h\" not loaded"
#endif

#ifndef RS_PREFIX_ARGS
#error "Error: RS_PREFIX_ARGS undefined"
#endif

#ifndef RS_PREFIX_LOAD
#error "Error: RS_PREFIX_LOAD undefined"
#endif

#ifndef RS_PREFIX_STORE
#error "Error: RS_PREFIX_STORE undefined"
#endif

//
// Optional switches:
//
// * Disable holding original inclusively scanned histogram values in registers.
//
//   #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
//

//
// Compute exclusive prefix of uint32_t[256]
//
void
rs_prefix(RS_PREFIX_ARGS)
{
  if (RS_WORKGROUP_SUBGROUPS == 1)
  {
    //
    // Workgroup is a single subgroup so no shared memory is required.
    //

    //
    // Exclusive scan-add the histogram
    //
    const uint32_t h0     = RS_PREFIX_LOAD(0);
    const uint32_t h0_inc = subgroupInclusiveAdd(h0);

    RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);

    RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive

    //
    // Each iteration is dependent on the previous so no unrolling. The
    // compiler is free to hoist the loads upward though.
    //
    for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; //
         ii < RS_RADIX_SIZE;
         ii += RS_SUBGROUP_SIZE)
    {
      const uint32_t h     = RS_PREFIX_LOAD(ii);
      const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last;

      h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);

      RS_PREFIX_STORE(ii) = h_inc - h; // exclusive
    }
  }
  else
  {
    //
    // Workgroup is multiple subgroups and uses shared memory to store
    // the scan's intermediate results.
    //
    // Assumes a power-of-two subgroup, workgroup and radix size.
    //
    // Downsweep: Repeatedly scan reductions until they fit in a single
    // subgroup.
    //
    // Upsweep: Then uniformly apply reductions to each subgroup.
    //
    // Subgroup Size | 4  | 8  | 16 | 32 | 64 | 128 |
    // --------------+----+----+----+----+----+-----+
    // Sweep 0       | 64 | 32 | 16 | 8  | 4  | 2   | sweep_0[]
    // Sweep 1       | 16 | 4  | -  | -  | -  | -   | sweep_1[]
    // Sweep 2       | 4  | -  | -  | -  | -  | -   | sweep_2[]
    // --------------+----+----+----+----+----+-----+
    // Total dwords  | 84 | 36 | 16 | 8  | 4  | 2   |
    // --------------+----+----+----+----+----+-----+
    //
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
    uint32_t h_exc[RS_H_COMPONENTS];
#endif

    //
    // Downsweep 0
    //
    [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
    {
      const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);

      const uint32_t h_inc = subgroupInclusiveAdd(h);

      const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

      RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);

      //
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
      h_exc[ii] = h_inc - h;
#else
      RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h;
#endif
    }

    barrier();

    //
    // Skip generalizing these sweeps for all possible subgroups -- just
    // write them directly.
    //
    if (RS_SUBGROUP_SIZE == 128)
    {
      // There are only two elements in SWEEP0 per subgroup. The scan is
      // trivial so we fold it into the upsweep.
    }
    else if (RS_SUBGROUP_SIZE >= 16)
    {
      //////////////////////////////////////////////////////////////////////
      //
      // Scan 0
      //
      if (RS_SWEEP_0_SIZE != RS_WORKGROUP_SIZE && // workgroup has inactive components
          gl_LocalInvocationID.x < RS_SWEEP_0_SIZE)
      {
        const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
        const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

        RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
      }

      barrier();
    }
    else if (RS_SUBGROUP_SIZE == 8)
    {
      if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)
      {
        //////////////////////////////////////////////////////////////////////
        //
        // Scan 0 and Downsweep 1
        //
        if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations
        {
          const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
          const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

          RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
          RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
        }
      }
      else
      {
        //////////////////////////////////////////////////////////////////////
        //
        // Scan 0 and Downsweep 1
        //
        [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations
        {
          const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
          const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

          const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
          const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

          RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
          RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
        }
      }

      barrier();

      //
      // Scan 1
      //
      if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations
      {
        const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
        const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);

        RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
      }

      barrier();
    }
    else if (RS_SUBGROUP_SIZE == 4)
    {
      //////////////////////////////////////////////////////////////////////
      //
      // Scan 0 and Downsweep 1
      //
      if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)
      {
        if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations
        {
          const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
          const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

          RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
          RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
        }
      }
      else
      {
        [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations
        {
          const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
          const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

          const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
          const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);

          RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
          RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
        }
      }

      barrier();

      //
      // Scan 1 and Downsweep 2
      //
      if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE)
      {
        if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations
        {
          const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
          const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);

          RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
          RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
        }
      }
      else
      {
        [[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations
        {
          const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
          const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

          const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1);
          const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);

          RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red;
          RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
        }
      }

      barrier();

      //
      // Scan 2
      //
      // 4 invocations
      //
      if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE)
      {
        const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x);
        const uint32_t h2_inc = subgroupInclusiveAdd(h2_red);

        RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red;
      }

      barrier();
    }

    //////////////////////////////////////////////////////////////////////
    //
    // Final upsweep 0
    //
    if (RS_SUBGROUP_SIZE == 128)
    {
      // There must be more than one subgroup per workgroup, but the maximum
      // workgroup size is 256 so there must be exactly two subgroups per
      // workgroup and RS_H_COMPONENTS must be 1.
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
      RS_PREFIX_STORE(0) = h_exc[0] + (gl_SubgroupID > 0 ? RS_PREFIX_SWEEP0(0) : 0);
#else
      const uint32_t h_exc = RS_PREFIX_LOAD(0);

      RS_PREFIX_STORE(0) = h_exc + (gl_SubgroupID > 0 ? RS_PREFIX_SWEEP0(0) : 0);
#endif
    }
    else if (RS_SUBGROUP_SIZE >= 16)
    {
      [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
      {
        const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;

        // clang format issue
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
        RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0);
#else
        const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);

        RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0);
#endif
      }
    }
    else if (RS_SUBGROUP_SIZE == 8)
    {
      [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
      {
        const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
        const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;

#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
        RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
          h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
#else
        const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);

        RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
          h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
#endif
      }
    }
    else if (RS_SUBGROUP_SIZE == 4)
    {
      [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
      {
        const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
        const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;
        const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE;

#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
        RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
          h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
#else
        const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);

        RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
          h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
#endif
      }
    }
  }
}

//
//
//

#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
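As a quick check of the sweep table above, for the 256-entry radix and a subgroup size of 8:

  // RS_SWEEP_0_SIZE = 256 / 8 = 32   partials written by Downsweep 0
  // RS_SWEEP_1_SIZE =  32 / 8 =  4   partials written by Downsweep 1
  // RS_SWEEP_2_SIZE =   4 / 8 =  0   (integer division; sweep 2 unused)
  // Total shared dwords = 32 + 4 = 36, matching the "Total dwords" row.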
@@ -10,17 +10,12 @@
//
#define RS_PREFIX_LIMITS

//
// Multi-subgroup prefix requires shared memory.
//
#if (RS_WORKGROUP_SUBGROUPS > 1)

// clang-format off
#define RS_H_COMPONENTS (RS_RADIX_SIZE / RS_WORKGROUP_SIZE)

#define RS_SWEEP_0_SIZE (RS_RADIX_SIZE / RS_SUBGROUP_SIZE)
#define RS_SWEEP_1_SIZE (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE)
#define RS_SWEEP_2_SIZE (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE)
#define RS_SWEEP_0_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_RADIX_SIZE / RS_SUBGROUP_SIZE))
#define RS_SWEEP_1_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE))
#define RS_SWEEP_2_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE))

#define RS_SWEEP_SIZE (RS_SWEEP_0_SIZE + RS_SWEEP_1_SIZE + RS_SWEEP_2_SIZE)

@@ -32,15 +27,6 @@
#define RS_SWEEP_2_OFFSET (RS_SWEEP_1_OFFSET + RS_SWEEP_1_SIZE)
// clang-format on

//
// Single subgroup prefix doesn't use shared memory.
//
#else

#define RS_SWEEP_SIZE 0

#endif

//
//
//
File diff suppressed because it is too large
@@ -27,6 +27,7 @@ struct radix_sort_vk_target_config
  struct
  {
    uint32_t workgroup_size_log2;
    uint32_t block_rows;
  } fill;

  struct

@@ -48,6 +49,8 @@ struct radix_sort_vk_target_config
    uint32_t subgroup_size_log2;
    uint32_t block_rows;
  } scatter;

  bool nonsequential_dispatch;
};

//
File diff suppressed because it is too large
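For orientation, a sketch of how a driver might populate the new field alongside the existing tuning knobs; the values below are illustrative, not any driver's actual configuration, and members not shown in the hunk above are elided:

  /* Illustrative values only. */
  static const struct radix_sort_vk_target_config rs_target_config = {
    .fill = {
      .workgroup_size_log2 = 7, /* 128 invocations */
      .block_rows          = 8,
    },
    /* ... histogram/prefix/scatter members ... */
    .nonsequential_dispatch = false,
  };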
@@ -26,6 +26,11 @@
#define VK_ACCELERATION_STRUCTURE_H

#include "vk_object.h"
#include "radix_sort/radix_sort_vk.h"

#ifdef __cplusplus
extern "C" {
#endif

struct vk_acceleration_structure {
   struct vk_object_base base;
@@ -40,4 +45,88 @@ VkDeviceAddress vk_acceleration_structure_get_va(struct vk_acceleration_structur
VK_DEFINE_NONDISP_HANDLE_CASTS(vk_acceleration_structure, base, VkAccelerationStructureKHR,
                               VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR)

#define MAX_ENCODE_PASSES 2
#define MAX_UPDATE_PASSES 2

struct vk_acceleration_structure_build_ops {
   VkDeviceSize (*get_as_size)(VkDevice device,
                               const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,
                               uint32_t leaf_count);
   VkDeviceSize (*get_update_scratch_size)(struct vk_device *device, uint32_t leaf_count);
   uint32_t (*get_encode_key[MAX_ENCODE_PASSES])(VkAccelerationStructureTypeKHR type,
                                                 VkBuildAccelerationStructureFlagBitsKHR flags);
   VkResult (*encode_bind_pipeline[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer,
                                                       uint32_t key);
   void (*encode_as[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer,
                                        const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
                                        const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos,
                                        VkDeviceAddress intermediate_as_addr,
                                        VkDeviceAddress intermediate_header_addr,
                                        uint32_t leaf_count,
                                        uint32_t key,
                                        struct vk_acceleration_structure *dst);
   void (*init_update_scratch)(VkCommandBuffer cmd_buffer,
                               VkDeviceAddress scratch,
                               uint32_t leaf_count,
                               struct vk_acceleration_structure *src_as,
                               struct vk_acceleration_structure *dst_as);
   void (*update_bind_pipeline[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer);
   void (*update_as[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer,
                                        const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
                                        const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos,
                                        uint32_t leaf_count,
                                        struct vk_acceleration_structure *dst,
                                        struct vk_acceleration_structure *src);
};

struct vk_acceleration_structure_build_args {
   uint32_t subgroup_size;
   uint32_t bvh_bounds_offset;
   bool emit_markers;
   const radix_sort_vk_t *radix_sort;
};

struct vk_meta_device;

void vk_cmd_build_acceleration_structures(VkCommandBuffer cmdbuf,
                                          struct vk_device *device,
                                          struct vk_meta_device *meta,
                                          uint32_t info_count,
                                          const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
                                          const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos,
                                          const struct vk_acceleration_structure_build_args *args);

void vk_get_as_build_sizes(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType,
                           const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,
                           const uint32_t *pMaxPrimitiveCounts,
                           VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo,
                           const struct vk_acceleration_structure_build_args *args);

bool vk_acceleration_struct_vtx_format_supported(VkFormat format);

static inline VkGeometryTypeKHR
vk_get_as_geometry_type(const VkAccelerationStructureBuildGeometryInfoKHR *build_info)
{
   if (build_info->geometryCount) {
      if (build_info->pGeometries)
         return build_info->pGeometries[0].geometryType;
      else
         return build_info->ppGeometries[0]->geometryType;
   }

   /* If there are no geometries, the geometry type shouldn't matter, but
    * return something.
    */
   return VK_GEOMETRY_TYPE_TRIANGLES_KHR;
}

struct vk_bvh_geometry_data
vk_fill_geometry_data(VkAccelerationStructureTypeKHR type, uint32_t first_id, uint32_t geom_index,
                      const VkAccelerationStructureGeometryKHR *geometry,
                      const VkAccelerationStructureBuildRangeInfoKHR *build_range_info);

#ifdef __cplusplus
}
#endif

#endif
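For orientation, a minimal sketch of how a driver could wire up this vtable; the drv_* names and sizing math are hypothetical, and only the vtable shape comes from the header above:

  /* Hypothetical driver glue for the common BVH-building framework. */
  static VkDeviceSize
  drv_get_as_size(VkDevice device,
                  const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,
                  uint32_t leaf_count)
  {
     /* A real driver sizes this from its encoded node layout;
      * these constants are placeholders. */
     return 128 + (VkDeviceSize)leaf_count * 64;
  }

  static const struct vk_acceleration_structure_build_ops drv_as_build_ops = {
     .get_as_size = drv_get_as_size,
     /* .get_encode_key[i], .encode_bind_pipeline[i], .encode_as[i], ...
      * one entry per encoding stage, i < MAX_ENCODE_PASSES. */
  };

  /* At device creation (see the vk_device hunk below):
   *    device->as_build_ops = &drv_as_build_ops;
   */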
@@ -37,6 +37,7 @@
extern "C" {
#endif

struct vk_acceleration_structure_build_ops;
struct vk_command_buffer_ops;
struct vk_device_shader_ops;
struct vk_sync;

@@ -134,6 +135,9 @@ struct vk_device {
   /** Shader vtable for VK_EXT_shader_object and common pipelines */
   const struct vk_device_shader_ops *shader_ops;

   /** Acceleration structure build vtable for common BVH building. */
   const struct vk_acceleration_structure_build_ops *as_build_ops;

   /**
    * Write data to a buffer from the command processor. This is simpler than
    * setting up a staging buffer and faster for small writes, but is not