diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h index 30b3224b2d6..3014a827e61 100644 --- a/src/amd/vulkan/bvh/build_helpers.h +++ b/src/amd/vulkan/bvh/build_helpers.h @@ -8,210 +8,7 @@ #define BVH_BUILD_HELPERS_H #include "bvh.h" - -#define VK_FORMAT_UNDEFINED 0 -#define VK_FORMAT_R4G4_UNORM_PACK8 1 -#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2 -#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3 -#define VK_FORMAT_R5G6B5_UNORM_PACK16 4 -#define VK_FORMAT_B5G6R5_UNORM_PACK16 5 -#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6 -#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7 -#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8 -#define VK_FORMAT_R8_UNORM 9 -#define VK_FORMAT_R8_SNORM 10 -#define VK_FORMAT_R8_USCALED 11 -#define VK_FORMAT_R8_SSCALED 12 -#define VK_FORMAT_R8_UINT 13 -#define VK_FORMAT_R8_SINT 14 -#define VK_FORMAT_R8_SRGB 15 -#define VK_FORMAT_R8G8_UNORM 16 -#define VK_FORMAT_R8G8_SNORM 17 -#define VK_FORMAT_R8G8_USCALED 18 -#define VK_FORMAT_R8G8_SSCALED 19 -#define VK_FORMAT_R8G8_UINT 20 -#define VK_FORMAT_R8G8_SINT 21 -#define VK_FORMAT_R8G8_SRGB 22 -#define VK_FORMAT_R8G8B8_UNORM 23 -#define VK_FORMAT_R8G8B8_SNORM 24 -#define VK_FORMAT_R8G8B8_USCALED 25 -#define VK_FORMAT_R8G8B8_SSCALED 26 -#define VK_FORMAT_R8G8B8_UINT 27 -#define VK_FORMAT_R8G8B8_SINT 28 -#define VK_FORMAT_R8G8B8_SRGB 29 -#define VK_FORMAT_B8G8R8_UNORM 30 -#define VK_FORMAT_B8G8R8_SNORM 31 -#define VK_FORMAT_B8G8R8_USCALED 32 -#define VK_FORMAT_B8G8R8_SSCALED 33 -#define VK_FORMAT_B8G8R8_UINT 34 -#define VK_FORMAT_B8G8R8_SINT 35 -#define VK_FORMAT_B8G8R8_SRGB 36 -#define VK_FORMAT_R8G8B8A8_UNORM 37 -#define VK_FORMAT_R8G8B8A8_SNORM 38 -#define VK_FORMAT_R8G8B8A8_USCALED 39 -#define VK_FORMAT_R8G8B8A8_SSCALED 40 -#define VK_FORMAT_R8G8B8A8_UINT 41 -#define VK_FORMAT_R8G8B8A8_SINT 42 -#define VK_FORMAT_R8G8B8A8_SRGB 43 -#define VK_FORMAT_B8G8R8A8_UNORM 44 -#define VK_FORMAT_B8G8R8A8_SNORM 45 -#define VK_FORMAT_B8G8R8A8_USCALED 46 -#define VK_FORMAT_B8G8R8A8_SSCALED 47 -#define VK_FORMAT_B8G8R8A8_UINT 48 -#define VK_FORMAT_B8G8R8A8_SINT 49 -#define VK_FORMAT_B8G8R8A8_SRGB 50 -#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51 -#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52 -#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53 -#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54 -#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55 -#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56 -#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57 -#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58 -#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59 -#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60 -#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61 -#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62 -#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63 -#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64 -#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65 -#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66 -#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67 -#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68 -#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69 -#define VK_FORMAT_R16_UNORM 70 -#define VK_FORMAT_R16_SNORM 71 -#define VK_FORMAT_R16_USCALED 72 -#define VK_FORMAT_R16_SSCALED 73 -#define VK_FORMAT_R16_UINT 74 -#define VK_FORMAT_R16_SINT 75 -#define VK_FORMAT_R16_SFLOAT 76 -#define VK_FORMAT_R16G16_UNORM 77 -#define VK_FORMAT_R16G16_SNORM 78 -#define VK_FORMAT_R16G16_USCALED 79 -#define VK_FORMAT_R16G16_SSCALED 80 -#define VK_FORMAT_R16G16_UINT 81 -#define VK_FORMAT_R16G16_SINT 82 -#define VK_FORMAT_R16G16_SFLOAT 83 -#define VK_FORMAT_R16G16B16_UNORM 84 -#define VK_FORMAT_R16G16B16_SNORM 85 -#define 
VK_FORMAT_R16G16B16_USCALED 86 -#define VK_FORMAT_R16G16B16_SSCALED 87 -#define VK_FORMAT_R16G16B16_UINT 88 -#define VK_FORMAT_R16G16B16_SINT 89 -#define VK_FORMAT_R16G16B16_SFLOAT 90 -#define VK_FORMAT_R16G16B16A16_UNORM 91 -#define VK_FORMAT_R16G16B16A16_SNORM 92 -#define VK_FORMAT_R16G16B16A16_USCALED 93 -#define VK_FORMAT_R16G16B16A16_SSCALED 94 -#define VK_FORMAT_R16G16B16A16_UINT 95 -#define VK_FORMAT_R16G16B16A16_SINT 96 -#define VK_FORMAT_R16G16B16A16_SFLOAT 97 -#define VK_FORMAT_R32_UINT 98 -#define VK_FORMAT_R32_SINT 99 -#define VK_FORMAT_R32_SFLOAT 100 -#define VK_FORMAT_R32G32_UINT 101 -#define VK_FORMAT_R32G32_SINT 102 -#define VK_FORMAT_R32G32_SFLOAT 103 -#define VK_FORMAT_R32G32B32_UINT 104 -#define VK_FORMAT_R32G32B32_SINT 105 -#define VK_FORMAT_R32G32B32_SFLOAT 106 -#define VK_FORMAT_R32G32B32A32_UINT 107 -#define VK_FORMAT_R32G32B32A32_SINT 108 -#define VK_FORMAT_R32G32B32A32_SFLOAT 109 -#define VK_FORMAT_R64_UINT 110 -#define VK_FORMAT_R64_SINT 111 -#define VK_FORMAT_R64_SFLOAT 112 -#define VK_FORMAT_R64G64_UINT 113 -#define VK_FORMAT_R64G64_SINT 114 -#define VK_FORMAT_R64G64_SFLOAT 115 -#define VK_FORMAT_R64G64B64_UINT 116 -#define VK_FORMAT_R64G64B64_SINT 117 -#define VK_FORMAT_R64G64B64_SFLOAT 118 -#define VK_FORMAT_R64G64B64A64_UINT 119 -#define VK_FORMAT_R64G64B64A64_SINT 120 -#define VK_FORMAT_R64G64B64A64_SFLOAT 121 - -#define VK_INDEX_TYPE_UINT16 0 -#define VK_INDEX_TYPE_UINT32 1 -#define VK_INDEX_TYPE_NONE_KHR 1000165000 -#define VK_INDEX_TYPE_UINT8_EXT 1000265000 - -#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0 -#define VK_GEOMETRY_TYPE_AABBS_KHR 1 -#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2 - -#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1 -#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2 -#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4 -#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8 - -#define TYPE(type, align) \ - layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref \ - { \ - type value; \ - }; - -#define REF(type) type##_ref -#define VOID_REF uint64_t -#define NULL 0 -#define DEREF(var) var.value - -#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1)) - -#define OFFSET(ptr, offset) (uint64_t(ptr) + offset) - -#define INFINITY (1.0 / 0.0) -#define NAN (0.0 / 0.0) - -#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type))) - -TYPE(int8_t, 1); -TYPE(uint8_t, 1); -TYPE(int16_t, 2); -TYPE(uint16_t, 2); -TYPE(int32_t, 4); -TYPE(uint32_t, 4); -TYPE(int64_t, 8); -TYPE(uint64_t, 8); - -TYPE(float, 4); - -TYPE(vec2, 4); -TYPE(vec3, 4); -TYPE(vec4, 4); - -TYPE(uvec4, 16); - -TYPE(VOID_REF, 8); - -/* copied from u_math.h */ -uint32_t -align(uint32_t value, uint32_t alignment) -{ - return (value + alignment - 1) & ~(alignment - 1); -} - -int32_t -to_emulated_float(float f) -{ - int32_t bits = floatBitsToInt(f); - return f < 0 ? -2147483648 - bits : bits; -} - -float -from_emulated_float(int32_t bits) -{ - return intBitsToFloat(bits < 0 ? 
-2147483648 - bits : bits); -} - -TYPE(radv_aabb, 4); - -struct key_id_pair { - uint32_t id; - uint32_t key; -}; -TYPE(key_id_pair, 4); +#include "vk_build_helpers.h" TYPE(radv_accel_struct_serialization_header, 8); TYPE(radv_accel_struct_header, 8); @@ -221,12 +18,6 @@ TYPE(radv_bvh_instance_node, 8); TYPE(radv_bvh_box16_node, 4); TYPE(radv_bvh_box32_node, 4); -TYPE(radv_ir_header, 4); -TYPE(radv_ir_node, 4); -TYPE(radv_ir_box_node, 4); - -TYPE(radv_global_sync_data, 4); - uint32_t id_to_offset(uint32_t id) { @@ -259,178 +50,23 @@ addr_to_node(uint64_t addr) return (addr >> 3) & ((1ul << 45) - 1); } -uint32_t -ir_id_to_offset(uint32_t id) -{ - return id & (~3u); -} - -uint32_t -ir_id_to_type(uint32_t id) -{ - return id & 3u; -} - -uint32_t -pack_ir_node_id(uint32_t offset, uint32_t type) -{ - return offset | type; -} - uint32_t ir_type_to_bvh_type(uint32_t type) { switch (type) { - case radv_ir_node_triangle: + case vk_ir_node_triangle: return radv_bvh_node_triangle; - case radv_ir_node_internal: + case vk_ir_node_internal: return radv_bvh_node_box32; - case radv_ir_node_instance: + case vk_ir_node_instance: return radv_bvh_node_instance; - case radv_ir_node_aabb: + case vk_ir_node_aabb: return radv_bvh_node_aabb; } /* unreachable in valid nodes */ return RADV_BVH_INVALID_NODE; } -float -aabb_surface_area(radv_aabb aabb) -{ - vec3 diagonal = aabb.max - aabb.min; - return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z; -} - -/* Just a wrapper for 3 uints. */ -struct triangle_indices { - uint32_t index[3]; -}; - -triangle_indices -load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id) -{ - triangle_indices result; - - uint32_t index_base = global_id * 3; - - switch (index_format) { - case VK_INDEX_TYPE_UINT16: { - result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0)); - result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1)); - result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2)); - break; - } - case VK_INDEX_TYPE_UINT32: { - result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0)); - result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1)); - result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2)); - break; - } - case VK_INDEX_TYPE_NONE_KHR: { - result.index[0] = index_base + 0; - result.index[1] = index_base + 1; - result.index[2] = index_base + 2; - break; - } - case VK_INDEX_TYPE_UINT8_EXT: { - result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0)); - result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1)); - result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2)); - break; - } - } - - return result; -} - -/* Just a wrapper for 3 vec4s. 
*/ -struct triangle_vertices { - vec4 vertex[3]; -}; - -TYPE(float16_t, 2); - -triangle_vertices -load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride) -{ - triangle_vertices result; - - for (uint32_t i = 0; i < 3; i++) { - VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride); - vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0); - - switch (vertex_format) { - case VK_FORMAT_R32G32_SFLOAT: - vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); - break; - case VK_FORMAT_R32G32B32_SFLOAT: - case VK_FORMAT_R32G32B32A32_SFLOAT: - vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); - vertex.z = DEREF(INDEX(float, vertex_ptr, 2)); - break; - case VK_FORMAT_R16G16_SFLOAT: - vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); - break; - case VK_FORMAT_R16G16B16_SFLOAT: - case VK_FORMAT_R16G16B16A16_SFLOAT: - vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); - vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2)); - break; - case VK_FORMAT_R16G16_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); - vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); - break; - case VK_FORMAT_R16G16B16A16_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); - vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); - vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF)); - break; - case VK_FORMAT_R8G8_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); - vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); - break; - case VK_FORMAT_R8G8B8A8_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); - vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); - vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F)); - break; - case VK_FORMAT_R16G16_UNORM: - vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); - vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); - break; - case VK_FORMAT_R16G16B16A16_UNORM: - vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); - vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); - vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF); - break; - case VK_FORMAT_R8G8_UNORM: - vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); - vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); - break; - case VK_FORMAT_R8G8B8A8_UNORM: - vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); - vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); - vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF); - break; - case VK_FORMAT_A2B10G10R10_UNORM_PACK32: { - uint32_t data = DEREF(REF(uint32_t)(vertex_ptr)); - vertex.x = float(data & 0x3FF) / 0x3FF; - vertex.y = float((data >> 10) & 0x3FF) / 0x3FF; - vertex.z = float((data >> 20) & 0x3FF) / 0x3FF; - break; - } - } - - result.vertex[i] = vertex; - } - - return result; -} - /* A GLSL-adapted copy of VkAccelerationStructureInstanceKHR. 
*/ struct AccelerationStructureInstance { mat3x4 transform; @@ -441,7 +77,7 @@ struct AccelerationStructureInstance { TYPE(AccelerationStructureInstance, 8); bool -build_triangle(inout radv_aabb bounds, VOID_REF dst_ptr, radv_bvh_geometry_data geom_data, uint32_t global_id) +build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id) { bool is_valid = true; triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id); @@ -490,7 +126,7 @@ build_triangle(inout radv_aabb bounds, VOID_REF dst_ptr, radv_bvh_geometry_data } bool -build_aabb(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) +build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) { bool is_valid = true; REF(radv_bvh_aabb_node) node = REF(radv_bvh_aabb_node)(dst_ptr); @@ -521,10 +157,10 @@ build_aabb(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t return is_valid; } -radv_aabb +vk_aabb calculate_instance_node_bounds(radv_accel_struct_header header, mat3x4 otw_matrix) { - radv_aabb aabb; + vk_aabb aabb; for (uint32_t comp = 0; comp < 3; ++comp) { aabb.min[comp] = otw_matrix[comp][3]; aabb.max[comp] = otw_matrix[comp][3]; @@ -555,7 +191,7 @@ encode_sbt_offset_and_flags(uint32_t src) } bool -build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id) +build_instance(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id) { REF(radv_bvh_instance_node) node = REF(radv_bvh_instance_node)(dst_ptr); @@ -591,123 +227,4 @@ build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint3 From macros.h */ #define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B)) -#ifdef USE_GLOBAL_SYNC - -/* There might be more invocations available than tasks to do. - * In that case, the fetched task index is greater than the - * counter offset for the next phase. To avoid out-of-bounds - * accessing, phases will be skipped until the task index is - * is in-bounds again. */ -uint32_t num_tasks_to_skip = 0; -uint32_t phase_index = 0; -bool should_skip = false; -shared uint32_t global_task_index; - -shared uint32_t shared_phase_index; - -uint32_t -task_count(REF(radv_ir_header) header) -{ - uint32_t phase_index = DEREF(header).sync_data.phase_index; - return DEREF(header).sync_data.task_counts[phase_index & 1]; -} - -/* Sets the task count for the next phase. */ -void -set_next_task_count(REF(radv_ir_header) header, uint32_t new_count) -{ - uint32_t phase_index = DEREF(header).sync_data.phase_index; - DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count; -} - -/* - * This function has two main objectives: - * Firstly, it partitions pending work among free invocations. - * Secondly, it guarantees global synchronization between different phases. - * - * After every call to fetch_task, a new task index is returned. - * fetch_task will also set num_tasks_to_skip. Use should_execute_phase - * to determine if the current phase should be executed or skipped. - * - * Since tasks are assigned per-workgroup, there is a possibility of the task index being - * greater than the total task count. - */ -uint32_t -fetch_task(REF(radv_ir_header) header, bool did_work) -{ - /* Perform a memory + control barrier for all buffer writes for the entire workgroup. 
- * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished - * and their results are written to memory. */ - controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - if (gl_LocalInvocationIndex == 0) { - if (did_work) - atomicAdd(DEREF(header).sync_data.task_done_counter, 1); - global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1); - - do { - /* Perform a memory barrier to refresh the current phase's end counter, in case - * another workgroup changed it. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - - /* The first invocation of the first workgroup in a new phase is responsible to initiate the - * switch to a new phase. It is only possible to switch to a new phase if all tasks of the - * previous phase have been completed. Switching to a new phase and incrementing the phase - * end counter in turn notifies all invocations for that phase that it is safe to execute. - */ - if (global_task_index == DEREF(header).sync_data.current_phase_end_counter && - DEREF(header).sync_data.task_done_counter == DEREF(header).sync_data.current_phase_end_counter) { - if (DEREF(header).sync_data.next_phase_exit_flag != 0) { - DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID; - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - } else { - atomicAdd(DEREF(header).sync_data.phase_index, 1); - DEREF(header).sync_data.current_phase_start_counter = DEREF(header).sync_data.current_phase_end_counter; - /* Ensure the changes to the phase index and start/end counter are visible for other - * workgroup waiting in the loop. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - atomicAdd(DEREF(header).sync_data.current_phase_end_counter, - DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x)); - } - break; - } - - /* If other invocations have finished all nodes, break out; there is no work to do */ - if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) { - break; - } - } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter); - - shared_phase_index = DEREF(header).sync_data.phase_index; - } - - barrier(); - if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) - return TASK_INDEX_INVALID; - - num_tasks_to_skip = shared_phase_index - phase_index; - - uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter; - return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x; -} - -bool -should_execute_phase() -{ - if (num_tasks_to_skip > 0) { - /* Skip to next phase. 
*/ - ++phase_index; - --num_tasks_to_skip; - return false; - } - return true; -} - -#define PHASE(header) \ - for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true)) -#endif - #endif /* BUILD_HELPERS_H */ diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h index 6422319c506..c0c06c98fed 100644 --- a/src/amd/vulkan/bvh/build_interface.h +++ b/src/amd/vulkan/bvh/build_interface.h @@ -16,49 +16,6 @@ #define VOID_REF uint64_t #endif -struct leaf_args { - VOID_REF ir; - VOID_REF bvh; - REF(radv_ir_header) header; - REF(key_id_pair) ids; - - radv_bvh_geometry_data geom_data; -}; - -struct morton_args { - VOID_REF bvh; - REF(radv_ir_header) header; - REF(key_id_pair) ids; -}; - -#define LBVH_RIGHT_CHILD_BIT_SHIFT 29 -#define LBVH_RIGHT_CHILD_BIT (1 << LBVH_RIGHT_CHILD_BIT_SHIFT) - -struct lbvh_node_info { - /* Number of children that have been processed (or are invalid/leaves) in - * the lbvh_generate_ir pass. - */ - uint32_t path_count; - - uint32_t children[2]; - uint32_t parent; -}; - -struct lbvh_main_args { - VOID_REF bvh; - REF(key_id_pair) src_ids; - VOID_REF node_info; - uint32_t id_count; - uint32_t internal_node_base; -}; - -struct lbvh_generate_ir_args { - VOID_REF bvh; - VOID_REF node_info; - VOID_REF header; - uint32_t internal_node_base; -}; - #define RADV_COPY_MODE_COPY 0 #define RADV_COPY_MODE_SERIALIZE 1 #define RADV_COPY_MODE_DESERIALIZE 2 @@ -72,30 +29,14 @@ struct copy_args { struct encode_args { VOID_REF intermediate_bvh; VOID_REF output_bvh; - REF(radv_ir_header) header; + REF(vk_ir_header) header; uint32_t output_bvh_offset; uint32_t leaf_node_count; uint32_t geometry_type; }; -struct ploc_prefix_scan_partition { - uint32_t aggregate; - uint32_t inclusive_sum; -}; - -#define PLOC_WORKGROUP_SIZE 1024 - -struct ploc_args { - VOID_REF bvh; - VOID_REF prefix_scan_partitions; - REF(radv_ir_header) header; - VOID_REF ids_0; - VOID_REF ids_1; - uint32_t internal_node_offset; -}; - struct header_args { - REF(radv_ir_header) src; + REF(vk_ir_header) src; REF(radv_accel_struct_header) dst; uint32_t bvh_offset; uint32_t instance_count; @@ -104,11 +45,11 @@ struct header_args { struct update_args { REF(radv_accel_struct_header) src; REF(radv_accel_struct_header) dst; - REF(radv_aabb) leaf_bounds; + REF(vk_aabb) leaf_bounds; REF(uint32_t) internal_ready_count; uint32_t leaf_node_count; - radv_bvh_geometry_data geom_data; + vk_bvh_geometry_data geom_data; }; #endif /* BUILD_INTERFACE_H */ diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h index 27399fff200..2b87ec47664 100644 --- a/src/amd/vulkan/bvh/bvh.h +++ b/src/amd/vulkan/bvh/bvh.h @@ -7,17 +7,14 @@ #ifndef BVH_BVH_H #define BVH_BVH_H +#include "vk_bvh.h" + #define radv_bvh_node_triangle 0 #define radv_bvh_node_box16 4 #define radv_bvh_node_box32 5 #define radv_bvh_node_instance 6 #define radv_bvh_node_aabb 7 -#define radv_ir_node_triangle 0 -#define radv_ir_node_internal 1 -#define radv_ir_node_instance 2 -#define radv_ir_node_aabb 3 - #define RADV_GEOMETRY_OPAQUE (1u << 31) #define RADV_INSTANCE_FORCE_OPAQUE (1u << 31) @@ -29,31 +26,9 @@ #define VK_UUID_SIZE 16 #else #include -typedef struct radv_ir_node radv_ir_node; -typedef struct radv_global_sync_data radv_global_sync_data; -typedef struct radv_bvh_geometry_data radv_bvh_geometry_data; - typedef uint16_t float16_t; - -typedef struct { - float values[3][4]; -} mat3x4; - -typedef struct { - float x; - float y; - float z; -} vec3; - -typedef struct radv_aabb radv_aabb; - #endif 
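/* The to_emulated_float()/from_emulated_float() helpers removed above (in favour
 * of the shared vk_build_helpers.h include) map IEEE-754 floats to int32 values
 * whose signed ordering matches the float ordering, so scene bounds can be
 * reduced with plain integer atomicMin()/atomicMax(). A minimal host-side C
 * sketch of the same mapping, for illustration only (not part of this patch): */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static int32_t
to_emulated_float(float f)
{
   int32_t bits;
   memcpy(&bits, &f, sizeof(bits));
   /* Negative floats are ordered in reverse when their bit patterns are read as
    * two's-complement integers; remapping them to INT32_MIN - bits restores a
    * monotonically increasing order. */
   return f < 0.0f ? INT32_MIN - bits : bits;
}

static float
from_emulated_float(int32_t emulated)
{
   int32_t bits = emulated < 0 ? INT32_MIN - emulated : emulated;
   float f;
   memcpy(&f, &bits, sizeof(f));
   return f;
}

int
main(void)
{
   assert(to_emulated_float(-2.0f) < to_emulated_float(-1.0f));
   assert(to_emulated_float(-1.0f) < to_emulated_float(0.0f));
   assert(to_emulated_float(0.0f) < to_emulated_float(3.5f));
   assert(from_emulated_float(to_emulated_float(-1.5f)) == -1.5f);
   return 0;
}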
-struct radv_aabb { - vec3 min; - vec3 max; -}; - struct radv_accel_struct_serialization_header { uint8_t driver_uuid[VK_UUID_SIZE]; uint8_t accel_struct_compat[VK_UUID_SIZE]; @@ -74,7 +49,7 @@ struct radv_accel_struct_geometry_info { struct radv_accel_struct_header { uint32_t bvh_offset; uint32_t reserved; - radv_aabb aabb; + vk_aabb aabb; /* Everything after this gets either updated/copied from the CPU or written by header.comp. */ uint64_t compacted_size; @@ -89,45 +64,6 @@ struct radv_accel_struct_header { uint32_t build_flags; }; -struct radv_ir_node { - radv_aabb aabb; -}; - -#define RADV_UNKNOWN_BVH_OFFSET 0xFFFFFFFF -#define RADV_NULL_BVH_OFFSET 0xFFFFFFFE - -struct radv_ir_box_node { - radv_ir_node base; - uint32_t children[2]; - uint32_t bvh_offset; -}; - -struct radv_global_sync_data { - uint32_t task_counts[2]; - uint32_t task_started_counter; - uint32_t task_done_counter; - uint32_t current_phase_start_counter; - uint32_t current_phase_end_counter; - uint32_t phase_index; - /* If this flag is set, the shader should exit - * instead of executing another phase */ - uint32_t next_phase_exit_flag; -}; - -struct radv_ir_header { - int32_t min_bounds[3]; - int32_t max_bounds[3]; - uint32_t active_leaf_count; - /* Indirect dispatch dimensions for the encoder. - * ir_internal_node_count is the thread count in the X dimension, - * while Y and Z are always set to 1. */ - uint32_t ir_internal_node_count; - uint32_t dispatch_size_y; - uint32_t dispatch_size_z; - radv_global_sync_data sync_data; - uint32_t dst_node_offset; -}; - struct radv_bvh_triangle_node { float coords[3][3]; uint32_t reserved[3]; @@ -170,28 +106,11 @@ struct radv_bvh_box16_node { struct radv_bvh_box32_node { uint32_t children[4]; - radv_aabb coords[4]; + vk_aabb coords[4]; uint32_t reserved[4]; }; #define RADV_BVH_ROOT_NODE radv_bvh_node_box32 #define RADV_BVH_INVALID_NODE 0xffffffffu -/* If the task index is set to this value, there is no - * more work to do. */ -#define TASK_INDEX_INVALID 0xFFFFFFFF - -struct radv_bvh_geometry_data { - uint64_t data; - uint64_t indices; - uint64_t transform; - - uint32_t geometry_id; - uint32_t geometry_type; - uint32_t first_id; - uint32_t stride; - uint32_t vertex_format; - uint32_t index_format; -}; - #endif /* BVH_H */ diff --git a/src/amd/vulkan/bvh/encode.comp b/src/amd/vulkan/bvh/encode.comp index 5c84f631860..50623aa3736 100644 --- a/src/amd/vulkan/bvh/encode.comp +++ b/src/amd/vulkan/bvh/encode.comp @@ -36,31 +36,85 @@ void set_parent(uint32_t child, uint32_t parent) void main() { - /* Revert the order so we start at the root */ - uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; - - uint32_t output_leaf_node_size; - switch (args.geometry_type) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - output_leaf_node_size = SIZEOF(radv_bvh_triangle_node); - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - output_leaf_node_size = SIZEOF(radv_bvh_aabb_node); - break; - default: /* instances */ - output_leaf_node_size = SIZEOF(radv_bvh_instance_node); - break; - } - - uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * SIZEOF(radv_ir_node); + /* Encode leaf nodes. 
*/ uint32_t dst_leaf_offset = id_to_offset(RADV_BVH_ROOT_NODE) + SIZEOF(radv_bvh_box32_node); + + uint32_t ir_leaf_node_size; + uint32_t output_leaf_node_size; + switch (args.geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { + ir_leaf_node_size = SIZEOF(vk_ir_triangle_node); + output_leaf_node_size = SIZEOF(radv_bvh_triangle_node); + + vk_ir_triangle_node src_node = + DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size))); + REF(radv_bvh_triangle_node) dst_node = + REF(radv_bvh_triangle_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size)); + + DEREF(dst_node).coords = src_node.coords; + DEREF(dst_node).triangle_id = src_node.triangle_id; + DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags; + DEREF(dst_node).id = 9; + + break; + } + case VK_GEOMETRY_TYPE_AABBS_KHR: { + ir_leaf_node_size = SIZEOF(vk_ir_aabb_node); + output_leaf_node_size = SIZEOF(radv_bvh_aabb_node); + + vk_ir_aabb_node src_node = + DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size))); + REF(radv_bvh_aabb_node) dst_node = + REF(radv_bvh_aabb_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size)); + + DEREF(dst_node).primitive_id = src_node.primitive_id; + DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags; + + break; + } + default: { + /* instances */ + ir_leaf_node_size = SIZEOF(vk_ir_instance_node); + output_leaf_node_size = SIZEOF(radv_bvh_instance_node); + + vk_ir_instance_node src_node = + DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size))); + REF(radv_bvh_instance_node) dst_node = + REF(radv_bvh_instance_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size)); + + radv_accel_struct_header blas_header = + DEREF(REF(radv_accel_struct_header)(src_node.base_ptr)); + + DEREF(dst_node).bvh_ptr = addr_to_node(src_node.base_ptr + blas_header.bvh_offset); + DEREF(dst_node).bvh_offset = blas_header.bvh_offset; + + mat4 transform = mat4(src_node.otw_matrix); + mat4 inv_transform = transpose(inverse(transpose(transform))); + DEREF(dst_node).wto_matrix = mat3x4(inv_transform); + DEREF(dst_node).otw_matrix = mat3x4(transform); + + DEREF(dst_node).custom_instance_and_mask = src_node.custom_instance_and_mask; + DEREF(dst_node).sbt_offset_and_flags = encode_sbt_offset_and_flags(src_node.sbt_offset_and_flags); + DEREF(dst_node).instance_id = src_node.instance_id; + + break; + } + } + + if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count) + return; + + /* Encode internal nodes. 
Revert the order so we start at the root */ + uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; + + uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * ir_leaf_node_size; uint32_t dst_internal_offset = dst_leaf_offset + args.leaf_node_count * output_leaf_node_size; - REF(radv_ir_box_node) intermediate_internal_nodes = - REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); - REF(radv_ir_box_node) src_node = INDEX(radv_ir_box_node, intermediate_internal_nodes, global_id); - radv_ir_box_node src = DEREF(src_node); + REF(vk_ir_box_node) intermediate_internal_nodes = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); + REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id); + vk_ir_box_node src = DEREF(src_node); bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1; @@ -70,10 +124,10 @@ main() gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset; - if (bvh_offset == RADV_UNKNOWN_BVH_OFFSET) + if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) continue; - if (bvh_offset == RADV_NULL_BVH_OFFSET) + if (bvh_offset == VK_NULL_BVH_OFFSET) break; REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset)); @@ -92,11 +146,11 @@ main() float largest_surface_area = -INFINITY; for (int32_t i = 0; i < found_child_count; ++i) { - if (ir_id_to_type(children[i]) != radv_ir_node_internal) + if (ir_id_to_type(children[i]) != vk_ir_node_internal) continue; - radv_aabb bounds = - DEREF(REF(radv_ir_node)OFFSET(args.intermediate_bvh, + vk_aabb bounds = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, ir_id_to_offset(children[i]))).aabb; float surface_area = aabb_surface_area(bounds); @@ -107,8 +161,8 @@ main() } if (collapsed_child_index != -1) { - REF(radv_ir_box_node) child_node = - REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, + REF(vk_ir_box_node) child_node = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, ir_id_to_offset(children[collapsed_child_index])); uint32_t grandchildren[2] = DEREF(child_node).children; uint32_t valid_grandchild_count = 0; @@ -131,7 +185,7 @@ main() children[collapsed_child_index] = children[found_child_count]; } - DEREF(child_node).bvh_offset = RADV_NULL_BVH_OFFSET; + DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; } else break; } @@ -141,24 +195,24 @@ main() uint32_t offset = ir_id_to_offset(children[i]); uint32_t dst_offset; - if (type == radv_ir_node_internal) { + if (type == vk_ir_node_internal) { #if COMPACT dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node)); #else uint32_t offset_in_internal_nodes = offset - intermediate_leaf_nodes_size; - uint32_t child_index = offset_in_internal_nodes / SIZEOF(radv_ir_box_node); + uint32_t child_index = offset_in_internal_nodes / SIZEOF(vk_ir_box_node); dst_offset = dst_internal_offset + child_index * SIZEOF(radv_bvh_box32_node); #endif - REF(radv_ir_box_node) child_node = REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, offset); + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset); DEREF(child_node).bvh_offset = dst_offset; } else { - uint32_t child_index = offset / SIZEOF(radv_ir_node); + uint32_t child_index = offset / ir_leaf_node_size; dst_offset = dst_leaf_offset + child_index * output_leaf_node_size; } - radv_aabb 
child_aabb = - DEREF(REF(radv_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; + vk_aabb child_aabb = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; DEREF(dst_node).coords[i] = child_aabb; diff --git a/src/amd/vulkan/bvh/leaf.comp b/src/amd/vulkan/bvh/leaf.comp deleted file mode 100644 index 26568527c6f..00000000000 --- a/src/amd/vulkan/bvh/leaf.comp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright © 2022 Konstantin Seurer - * - * SPDX-License-Identifier: MIT - */ - -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require -#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require -#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require -#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_buffer_reference : require -#extension GL_EXT_buffer_reference2 : require -#extension GL_KHR_shader_subgroup_vote : require -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_ballot : require - -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; - -#include "build_interface.h" - -layout(push_constant) uniform CONSTS { - leaf_args args; -}; - -void -main(void) -{ - uint32_t global_id = gl_GlobalInvocationID.x; - uint32_t primitive_id = args.geom_data.first_id + global_id; - - REF(key_id_pair) id_ptr = INDEX(key_id_pair, args.ids, primitive_id); - uint32_t src_offset = global_id * args.geom_data.stride; - - uint32_t dst_stride; - uint32_t node_type; - if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { - dst_stride = SIZEOF(radv_bvh_triangle_node); - node_type = radv_ir_node_triangle; - } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { - dst_stride = SIZEOF(radv_bvh_aabb_node); - node_type = radv_ir_node_aabb; - } else { - dst_stride = SIZEOF(radv_bvh_instance_node); - node_type = radv_ir_node_instance; - } - - uint32_t dst_offset = primitive_id * dst_stride; - VOID_REF dst_ptr = OFFSET(args.bvh, dst_offset); - - radv_aabb bounds; - bool is_active; - if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { - is_active = build_triangle(bounds, dst_ptr, args.geom_data, global_id); - } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { - VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); - is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, global_id); - } else { - VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); - /* arrayOfPointers */ - if (args.geom_data.stride == 8) { - src_ptr = DEREF(REF(VOID_REF)(src_ptr)); - } - - is_active = build_instance(bounds, src_ptr, dst_ptr, global_id); - } - -#if ALWAYS_ACTIVE - if (!is_active && args.geom_data.geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) { - bounds.min = vec3(0.0); - bounds.max = vec3(0.0); - is_active = true; - } -#endif - - if (is_active) { - REF(radv_ir_node) ir_node = INDEX(radv_ir_node, args.ir, primitive_id); - DEREF(ir_node).aabb = bounds; - } - - uint32_t ir_offset = primitive_id * SIZEOF(radv_ir_node); - DEREF(id_ptr).id = is_active ? 
pack_ir_node_id(ir_offset, node_type) : RADV_BVH_INVALID_NODE; - - uvec4 ballot = subgroupBallot(is_active); - if (subgroupElect()) - atomicAdd(DEREF(args.header).active_leaf_count, subgroupBallotBitCount(ballot)); - - atomicMin(DEREF(args.header).min_bounds[0], to_emulated_float(bounds.min.x)); - atomicMin(DEREF(args.header).min_bounds[1], to_emulated_float(bounds.min.y)); - atomicMin(DEREF(args.header).min_bounds[2], to_emulated_float(bounds.min.z)); - atomicMax(DEREF(args.header).max_bounds[0], to_emulated_float(bounds.max.x)); - atomicMax(DEREF(args.header).max_bounds[1], to_emulated_float(bounds.max.y)); - atomicMax(DEREF(args.header).max_bounds[2], to_emulated_float(bounds.max.z)); -} diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build index 594194169a9..9173892d4a1 100644 --- a/src/amd/vulkan/bvh/meson.build +++ b/src/amd/vulkan/bvh/meson.build @@ -23,36 +23,6 @@ bvh_shaders = [ 'header', [], ], - [ - 'lbvh_generate_ir.comp', - 'lbvh_generate_ir', - [], - ], - [ - 'lbvh_main.comp', - 'lbvh_main', - [], - ], - [ - 'leaf.comp', - 'leaf', - ['ALWAYS_ACTIVE=0'], - ], - [ - 'leaf.comp', - 'leaf_always_active', - ['ALWAYS_ACTIVE=1'], - ], - [ - 'morton.comp', - 'morton', - [], - ], - [ - 'ploc_internal.comp', - 'ploc_internal', - [], - ], [ 'update.comp', 'update', @@ -61,17 +31,20 @@ bvh_shaders = [ ] bvh_include_dir = dir_source_root + '/src/amd/vulkan/bvh' +vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh' bvh_includes = files( 'build_helpers.h', 'build_interface.h', 'bvh.h', + vk_bvh_include_dir + '/vk_build_helpers.h', + vk_bvh_include_dir + '/vk_bvh.h', ) bvh_spv = [] foreach s : bvh_shaders command = [ - prog_glslang, '-V', '-I' + bvh_include_dir, '--target-env', 'spirv1.5', + prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet, ] diff --git a/src/amd/vulkan/bvh/update.comp b/src/amd/vulkan/bvh/update.comp index 54577355e9e..ca06dfdf375 100644 --- a/src/amd/vulkan/bvh/update.comp +++ b/src/amd/vulkan/bvh/update.comp @@ -53,7 +53,7 @@ void main() { VOID_REF dst_ptr = OFFSET(dst_bvh, dst_offset); uint32_t src_offset = gl_GlobalInvocationID.x * args.geom_data.stride; - radv_aabb bounds; + vk_aabb bounds; bool is_active; if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { is_active = build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x); @@ -65,7 +65,7 @@ void main() { if (!is_active) return; - DEREF(INDEX(radv_aabb, args.leaf_bounds, leaf_node_id)) = bounds; + DEREF(INDEX(vk_aabb, args.leaf_bounds, leaf_node_id)) = bounds; memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); @@ -112,11 +112,11 @@ void main() { for (uint32_t i = 0; i < valid_child_count; ++i) { uint32_t child_offset = id_to_offset(children[i]); - radv_aabb child_bounds; + vk_aabb child_bounds; if (child_offset == dst_offset) child_bounds = bounds; else if (child_offset >= internal_nodes_offset) { - child_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY)); + child_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY)); REF(radv_bvh_box32_node) child_node = REF(radv_bvh_box32_node)OFFSET(dst_bvh, child_offset); for (uint32_t j = 0; j < 4; ++j) { if (DEREF(child_node).children[j] == RADV_BVH_INVALID_NODE) @@ -126,16 +126,16 @@ void main() { } } else { uint32_t child_index = (child_offset - first_leaf_offset) / leaf_node_size; - child_bounds = 
DEREF(INDEX(radv_aabb, args.leaf_bounds, child_index)); + child_bounds = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index)); } DEREF(dst_node).coords[i] = child_bounds; } if (parent_id == RADV_BVH_ROOT_NODE) { - radv_aabb root_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY)); + vk_aabb root_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY)); for (uint32_t i = 0; i < valid_child_count; ++i) { - radv_aabb bounds = DEREF(dst_node).coords[i]; + vk_aabb bounds = DEREF(dst_node).coords[i]; root_bounds.min = min(root_bounds.min, bounds.min); root_bounds.max = max(root_bounds.max, bounds.max); } diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build index 5976bef8b85..539be57c9dc 100644 --- a/src/amd/vulkan/meson.build +++ b/src/amd/vulkan/meson.build @@ -191,9 +191,6 @@ if amd_with_llvm ) endif -subdir('radix_sort') -libradv_files += radix_sort_files - subdir('bvh') subdir('layers') diff --git a/src/amd/vulkan/radix_sort/meson.build b/src/amd/vulkan/radix_sort/meson.build deleted file mode 100644 index c1478755822..00000000000 --- a/src/amd/vulkan/radix_sort/meson.build +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright © 2022 Konstantin Seurer -# SPDX-License-Identifier: MIT - -subdir('shaders') - -radix_sort_files = files( - 'common/vk/barrier.c', - 'common/vk/barrier.h', - 'common/macros.h', - 'common/util.c', - 'common/util.h', - 'shaders/push.h', - 'targets/u64/config.h', - 'radix_sort_vk_devaddr.h', - 'radix_sort_vk_ext.h', - 'radix_sort_vk.c', - 'radix_sort_vk.h', - 'radv_radix_sort.c', - 'radv_radix_sort.h', - 'target.h' -) diff --git a/src/amd/vulkan/radix_sort/radv_radix_sort.c b/src/amd/vulkan/radix_sort/radv_radix_sort.c deleted file mode 100644 index 4305baaba75..00000000000 --- a/src/amd/vulkan/radix_sort/radv_radix_sort.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright © 2022 Konstantin Seurer - * - * SPDX-License-Identifier: MIT - */ - -#include "radv_radix_sort.h" -#include "targets/u64/config.h" -#include "radv_cmd_buffer.h" -#include "target.h" - -static const uint32_t init_spv[] = { -#include "radix_sort/shaders/init.comp.spv.h" -}; - -static const uint32_t fill_spv[] = { -#include "radix_sort/shaders/fill.comp.spv.h" -}; - -static const uint32_t histogram_spv[] = { -#include "radix_sort/shaders/histogram.comp.spv.h" -}; - -static const uint32_t prefix_spv[] = { -#include "radix_sort/shaders/prefix.comp.spv.h" -}; - -static const uint32_t scatter_0_even_spv[] = { -#include "radix_sort/shaders/scatter_0_even.comp.spv.h" -}; - -static const uint32_t scatter_0_odd_spv[] = { -#include "radix_sort/shaders/scatter_0_odd.comp.spv.h" -}; - -static const uint32_t scatter_1_even_spv[] = { -#include "radix_sort/shaders/scatter_1_even.comp.spv.h" -}; - -static const uint32_t scatter_1_odd_spv[] = { -#include "radix_sort/shaders/scatter_1_odd.comp.spv.h" -}; - -static const struct radix_sort_vk_target_config target_config = { - .keyval_dwords = RS_KEYVAL_DWORDS, - - .histogram = - { - .workgroup_size_log2 = RS_HISTOGRAM_WORKGROUP_SIZE_LOG2, - .subgroup_size_log2 = RS_HISTOGRAM_SUBGROUP_SIZE_LOG2, - .block_rows = RS_HISTOGRAM_BLOCK_ROWS, - }, - - .prefix = - { - .workgroup_size_log2 = RS_PREFIX_WORKGROUP_SIZE_LOG2, - .subgroup_size_log2 = RS_PREFIX_SUBGROUP_SIZE_LOG2, - }, - - .scatter = - { - .workgroup_size_log2 = RS_SCATTER_WORKGROUP_SIZE_LOG2, - .subgroup_size_log2 = RS_SCATTER_SUBGROUP_SIZE_LOG2, - .block_rows = RS_SCATTER_BLOCK_ROWS, - }, -}; - -radix_sort_vk_t * -radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc) -{ - const 
uint32_t *spv[8] = { - init_spv, fill_spv, histogram_spv, prefix_spv, - scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv, - }; - const uint32_t spv_sizes[8] = { - sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv), sizeof(prefix_spv), - sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv), sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv), - }; - return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, target_config); -} - -VKAPI_ATTR VkResult VKAPI_CALL -vkCreateShaderModule(VkDevice _device, const VkShaderModuleCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.CreateShaderModule(_device, pCreateInfo, pAllocator, pShaderModule); -} - -VKAPI_ATTR void VKAPI_CALL -vkDestroyShaderModule(VkDevice _device, VkShaderModule shaderModule, const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - device->vk.dispatch_table.DestroyShaderModule(_device, shaderModule, pAllocator); -} - -VKAPI_ATTR VkResult VKAPI_CALL -vkCreatePipelineLayout(VkDevice _device, const VkPipelineLayoutCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.CreatePipelineLayout(_device, pCreateInfo, pAllocator, pPipelineLayout); -} - -VKAPI_ATTR void VKAPI_CALL -vkDestroyPipelineLayout(VkDevice _device, VkPipelineLayout pipelineLayout, const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - device->vk.dispatch_table.DestroyPipelineLayout(_device, pipelineLayout, pAllocator); -} - -VKAPI_ATTR VkResult VKAPI_CALL -vkCreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t createInfoCount, - const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, - VkPipeline *pPipelines) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.CreateComputePipelines(_device, pipelineCache, createInfoCount, pCreateInfos, - pAllocator, pPipelines); -} - -VKAPI_ATTR void VKAPI_CALL -vkDestroyPipeline(VkDevice _device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - device->vk.dispatch_table.DestroyPipeline(_device, pipeline, pAllocator); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags, uint32_t memoryBarrierCount, - const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount, - const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount, - const VkImageMemoryBarrier *pImageMemoryBarriers) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, dependencyFlags, - memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount, - pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags, - uint32_t offset, uint32_t size, const void *pValues) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device 
*device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdPushConstants(commandBuffer, layout, stageFlags, offset, size, pValues); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, pipelineBindPoint, pipeline); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdDispatch(commandBuffer, groupCountX, groupCountY, groupCountZ); -} - -VKAPI_ATTR VkDeviceAddress VKAPI_CALL -vkGetBufferDeviceAddress(VkDevice _device, const VkBufferDeviceAddressInfo *pInfo) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.GetBufferDeviceAddress(_device, pInfo); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize size, - uint32_t data) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdFillBuffer(commandBuffer, dstBuffer, dstOffset, size, data); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdDispatchIndirect(commandBuffer, buffer, offset); -} diff --git a/src/amd/vulkan/radix_sort/radv_radix_sort.h b/src/amd/vulkan/radix_sort/radv_radix_sort.h deleted file mode 100644 index a0990610b9f..00000000000 --- a/src/amd/vulkan/radix_sort/radv_radix_sort.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright © 2022 Konstantin Seurer - * - * SPDX-License-Identifier: MIT - */ - -#ifndef RADV_RADIX_SORT_H -#define RADV_RADIX_SORT_H - -#include "radix_sort_vk_devaddr.h" - -radix_sort_vk_t *radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc); - -#endif diff --git a/src/amd/vulkan/radix_sort/shaders/meson.build b/src/amd/vulkan/radix_sort/shaders/meson.build deleted file mode 100644 index 7b5545696b2..00000000000 --- a/src/amd/vulkan/radix_sort/shaders/meson.build +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright © 2022 Konstantin Seurer -# SPDX-License-Identifier: MIT - -radix_sort_shaders = [ - 'init.comp', - 'fill.comp', - 'histogram.comp', - 'prefix.comp', - 'scatter_0_even.comp', - 'scatter_0_odd.comp', - 'scatter_1_even.comp', - 'scatter_1_odd.comp' -] - -shader_include_dir = dir_source_root + '/src/amd/vulkan/radix_sort/targets/u64' - -shader_include_files = files( - 'bufref.h', - 'prefix_limits.h', - 'prefix.h', - 'push.h', - 'scatter.glsl', - dir_source_root + '/src/amd/vulkan/radix_sort/targets/u64/config.h' -) - -radix_sort_spv = [] -foreach s : radix_sort_shaders - _name = f'@s@.spv.h' - radix_sort_spv += custom_target( - _name, - input : s, - output : _name, - command : [ - prog_glslang, '-V', '-I' + shader_include_dir, '--target-env', 'spirv1.3', - '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_quiet, glslang_depfile, - ], - depfile : f'@_name@.d', - 
depend_files : shader_include_files, - ) -endforeach diff --git a/src/amd/vulkan/radix_sort/shaders/prefix.h b/src/amd/vulkan/radix_sort/shaders/prefix.h deleted file mode 100644 index f9d470bb3f5..00000000000 --- a/src/amd/vulkan/radix_sort/shaders/prefix.h +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2021 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ -#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ - -// -// Requires several defines -// -#ifndef RS_PREFIX_LIMITS -#error "Error: \"prefix_limits.h\" not loaded" -#endif - -#ifndef RS_PREFIX_ARGS -#error "Error: RS_PREFIX_ARGS undefined" -#endif - -#ifndef RS_PREFIX_LOAD -#error "Error: RS_PREFIX_LOAD undefined" -#endif - -#ifndef RS_PREFIX_STORE -#error "Error: RS_PREFIX_STORE undefined" -#endif - -#ifndef RS_SUBGROUP_SIZE -#error "Error: RS_SUBGROUP_SIZE undefined" -#endif - -#ifndef RS_WORKGROUP_SIZE -#error "Error: RS_WORKGROUP_SIZE undefined" -#endif - -#ifndef RS_WORKGROUP_SUBGROUPS -#error "Error: RS_WORKGROUP_SUBGROUPS undefined" -#endif - -// -// Optional switches: -// -// * Disable holding original inclusively scanned histogram values in registers. -// -// #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS -// - -// -// Compute exclusive prefix of uint32_t[256] -// -void -rs_prefix(RS_PREFIX_ARGS) -{ -#if (RS_WORKGROUP_SUBGROUPS == 1) - // - // Workgroup is a single subgroup so no shared memory is required. - // - - // - // Exclusive scan-add the histogram - // - const uint32_t h0 = RS_PREFIX_LOAD(0); - const uint32_t h0_inc = subgroupInclusiveAdd(h0); - RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - - RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive - - // - // Each iteration is dependent on the previous so no unrolling. The - // compiler is free to hoist the loads upward though. - // - for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; // - ii < RS_RADIX_SIZE; - ii += RS_SUBGROUP_SIZE) - { - const uint32_t h = RS_PREFIX_LOAD(ii); - const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last; - h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); - - RS_PREFIX_STORE(ii) = h_inc - h; // exclusive - } - -#else - // - // Workgroup is multiple subgroups and uses shared memory to store - // the scan's intermediate results. - // - // Assumes a power-of-two subgroup, workgroup and radix size. - // - // Downsweep: Repeatedly scan reductions until they fit in a single - // subgroup. - // - // Upsweep: Then uniformly apply reductions to each subgroup. 
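/* The deleted prefix.h computes the exclusive prefix sum of the 256-bin radix
 * histogram with exactly this downsweep/upsweep structure; its sweep-size table
 * continues in the original comment below. A scalar C sketch of the two-level
 * scheme, for illustration only (not part of this patch); RADIX_SIZE,
 * SUBGROUP_SIZE and SWEEP0_SIZE are illustrative stand-ins for the RS_*
 * configuration values: */
#include <stdint.h>
#include <stdio.h>

#define RADIX_SIZE    256
#define SUBGROUP_SIZE 32
#define SWEEP0_SIZE   (RADIX_SIZE / SUBGROUP_SIZE)

static void
exclusive_scan_histogram(uint32_t h[RADIX_SIZE])
{
   uint32_t sweep0[SWEEP0_SIZE];

   /* Downsweep: exclusive scan inside each "subgroup" and record its total. */
   for (uint32_t sg = 0; sg < SWEEP0_SIZE; sg++) {
      uint32_t sum = 0;
      for (uint32_t i = 0; i < SUBGROUP_SIZE; i++) {
         uint32_t v = h[sg * SUBGROUP_SIZE + i];
         h[sg * SUBGROUP_SIZE + i] = sum; /* exclusive within the subgroup */
         sum += v;
      }
      sweep0[sg] = sum;
   }

   /* Scan the per-subgroup reductions; these now fit in a single subgroup. */
   uint32_t base = 0;
   for (uint32_t sg = 0; sg < SWEEP0_SIZE; sg++) {
      uint32_t v = sweep0[sg];
      sweep0[sg] = base;
      base += v;
   }

   /* Upsweep: uniformly add each subgroup's offset back to its elements. */
   for (uint32_t sg = 0; sg < SWEEP0_SIZE; sg++)
      for (uint32_t i = 0; i < SUBGROUP_SIZE; i++)
         h[sg * SUBGROUP_SIZE + i] += sweep0[sg];
}

int
main(void)
{
   uint32_t h[RADIX_SIZE];
   for (uint32_t i = 0; i < RADIX_SIZE; i++)
      h[i] = 1;
   exclusive_scan_histogram(h);
   printf("%u %u %u\n", h[0], h[1], h[255]); /* prints: 0 1 255 */
   return 0;
}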
- // - // - // Subgroup Size | 4 | 8 | 16 | 32 | 64 | - // --------------+----+----+----+----+----+ - // Sweep 0 | 64 | 32 | 16 | 8 | 4 | sweep_0[] - // Sweep 1 | 16 | 4 | - | - | - | sweep_1[] - // Sweep 2 | 4 | - | - | - | - | sweep_2[] - // --------------+----+----+----+----+----+ - // Total dwords | 84 | 36 | 16 | 8 | 4 | - // --------------+----+----+----+----+----+ - // -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - uint32_t h_exc[RS_H_COMPONENTS]; -#endif - - // - // Downsweep 0 - // - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - const uint32_t h_inc = subgroupInclusiveAdd(h); - - const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); - - // -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - h_exc[ii] = h_inc - h; -#else - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h; -#endif - } - - barrier(); - - // - // Skip generalizing these sweeps for all possible subgroups -- just - // write them directly. - // -#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16)) - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 - // -#if (RS_SWEEP_0_SIZE != RS_SUBGROUP_SIZE) - if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // subgroup has inactive invocations -#endif - { - const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; - } - -#elif (RS_SUBGROUP_SIZE == 8) - -#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 and Downsweep 1 - // - if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations - { - const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } - -#else - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 and Downsweep 1 - // - [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; - const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } - -#endif - - barrier(); - - // - // Scan 1 - // - if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations - { - const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); - const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); - - RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; - } - -#elif (RS_SUBGROUP_SIZE == 4) - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 and Downsweep 1 - // -#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) - - if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations - { - const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(gl_SubgroupID) = 
subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } - -#else - - [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; - const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } -#endif - - barrier(); - - // - // Scan 1 and Downsweep 2 - // -#if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE) - if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations - { - const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); - const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); - - RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; - RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); - } - -#else - - [[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations - { - const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; - const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1); - const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); - - RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red; - RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); - } - -#endif - - barrier(); - - // - // Scan 2 - // - // 4 invocations - // - if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE) - { - const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x); - const uint32_t h2_inc = subgroupInclusiveAdd(h2_red); - - RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red; - } - -#else -#error "Error: Unsupported subgroup size" -#endif - - barrier(); - - ////////////////////////////////////////////////////////////////////// - // - // Final upsweep 0 - // -#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16)) - - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - // clang format issue -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0); -#else - const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0); -#endif - } - -#elif (RS_SUBGROUP_SIZE == 8) - - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; - -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); -#else - const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); -#endif - } - -#elif (RS_SUBGROUP_SIZE == 4) - - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; - const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE; - -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); -#else - const uint32_t 
h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); -#endif - } - -#else -#error "Error: Unsupported subgroup size" -#endif - -#endif -} - -// -// -// - -#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ diff --git a/src/amd/vulkan/radix_sort/targets/u64/config.h b/src/amd/vulkan/radix_sort/targets/u64/config.h deleted file mode 100644 index fa1a51eb017..00000000000 --- a/src/amd/vulkan/radix_sort/targets/u64/config.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2021 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_ -#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_ - -// -// -// - -// clang-format off -#define RS_KEYVAL_DWORDS 2 - -#define RS_FILL_WORKGROUP_SIZE_LOG2 7 -#define RS_FILL_BLOCK_ROWS 8 - -#define RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 8 -#define RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 6 -#define RS_HISTOGRAM_BLOCK_ROWS 14 - -#define RS_PREFIX_WORKGROUP_SIZE_LOG2 8 -#define RS_PREFIX_SUBGROUP_SIZE_LOG2 6 - -#define RS_SCATTER_WORKGROUP_SIZE_LOG2 8 -#define RS_SCATTER_SUBGROUP_SIZE_LOG2 6 -#define RS_SCATTER_BLOCK_ROWS 14 -// clang-format on - -// -// -// - -#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_ diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index dbb5595494f..c6fe70528b6 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -4,16 +4,12 @@ * SPDX-License-Identifier: MIT */ -#include "radv_sqtt.h" - #include "meta/radv_meta.h" #include "nir_builder.h" #include "radv_cs.h" #include "radv_entrypoints.h" -#include "radix_sort/common/vk/barrier.h" -#include "radix_sort/radv_radix_sort.h" -#include "radix_sort/shaders/push.h" +#include "radix_sort/radix_sort_u64.h" #include "bvh/build_interface.h" #include "bvh/bvh.h" @@ -21,30 +17,6 @@ #include "vk_acceleration_structure.h" #include "vk_common_entrypoints.h" -static const uint32_t leaf_spv[] = { -#include "bvh/leaf.spv.h" -}; - -static const uint32_t leaf_always_active_spv[] = { -#include "bvh/leaf_always_active.spv.h" -}; - -static const uint32_t morton_spv[] = { -#include "bvh/morton.spv.h" -}; - -static const uint32_t lbvh_main_spv[] = { -#include "bvh/lbvh_main.spv.h" -}; - -static const uint32_t lbvh_generate_ir_spv[] = { -#include "bvh/lbvh_generate_ir.spv.h" -}; - -static const uint32_t ploc_spv[] = { -#include "bvh/ploc_internal.spv.h" -}; - static const uint32_t copy_spv[] = { #include "bvh/copy.spv.h" }; @@ -65,21 +37,6 @@ static const uint32_t update_spv[] = { #include "bvh/update.spv.h" }; -#define KEY_ID_PAIR_SIZE 8 -#define MORTON_BIT_SIZE 24 - -enum internal_build_type { - INTERNAL_BUILD_TYPE_LBVH, - INTERNAL_BUILD_TYPE_PLOC, - INTERNAL_BUILD_TYPE_UPDATE, -}; - -struct build_config { - enum internal_build_type internal_type; - bool compact; - bool updateable; -}; - struct acceleration_structure_layout { uint32_t geometry_info_offset; uint32_t bvh_offset; @@ -89,71 +46,23 @@ struct acceleration_structure_layout { }; struct scratch_layout { - uint32_t size; uint32_t update_size; - uint32_t header_offset; - - /* Used for UPDATE only. 
*/ - uint32_t internal_ready_count_offset; - - /* Used for BUILD only. */ - - uint32_t sort_buffer_offset[2]; - uint32_t sort_internal_offset; - - uint32_t ploc_prefix_sum_partition_offset; - uint32_t lbvh_node_offset; - - uint32_t ir_offset; - uint32_t internal_node_offset; }; -static struct build_config -build_config(uint32_t leaf_count, const VkAccelerationStructureBuildGeometryInfoKHR *build_info) -{ - struct build_config config = {0}; - - if (leaf_count <= 4) - config.internal_type = INTERNAL_BUILD_TYPE_LBVH; - else if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) - config.internal_type = INTERNAL_BUILD_TYPE_PLOC; - else if (!(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_BUILD_BIT_KHR) && - !(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR)) - config.internal_type = INTERNAL_BUILD_TYPE_PLOC; - else - config.internal_type = INTERNAL_BUILD_TYPE_LBVH; - - if (build_info->mode == VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR && - build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) - config.internal_type = INTERNAL_BUILD_TYPE_UPDATE; - - if ((build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR) && - build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) - config.updateable = true; - - if (build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) - config.compact = true; - - return config; -} +enum radv_encode_key_bits { + RADV_ENCODE_KEY_COMPACT = 1, +}; static void -get_build_layout(struct radv_device *device, uint32_t leaf_count, - const VkAccelerationStructureBuildGeometryInfoKHR *build_info, - struct acceleration_structure_layout *accel_struct, struct scratch_layout *scratch) +radv_get_acceleration_structure_layout(struct radv_device *device, uint32_t leaf_count, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + struct acceleration_structure_layout *accel_struct) { uint32_t internal_count = MAX2(leaf_count, 2) - 1; - VkGeometryTypeKHR geometry_type = VK_GEOMETRY_TYPE_TRIANGLES_KHR; - - if (build_info->geometryCount) { - if (build_info->pGeometries) - geometry_type = build_info->pGeometries[0].geometryType; - else - geometry_type = build_info->ppGeometries[0]->geometryType; - } + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); uint32_t bvh_leaf_size; switch (geometry_type) { @@ -170,92 +79,52 @@ get_build_layout(struct radv_device *device, uint32_t leaf_count, unreachable("Unknown VkGeometryTypeKHR"); } - if (accel_struct) { - uint64_t bvh_size = bvh_leaf_size * leaf_count + sizeof(struct radv_bvh_box32_node) * internal_count; - uint32_t offset = 0; - offset += sizeof(struct radv_accel_struct_header); + uint64_t bvh_size = bvh_leaf_size * leaf_count + sizeof(struct radv_bvh_box32_node) * internal_count; + uint32_t offset = 0; + offset += sizeof(struct radv_accel_struct_header); - if (device->rra_trace.accel_structs) { - accel_struct->geometry_info_offset = offset; - offset += sizeof(struct radv_accel_struct_geometry_info) * build_info->geometryCount; - } - /* Parent links, which have to go directly before bvh_offset as we index them using negative - * offsets from there. */ - offset += bvh_size / 64 * 4; - - /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. 
*/ - offset = ALIGN(offset, 64); - accel_struct->bvh_offset = offset; - - /* root node */ - offset += sizeof(struct radv_bvh_box32_node); - - accel_struct->leaf_nodes_offset = offset; - offset += bvh_leaf_size * leaf_count; - - accel_struct->internal_nodes_offset = offset; - /* Factor out the root node. */ - offset += sizeof(struct radv_bvh_box32_node) * (internal_count - 1); - - accel_struct->size = offset; + if (device->rra_trace.accel_structs) { + accel_struct->geometry_info_offset = offset; + offset += sizeof(struct radv_accel_struct_geometry_info) * build_info->geometryCount; } + /* Parent links, which have to go directly before bvh_offset as we index them using negative + * offsets from there. */ + offset += bvh_size / 64 * 4; - if (scratch) { - radix_sort_vk_memory_requirements_t requirements = { - 0, - }; - if (radv_device_init_accel_struct_build_state(device) == VK_SUCCESS) - radix_sort_vk_get_memory_requirements(device->meta_state.accel_struct_build.radix_sort, leaf_count, - &requirements); + /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. */ + offset = ALIGN(offset, 64); + accel_struct->bvh_offset = offset; - uint32_t offset = 0; + /* root node */ + offset += sizeof(struct radv_bvh_box32_node); - uint32_t ploc_scratch_space = 0; - uint32_t lbvh_node_space = 0; + accel_struct->leaf_nodes_offset = offset; + offset += bvh_leaf_size * leaf_count; - struct build_config config = build_config(leaf_count, build_info); + accel_struct->internal_nodes_offset = offset; + /* Factor out the root node. */ + offset += sizeof(struct radv_bvh_box32_node) * (internal_count - 1); - if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) - ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition); - else - lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count; + accel_struct->size = offset; +} - scratch->header_offset = offset; - offset += sizeof(struct radv_ir_header); +static void +radv_get_scratch_layout(struct radv_device *device, uint32_t leaf_count, struct scratch_layout *scratch) +{ + uint32_t internal_count = MAX2(leaf_count, 2) - 1; - scratch->sort_buffer_offset[0] = offset; - offset += requirements.keyvals_size; + uint32_t offset = 0; - scratch->sort_buffer_offset[1] = offset; - offset += requirements.keyvals_size; + scratch->header_offset = offset; + offset += sizeof(struct vk_ir_header); - scratch->sort_internal_offset = offset; - /* Internal sorting data is not needed when PLOC/LBVH are invoked, - * save space by aliasing them */ - scratch->ploc_prefix_sum_partition_offset = offset; - scratch->lbvh_node_offset = offset; - offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space); + uint32_t update_offset = 0; - scratch->ir_offset = offset; - offset += sizeof(struct radv_ir_node) * leaf_count; + update_offset += sizeof(vk_aabb) * leaf_count; + scratch->internal_ready_count_offset = update_offset; - scratch->internal_node_offset = offset; - offset += sizeof(struct radv_ir_box_node) * internal_count; - - scratch->size = offset; - - if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) { - uint32_t update_offset = 0; - - update_offset += sizeof(radv_aabb) * leaf_count; - scratch->internal_ready_count_offset = update_offset; - - update_offset += sizeof(uint32_t) * internal_count; - scratch->update_size = update_offset; - } else { - scratch->update_size = offset; - } - } + update_offset += sizeof(uint32_t) * internal_count; + scratch->update_size = update_offset; } 
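For reference, the offset arithmetic in the two layout helpers above can be reproduced standalone. The sketch below is illustrative only, not the driver code: it assumes 64-byte triangle leaf nodes, a 128-byte radv_accel_struct_header and 24-byte vk_aabb entries, uses the 128-byte box32 node size asserted in the next hunk, and omits the RRA geometry-info block.

#include <stdint.h>
#include <stdio.h>

#define ALIGN_POT(v, a) (((v) + (a) - 1) & ~((a) - 1))

/* Illustrative sizes; the real values come from the radv_bvh and vk_ir structs. */
#define EXAMPLE_HEADER_SIZE 128u /* assumed sizeof(struct radv_accel_struct_header) */
#define EXAMPLE_LEAF_SIZE 64u    /* assumed triangle leaf node size */
#define EXAMPLE_BOX32_SIZE 128u  /* sizeof(struct radv_bvh_box32_node) */
#define EXAMPLE_AABB_SIZE 24u    /* assumed sizeof(vk_aabb): two vec3 bounds */

static uint32_t
example_accel_struct_size(uint32_t leaf_count)
{
   uint32_t internal_count = (leaf_count > 2 ? leaf_count : 2) - 1; /* MAX2(leaf_count, 2) - 1 */
   uint64_t bvh_size = (uint64_t)EXAMPLE_LEAF_SIZE * leaf_count + (uint64_t)EXAMPLE_BOX32_SIZE * internal_count;

   uint32_t offset = EXAMPLE_HEADER_SIZE;
   offset += bvh_size / 64 * 4;              /* one 32-bit parent link per 64 bytes of BVH */
   offset = ALIGN_POT(offset, 64);           /* bvh_offset: RT nodes need 64-byte alignment */
   offset += EXAMPLE_BOX32_SIZE;             /* root node */
   offset += EXAMPLE_LEAF_SIZE * leaf_count; /* leaf nodes */
   offset += EXAMPLE_BOX32_SIZE * (internal_count - 1); /* remaining internal nodes */
   return offset;
}

static uint32_t
example_update_scratch_size(uint32_t leaf_count)
{
   uint32_t internal_count = (leaf_count > 2 ? leaf_count : 2) - 1;
   /* Per-leaf bounds followed by one ready-count per internal node. */
   return EXAMPLE_AABB_SIZE * leaf_count + (uint32_t)sizeof(uint32_t) * internal_count;
}

int
main(void)
{
   /* e.g. a BLAS with 1000 triangle primitives */
   printf("accel struct size:   %u bytes\n", example_accel_struct_size(1000));
   printf("update scratch size: %u bytes\n", example_update_scratch_size(1000));
   return 0;
}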
VKAPI_ATTR void VKAPI_CALL @@ -272,17 +141,11 @@ radv_GetAccelerationStructureBuildSizesKHR(VkDevice _device, VkAccelerationStruc STATIC_ASSERT(sizeof(struct radv_bvh_box16_node) == 64); STATIC_ASSERT(sizeof(struct radv_bvh_box32_node) == 128); - uint32_t leaf_count = 0; - for (uint32_t i = 0; i < pBuildInfo->geometryCount; i++) - leaf_count += pMaxPrimitiveCounts[i]; + if (radv_device_init_accel_struct_build_state(device) != VK_SUCCESS) + return; - struct acceleration_structure_layout accel_struct; - struct scratch_layout scratch; - get_build_layout(device, leaf_count, pBuildInfo, &accel_struct, &scratch); - - pSizeInfo->accelerationStructureSize = accel_struct.size; - pSizeInfo->updateScratchSize = scratch.update_size; - pSizeInfo->buildScratchSize = scratch.size; + vk_get_as_build_sizes(_device, buildType, pBuildInfo, pMaxPrimitiveCounts, pSizeInfo, + &device->meta_state.accel_struct_build.build_args); } VKAPI_ATTR VkResult VKAPI_CALL @@ -319,24 +182,13 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device) struct vk_device_dispatch_table *dispatch = &device->vk.dispatch_table; dispatch->DestroyPipeline(_device, state->accel_struct_build.copy_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.ploc_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.lbvh_generate_ir_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.lbvh_main_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.leaf_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.leaf_updateable_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_compact_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.header_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.morton_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.update_pipeline, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.copy_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.ploc_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.lbvh_generate_ir_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.lbvh_main_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.leaf_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.encode_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.header_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.morton_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.update_p_layout, &state->alloc); if (state->accel_struct_build.radix_sort) @@ -492,7 +344,7 @@ radv_device_init_null_accel_struct(struct radv_device *device) }; for (uint32_t child = 0; child < 4; child++) { - root.coords[child] = (radv_aabb){ + root.coords[child] = (vk_aabb){ .min.x = NAN, .min.y = NAN, .min.z = NAN, @@ -524,6 +376,328 @@ radv_device_init_null_accel_struct(struct radv_device *device) return VK_SUCCESS; } +static VkDeviceSize +radv_get_as_size(VkDevice _device, const 
VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, uint32_t leaf_count) +{ + VK_FROM_HANDLE(radv_device, device, _device); + + struct acceleration_structure_layout accel_struct; + radv_get_acceleration_structure_layout(device, leaf_count, pBuildInfo, &accel_struct); + return accel_struct.size; +} + +static VkDeviceSize +radv_get_update_scratch_size(struct vk_device *vk_device, uint32_t leaf_count) +{ + struct radv_device *device = container_of(vk_device, struct radv_device, vk); + + struct scratch_layout scratch; + radv_get_scratch_layout(device, leaf_count, &scratch); + return scratch.update_size; +} + +static uint32_t +radv_get_encode_key(VkAccelerationStructureTypeKHR type, VkBuildAccelerationStructureFlagBitsKHR flags) +{ + if (flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) + return RADV_ENCODE_KEY_COMPACT; + + return 0; +} + +static VkResult +radv_encode_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + bool compact = key & RADV_ENCODE_KEY_COMPACT; + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + compact ? device->meta_state.accel_struct_build.encode_compact_pipeline + : device->meta_state.accel_struct_build.encode_pipeline); + + return VK_SUCCESS; +} + +static void +radv_encode_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, uint32_t leaf_count, uint32_t key, + struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + if (key & RADV_ENCODE_KEY_COMPACT) { + uint32_t dst_offset = layout.internal_nodes_offset - layout.bvh_offset; + radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, dst_node_offset), + &dst_offset, sizeof(uint32_t)); + } + + const struct encode_args args = { + .intermediate_bvh = intermediate_as_addr, + .output_bvh = vk_acceleration_structure_get_va(dst) + layout.bvh_offset, + .header = intermediate_header_addr, + .output_bvh_offset = layout.bvh_offset, + .leaf_node_count = leaf_count, + .geometry_type = vk_get_as_geometry_type(build_info), + }; + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); + + struct radv_dispatch_info dispatch = { + .unaligned = true, + .ordered = true, + .blocks = {leaf_count, 1, 1}, + }; + + radv_compute_dispatch(cmd_buffer, &dispatch); +} + +static VkResult +radv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + if (!(key & RADV_ENCODE_KEY_COMPACT)) + return VK_SUCCESS; + + /* Wait for encoding to finish. 
*/ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_WRITE_BIT, NULL, NULL) | + radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, NULL, NULL); + + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.accel_struct_build.header_pipeline); + + return VK_SUCCESS; +} + +static void +radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, VkDeviceAddress intermediate_header_addr, uint32_t leaf_count, + uint32_t key, struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + size_t base = offsetof(struct radv_accel_struct_header, compacted_size); + + uint64_t instance_count = build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR ? leaf_count : 0; + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + if (key & RADV_ENCODE_KEY_COMPACT) { + base = offsetof(struct radv_accel_struct_header, geometry_count); + + struct header_args args = { + .src = intermediate_header_addr, + .dst = vk_acceleration_structure_get_va(dst), + .bvh_offset = layout.bvh_offset, + .instance_count = instance_count, + }; + + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.header_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); + + radv_unaligned_dispatch(cmd_buffer, 1, 1, 1); + } + + struct radv_accel_struct_header header; + + header.instance_offset = layout.bvh_offset + sizeof(struct radv_bvh_box32_node); + header.instance_count = instance_count; + header.compacted_size = layout.size; + + header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64); + header.copy_dispatch_size[1] = 1; + header.copy_dispatch_size[2] = 1; + + header.serialization_size = + header.compacted_size + + align(sizeof(struct radv_accel_struct_serialization_header) + sizeof(uint64_t) * header.instance_count, 128); + + header.size = header.serialization_size - sizeof(struct radv_accel_struct_serialization_header) - + sizeof(uint64_t) * header.instance_count; + + header.build_flags = build_info->flags; + header.geometry_count = build_info->geometryCount; + + radv_update_buffer_cp(cmd_buffer, vk_acceleration_structure_get_va(dst) + base, (const char *)&header + base, + sizeof(header) - base); + + if (device->rra_trace.accel_structs) { + uint64_t geometry_infos_size = build_info->geometryCount * sizeof(struct radv_accel_struct_geometry_info); + + struct radv_accel_struct_geometry_info *geometry_infos = malloc(geometry_infos_size); + if (!geometry_infos) + return; + + for (uint32_t i = 0; i < build_info->geometryCount; i++) { + const VkAccelerationStructureGeometryKHR *geometry = + build_info->pGeometries ? 
&build_info->pGeometries[i] : build_info->ppGeometries[i]; + geometry_infos[i].type = geometry->geometryType; + geometry_infos[i].flags = geometry->flags; + geometry_infos[i].primitive_count = build_range_infos[i].primitiveCount; + } + + radv_CmdUpdateBuffer(commandBuffer, dst->buffer, dst->offset + layout.geometry_info_offset, geometry_infos_size, + geometry_infos); + free(geometry_infos); + } +} + +static void +radv_init_update_scratch(VkCommandBuffer commandBuffer, VkDeviceAddress scratch, uint32_t leaf_count, + struct vk_acceleration_structure *src_as, struct vk_acceleration_structure *dst_as) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + struct scratch_layout layout; + radv_get_scratch_layout(device, leaf_count, &layout); + + /* Prepare ready counts for internal nodes */ + radv_fill_buffer(cmd_buffer, NULL, NULL, scratch + layout.internal_ready_count_offset, + layout.update_size - layout.internal_ready_count_offset, 0x0); +} + +static void +radv_update_bind_pipeline(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + /* Wait for update scratch initialization to finish. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_WRITE_BIT, NULL, NULL) | + radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, NULL, NULL); + + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.accel_struct_build.update_pipeline); +} + +static uint32_t +pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) +{ + uint32_t geometry_id_and_flags = geometry_id; + if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) + geometry_id_and_flags |= RADV_GEOMETRY_OPAQUE; + + return geometry_id_and_flags; +} + +static void +radv_update_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, uint32_t leaf_count, + struct vk_acceleration_structure *dst, struct vk_acceleration_structure *src) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + if (src != dst) { + VK_FROM_HANDLE(radv_buffer, src_as_buffer, src->buffer); + VK_FROM_HANDLE(radv_buffer, dst_as_buffer, dst->buffer); + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + /* Copy header/metadata */ + radv_copy_buffer(cmd_buffer, src_as_buffer->bo, dst_as_buffer->bo, src_as_buffer->offset + src->offset, + dst_as_buffer->offset + dst->offset, layout.bvh_offset); + } + + struct scratch_layout layout; + radv_get_scratch_layout(device, leaf_count, &layout); + + struct update_args update_consts = { + .src = vk_acceleration_structure_get_va(src), + .dst = vk_acceleration_structure_get_va(dst), + .leaf_bounds = build_info->scratchData.deviceAddress, + .internal_ready_count = build_info->scratchData.deviceAddress + layout.internal_ready_count_offset, + .leaf_node_count = leaf_count, + }; + + uint32_t first_id = 0; + for (uint32_t i = 0; i < build_info->geometryCount; i++) { + const VkAccelerationStructureGeometryKHR *geom = + build_info->pGeometries ?
&build_info->pGeometries[i] : build_info->ppGeometries[i]; + + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &build_range_infos[i]; + + update_consts.geom_data = vk_fill_geometry_data(build_info->type, first_id, i, geom, build_range_info); + + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.update_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(update_consts), &update_consts); + radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); + + first_id += build_range_info->primitiveCount; + } +} + +static const struct radix_sort_vk_target_config radix_sort_config = { + .keyval_dwords = 2, + .fill.workgroup_size_log2 = 7, + .fill.block_rows = 8, + .histogram.workgroup_size_log2 = 8, + .histogram.subgroup_size_log2 = 6, + .histogram.block_rows = 14, + .prefix.workgroup_size_log2 = 8, + .prefix.subgroup_size_log2 = 6, + .scatter.workgroup_size_log2 = 8, + .scatter.subgroup_size_log2 = 6, + .scatter.block_rows = 14, +}; + +static const struct vk_acceleration_structure_build_ops build_ops = { + .get_as_size = radv_get_as_size, + .get_update_scratch_size = radv_get_update_scratch_size, + .get_encode_key[0] = radv_get_encode_key, + .get_encode_key[1] = radv_get_encode_key, + .encode_bind_pipeline[0] = radv_encode_bind_pipeline, + .encode_bind_pipeline[1] = radv_init_header_bind_pipeline, + .encode_as[0] = radv_encode_as, + .encode_as[1] = radv_init_header, + .init_update_scratch = radv_init_update_scratch, + .update_bind_pipeline[0] = radv_update_bind_pipeline, + .update_as[0] = radv_update_as, +}; + +static void +radv_write_buffer_cp(VkCommandBuffer commandBuffer, VkDeviceAddress addr, void *data, uint32_t size) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_update_buffer_cp(cmd_buffer, addr, data, size); +} + +static void +radv_flush_buffer_write_cp(VkCommandBuffer commandBuffer) +{ +} + +static void +radv_cmd_dispatch_unaligned(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_unaligned_dispatch(cmd_buffer, x, y, z); +} + +static void +radv_cmd_fill_buffer_addr(VkCommandBuffer commandBuffer, VkDeviceAddress addr, VkDeviceSize size, uint32_t data) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_fill_buffer(cmd_buffer, NULL, NULL, addr, size, data); +} + VkResult radv_device_init_accel_struct_build_state(struct radv_device *device) { @@ -533,38 +707,6 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) if (device->meta_state.accel_struct_build.radix_sort) goto exit; - result = create_build_pipeline_spv(device, leaf_always_active_spv, sizeof(leaf_always_active_spv), - sizeof(struct leaf_args), - &device->meta_state.accel_struct_build.leaf_updateable_pipeline, - &device->meta_state.accel_struct_build.leaf_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, leaf_spv, sizeof(leaf_spv), sizeof(struct leaf_args), - &device->meta_state.accel_struct_build.leaf_pipeline, - &device->meta_state.accel_struct_build.leaf_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, lbvh_main_spv, sizeof(lbvh_main_spv), sizeof(struct lbvh_main_args), - &device->meta_state.accel_struct_build.lbvh_main_pipeline, - &device->meta_state.accel_struct_build.lbvh_main_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, lbvh_generate_ir_spv, 
sizeof(lbvh_generate_ir_spv), - sizeof(struct lbvh_generate_ir_args), - &device->meta_state.accel_struct_build.lbvh_generate_ir_pipeline, - &device->meta_state.accel_struct_build.lbvh_generate_ir_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, ploc_spv, sizeof(ploc_spv), sizeof(struct ploc_args), - &device->meta_state.accel_struct_build.ploc_pipeline, - &device->meta_state.accel_struct_build.ploc_p_layout); - if (result != VK_SUCCESS) - goto exit; - result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args), &device->meta_state.accel_struct_build.encode_pipeline, &device->meta_state.accel_struct_build.encode_p_layout); @@ -584,20 +726,33 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) if (result != VK_SUCCESS) goto exit; - result = create_build_pipeline_spv(device, morton_spv, sizeof(morton_spv), sizeof(struct morton_args), - &device->meta_state.accel_struct_build.morton_pipeline, - &device->meta_state.accel_struct_build.morton_p_layout); - if (result != VK_SUCCESS) - goto exit; - result = create_build_pipeline_spv(device, update_spv, sizeof(update_spv), sizeof(struct update_args), &device->meta_state.accel_struct_build.update_pipeline, &device->meta_state.accel_struct_build.update_p_layout); if (result != VK_SUCCESS) goto exit; - device->meta_state.accel_struct_build.radix_sort = - radv_create_radix_sort_u64(radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache); + device->meta_state.accel_struct_build.radix_sort = vk_create_radix_sort_u64( + radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache, radix_sort_config); + + result = vk_meta_device_init(&device->vk, &device->meta_state.device); + if (result != VK_SUCCESS) + goto exit; + + device->meta_state.device.pipeline_cache = device->meta_state.cache; + + device->vk.as_build_ops = &build_ops; + device->vk.write_buffer_cp = radv_write_buffer_cp; + device->vk.flush_buffer_write_cp = radv_flush_buffer_write_cp; + device->vk.cmd_dispatch_unaligned = radv_cmd_dispatch_unaligned; + device->vk.cmd_fill_buffer_addr = radv_cmd_fill_buffer_addr; + + struct vk_acceleration_structure_build_args *build_args = &device->meta_state.accel_struct_build.build_args; + build_args->subgroup_size = 64; + build_args->bvh_bounds_offset = offsetof(struct radv_accel_struct_header, aabb); + build_args->emit_markers = device->sqtt.bo; + build_args->radix_sort = device->meta_state.accel_struct_build.radix_sort; + exit: mtx_unlock(&device->meta_state.mtx); return result; @@ -616,727 +771,6 @@ radv_device_init_accel_struct_copy_state(struct radv_device *device) return result; } -struct bvh_state { - uint32_t node_count; - uint32_t scratch_offset; - - uint32_t leaf_node_count; - uint32_t internal_node_count; - uint32_t leaf_node_size; - - struct acceleration_structure_layout accel_struct; - struct scratch_layout scratch; - struct build_config config; - - /* Radix sort state */ - uint32_t scatter_blocks; - uint32_t count_ru_scatter; - uint32_t histo_blocks; - uint32_t count_ru_histo; - struct rs_push_scatter push_scatter; -}; - -struct radv_bvh_batch_state { - bool any_compact; - bool any_non_compact; - bool any_ploc; - bool any_lbvh; - bool any_updateable; - bool any_non_updateable; - bool any_update; -}; - -static uint32_t -pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) -{ - uint32_t geometry_id_and_flags = geometry_id; - if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) - 
geometry_id_and_flags |= RADV_GEOMETRY_OPAQUE; - - return geometry_id_and_flags; -} - -static struct radv_bvh_geometry_data -fill_geometry_data(VkAccelerationStructureTypeKHR type, struct bvh_state *bvh_state, uint32_t geom_index, - const VkAccelerationStructureGeometryKHR *geometry, - const VkAccelerationStructureBuildRangeInfoKHR *build_range_info) -{ - struct radv_bvh_geometry_data data = { - .first_id = bvh_state->node_count, - .geometry_id = pack_geometry_id_and_flags(geom_index, geometry->flags), - .geometry_type = geometry->geometryType, - }; - - switch (geometry->geometryType) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); - - data.data = geometry->geometry.triangles.vertexData.deviceAddress + - build_range_info->firstVertex * geometry->geometry.triangles.vertexStride; - data.indices = geometry->geometry.triangles.indexData.deviceAddress; - - if (geometry->geometry.triangles.indexType == VK_INDEX_TYPE_NONE_KHR) - data.data += build_range_info->primitiveOffset; - else - data.indices += build_range_info->primitiveOffset; - - data.transform = geometry->geometry.triangles.transformData.deviceAddress; - if (data.transform) - data.transform += build_range_info->transformOffset; - - data.stride = geometry->geometry.triangles.vertexStride; - data.vertex_format = geometry->geometry.triangles.vertexFormat; - data.index_format = geometry->geometry.triangles.indexType; - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); - - data.data = geometry->geometry.aabbs.data.deviceAddress + build_range_info->primitiveOffset; - data.stride = geometry->geometry.aabbs.stride; - break; - case VK_GEOMETRY_TYPE_INSTANCES_KHR: - assert(type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR); - - data.data = geometry->geometry.instances.data.deviceAddress + build_range_info->primitiveOffset; - - if (geometry->geometry.instances.arrayOfPointers) - data.stride = 8; - else - data.stride = sizeof(VkAccelerationStructureInstanceKHR); - break; - default: - unreachable("Unknown geometryType"); - } - - return data; -} - -static void -build_leaves(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, struct bvh_state *bvh_states, - bool updateable) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "leaves"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - updateable ? 
device->meta_state.accel_struct_build.leaf_updateable_pipeline - : device->meta_state.accel_struct_build.leaf_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - if (bvh_states[i].config.updateable != updateable) - continue; - - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - - struct leaf_args leaf_consts = { - .ir = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .bvh = vk_acceleration_structure_get_va(accel_struct) + bvh_states[i].accel_struct.leaf_nodes_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], - }; - - for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geom = - pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; - - const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; - - leaf_consts.geom_data = fill_geometry_data(pInfos[i].type, &bvh_states[i], j, geom, build_range_info); - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.leaf_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(leaf_consts), &leaf_consts); - radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); - - bvh_states[i].leaf_node_count += build_range_info->primitiveCount; - bvh_states[i].node_count += build_range_info->primitiveCount; - } - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -morton_generate(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "morton"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.morton_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - const struct morton_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.morton_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - radv_unaligned_dispatch(cmd_buffer, bvh_states[i].node_count, 1, 1); - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); - - cmd_buffer->state.flush_bits |= flush_bits; -} - -static void -morton_sort(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) -{ - /* Copyright 2019 The Fuchsia Authors. 
*/ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "sort"); - - radix_sort_vk_t *rs = device->meta_state.accel_struct_build.radix_sort; - - /* - * OVERVIEW - * - * 1. Pad the keyvals in `scatter_even`. - * 2. Zero the `histograms` and `partitions`. - * --- BARRIER --- - * 3. HISTOGRAM is dispatched before PREFIX. - * --- BARRIER --- - * 4. PREFIX is dispatched before the first SCATTER. - * --- BARRIER --- - * 5. One or more SCATTER dispatches. - * - * Note that the `partitions` buffer can be zeroed anytime before the first - * scatter. - */ - - /* How many passes? */ - uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t); - uint32_t keyval_bits = keyval_bytes * 8; - uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits); - uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2; - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].node_count) - bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1]; - else - bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[0]; - } - - /* - * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS - * - * Pad fractional blocks with max-valued keyvals. - * - * Zero the histograms and partitions buffer. - * - * This assumes the partitions follow the histograms. - */ - - /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */ - - /* How many scatter blocks? */ - uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2; - uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows; - - /* - * How many histogram blocks? - * - * Note that it's OK to have more max-valued digits counted by the histogram - * than sorted by the scatters because the sort is stable. - */ - uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2; - uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows; - - uint32_t pass_idx = (keyval_bytes - passes); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - bvh_states[i].scatter_blocks = (bvh_states[i].node_count + scatter_block_kvs - 1) / scatter_block_kvs; - bvh_states[i].count_ru_scatter = bvh_states[i].scatter_blocks * scatter_block_kvs; - - bvh_states[i].histo_blocks = (bvh_states[i].count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs; - bvh_states[i].count_ru_histo = bvh_states[i].histo_blocks * histo_block_kvs; - - /* Fill with max values */ - if (bvh_states[i].count_ru_histo > bvh_states[i].node_count) { - radv_fill_buffer(cmd_buffer, NULL, NULL, keyvals_even_addr + bvh_states[i].node_count * keyval_bytes, - (bvh_states[i].count_ru_histo - bvh_states[i].node_count) * keyval_bytes, 0xFFFFFFFF); - } - - /* - * Zero histograms and invalidate partitions. - * - * Note that the partition invalidation only needs to be performed once - * because the even/odd scatter dispatches rely on the the previous pass to - * leave the partitions in an invalid state. - * - * Note that the last workgroup doesn't read/write a partition so it doesn't - * need to be initialized. 
- */ - uint32_t histo_partition_count = passes + bvh_states[i].scatter_blocks - 1; - - uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); - - radv_fill_buffer(cmd_buffer, NULL, NULL, internal_addr + rs->internal.histograms.offset + fill_base, - histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)), 0); - } - - /* - * Pipeline: HISTOGRAM - * - * TODO(allanmac): All subgroups should try to process approximately the same - * number of blocks in order to minimize tail effects. This was implemented - * and reverted but should be reimplemented and benchmarked later. - */ - vk_barrier_transfer_w_to_compute_r(commandBuffer); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - rs->pipelines.named.histogram); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - /* Dispatch histogram */ - struct rs_push_histogram push_histogram = { - .devaddr_histograms = internal_addr + rs->internal.histograms.offset, - .devaddr_keyvals = keyvals_even_addr, - .passes = passes, - }; - - vk_common_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(push_histogram), &push_histogram); - - vk_common_CmdDispatch(commandBuffer, bvh_states[i].histo_blocks, 1, 1); - } - - /* - * Pipeline: PREFIX - * - * Launch one workgroup per pass. - */ - vk_barrier_compute_w_to_compute_r(commandBuffer); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - struct rs_push_prefix push_prefix = { - .devaddr_histograms = internal_addr + rs->internal.histograms.offset, - }; - - vk_common_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(push_prefix), &push_prefix); - - vk_common_CmdDispatch(commandBuffer, passes, 1, 1); - } - - /* Pipeline: SCATTER */ - vk_barrier_compute_w_to_compute_r(commandBuffer); - - uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); - - for (uint32_t i = 0; i < infoCount; i++) { - uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; - uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - bvh_states[i].push_scatter = (struct rs_push_scatter){ - .devaddr_keyvals_even = keyvals_even_addr, - .devaddr_keyvals_odd = keyvals_odd_addr, - .devaddr_partitions = internal_addr + rs->internal.partitions.offset, - .devaddr_histograms = internal_addr + rs->internal.histograms.offset + histogram_offset, - }; - } - - bool is_even = true; - - while (true) { - uint32_t pass_dword = pass_idx / 4; - - /* Bind new pipeline */ - VkPipeline p = - is_even ? 
rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); - - /* Update push constants that changed */ - VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even - : rs->pipeline_layouts.named.scatter[pass_dword].odd; - - for (uint32_t i = 0; i < infoCount; i++) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - bvh_states[i].push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; - - vk_common_CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct rs_push_scatter), - &bvh_states[i].push_scatter); - - vk_common_CmdDispatch(commandBuffer, bvh_states[i].scatter_blocks, 1, 1); - - bvh_states[i].push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); - } - - /* Continue? */ - if (++pass_idx >= keyval_bytes) - break; - - vk_barrier_compute_w_to_compute_r(commandBuffer); - - is_even ^= true; - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); - - cmd_buffer->state.flush_bits |= flush_bits; -} - -static void -lbvh_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "lbvh"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.lbvh_main_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) - continue; - - uint32_t src_scratch_offset = bvh_states[i].scratch_offset; - uint32_t internal_node_count = MAX2(bvh_states[i].node_count, 2) - 1; - - const struct lbvh_main_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .src_ids = pInfos[i].scratchData.deviceAddress + src_scratch_offset, - .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, - .id_count = bvh_states[i].node_count, - .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.lbvh_main_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - radv_unaligned_dispatch(cmd_buffer, internal_node_count, 1, 1); - bvh_states[i].node_count = internal_node_count; - bvh_states[i].internal_node_count = internal_node_count; - } - - cmd_buffer->state.flush_bits |= flush_bits; - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.lbvh_generate_ir_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) - continue; - - const struct lbvh_generate_ir_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .internal_node_base = bvh_states[i].scratch.internal_node_offset - 
bvh_states[i].scratch.ir_offset, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.lbvh_generate_ir_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - radv_unaligned_dispatch(cmd_buffer, bvh_states[i].internal_node_count, 1, 1); - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -ploc_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "ploc"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.ploc_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_PLOC) - continue; - - uint32_t src_scratch_offset = bvh_states[i].scratch_offset; - uint32_t dst_scratch_offset = (src_scratch_offset == bvh_states[i].scratch.sort_buffer_offset[0]) - ? bvh_states[i].scratch.sort_buffer_offset[1] - : bvh_states[i].scratch.sort_buffer_offset[0]; - - const struct ploc_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .ids_0 = pInfos[i].scratchData.deviceAddress + src_scratch_offset, - .ids_1 = pInfos[i].scratchData.deviceAddress + dst_scratch_offset, - .prefix_scan_partitions = - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ploc_prefix_sum_partition_offset, - .internal_node_offset = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.ploc_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - vk_common_CmdDispatch(commandBuffer, MAX2(DIV_ROUND_UP(bvh_states[i].node_count, PLOC_WORKGROUP_SIZE), 1), 1, 1); - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -encode_nodes(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, bool compact) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "encode"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - compact ? device->meta_state.accel_struct_build.encode_compact_pipeline - : device->meta_state.accel_struct_build.encode_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (compact != bvh_states[i].config.compact) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - - VkGeometryTypeKHR geometry_type = VK_GEOMETRY_TYPE_TRIANGLES_KHR; - - /* If the geometry count is 0, then the size does not matter - * because it will be multiplied with 0. - */ - if (pInfos[i].geometryCount) - geometry_type = - pInfos[i].pGeometries ? 
pInfos[i].pGeometries[0].geometryType : pInfos[i].ppGeometries[0]->geometryType; - - if (bvh_states[i].config.compact) { - uint32_t dst_offset = bvh_states[i].accel_struct.internal_nodes_offset - bvh_states[i].accel_struct.bvh_offset; - radv_update_buffer_cp(cmd_buffer, - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset + - offsetof(struct radv_ir_header, dst_node_offset), - &dst_offset, sizeof(uint32_t)); - } - - const struct encode_args args = { - .intermediate_bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .output_bvh = vk_acceleration_structure_get_va(accel_struct) + bvh_states[i].accel_struct.bvh_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .output_bvh_offset = bvh_states[i].accel_struct.bvh_offset, - .leaf_node_count = bvh_states[i].leaf_node_count, - .geometry_type = geometry_type, - }; - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); - - struct radv_dispatch_info dispatch = { - .unaligned = true, - .ordered = true, - .va = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset + - offsetof(struct radv_ir_header, ir_internal_node_count), - }; - - radv_compute_dispatch(cmd_buffer, &dispatch); - } - /* This is the final access to the leaf nodes, no need to flush */ - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -init_header(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - struct radv_bvh_batch_state *batch_state) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - if (batch_state->any_compact) { - radv_write_user_event_marker(cmd_buffer, UserEventPush, "header"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.header_pipeline); - } - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - size_t base = offsetof(struct radv_accel_struct_header, compacted_size); - - uint64_t instance_count = - pInfos[i].type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR ? 
bvh_states[i].leaf_node_count : 0; - - if (bvh_states[i].config.compact) { - base = offsetof(struct radv_accel_struct_header, geometry_count); - - struct header_args args = { - .src = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .dst = vk_acceleration_structure_get_va(accel_struct), - .bvh_offset = bvh_states[i].accel_struct.bvh_offset, - .instance_count = instance_count, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.header_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); - - radv_unaligned_dispatch(cmd_buffer, 1, 1, 1); - } - - struct radv_accel_struct_header header; - - header.instance_offset = bvh_states[i].accel_struct.bvh_offset + sizeof(struct radv_bvh_box32_node); - header.instance_count = instance_count; - header.compacted_size = bvh_states[i].accel_struct.size; - - header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64); - header.copy_dispatch_size[1] = 1; - header.copy_dispatch_size[2] = 1; - - header.serialization_size = - header.compacted_size + - align(sizeof(struct radv_accel_struct_serialization_header) + sizeof(uint64_t) * header.instance_count, 128); - - header.size = header.serialization_size - sizeof(struct radv_accel_struct_serialization_header) - - sizeof(uint64_t) * header.instance_count; - - header.build_flags = pInfos[i].flags; - header.geometry_count = pInfos[i].geometryCount; - - radv_update_buffer_cp(cmd_buffer, vk_acceleration_structure_get_va(accel_struct) + base, - (const char *)&header + base, sizeof(header) - base); - } - - if (batch_state->any_compact) - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -init_geometry_infos(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos) -{ - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - - uint64_t geometry_infos_size = pInfos[i].geometryCount * sizeof(struct radv_accel_struct_geometry_info); - - struct radv_accel_struct_geometry_info *geometry_infos = malloc(geometry_infos_size); - if (!geometry_infos) - continue; - - for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geometry = - pInfos[i].pGeometries ? 
pInfos[i].pGeometries + j : pInfos[i].ppGeometries[j]; - geometry_infos[j].type = geometry->geometryType; - geometry_infos[j].flags = geometry->flags; - geometry_infos[j].primitive_count = ppBuildRangeInfos[i][j].primitiveCount; - } - - radv_CmdUpdateBuffer(commandBuffer, accel_struct->buffer, - accel_struct->offset + bvh_states[i].accel_struct.geometry_info_offset, geometry_infos_size, - geometry_infos); - - free(geometry_infos); - } -} - -static void -update(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, struct bvh_state *bvh_states) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "update"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.update_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint32_t leaf_node_count = 0; - for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { - leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; - } - - VK_FROM_HANDLE(vk_acceleration_structure, src_bvh, pInfos[i].srcAccelerationStructure); - VK_FROM_HANDLE(vk_acceleration_structure, dst_bvh, pInfos[i].dstAccelerationStructure); - struct update_args update_consts = { - .src = vk_acceleration_structure_get_va(src_bvh), - .dst = vk_acceleration_structure_get_va(dst_bvh), - .leaf_bounds = pInfos[i].scratchData.deviceAddress, - .internal_ready_count = - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.internal_ready_count_offset, - .leaf_node_count = leaf_node_count, - }; - - for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geom = - pInfos[i].pGeometries ? 
&pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; - - const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; - - update_consts.geom_data = fill_geometry_data(pInfos[i].type, &bvh_states[i], j, geom, build_range_info); - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.update_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(update_consts), &update_consts); - radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); - - bvh_states[i].leaf_node_count += build_range_info->primitiveCount; - bvh_states[i].node_count += build_range_info->primitiveCount; - } - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - VKAPI_ATTR void VKAPI_CALL radv_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, @@ -1352,132 +786,14 @@ radv_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t i return; } - enum radv_cmd_flush_bits flush_bits = RADV_CMD_FLAG_CS_PARTIAL_FLUSH | - radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_2_SHADER_WRITE_BIT, NULL, NULL) | - radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_2_SHADER_READ_BIT, NULL, NULL); - radv_meta_save(&saved_state, cmd_buffer, RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); - struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state)); - - radv_describe_begin_accel_struct_build(cmd_buffer, infoCount); - - struct radv_bvh_batch_state batch_state = {0}; - - for (uint32_t i = 0; i < infoCount; ++i) { - uint32_t leaf_node_count = 0; - for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { - leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; - } - - get_build_layout(device, leaf_node_count, pInfos + i, &bvh_states[i].accel_struct, &bvh_states[i].scratch); - - struct build_config config = build_config(leaf_node_count, pInfos + i); - bvh_states[i].config = config; - - if (config.compact) - batch_state.any_compact = true; - else - batch_state.any_non_compact = true; - - if (config.updateable) - batch_state.any_updateable = true; - else - batch_state.any_non_updateable = true; - - if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) { - batch_state.any_ploc = true; - } else if (config.internal_type == INTERNAL_BUILD_TYPE_LBVH) { - batch_state.any_lbvh = true; - } else if (config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) { - batch_state.any_update = true; - } else { - unreachable("Unknown internal_build_type"); - } - - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) { - /* The internal node count is updated in lbvh_build_internal for LBVH - * and from the PLOC shader for PLOC. 
*/ - struct radv_ir_header header = { - .min_bounds = {0x7fffffff, 0x7fffffff, 0x7fffffff}, - .max_bounds = {0x80000000, 0x80000000, 0x80000000}, - .dispatch_size_y = 1, - .dispatch_size_z = 1, - .sync_data = - { - .current_phase_end_counter = TASK_INDEX_INVALID, - /* Will be updated by the first PLOC shader invocation */ - .task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID}, - }, - }; - - radv_update_buffer_cp(cmd_buffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - &header, sizeof(header)); - } else { - /* Prepare ready counts for internal nodes */ - radv_fill_buffer(cmd_buffer, NULL, NULL, - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.internal_ready_count_offset, - bvh_states[i].scratch.update_size - bvh_states[i].scratch.internal_ready_count_offset, 0x0); - if (pInfos[i].srcAccelerationStructure != pInfos[i].dstAccelerationStructure) { - VK_FROM_HANDLE(vk_acceleration_structure, src_as, pInfos[i].srcAccelerationStructure); - VK_FROM_HANDLE(vk_acceleration_structure, dst_as, pInfos[i].dstAccelerationStructure); - - VK_FROM_HANDLE(radv_buffer, src_as_buffer, src_as->buffer); - VK_FROM_HANDLE(radv_buffer, dst_as_buffer, dst_as->buffer); - - /* Copy header/metadata */ - radv_copy_buffer(cmd_buffer, src_as_buffer->bo, dst_as_buffer->bo, src_as_buffer->offset + src_as->offset, - dst_as_buffer->offset + dst_as->offset, bvh_states[i].accel_struct.bvh_offset); - } - } - } cmd_buffer->state.current_event_type = EventInternalUnknown; - if (batch_state.any_lbvh || batch_state.any_ploc) { - if (batch_state.any_non_updateable) - build_leaves(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states, false); - if (batch_state.any_updateable) - build_leaves(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states, true); + vk_cmd_build_acceleration_structures(commandBuffer, &device->vk, &device->meta_state.device, infoCount, pInfos, + ppBuildRangeInfos, &device->meta_state.accel_struct_build.build_args); - cmd_buffer->state.flush_bits |= flush_bits; - - morton_generate(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); - - morton_sort(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); - - cmd_buffer->state.flush_bits |= flush_bits; - - if (batch_state.any_lbvh) - lbvh_build_internal(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); - - if (batch_state.any_ploc) - ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states); - - cmd_buffer->state.flush_bits |= flush_bits; - - if (batch_state.any_non_compact) - encode_nodes(commandBuffer, infoCount, pInfos, bvh_states, false); - - if (batch_state.any_compact) - encode_nodes(commandBuffer, infoCount, pInfos, bvh_states, true); - - cmd_buffer->state.flush_bits |= flush_bits; - } - - init_header(commandBuffer, infoCount, pInfos, bvh_states, &batch_state); - - if (device->rra_trace.accel_structs) - init_geometry_infos(commandBuffer, infoCount, pInfos, bvh_states, ppBuildRangeInfos); - - if (batch_state.any_update) - update(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states); - - radv_describe_end_accel_struct_build(cmd_buffer); - - free(bvh_states); radv_meta_restore(&saved_state, cmd_buffer); } diff --git a/src/amd/vulkan/radv_device.h b/src/amd/vulkan/radv_device.h index e6e432e4771..dbd7f961c38 100644 --- a/src/amd/vulkan/radv_device.h +++ b/src/amd/vulkan/radv_device.h @@ -24,7 +24,9 @@ #include "radv_rra.h" #include "radv_shader.h" +#include "vk_acceleration_structure.h" #include "vk_device.h" +#include "vk_meta.h" #include 
"vk_texcompress_astc.h" #include "vk_texcompress_etc2.h" @@ -302,17 +304,6 @@ struct radv_meta_state { } dcc_retile; struct { - VkPipelineLayout leaf_p_layout; - VkPipeline leaf_pipeline; - VkPipeline leaf_updateable_pipeline; - VkPipelineLayout morton_p_layout; - VkPipeline morton_pipeline; - VkPipelineLayout lbvh_main_p_layout; - VkPipeline lbvh_main_pipeline; - VkPipelineLayout lbvh_generate_ir_p_layout; - VkPipeline lbvh_generate_ir_pipeline; - VkPipelineLayout ploc_p_layout; - VkPipeline ploc_pipeline; VkPipelineLayout encode_p_layout; VkPipeline encode_pipeline; VkPipeline encode_compact_pipeline; @@ -324,6 +315,7 @@ struct radv_meta_state { VkPipeline copy_pipeline; struct radix_sort_vk *radix_sort; + struct vk_acceleration_structure_build_args build_args; struct { VkBuffer buffer; @@ -340,6 +332,8 @@ struct radv_meta_state { VkDescriptorSetLayout ds_layout; VkPipelineLayout p_layout; } dgc_prepare; + + struct vk_meta_device device; }; struct radv_memory_trace_data { diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c index 79e7802915d..c7adbf20e81 100644 --- a/src/amd/vulkan/radv_rra.c +++ b/src/amd/vulkan/radv_rra.c @@ -542,7 +542,7 @@ rra_transcode_triangle_node(struct rra_transcoding_context *ctx, const struct ra } static void -rra_transcode_aabb_node(struct rra_transcoding_context *ctx, const struct radv_bvh_aabb_node *src, radv_aabb bounds) +rra_transcode_aabb_node(struct rra_transcoding_context *ctx, const struct radv_bvh_aabb_node *src, vk_aabb bounds) { struct rra_aabb_node *dst = (struct rra_aabb_node *)(ctx->dst + ctx->dst_leaf_offset); ctx->dst_leaf_offset += sizeof(struct rra_aabb_node); @@ -580,7 +580,7 @@ rra_transcode_instance_node(struct rra_transcoding_context *ctx, const struct ra } static uint32_t rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, - radv_aabb bounds); + vk_aabb bounds); static void rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_bvh_box16_node *src) @@ -597,7 +597,7 @@ rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_ continue; } - radv_aabb bounds = { + vk_aabb bounds = { .min = { _mesa_half_to_float(src->coords[i][0][0]), @@ -653,7 +653,7 @@ get_geometry_id(const void *node, uint32_t node_type) } static uint32_t -rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, radv_aabb bounds) +rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, vk_aabb bounds) { uint32_t node_type = src_id & 7; uint32_t src_offset = (src_id & (~7u)) << 3; diff --git a/src/amd/vulkan/bvh/lbvh_generate_ir.comp b/src/vulkan/runtime/bvh/lbvh_generate_ir.comp similarity index 58% rename from src/amd/vulkan/bvh/lbvh_generate_ir.comp rename to src/vulkan/runtime/bvh/lbvh_generate_ir.comp index 18821d13a79..818e568b4c1 100644 --- a/src/amd/vulkan/bvh/lbvh_generate_ir.comp +++ b/src/vulkan/runtime/bvh/lbvh_generate_ir.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Bas Nieuwenhuizen * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above 
copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #version 460 @@ -18,9 +35,9 @@ #extension GL_EXT_buffer_reference2 : require #extension GL_KHR_memory_scope_semantics : require -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +#include "vk_build_interface.h" -#include "build_interface.h" +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; TYPE(lbvh_node_info, 4); @@ -36,8 +53,8 @@ main(void) uint32_t idx = global_id; - uint32_t previous_id = RADV_BVH_INVALID_NODE; - radv_aabb previous_bounds; + uint32_t previous_id = VK_BVH_INVALID_NODE; + vk_aabb previous_bounds; previous_bounds.min = vec3(INFINITY); previous_bounds.max = vec3(-INFINITY); @@ -58,13 +75,13 @@ main(void) * parents, which is a requirement of the encoder. */ uint32_t dst_idx = - atomicAdd(DEREF(REF(radv_ir_header)(args.header)).ir_internal_node_count, 1); + atomicAdd(DEREF(REF(vk_ir_header)(args.header)).ir_internal_node_count, 1); - uint32_t current_offset = args.internal_node_base + dst_idx * SIZEOF(radv_ir_box_node); - uint32_t current_id = pack_ir_node_id(current_offset, radv_ir_node_internal); + uint32_t current_offset = args.internal_node_base + dst_idx * SIZEOF(vk_ir_box_node); + uint32_t current_id = pack_ir_node_id(current_offset, vk_ir_node_internal); - REF(radv_ir_box_node) node = REF(radv_ir_box_node)(OFFSET(args.bvh, current_offset)); - radv_aabb bounds = previous_bounds; + REF(vk_ir_box_node) node = REF(vk_ir_box_node)(OFFSET(args.bvh, current_offset)); + vk_aabb bounds = previous_bounds; lbvh_node_info info = DEREF(INDEX(lbvh_node_info, args.node_info, idx)); @@ -78,10 +95,10 @@ main(void) previous_child_index = 1; if (previous_child_index == -1) { - if (children[0] != RADV_BVH_INVALID_NODE) { + if (children[0] != VK_BVH_INVALID_NODE) { uint32_t child_offset = ir_id_to_offset(children[0]); - REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset)); - radv_aabb child_bounds = DEREF(child).aabb; + REF(vk_ir_node) child = REF(vk_ir_node)(OFFSET(args.bvh, child_offset)); + vk_aabb child_bounds = DEREF(child).aabb; bounds.min = min(bounds.min, child_bounds.min); bounds.max = max(bounds.max, child_bounds.max); } @@ -89,23 +106,23 @@ main(void) } /* Fetch the non-cached child */ - if (children[1 - previous_child_index] != RADV_BVH_INVALID_NODE) { + if (children[1 - previous_child_index] != VK_BVH_INVALID_NODE) { uint32_t child_offset = ir_id_to_offset(children[1 - previous_child_index]); - REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset)); - radv_aabb child_bounds = DEREF(child).aabb; + REF(vk_ir_node) child = REF(vk_ir_node)(OFFSET(args.bvh, child_offset)); + vk_aabb child_bounds = DEREF(child).aabb; bounds.min = min(bounds.min, child_bounds.min); bounds.max = max(bounds.max, child_bounds.max); } - radv_ir_box_node node_value; + vk_ir_box_node node_value; node_value.base.aabb = bounds; - node_value.bvh_offset = 
RADV_UNKNOWN_BVH_OFFSET; + node_value.bvh_offset = VK_UNKNOWN_BVH_OFFSET; node_value.children = children; DEREF(node) = node_value; - if (info.parent == RADV_BVH_INVALID_NODE) + if (info.parent == VK_BVH_INVALID_NODE) break; idx = info.parent & ~LBVH_RIGHT_CHILD_BIT; diff --git a/src/amd/vulkan/bvh/lbvh_main.comp b/src/vulkan/runtime/bvh/lbvh_main.comp similarity index 76% rename from src/amd/vulkan/bvh/lbvh_main.comp rename to src/vulkan/runtime/bvh/lbvh_main.comp index c6c51280985..c79a3164eb9 100644 --- a/src/amd/vulkan/bvh/lbvh_main.comp +++ b/src/vulkan/runtime/bvh/lbvh_main.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Bas Nieuwenhuizen * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #version 460 @@ -17,9 +34,9 @@ #extension GL_EXT_buffer_reference : require #extension GL_EXT_buffer_reference2 : require -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +#include "vk_build_interface.h" -#include "build_interface.h" +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; TYPE(lbvh_node_info, 4); @@ -74,11 +91,11 @@ main() { if (args.id_count <= 1) { REF(lbvh_node_info) dst = REF(lbvh_node_info)(args.node_info); - DEREF(dst).parent = RADV_BVH_INVALID_NODE; + DEREF(dst).parent = VK_BVH_INVALID_NODE; DEREF(dst).path_count = 2; DEREF(dst).children[0] = - args.id_count == 1 ? DEREF(INDEX(key_id_pair, args.src_ids, 0)).id : RADV_BVH_INVALID_NODE; - DEREF(dst).children[1] = RADV_BVH_INVALID_NODE; + args.id_count == 1 ? 
DEREF(INDEX(key_id_pair, args.src_ids, 0)).id : VK_BVH_INVALID_NODE; + DEREF(dst).children[1] = VK_BVH_INVALID_NODE; return; } @@ -136,5 +153,5 @@ main() DEREF(dst).children[0] = DEREF(INDEX(key_id_pair, args.src_ids, left)).id; DEREF(dst).children[1] = DEREF(INDEX(key_id_pair, args.src_ids, right)).id; if (id == 0) - DEREF(dst).parent = RADV_BVH_INVALID_NODE; + DEREF(dst).parent = VK_BVH_INVALID_NODE; } diff --git a/src/vulkan/runtime/bvh/leaf.comp b/src/vulkan/runtime/bvh/leaf.comp new file mode 100644 index 00000000000..85f0756204a --- /dev/null +++ b/src/vulkan/runtime/bvh/leaf.comp @@ -0,0 +1,250 @@ +/* + * Copyright © 2022 Konstantin Seurer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require + +#include "vk_build_interface.h" + +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; + +layout(push_constant) uniform CONSTS { + leaf_args args; +}; + +/* A GLSL-adapted copy of VkAccelerationStructureInstanceKHR. */ +struct AccelerationStructureInstance { + mat3x4 transform; + uint32_t custom_instance_and_mask; + uint32_t sbt_offset_and_flags; + uint64_t accelerationStructureReference; +}; +TYPE(AccelerationStructureInstance, 8); + +bool +build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id) +{ + bool is_valid = true; + triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id); + + triangle_vertices vertices = load_vertices(geom_data.data, indices, geom_data.vertex_format, geom_data.stride); + + /* An inactive triangle is one for which the first (X) component of any vertex is NaN. If any + * other vertex component is NaN, and the first is not, the behavior is undefined. 
If the vertex + * format does not have a NaN representation, then all triangles are considered active. + */ + if (isnan(vertices.vertex[0].x) || isnan(vertices.vertex[1].x) || isnan(vertices.vertex[2].x)) +#if ALWAYS_ACTIVE + is_valid = false; +#else + return false; +#endif + + if (geom_data.transform != NULL) { + mat4 transform = mat4(1.0); + + for (uint32_t col = 0; col < 4; col++) + for (uint32_t row = 0; row < 3; row++) + transform[col][row] = DEREF(INDEX(float, geom_data.transform, col + row * 4)); + + for (uint32_t i = 0; i < 3; i++) + vertices.vertex[i] = transform * vertices.vertex[i]; + } + + REF(vk_ir_triangle_node) node = REF(vk_ir_triangle_node)(dst_ptr); + + bounds.min = vec3(INFINITY); + bounds.max = vec3(-INFINITY); + + for (uint32_t coord = 0; coord < 3; coord++) + for (uint32_t comp = 0; comp < 3; comp++) { + DEREF(node).coords[coord][comp] = vertices.vertex[coord][comp]; + bounds.min[comp] = min(bounds.min[comp], vertices.vertex[coord][comp]); + bounds.max[comp] = max(bounds.max[comp], vertices.vertex[coord][comp]); + } + + DEREF(node).base.aabb = bounds; + DEREF(node).triangle_id = global_id; + DEREF(node).geometry_id_and_flags = geom_data.geometry_id; + DEREF(node).id = 9; + + return is_valid; +} + +bool +build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) +{ + bool is_valid = true; + REF(vk_ir_aabb_node) node = REF(vk_ir_aabb_node)(dst_ptr); + + for (uint32_t vec = 0; vec < 2; vec++) + for (uint32_t comp = 0; comp < 3; comp++) { + float coord = DEREF(INDEX(float, src_ptr, comp + vec * 3)); + + if (vec == 0) + bounds.min[comp] = coord; + else + bounds.max[comp] = coord; + } + + /* An inactive AABB is one for which the minimum X coordinate is NaN. If any other component is + * NaN, and the first is not, the behavior is undefined. + */ + if (isnan(bounds.min.x)) +#if ALWAYS_ACTIVE + is_valid = false; +#else + return false; +#endif + + DEREF(node).base.aabb = bounds; + DEREF(node).primitive_id = global_id; + DEREF(node).geometry_id_and_flags = geometry_id; + + return is_valid; +} + +vk_aabb +calculate_instance_node_bounds(uint64_t base_ptr, mat3x4 otw_matrix) +{ + vk_aabb aabb; + + vk_aabb blas_aabb = DEREF(REF(vk_aabb)(base_ptr + BVH_BOUNDS_OFFSET)); + + for (uint32_t comp = 0; comp < 3; ++comp) { + aabb.min[comp] = otw_matrix[comp][3]; + aabb.max[comp] = otw_matrix[comp][3]; + for (uint32_t col = 0; col < 3; ++col) { + aabb.min[comp] += + min(otw_matrix[comp][col] * blas_aabb.min[col], otw_matrix[comp][col] * blas_aabb.max[col]); + aabb.max[comp] += + max(otw_matrix[comp][col] * blas_aabb.min[col], otw_matrix[comp][col] * blas_aabb.max[col]); + } + } + return aabb; +} + +bool +build_instance(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id) +{ + REF(vk_ir_instance_node) node = REF(vk_ir_instance_node)(dst_ptr); + + AccelerationStructureInstance instance = DEREF(REF(AccelerationStructureInstance)(src_ptr)); + + /* An inactive instance is one whose acceleration structure handle is VK_NULL_HANDLE. Since the active terminology is + * only relevant for BVH updates, which we do not implement, we can also skip instances with mask == 0. 
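
calculate_instance_node_bounds above derives the instance AABB without transforming all eight box corners: for every output component it starts from the matrix translation and adds, per column, whichever of (column * blas_min) and (column * blas_max) is smaller for the minimum and larger for the maximum. A host-side C sketch of the same accumulation, assuming a row-indexed 3x4 affine matrix the way the shader indexes otw_matrix (the aabb struct and transform_aabb are illustrative names, not part of the patch):

#include <stdint.h>

struct aabb { float min[3], max[3]; };

/* Transform an AABB by an affine 3x4 matrix (first index = output row,
 * translation in column 3). Per component, accumulate the min/max of the
 * column scaled by the box extremes -- the same accumulation
 * calculate_instance_node_bounds performs on the BLAS bounds. */
static struct aabb
transform_aabb(const float otw[3][4], struct aabb in)
{
   struct aabb out;
   for (int comp = 0; comp < 3; ++comp) {
      out.min[comp] = otw[comp][3];
      out.max[comp] = otw[comp][3];
      for (int col = 0; col < 3; ++col) {
         float a = otw[comp][col] * in.min[col];
         float b = otw[comp][col] * in.max[col];
         out.min[comp] += a < b ? a : b;
         out.max[comp] += a < b ? b : a;
      }
   }
   return out;
}

As for the mask check that follows: custom_instance_and_mask mirrors VkAccelerationStructureInstanceKHR, with the 24-bit custom index in the low bits and the 8-bit mask in the top bits, which is why a mask of zero can be detected as custom_instance_and_mask < (1u << 24u).
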
+ */ + if (instance.accelerationStructureReference == 0 || instance.custom_instance_and_mask < (1u << 24u)) + return false; + + DEREF(node).base_ptr = instance.accelerationStructureReference; + + mat4 transform = mat4(instance.transform); + DEREF(node).otw_matrix = mat3x4(transform); + + bounds = calculate_instance_node_bounds(instance.accelerationStructureReference, mat3x4(transform)); + + DEREF(node).base.aabb = bounds; + DEREF(node).custom_instance_and_mask = instance.custom_instance_and_mask; + DEREF(node).sbt_offset_and_flags = instance.sbt_offset_and_flags; + DEREF(node).instance_id = global_id; + + return true; +} + +void +main(void) +{ + uint32_t global_id = gl_GlobalInvocationID.x; + uint32_t primitive_id = args.geom_data.first_id + global_id; + + REF(key_id_pair) id_ptr = INDEX(key_id_pair, args.ids, primitive_id); + uint32_t src_offset = global_id * args.geom_data.stride; + + uint32_t dst_stride; + uint32_t node_type; + if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { + dst_stride = SIZEOF(vk_ir_triangle_node); + node_type = vk_ir_node_triangle; + } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { + dst_stride = SIZEOF(vk_ir_aabb_node); + node_type = vk_ir_node_aabb; + } else { + dst_stride = SIZEOF(vk_ir_instance_node); + node_type = vk_ir_node_instance; + } + + uint32_t dst_offset = primitive_id * dst_stride; + VOID_REF dst_ptr = OFFSET(args.bvh, dst_offset); + + vk_aabb bounds; + bool is_active; + if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { + is_active = build_triangle(bounds, dst_ptr, args.geom_data, global_id); + } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { + VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); + is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, global_id); + } else { + VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); + /* arrayOfPointers */ + if (args.geom_data.stride == 8) { + src_ptr = DEREF(REF(VOID_REF)(src_ptr)); + } + + is_active = build_instance(bounds, src_ptr, dst_ptr, global_id); + } + +#if ALWAYS_ACTIVE + if (!is_active && args.geom_data.geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) { + bounds.min = vec3(0.0); + bounds.max = vec3(0.0); + is_active = true; + } +#endif + + DEREF(id_ptr).id = is_active ? 
pack_ir_node_id(dst_offset, node_type) : VK_BVH_INVALID_NODE; + + uvec4 ballot = subgroupBallot(is_active); + if (subgroupElect()) + atomicAdd(DEREF(args.header).active_leaf_count, subgroupBallotBitCount(ballot)); + + atomicMin(DEREF(args.header).min_bounds[0], to_emulated_float(bounds.min.x)); + atomicMin(DEREF(args.header).min_bounds[1], to_emulated_float(bounds.min.y)); + atomicMin(DEREF(args.header).min_bounds[2], to_emulated_float(bounds.min.z)); + atomicMax(DEREF(args.header).max_bounds[0], to_emulated_float(bounds.max.x)); + atomicMax(DEREF(args.header).max_bounds[1], to_emulated_float(bounds.max.y)); + atomicMax(DEREF(args.header).max_bounds[2], to_emulated_float(bounds.max.z)); +} diff --git a/src/vulkan/runtime/bvh/meson.build b/src/vulkan/runtime/bvh/meson.build new file mode 100644 index 00000000000..a2d751c295c --- /dev/null +++ b/src/vulkan/runtime/bvh/meson.build @@ -0,0 +1,81 @@ +# Copyright © 2022 Konstantin Seurer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# source file, output name, defines +bvh_shaders = [ + [ + 'lbvh_generate_ir.comp', + 'lbvh_generate_ir', + [], + ], + [ + 'lbvh_main.comp', + 'lbvh_main', + [], + ], + [ + 'leaf.comp', + 'leaf', + ['ALWAYS_ACTIVE=0'], + ], + [ + 'leaf.comp', + 'leaf_always_active', + ['ALWAYS_ACTIVE=1'], + ], + [ + 'morton.comp', + 'morton', + [], + ], + [ + 'ploc_internal.comp', + 'ploc_internal', + [], + ], +] + +vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh' + +vk_bvh_includes = files( + 'vk_build_helpers.h', + 'vk_build_interface.h', + 'vk_bvh.h', +) + +bvh_spv = [] +foreach s : bvh_shaders + command = [ + prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + ] + (with_mesa_debug ? 
['-g'] : []) + command += glslang_quiet + + foreach define : s[2] + command += '-D' + define + endforeach + + bvh_spv += custom_target( + s[1] + '.spv.h', + input : s[0], + output : s[1] + '.spv.h', + command : command, + depend_files: vk_bvh_includes + ) +endforeach diff --git a/src/amd/vulkan/bvh/morton.comp b/src/vulkan/runtime/bvh/morton.comp similarity index 62% rename from src/amd/vulkan/bvh/morton.comp rename to src/vulkan/runtime/bvh/morton.comp index f795297a11c..75a6f15baf3 100644 --- a/src/amd/vulkan/bvh/morton.comp +++ b/src/vulkan/runtime/bvh/morton.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Konstantin Seurer * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
*/ #version 460 @@ -17,9 +34,9 @@ #extension GL_EXT_buffer_reference : require #extension GL_EXT_buffer_reference2 : require -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +#include "vk_build_interface.h" -#include "build_interface.h" +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; layout(push_constant) uniform CONSTS { morton_args args; @@ -56,11 +73,11 @@ main(void) uint32_t id = DEREF(key_id).id; uint32_t key; - if (id != RADV_BVH_INVALID_NODE) { - radv_aabb bounds = DEREF(REF(radv_ir_node)OFFSET(args.bvh, ir_id_to_offset(id))).aabb; + if (id != VK_BVH_INVALID_NODE) { + vk_aabb bounds = DEREF(REF(vk_ir_node)OFFSET(args.bvh, ir_id_to_offset(id))).aabb; vec3 center = (bounds.min + bounds.max) * 0.5; - radv_aabb bvh_bounds; + vk_aabb bvh_bounds; bvh_bounds.min.x = from_emulated_float(DEREF(args.header).min_bounds[0]); bvh_bounds.min.y = from_emulated_float(DEREF(args.header).min_bounds[1]); bvh_bounds.min.z = from_emulated_float(DEREF(args.header).min_bounds[2]); diff --git a/src/amd/vulkan/bvh/ploc_internal.comp b/src/vulkan/runtime/bvh/ploc_internal.comp similarity index 76% rename from src/amd/vulkan/bvh/ploc_internal.comp rename to src/vulkan/runtime/bvh/ploc_internal.comp index 50fc40edc93..0ecf7d38d82 100644 --- a/src/amd/vulkan/bvh/ploc_internal.comp +++ b/src/vulkan/runtime/bvh/ploc_internal.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Bas Nieuwenhuizen * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #version 460 @@ -24,7 +41,7 @@ layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in; #define USE_GLOBAL_SYNC -#include "build_interface.h" +#include "vk_build_interface.h" TYPE(ploc_prefix_scan_partition, 4); @@ -34,7 +51,8 @@ layout(push_constant) uniform CONSTS }; shared uint32_t exclusive_prefix_sum; -shared uint32_t aggregate_sums[PLOC_WORKGROUP_SIZE / 64]; +shared uint32_t aggregate_sums[PLOC_SUBGROUPS_PER_WORKGROUP]; +shared uint32_t aggregate_sums2[PLOC_SUBGROUPS_PER_WORKGROUP]; /* * Global prefix scan over all workgroups to find out the index of the collapsed node to write. 
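
The prefix_scan changes in the next hunk implement a chained scan across workgroups: each partition publishes an aggregate and, once everything before it is known, an inclusive sum; a later workgroup resolves its exclusive prefix by walking backwards, consuming inclusive sums where published and falling back to aggregates otherwise. A serial C sketch of that look-back, assuming inclusive_sum == 0xFFFFFFFF means "not yet published" as in the shader (exclusive_prefix is an illustrative name):

#include <stdint.h>

/* Mirrors ploc_prefix_scan_partition: one entry per workgroup-sized partition. */
struct partition {
   uint32_t aggregate;     /* count contributed by this partition alone */
   uint32_t inclusive_sum; /* aggregate + everything before it, or UINT32_MAX if unpublished */
};

/* Walk backwards from the partition preceding `index`, summing aggregates until a
 * published inclusive sum is found; the result is the exclusive prefix of `index`. */
static uint32_t
exclusive_prefix(const struct partition *parts, uint32_t index)
{
   uint32_t sum = 0;
   for (uint32_t p = index; p-- > 0;) {
      if (parts[p].inclusive_sum != UINT32_MAX)
         return sum + parts[p].inclusive_sum; /* everything before p is already folded in */
      sum += parts[p].aggregate;              /* otherwise keep looking further back */
   }
   return sum;
}

The shader performs the same walk with acquire/release atomics so that a partition's inclusive_sum only becomes visible after the data it covers does.
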
@@ -45,8 +63,7 @@ uint32_t prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t task_index) { if (gl_LocalInvocationIndex == 0) { - /* Temporary copy of exclusive_prefix_sum to avoid reading+writing LDS each addition */ - uint32_t local_exclusive_prefix_sum = 0; + exclusive_prefix_sum = 0; if (task_index >= gl_WorkGroupSize.x) { REF(ploc_prefix_scan_partition) current_partition = REF(ploc_prefix_scan_partition)(INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x)); @@ -58,28 +75,55 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t if (atomicLoad(DEREF(previous_partition).inclusive_sum, gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsAcquire | gl_SemanticsMakeVisible) != 0xFFFFFFFF) { - local_exclusive_prefix_sum += DEREF(previous_partition).inclusive_sum; + atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).inclusive_sum); break; } else { - local_exclusive_prefix_sum += DEREF(previous_partition).aggregate; + atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).aggregate); previous_partition -= 1; } } /* Set the inclusive sum for the next workgroups */ atomicStore(DEREF(current_partition).inclusive_sum, - DEREF(current_partition).aggregate + local_exclusive_prefix_sum, gl_ScopeDevice, + DEREF(current_partition).aggregate + exclusive_prefix_sum, gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsRelease | gl_SemanticsMakeAvailable); } - exclusive_prefix_sum = local_exclusive_prefix_sum; } if (subgroupElect()) aggregate_sums[gl_SubgroupID] = subgroupBallotBitCount(ballot); barrier(); - if (gl_LocalInvocationID.x < PLOC_WORKGROUP_SIZE / 64) { - aggregate_sums[gl_LocalInvocationID.x] = - exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]); + if (PLOC_SUBGROUPS_PER_WORKGROUP <= SUBGROUP_SIZE) { + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) { + aggregate_sums[gl_LocalInvocationID.x] = + exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]); + } + } else { + /* If the length of aggregate_sums[] is larger than SUBGROUP_SIZE, + * the prefix scan can't be done simply by subgroupExclusiveAdd. 
+ */ + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) + aggregate_sums2[gl_LocalInvocationID.x] = aggregate_sums[gl_LocalInvocationID.x]; + barrier(); + + /* Hillis Steele inclusive scan on aggregate_sums2 */ + for (uint32_t stride = 1; stride < PLOC_SUBGROUPS_PER_WORKGROUP; stride *= 2) { + uint32_t value = 0; + if (gl_LocalInvocationID.x >= stride && gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) + value = aggregate_sums2[gl_LocalInvocationID.x - stride]; + barrier(); + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) + aggregate_sums2[gl_LocalInvocationID.x] += value; + barrier(); + } + + /* Adapt to exclusive and add the prefix_sum from previous workgroups */ + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) { + if (gl_LocalInvocationID.x == 0) + aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum; + else + aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum + aggregate_sums2[gl_LocalInvocationID.x - 1]; + } } barrier(); @@ -90,20 +134,20 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t #define BVH_LEVEL_COST 0.2 uint32_t -push_node(uint32_t children[2], radv_aabb bounds[2]) +push_node(uint32_t children[2], vk_aabb bounds[2]) { uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1); - uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node); - uint32_t dst_id = pack_ir_node_id(dst_offset, radv_ir_node_internal); - REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset)); + uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node); + uint32_t dst_id = pack_ir_node_id(dst_offset, vk_ir_node_internal); + REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset)); - radv_aabb total_bounds; + vk_aabb total_bounds; total_bounds.min = vec3(INFINITY); total_bounds.max = vec3(-INFINITY); for (uint i = 0; i < 2; ++i) { VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i])); - REF(radv_ir_node) child = REF(radv_ir_node)(node); + REF(vk_ir_node) child = REF(vk_ir_node)(node); total_bounds.min = min(total_bounds.min, bounds[i].min); total_bounds.max = max(total_bounds.max, bounds[i].max); @@ -112,7 +156,7 @@ push_node(uint32_t children[2], radv_aabb bounds[2]) } DEREF(dst_node).base.aabb = total_bounds; - DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET; + DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET; return dst_id; } @@ -136,7 +180,7 @@ decode_neighbour_offset(uint32_t encoded_offset) #define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD -shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS]; +shared vk_aabb shared_bounds[NUM_PLOC_LDS_ITEMS]; shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS]; uint32_t @@ -155,11 +199,11 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base, for (uint32_t i = task_index - 2 * neighbourhood_overlap; i < search_bound; i += gl_WorkGroupSize.x) { uint32_t id = load_id(ids, iter, i); - if (id == RADV_BVH_INVALID_NODE) + if (id == VK_BVH_INVALID_NODE) continue; VOID_REF addr = OFFSET(args.bvh, ir_id_to_offset(id)); - REF(radv_ir_node) node = REF(radv_ir_node)(addr); + REF(vk_ir_node) node = REF(vk_ir_node)(addr); shared_bounds[i - lds_base] = DEREF(node).aabb; } @@ -168,7 +212,7 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base, float combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j) { - radv_aabb 
combined_bounds; + vk_aabb combined_bounds; combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min); combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max); return aabb_surface_area(combined_bounds); @@ -187,10 +231,10 @@ main(void) if (DEREF(args.header).active_leaf_count <= 2) { if (gl_GlobalInvocationID.x == 0) { uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1); - uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node); - REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset)); + uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node); + REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset)); - radv_aabb total_bounds; + vk_aabb total_bounds; total_bounds.min = vec3(INFINITY); total_bounds.max = vec3(-INFINITY); @@ -198,10 +242,10 @@ main(void) for (; i < DEREF(args.header).active_leaf_count; i++) { uint32_t child_id = DEREF(INDEX(key_id_pair, src_ids, i)).id; - if (child_id != RADV_BVH_INVALID_NODE) { + if (child_id != VK_BVH_INVALID_NODE) { VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(child_id)); - REF(radv_ir_node) child = REF(radv_ir_node)(node); - radv_aabb bounds = DEREF(child).aabb; + REF(vk_ir_node) child = REF(vk_ir_node)(node); + vk_aabb bounds = DEREF(child).aabb; total_bounds.min = min(total_bounds.min, bounds.min); total_bounds.max = max(total_bounds.max, bounds.max); @@ -210,10 +254,10 @@ main(void) DEREF(dst_node).children[i] = child_id; } for (; i < 2; i++) - DEREF(dst_node).children[i] = RADV_BVH_INVALID_NODE; + DEREF(dst_node).children[i] = VK_BVH_INVALID_NODE; DEREF(dst_node).base.aabb = total_bounds; - DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET; + DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET; } return; } @@ -329,11 +373,11 @@ main(void) if (task_index < neighbour_index) { uint32_t neighbour_id = load_id(src_ids, iter, neighbour_index); uint32_t children[2] = {id, neighbour_id}; - radv_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]}; + vk_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]}; DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = push_node(children, bounds); DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, neighbour_index))) = - RADV_BVH_INVALID_NODE; + VK_BVH_INVALID_NODE; } else { /* We still store in the other case so we don't destroy the node id needed to * create the internal node */ @@ -381,14 +425,14 @@ main(void) uint32_t id = task_index < current_task_count ? 
DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) - : RADV_BVH_INVALID_NODE; - uvec4 ballot = subgroupBallot(id != RADV_BVH_INVALID_NODE); + : VK_BVH_INVALID_NODE; + uvec4 ballot = subgroupBallot(id != VK_BVH_INVALID_NODE); uint32_t new_offset = prefix_scan(ballot, partitions, task_index); if (task_index >= current_task_count) continue; - if (id != RADV_BVH_INVALID_NODE) { + if (id != VK_BVH_INVALID_NODE) { DEREF(REF(uint32_t)(INDEX(uint32_t, src_ids, new_offset))) = id; ++new_offset; } diff --git a/src/vulkan/runtime/bvh/vk_build_helpers.h b/src/vulkan/runtime/bvh/vk_build_helpers.h new file mode 100644 index 00000000000..0a178adea14 --- /dev/null +++ b/src/vulkan/runtime/bvh/vk_build_helpers.h @@ -0,0 +1,522 @@ +/* + * Copyright © 2022 Konstantin Seurer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
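
The tail of the PLOC main loop above compacts the surviving node ids each iteration: a ballot over id != VK_BVH_INVALID_NODE feeds prefix_scan, and every valid id is rewritten to its densely packed offset so the next iteration works on a contiguous list. A serial C sketch of that compaction step (compact_ids and INVALID_ID are illustrative stand-ins):

#include <stdint.h>

#define INVALID_ID 0xFFFFFFFFu /* stand-in for VK_BVH_INVALID_NODE */

/* Copy all valid ids from src to dst, preserving order. The return value is the
 * surviving id count, i.e. the task count of the next PLOC iteration. The offsets
 * assigned here are what the ballot + prefix scan compute in parallel. */
static uint32_t
compact_ids(const uint32_t *src, uint32_t *dst, uint32_t count)
{
   uint32_t new_offset = 0;
   for (uint32_t i = 0; i < count; ++i) {
      if (src[i] != INVALID_ID)
         dst[new_offset++] = src[i];
   }
   return new_offset;
}
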
+ */ + +#ifndef VK_BVH_BUILD_HELPERS_H +#define VK_BVH_BUILD_HELPERS_H + +#include "vk_bvh.h" + +#define VK_FORMAT_UNDEFINED 0 +#define VK_FORMAT_R4G4_UNORM_PACK8 1 +#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2 +#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3 +#define VK_FORMAT_R5G6B5_UNORM_PACK16 4 +#define VK_FORMAT_B5G6R5_UNORM_PACK16 5 +#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6 +#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7 +#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8 +#define VK_FORMAT_R8_UNORM 9 +#define VK_FORMAT_R8_SNORM 10 +#define VK_FORMAT_R8_USCALED 11 +#define VK_FORMAT_R8_SSCALED 12 +#define VK_FORMAT_R8_UINT 13 +#define VK_FORMAT_R8_SINT 14 +#define VK_FORMAT_R8_SRGB 15 +#define VK_FORMAT_R8G8_UNORM 16 +#define VK_FORMAT_R8G8_SNORM 17 +#define VK_FORMAT_R8G8_USCALED 18 +#define VK_FORMAT_R8G8_SSCALED 19 +#define VK_FORMAT_R8G8_UINT 20 +#define VK_FORMAT_R8G8_SINT 21 +#define VK_FORMAT_R8G8_SRGB 22 +#define VK_FORMAT_R8G8B8_UNORM 23 +#define VK_FORMAT_R8G8B8_SNORM 24 +#define VK_FORMAT_R8G8B8_USCALED 25 +#define VK_FORMAT_R8G8B8_SSCALED 26 +#define VK_FORMAT_R8G8B8_UINT 27 +#define VK_FORMAT_R8G8B8_SINT 28 +#define VK_FORMAT_R8G8B8_SRGB 29 +#define VK_FORMAT_B8G8R8_UNORM 30 +#define VK_FORMAT_B8G8R8_SNORM 31 +#define VK_FORMAT_B8G8R8_USCALED 32 +#define VK_FORMAT_B8G8R8_SSCALED 33 +#define VK_FORMAT_B8G8R8_UINT 34 +#define VK_FORMAT_B8G8R8_SINT 35 +#define VK_FORMAT_B8G8R8_SRGB 36 +#define VK_FORMAT_R8G8B8A8_UNORM 37 +#define VK_FORMAT_R8G8B8A8_SNORM 38 +#define VK_FORMAT_R8G8B8A8_USCALED 39 +#define VK_FORMAT_R8G8B8A8_SSCALED 40 +#define VK_FORMAT_R8G8B8A8_UINT 41 +#define VK_FORMAT_R8G8B8A8_SINT 42 +#define VK_FORMAT_R8G8B8A8_SRGB 43 +#define VK_FORMAT_B8G8R8A8_UNORM 44 +#define VK_FORMAT_B8G8R8A8_SNORM 45 +#define VK_FORMAT_B8G8R8A8_USCALED 46 +#define VK_FORMAT_B8G8R8A8_SSCALED 47 +#define VK_FORMAT_B8G8R8A8_UINT 48 +#define VK_FORMAT_B8G8R8A8_SINT 49 +#define VK_FORMAT_B8G8R8A8_SRGB 50 +#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51 +#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52 +#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53 +#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54 +#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55 +#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56 +#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57 +#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58 +#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59 +#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60 +#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61 +#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62 +#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63 +#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64 +#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65 +#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66 +#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67 +#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68 +#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69 +#define VK_FORMAT_R16_UNORM 70 +#define VK_FORMAT_R16_SNORM 71 +#define VK_FORMAT_R16_USCALED 72 +#define VK_FORMAT_R16_SSCALED 73 +#define VK_FORMAT_R16_UINT 74 +#define VK_FORMAT_R16_SINT 75 +#define VK_FORMAT_R16_SFLOAT 76 +#define VK_FORMAT_R16G16_UNORM 77 +#define VK_FORMAT_R16G16_SNORM 78 +#define VK_FORMAT_R16G16_USCALED 79 +#define VK_FORMAT_R16G16_SSCALED 80 +#define VK_FORMAT_R16G16_UINT 81 +#define VK_FORMAT_R16G16_SINT 82 +#define VK_FORMAT_R16G16_SFLOAT 83 +#define VK_FORMAT_R16G16B16_UNORM 84 +#define VK_FORMAT_R16G16B16_SNORM 85 +#define VK_FORMAT_R16G16B16_USCALED 86 +#define VK_FORMAT_R16G16B16_SSCALED 87 +#define VK_FORMAT_R16G16B16_UINT 88 +#define VK_FORMAT_R16G16B16_SINT 89 +#define VK_FORMAT_R16G16B16_SFLOAT 90 +#define 
VK_FORMAT_R16G16B16A16_UNORM 91 +#define VK_FORMAT_R16G16B16A16_SNORM 92 +#define VK_FORMAT_R16G16B16A16_USCALED 93 +#define VK_FORMAT_R16G16B16A16_SSCALED 94 +#define VK_FORMAT_R16G16B16A16_UINT 95 +#define VK_FORMAT_R16G16B16A16_SINT 96 +#define VK_FORMAT_R16G16B16A16_SFLOAT 97 +#define VK_FORMAT_R32_UINT 98 +#define VK_FORMAT_R32_SINT 99 +#define VK_FORMAT_R32_SFLOAT 100 +#define VK_FORMAT_R32G32_UINT 101 +#define VK_FORMAT_R32G32_SINT 102 +#define VK_FORMAT_R32G32_SFLOAT 103 +#define VK_FORMAT_R32G32B32_UINT 104 +#define VK_FORMAT_R32G32B32_SINT 105 +#define VK_FORMAT_R32G32B32_SFLOAT 106 +#define VK_FORMAT_R32G32B32A32_UINT 107 +#define VK_FORMAT_R32G32B32A32_SINT 108 +#define VK_FORMAT_R32G32B32A32_SFLOAT 109 +#define VK_FORMAT_R64_UINT 110 +#define VK_FORMAT_R64_SINT 111 +#define VK_FORMAT_R64_SFLOAT 112 +#define VK_FORMAT_R64G64_UINT 113 +#define VK_FORMAT_R64G64_SINT 114 +#define VK_FORMAT_R64G64_SFLOAT 115 +#define VK_FORMAT_R64G64B64_UINT 116 +#define VK_FORMAT_R64G64B64_SINT 117 +#define VK_FORMAT_R64G64B64_SFLOAT 118 +#define VK_FORMAT_R64G64B64A64_UINT 119 +#define VK_FORMAT_R64G64B64A64_SINT 120 +#define VK_FORMAT_R64G64B64A64_SFLOAT 121 + +#define VK_INDEX_TYPE_UINT16 0 +#define VK_INDEX_TYPE_UINT32 1 +#define VK_INDEX_TYPE_NONE_KHR 1000165000 +#define VK_INDEX_TYPE_UINT8_EXT 1000265000 + +#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0 +#define VK_GEOMETRY_TYPE_AABBS_KHR 1 +#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2 + +#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1 +#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2 +#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4 +#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8 + +#define TYPE(type, align) \ + layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref \ + { \ + type value; \ + }; + +#define REF(type) type##_ref +#define VOID_REF uint64_t +#define NULL 0 +#define DEREF(var) var.value + +#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1)) + +#define OFFSET(ptr, offset) (uint64_t(ptr) + offset) + +#define INFINITY (1.0 / 0.0) +#define NAN (0.0 / 0.0) + +#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type))) + +TYPE(int8_t, 1); +TYPE(uint8_t, 1); +TYPE(int16_t, 2); +TYPE(uint16_t, 2); +TYPE(int32_t, 4); +TYPE(uint32_t, 4); +TYPE(int64_t, 8); +TYPE(uint64_t, 8); + +TYPE(float, 4); + +TYPE(vec2, 4); +TYPE(vec3, 4); +TYPE(vec4, 4); + +TYPE(uvec4, 16); + +TYPE(VOID_REF, 8); + +/* copied from u_math.h */ +uint32_t +align(uint32_t value, uint32_t alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + +int32_t +to_emulated_float(float f) +{ + int32_t bits = floatBitsToInt(f); + return f < 0 ? -2147483648 - bits : bits; +} + +float +from_emulated_float(int32_t bits) +{ + return intBitsToFloat(bits < 0 ? 
-2147483648 - bits : bits); +} + +TYPE(vk_aabb, 4); + +struct key_id_pair { + uint32_t id; + uint32_t key; +}; +TYPE(key_id_pair, 4); + +TYPE(vk_accel_struct_serialization_header, 8); + +TYPE(vk_ir_header, 4); +TYPE(vk_ir_node, 4); +TYPE(vk_ir_box_node, 4); +TYPE(vk_ir_triangle_node, 4); +TYPE(vk_ir_aabb_node, 4); +TYPE(vk_ir_instance_node, 8); + +TYPE(vk_global_sync_data, 4); + +uint32_t +ir_id_to_offset(uint32_t id) +{ + return id & (~3u); +} + +uint32_t +ir_id_to_type(uint32_t id) +{ + return id & 3u; +} + +uint32_t +pack_ir_node_id(uint32_t offset, uint32_t type) +{ + return offset | type; +} + +float +aabb_surface_area(vk_aabb aabb) +{ + vec3 diagonal = aabb.max - aabb.min; + return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z; +} + +/* Just a wrapper for 3 uints. */ +struct triangle_indices { + uint32_t index[3]; +}; + +triangle_indices +load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id) +{ + triangle_indices result; + + uint32_t index_base = global_id * 3; + + switch (index_format) { + case VK_INDEX_TYPE_UINT16: { + result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0)); + result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1)); + result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2)); + break; + } + case VK_INDEX_TYPE_UINT32: { + result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0)); + result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1)); + result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2)); + break; + } + case VK_INDEX_TYPE_NONE_KHR: { + result.index[0] = index_base + 0; + result.index[1] = index_base + 1; + result.index[2] = index_base + 2; + break; + } + case VK_INDEX_TYPE_UINT8_EXT: { + result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0)); + result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1)); + result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2)); + break; + } + } + + return result; +} + +/* Just a wrapper for 3 vec4s. 
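
to_emulated_float/from_emulated_float above define a monotone mapping between floats and signed integers (non-negative floats keep their IEEE bit pattern, negative floats are remapped so that more negative values give smaller integers), which is what allows leaf.comp to grow the scene bounds with plain integer atomicMin/atomicMax. A small C check of the round trip and of the ordering property, using the same formulas (the test values are arbitrary):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* C mirrors of the GLSL helpers above: bit-cast via memcpy instead of floatBitsToInt. */
static int32_t
to_emulated_float(float f)
{
   int32_t bits;
   memcpy(&bits, &f, sizeof(bits));
   return f < 0 ? INT32_MIN - bits : bits;
}

static float
from_emulated_float(int32_t bits)
{
   int32_t raw = bits < 0 ? INT32_MIN - bits : bits;
   float f;
   memcpy(&f, &raw, sizeof(f));
   return f;
}

int
main(void)
{
   const float v[] = {-5.0f, -1.0f, 0.0f, 0.5f, 3.0f}; /* sorted ascending */
   for (unsigned i = 0; i + 1 < sizeof(v) / sizeof(v[0]); ++i)
      assert(to_emulated_float(v[i]) <= to_emulated_float(v[i + 1])); /* order is preserved */
   assert(from_emulated_float(to_emulated_float(-1.0f)) == -1.0f);    /* mapping is invertible */
   return 0;
}
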
*/ +struct triangle_vertices { + vec4 vertex[3]; +}; + +TYPE(float16_t, 2); + +triangle_vertices +load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride) +{ + triangle_vertices result; + + for (uint32_t i = 0; i < 3; i++) { + VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride); + vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0); + + switch (vertex_format) { + case VK_FORMAT_R32G32_SFLOAT: + vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); + break; + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); + vertex.z = DEREF(INDEX(float, vertex_ptr, 2)); + break; + case VK_FORMAT_R16G16_SFLOAT: + vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); + break; + case VK_FORMAT_R16G16B16_SFLOAT: + case VK_FORMAT_R16G16B16A16_SFLOAT: + vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); + vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2)); + break; + case VK_FORMAT_R16G16_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); + vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); + break; + case VK_FORMAT_R16G16B16A16_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); + vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); + vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF)); + break; + case VK_FORMAT_R8G8_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); + vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); + break; + case VK_FORMAT_R8G8B8A8_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); + vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); + vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F)); + break; + case VK_FORMAT_R16G16_UNORM: + vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); + vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); + break; + case VK_FORMAT_R16G16B16A16_UNORM: + vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); + vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); + vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF); + break; + case VK_FORMAT_R8G8_UNORM: + vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); + vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); + break; + case VK_FORMAT_R8G8B8A8_UNORM: + vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); + vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); + vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF); + break; + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: { + uint32_t data = DEREF(REF(uint32_t)(vertex_ptr)); + vertex.x = float(data & 0x3FF) / 0x3FF; + vertex.y = float((data >> 10) & 0x3FF) / 0x3FF; + vertex.z = float((data >> 20) & 0x3FF) / 0x3FF; + break; + } + } + + result.vertex[i] = vertex; + } + + return result; +} + +/** Compute ceiling of integer quotient of A divided by B. + From macros.h */ +#define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B)) + +#ifdef USE_GLOBAL_SYNC + +/* There might be more invocations available than tasks to do. 
+ * In that case, the fetched task index is greater than the
+ * counter offset for the next phase. To avoid out-of-bounds
+ * accesses, phases will be skipped until the task index is
+ * in-bounds again. */
+uint32_t num_tasks_to_skip = 0;
+uint32_t phase_index = 0;
+bool should_skip = false;
+shared uint32_t global_task_index;
+
+shared uint32_t shared_phase_index;
+
+uint32_t
+task_count(REF(vk_ir_header) header)
+{
+   uint32_t phase_index = DEREF(header).sync_data.phase_index;
+   return DEREF(header).sync_data.task_counts[phase_index & 1];
+}
+
+/* Sets the task count for the next phase. */
+void
+set_next_task_count(REF(vk_ir_header) header, uint32_t new_count)
+{
+   uint32_t phase_index = DEREF(header).sync_data.phase_index;
+   DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count;
+}
+
+/*
+ * This function has two main objectives:
+ * Firstly, it partitions pending work among free invocations.
+ * Secondly, it guarantees global synchronization between different phases.
+ *
+ * After every call to fetch_task, a new task index is returned.
+ * fetch_task will also set num_tasks_to_skip. Use should_execute_phase
+ * to determine if the current phase should be executed or skipped.
+ *
+ * Since tasks are assigned per-workgroup, there is a possibility of the task index being
+ * greater than the total task count.
+ */
+uint32_t
+fetch_task(REF(vk_ir_header) header, bool did_work)
+{
+   /* Perform a memory + control barrier for all buffer writes for the entire workgroup.
+    * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished
+    * and their results are written to memory. */
+   controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                  gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+   if (gl_LocalInvocationIndex == 0) {
+      if (did_work)
+         atomicAdd(DEREF(header).sync_data.task_done_counter, 1);
+      global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1);
+
+      do {
+         /* Perform a memory barrier to refresh the current phase's end counter, in case
+          * another workgroup changed it. */
+         memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                       gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+
+         /* The first invocation of the first workgroup in a new phase is responsible for initiating the
+          * switch to a new phase. It is only possible to switch to a new phase if all tasks of the
+          * previous phase have been completed. Switching to a new phase and incrementing the phase
+          * end counter in turn notifies all invocations for that phase that it is safe to execute.
+          */
+         if (global_task_index == DEREF(header).sync_data.current_phase_end_counter &&
+             DEREF(header).sync_data.task_done_counter == DEREF(header).sync_data.current_phase_end_counter) {
+            if (DEREF(header).sync_data.next_phase_exit_flag != 0) {
+               DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID;
+               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+            } else {
+               atomicAdd(DEREF(header).sync_data.phase_index, 1);
+               DEREF(header).sync_data.current_phase_start_counter = DEREF(header).sync_data.current_phase_end_counter;
+               /* Ensure the changes to the phase index and start/end counter are visible for other
+                * workgroups waiting in the loop. 
*/ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + atomicAdd(DEREF(header).sync_data.current_phase_end_counter, + DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x)); + } + break; + } + + /* If other invocations have finished all nodes, break out; there is no work to do */ + if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) { + break; + } + } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter); + + shared_phase_index = DEREF(header).sync_data.phase_index; + } + + barrier(); + if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) + return TASK_INDEX_INVALID; + + num_tasks_to_skip = shared_phase_index - phase_index; + + uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter; + return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x; +} + +bool +should_execute_phase() +{ + if (num_tasks_to_skip > 0) { + /* Skip to next phase. */ + ++phase_index; + --num_tasks_to_skip; + return false; + } + return true; +} + +#define PHASE(header) \ + for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true)) +#endif + +#endif diff --git a/src/vulkan/runtime/bvh/vk_build_interface.h b/src/vulkan/runtime/bvh/vk_build_interface.h new file mode 100644 index 00000000000..0d2f1fed21c --- /dev/null +++ b/src/vulkan/runtime/bvh/vk_build_interface.h @@ -0,0 +1,103 @@ +/* + * Copyright © 2022 Konstantin Seurer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef VK_BVH_BUILD_INTERFACE_H +#define VK_BVH_BUILD_INTERFACE_H + +#ifdef VULKAN +#include "vk_build_helpers.h" +#else +#include +#include "vk_bvh.h" +#define REF(type) uint64_t +#define VOID_REF uint64_t +#endif + +#define SUBGROUP_SIZE_ID 0 +#define BVH_BOUNDS_OFFSET_ID 1 +#ifdef VULKAN +layout (constant_id = SUBGROUP_SIZE_ID) const int SUBGROUP_SIZE = 64; +layout (constant_id = BVH_BOUNDS_OFFSET_ID) const int BVH_BOUNDS_OFFSET = 0; +#endif + +struct leaf_args { + VOID_REF bvh; + REF(vk_ir_header) header; + REF(key_id_pair) ids; + + vk_bvh_geometry_data geom_data; +}; + +struct morton_args { + VOID_REF bvh; + REF(vk_ir_header) header; + REF(key_id_pair) ids; +}; + +#define LBVH_RIGHT_CHILD_BIT_SHIFT 29 +#define LBVH_RIGHT_CHILD_BIT (1 << LBVH_RIGHT_CHILD_BIT_SHIFT) + +struct lbvh_node_info { + /* Number of children that have been processed (or are invalid/leaves) in + * the lbvh_generate_ir pass. + */ + uint32_t path_count; + + uint32_t children[2]; + uint32_t parent; +}; + +struct lbvh_main_args { + VOID_REF bvh; + REF(key_id_pair) src_ids; + VOID_REF node_info; + uint32_t id_count; + uint32_t internal_node_base; +}; + +struct lbvh_generate_ir_args { + VOID_REF bvh; + VOID_REF node_info; + VOID_REF header; + uint32_t internal_node_base; +}; + +struct ploc_prefix_scan_partition { + uint32_t aggregate; + uint32_t inclusive_sum; +}; + +#define PLOC_WORKGROUP_SIZE 1024 +#define PLOC_SUBGROUPS_PER_WORKGROUP \ + (DIV_ROUND_UP(PLOC_WORKGROUP_SIZE, SUBGROUP_SIZE)) + +struct ploc_args { + VOID_REF bvh; + VOID_REF prefix_scan_partitions; + REF(vk_ir_header) header; + VOID_REF ids_0; + VOID_REF ids_1; + uint32_t internal_node_offset; +}; + +#endif diff --git a/src/vulkan/runtime/bvh/vk_bvh.h b/src/vulkan/runtime/bvh/vk_bvh.h new file mode 100644 index 00000000000..f393fa443d4 --- /dev/null +++ b/src/vulkan/runtime/bvh/vk_bvh.h @@ -0,0 +1,156 @@ +/* + * Copyright © 2021 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef BVH_VK_BVH_H +#define BVH_VK_BVH_H + +#define vk_ir_node_triangle 0 +#define vk_ir_node_internal 1 +#define vk_ir_node_instance 2 +#define vk_ir_node_aabb 3 + +#define VK_GEOMETRY_OPAQUE (1u << 31) + +#ifdef VULKAN +#define VK_UUID_SIZE 16 +#else +#include +typedef struct vk_ir_node vk_ir_node; +typedef struct vk_global_sync_data vk_global_sync_data; +typedef struct vk_bvh_geometry_data vk_bvh_geometry_data; + +typedef struct { + float values[3][4]; +} mat3x4; + +typedef struct { + float x; + float y; + float z; +} vec3; + +typedef struct vk_aabb vk_aabb; +#endif + +struct vk_aabb { + vec3 min; + vec3 max; +}; + +/* This is the header structure for serialized acceleration structures, as + * defined by the Vulkan spec. + */ +struct vk_accel_struct_serialization_header { + uint8_t driver_uuid[VK_UUID_SIZE]; + uint8_t accel_struct_compat[VK_UUID_SIZE]; + uint64_t serialization_size; + uint64_t deserialization_size; + uint64_t instance_count; +#ifndef VULKAN + uint64_t instances[]; +#endif +}; + +struct vk_global_sync_data { + uint32_t task_counts[2]; + uint32_t task_started_counter; + uint32_t task_done_counter; + uint32_t current_phase_start_counter; + uint32_t current_phase_end_counter; + uint32_t phase_index; + /* If this flag is set, the shader should exit + * instead of executing another phase */ + uint32_t next_phase_exit_flag; +}; + +struct vk_ir_header { + int32_t min_bounds[3]; + int32_t max_bounds[3]; + uint32_t active_leaf_count; + /* Indirect dispatch dimensions for the encoder. + * ir_internal_node_count is the thread count in the X dimension, + * while Y and Z are always set to 1. */ + uint32_t ir_internal_node_count; + uint32_t dispatch_size_y; + uint32_t dispatch_size_z; + vk_global_sync_data sync_data; + uint32_t dst_node_offset; +}; + +struct vk_ir_node { + vk_aabb aabb; +}; + +#define VK_UNKNOWN_BVH_OFFSET 0xFFFFFFFF +#define VK_NULL_BVH_OFFSET 0xFFFFFFFE + +struct vk_ir_box_node { + vk_ir_node base; + uint32_t children[2]; + uint32_t bvh_offset; +}; + +struct vk_ir_aabb_node { + vk_ir_node base; + uint32_t primitive_id; + uint32_t geometry_id_and_flags; +}; + +struct vk_ir_triangle_node { + vk_ir_node base; + float coords[3][3]; + uint32_t triangle_id; + uint32_t id; + uint32_t geometry_id_and_flags; +}; + +struct vk_ir_instance_node { + vk_ir_node base; + /* See radv_bvh_instance_node */ + uint64_t base_ptr; + uint32_t custom_instance_and_mask; + uint32_t sbt_offset_and_flags; + mat3x4 otw_matrix; + uint32_t instance_id; +}; + +#define VK_BVH_INVALID_NODE 0xFFFFFFFF + +/* If the task index is set to this value, there is no + * more work to do. 
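Editor's sketch (not part of the patch): vk_global_sync_data above double-buffers task_counts[] so one slot holds the current phase's task count while the next phase's count is staged in the other slot, selected by the parity of phase_index; fetch_task then rebases a workgroup-granular task index against the phase start counter. A small C model of that indexing; the names and the workgroup size are hypothetical:

#include <stdint.h>
#include <stdio.h>

#define WORKGROUP_SIZE 64u /* assumed; the real value is a shader build parameter */

struct sync_model {
   uint32_t task_counts[2];
   uint32_t phase_index;
   uint32_t current_phase_start_counter;
};

/* The current phase reads task_counts[phase_index & 1]... */
static uint32_t model_task_count(const struct sync_model *s)
{
   return s->task_counts[s->phase_index & 1];
}

/* ...while the next phase's count is staged in the other slot. */
static void model_set_next_task_count(struct sync_model *s, uint32_t count)
{
   s->task_counts[(s->phase_index + 1) & 1] = count;
}

/* Mirrors the tail of fetch_task: the workgroup's global task index is
 * rebased against the phase start counter and expanded per invocation. */
static uint32_t model_task_index(const struct sync_model *s,
                                 uint32_t global_task_index,
                                 uint32_t local_invocation)
{
   uint32_t local_task = global_task_index - s->current_phase_start_counter;
   return local_task * WORKGROUP_SIZE + local_invocation;
}

int main(void)
{
   struct sync_model s = { .task_counts = { 1000, 0 },
                           .phase_index = 0,
                           .current_phase_start_counter = 16 };

   model_set_next_task_count(&s, 500);                      /* staged for phase 1 */
   printf("phase 0 tasks: %u\n", model_task_count(&s));     /* 1000 */
   printf("task index: %u\n", model_task_index(&s, 18, 5)); /* (18 - 16) * 64 + 5 = 133 */
   return 0;
}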
*/ +#define TASK_INDEX_INVALID 0xFFFFFFFF + +struct vk_bvh_geometry_data { + uint64_t data; + uint64_t indices; + uint64_t transform; + + uint32_t geometry_id; + uint32_t geometry_type; + uint32_t first_id; + uint32_t stride; + uint32_t vertex_format; + uint32_t index_format; +}; + +#endif diff --git a/src/vulkan/runtime/meson.build b/src/vulkan/runtime/meson.build index b325ebe6f3d..9d34ae432f0 100644 --- a/src/vulkan/runtime/meson.build +++ b/src/vulkan/runtime/meson.build @@ -7,7 +7,6 @@ vulkan_lite_runtime_files = files( 'rmv/vk_rmv_common.c', 'rmv/vk_rmv_exporter.c', - 'vk_acceleration_structure.c', 'vk_blend.c', 'vk_buffer.c', 'vk_buffer_view.c', @@ -277,6 +276,8 @@ vulkan_runtime_deps = [ ] if prog_glslang.found() + subdir('radix_sort') + subdir('bvh') vulkan_runtime_files += files('vk_texcompress_astc.c') vulkan_runtime_files += custom_target( 'astc_spv.h', @@ -288,6 +289,10 @@ if prog_glslang.found() ], depfile : 'astc_spv.h.d', ) + vulkan_runtime_files += files('vk_acceleration_structure.c') + vulkan_runtime_files += radix_sort_files + vulkan_runtime_files += bvh_spv + vulkan_runtime_files += radix_sort_spv endif libvulkan_runtime = static_library( @@ -320,7 +325,10 @@ else ) endif -idep_vulkan_runtime_headers = idep_vulkan_lite_runtime_headers +idep_vulkan_runtime_headers = [idep_vulkan_lite_runtime_headers] +idep_vulkan_runtime_headers += declare_dependency( + include_directories : include_directories('bvh'), +) idep_vulkan_runtime = declare_dependency( dependencies : [ diff --git a/src/amd/vulkan/radix_sort/LICENSE b/src/vulkan/runtime/radix_sort/LICENSE similarity index 100% rename from src/amd/vulkan/radix_sort/LICENSE rename to src/vulkan/runtime/radix_sort/LICENSE diff --git a/src/amd/vulkan/radix_sort/common/macros.h b/src/vulkan/runtime/radix_sort/common/macros.h similarity index 100% rename from src/amd/vulkan/radix_sort/common/macros.h rename to src/vulkan/runtime/radix_sort/common/macros.h diff --git a/src/amd/vulkan/radix_sort/common/util.c b/src/vulkan/runtime/radix_sort/common/util.c similarity index 100% rename from src/amd/vulkan/radix_sort/common/util.c rename to src/vulkan/runtime/radix_sort/common/util.c diff --git a/src/amd/vulkan/radix_sort/common/util.h b/src/vulkan/runtime/radix_sort/common/util.h similarity index 100% rename from src/amd/vulkan/radix_sort/common/util.h rename to src/vulkan/runtime/radix_sort/common/util.h diff --git a/src/amd/vulkan/radix_sort/common/vk/barrier.c b/src/vulkan/runtime/radix_sort/common/vk/barrier.c similarity index 81% rename from src/amd/vulkan/radix_sort/common/vk/barrier.c rename to src/vulkan/runtime/radix_sort/common/vk/barrier.c index 58134dbd11a..e0865f6b770 100644 --- a/src/amd/vulkan/radix_sort/common/vk/barrier.c +++ b/src/vulkan/runtime/radix_sort/common/vk/barrier.c @@ -7,6 +7,8 @@ // #include "barrier.h" +#include "vulkan/runtime/vk_device.h" +#include "vulkan/runtime/vk_command_buffer.h" // // @@ -15,6 +17,10 @@ void vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -23,7 +29,7 @@ vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_SHADER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -42,6 +48,10 @@ 
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb) void vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -50,7 +60,7 @@ vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, @@ -69,6 +79,10 @@ vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb) void vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -77,7 +91,7 @@ vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_SHADER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -96,6 +110,10 @@ vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb) void vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -104,7 +122,7 @@ vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -123,6 +141,10 @@ vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb) void vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -132,7 +154,7 @@ vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb) VK_ACCESS_SHADER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -151,6 +173,10 @@ vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb) void vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -160,7 +186,7 @@ vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, @@ -179,6 +205,10 @@ vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb) void vk_barrier_compute_w_to_host_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -187,7 +217,7 @@ 
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_HOST_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, @@ -206,6 +236,10 @@ vk_barrier_compute_w_to_host_r(VkCommandBuffer cb) void vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -214,7 +248,7 @@ vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_HOST_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, @@ -237,12 +271,16 @@ vk_memory_barrier(VkCommandBuffer cb, VkPipelineStageFlags dst_stage, VkAccessFlags dst_mask) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = NULL, .srcAccessMask = src_mask, .dstAccessMask = dst_mask }; - vkCmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL); + disp->CmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL); } // @@ -252,6 +290,10 @@ vk_memory_barrier(VkCommandBuffer cb, void vk_barrier_debug(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -288,7 +330,7 @@ vk_barrier_debug(VkCommandBuffer cb) VK_ACCESS_HOST_WRITE_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, diff --git a/src/amd/vulkan/radix_sort/common/vk/barrier.h b/src/vulkan/runtime/radix_sort/common/vk/barrier.h similarity index 100% rename from src/amd/vulkan/radix_sort/common/vk/barrier.h rename to src/vulkan/runtime/radix_sort/common/vk/barrier.h diff --git a/src/vulkan/runtime/radix_sort/meson.build b/src/vulkan/runtime/radix_sort/meson.build new file mode 100644 index 00000000000..138c0c9369a --- /dev/null +++ b/src/vulkan/runtime/radix_sort/meson.build @@ -0,0 +1,37 @@ +# Copyright © 2022 Konstantin Seurer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
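Editor's note: with the barrier helpers above now resolving the device dispatch table from the vk_command_buffer behind the handle, any driver built on the common runtime can record them. A hedged usage sketch follows; it is illustration only, the direct vkCmdDispatch calls are placeholders, and real runtime code would route its dispatches through the dispatch table just as the rest of this patch does.

#include <vulkan/vulkan.h>
#include "common/vk/barrier.h"

/* Record two dependent compute passes with a write -> read hazard in between.
 * 'cmd_buffer' must belong to a driver whose command buffers derive from
 * vk_command_buffer, since the helper looks up the dispatch table from it. */
static void record_two_pass_dispatch(VkCommandBuffer cmd_buffer, uint32_t groups_x)
{
   vkCmdDispatch(cmd_buffer, groups_x, 1, 1);
   vk_barrier_compute_w_to_compute_r(cmd_buffer);
   vkCmdDispatch(cmd_buffer, groups_x, 1, 1);
}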
+ +subdir('shaders') + +radix_sort_files = files( + 'common/vk/barrier.c', + 'common/vk/barrier.h', + 'common/macros.h', + 'common/util.c', + 'common/util.h', + 'shaders/push.h', + 'radix_sort_u64.c', + 'radix_sort_u64.h', + 'radix_sort_vk_devaddr.h', + 'radix_sort_vk_ext.h', + 'radix_sort_vk.c', + 'radix_sort_vk.h', + 'target.h' +) diff --git a/src/vulkan/runtime/radix_sort/radix_sort_u64.c b/src/vulkan/runtime/radix_sort/radix_sort_u64.c new file mode 100644 index 00000000000..0d5f9217656 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/radix_sort_u64.c @@ -0,0 +1,59 @@ +/* + * Copyright © 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#include "radix_sort_u64.h" +#include + +static const uint32_t init_spv[] = { +#include "radix_sort/shaders/init.comp.spv.h" +}; + +static const uint32_t fill_spv[] = { +#include "radix_sort/shaders/fill.comp.spv.h" +}; + +static const uint32_t histogram_spv[] = { +#include "radix_sort/shaders/histogram.comp.spv.h" +}; + +static const uint32_t prefix_spv[] = { +#include "radix_sort/shaders/prefix.comp.spv.h" +}; + +static const uint32_t scatter_0_even_spv[] = { +#include "radix_sort/shaders/scatter_0_even.comp.spv.h" +}; + +static const uint32_t scatter_0_odd_spv[] = { +#include "radix_sort/shaders/scatter_0_odd.comp.spv.h" +}; + +static const uint32_t scatter_1_even_spv[] = { +#include "radix_sort/shaders/scatter_1_even.comp.spv.h" +}; + +static const uint32_t scatter_1_odd_spv[] = { +#include "radix_sort/shaders/scatter_1_odd.comp.spv.h" +}; + + +radix_sort_vk_t * +vk_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, + VkPipelineCache pc, + struct radix_sort_vk_target_config config) +{ + assert(config.keyval_dwords == 2); + + const uint32_t *spv[8] = { + init_spv, fill_spv, histogram_spv, prefix_spv, + scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv, + }; + const uint32_t spv_sizes[8] = { + sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv), sizeof(prefix_spv), + sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv), sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv), + }; + return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, config); +} + diff --git a/src/vulkan/runtime/radix_sort/radix_sort_u64.h b/src/vulkan/runtime/radix_sort/radix_sort_u64.h new file mode 100644 index 00000000000..8bb37fe2082 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/radix_sort_u64.h @@ -0,0 +1,24 @@ +/* + * Copyright © 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#ifndef VK_RADIX_SORT_U64 +#define VK_RADIX_SORT_U64 + +#include "radix_sort_vk.h" + +#ifdef __cplusplus +extern "C" { +#endif + +radix_sort_vk_t * +vk_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, + VkPipelineCache pc, + struct radix_sort_vk_target_config config); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk.c b/src/vulkan/runtime/radix_sort/radix_sort_vk.c similarity index 83% rename from src/amd/vulkan/radix_sort/radix_sort_vk.c rename to src/vulkan/runtime/radix_sort/radix_sort_vk.c index 70253884fc4..31efd3d4a75 100644 --- a/src/amd/vulkan/radix_sort/radix_sort_vk.c +++ b/src/vulkan/runtime/radix_sort/radix_sort_vk.c @@ -11,6 +11,10 @@ #include "common/vk/barrier.h" #include "radix_sort_vk_devaddr.h" #include "shaders/push.h" +#include "shaders/config.h" + +#include "vk_command_buffer.h" +#include "vk_device.h" // // @@ -100,14 +104,41 @@ radix_sort_vk_get_memory_requirements(radix_sort_vk_t const * rs, // NOTE: Assumes 
.histograms are before .partitions. // // Last scatter workgroup skips writing to a partition. + // Each RS_RADIX_LOG2 (8) bit pass has a zero-initialized histogram. This + // is one RS_RADIX_SIZE histogram per keyval byte. // - // One histogram per (keyval byte + partitions) + // The last scatter workgroup skips writing to a partition so it doesn't + // need to be allocated. // - uint32_t const partitions = scatter_blocks - 1; + // If the device doesn't support "sequential dispatch" of workgroups, then + // we need a zero-initialized dword counter per radix pass in the keyval + // to atomically acquire a virtual workgroup id. On sequentially + // dispatched devices, this is simply `gl_WorkGroupID.x`. + // + // The "internal" memory map looks like this: + // + // +---------------------------------+ <-- 0 + // | histograms[keyval_size] | + // +---------------------------------+ <-- keyval_size * histo_size + // | partitions[scatter_blocks_ru-1] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + // | workgroup_ids[keyval_size] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + workgroup_ids_size + // + // The `.workgroup_ids[]` are located after the last partition. + // + VkDeviceSize const histo_size = RS_RADIX_SIZE * sizeof(uint32_t); - mr->internal_size = (mr->keyval_size + partitions) * (RS_RADIX_SIZE * sizeof(uint32_t)); + mr->internal_size = (mr->keyval_size + scatter_blocks - 1) * histo_size; mr->internal_alignment = internal_sg_size * sizeof(uint32_t); + // + // Support for nonsequential dispatch can be disabled. + // + VkDeviceSize const workgroup_ids_size = mr->keyval_size * sizeof(uint32_t); + + mr->internal_size += workgroup_ids_size; + // // Indirect // @@ -185,13 +216,17 @@ rs_pipeline_count(struct radix_sort_vk const * rs) } radix_sort_vk_t * -radix_sort_vk_create(VkDevice device, +radix_sort_vk_create(VkDevice _device, VkAllocationCallbacks const * ac, VkPipelineCache pc, const uint32_t* const* spv, const uint32_t* spv_sizes, struct radix_sort_vk_target_config config) { + VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + // // Allocate radix_sort_vk // @@ -244,6 +279,38 @@ radix_sort_vk_create(VkDevice device, .size = sizeof(struct rs_push_scatter) }, // scatter_1_odd }; + uint32_t spec_constants[] = { + [RS_FILL_WORKGROUP_SIZE] = 1u << config.fill.workgroup_size_log2, + [RS_FILL_BLOCK_ROWS] = config.fill.block_rows, + [RS_HISTOGRAM_WORKGROUP_SIZE] = 1u << config.histogram.workgroup_size_log2, + [RS_HISTOGRAM_SUBGROUP_SIZE_LOG2] = config.histogram.subgroup_size_log2, + [RS_HISTOGRAM_BLOCK_ROWS] = config.histogram.block_rows, + [RS_PREFIX_WORKGROUP_SIZE] = 1u << config.prefix.workgroup_size_log2, + [RS_PREFIX_SUBGROUP_SIZE_LOG2] = config.prefix.subgroup_size_log2, + [RS_SCATTER_WORKGROUP_SIZE] = 1u << config.scatter.workgroup_size_log2, + [RS_SCATTER_SUBGROUP_SIZE_LOG2] = config.scatter.subgroup_size_log2, + [RS_SCATTER_BLOCK_ROWS] = config.scatter.block_rows, + [RS_SCATTER_NONSEQUENTIAL_DISPATCH] = config.nonsequential_dispatch, + }; + + VkSpecializationMapEntry spec_map[ARRAY_LENGTH_MACRO(spec_constants)]; + + for (uint32_t ii = 0; ii < ARRAY_LENGTH_MACRO(spec_constants); ii++) + { + spec_map[ii] = (VkSpecializationMapEntry) { + .constantID = ii, + .offset = sizeof(uint32_t) * ii, + .size = sizeof(uint32_t), + }; + } + + VkSpecializationInfo spec_info = { + .mapEntryCount = ARRAY_LENGTH_MACRO(spec_map), + 
.pMapEntries = spec_map, + .dataSize = sizeof(spec_constants), + .pData = spec_constants, + }; + VkPipelineLayoutCreateInfo plci = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, @@ -259,7 +326,7 @@ radix_sort_vk_create(VkDevice device, { plci.pPushConstantRanges = pcr + ii; - if (vkCreatePipelineLayout(device, &plci, NULL, rs->pipeline_layouts.handles + ii) != VK_SUCCESS) + if (disp->CreatePipelineLayout(_device, &plci, NULL, rs->pipeline_layouts.handles + ii) != VK_SUCCESS) goto fail_layout; } @@ -282,7 +349,7 @@ radix_sort_vk_create(VkDevice device, smci.codeSize = spv_sizes[ii]; smci.pCode = spv[ii]; - if (vkCreateShaderModule(device, &smci, ac, sms + ii) != VK_SUCCESS) + if (disp->CreateShaderModule(_device, &smci, ac, sms + ii) != VK_SUCCESS) goto fail_shader; } @@ -323,11 +390,11 @@ radix_sort_vk_create(VkDevice device, .flags = 0, \ .stage = { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, \ .pNext = NULL, \ - .flags = 0, \ + .flags = VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT, \ .stage = VK_SHADER_STAGE_COMPUTE_BIT, \ .module = sms[idx_], \ .pName = "main", \ - .pSpecializationInfo = NULL }, \ + .pSpecializationInfo = &spec_info }, \ \ .layout = rs->pipeline_layouts.handles[idx_], \ .basePipelineHandle = VK_NULL_HANDLE, \ @@ -358,7 +425,7 @@ radix_sort_vk_create(VkDevice device, // // Create the compute pipelines // - if (vkCreateComputePipelines(device, pc, pipeline_count, cpcis, ac, rs->pipelines.handles) != VK_SUCCESS) + if (disp->CreateComputePipelines(_device, pc, pipeline_count, cpcis, ac, rs->pipelines.handles) != VK_SUCCESS) goto fail_pipeline; // @@ -366,7 +433,7 @@ radix_sort_vk_create(VkDevice device, // for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyShaderModule(device, sms[ii], ac); + disp->DestroyShaderModule(_device, sms[ii], ac); } #ifdef RS_VK_ENABLE_DEBUG_UTILS @@ -397,17 +464,17 @@ radix_sort_vk_create(VkDevice device, fail_pipeline: for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipeline(device, rs->pipelines.handles[ii], ac); + disp->DestroyPipeline(_device, rs->pipelines.handles[ii], ac); } fail_shader: for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyShaderModule(device, sms[ii], ac); + disp->DestroyShaderModule(_device, sms[ii], ac); } fail_layout: for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipelineLayout(device, rs->pipeline_layouts.handles[ii], ac); + disp->DestroyPipelineLayout(_device, rs->pipeline_layouts.handles[ii], ac); } free(rs); @@ -420,18 +487,22 @@ fail_layout: void radix_sort_vk_destroy(struct radix_sort_vk * rs, VkDevice d, VkAllocationCallbacks const * const ac) { + VK_FROM_HANDLE(vk_device, device, d); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + uint32_t const pipeline_count = rs_pipeline_count(rs); // destroy pipelines for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipeline(d, rs->pipelines.handles[ii], ac); + disp->DestroyPipeline(d, rs->pipelines.handles[ii], ac); } // destroy pipeline layouts for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipelineLayout(d, rs->pipeline_layouts.handles[ii], ac); + disp->DestroyPipelineLayout(d, rs->pipeline_layouts.handles[ii], ac); } free(rs); @@ -441,8 +512,12 @@ radix_sort_vk_destroy(struct radix_sort_vk * rs, VkDevice d, VkAllocationCallbac // // static VkDeviceAddress -rs_get_devaddr(VkDevice device, VkDescriptorBufferInfo const * dbi) +rs_get_devaddr(VkDevice _device, VkDescriptorBufferInfo const * dbi) { + 
VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + VkBufferDeviceAddressInfo const bdai = { .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, @@ -450,7 +525,7 @@ rs_get_devaddr(VkDevice device, VkDescriptorBufferInfo const * dbi) .buffer = dbi->buffer }; - VkDeviceAddress const devaddr = vkGetBufferDeviceAddress(device, &bdai) + dbi->offset; + VkDeviceAddress const devaddr = disp->GetBufferDeviceAddress(_device, &bdai) + dbi->offset; return devaddr; } @@ -465,13 +540,17 @@ rs_ext_cmd_write_timestamp(struct radix_sort_vk_ext_timestamps * ext_timestamps, VkCommandBuffer cb, VkPipelineStageFlagBits pipeline_stage) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + if ((ext_timestamps != NULL) && (ext_timestamps->timestamps_set < ext_timestamps->timestamp_count)) { - vkCmdWriteTimestamp(cb, - pipeline_stage, - ext_timestamps->timestamps, - ext_timestamps->timestamps_set++); + disp->CmdWriteTimestamp(cb, + pipeline_stage, + ext_timestamps->timestamps, + ext_timestamps->timestamps_set++); } } @@ -497,10 +576,14 @@ struct radix_sort_vk_ext_base void radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, radix_sort_vk_sort_devaddr_info_t const * info, - VkDevice device, + VkDevice _device, VkCommandBuffer cb, VkDeviceAddress * keyvals_sorted) { + VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + // // Anything to do? // @@ -557,16 +640,13 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, // Label the command buffer // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdBeginDebugUtilsLabelEXT != NULL) - { - VkDebugUtilsLabelEXT const label = { - .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, - .pNext = NULL, - .pLabelName = "radix_sort_vk_sort", - }; + VkDebugUtilsLabelEXT const label = { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pNext = NULL, + .pLabelName = "radix_sort_vk_sort", + }; - pfn_vkCmdBeginDebugUtilsLabelEXT(cb, &label); - } + disp->CmdBeginDebugUtilsLabelEXT(cb, &label); #endif // @@ -679,16 +759,16 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, .passes = passes }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_histogram), &push_histogram); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); - vkCmdDispatch(cb, histo_blocks, 1, 1); + disp->CmdDispatch(cb, histo_blocks, 1, 1); //////////////////////////////////////////////////////////////////////// // @@ -707,16 +787,16 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, .devaddr_histograms = devaddr_histograms, }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_prefix), &push_prefix); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - vkCmdDispatch(cb, passes, 1, 1); + disp->CmdDispatch(cb, passes, 1, 1); //////////////////////////////////////////////////////////////////////// // @@ -746,14 +826,14 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, { uint32_t const pass_dword = pass_idx / 4; - vkCmdPushConstants(cb, + 
disp->CmdPushConstants(cb, rs->pipeline_layouts.named.scatter[pass_dword].even, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_scatter), &push_scatter); - vkCmdBindPipeline(cb, + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.scatter[pass_dword].even); } @@ -762,7 +842,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, while (true) { - vkCmdDispatch(cb, scatter_blocks, 1, 1); + disp->CmdDispatch(cb, scatter_blocks, 1, 1); // // Continue? @@ -788,7 +868,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, // VkPipelineLayout const pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even // : rs->pipeline_layouts.named.scatter[pass_dword].odd; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, pl, VK_SHADER_STAGE_COMPUTE_BIT, OFFSETOF_MACRO(struct rs_push_scatter, devaddr_histograms), @@ -801,7 +881,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, VkPipeline const p = is_even ? rs->pipelines.named.scatter[pass_dword].even // : rs->pipelines.named.scatter[pass_dword].odd; - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); } #ifdef RS_VK_ENABLE_EXTENSIONS @@ -812,10 +892,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, // End the label // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdEndDebugUtilsLabelEXT != NULL) - { - pfn_vkCmdEndDebugUtilsLabelEXT(cb); - } + disp->CmdEndDebugUtilsLabelEXT(cb); #endif } @@ -825,10 +902,14 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, void radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * rs, radix_sort_vk_sort_indirect_devaddr_info_t const * info, - VkDevice device, + VkDevice _device, VkCommandBuffer cb, VkDeviceAddress * keyvals_sorted) { + VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + // // Anything to do? 
// @@ -886,16 +967,13 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * // Label the command buffer // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdBeginDebugUtilsLabelEXT != NULL) - { - VkDebugUtilsLabelEXT const label = { - .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, - .pNext = NULL, - .pLabelName = "radix_sort_vk_sort_indirect", - }; + VkDebugUtilsLabelEXT const label = { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pNext = NULL, + .pLabelName = "radix_sort_vk_sort_indirect", + }; - pfn_vkCmdBeginDebugUtilsLabelEXT(cb, &label); - } + disp->CmdBeginDebugUtilsLabelEXT(cb, &label); #endif // @@ -938,16 +1016,16 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .passes = passes }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.init, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_init), &push_init); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.init); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.init); - vkCmdDispatch(cb, 1, 1, 1); + disp->CmdDispatch(cb, 1, 1, 1); } #ifdef RS_VK_ENABLE_EXTENSIONS @@ -967,14 +1045,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .dword = 0xFFFFFFFF }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.fill, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_pad), &push_pad); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); info->dispatch_indirect(cb, &info->indirect, offsetof(struct rs_indirect_info, dispatch.pad)); } @@ -992,14 +1070,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .dword = 0 }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.fill, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_zero), &push_zero); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); info->dispatch_indirect(cb, &info->indirect, offsetof(struct rs_indirect_info, dispatch.zero)); } @@ -1021,14 +1099,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .passes = passes }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_histogram), &push_histogram); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); info->dispatch_indirect(cb, &info->indirect, @@ -1049,16 +1127,16 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .devaddr_histograms = devaddr_histograms, }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_prefix), &push_prefix); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - vkCmdDispatch(cb, passes, 1, 1); + disp->CmdDispatch(cb, passes, 1, 1); } #ifdef RS_VK_ENABLE_EXTENSIONS @@ -1088,14 +1166,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * { uint32_t const pass_dword = pass_idx / 4; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.scatter[pass_dword].even, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_scatter), 
&push_scatter); - vkCmdBindPipeline(cb, + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.scatter[pass_dword].even); } @@ -1134,7 +1212,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * VkPipelineLayout const pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even // : rs->pipeline_layouts.named.scatter[pass_dword].odd; - vkCmdPushConstants( + disp->CmdPushConstants( cb, pl, VK_SHADER_STAGE_COMPUTE_BIT, @@ -1148,7 +1226,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * VkPipeline const p = is_even ? rs->pipelines.named.scatter[pass_dword].even // : rs->pipelines.named.scatter[pass_dword].odd; - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); } } @@ -1160,10 +1238,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * // End the label // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdEndDebugUtilsLabelEXT != NULL) - { - pfn_vkCmdEndDebugUtilsLabelEXT(cb); - } + disp->CmdEndDebugUtilsLabelEXT(cb); #endif } @@ -1177,7 +1252,11 @@ radix_sort_vk_fill_buffer(VkCommandBuffer cb, VkDeviceSize size, uint32_t data) { - vkCmdFillBuffer(cb, buffer_info->buffer, buffer_info->offset + offset, size, data); + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + + disp->CmdFillBuffer(cb, buffer_info->buffer, buffer_info->offset + offset, size, data); } // @@ -1221,7 +1300,11 @@ radix_sort_vk_dispatch_indirect(VkCommandBuffer cb, radix_sort_vk_buffer_info_t const * buffer_info, VkDeviceSize offset) { - vkCmdDispatchIndirect(cb, buffer_info->buffer, buffer_info->offset + offset); + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + + disp->CmdDispatchIndirect(cb, buffer_info->buffer, buffer_info->offset + offset); } // diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk.h b/src/vulkan/runtime/radix_sort/radix_sort_vk.h similarity index 100% rename from src/amd/vulkan/radix_sort/radix_sort_vk.h rename to src/vulkan/runtime/radix_sort/radix_sort_vk.h diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h b/src/vulkan/runtime/radix_sort/radix_sort_vk_devaddr.h similarity index 100% rename from src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h rename to src/vulkan/runtime/radix_sort/radix_sort_vk_devaddr.h diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk_ext.h b/src/vulkan/runtime/radix_sort/radix_sort_vk_ext.h similarity index 100% rename from src/amd/vulkan/radix_sort/radix_sort_vk_ext.h rename to src/vulkan/runtime/radix_sort/radix_sort_vk_ext.h diff --git a/src/amd/vulkan/radix_sort/shaders/bufref.h b/src/vulkan/runtime/radix_sort/shaders/bufref.h similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/bufref.h rename to src/vulkan/runtime/radix_sort/shaders/bufref.h diff --git a/src/vulkan/runtime/radix_sort/shaders/config.h b/src/vulkan/runtime/radix_sort/shaders/config.h new file mode 100644 index 00000000000..702f1649605 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/shaders/config.h @@ -0,0 +1,33 @@ +// Copyright 2024 Valve Corporation +// SPDX-License-Identifier: MIT + +#ifdef VULKAN +#define CONFIG(_name, _id, default_val) layout (constant_id = _id) const int _name = default_val; +#else +enum rs_config { +#define CONFIG(_name, _id, default_val) _name = _id, +#endif + +#define RS_FILL_WORKGROUP_SIZE_ID 0 +CONFIG(RS_FILL_WORKGROUP_SIZE, 
RS_FILL_WORKGROUP_SIZE_ID, 7) +CONFIG(RS_FILL_BLOCK_ROWS, 1, 8) + +#define RS_HISTOGRAM_WORKGROUP_SIZE_ID 2 +CONFIG(RS_HISTOGRAM_WORKGROUP_SIZE, RS_HISTOGRAM_WORKGROUP_SIZE_ID, 7) +CONFIG(RS_HISTOGRAM_SUBGROUP_SIZE_LOG2, 3, 7) +CONFIG(RS_HISTOGRAM_BLOCK_ROWS, 4, 8) + +#define RS_PREFIX_WORKGROUP_SIZE_ID 5 +CONFIG(RS_PREFIX_WORKGROUP_SIZE, RS_PREFIX_WORKGROUP_SIZE_ID, 8) +CONFIG(RS_PREFIX_SUBGROUP_SIZE_LOG2, 6, 6) + +#define RS_SCATTER_WORKGROUP_SIZE_ID 7 +CONFIG(RS_SCATTER_WORKGROUP_SIZE, RS_SCATTER_WORKGROUP_SIZE_ID, 8) +CONFIG(RS_SCATTER_SUBGROUP_SIZE_LOG2, 8, 6) +CONFIG(RS_SCATTER_BLOCK_ROWS, 9, 14) + +CONFIG(RS_SCATTER_NONSEQUENTIAL_DISPATCH, 10, 0) + +#ifndef VULKAN +}; +#endif diff --git a/src/amd/vulkan/radix_sort/shaders/fill.comp b/src/vulkan/runtime/radix_sort/shaders/fill.comp similarity index 89% rename from src/amd/vulkan/radix_sort/shaders/fill.comp rename to src/vulkan/runtime/radix_sort/shaders/fill.comp index 76b446d8c5d..c85d650d0ff 100644 --- a/src/amd/vulkan/radix_sort/shaders/fill.comp +++ b/src/vulkan/runtime/radix_sort/shaders/fill.comp @@ -49,23 +49,11 @@ layout(push_constant) uniform block_push // RS_STRUCT_INDIRECT_INFO_FILL(); -// -// Check all switches are defined -// -#ifndef RS_FILL_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_FILL_WORKGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_FILL_BLOCK_ROWS -#error "Undefined: RS_FILL_BLOCK_ROWS" -#endif - // // Local macros // // clang-format off -#define RS_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_FILL_WORKGROUP_SIZE) #define RS_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_WORKGROUP_SIZE) #define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1) // clang-format on @@ -73,7 +61,7 @@ RS_STRUCT_INDIRECT_INFO_FILL(); // // // -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_FILL_WORKGROUP_SIZE_ID) in; // // diff --git a/src/amd/vulkan/radix_sort/shaders/histogram.comp b/src/vulkan/runtime/radix_sort/shaders/histogram.comp similarity index 78% rename from src/amd/vulkan/radix_sort/shaders/histogram.comp rename to src/vulkan/runtime/radix_sort/shaders/histogram.comp index 7d554630fe5..0eb078807b7 100644 --- a/src/amd/vulkan/radix_sort/shaders/histogram.comp +++ b/src/vulkan/runtime/radix_sort/shaders/histogram.comp @@ -61,26 +61,11 @@ layout(push_constant) uniform block_push #error "Undefined: RS_KEYVAL_DWORDS" #endif -// -#ifndef RS_HISTOGRAM_BLOCK_ROWS -#error "Undefined: RS_HISTOGRAM_BLOCK_ROWS" -#endif - -// -#ifndef RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_HISTOGRAM_WORKGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 -#error "Undefined: RS_HISTOGRAM_SUBGROUP_SIZE_LOG2" -#endif - // // Local macros // // clang-format off -#define RS_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_HISTOGRAM_WORKGROUP_SIZE) #define RS_SUBGROUP_SIZE (1 << RS_HISTOGRAM_SUBGROUP_SIZE_LOG2) #define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE) #define RS_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_WORKGROUP_SIZE) @@ -104,11 +89,8 @@ layout(push_constant) uniform block_push // #define RS_HISTOGRAM_BASE(pass_) ((RS_RADIX_SIZE * 4) * pass_) -#if (RS_WORKGROUP_SUBGROUPS == 1) -#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_SubgroupInvocationID * 4) -#else -#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_LocalInvocationID.x * 4) -#endif +#define RS_HISTOGRAM_OFFSET(pass_) \ + RS_HISTOGRAM_BASE(pass_) + (RS_WORKGROUP_SUBGROUPS == 1 ? 
gl_SubgroupInvocationID : gl_LocalInvocationID.x) * 4 // // Assumes (RS_RADIX_LOG2 == 8) @@ -167,7 +149,7 @@ shared rs_histogram_smem smem; // // // -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_HISTOGRAM_WORKGROUP_SIZE_ID) in; // // @@ -196,41 +178,38 @@ rs_histogram_zero() // // Zero SMEM histogram // -#if (RS_WORKGROUP_SUBGROUPS == 1) - - const uint32_t smem_offset = gl_SubgroupInvocationID; - - [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - smem.histogram[smem_offset + ii] = 0; - } + const uint32_t smem_offset = gl_SubgroupInvocationID; -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - - const uint32_t smem_offset = gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - smem.histogram[smem_offset + ii] = 0; - } - - const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); - - if (smem_idx < RS_RADIX_SIZE) + [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - smem.histogram[smem_idx] = 0; + smem.histogram[smem_offset + ii] = 0; + } + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + const uint32_t smem_offset = gl_LocalInvocationID.x; + + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) + { + smem.histogram[smem_offset + ii] = 0; } -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif - { - smem.histogram[gl_LocalInvocationID.x] = 0; - } - -#endif + if (smem_idx < RS_RADIX_SIZE) + { + smem.histogram[smem_idx] = 0; + } + } + else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + { + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + smem.histogram[gl_LocalInvocationID.x] = 0; + } + } } // @@ -242,50 +221,47 @@ rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms) // // Store to GMEM // -#if (RS_WORKGROUP_SUBGROUPS == 1) - - const uint32_t smem_offset = gl_SubgroupInvocationID; - - [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - const uint32_t count = smem.histogram[smem_offset + ii]; + const uint32_t smem_offset = gl_SubgroupInvocationID; - atomicAdd(rs_histograms.extent[ii], count); - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - - const uint32_t smem_offset = gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - const uint32_t count = smem.histogram[smem_offset + ii]; - - atomicAdd(rs_histograms.extent[ii], count); - } - - const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); - - if (smem_idx < RS_RADIX_SIZE) + [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - const uint32_t count = smem.histogram[smem_idx]; + const uint32_t count = smem.histogram[smem_offset + ii]; - atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)], - count); + atomicAdd(rs_histograms.extent[ii], count); + } + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + const uint32_t smem_offset = gl_LocalInvocationID.x; + + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) + { + const uint32_t count 
= smem.histogram[smem_offset + ii]; + + atomicAdd(rs_histograms.extent[ii], count); } -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif - { - const uint32_t count = smem.histogram[gl_LocalInvocationID.x]; + if (smem_idx < RS_RADIX_SIZE) + { + const uint32_t count = smem.histogram[smem_idx]; - atomicAdd(rs_histograms.extent[0], count); - } + atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)], + count); + } + } + else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + { + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + const uint32_t count = smem.histogram[gl_LocalInvocationID.x]; -#endif + atomicAdd(rs_histograms.extent[0], count); + } + } } #endif @@ -298,21 +274,19 @@ rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms) void rs_histogram_atomic_after_write() { -#if (RS_WORKGROUP_SUBGROUPS == 1) - subgroupMemoryBarrierShared(); -#else - barrier(); -#endif + if (RS_WORKGROUP_SUBGROUPS == 1) + subgroupMemoryBarrierShared(); + else + barrier(); } void rs_histogram_read_after_atomic() { -#if (RS_WORKGROUP_SUBGROUPS == 1) - subgroupMemoryBarrierShared(); -#else - barrier(); -#endif + if (RS_WORKGROUP_SUBGROUPS == 1) + subgroupMemoryBarrierShared(); + else + barrier(); } #endif diff --git a/src/amd/vulkan/radix_sort/shaders/init.comp b/src/vulkan/runtime/radix_sort/shaders/init.comp similarity index 76% rename from src/amd/vulkan/radix_sort/shaders/init.comp rename to src/vulkan/runtime/radix_sort/shaders/init.comp index 1ffd48d79df..5865be65488 100644 --- a/src/amd/vulkan/radix_sort/shaders/init.comp +++ b/src/vulkan/runtime/radix_sort/shaders/init.comp @@ -53,9 +53,9 @@ RS_STRUCT_INDIRECT_INFO(); // Local macros // // clang-format off -#define RS_FILL_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2) -#define RS_SCATTER_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2) -#define RS_HISTOGRAM_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2) +#define RS_FILL_WORKGROUP_SIZE (RS_FILL_WORKGROUP_SIZE) +#define RS_SCATTER_WORKGROUP_SIZE (RS_SCATTER_WORKGROUP_SIZE) +#define RS_HISTOGRAM_WORKGROUP_SIZE (RS_HISTOGRAM_WORKGROUP_SIZE) #define RS_FILL_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE) #define RS_SCATTER_BLOCK_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE) @@ -150,12 +150,34 @@ main() // 256-dword partitions directly follow the 256-dword histograms, we // can dispatch just one FILL. // + // The "internal" memory map looks like this: + // + // +---------------------------------+ <-- 0 + // | histograms[keyval_size] | + // +---------------------------------+ <-- keyval_size * histo_dwords + // | partitions[scatter_blocks_ru-1] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords + // | workgroup_ids[keyval_size] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords + keyval_size + // + // NOTE(allanmac): The `.block_offset` and `.dword_offset_min` + // parameters are zeroes because the host can offset the buffer + // device address since the number of passes is known by the host. + // If we ever wanted to supported an indirect number of "key" bits + // in the sort, then this would need to change. 
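Editor's sketch (not part of the patch): the memory map spelled out above sizes the "internal" buffer as one RS_RADIX_SIZE histogram per keyval byte, one partition per scatter block except the last, plus one dword per keyval byte for virtual workgroup-id acquisition. A small C model with example numbers:

#include <stdint.h>
#include <stdio.h>

#define RS_RADIX_SIZE 256u

int main(void)
{
   uint32_t keyval_size    = 8;   /* keyval_dwords == 2, i.e. a 64-bit keyval */
   uint32_t scatter_blocks = 100; /* example scatter block count */
   uint32_t histo_size     = RS_RADIX_SIZE * sizeof(uint32_t);

   /* histograms[keyval_size] + partitions[scatter_blocks - 1] */
   uint32_t internal_size = (keyval_size + scatter_blocks - 1) * histo_size;

   /* workgroup_ids[keyval_size]: one dword per 8-bit pass */
   internal_size += keyval_size * sizeof(uint32_t);

   printf("internal buffer: %u bytes\n", internal_size); /* 109600 */
   return 0;
}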
+ // + // NOTE(allanmac): The `.workgroup_ids[]` are only used if + // nonsequential dispatch isn't supported by the device. + // rs_indirect_info_fill zero; zero.block_offset = 0; zero.dword_offset_min = 0; zero.dword_offset_max_minus_min = (push.passes + scatter_ru_blocks - 1) * RS_RADIX_SIZE; + if (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0) + zero.dword_offset_max_minus_min += (RS_KEYVAL_DWORDS * 4); // one pass per byte + const uint32_t zero_ru_blocks = RS_COUNT_RU_BLOCKS(zero.dword_offset_max_minus_min, RS_FILL_BLOCK_DWORDS); diff --git a/src/vulkan/runtime/radix_sort/shaders/meson.build b/src/vulkan/runtime/radix_sort/shaders/meson.build new file mode 100644 index 00000000000..4152735b730 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/shaders/meson.build @@ -0,0 +1,53 @@ +# Copyright © 2022 Konstantin Seurer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +radix_sort_shaders = [ + 'init.comp', + 'fill.comp', + 'histogram.comp', + 'prefix.comp', + 'scatter_0_even.comp', + 'scatter_0_odd.comp', + 'scatter_1_even.comp', + 'scatter_1_odd.comp' +] + +shader_include_files = files( + 'bufref.h', + 'prefix_limits.h', + 'prefix.h', + 'push.h', + 'scatter.glsl', + 'config.h', +) + +defines = ['-DRS_KEYVAL_DWORDS=2'] + +radix_sort_spv = [] +foreach s : radix_sort_shaders + radix_sort_spv += custom_target( + s + '.spv.h', + input : s, + output : s + '.spv.h', + command : [ + prog_glslang, '-V', '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + ] + defines + glslang_quiet + (with_mesa_debug ? 
['-g'] : []), + depend_files: shader_include_files) +endforeach diff --git a/src/amd/vulkan/radix_sort/shaders/prefix.comp b/src/vulkan/runtime/radix_sort/shaders/prefix.comp similarity index 69% rename from src/amd/vulkan/radix_sort/shaders/prefix.comp rename to src/vulkan/runtime/radix_sort/shaders/prefix.comp index aae88869a6e..650d3305fd6 100644 --- a/src/amd/vulkan/radix_sort/shaders/prefix.comp +++ b/src/vulkan/runtime/radix_sort/shaders/prefix.comp @@ -46,41 +46,20 @@ layout(push_constant) uniform block_push #define RS_SUBGROUP_UNIFORM #endif -// -// Check all switches are defined -// -// -#ifndef RS_PREFIX_SUBGROUP_SIZE_LOG2 -#error "Undefined: RS_PREFIX_SUBGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_PREFIX_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_PREFIX_WORKGROUP_SIZE_LOG2" -#endif - // // Local macros // // clang-format off #define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4) -#define RS_WORKGROUP_SIZE (1 << RS_PREFIX_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_PREFIX_WORKGROUP_SIZE) #define RS_SUBGROUP_SIZE (1 << RS_PREFIX_SUBGROUP_SIZE_LOG2) #define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE) // clang-format on // -// There is no purpose in having a workgroup size larger than the -// radix size. -// -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) -#error "Error: (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)" -#endif - // // -// -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_PREFIX_WORKGROUP_SIZE_ID) in; // // Histogram buffer reference @@ -95,34 +74,23 @@ layout(buffer_reference, std430) buffer buffer_rs_histograms // #include "prefix_limits.h" -// -// If multi-subgroup then define shared memory -// -#if (RS_WORKGROUP_SUBGROUPS > 1) - //---------------------------------------- shared uint32_t smem_sweep0[RS_SWEEP_0_SIZE]; #define RS_PREFIX_SWEEP0(idx_) smem_sweep0[idx_] //---------------------------------------- -#if (RS_SWEEP_1_SIZE > 0) //---------------------------------------- shared uint32_t smem_sweep1[RS_SWEEP_1_SIZE]; #define RS_PREFIX_SWEEP1(idx_) smem_sweep1[idx_] //---------------------------------------- -#endif -#if (RS_SWEEP_2_SIZE > 0) //---------------------------------------- shared uint32_t smem_sweep2[RS_SWEEP_2_SIZE]; #define RS_PREFIX_SWEEP2(idx_) smem_sweep2[idx_] //---------------------------------------- -#endif - -#endif // // Define function arguments @@ -151,37 +119,21 @@ main() // // Define buffer reference to read histograms // -#if (RS_WORKGROUP_SUBGROUPS == 1) - // - // Define histograms bufref for single subgroup - // // NOTE(allanmac): The histogram buffer reference could be adjusted // on the host to save a couple instructions at the cost of added // complexity. // + const uint32_t invocation_id = RS_WORKGROUP_SUBGROUPS == 1 ? 
gl_SubgroupInvocationID : gl_LocalInvocationID.x; + RS_SUBGROUP_UNIFORM const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE); - const uint32_t histograms_offset = (histograms_base + gl_SubgroupInvocationID) * 4; + const uint32_t histograms_offset = (histograms_base + invocation_id) * 4; RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, rs_histograms, push.devaddr_histograms, histograms_offset); -#else - // - // Define histograms bufref for workgroup - // - RS_SUBGROUP_UNIFORM - const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE); - const uint32_t histograms_offset = (histograms_base + gl_LocalInvocationID.x) * 4; - - RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, - rs_histograms, - push.devaddr_histograms, - histograms_offset); - -#endif // // Compute exclusive prefix of uint32_t[256] diff --git a/src/vulkan/runtime/radix_sort/shaders/prefix.h b/src/vulkan/runtime/radix_sort/shaders/prefix.h new file mode 100644 index 00000000000..f9582da0067 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/shaders/prefix.h @@ -0,0 +1,356 @@ +// Copyright 2021 The Fuchsia Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ +#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ + +// +// Requires several defines +// +#ifndef RS_PREFIX_LIMITS +#error "Error: \"prefix_limits.h\" not loaded" +#endif + +#ifndef RS_PREFIX_ARGS +#error "Error: RS_PREFIX_ARGS undefined" +#endif + +#ifndef RS_PREFIX_LOAD +#error "Error: RS_PREFIX_LOAD undefined" +#endif + +#ifndef RS_PREFIX_STORE +#error "Error: RS_PREFIX_STORE undefined" +#endif + +// +// Optional switches: +// +// * Disable holding original inclusively scanned histogram values in registers. +// +// #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS +// + +// +// Compute exclusive prefix of uint32_t[256] +// +void +rs_prefix(RS_PREFIX_ARGS) +{ + if (RS_WORKGROUP_SUBGROUPS == 1) + { + // + // Workgroup is a single subgroup so no shared memory is required. + // + + // + // Exclusive scan-add the histogram + // + const uint32_t h0 = RS_PREFIX_LOAD(0); + const uint32_t h0_inc = subgroupInclusiveAdd(h0); + RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + + RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive + + // + // Each iteration is dependent on the previous so no unrolling. The + // compiler is free to hoist the loads upward though. + // + for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; // + ii < RS_RADIX_SIZE; + ii += RS_SUBGROUP_SIZE) + { + const uint32_t h = RS_PREFIX_LOAD(ii); + const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last; + h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); + + RS_PREFIX_STORE(ii) = h_inc - h; // exclusive + } + } + else + { + // + // Workgroup is multiple subgroups and uses shared memory to store + // the scan's intermediate results. + // + // Assumes a power-of-two subgroup, workgroup and radix size. + // + // Downsweep: Repeatedly scan reductions until they fit in a single + // subgroup. + // + // Upsweep: Then uniformly apply reductions to each subgroup. 
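To make the single-subgroup branch above easier to follow, here is a scalar C model of the same computation: the 256 histogram bins are scanned one subgroup-sized chunk at a time, with the previous chunk's running total (h_last) carried into the next chunk. The function and names are illustrative only, not part of the patch.

#include <stdint.h>

#define RS_RADIX_SIZE 256

/* Scalar model of the single-subgroup exclusive scan above. */
static void
rs_prefix_model(uint32_t h[RS_RADIX_SIZE], uint32_t subgroup_size)
{
   uint32_t h_last = 0; /* carry from the previous chunk */

   for (uint32_t base = 0; base < RS_RADIX_SIZE; base += subgroup_size) {
      uint32_t inc = h_last;

      for (uint32_t i = 0; i < subgroup_size; i++) {
         const uint32_t v = h[base + i];

         inc += v;              /* inclusive scan within the chunk          */
         h[base + i] = inc - v; /* keep the exclusive prefix, like h_inc - h */
      }

      h_last = inc; /* plays the role of subgroupBroadcast(h_inc, last) */
   }
}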
+ // + // + // Subgroup Size | 4 | 8 | 16 | 32 | 64 | 128 | + // --------------+----+----+----+----+----+-----+ + // Sweep 0 | 64 | 32 | 16 | 8 | 4 | 2 | sweep_0[] + // Sweep 1 | 16 | 4 | - | - | - | - | sweep_1[] + // Sweep 2 | 4 | - | - | - | - | - | sweep_2[] + // --------------+----+----+----+----+----+-----+ + // Total dwords | 84 | 36 | 16 | 8 | 4 | 2 | + // --------------+----+----+----+----+----+-----+ + // +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + uint32_t h_exc[RS_H_COMPONENTS]; +#endif + + // + // Downsweep 0 + // + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + const uint32_t h_inc = subgroupInclusiveAdd(h); + + const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); + + // +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + h_exc[ii] = h_inc - h; +#else + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h; +#endif + } + + barrier(); + + // + // Skip generalizing these sweeps for all possible subgroups -- just + // write them directly. + // + if (RS_SUBGROUP_SIZE == 128) + { + // There are only two elements in SWEEP0 per subgroup. The scan is + // trivial so we fold it into the upsweep. + } + else if (RS_SUBGROUP_SIZE >= 16) + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 + // + if (RS_SWEEP_0_SIZE != RS_WORKGROUP_SIZE && // workgroup has inactive components + gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) + { + const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; + } + + barrier(); + } + else if (RS_SUBGROUP_SIZE == 8) + { + if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 and Downsweep 1 + // + if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations + { + const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + else + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 and Downsweep 1 + // + [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; + const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + + barrier(); + + // + // Scan 1 + // + if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations + { + const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); + const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); + + RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; + } + + barrier(); + } + else if (RS_SUBGROUP_SIZE == 4) + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 and Downsweep 1 + // + if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) + { + if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations + { + const uint32_t h0_red = 
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + else + { + [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; + const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + + barrier(); + + // + // Scan 1 and Downsweep 2 + // + if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE) + { + if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations + { + const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); + const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); + + RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; + RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); + } + } + else + { + [[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations + { + const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; + const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1); + const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); + + RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red; + RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); + } + } + + barrier(); + + // + // Scan 2 + // + // 4 invocations + // + if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE) + { + const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x); + const uint32_t h2_inc = subgroupInclusiveAdd(h2_red); + + RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red; + } + + barrier(); + } + + ////////////////////////////////////////////////////////////////////// + // + // Final upsweep 0 + // + if (RS_SUBGROUP_SIZE == 128) + { + // There must be more than one subgroup per workgroup, but the maximum + // workgroup size is 256 so there must be exactly two subgroups per + // workgroup and RS_H_COMPONENTS must be 1. +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(0) = h_exc[0] + (gl_SubgroupID > 0 ? RS_PREFIX_SWEEP0(0) : 0); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(0); + + RS_PREFIX_STORE(0) = h_exc + (gl_SubgroupID > 0 ? 
RS_PREFIX_SWEEP0(0) : 0); +#endif + } + else if (RS_SUBGROUP_SIZE >= 16) + { + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + // clang format issue +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0); +#endif + } + } + else if (RS_SUBGROUP_SIZE == 8) + { + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; + +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); +#endif + } + } + else if (RS_SUBGROUP_SIZE == 4) + { + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; + const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE; + +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); +#endif + } + } + } +} + +// +// +// + +#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ diff --git a/src/amd/vulkan/radix_sort/shaders/prefix_limits.h b/src/vulkan/runtime/radix_sort/shaders/prefix_limits.h similarity index 71% rename from src/amd/vulkan/radix_sort/shaders/prefix_limits.h rename to src/vulkan/runtime/radix_sort/shaders/prefix_limits.h index a98e554ad4a..4d0e89fb9c2 100644 --- a/src/amd/vulkan/radix_sort/shaders/prefix_limits.h +++ b/src/vulkan/runtime/radix_sort/shaders/prefix_limits.h @@ -10,17 +10,12 @@ // #define RS_PREFIX_LIMITS -// -// Multi-subgroup prefix requires shared memory. -// -#if (RS_WORKGROUP_SUBGROUPS > 1) - // clang-format off #define RS_H_COMPONENTS (RS_RADIX_SIZE / RS_WORKGROUP_SIZE) -#define RS_SWEEP_0_SIZE (RS_RADIX_SIZE / RS_SUBGROUP_SIZE) -#define RS_SWEEP_1_SIZE (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE) -#define RS_SWEEP_2_SIZE (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE) +#define RS_SWEEP_0_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_RADIX_SIZE / RS_SUBGROUP_SIZE)) +#define RS_SWEEP_1_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE)) +#define RS_SWEEP_2_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE)) #define RS_SWEEP_SIZE (RS_SWEEP_0_SIZE + RS_SWEEP_1_SIZE + RS_SWEEP_2_SIZE) @@ -32,15 +27,6 @@ #define RS_SWEEP_2_OFFSET (RS_SWEEP_1_OFFSET + RS_SWEEP_1_SIZE) // clang-format on -// -// Single subgroup prefix doesn't use shared memory. 
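For intuition, the multi-subgroup path above is a classic multi-level scan: exclusive scans within each chunk, an exclusive scan over the chunk totals, then a uniform add back. A minimal two-level scalar sketch follows (one sweep level only; the shader adds further levels for very small subgroups, and all names here are illustrative):

#include <stdint.h>

/* Two-level exclusive scan; "chunk" plays the role of the subgroup size. */
static void
two_level_exclusive_scan(uint32_t *h, uint32_t n, uint32_t chunk)
{
   uint32_t sweep0[256]; /* chunk totals; n / chunk entries are used */
   const uint32_t chunks = n / chunk;

   /* Downsweep: exclusive scan within each chunk, record the chunk total. */
   for (uint32_t c = 0; c < chunks; c++) {
      uint32_t sum = 0;

      for (uint32_t i = 0; i < chunk; i++) {
         const uint32_t v = h[c * chunk + i];

         h[c * chunk + i] = sum;
         sum += v;
      }

      sweep0[c] = sum;
   }

   /* Exclusive scan of the chunk totals themselves. */
   uint32_t carry = 0;

   for (uint32_t c = 0; c < chunks; c++) {
      const uint32_t v = sweep0[c];

      sweep0[c] = carry;
      carry += v;
   }

   /* Upsweep: uniformly add each chunk's scanned total back in. */
   for (uint32_t c = 0; c < chunks; c++)
      for (uint32_t i = 0; i < chunk; i++)
         h[c * chunk + i] += sweep0[c];
}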
-// -#else - -#define RS_SWEEP_SIZE 0 - -#endif - // // // diff --git a/src/amd/vulkan/radix_sort/shaders/push.h b/src/vulkan/runtime/radix_sort/shaders/push.h similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/push.h rename to src/vulkan/runtime/radix_sort/shaders/push.h diff --git a/src/amd/vulkan/radix_sort/shaders/scatter.glsl b/src/vulkan/runtime/radix_sort/shaders/scatter.glsl similarity index 58% rename from src/amd/vulkan/radix_sort/shaders/scatter.glsl rename to src/vulkan/runtime/radix_sort/shaders/scatter.glsl index b57d9e80850..bacd44682f5 100644 --- a/src/amd/vulkan/radix_sort/shaders/scatter.glsl +++ b/src/vulkan/runtime/radix_sort/shaders/scatter.glsl @@ -84,21 +84,6 @@ layout(push_constant) uniform block_push #error "Undefined: RS_SCATTER_KEYVAL_DWORD_BASE" #endif -// -#ifndef RS_SCATTER_BLOCK_ROWS -#error "Undefined: RS_SCATTER_BLOCK_ROWS" -#endif - -// -#ifndef RS_SCATTER_SUBGROUP_SIZE_LOG2 -#error "Undefined: RS_SCATTER_SUBGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_SCATTER_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_SCATTER_WORKGROUP_SIZE_LOG2" -#endif - // // Status masks are defined differently for the scatter_even and // scatter_odd shaders. @@ -140,7 +125,7 @@ layout(push_constant) uniform block_push // // clang-format off #define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4) -#define RS_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_SCATTER_WORKGROUP_SIZE) #define RS_SUBGROUP_SIZE (1 << RS_SCATTER_SUBGROUP_SIZE_LOG2) #define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE) #define RS_SUBGROUP_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SUBGROUP_SIZE) @@ -148,13 +133,6 @@ layout(push_constant) uniform block_push #define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1) // clang-format on -// -// Validate number of keyvals fit in a uint16_t. -// -#if (RS_BLOCK_KEYVALS >= 65536) -#error "Error: (RS_BLOCK_KEYVALS >= 65536)" -#endif - // // Keyval type // @@ -181,9 +159,7 @@ layout(push_constant) uniform block_push // Determine at compile time the base of the final iteration for // workgroups smaller than RS_RADIX_SIZE. // -#if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) #define RS_WORKGROUP_BASE_FINAL ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE) -#endif // // Max macro @@ -291,7 +267,7 @@ layout(push_constant) uniform block_push // // // -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_SCATTER_WORKGROUP_SIZE_ID) in; // // @@ -325,48 +301,55 @@ shared rs_scatter_smem smem; // The shared memory barrier is either subgroup-wide or // workgroup-wide. 
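Because the workgroup size is now a specialization constant (local_size_x_id) rather than a compile-time define, the host must supply it at pipeline creation. A minimal sketch of what that could look like; the constant ID value and names are assumptions for illustration, not the runtime's actual code:

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Feed the scatter shader's workgroup size through a specialization
 * constant.  "workgroup_size" and "entry" must outlive pipeline creation. */
static VkSpecializationInfo
rs_scatter_spec_info(const uint32_t *workgroup_size,
                     VkSpecializationMapEntry *entry)
{
   entry->constantID = 1; /* assumed value of RS_SCATTER_WORKGROUP_SIZE_ID */
   entry->offset = 0;
   entry->size = sizeof(uint32_t);

   const VkSpecializationInfo info = {
      .mapEntryCount = 1,
      .pMapEntries = entry,
      .dataSize = sizeof(uint32_t),
      .pData = workgroup_size,
   };

   return info;
}

The returned struct would be passed through VkPipelineShaderStageCreateInfo::pSpecializationInfo when the scatter pipelines are created, which is what lets the former preprocessor branches become uniform runtime branches below.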
// -#if (RS_WORKGROUP_SUBGROUPS == 1) -#define RS_BARRIER() subgroupBarrier() -#else -#define RS_BARRIER() barrier() -#endif +void rsBarrier() +{ + if (RS_WORKGROUP_SUBGROUPS == 1) + subgroupBarrier(); + else + barrier(); +} // // If multi-subgroup then define shared memory // -#if (RS_WORKGROUP_SUBGROUPS > 1) //---------------------------------------- #define RS_PREFIX_SWEEP0(idx_) smem.extent[RS_SMEM_PREFIX_OFFSET + RS_SWEEP_0_OFFSET + (idx_)] //---------------------------------------- -#if (RS_SWEEP_1_SIZE > 0) //---------------------------------------- #define RS_PREFIX_SWEEP1(idx_) smem.extent[RS_SMEM_PREFIX_OFFSET + RS_SWEEP_1_OFFSET + (idx_)] //---------------------------------------- -#endif -#if (RS_SWEEP_2_SIZE > 0) //---------------------------------------- #define RS_PREFIX_SWEEP2(idx_) smem.extent[RS_SMEM_PREFIX_OFFSET + RS_SWEEP_2_OFFSET + (idx_)] //---------------------------------------- -#endif -#endif +uint32_t +invocation_id() +{ + return RS_WORKGROUP_SUBGROUPS == 1 ? gl_SubgroupID : gl_LocalInvocationID.x; +} // // Define prefix load/store functions // // clang-format off -#if (RS_WORKGROUP_SUBGROUPS == 1) -#define RS_PREFIX_LOAD(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID + (idx_)] -#define RS_PREFIX_STORE(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID + (idx_)] -#else -#define RS_PREFIX_LOAD(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x + (idx_)] -#define RS_PREFIX_STORE(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x + (idx_)] -#endif +#define RS_PREFIX_LOAD(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + invocation_id() + (idx_)] +#define RS_PREFIX_STORE(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + invocation_id() + (idx_)] // clang-format on +layout(buffer_reference, std430) buffer buffer_rs_workgroup_id +{ + uint32_t x[RS_KEYVAL_DWORDS * 4]; +}; + +#define RS_IS_FIRST_LOCAL_INVOCATION() (RS_WORKGROUP_SUBGROUPS == 1 ? gl_SubgroupInvocationID == 0 : gl_LocalInvocationID.x == 0) + +RS_SUBGROUP_UNIFORM uint32_t rs_gl_workgroup_id_x; + +#define RS_GL_WORKGROUP_ID_X (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0 ? 
rs_gl_workgroup_id_x : gl_WorkGroupID.x) + // // Load the prefix function // @@ -383,45 +366,43 @@ shared rs_scatter_smem smem; void rs_histogram_zero() { -#if (RS_WORKGROUP_SUBGROUPS == 1) - - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - smem.extent[smem_offset + ii] = 0; - } + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - smem.extent[smem_offset + ii] = 0; - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - smem.histogram[smem_offset_final] = 0; + smem.extent[smem_offset + ii] = 0; } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { - smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x] = 0; + smem.extent[smem_offset + ii] = 0; } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - RS_BARRIER(); + if (smem_offset_final < RS_RADIX_SIZE) + { + smem.extent[smem_offset_final] = 0; + } + } + } + else + { + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x] = 0; + } + } + + rsBarrier(); } // @@ -450,11 +431,6 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], //---------------------------------------------------------------------- #ifdef RS_SCATTER_ENABLE_NV_MATCH - // - // 32 - // -#if (RS_SUBGROUP_SIZE == 32) - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { // @@ -470,13 +446,6 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); } - // - // Undefined! - // -#else -#error "Error: rs_histogram_rank() undefined for subgroup size" -#endif - //---------------------------------------------------------------------- // // Default is to emulate a `match` operation with ballots. @@ -484,79 +453,32 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], //---------------------------------------------------------------------- #elif !defined(RS_SCATTER_ENABLE_BROADCAST_MATCH) - // - // 64 - // -#if (RS_SUBGROUP_SIZE == 64) - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - u32vec2 match; + u32vec4 match; { - const bool is_one = RS_BIT_IS_ONE(digit, 0); - const u32vec2 ballot = subgroupBallot(is_one).xy; - const uint32_t mask = is_one ? 0 : 0xFFFFFFFF; + const bool is_one = RS_BIT_IS_ONE(digit, 0); + const u32vec4 ballot = subgroupBallot(is_one); + const u32vec4 mask = u32vec4(is_one ? 
0 : 0xFFFFFFFF); - match.x = (ballot.x ^ mask); - match.y = (ballot.y ^ mask); + match = ballot ^ mask; } [[unroll]] for (int32_t bit = 1; bit < RS_RADIX_LOG2; bit++) { - const bool is_one = RS_BIT_IS_ONE(digit, bit); - const u32vec2 ballot = subgroupBallot(is_one).xy; - const uint32_t mask = is_one ? 0 : 0xFFFFFFFF; + const bool is_one = RS_BIT_IS_ONE(digit, bit); + const u32vec4 ballot = subgroupBallot(is_one); + const u32vec4 mask = u32vec4(is_one ? 0 : 0xFFFFFFFF); - match.x &= (ballot.x ^ mask); - match.y &= (ballot.y ^ mask); + match &= ballot ^ mask; } - kr[ii] = ((bitCount(match.x) + bitCount(match.y)) << 16) | - (bitCount(match.x & gl_SubgroupLeMask.x) + // - bitCount(match.y & gl_SubgroupLeMask.y)); + kr[ii] = (subgroupBallotBitCount(match) << 16) | subgroupBallotInclusiveBitCount(match); } - // - // <= 32 - // -#elif ((RS_SUBGROUP_SIZE <= 32) && !defined(RS_SCATTER_ENABLE_NV_MATCH)) - - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) - { - const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - - uint32_t match; - - { - const bool is_one = RS_BIT_IS_ONE(digit, 0); - const uint32_t ballot = subgroupBallot(is_one).x; - const uint32_t mask = is_one ? 0 : RS_SUBGROUP_MASK; - - match = (ballot ^ mask); - } - - [[unroll]] for (int32_t bit = 1; bit < RS_RADIX_LOG2; bit++) - { - const bool is_one = RS_BIT_IS_ONE(digit, bit); - const uint32_t ballot = subgroupBallot(is_one).x; - const uint32_t mask = is_one ? 0 : RS_SUBGROUP_MASK; - - match &= (ballot ^ mask); - } - - kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); - } - - // - // Undefined! - // -#else -#error "Error: rs_histogram_rank() undefined for subgroup size" -#endif - //---------------------------------------------------------------------- // // Emulate a `match` operation with broadcasts. @@ -569,69 +491,58 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], // // 64 // -#if (RS_SUBGROUP_SIZE == 64) - - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) + if (RS_SUBGROUP_SIZE == 64) { - const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - - u32vec2 match; - - // subgroup invocation 0 + [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { - match[0] = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; - } + const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - // subgroup invocations 1-31 - [[unroll]] for (int32_t jj = 1; jj < 32; jj++) + u32vec2 match; + + // subgroup invocation 0 + { + match[0] = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; + } + + // subgroup invocations 1-31 + [[unroll]] for (int32_t jj = 1; jj < 32; jj++) + { + match[0] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; + } + + // subgroup invocation 32 + { + match[1] = (subgroupBroadcast(digit, 32) == digit) ? (1u << 0) : 0; + } + + // subgroup invocations 33-63 + [[unroll]] for (int32_t jj = 1; jj < 32; jj++) + { + match[1] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; + } + + kr[ii] = ((bitCount(match.x) + bitCount(match.y)) << 16) | + (bitCount(match.x & gl_SubgroupLeMask.x) + // + bitCount(match.y & gl_SubgroupLeMask.y)); + } + } else if (RS_SUBGROUP_SIZE <= 32) { + [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { - match[0] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; - } + const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - // subgroup invocation 32 - { - match[1] = (subgroupBroadcast(digit, 32) == digit) ? 
(1u << 0) : 0; - } + // subgroup invocation 0 + uint32_t match = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; - // subgroup invocations 33-63 - [[unroll]] for (int32_t jj = 1; jj < 32; jj++) - { - match[1] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; - } + // subgroup invocations 1-(RS_SUBGROUP_SIZE-1) + [[unroll]] for (int32_t jj = 1; jj < RS_SUBGROUP_SIZE; jj++) + { + match |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; + } - kr[ii] = ((bitCount(match.x) + bitCount(match.y)) << 16) | - (bitCount(match.x & gl_SubgroupLeMask.x) + // - bitCount(match.y & gl_SubgroupLeMask.y)); + kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); + } } - // - // <= 32 - // -#elif ((RS_SUBGROUP_SIZE <= 32) && !defined(RS_SCATTER_ENABLE_NV_MATCH)) - - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) - { - const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - - // subgroup invocation 0 - uint32_t match = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; - - // subgroup invocations 1-(RS_SUBGROUP_SIZE-1) - [[unroll]] for (int32_t jj = 1; jj < RS_SUBGROUP_SIZE; jj++) - { - match |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; - } - - kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); - } - - // - // Undefined! - // -#else -#error "Error: rs_histogram_rank() undefined for subgroup size" -#endif - #endif // @@ -660,7 +571,7 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], } } - RS_BARRIER(); + rsBarrier(); } } @@ -677,110 +588,103 @@ rs_first_prefix_store(restrict buffer_rs_partitions rs_partitions) // // Define the histogram reference // -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t hist_offset = gl_SubgroupInvocationID * 4; -#else - const uint32_t hist_offset = gl_LocalInvocationID.x * 4; -#endif + const uint32_t hist_offset = invocation_id() * 4; readonly RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histogram, rs_histogram, push.devaddr_histograms, hist_offset); -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - const uint32_t exc = rs_histogram.extent[ii]; - const uint32_t red = smem.extent[smem_offset_h + ii]; + const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; + const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - smem.extent[smem_offset_l + ii] = exc; - - const uint32_t inc = exc + red; - - atomicStore(rs_partitions.extent[ii], - inc | RS_PARTITION_MASK_PREFIX, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; - const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - const uint32_t exc = rs_histogram.extent[ii]; - const uint32_t red = smem.extent[smem_offset_h + ii]; - - smem.extent[smem_offset_l + ii] = exc; - - 
const uint32_t inc = exc + red; - - atomicStore(rs_partitions.extent[ii], - inc | RS_PARTITION_MASK_PREFIX, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final_h = smem_offset_h + RS_WORKGROUP_BASE_FINAL; - const uint32_t smem_offset_final_l = smem_offset_l + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - const uint32_t exc = rs_histogram.extent[RS_WORKGROUP_BASE_FINAL]; - const uint32_t red = smem.extent[smem_offset_final_h]; + const uint32_t exc = rs_histogram.extent[ii]; + const uint32_t red = smem.extent[smem_offset_h + ii]; - smem.extent[smem_offset_final_l] = exc; + smem.extent[smem_offset_l + ii] = exc; const uint32_t inc = exc + red; - atomicStore(rs_partitions.extent[RS_WORKGROUP_BASE_FINAL], + atomicStore(rs_partitions.extent[ii], inc | RS_PARTITION_MASK_PREFIX, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; + const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { - const uint32_t exc = rs_histogram.extent[0]; - const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + const uint32_t exc = rs_histogram.extent[ii]; + const uint32_t red = smem.extent[smem_offset_h + ii]; - smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + smem.extent[smem_offset_l + ii] = exc; const uint32_t inc = exc + red; - atomicStore(rs_partitions.extent[0], + atomicStore(rs_partitions.extent[ii], inc | RS_PARTITION_MASK_PREFIX, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final_h = smem_offset_h + RS_WORKGROUP_BASE_FINAL; + const uint32_t smem_offset_final_l = smem_offset_l + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final_h < RS_RADIX_SIZE) + { + const uint32_t exc = rs_histogram.extent[RS_WORKGROUP_BASE_FINAL]; + const uint32_t red = smem.extent[smem_offset_final_h]; + + smem.extent[smem_offset_final_l] = exc; + + const uint32_t inc = exc + red; + + atomicStore(rs_partitions.extent[RS_WORKGROUP_BASE_FINAL], + inc | RS_PARTITION_MASK_PREFIX, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } + } + else + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + const uint32_t exc = rs_histogram.extent[0]; + const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + + smem.extent[RS_SMEM_LOOKBACK_OFFSET + 
gl_LocalInvocationID.x] = exc; + + const uint32_t inc = exc + red; + + atomicStore(rs_partitions.extent[0], + inc | RS_PARTITION_MASK_PREFIX, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } } // @@ -790,76 +694,77 @@ void rs_reduction_store(restrict buffer_rs_partitions rs_partitions, RS_SUBGROUP_UNIFORM const uint32_t partition_base) { -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - const uint32_t red = smem.extent[smem_offset + ii]; + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SUBGROUPS == 1) + // + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - atomicStore(rs_partitions.extent[partition_base + ii], - red | RS_PARTITION_MASK_REDUCTION, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - const uint32_t red = smem.extent[smem_offset + ii]; - - atomicStore(rs_partitions.extent[partition_base + ii], - red | RS_PARTITION_MASK_REDUCTION, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - const uint32_t red = smem.extent[smem_offset_final]; + const uint32_t red = smem.extent[smem_offset + ii]; - atomicStore(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + atomicStore(rs_partitions.extent[partition_base + ii], red | RS_PARTITION_MASK_REDUCTION, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { - const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + const uint32_t red = smem.extent[smem_offset + ii]; - atomicStore(rs_partitions.extent[partition_base], + atomicStore(rs_partitions.extent[partition_base + ii], red | RS_PARTITION_MASK_REDUCTION, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t 
smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final < RS_RADIX_SIZE) + { + const uint32_t red = smem.extent[smem_offset_final]; + + atomicStore(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + red | RS_PARTITION_MASK_REDUCTION, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } + } + else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + + atomicStore(rs_partitions.extent[partition_base], + red | RS_PARTITION_MASK_REDUCTION, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } } // @@ -875,120 +780,15 @@ void rs_lookback_store(restrict buffer_rs_partitions rs_partitions, RS_SUBGROUP_UNIFORM const uint32_t partition_base) { -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - + //////////////////////////////////////////////////////////////////////////// // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. + // (RS_WORKGROUP_SUBGROUPS == 1) // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // - // Otherwise, save the exclusive scan and atomically transform - // the reduction into an inclusive prefix status math: - // - // reduction + 1 = prefix - // - smem.extent[smem_offset + ii] = exc; - - atomicAdd(rs_partitions.extent[partition_base + ii], - exc | (1 << 30), - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease); - break; - } - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - - // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. 
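The partition stores above are the publishing half of a decoupled-lookback scheme: each 32-bit slot packs a count plus a status, and the slot only becomes visible to other workgroups through a release (make-available) store, matched by acquire/make-visible loads on the consumer side. A CPU analogue using C11 atomics, with stand-in mask values since the real RS_PARTITION_MASK_* constants are defined elsewhere:

#include <stdatomic.h>
#include <stdint.h>

/* Stand-ins for RS_PARTITION_MASK_*: count in the low bits, status in the
 * top two bits (0 = invalid, 1 = reduction, 2 = prefix). */
#define MASK_COUNT     0x3fffffffu
#define MASK_REDUCTION (1u << 30)
#define MASK_PREFIX    (2u << 30)
#define MASK_STATUS    (MASK_REDUCTION | MASK_PREFIX)

/* Publish a partition's local reduction (or, for the first partition, its
 * inclusive prefix) so that later partitions can consume it. */
static void
publish(_Atomic uint32_t *slot, uint32_t count, uint32_t status)
{
   atomic_store_explicit(slot, count | status, memory_order_release);
}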
- // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); - - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // - // Otherwise, save the exclusive scan and atomically transform - // the reduction into an inclusive prefix status math: - // - // reduction + 1 = prefix - // - smem.extent[smem_offset + ii] = exc; - - atomicAdd(rs_partitions.extent[partition_base + ii], - exc | (1 << 30), - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease); - break; - } - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_SMEM_LOOKBACK_OFFSET + RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; uint32_t exc = 0; @@ -1002,7 +802,7 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + gl_SemanticsAcquire | gl_SemanticsMakeVisible); // spin until valid if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) @@ -1027,7 +827,7 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, // smem.extent[smem_offset + ii] = exc; - atomicAdd(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + atomicAdd(rs_partitions.extent[partition_base + ii], exc | (1 << 30), gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, @@ -1035,16 +835,16 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, break; } } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; uint32_t exc = 0; @@ -1055,10 +855,10 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, // while (true) { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + gl_SemanticsAcquire | gl_SemanticsMakeVisible); // spin until valid if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) @@ -1081,9 +881,9 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, // // reduction + 1 = prefix // - smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + smem.extent[smem_offset + ii] = exc; - atomicAdd(rs_partitions.extent[partition_base], + atomicAdd(rs_partitions.extent[partition_base + ii], 
exc | (1 << 30), gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, @@ -1092,7 +892,113 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, } } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final < RS_SMEM_LOOKBACK_OFFSET + RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. + // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + RS_WORKGROUP_BASE_FINAL], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // + // Otherwise, save the exclusive scan and atomically transform + // the reduction into an inclusive prefix status math: + // + // reduction + 1 = prefix + // + smem.extent[smem_offset + RS_WORKGROUP_BASE_FINAL] = exc; + + atomicAdd(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + exc | (1 << 30), + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease); + break; + } + } + } + } + else + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. 
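The `exc | (1 << 30)` atomicAdd above is the "reduction + 1 = prefix" trick: a single atomic both folds the exclusive prefix into the count and bumps the status from REDUCTION to PREFIX. A tiny self-contained check, again with illustrative bit assignments (count in bits 0..29, status in bits 30..31):

#include <assert.h>
#include <stdint.h>

#define COUNT(x)  ((x) & 0x3fffffffu)
#define STATUS(x) ((x) >> 30) /* 0 invalid, 1 reduction, 2 prefix */

int
main(void)
{
   uint32_t slot = 17u | (1u << 30); /* local reduction 17, status REDUCTION */
   const uint32_t exc = 5;           /* exclusive prefix from the lookback   */

   slot += exc | (1u << 30);         /* what the atomicAdd performs          */

   assert(COUNT(slot) == 22);        /* count is now the inclusive prefix */
   assert(STATUS(slot) == 2);        /* status upgraded to PREFIX         */

   return 0;
}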
+ // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // + // Otherwise, save the exclusive scan and atomically transform + // the reduction into an inclusive prefix status math: + // + // reduction + 1 = prefix + // + smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + + atomicAdd(rs_partitions.extent[partition_base], + exc | (1 << 30), + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease); + break; + } + } + } } // @@ -1105,98 +1011,15 @@ void rs_lookback_skip_store(restrict buffer_rs_partitions rs_partitions, RS_SUBGROUP_UNIFORM const uint32_t partition_base) { -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - + //////////////////////////////////////////////////////////////////////////// // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. + // (RS_WORKGROUP_SUBGROUPS == 1) // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // Otherwise, save the exclusive scan. - smem.extent[smem_offset + ii] = exc; - break; - } - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - - // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. 
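Putting the consuming side together: each invocation walks backwards through earlier partitions for its digit, spins until a slot is valid, sums reductions, and stops at the first inclusive prefix. A scalar sketch of that walk (a model of the control flow, not the shader itself; mask values are the same illustrative stand-ins as before):

#include <stdatomic.h>
#include <stdint.h>

#define MASK_COUNT     0x3fffffffu /* stand-ins for RS_PARTITION_MASK_* */
#define MASK_REDUCTION (1u << 30)
#define MASK_PREFIX    (2u << 30)
#define MASK_STATUS    (MASK_REDUCTION | MASK_PREFIX)
#define RADIX          256u

/* Accumulate the exclusive prefix for one digit by walking earlier
 * partitions until an inclusive prefix is found. */
static uint32_t
lookback(const _Atomic uint32_t *partitions, uint32_t partition, uint32_t digit)
{
   uint32_t exc = 0;

   while (partition-- > 0) {
      uint32_t prev;

      /* Spin until the producer has published something valid. */
      do {
         prev = atomic_load_explicit(&partitions[partition * RADIX + digit],
                                     memory_order_acquire);
      } while ((prev & MASK_STATUS) == 0);

      exc += prev & MASK_COUNT;

      if ((prev & MASK_STATUS) == MASK_PREFIX)
         break; /* inclusive prefix reached: nothing further to add */

      /* Otherwise it was only a reduction: keep walking backwards. */
   }

   return exc;
}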
- // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); - - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // Otherwise, save the exclusive scan. - smem.extent[smem_offset + ii] = exc; - break; - } - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; uint32_t exc = 0; @@ -1207,56 +1030,10 @@ rs_lookback_skip_store(restrict buffer_rs_partitions rs_partitions, // while (true) { - const uint32_t prev = - atomicLoad(rs_partitions.extent[partition_base_prev + RS_WORKGROUP_BASE_FINAL], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); - - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // Otherwise, save the exclusive scan. - smem.extent[smem_offset_final] = exc; - break; - } - } -#endif - -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif - { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - - // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. - // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + gl_SemanticsAcquire | gl_SemanticsMakeVisible); // spin until valid if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) @@ -1274,12 +1051,142 @@ rs_lookback_skip_store(restrict buffer_rs_partitions rs_partitions, } // Otherwise, save the exclusive scan. - smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + smem.extent[smem_offset + ii] = exc; + break; + } + } + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; + + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. 
+ // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // Otherwise, save the exclusive scan. + smem.extent[smem_offset + ii] = exc; break; } } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final < RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. + // + while (true) + { + const uint32_t prev = + atomicLoad(rs_partitions.extent[partition_base_prev + RS_WORKGROUP_BASE_FINAL], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // Otherwise, save the exclusive scan. + smem.extent[smem_offset_final] = exc; + break; + } + } + } + } + else + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. + // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // Otherwise, save the exclusive scan. + smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + break; + } + } + } } // @@ -1302,7 +1209,7 @@ rs_rank_to_local(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], // // Reordering phase will overwrite histogram span. 
// - RS_BARRIER(); + rsBarrier(); } // @@ -1333,13 +1240,7 @@ rs_rank_to_global(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], void rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_SCATTER_BLOCK_ROWS]) { - // clang-format off -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_SubgroupInvocationID; -#else - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_LocalInvocationID.x; -#endif - // clang-format on + const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + invocation_id(); [[unroll]] for (uint32_t ii = 0; ii < RS_KEYVAL_DWORDS; ii++) { @@ -1353,7 +1254,7 @@ rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_ smem.extent[smem_idx] = RS_KV_DWORD(kv[jj], ii); } - RS_BARRIER(); + rsBarrier(); // // Load keyval dword from sorted location @@ -1363,7 +1264,7 @@ rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_ RS_KV_DWORD(kv[jj], ii) = smem.extent[smem_base + jj * RS_WORKGROUP_SIZE]; } - RS_BARRIER(); + rsBarrier(); } // @@ -1376,7 +1277,7 @@ rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_ smem.extent[smem_idx] = uint32_t(kr[ii]); } - RS_BARRIER(); + rsBarrier(); // // Load kr[] from sorted location -- we only need the rank. @@ -1395,13 +1296,7 @@ void rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_SCATTER_BLOCK_ROWS]) { - // clang-format off -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_SubgroupInvocationID; -#else - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_LocalInvocationID.x; -#endif - // clang-format on + const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + invocation_id(); [[unroll]] for (uint32_t ii = 0; ii < RS_KEYVAL_DWORDS; ii++) { @@ -1415,7 +1310,7 @@ rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], smem.extent[smem_idx] = RS_KV_DWORD(kv[jj], ii); } - RS_BARRIER(); + rsBarrier(); // // Load keyval dword from sorted location @@ -1425,7 +1320,7 @@ rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], RS_KV_DWORD(kv[jj], ii) = smem.extent[smem_base + jj * RS_WORKGROUP_SIZE]; } - RS_BARRIER(); + rsBarrier(); } // @@ -1438,7 +1333,7 @@ rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], smem.extent[smem_idx] = uint32_t(kr[ii]); } - RS_BARRIER(); + rsBarrier(); // // Load kr[] from sorted location -- we only need the rank. @@ -1459,7 +1354,7 @@ rs_load(out RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS]) // // Set up buffer reference // - const uint32_t kv_in_offset_keys = gl_WorkGroupID.x * RS_BLOCK_KEYVALS + + const uint32_t kv_in_offset_keys = RS_GL_WORKGROUP_ID_X * RS_BLOCK_KEYVALS + gl_SubgroupID * RS_SUBGROUP_KEYVALS + gl_SubgroupInvocationID; u32vec2 kv_in_offset; @@ -1530,6 +1425,58 @@ rs_store(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], const uint32_t kr[RS_SC void main() { + // + // If this is a nonsequential dispatch device then acquire a virtual + // workgroup id. + // + // This is only run once and is a special compile-time-enabled case + // so we leverage the existing `push.devaddr_partitions` address + // instead of altering the push constant structure definition. 
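The rs_reorder()/rs_reorder_1() helpers above amount to a scatter-by-rank pass through shared memory so the block is locally sorted before the global scatter. Modelled on the CPU it is just this (illustrative only; rank is assumed to be a permutation of 0..n-1):

#include <stdint.h>
#include <string.h>

/* Scatter every keyval to the slot named by its local rank, then read the
 * block back contiguously in sorted order. */
static void
reorder_by_rank(uint32_t *kv, const uint32_t *rank, uint32_t n, uint32_t *tmp)
{
   for (uint32_t i = 0; i < n; i++)
      tmp[rank[i]] = kv[i];               /* scatter to the sorted position */

   memcpy(kv, tmp, n * sizeof(uint32_t)); /* gather back in order */
}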
+ // + if (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0) + { + if (RS_IS_FIRST_LOCAL_INVOCATION()) + { + // The "internal" memory map looks like this: + // + // +---------------------------------+ <-- 0 + // | histograms[keyval_size] | + // +---------------------------------+ <-- keyval_size * histo_size + // | partitions[scatter_blocks_ru-1] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + // | workgroup_ids[keyval_size] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + workgroup_ids_size + // + // Extended multiply to avoid 4GB overflow + // + u32vec2 workgroup_id_offset; + + umulExtended((gl_NumWorkGroups.x - 1), // virtual workgroup ids follow partitions[] + 4 * RS_RADIX_SIZE, // sizeof(uint32_t) * 256 + workgroup_id_offset.y, // msb + workgroup_id_offset.x); // lsb + + RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_workgroup_id, + rs_workgroup_id, + push.devaddr_partitions, + workgroup_id_offset); + + const uint32_t x_idx = RS_SCATTER_KEYVAL_DWORD_BASE * 4 + (push.pass_offset / RS_RADIX_LOG2); + + smem.extent[0] = atomicAdd(rs_workgroup_id.x[x_idx], + 1, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease); + } + + rsBarrier(); + + rs_gl_workgroup_id_x = smem.extent[0]; + + rsBarrier(); + } + // // Load keyvals // @@ -1568,7 +1515,7 @@ main() [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { - rs_kv_out.extent[gl_WorkGroupID.x * RS_BLOCK_KEYVALS + ii * RS_WORKGROUP_SIZE] = kr[ii]; + rs_kv_out.extent[RS_GL_WORKGROUP_ID_X * RS_BLOCK_KEYVALS + ii * RS_WORKGROUP_SIZE] = kr[ii]; } return; @@ -1594,11 +1541,7 @@ main() // // Define partitions bufref // -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t partition_offset = gl_SubgroupInvocationID * 4; -#else - const uint32_t partition_offset = gl_LocalInvocationID.x * 4; -#endif + const uint32_t partition_offset = invocation_id() * 4; RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_partitions, rs_partitions, @@ -1608,7 +1551,7 @@ main() // // The first partition is a special case. // - if (gl_WorkGroupID.x == 0) + if (RS_GL_WORKGROUP_ID_X == 0) { // // Other workgroups may lookback on this partition. @@ -1623,12 +1566,12 @@ main() // // Otherwise, this is not the first workgroup. // - RS_SUBGROUP_UNIFORM const uint32_t partition_base = gl_WorkGroupID.x * RS_RADIX_SIZE; + RS_SUBGROUP_UNIFORM const uint32_t partition_base = RS_GL_WORKGROUP_ID_X * RS_RADIX_SIZE; // // The last partition is a special case. // - if (gl_WorkGroupID.x + 1 < gl_NumWorkGroups.x) + if (RS_GL_WORKGROUP_ID_X + 1 < gl_NumWorkGroups.x) { // // Atomically store the reduction to the global partition. @@ -1667,7 +1610,7 @@ main() // // Barrier before reading prefix scanned histogram. // - RS_BARRIER(); + rsBarrier(); // // Convert keyval's rank to a local index @@ -1686,7 +1629,7 @@ main() // // Wait for lookback to complete. 
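On the host side the same "internal" layout has to be sized and zeroed, which is what the enlarged fill region at the top of this patch accounts for. A sketch of the offsets with illustrative names (histo_size is 256 dwords, matching the memory-map comment above; the real driver code may differ):

#include <stdint.h>

/* Byte offsets into the internal buffer: histograms, then lookback
 * partitions, then one virtual-workgroup-id counter per scatter pass. */
struct rs_internal_layout {
   uint64_t histograms;
   uint64_t partitions;
   uint64_t workgroup_ids;
   uint64_t total;
};

static struct rs_internal_layout
rs_internal_layout_init(uint32_t keyval_dwords, uint32_t scatter_blocks_ru)
{
   const uint64_t histo_size = 256u * sizeof(uint32_t); /* RS_RADIX_SIZE dwords */
   const uint32_t keyval_size = keyval_dwords * 4;      /* one pass per byte    */

   struct rs_internal_layout l;

   l.histograms = 0;
   l.partitions = l.histograms + keyval_size * histo_size;
   l.workgroup_ids = l.partitions + (uint64_t)(scatter_blocks_ru - 1) * histo_size;
   l.total = l.workgroup_ids + keyval_size * sizeof(uint32_t);

   return l;
}

Placing the per-pass counters directly after partitions[] is what allows the shader above to reach them from push.devaddr_partitions with a single extended multiply instead of a new push-constant field.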
// - RS_BARRIER(); + rsBarrier(); #endif // diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_0_even.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_0_even.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_0_even.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_0_even.comp diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_0_odd.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_0_odd.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_0_odd.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_0_odd.comp diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_1_even.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_1_even.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_1_even.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_1_even.comp diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_1_odd.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_1_odd.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_1_odd.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_1_odd.comp diff --git a/src/amd/vulkan/radix_sort/target.h b/src/vulkan/runtime/radix_sort/target.h similarity index 94% rename from src/amd/vulkan/radix_sort/target.h rename to src/vulkan/runtime/radix_sort/target.h index 2164389757d..1ddac0ccc8e 100644 --- a/src/amd/vulkan/radix_sort/target.h +++ b/src/vulkan/runtime/radix_sort/target.h @@ -27,6 +27,7 @@ struct radix_sort_vk_target_config struct { uint32_t workgroup_size_log2; + uint32_t block_rows; } fill; struct @@ -48,6 +49,8 @@ struct radix_sort_vk_target_config uint32_t subgroup_size_log2; uint32_t block_rows; } scatter; + + bool nonsequential_dispatch; }; // diff --git a/src/vulkan/runtime/vk_acceleration_structure.c b/src/vulkan/runtime/vk_acceleration_structure.c index 074b94ea85c..ccea927f559 100644 --- a/src/vulkan/runtime/vk_acceleration_structure.c +++ b/src/vulkan/runtime/vk_acceleration_structure.c @@ -27,7 +27,41 @@ #include "vk_alloc.h" #include "vk_common_entrypoints.h" #include "vk_device.h" +#include "vk_command_buffer.h" #include "vk_log.h" +#include "vk_meta.h" + +#include "bvh/vk_build_interface.h" +#include "bvh/vk_bvh.h" + +#include "radix_sort/common/vk/barrier.h" +#include "radix_sort/shaders/push.h" + +#include "util/u_string.h" + +static const uint32_t leaf_spv[] = { +#include "bvh/leaf.spv.h" +}; + +static const uint32_t leaf_always_active_spv[] = { +#include "bvh/leaf_always_active.spv.h" +}; + +static const uint32_t morton_spv[] = { +#include "bvh/morton.spv.h" +}; + +static const uint32_t lbvh_main_spv[] = { +#include "bvh/lbvh_main.spv.h" +}; + +static const uint32_t lbvh_generate_ir_spv[] = { +#include "bvh/lbvh_generate_ir.spv.h" +}; + +static const uint32_t ploc_spv[] = { +#include "bvh/ploc_internal.spv.h" +}; VkDeviceAddress vk_acceleration_structure_get_va(struct vk_acceleration_structure *accel_struct) @@ -92,3 +126,1122 @@ vk_common_GetAccelerationStructureDeviceAddressKHR( VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfo->accelerationStructure); return vk_acceleration_structure_get_va(accel_struct); } + +#define KEY_ID_PAIR_SIZE 8 +#define MORTON_BIT_SIZE 24 + +enum internal_build_type { + INTERNAL_BUILD_TYPE_LBVH, + INTERNAL_BUILD_TYPE_PLOC, + INTERNAL_BUILD_TYPE_UPDATE, +}; + +struct build_config { + enum internal_build_type internal_type; + bool updateable; + uint32_t encode_key[MAX_ENCODE_PASSES]; +}; + +struct scratch_layout { + uint32_t size; 
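For reference, the two radix_sort_vk_target_config additions above (fill.block_rows and the nonsequential_dispatch toggle) are filled in by the driver that instantiates the sort. A hedged, partial example: the numeric values are placeholders, and members not visible in this diff (e.g. the histogram block) are omitted:

/* Partial example only: values are invented and fields this diff does not
 * show are left zero-initialized -- consult target.h for the full struct.
 */
static const struct radix_sort_vk_target_config drv_rs_target_config = {
   .keyval_dwords = 2,            /* 8-byte key/id pairs */

   .fill = {
      .workgroup_size_log2 = 7,   /* 128 invocations */
      .block_rows = 8,            /* new field: block rows per fill workgroup */
   },

   .scatter = {
      .workgroup_size_log2 = 7,
      .subgroup_size_log2 = 6,
      .block_rows = 14,
   },

   /* New field: let scatter workgroups self-assign virtual workgroup ids
    * instead of relying on the launch order of gl_WorkGroupID.x.
    */
   .nonsequential_dispatch = true,
};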
+ uint32_t update_size; + + uint32_t header_offset; + + /* Used for BUILD only. */ + + uint32_t sort_buffer_offset[2]; + uint32_t sort_internal_offset; + + uint32_t ploc_prefix_sum_partition_offset; + uint32_t lbvh_node_offset; + + uint32_t ir_offset; + uint32_t internal_node_offset; +}; + +static struct build_config +build_config(uint32_t leaf_count, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const struct vk_acceleration_structure_build_ops *ops) +{ + struct build_config config = {0}; + + if (leaf_count <= 4) + config.internal_type = INTERNAL_BUILD_TYPE_LBVH; + else if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) + config.internal_type = INTERNAL_BUILD_TYPE_PLOC; + else if (!(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_BUILD_BIT_KHR) && + !(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR)) + config.internal_type = INTERNAL_BUILD_TYPE_PLOC; + else + config.internal_type = INTERNAL_BUILD_TYPE_LBVH; + + if (build_info->mode == VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR && + build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && + ops->update_as[0]) + config.internal_type = INTERNAL_BUILD_TYPE_UPDATE; + + if ((build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR) && + build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && + ops->update_as[0]) + config.updateable = true; + + for (unsigned i = 0; i < ARRAY_SIZE(config.encode_key); i++) { + if (!ops->get_encode_key[i]) + break; + config.encode_key[i] = ops->get_encode_key[i](leaf_count, build_info->flags); + } + + return config; +} + +static void +get_scratch_layout(struct vk_device *device, + uint32_t leaf_count, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const struct vk_acceleration_structure_build_args *args, + struct scratch_layout *scratch) +{ + uint32_t internal_count = MAX2(leaf_count, 2) - 1; + + radix_sort_vk_memory_requirements_t requirements = { + 0, + }; + radix_sort_vk_get_memory_requirements(args->radix_sort, leaf_count, + &requirements); + + uint32_t ir_leaf_size; + switch (vk_get_as_geometry_type(build_info)) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + ir_leaf_size = sizeof(struct vk_ir_triangle_node); + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + ir_leaf_size = sizeof(struct vk_ir_aabb_node); + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + ir_leaf_size = sizeof(struct vk_ir_instance_node); + break; + default: + unreachable("Unknown VkGeometryTypeKHR"); + } + + + uint32_t offset = 0; + + uint32_t ploc_scratch_space = 0; + uint32_t lbvh_node_space = 0; + + struct build_config config = build_config(leaf_count, build_info, + device->as_build_ops); + + if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) + ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition); + else + lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count; + + scratch->header_offset = offset; + offset += sizeof(struct vk_ir_header); + + scratch->sort_buffer_offset[0] = offset; + offset += requirements.keyvals_size; + + scratch->sort_buffer_offset[1] = offset; + offset += requirements.keyvals_size; + + scratch->sort_internal_offset = offset; + /* Internal sorting data is not needed when PLOC/LBVH are invoked, + * save space by aliasing them */ + scratch->ploc_prefix_sum_partition_offset = offset; + scratch->lbvh_node_offset = offset; + offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space); + + 
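The build-type selection in build_config() above condenses to a small heuristic; restated as a standalone sketch for clarity. The helper name and boolean parameters are illustrative, and the UPDATE override for BLAS updates with a driver-provided update_as[0] hook sits on top of this exactly as in the function:

/* Illustrative restatement of build_config()'s internal-type heuristic. */
#include <stdbool.h>
#include <stdint.h>

enum sketch_build_type { SKETCH_LBVH, SKETCH_PLOC };

static enum sketch_build_type
choose_internal_type(uint32_t leaf_count, bool is_tlas,
                     bool prefer_fast_build, bool allow_update)
{
   /* Tiny builds always take the LBVH path. */
   if (leaf_count <= 4)
      return SKETCH_LBVH;

   /* Top levels always get the higher-quality PLOC hierarchy. */
   if (is_tlas)
      return SKETCH_PLOC;

   /* Bottom levels: PLOC unless the app asked for fast builds or updates,
    * where the cheaper LBVH build wins.
    */
   if (!prefer_fast_build && !allow_update)
      return SKETCH_PLOC;

   return SKETCH_LBVH;
}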
scratch->ir_offset = offset; + offset += ir_leaf_size * leaf_count; + + scratch->internal_node_offset = offset; + offset += sizeof(struct vk_ir_box_node) * internal_count; + + scratch->size = offset; + + if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && + device->as_build_ops->update_as[0]) { + scratch->update_size = + device->as_build_ops->get_update_scratch_size(device, leaf_count); + } else { + scratch->update_size = offset; + } +} + +struct bvh_state { + uint32_t scratch_offset; + + uint32_t leaf_node_count; + uint32_t internal_node_count; + uint32_t leaf_node_size; + + struct scratch_layout scratch; + struct build_config config; + + /* Radix sort state */ + uint32_t scatter_blocks; + uint32_t count_ru_scatter; + uint32_t histo_blocks; + uint32_t count_ru_histo; + struct rs_push_scatter push_scatter; + + uint32_t last_encode_pass; +}; + +struct bvh_batch_state { + bool any_updateable; + bool any_non_updateable; + bool any_ploc; + bool any_lbvh; + bool any_update; +}; + +static VkResult +get_pipeline_spv(struct vk_device *device, struct vk_meta_device *meta, + const char *name, const uint32_t *spv, uint32_t spv_size, + unsigned push_constant_size, + const struct vk_acceleration_structure_build_args *args, + VkPipeline *pipeline, VkPipelineLayout *layout) +{ + size_t key_size = strlen(name); + + VkResult result = vk_meta_get_pipeline_layout( + device, meta, NULL, + &(VkPushConstantRange){ + VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constant_size + }, + name, key_size, layout); + + if (result != VK_SUCCESS) + return result; + + VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(meta, name, key_size); + if (pipeline_from_cache != VK_NULL_HANDLE) { + *pipeline = pipeline_from_cache; + return VK_SUCCESS; + } + + VkShaderModuleCreateInfo module_info = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = NULL, + .flags = 0, + .codeSize = spv_size, + .pCode = spv, + }; + + VkSpecializationMapEntry spec_map[2] = { + { + .constantID = SUBGROUP_SIZE_ID, + .offset = 0, + .size = sizeof(args->subgroup_size), + }, + { + .constantID = BVH_BOUNDS_OFFSET_ID, + .offset = sizeof(args->subgroup_size), + .size = sizeof(args->bvh_bounds_offset), + }, + }; + + uint32_t spec_constants[2] = { + args->subgroup_size, + args->bvh_bounds_offset + }; + + VkSpecializationInfo spec_info = { + .mapEntryCount = ARRAY_SIZE(spec_map), + .pMapEntries = spec_map, + .dataSize = sizeof(spec_constants), + .pData = spec_constants, + }; + + VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT rssci = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = &module_info, + .requiredSubgroupSize = args->subgroup_size, + }; + + VkPipelineShaderStageCreateInfo shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &rssci, + .flags = VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + .pSpecializationInfo = &spec_info, + }; + + VkComputePipelineCreateInfo pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = shader_stage, + .flags = 0, + .layout = *layout, + }; + + return vk_meta_create_compute_pipeline(device, meta, &pipeline_info, + name, key_size, pipeline); +} + +static uint32_t +pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) +{ + uint32_t geometry_id_and_flags = geometry_id; + if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) + geometry_id_and_flags |= VK_GEOMETRY_OPAQUE; + + return 
geometry_id_and_flags; +} + +struct vk_bvh_geometry_data +vk_fill_geometry_data(VkAccelerationStructureTypeKHR type, uint32_t first_id, uint32_t geom_index, + const VkAccelerationStructureGeometryKHR *geometry, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info) +{ + struct vk_bvh_geometry_data data = { + .first_id = first_id, + .geometry_id = pack_geometry_id_and_flags(geom_index, geometry->flags), + .geometry_type = geometry->geometryType, + }; + + switch (geometry->geometryType) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); + + data.data = geometry->geometry.triangles.vertexData.deviceAddress + + build_range_info->firstVertex * geometry->geometry.triangles.vertexStride; + data.indices = geometry->geometry.triangles.indexData.deviceAddress; + + if (geometry->geometry.triangles.indexType == VK_INDEX_TYPE_NONE_KHR) + data.data += build_range_info->primitiveOffset; + else + data.indices += build_range_info->primitiveOffset; + + data.transform = geometry->geometry.triangles.transformData.deviceAddress; + if (data.transform) + data.transform += build_range_info->transformOffset; + + data.stride = geometry->geometry.triangles.vertexStride; + data.vertex_format = geometry->geometry.triangles.vertexFormat; + data.index_format = geometry->geometry.triangles.indexType; + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); + + data.data = geometry->geometry.aabbs.data.deviceAddress + build_range_info->primitiveOffset; + data.stride = geometry->geometry.aabbs.stride; + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + assert(type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR); + + data.data = geometry->geometry.instances.data.deviceAddress + build_range_info->primitiveOffset; + + if (geometry->geometry.instances.arrayOfPointers) + data.stride = 8; + else + data.stride = sizeof(VkAccelerationStructureInstanceKHR); + break; + default: + unreachable("Unknown geometryType"); + } + + return data; +} + +static void +vk_cmd_begin_debug_marker(VkCommandBuffer commandBuffer, const char *format, ...) +{ + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer); + struct vk_device *device = cmd_buffer->base.device; + + va_list ap; + va_start(ap, format); + + char *name; + if (vasprintf(&name, format, ap) == -1) + return; + + va_end(ap); + + VkDebugMarkerMarkerInfoEXT marker = { + .sType = VK_STRUCTURE_TYPE_DEBUG_MARKER_MARKER_INFO_EXT, + .pMarkerName = name, + }; + + device->dispatch_table.CmdDebugMarkerBeginEXT(commandBuffer, &marker); +} + +static void +vk_cmd_end_debug_marker(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer); + struct vk_device *device = cmd_buffer->base.device; + + device->dispatch_table.CmdDebugMarkerEndEXT(commandBuffer); +} + +static VkResult +build_leaves(VkCommandBuffer commandBuffer, + struct vk_device *device, struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + struct bvh_state *bvh_states, + bool updateable) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + + /* Many apps are broken and will make inactive primitives active when + * updating, even though this is disallowed by the spec. 
To handle this, + * we use a different variant for updateable acceleration structures when + * the driver implements an update pass. This passes through inactive leaf + * nodes as if they were active, with an empty bounding box. It's then the + * driver or HW's responsibility to filter out inactive nodes. + */ + VkResult result; + if (updateable) { + result = get_pipeline_spv(device, meta, "leaves_always_active", + leaf_always_active_spv, + sizeof(leaf_always_active_spv), + sizeof(struct leaf_args), args, &pipeline, &layout); + } else { + result = get_pipeline_spv(device, meta, "leaves", leaf_spv, sizeof(leaf_spv), + sizeof(struct leaf_args), args, &pipeline, &layout); + } + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "build_leaves"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + if (bvh_states[i].config.updateable != updateable) + continue; + + struct leaf_args leaf_consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], + }; + + for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { + const VkAccelerationStructureGeometryKHR *geom = + pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; + + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; + + leaf_consts.geom_data = vk_fill_geometry_data(pInfos[i].type, bvh_states[i].leaf_node_count, j, geom, build_range_info); + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(leaf_consts), &leaf_consts); + device->cmd_dispatch_unaligned(commandBuffer, build_range_info->primitiveCount, 1, 1); + + bvh_states[i].leaf_node_count += build_range_info->primitiveCount; + } + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +static VkResult +morton_generate(VkCommandBuffer commandBuffer, struct vk_device *device, + struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + struct bvh_state *bvh_states) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + + VkResult result = + get_pipeline_spv(device, meta, "morton", morton_spv, sizeof(morton_spv), + sizeof(struct morton_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "morton_generate"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + const struct morton_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], + }; + + disp->CmdPushConstants(commandBuffer, 
layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + device->cmd_dispatch_unaligned(commandBuffer, bvh_states[i].leaf_node_count, 1, 1); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +static void +morton_sort(VkCommandBuffer commandBuffer, struct vk_device *device, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) +{ + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "morton_sort"); + + /* Copyright 2019 The Fuchsia Authors. */ + const radix_sort_vk_t *rs = args->radix_sort; + + /* + * OVERVIEW + * + * 1. Pad the keyvals in `scatter_even`. + * 2. Zero the `histograms` and `partitions`. + * --- BARRIER --- + * 3. HISTOGRAM is dispatched before PREFIX. + * --- BARRIER --- + * 4. PREFIX is dispatched before the first SCATTER. + * --- BARRIER --- + * 5. One or more SCATTER dispatches. + * + * Note that the `partitions` buffer can be zeroed anytime before the first + * scatter. + */ + + /* How many passes? */ + uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t); + uint32_t keyval_bits = keyval_bytes * 8; + uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits); + uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2; + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].leaf_node_count) + bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1]; + else + bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[0]; + } + + /* + * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS + * + * Pad fractional blocks with max-valued keyvals. + * + * Zero the histograms and partitions buffer. + * + * This assumes the partitions follow the histograms. + */ + + /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */ + + /* How many scatter blocks? */ + uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2; + uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows; + + /* + * How many histogram blocks? + * + * Note that it's OK to have more max-valued digits counted by the histogram + * than sorted by the scatters because the sort is stable. 
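To make the buffer bookkeeping above concrete: with MORTON_BIT_SIZE of 24 and (assuming) two keyval dwords for the 8-byte key/id pairs, the arithmetic works out as follows. Names and the keyval_dwords value are assumptions; the real values come from rs->config:

/* Worked example of the pass math used above and below. */
#include <assert.h>
#include <stdint.h>

static void
radix_pass_math_example(void)
{
   const uint32_t keyval_dwords = 2;                      /* assumed */
   const uint32_t keyval_bytes  = keyval_dwords * 4;      /* 8 */
   const uint32_t keyval_bits   = keyval_bytes * 8;       /* 64 */
   const uint32_t key_bits      = 24 < keyval_bits ? 24 : keyval_bits;
   const uint32_t passes        = (key_bits + 8 - 1) / 8; /* 3 8-bit passes */

   /* Scatters ping-pong even -> odd -> even ..., starting from the "even"
    * keyvals buffer, so the sorted result lands in
    * sort_buffer_offset[passes & 1].
    */
   assert(passes == 3 && (passes & 1) == 1);

   /* Only the high bytes that hold the Morton key get scattered; pass_idx
    * (computed just below) starts at the first byte that needs a pass.
    */
   assert(keyval_bytes - passes == 5);
}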
+ */ + uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2; + uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows; + + uint32_t pass_idx = (keyval_bytes - passes); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + bvh_states[i].scatter_blocks = (bvh_states[i].leaf_node_count + scatter_block_kvs - 1) / scatter_block_kvs; + bvh_states[i].count_ru_scatter = bvh_states[i].scatter_blocks * scatter_block_kvs; + + bvh_states[i].histo_blocks = (bvh_states[i].count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs; + bvh_states[i].count_ru_histo = bvh_states[i].histo_blocks * histo_block_kvs; + + /* Fill with max values */ + if (bvh_states[i].count_ru_histo > bvh_states[i].leaf_node_count) { + device->cmd_fill_buffer_addr(commandBuffer, keyvals_even_addr + + bvh_states[i].leaf_node_count * keyval_bytes, + (bvh_states[i].count_ru_histo - bvh_states[i].leaf_node_count) * keyval_bytes, + 0xFFFFFFFF); + } + + /* + * Zero histograms and invalidate partitions. + * + * Note that the partition invalidation only needs to be performed once + * because the even/odd scatter dispatches rely on the the previous pass to + * leave the partitions in an invalid state. + * + * Note that the last workgroup doesn't read/write a partition so it doesn't + * need to be initialized. + */ + uint32_t histo_partition_count = passes + bvh_states[i].scatter_blocks - 1; + + uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); + + device->cmd_fill_buffer_addr(commandBuffer, + internal_addr + rs->internal.histograms.offset + fill_base, + histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)) + keyval_bytes * sizeof(uint32_t), 0); + } + + /* + * Pipeline: HISTOGRAM + * + * TODO(allanmac): All subgroups should try to process approximately the same + * number of blocks in order to minimize tail effects. This was implemented + * and reverted but should be reimplemented and benchmarked later. + */ + vk_barrier_transfer_w_to_compute_r(commandBuffer); + + disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + rs->pipelines.named.histogram); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + /* Dispatch histogram */ + struct rs_push_histogram push_histogram = { + .devaddr_histograms = internal_addr + rs->internal.histograms.offset, + .devaddr_keyvals = keyvals_even_addr, + .passes = passes, + }; + + disp->CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push_histogram), &push_histogram); + + disp->CmdDispatch(commandBuffer, bvh_states[i].histo_blocks, 1, 1); + } + + /* + * Pipeline: PREFIX + * + * Launch one workgroup per pass. 
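The single cmd_fill_buffer_addr() above therefore clears three things at once: the histograms for the passes that will actually run, the lookback partitions, and (matching the memory map described in the scatter shader) the per-pass virtual workgroup-id counters that follow them. A C sketch of that extent; the helper name is hypothetical:

/* Offset/size of the zero fill, relative to rs->internal.histograms.offset.
 * RS_RADIX_SIZE is 256.
 */
#include <stdint.h>

#define RS_RADIX_SIZE 256u

static void
rs_fill_extent(uint32_t passes, uint32_t scatter_blocks, uint32_t keyval_bytes,
               uint64_t *offset_out, uint64_t *size_out)
{
   const uint64_t histo_size = RS_RADIX_SIZE * sizeof(uint32_t);

   /* Skip the histograms of keyval bytes that are never sorted. */
   *offset_out = (keyval_bytes - passes) * histo_size;

   /* `passes` histograms + `scatter_blocks - 1` partitions (the last
    * workgroup never touches a partition) + one uint32_t virtual
    * workgroup-id counter per keyval byte.
    */
   *size_out = (uint64_t)(passes + scatter_blocks - 1) * histo_size +
               keyval_bytes * sizeof(uint32_t);
}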
+ */ + vk_barrier_compute_w_to_compute_r(commandBuffer); + + disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + rs->pipelines.named.prefix); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + struct rs_push_prefix push_prefix = { + .devaddr_histograms = internal_addr + rs->internal.histograms.offset, + }; + + disp->CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push_prefix), &push_prefix); + + disp->CmdDispatch(commandBuffer, passes, 1, 1); + } + + /* Pipeline: SCATTER */ + vk_barrier_compute_w_to_compute_r(commandBuffer); + + uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); + + for (uint32_t i = 0; i < infoCount; i++) { + uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; + uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + bvh_states[i].push_scatter = (struct rs_push_scatter){ + .devaddr_keyvals_even = keyvals_even_addr, + .devaddr_keyvals_odd = keyvals_odd_addr, + .devaddr_partitions = internal_addr + rs->internal.partitions.offset, + .devaddr_histograms = internal_addr + rs->internal.histograms.offset + histogram_offset, + }; + } + + bool is_even = true; + + while (true) { + uint32_t pass_dword = pass_idx / 4; + + /* Bind new pipeline */ + VkPipeline p = + is_even ? rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; + disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); + + /* Update push constants that changed */ + VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even + : rs->pipeline_layouts.named.scatter[pass_dword].odd; + + for (uint32_t i = 0; i < infoCount; i++) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + bvh_states[i].push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; + + disp->CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct rs_push_scatter), + &bvh_states[i].push_scatter); + + disp->CmdDispatch(commandBuffer, bvh_states[i].scatter_blocks, 1, 1); + + bvh_states[i].push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); + } + + /* Continue? 
*/ + if (++pass_idx >= keyval_bytes) + break; + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + is_even ^= true; + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); +} + +static VkResult +lbvh_build_internal(VkCommandBuffer commandBuffer, + struct vk_device *device, struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + + VkResult result = + get_pipeline_spv(device, meta, "lbvh_main", lbvh_main_spv, + sizeof(lbvh_main_spv), + sizeof(struct lbvh_main_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "lbvh_build_internal"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) + continue; + + uint32_t src_scratch_offset = bvh_states[i].scratch_offset; + uint32_t internal_node_count = MAX2(bvh_states[i].leaf_node_count, 2) - 1; + + const struct lbvh_main_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .src_ids = pInfos[i].scratchData.deviceAddress + src_scratch_offset, + .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, + .id_count = bvh_states[i].leaf_node_count, + .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, + }; + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + device->cmd_dispatch_unaligned(commandBuffer, internal_node_count, 1, 1); + bvh_states[i].internal_node_count = internal_node_count; + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + result = + get_pipeline_spv(device, meta, "lbvh_generate_ir", lbvh_generate_ir_spv, + sizeof(lbvh_generate_ir_spv), + sizeof(struct lbvh_generate_ir_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) + continue; + + const struct lbvh_generate_ir_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, + }; + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + device->cmd_dispatch_unaligned(commandBuffer, bvh_states[i].internal_node_count, 1, 1); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +static VkResult +ploc_build_internal(VkCommandBuffer commandBuffer, + struct vk_device *device, struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + 
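Both LBVH dispatches above launch one thread per internal node. The count they use, MAX2(leaf_node_count, 2) - 1, is the "binary tree over N leaves has N - 1 internal nodes" rule, clamped so degenerate 0- or 1-leaf builds still produce a root; a quick check:

#include <assert.h>
#include <stdint.h>

static uint32_t
lbvh_internal_node_count(uint32_t leaf_count)
{
   /* MAX2(leaf_count, 2) - 1 */
   return (leaf_count > 2 ? leaf_count : 2) - 1;
}

static void
lbvh_internal_node_count_examples(void)
{
   assert(lbvh_internal_node_count(0) == 1);   /* degenerate: just a root */
   assert(lbvh_internal_node_count(1) == 1);
   assert(lbvh_internal_node_count(2) == 1);
   assert(lbvh_internal_node_count(100) == 99);
}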
+ VkResult result = + get_pipeline_spv(device, meta, "ploc", ploc_spv, + sizeof(ploc_spv), + sizeof(struct ploc_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "ploc_build_internal"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_PLOC) + continue; + + uint32_t src_scratch_offset = bvh_states[i].scratch_offset; + uint32_t dst_scratch_offset = (src_scratch_offset == bvh_states[i].scratch.sort_buffer_offset[0]) + ? bvh_states[i].scratch.sort_buffer_offset[1] + : bvh_states[i].scratch.sort_buffer_offset[0]; + + const struct ploc_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .ids_0 = pInfos[i].scratchData.deviceAddress + src_scratch_offset, + .ids_1 = pInfos[i].scratchData.deviceAddress + dst_scratch_offset, + .prefix_scan_partitions = + pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ploc_prefix_sum_partition_offset, + .internal_node_offset = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, + }; + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + disp->CmdDispatch(commandBuffer, MAX2(DIV_ROUND_UP(bvh_states[i].leaf_node_count, PLOC_WORKGROUP_SIZE), 1), 1, 1); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +void +vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer, + struct vk_device *device, + struct vk_meta_device *meta, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + const struct vk_acceleration_structure_build_args *args) +{ + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer); + const struct vk_acceleration_structure_build_ops *ops = device->as_build_ops; + + struct bvh_batch_state batch_state = {0}; + + struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state)); + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "vkCmdBuildAccelerationStructuresKHR(%u)", infoCount); + + for (uint32_t i = 0; i < infoCount; ++i) { + uint32_t leaf_node_count = 0; + for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { + leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; + } + + get_scratch_layout(device, leaf_node_count, pInfos + i, args, &bvh_states[i].scratch); + + struct build_config config = build_config(leaf_node_count, pInfos + i, + device->as_build_ops); + bvh_states[i].config = config; + + if (config.updateable) + batch_state.any_updateable = true; + else + batch_state.any_non_updateable = true; + + if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) { + batch_state.any_ploc = true; + } else if (config.internal_type == INTERNAL_BUILD_TYPE_LBVH) { + batch_state.any_lbvh = true; + } else if (config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) { + batch_state.any_update = true; + } else { + unreachable("Unknown internal_build_type"); + } + + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) { + /* The internal node count is updated in lbvh_build_internal for LBVH + * and from the PLOC 
shader for PLOC. */ + struct vk_ir_header header = { + .min_bounds = {0x7fffffff, 0x7fffffff, 0x7fffffff}, + .max_bounds = {0x80000000, 0x80000000, 0x80000000}, + .dispatch_size_y = 1, + .dispatch_size_z = 1, + .sync_data = + { + .current_phase_end_counter = TASK_INDEX_INVALID, + /* Will be updated by the first PLOC shader invocation */ + .task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID}, + }, + }; + + device->write_buffer_cp(commandBuffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + &header, sizeof(header)); + } else { + VK_FROM_HANDLE(vk_acceleration_structure, src_as, pInfos[i].srcAccelerationStructure); + VK_FROM_HANDLE(vk_acceleration_structure, dst_as, pInfos[i].dstAccelerationStructure); + + ops->init_update_scratch(commandBuffer, pInfos[i].scratchData.deviceAddress, + leaf_node_count, src_as, dst_as); + } + } + + /* Wait for the write_buffer_cp to land before using in compute shaders */ + device->flush_buffer_write_cp(commandBuffer); + device->dispatch_table.CmdPipelineBarrier(commandBuffer, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, /* dependencyFlags */ + 1, + &(VkMemoryBarrier) { + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, + }, 0, NULL, 0, NULL); + + if (batch_state.any_lbvh || batch_state.any_ploc) { + VkResult result; + + if (batch_state.any_non_updateable) { + result = + build_leaves(commandBuffer, device, meta, args, infoCount, pInfos, + ppBuildRangeInfos, bvh_states, false); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + if (batch_state.any_updateable) { + result = + build_leaves(commandBuffer, device, meta, args, infoCount, pInfos, + ppBuildRangeInfos, bvh_states, true); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + result = + morton_generate(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + morton_sort(commandBuffer, device, args, infoCount, pInfos, bvh_states); + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + if (batch_state.any_lbvh) { + result = + lbvh_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + if (batch_state.any_ploc) { + result = + ploc_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states); + + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + vk_barrier_compute_w_to_indirect_compute_r(commandBuffer); + } + + for (unsigned pass = 0; pass < ARRAY_SIZE(ops->encode_as); pass++) { + if (!ops->encode_as[pass] && !ops->update_as[pass]) + break; + + bool progress; + do { + progress = false; + + bool update; + uint32_t encode_key; + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].last_encode_pass == pass + 1) + continue; + + if (!progress) { + update = (bvh_states[i].config.internal_type == + INTERNAL_BUILD_TYPE_UPDATE); + if (update && !ops->update_as[pass]) + continue; + if (!update && !ops->encode_as[pass]) + continue; + encode_key = 
bvh_states[i].config.encode_key[pass]; + progress = true; + if (update) + ops->update_bind_pipeline[pass](commandBuffer); + else + ops->encode_bind_pipeline[pass](commandBuffer, encode_key); + } else { + if (update != (bvh_states[i].config.internal_type == + INTERNAL_BUILD_TYPE_UPDATE) || + encode_key != bvh_states[i].config.encode_key[pass]) + continue; + } + + VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); + + if (update) { + VK_FROM_HANDLE(vk_acceleration_structure, src, pInfos[i].srcAccelerationStructure); + ops->update_as[pass](commandBuffer, + &pInfos[i], + ppBuildRangeInfos[i], + bvh_states[i].leaf_node_count, + src, + accel_struct); + + } else { + ops->encode_as[pass](commandBuffer, + &pInfos[i], + ppBuildRangeInfos[i], + pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + bvh_states[i].leaf_node_count, + encode_key, + accel_struct); + } + + bvh_states[i].last_encode_pass = pass + 1; + } + } while (progress); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + free(bvh_states); +} + +void +vk_get_as_build_sizes(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + const uint32_t *pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo, + const struct vk_acceleration_structure_build_args *args) +{ + VK_FROM_HANDLE(vk_device, device, _device); + + uint32_t leaf_count = 0; + for (uint32_t i = 0; i < pBuildInfo->geometryCount; i++) + leaf_count += pMaxPrimitiveCounts[i]; + + struct scratch_layout scratch; + + get_scratch_layout(device, leaf_count, pBuildInfo, args, &scratch); + + pSizeInfo->accelerationStructureSize = + device->as_build_ops->get_as_size(_device, pBuildInfo, leaf_count); + pSizeInfo->updateScratchSize = scratch.update_size; + pSizeInfo->buildScratchSize = scratch.size; +} + +/* Return true if the common framework supports using this format for loading + * vertices. Must match the formats handled by load_vertices() on the GPU. 
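A driver can forward its vkGetAccelerationStructureBuildSizesKHR entry point straight to vk_get_as_build_sizes() above. A hedged sketch: the "drv_" names, the device fields and where the radix_sort_vk_t instance lives are all assumptions, not part of this patch:

VKAPI_ATTR void VKAPI_CALL
drv_GetAccelerationStructureBuildSizesKHR(
   VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType,
   const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,
   const uint32_t *pMaxPrimitiveCounts,
   VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo)
{
   VK_FROM_HANDLE(drv_device, device, _device);        /* assumed handle type */

   const struct vk_acceleration_structure_build_args args = {
      .subgroup_size = device->compute_subgroup_size,  /* assumed field */
      .bvh_bounds_offset = DRV_BVH_BOUNDS_OFFSET,      /* assumed constant */
      .emit_markers = false,
      .radix_sort = device->radix_sort,                /* assumed field */
   };

   vk_get_as_build_sizes(_device, buildType, pBuildInfo, pMaxPrimitiveCounts,
                         pSizeInfo, &args);
}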
+ */ +bool +vk_acceleration_struct_vtx_format_supported(VkFormat format) +{ + switch (format) { + case VK_FORMAT_R32G32_SFLOAT: + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + case VK_FORMAT_R16G16_SFLOAT: + case VK_FORMAT_R16G16B16_SFLOAT: + case VK_FORMAT_R16G16B16A16_SFLOAT: + case VK_FORMAT_R16G16_SNORM: + case VK_FORMAT_R16G16_UNORM: + case VK_FORMAT_R16G16B16A16_SNORM: + case VK_FORMAT_R16G16B16A16_UNORM: + case VK_FORMAT_R8G8_SNORM: + case VK_FORMAT_R8G8_UNORM: + case VK_FORMAT_R8G8B8A8_SNORM: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + return true; + default: + return false; + } +} + diff --git a/src/vulkan/runtime/vk_acceleration_structure.h b/src/vulkan/runtime/vk_acceleration_structure.h index bcc2eff4660..b34d177cbfe 100644 --- a/src/vulkan/runtime/vk_acceleration_structure.h +++ b/src/vulkan/runtime/vk_acceleration_structure.h @@ -26,6 +26,11 @@ #define VK_ACCELERATION_STRUCTURE_H #include "vk_object.h" +#include "radix_sort/radix_sort_vk.h" + +#ifdef __cplusplus +extern "C" { +#endif struct vk_acceleration_structure { struct vk_object_base base; @@ -40,4 +45,88 @@ VkDeviceAddress vk_acceleration_structure_get_va(struct vk_acceleration_structur VK_DEFINE_NONDISP_HANDLE_CASTS(vk_acceleration_structure, base, VkAccelerationStructureKHR, VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR) +#define MAX_ENCODE_PASSES 2 +#define MAX_UPDATE_PASSES 2 + +struct vk_acceleration_structure_build_ops { + VkDeviceSize (*get_as_size)(VkDevice device, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + uint32_t leaf_count); + VkDeviceSize (*get_update_scratch_size)(struct vk_device *device, uint32_t leaf_count); + uint32_t (*get_encode_key[MAX_ENCODE_PASSES])(VkAccelerationStructureTypeKHR type, + VkBuildAccelerationStructureFlagBitsKHR flags); + VkResult (*encode_bind_pipeline[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer, + uint32_t key); + void (*encode_as[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, + uint32_t leaf_count, + uint32_t key, + struct vk_acceleration_structure *dst); + void (*init_update_scratch)(VkCommandBuffer cmd_buffer, + VkDeviceAddress scratch, + uint32_t leaf_count, + struct vk_acceleration_structure *src_as, + struct vk_acceleration_structure *dst_as); + void (*update_bind_pipeline[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer); + void (*update_as[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + uint32_t leaf_count, + struct vk_acceleration_structure *dst, + struct vk_acceleration_structure *src); +}; + +struct vk_acceleration_structure_build_args { + uint32_t subgroup_size; + uint32_t bvh_bounds_offset; + bool emit_markers; + const radix_sort_vk_t *radix_sort; +}; + +struct vk_meta_device; + +void vk_cmd_build_acceleration_structures(VkCommandBuffer cmdbuf, + struct vk_device *device, + struct vk_meta_device *meta, + uint32_t info_count, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + const struct vk_acceleration_structure_build_args *args); + +void vk_get_as_build_sizes(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, + const 
VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + const uint32_t *pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo, + const struct vk_acceleration_structure_build_args *args); + +bool vk_acceleration_struct_vtx_format_supported(VkFormat format); + +static inline VkGeometryTypeKHR +vk_get_as_geometry_type(const VkAccelerationStructureBuildGeometryInfoKHR *build_info) +{ + if (build_info->geometryCount) { + if (build_info->pGeometries) + return build_info->pGeometries[0].geometryType; + else + return build_info->ppGeometries[0]->geometryType; + } + + /* If there are no geometries, the geometry type shouldn't matter, but + * return something. + */ + return VK_GEOMETRY_TYPE_TRIANGLES_KHR; +} + +struct vk_bvh_geometry_data +vk_fill_geometry_data(VkAccelerationStructureTypeKHR type, uint32_t first_id, uint32_t geom_index, + const VkAccelerationStructureGeometryKHR *geometry, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/vulkan/runtime/vk_device.h b/src/vulkan/runtime/vk_device.h index 4d7220f832f..83d41afab0c 100644 --- a/src/vulkan/runtime/vk_device.h +++ b/src/vulkan/runtime/vk_device.h @@ -37,6 +37,7 @@ extern "C" { #endif +struct vk_acceleration_structure_build_ops; struct vk_command_buffer_ops; struct vk_device_shader_ops; struct vk_sync; @@ -134,6 +135,9 @@ struct vk_device { /** Shader vtable for VK_EXT_shader_object and common pipelines */ const struct vk_device_shader_ops *shader_ops; + /** Acceleration structure build vtable for common BVH building. */ + const struct vk_acceleration_structure_build_ops *as_build_ops; + /** * Write data to a buffer from the command processor. This is simpler than * setting up a staging buffer and faster for small writes, but is not
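Putting the pieces of this patch together, a driver opts into the common builder by filling vk_device::as_build_ops (added just above) with its encode callbacks and forwarding the build command to vk_cmd_build_acceleration_structures(). A hedged sketch: every "drv_" symbol, the meta-device member and the args values are assumptions, and the vk_device hooks the common code calls (cmd_dispatch_unaligned, write_buffer_cp, flush_buffer_write_cp, cmd_fill_buffer_addr) must also be provided by the driver:

/* Driver callbacks -- signatures per vk_acceleration_structure_build_ops. */
static const struct vk_acceleration_structure_build_ops drv_as_build_ops = {
   .get_as_size = drv_get_as_size,
   .get_update_scratch_size = drv_get_update_scratch_size,
   .get_encode_key = { drv_get_encode_key },
   .encode_bind_pipeline = { drv_encode_bind_pipeline },
   .encode_as = { drv_encode_as },
};

VKAPI_ATTR void VKAPI_CALL
drv_CmdBuildAccelerationStructuresKHR(
   VkCommandBuffer commandBuffer, uint32_t infoCount,
   const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
   const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos)
{
   VK_FROM_HANDLE(drv_cmd_buffer, cmd, commandBuffer);
   struct drv_device *device = drv_cmd_buffer_device(cmd); /* assumed helper */

   /* device->vk.as_build_ops = &drv_as_build_ops; is set at device creation. */

   const struct vk_acceleration_structure_build_args args = {
      .subgroup_size = device->compute_subgroup_size,
      .bvh_bounds_offset = DRV_BVH_BOUNDS_OFFSET,
      .emit_markers = device->trace_markers,
      .radix_sort = device->radix_sort,
   };

   vk_cmd_build_acceleration_structures(commandBuffer, &device->vk,
                                        &device->meta, infoCount, pInfos,
                                        ppBuildRangeInfos, &args);
}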