From f8b584d6a5f7e64b82cdb80debdc4411947ad08e Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 5 Mar 2024 06:26:46 -0500 Subject: [PATCH] vulkan/runtime,radv: Add shared BVH building framework This is mostly adapted from radv's BVH building. This defines a common "IR" for BVH trees, two algorithms for constructing it, and a callback that the driver implements for encoding. The framework takes care of parallelizing the different passes, so the driver just has to split the encoding process into "stages" and implement just one part for each stage. The runtime changes are: Reviewed-by: Sagar Ghuge Reviewed-by: Konstantin Seurer The radv changes are; Reviewed-by: Friedrich Vock Part-of: --- src/amd/vulkan/bvh/build_helpers.h | 503 +----- src/amd/vulkan/bvh/build_interface.h | 67 +- src/amd/vulkan/bvh/bvh.h | 89 +- src/amd/vulkan/bvh/encode.comp | 124 +- src/amd/vulkan/bvh/leaf.comp | 99 -- src/amd/vulkan/bvh/meson.build | 35 +- src/amd/vulkan/bvh/update.comp | 14 +- src/amd/vulkan/meson.build | 3 - src/amd/vulkan/radix_sort/meson.build | 21 - src/amd/vulkan/radix_sort/radv_radix_sort.c | 196 --- src/amd/vulkan/radix_sort/radv_radix_sort.h | 14 - src/amd/vulkan/radix_sort/shaders/meson.build | 40 - src/amd/vulkan/radix_sort/shaders/prefix.h | 353 ---- .../vulkan/radix_sort/targets/u64/config.h | 34 - src/amd/vulkan/radv_acceleration_structure.c | 1466 +++++------------ src/amd/vulkan/radv_device.h | 16 +- src/amd/vulkan/radv_rra.c | 8 +- .../runtime}/bvh/lbvh_generate_ir.comp | 55 +- .../runtime}/bvh/lbvh_main.comp | 31 +- src/vulkan/runtime/bvh/leaf.comp | 250 +++ src/vulkan/runtime/bvh/meson.build | 81 + .../vulkan => vulkan/runtime}/bvh/morton.comp | 29 +- .../runtime}/bvh/ploc_internal.comp | 116 +- src/vulkan/runtime/bvh/vk_build_helpers.h | 522 ++++++ src/vulkan/runtime/bvh/vk_build_interface.h | 103 ++ src/vulkan/runtime/bvh/vk_bvh.h | 156 ++ src/vulkan/runtime/meson.build | 12 +- .../runtime}/radix_sort/LICENSE | 0 .../runtime}/radix_sort/common/macros.h | 0 .../runtime}/radix_sort/common/util.c | 0 .../runtime}/radix_sort/common/util.h | 0 .../runtime}/radix_sort/common/vk/barrier.c | 62 +- .../runtime}/radix_sort/common/vk/barrier.h | 0 src/vulkan/runtime/radix_sort/meson.build | 37 + .../runtime/radix_sort/radix_sort_u64.c | 59 + .../runtime/radix_sort/radix_sort_u64.h | 24 + .../runtime}/radix_sort/radix_sort_vk.c | 239 ++- .../runtime}/radix_sort/radix_sort_vk.h | 0 .../radix_sort/radix_sort_vk_devaddr.h | 0 .../runtime}/radix_sort/radix_sort_vk_ext.h | 0 .../runtime}/radix_sort/shaders/bufref.h | 0 .../runtime/radix_sort/shaders/config.h | 33 + .../runtime}/radix_sort/shaders/fill.comp | 16 +- .../radix_sort/shaders/histogram.comp | 166 +- .../runtime}/radix_sort/shaders/init.comp | 28 +- .../runtime/radix_sort/shaders/meson.build | 53 + .../runtime}/radix_sort/shaders/prefix.comp | 58 +- .../runtime/radix_sort/shaders/prefix.h | 356 ++++ .../radix_sort/shaders/prefix_limits.h | 20 +- .../runtime}/radix_sort/shaders/push.h | 0 .../runtime}/radix_sort/shaders/scatter.glsl | 1193 +++++++------- .../radix_sort/shaders/scatter_0_even.comp | 0 .../radix_sort/shaders/scatter_0_odd.comp | 0 .../radix_sort/shaders/scatter_1_even.comp | 0 .../radix_sort/shaders/scatter_1_odd.comp | 0 .../runtime}/radix_sort/target.h | 3 + .../runtime/vk_acceleration_structure.c | 1153 +++++++++++++ .../runtime/vk_acceleration_structure.h | 89 + src/vulkan/runtime/vk_device.h | 4 + 59 files changed, 4500 insertions(+), 3530 deletions(-) delete mode 100644 src/amd/vulkan/bvh/leaf.comp delete mode 
100644 src/amd/vulkan/radix_sort/meson.build delete mode 100644 src/amd/vulkan/radix_sort/radv_radix_sort.c delete mode 100644 src/amd/vulkan/radix_sort/radv_radix_sort.h delete mode 100644 src/amd/vulkan/radix_sort/shaders/meson.build delete mode 100644 src/amd/vulkan/radix_sort/shaders/prefix.h delete mode 100644 src/amd/vulkan/radix_sort/targets/u64/config.h rename src/{amd/vulkan => vulkan/runtime}/bvh/lbvh_generate_ir.comp (58%) rename src/{amd/vulkan => vulkan/runtime}/bvh/lbvh_main.comp (76%) create mode 100644 src/vulkan/runtime/bvh/leaf.comp create mode 100644 src/vulkan/runtime/bvh/meson.build rename src/{amd/vulkan => vulkan/runtime}/bvh/morton.comp (62%) rename src/{amd/vulkan => vulkan/runtime}/bvh/ploc_internal.comp (76%) create mode 100644 src/vulkan/runtime/bvh/vk_build_helpers.h create mode 100644 src/vulkan/runtime/bvh/vk_build_interface.h create mode 100644 src/vulkan/runtime/bvh/vk_bvh.h rename src/{amd/vulkan => vulkan/runtime}/radix_sort/LICENSE (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/common/macros.h (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/common/util.c (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/common/util.h (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/common/vk/barrier.c (81%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/common/vk/barrier.h (100%) create mode 100644 src/vulkan/runtime/radix_sort/meson.build create mode 100644 src/vulkan/runtime/radix_sort/radix_sort_u64.c create mode 100644 src/vulkan/runtime/radix_sort/radix_sort_u64.h rename src/{amd/vulkan => vulkan/runtime}/radix_sort/radix_sort_vk.c (83%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/radix_sort_vk.h (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/radix_sort_vk_devaddr.h (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/radix_sort_vk_ext.h (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/bufref.h (100%) create mode 100644 src/vulkan/runtime/radix_sort/shaders/config.h rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/fill.comp (89%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/histogram.comp (78%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/init.comp (76%) create mode 100644 src/vulkan/runtime/radix_sort/shaders/meson.build rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/prefix.comp (69%) create mode 100644 src/vulkan/runtime/radix_sort/shaders/prefix.h rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/prefix_limits.h (71%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/push.h (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/scatter.glsl (58%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/scatter_0_even.comp (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/scatter_0_odd.comp (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/scatter_1_even.comp (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/shaders/scatter_1_odd.comp (100%) rename src/{amd/vulkan => vulkan/runtime}/radix_sort/target.h (94%) diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h index 30b3224b2d6..3014a827e61 100644 --- a/src/amd/vulkan/bvh/build_helpers.h +++ b/src/amd/vulkan/bvh/build_helpers.h @@ -8,210 +8,7 @@ #define BVH_BUILD_HELPERS_H #include "bvh.h" - -#define VK_FORMAT_UNDEFINED 0 -#define VK_FORMAT_R4G4_UNORM_PACK8 1 -#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2 -#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3 
-#define VK_FORMAT_R5G6B5_UNORM_PACK16 4 -#define VK_FORMAT_B5G6R5_UNORM_PACK16 5 -#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6 -#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7 -#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8 -#define VK_FORMAT_R8_UNORM 9 -#define VK_FORMAT_R8_SNORM 10 -#define VK_FORMAT_R8_USCALED 11 -#define VK_FORMAT_R8_SSCALED 12 -#define VK_FORMAT_R8_UINT 13 -#define VK_FORMAT_R8_SINT 14 -#define VK_FORMAT_R8_SRGB 15 -#define VK_FORMAT_R8G8_UNORM 16 -#define VK_FORMAT_R8G8_SNORM 17 -#define VK_FORMAT_R8G8_USCALED 18 -#define VK_FORMAT_R8G8_SSCALED 19 -#define VK_FORMAT_R8G8_UINT 20 -#define VK_FORMAT_R8G8_SINT 21 -#define VK_FORMAT_R8G8_SRGB 22 -#define VK_FORMAT_R8G8B8_UNORM 23 -#define VK_FORMAT_R8G8B8_SNORM 24 -#define VK_FORMAT_R8G8B8_USCALED 25 -#define VK_FORMAT_R8G8B8_SSCALED 26 -#define VK_FORMAT_R8G8B8_UINT 27 -#define VK_FORMAT_R8G8B8_SINT 28 -#define VK_FORMAT_R8G8B8_SRGB 29 -#define VK_FORMAT_B8G8R8_UNORM 30 -#define VK_FORMAT_B8G8R8_SNORM 31 -#define VK_FORMAT_B8G8R8_USCALED 32 -#define VK_FORMAT_B8G8R8_SSCALED 33 -#define VK_FORMAT_B8G8R8_UINT 34 -#define VK_FORMAT_B8G8R8_SINT 35 -#define VK_FORMAT_B8G8R8_SRGB 36 -#define VK_FORMAT_R8G8B8A8_UNORM 37 -#define VK_FORMAT_R8G8B8A8_SNORM 38 -#define VK_FORMAT_R8G8B8A8_USCALED 39 -#define VK_FORMAT_R8G8B8A8_SSCALED 40 -#define VK_FORMAT_R8G8B8A8_UINT 41 -#define VK_FORMAT_R8G8B8A8_SINT 42 -#define VK_FORMAT_R8G8B8A8_SRGB 43 -#define VK_FORMAT_B8G8R8A8_UNORM 44 -#define VK_FORMAT_B8G8R8A8_SNORM 45 -#define VK_FORMAT_B8G8R8A8_USCALED 46 -#define VK_FORMAT_B8G8R8A8_SSCALED 47 -#define VK_FORMAT_B8G8R8A8_UINT 48 -#define VK_FORMAT_B8G8R8A8_SINT 49 -#define VK_FORMAT_B8G8R8A8_SRGB 50 -#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51 -#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52 -#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53 -#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54 -#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55 -#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56 -#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57 -#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58 -#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59 -#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60 -#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61 -#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62 -#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63 -#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64 -#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65 -#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66 -#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67 -#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68 -#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69 -#define VK_FORMAT_R16_UNORM 70 -#define VK_FORMAT_R16_SNORM 71 -#define VK_FORMAT_R16_USCALED 72 -#define VK_FORMAT_R16_SSCALED 73 -#define VK_FORMAT_R16_UINT 74 -#define VK_FORMAT_R16_SINT 75 -#define VK_FORMAT_R16_SFLOAT 76 -#define VK_FORMAT_R16G16_UNORM 77 -#define VK_FORMAT_R16G16_SNORM 78 -#define VK_FORMAT_R16G16_USCALED 79 -#define VK_FORMAT_R16G16_SSCALED 80 -#define VK_FORMAT_R16G16_UINT 81 -#define VK_FORMAT_R16G16_SINT 82 -#define VK_FORMAT_R16G16_SFLOAT 83 -#define VK_FORMAT_R16G16B16_UNORM 84 -#define VK_FORMAT_R16G16B16_SNORM 85 -#define VK_FORMAT_R16G16B16_USCALED 86 -#define VK_FORMAT_R16G16B16_SSCALED 87 -#define VK_FORMAT_R16G16B16_UINT 88 -#define VK_FORMAT_R16G16B16_SINT 89 -#define VK_FORMAT_R16G16B16_SFLOAT 90 -#define VK_FORMAT_R16G16B16A16_UNORM 91 -#define VK_FORMAT_R16G16B16A16_SNORM 92 -#define VK_FORMAT_R16G16B16A16_USCALED 93 -#define VK_FORMAT_R16G16B16A16_SSCALED 94 -#define VK_FORMAT_R16G16B16A16_UINT 95 -#define VK_FORMAT_R16G16B16A16_SINT 96 -#define 
VK_FORMAT_R16G16B16A16_SFLOAT 97 -#define VK_FORMAT_R32_UINT 98 -#define VK_FORMAT_R32_SINT 99 -#define VK_FORMAT_R32_SFLOAT 100 -#define VK_FORMAT_R32G32_UINT 101 -#define VK_FORMAT_R32G32_SINT 102 -#define VK_FORMAT_R32G32_SFLOAT 103 -#define VK_FORMAT_R32G32B32_UINT 104 -#define VK_FORMAT_R32G32B32_SINT 105 -#define VK_FORMAT_R32G32B32_SFLOAT 106 -#define VK_FORMAT_R32G32B32A32_UINT 107 -#define VK_FORMAT_R32G32B32A32_SINT 108 -#define VK_FORMAT_R32G32B32A32_SFLOAT 109 -#define VK_FORMAT_R64_UINT 110 -#define VK_FORMAT_R64_SINT 111 -#define VK_FORMAT_R64_SFLOAT 112 -#define VK_FORMAT_R64G64_UINT 113 -#define VK_FORMAT_R64G64_SINT 114 -#define VK_FORMAT_R64G64_SFLOAT 115 -#define VK_FORMAT_R64G64B64_UINT 116 -#define VK_FORMAT_R64G64B64_SINT 117 -#define VK_FORMAT_R64G64B64_SFLOAT 118 -#define VK_FORMAT_R64G64B64A64_UINT 119 -#define VK_FORMAT_R64G64B64A64_SINT 120 -#define VK_FORMAT_R64G64B64A64_SFLOAT 121 - -#define VK_INDEX_TYPE_UINT16 0 -#define VK_INDEX_TYPE_UINT32 1 -#define VK_INDEX_TYPE_NONE_KHR 1000165000 -#define VK_INDEX_TYPE_UINT8_EXT 1000265000 - -#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0 -#define VK_GEOMETRY_TYPE_AABBS_KHR 1 -#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2 - -#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1 -#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2 -#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4 -#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8 - -#define TYPE(type, align) \ - layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref \ - { \ - type value; \ - }; - -#define REF(type) type##_ref -#define VOID_REF uint64_t -#define NULL 0 -#define DEREF(var) var.value - -#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1)) - -#define OFFSET(ptr, offset) (uint64_t(ptr) + offset) - -#define INFINITY (1.0 / 0.0) -#define NAN (0.0 / 0.0) - -#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type))) - -TYPE(int8_t, 1); -TYPE(uint8_t, 1); -TYPE(int16_t, 2); -TYPE(uint16_t, 2); -TYPE(int32_t, 4); -TYPE(uint32_t, 4); -TYPE(int64_t, 8); -TYPE(uint64_t, 8); - -TYPE(float, 4); - -TYPE(vec2, 4); -TYPE(vec3, 4); -TYPE(vec4, 4); - -TYPE(uvec4, 16); - -TYPE(VOID_REF, 8); - -/* copied from u_math.h */ -uint32_t -align(uint32_t value, uint32_t alignment) -{ - return (value + alignment - 1) & ~(alignment - 1); -} - -int32_t -to_emulated_float(float f) -{ - int32_t bits = floatBitsToInt(f); - return f < 0 ? -2147483648 - bits : bits; -} - -float -from_emulated_float(int32_t bits) -{ - return intBitsToFloat(bits < 0 ? 
-2147483648 - bits : bits); -} - -TYPE(radv_aabb, 4); - -struct key_id_pair { - uint32_t id; - uint32_t key; -}; -TYPE(key_id_pair, 4); +#include "vk_build_helpers.h" TYPE(radv_accel_struct_serialization_header, 8); TYPE(radv_accel_struct_header, 8); @@ -221,12 +18,6 @@ TYPE(radv_bvh_instance_node, 8); TYPE(radv_bvh_box16_node, 4); TYPE(radv_bvh_box32_node, 4); -TYPE(radv_ir_header, 4); -TYPE(radv_ir_node, 4); -TYPE(radv_ir_box_node, 4); - -TYPE(radv_global_sync_data, 4); - uint32_t id_to_offset(uint32_t id) { @@ -259,178 +50,23 @@ addr_to_node(uint64_t addr) return (addr >> 3) & ((1ul << 45) - 1); } -uint32_t -ir_id_to_offset(uint32_t id) -{ - return id & (~3u); -} - -uint32_t -ir_id_to_type(uint32_t id) -{ - return id & 3u; -} - -uint32_t -pack_ir_node_id(uint32_t offset, uint32_t type) -{ - return offset | type; -} - uint32_t ir_type_to_bvh_type(uint32_t type) { switch (type) { - case radv_ir_node_triangle: + case vk_ir_node_triangle: return radv_bvh_node_triangle; - case radv_ir_node_internal: + case vk_ir_node_internal: return radv_bvh_node_box32; - case radv_ir_node_instance: + case vk_ir_node_instance: return radv_bvh_node_instance; - case radv_ir_node_aabb: + case vk_ir_node_aabb: return radv_bvh_node_aabb; } /* unreachable in valid nodes */ return RADV_BVH_INVALID_NODE; } -float -aabb_surface_area(radv_aabb aabb) -{ - vec3 diagonal = aabb.max - aabb.min; - return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z; -} - -/* Just a wrapper for 3 uints. */ -struct triangle_indices { - uint32_t index[3]; -}; - -triangle_indices -load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id) -{ - triangle_indices result; - - uint32_t index_base = global_id * 3; - - switch (index_format) { - case VK_INDEX_TYPE_UINT16: { - result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0)); - result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1)); - result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2)); - break; - } - case VK_INDEX_TYPE_UINT32: { - result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0)); - result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1)); - result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2)); - break; - } - case VK_INDEX_TYPE_NONE_KHR: { - result.index[0] = index_base + 0; - result.index[1] = index_base + 1; - result.index[2] = index_base + 2; - break; - } - case VK_INDEX_TYPE_UINT8_EXT: { - result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0)); - result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1)); - result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2)); - break; - } - } - - return result; -} - -/* Just a wrapper for 3 vec4s. 
*/ -struct triangle_vertices { - vec4 vertex[3]; -}; - -TYPE(float16_t, 2); - -triangle_vertices -load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride) -{ - triangle_vertices result; - - for (uint32_t i = 0; i < 3; i++) { - VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride); - vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0); - - switch (vertex_format) { - case VK_FORMAT_R32G32_SFLOAT: - vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); - break; - case VK_FORMAT_R32G32B32_SFLOAT: - case VK_FORMAT_R32G32B32A32_SFLOAT: - vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); - vertex.z = DEREF(INDEX(float, vertex_ptr, 2)); - break; - case VK_FORMAT_R16G16_SFLOAT: - vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); - break; - case VK_FORMAT_R16G16B16_SFLOAT: - case VK_FORMAT_R16G16B16A16_SFLOAT: - vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); - vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2)); - break; - case VK_FORMAT_R16G16_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); - vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); - break; - case VK_FORMAT_R16G16B16A16_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); - vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); - vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF)); - break; - case VK_FORMAT_R8G8_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); - vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); - break; - case VK_FORMAT_R8G8B8A8_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); - vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); - vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F)); - break; - case VK_FORMAT_R16G16_UNORM: - vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); - vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); - break; - case VK_FORMAT_R16G16B16A16_UNORM: - vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); - vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); - vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF); - break; - case VK_FORMAT_R8G8_UNORM: - vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); - vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); - break; - case VK_FORMAT_R8G8B8A8_UNORM: - vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); - vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); - vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF); - break; - case VK_FORMAT_A2B10G10R10_UNORM_PACK32: { - uint32_t data = DEREF(REF(uint32_t)(vertex_ptr)); - vertex.x = float(data & 0x3FF) / 0x3FF; - vertex.y = float((data >> 10) & 0x3FF) / 0x3FF; - vertex.z = float((data >> 20) & 0x3FF) / 0x3FF; - break; - } - } - - result.vertex[i] = vertex; - } - - return result; -} - /* A GLSL-adapted copy of VkAccelerationStructureInstanceKHR. 
*/ struct AccelerationStructureInstance { mat3x4 transform; @@ -441,7 +77,7 @@ struct AccelerationStructureInstance { TYPE(AccelerationStructureInstance, 8); bool -build_triangle(inout radv_aabb bounds, VOID_REF dst_ptr, radv_bvh_geometry_data geom_data, uint32_t global_id) +build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id) { bool is_valid = true; triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id); @@ -490,7 +126,7 @@ build_triangle(inout radv_aabb bounds, VOID_REF dst_ptr, radv_bvh_geometry_data } bool -build_aabb(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) +build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) { bool is_valid = true; REF(radv_bvh_aabb_node) node = REF(radv_bvh_aabb_node)(dst_ptr); @@ -521,10 +157,10 @@ build_aabb(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t return is_valid; } -radv_aabb +vk_aabb calculate_instance_node_bounds(radv_accel_struct_header header, mat3x4 otw_matrix) { - radv_aabb aabb; + vk_aabb aabb; for (uint32_t comp = 0; comp < 3; ++comp) { aabb.min[comp] = otw_matrix[comp][3]; aabb.max[comp] = otw_matrix[comp][3]; @@ -555,7 +191,7 @@ encode_sbt_offset_and_flags(uint32_t src) } bool -build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id) +build_instance(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id) { REF(radv_bvh_instance_node) node = REF(radv_bvh_instance_node)(dst_ptr); @@ -591,123 +227,4 @@ build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint3 From macros.h */ #define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B)) -#ifdef USE_GLOBAL_SYNC - -/* There might be more invocations available than tasks to do. - * In that case, the fetched task index is greater than the - * counter offset for the next phase. To avoid out-of-bounds - * accessing, phases will be skipped until the task index is - * is in-bounds again. */ -uint32_t num_tasks_to_skip = 0; -uint32_t phase_index = 0; -bool should_skip = false; -shared uint32_t global_task_index; - -shared uint32_t shared_phase_index; - -uint32_t -task_count(REF(radv_ir_header) header) -{ - uint32_t phase_index = DEREF(header).sync_data.phase_index; - return DEREF(header).sync_data.task_counts[phase_index & 1]; -} - -/* Sets the task count for the next phase. */ -void -set_next_task_count(REF(radv_ir_header) header, uint32_t new_count) -{ - uint32_t phase_index = DEREF(header).sync_data.phase_index; - DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count; -} - -/* - * This function has two main objectives: - * Firstly, it partitions pending work among free invocations. - * Secondly, it guarantees global synchronization between different phases. - * - * After every call to fetch_task, a new task index is returned. - * fetch_task will also set num_tasks_to_skip. Use should_execute_phase - * to determine if the current phase should be executed or skipped. - * - * Since tasks are assigned per-workgroup, there is a possibility of the task index being - * greater than the total task count. - */ -uint32_t -fetch_task(REF(radv_ir_header) header, bool did_work) -{ - /* Perform a memory + control barrier for all buffer writes for the entire workgroup. 
- * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished - * and their results are written to memory. */ - controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - if (gl_LocalInvocationIndex == 0) { - if (did_work) - atomicAdd(DEREF(header).sync_data.task_done_counter, 1); - global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1); - - do { - /* Perform a memory barrier to refresh the current phase's end counter, in case - * another workgroup changed it. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - - /* The first invocation of the first workgroup in a new phase is responsible to initiate the - * switch to a new phase. It is only possible to switch to a new phase if all tasks of the - * previous phase have been completed. Switching to a new phase and incrementing the phase - * end counter in turn notifies all invocations for that phase that it is safe to execute. - */ - if (global_task_index == DEREF(header).sync_data.current_phase_end_counter && - DEREF(header).sync_data.task_done_counter == DEREF(header).sync_data.current_phase_end_counter) { - if (DEREF(header).sync_data.next_phase_exit_flag != 0) { - DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID; - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - } else { - atomicAdd(DEREF(header).sync_data.phase_index, 1); - DEREF(header).sync_data.current_phase_start_counter = DEREF(header).sync_data.current_phase_end_counter; - /* Ensure the changes to the phase index and start/end counter are visible for other - * workgroup waiting in the loop. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - atomicAdd(DEREF(header).sync_data.current_phase_end_counter, - DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x)); - } - break; - } - - /* If other invocations have finished all nodes, break out; there is no work to do */ - if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) { - break; - } - } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter); - - shared_phase_index = DEREF(header).sync_data.phase_index; - } - - barrier(); - if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) - return TASK_INDEX_INVALID; - - num_tasks_to_skip = shared_phase_index - phase_index; - - uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter; - return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x; -} - -bool -should_execute_phase() -{ - if (num_tasks_to_skip > 0) { - /* Skip to next phase. 
*/ - ++phase_index; - --num_tasks_to_skip; - return false; - } - return true; -} - -#define PHASE(header) \ - for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true)) -#endif - #endif /* BUILD_HELPERS_H */ diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h index 6422319c506..c0c06c98fed 100644 --- a/src/amd/vulkan/bvh/build_interface.h +++ b/src/amd/vulkan/bvh/build_interface.h @@ -16,49 +16,6 @@ #define VOID_REF uint64_t #endif -struct leaf_args { - VOID_REF ir; - VOID_REF bvh; - REF(radv_ir_header) header; - REF(key_id_pair) ids; - - radv_bvh_geometry_data geom_data; -}; - -struct morton_args { - VOID_REF bvh; - REF(radv_ir_header) header; - REF(key_id_pair) ids; -}; - -#define LBVH_RIGHT_CHILD_BIT_SHIFT 29 -#define LBVH_RIGHT_CHILD_BIT (1 << LBVH_RIGHT_CHILD_BIT_SHIFT) - -struct lbvh_node_info { - /* Number of children that have been processed (or are invalid/leaves) in - * the lbvh_generate_ir pass. - */ - uint32_t path_count; - - uint32_t children[2]; - uint32_t parent; -}; - -struct lbvh_main_args { - VOID_REF bvh; - REF(key_id_pair) src_ids; - VOID_REF node_info; - uint32_t id_count; - uint32_t internal_node_base; -}; - -struct lbvh_generate_ir_args { - VOID_REF bvh; - VOID_REF node_info; - VOID_REF header; - uint32_t internal_node_base; -}; - #define RADV_COPY_MODE_COPY 0 #define RADV_COPY_MODE_SERIALIZE 1 #define RADV_COPY_MODE_DESERIALIZE 2 @@ -72,30 +29,14 @@ struct copy_args { struct encode_args { VOID_REF intermediate_bvh; VOID_REF output_bvh; - REF(radv_ir_header) header; + REF(vk_ir_header) header; uint32_t output_bvh_offset; uint32_t leaf_node_count; uint32_t geometry_type; }; -struct ploc_prefix_scan_partition { - uint32_t aggregate; - uint32_t inclusive_sum; -}; - -#define PLOC_WORKGROUP_SIZE 1024 - -struct ploc_args { - VOID_REF bvh; - VOID_REF prefix_scan_partitions; - REF(radv_ir_header) header; - VOID_REF ids_0; - VOID_REF ids_1; - uint32_t internal_node_offset; -}; - struct header_args { - REF(radv_ir_header) src; + REF(vk_ir_header) src; REF(radv_accel_struct_header) dst; uint32_t bvh_offset; uint32_t instance_count; @@ -104,11 +45,11 @@ struct header_args { struct update_args { REF(radv_accel_struct_header) src; REF(radv_accel_struct_header) dst; - REF(radv_aabb) leaf_bounds; + REF(vk_aabb) leaf_bounds; REF(uint32_t) internal_ready_count; uint32_t leaf_node_count; - radv_bvh_geometry_data geom_data; + vk_bvh_geometry_data geom_data; }; #endif /* BUILD_INTERFACE_H */ diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h index 27399fff200..2b87ec47664 100644 --- a/src/amd/vulkan/bvh/bvh.h +++ b/src/amd/vulkan/bvh/bvh.h @@ -7,17 +7,14 @@ #ifndef BVH_BVH_H #define BVH_BVH_H +#include "vk_bvh.h" + #define radv_bvh_node_triangle 0 #define radv_bvh_node_box16 4 #define radv_bvh_node_box32 5 #define radv_bvh_node_instance 6 #define radv_bvh_node_aabb 7 -#define radv_ir_node_triangle 0 -#define radv_ir_node_internal 1 -#define radv_ir_node_instance 2 -#define radv_ir_node_aabb 3 - #define RADV_GEOMETRY_OPAQUE (1u << 31) #define RADV_INSTANCE_FORCE_OPAQUE (1u << 31) @@ -29,31 +26,9 @@ #define VK_UUID_SIZE 16 #else #include -typedef struct radv_ir_node radv_ir_node; -typedef struct radv_global_sync_data radv_global_sync_data; -typedef struct radv_bvh_geometry_data radv_bvh_geometry_data; - typedef uint16_t float16_t; - -typedef struct { - float values[3][4]; -} mat3x4; - -typedef struct { - float x; - float y; - float z; -} vec3; - -typedef struct radv_aabb radv_aabb; - #endif 
-struct radv_aabb { - vec3 min; - vec3 max; -}; - struct radv_accel_struct_serialization_header { uint8_t driver_uuid[VK_UUID_SIZE]; uint8_t accel_struct_compat[VK_UUID_SIZE]; @@ -74,7 +49,7 @@ struct radv_accel_struct_geometry_info { struct radv_accel_struct_header { uint32_t bvh_offset; uint32_t reserved; - radv_aabb aabb; + vk_aabb aabb; /* Everything after this gets either updated/copied from the CPU or written by header.comp. */ uint64_t compacted_size; @@ -89,45 +64,6 @@ struct radv_accel_struct_header { uint32_t build_flags; }; -struct radv_ir_node { - radv_aabb aabb; -}; - -#define RADV_UNKNOWN_BVH_OFFSET 0xFFFFFFFF -#define RADV_NULL_BVH_OFFSET 0xFFFFFFFE - -struct radv_ir_box_node { - radv_ir_node base; - uint32_t children[2]; - uint32_t bvh_offset; -}; - -struct radv_global_sync_data { - uint32_t task_counts[2]; - uint32_t task_started_counter; - uint32_t task_done_counter; - uint32_t current_phase_start_counter; - uint32_t current_phase_end_counter; - uint32_t phase_index; - /* If this flag is set, the shader should exit - * instead of executing another phase */ - uint32_t next_phase_exit_flag; -}; - -struct radv_ir_header { - int32_t min_bounds[3]; - int32_t max_bounds[3]; - uint32_t active_leaf_count; - /* Indirect dispatch dimensions for the encoder. - * ir_internal_node_count is the thread count in the X dimension, - * while Y and Z are always set to 1. */ - uint32_t ir_internal_node_count; - uint32_t dispatch_size_y; - uint32_t dispatch_size_z; - radv_global_sync_data sync_data; - uint32_t dst_node_offset; -}; - struct radv_bvh_triangle_node { float coords[3][3]; uint32_t reserved[3]; @@ -170,28 +106,11 @@ struct radv_bvh_box16_node { struct radv_bvh_box32_node { uint32_t children[4]; - radv_aabb coords[4]; + vk_aabb coords[4]; uint32_t reserved[4]; }; #define RADV_BVH_ROOT_NODE radv_bvh_node_box32 #define RADV_BVH_INVALID_NODE 0xffffffffu -/* If the task index is set to this value, there is no - * more work to do. */ -#define TASK_INDEX_INVALID 0xFFFFFFFF - -struct radv_bvh_geometry_data { - uint64_t data; - uint64_t indices; - uint64_t transform; - - uint32_t geometry_id; - uint32_t geometry_type; - uint32_t first_id; - uint32_t stride; - uint32_t vertex_format; - uint32_t index_format; -}; - #endif /* BVH_H */ diff --git a/src/amd/vulkan/bvh/encode.comp b/src/amd/vulkan/bvh/encode.comp index 5c84f631860..50623aa3736 100644 --- a/src/amd/vulkan/bvh/encode.comp +++ b/src/amd/vulkan/bvh/encode.comp @@ -36,31 +36,85 @@ void set_parent(uint32_t child, uint32_t parent) void main() { - /* Revert the order so we start at the root */ - uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; - - uint32_t output_leaf_node_size; - switch (args.geometry_type) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - output_leaf_node_size = SIZEOF(radv_bvh_triangle_node); - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - output_leaf_node_size = SIZEOF(radv_bvh_aabb_node); - break; - default: /* instances */ - output_leaf_node_size = SIZEOF(radv_bvh_instance_node); - break; - } - - uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * SIZEOF(radv_ir_node); + /* Encode leaf nodes. 
*/ uint32_t dst_leaf_offset = id_to_offset(RADV_BVH_ROOT_NODE) + SIZEOF(radv_bvh_box32_node); + + uint32_t ir_leaf_node_size; + uint32_t output_leaf_node_size; + switch (args.geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { + ir_leaf_node_size = SIZEOF(vk_ir_triangle_node); + output_leaf_node_size = SIZEOF(radv_bvh_triangle_node); + + vk_ir_triangle_node src_node = + DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size))); + REF(radv_bvh_triangle_node) dst_node = + REF(radv_bvh_triangle_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size)); + + DEREF(dst_node).coords = src_node.coords; + DEREF(dst_node).triangle_id = src_node.triangle_id; + DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags; + DEREF(dst_node).id = 9; + + break; + } + case VK_GEOMETRY_TYPE_AABBS_KHR: { + ir_leaf_node_size = SIZEOF(vk_ir_aabb_node); + output_leaf_node_size = SIZEOF(radv_bvh_aabb_node); + + vk_ir_aabb_node src_node = + DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size))); + REF(radv_bvh_aabb_node) dst_node = + REF(radv_bvh_aabb_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size)); + + DEREF(dst_node).primitive_id = src_node.primitive_id; + DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags; + + break; + } + default: { + /* instances */ + ir_leaf_node_size = SIZEOF(vk_ir_instance_node); + output_leaf_node_size = SIZEOF(radv_bvh_instance_node); + + vk_ir_instance_node src_node = + DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size))); + REF(radv_bvh_instance_node) dst_node = + REF(radv_bvh_instance_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size)); + + radv_accel_struct_header blas_header = + DEREF(REF(radv_accel_struct_header)(src_node.base_ptr)); + + DEREF(dst_node).bvh_ptr = addr_to_node(src_node.base_ptr + blas_header.bvh_offset); + DEREF(dst_node).bvh_offset = blas_header.bvh_offset; + + mat4 transform = mat4(src_node.otw_matrix); + mat4 inv_transform = transpose(inverse(transpose(transform))); + DEREF(dst_node).wto_matrix = mat3x4(inv_transform); + DEREF(dst_node).otw_matrix = mat3x4(transform); + + DEREF(dst_node).custom_instance_and_mask = src_node.custom_instance_and_mask; + DEREF(dst_node).sbt_offset_and_flags = encode_sbt_offset_and_flags(src_node.sbt_offset_and_flags); + DEREF(dst_node).instance_id = src_node.instance_id; + + break; + } + } + + if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count) + return; + + /* Encode internal nodes. 
Revert the order so we start at the root */ + uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; + + uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * ir_leaf_node_size; uint32_t dst_internal_offset = dst_leaf_offset + args.leaf_node_count * output_leaf_node_size; - REF(radv_ir_box_node) intermediate_internal_nodes = - REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); - REF(radv_ir_box_node) src_node = INDEX(radv_ir_box_node, intermediate_internal_nodes, global_id); - radv_ir_box_node src = DEREF(src_node); + REF(vk_ir_box_node) intermediate_internal_nodes = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); + REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id); + vk_ir_box_node src = DEREF(src_node); bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1; @@ -70,10 +124,10 @@ main() gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset; - if (bvh_offset == RADV_UNKNOWN_BVH_OFFSET) + if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) continue; - if (bvh_offset == RADV_NULL_BVH_OFFSET) + if (bvh_offset == VK_NULL_BVH_OFFSET) break; REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset)); @@ -92,11 +146,11 @@ main() float largest_surface_area = -INFINITY; for (int32_t i = 0; i < found_child_count; ++i) { - if (ir_id_to_type(children[i]) != radv_ir_node_internal) + if (ir_id_to_type(children[i]) != vk_ir_node_internal) continue; - radv_aabb bounds = - DEREF(REF(radv_ir_node)OFFSET(args.intermediate_bvh, + vk_aabb bounds = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, ir_id_to_offset(children[i]))).aabb; float surface_area = aabb_surface_area(bounds); @@ -107,8 +161,8 @@ main() } if (collapsed_child_index != -1) { - REF(radv_ir_box_node) child_node = - REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, + REF(vk_ir_box_node) child_node = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, ir_id_to_offset(children[collapsed_child_index])); uint32_t grandchildren[2] = DEREF(child_node).children; uint32_t valid_grandchild_count = 0; @@ -131,7 +185,7 @@ main() children[collapsed_child_index] = children[found_child_count]; } - DEREF(child_node).bvh_offset = RADV_NULL_BVH_OFFSET; + DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; } else break; } @@ -141,24 +195,24 @@ main() uint32_t offset = ir_id_to_offset(children[i]); uint32_t dst_offset; - if (type == radv_ir_node_internal) { + if (type == vk_ir_node_internal) { #if COMPACT dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node)); #else uint32_t offset_in_internal_nodes = offset - intermediate_leaf_nodes_size; - uint32_t child_index = offset_in_internal_nodes / SIZEOF(radv_ir_box_node); + uint32_t child_index = offset_in_internal_nodes / SIZEOF(vk_ir_box_node); dst_offset = dst_internal_offset + child_index * SIZEOF(radv_bvh_box32_node); #endif - REF(radv_ir_box_node) child_node = REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, offset); + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset); DEREF(child_node).bvh_offset = dst_offset; } else { - uint32_t child_index = offset / SIZEOF(radv_ir_node); + uint32_t child_index = offset / ir_leaf_node_size; dst_offset = dst_leaf_offset + child_index * output_leaf_node_size; } - radv_aabb 
child_aabb = - DEREF(REF(radv_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; + vk_aabb child_aabb = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; DEREF(dst_node).coords[i] = child_aabb; diff --git a/src/amd/vulkan/bvh/leaf.comp b/src/amd/vulkan/bvh/leaf.comp deleted file mode 100644 index 26568527c6f..00000000000 --- a/src/amd/vulkan/bvh/leaf.comp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright © 2022 Konstantin Seurer - * - * SPDX-License-Identifier: MIT - */ - -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require -#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require -#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require -#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_buffer_reference : require -#extension GL_EXT_buffer_reference2 : require -#extension GL_KHR_shader_subgroup_vote : require -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_ballot : require - -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; - -#include "build_interface.h" - -layout(push_constant) uniform CONSTS { - leaf_args args; -}; - -void -main(void) -{ - uint32_t global_id = gl_GlobalInvocationID.x; - uint32_t primitive_id = args.geom_data.first_id + global_id; - - REF(key_id_pair) id_ptr = INDEX(key_id_pair, args.ids, primitive_id); - uint32_t src_offset = global_id * args.geom_data.stride; - - uint32_t dst_stride; - uint32_t node_type; - if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { - dst_stride = SIZEOF(radv_bvh_triangle_node); - node_type = radv_ir_node_triangle; - } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { - dst_stride = SIZEOF(radv_bvh_aabb_node); - node_type = radv_ir_node_aabb; - } else { - dst_stride = SIZEOF(radv_bvh_instance_node); - node_type = radv_ir_node_instance; - } - - uint32_t dst_offset = primitive_id * dst_stride; - VOID_REF dst_ptr = OFFSET(args.bvh, dst_offset); - - radv_aabb bounds; - bool is_active; - if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { - is_active = build_triangle(bounds, dst_ptr, args.geom_data, global_id); - } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { - VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); - is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, global_id); - } else { - VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); - /* arrayOfPointers */ - if (args.geom_data.stride == 8) { - src_ptr = DEREF(REF(VOID_REF)(src_ptr)); - } - - is_active = build_instance(bounds, src_ptr, dst_ptr, global_id); - } - -#if ALWAYS_ACTIVE - if (!is_active && args.geom_data.geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) { - bounds.min = vec3(0.0); - bounds.max = vec3(0.0); - is_active = true; - } -#endif - - if (is_active) { - REF(radv_ir_node) ir_node = INDEX(radv_ir_node, args.ir, primitive_id); - DEREF(ir_node).aabb = bounds; - } - - uint32_t ir_offset = primitive_id * SIZEOF(radv_ir_node); - DEREF(id_ptr).id = is_active ? 
pack_ir_node_id(ir_offset, node_type) : RADV_BVH_INVALID_NODE; - - uvec4 ballot = subgroupBallot(is_active); - if (subgroupElect()) - atomicAdd(DEREF(args.header).active_leaf_count, subgroupBallotBitCount(ballot)); - - atomicMin(DEREF(args.header).min_bounds[0], to_emulated_float(bounds.min.x)); - atomicMin(DEREF(args.header).min_bounds[1], to_emulated_float(bounds.min.y)); - atomicMin(DEREF(args.header).min_bounds[2], to_emulated_float(bounds.min.z)); - atomicMax(DEREF(args.header).max_bounds[0], to_emulated_float(bounds.max.x)); - atomicMax(DEREF(args.header).max_bounds[1], to_emulated_float(bounds.max.y)); - atomicMax(DEREF(args.header).max_bounds[2], to_emulated_float(bounds.max.z)); -} diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build index 594194169a9..9173892d4a1 100644 --- a/src/amd/vulkan/bvh/meson.build +++ b/src/amd/vulkan/bvh/meson.build @@ -23,36 +23,6 @@ bvh_shaders = [ 'header', [], ], - [ - 'lbvh_generate_ir.comp', - 'lbvh_generate_ir', - [], - ], - [ - 'lbvh_main.comp', - 'lbvh_main', - [], - ], - [ - 'leaf.comp', - 'leaf', - ['ALWAYS_ACTIVE=0'], - ], - [ - 'leaf.comp', - 'leaf_always_active', - ['ALWAYS_ACTIVE=1'], - ], - [ - 'morton.comp', - 'morton', - [], - ], - [ - 'ploc_internal.comp', - 'ploc_internal', - [], - ], [ 'update.comp', 'update', @@ -61,17 +31,20 @@ bvh_shaders = [ ] bvh_include_dir = dir_source_root + '/src/amd/vulkan/bvh' +vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh' bvh_includes = files( 'build_helpers.h', 'build_interface.h', 'bvh.h', + vk_bvh_include_dir + '/vk_build_helpers.h', + vk_bvh_include_dir + '/vk_bvh.h', ) bvh_spv = [] foreach s : bvh_shaders command = [ - prog_glslang, '-V', '-I' + bvh_include_dir, '--target-env', 'spirv1.5', + prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet, ] diff --git a/src/amd/vulkan/bvh/update.comp b/src/amd/vulkan/bvh/update.comp index 54577355e9e..ca06dfdf375 100644 --- a/src/amd/vulkan/bvh/update.comp +++ b/src/amd/vulkan/bvh/update.comp @@ -53,7 +53,7 @@ void main() { VOID_REF dst_ptr = OFFSET(dst_bvh, dst_offset); uint32_t src_offset = gl_GlobalInvocationID.x * args.geom_data.stride; - radv_aabb bounds; + vk_aabb bounds; bool is_active; if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { is_active = build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x); @@ -65,7 +65,7 @@ void main() { if (!is_active) return; - DEREF(INDEX(radv_aabb, args.leaf_bounds, leaf_node_id)) = bounds; + DEREF(INDEX(vk_aabb, args.leaf_bounds, leaf_node_id)) = bounds; memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); @@ -112,11 +112,11 @@ void main() { for (uint32_t i = 0; i < valid_child_count; ++i) { uint32_t child_offset = id_to_offset(children[i]); - radv_aabb child_bounds; + vk_aabb child_bounds; if (child_offset == dst_offset) child_bounds = bounds; else if (child_offset >= internal_nodes_offset) { - child_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY)); + child_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY)); REF(radv_bvh_box32_node) child_node = REF(radv_bvh_box32_node)OFFSET(dst_bvh, child_offset); for (uint32_t j = 0; j < 4; ++j) { if (DEREF(child_node).children[j] == RADV_BVH_INVALID_NODE) @@ -126,16 +126,16 @@ void main() { } } else { uint32_t child_index = (child_offset - first_leaf_offset) / leaf_node_size; - child_bounds = 
DEREF(INDEX(radv_aabb, args.leaf_bounds, child_index)); + child_bounds = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index)); } DEREF(dst_node).coords[i] = child_bounds; } if (parent_id == RADV_BVH_ROOT_NODE) { - radv_aabb root_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY)); + vk_aabb root_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY)); for (uint32_t i = 0; i < valid_child_count; ++i) { - radv_aabb bounds = DEREF(dst_node).coords[i]; + vk_aabb bounds = DEREF(dst_node).coords[i]; root_bounds.min = min(root_bounds.min, bounds.min); root_bounds.max = max(root_bounds.max, bounds.max); } diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build index 5976bef8b85..539be57c9dc 100644 --- a/src/amd/vulkan/meson.build +++ b/src/amd/vulkan/meson.build @@ -191,9 +191,6 @@ if amd_with_llvm ) endif -subdir('radix_sort') -libradv_files += radix_sort_files - subdir('bvh') subdir('layers') diff --git a/src/amd/vulkan/radix_sort/meson.build b/src/amd/vulkan/radix_sort/meson.build deleted file mode 100644 index c1478755822..00000000000 --- a/src/amd/vulkan/radix_sort/meson.build +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright © 2022 Konstantin Seurer -# SPDX-License-Identifier: MIT - -subdir('shaders') - -radix_sort_files = files( - 'common/vk/barrier.c', - 'common/vk/barrier.h', - 'common/macros.h', - 'common/util.c', - 'common/util.h', - 'shaders/push.h', - 'targets/u64/config.h', - 'radix_sort_vk_devaddr.h', - 'radix_sort_vk_ext.h', - 'radix_sort_vk.c', - 'radix_sort_vk.h', - 'radv_radix_sort.c', - 'radv_radix_sort.h', - 'target.h' -) diff --git a/src/amd/vulkan/radix_sort/radv_radix_sort.c b/src/amd/vulkan/radix_sort/radv_radix_sort.c deleted file mode 100644 index 4305baaba75..00000000000 --- a/src/amd/vulkan/radix_sort/radv_radix_sort.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright © 2022 Konstantin Seurer - * - * SPDX-License-Identifier: MIT - */ - -#include "radv_radix_sort.h" -#include "targets/u64/config.h" -#include "radv_cmd_buffer.h" -#include "target.h" - -static const uint32_t init_spv[] = { -#include "radix_sort/shaders/init.comp.spv.h" -}; - -static const uint32_t fill_spv[] = { -#include "radix_sort/shaders/fill.comp.spv.h" -}; - -static const uint32_t histogram_spv[] = { -#include "radix_sort/shaders/histogram.comp.spv.h" -}; - -static const uint32_t prefix_spv[] = { -#include "radix_sort/shaders/prefix.comp.spv.h" -}; - -static const uint32_t scatter_0_even_spv[] = { -#include "radix_sort/shaders/scatter_0_even.comp.spv.h" -}; - -static const uint32_t scatter_0_odd_spv[] = { -#include "radix_sort/shaders/scatter_0_odd.comp.spv.h" -}; - -static const uint32_t scatter_1_even_spv[] = { -#include "radix_sort/shaders/scatter_1_even.comp.spv.h" -}; - -static const uint32_t scatter_1_odd_spv[] = { -#include "radix_sort/shaders/scatter_1_odd.comp.spv.h" -}; - -static const struct radix_sort_vk_target_config target_config = { - .keyval_dwords = RS_KEYVAL_DWORDS, - - .histogram = - { - .workgroup_size_log2 = RS_HISTOGRAM_WORKGROUP_SIZE_LOG2, - .subgroup_size_log2 = RS_HISTOGRAM_SUBGROUP_SIZE_LOG2, - .block_rows = RS_HISTOGRAM_BLOCK_ROWS, - }, - - .prefix = - { - .workgroup_size_log2 = RS_PREFIX_WORKGROUP_SIZE_LOG2, - .subgroup_size_log2 = RS_PREFIX_SUBGROUP_SIZE_LOG2, - }, - - .scatter = - { - .workgroup_size_log2 = RS_SCATTER_WORKGROUP_SIZE_LOG2, - .subgroup_size_log2 = RS_SCATTER_SUBGROUP_SIZE_LOG2, - .block_rows = RS_SCATTER_BLOCK_ROWS, - }, -}; - -radix_sort_vk_t * -radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc) -{ - const 
uint32_t *spv[8] = { - init_spv, fill_spv, histogram_spv, prefix_spv, - scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv, - }; - const uint32_t spv_sizes[8] = { - sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv), sizeof(prefix_spv), - sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv), sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv), - }; - return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, target_config); -} - -VKAPI_ATTR VkResult VKAPI_CALL -vkCreateShaderModule(VkDevice _device, const VkShaderModuleCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.CreateShaderModule(_device, pCreateInfo, pAllocator, pShaderModule); -} - -VKAPI_ATTR void VKAPI_CALL -vkDestroyShaderModule(VkDevice _device, VkShaderModule shaderModule, const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - device->vk.dispatch_table.DestroyShaderModule(_device, shaderModule, pAllocator); -} - -VKAPI_ATTR VkResult VKAPI_CALL -vkCreatePipelineLayout(VkDevice _device, const VkPipelineLayoutCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.CreatePipelineLayout(_device, pCreateInfo, pAllocator, pPipelineLayout); -} - -VKAPI_ATTR void VKAPI_CALL -vkDestroyPipelineLayout(VkDevice _device, VkPipelineLayout pipelineLayout, const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - device->vk.dispatch_table.DestroyPipelineLayout(_device, pipelineLayout, pAllocator); -} - -VKAPI_ATTR VkResult VKAPI_CALL -vkCreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t createInfoCount, - const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, - VkPipeline *pPipelines) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.CreateComputePipelines(_device, pipelineCache, createInfoCount, pCreateInfos, - pAllocator, pPipelines); -} - -VKAPI_ATTR void VKAPI_CALL -vkDestroyPipeline(VkDevice _device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - device->vk.dispatch_table.DestroyPipeline(_device, pipeline, pAllocator); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags, uint32_t memoryBarrierCount, - const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount, - const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount, - const VkImageMemoryBarrier *pImageMemoryBarriers) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, dependencyFlags, - memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount, - pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags, - uint32_t offset, uint32_t size, const void *pValues) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device 
*device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdPushConstants(commandBuffer, layout, stageFlags, offset, size, pValues); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, pipelineBindPoint, pipeline); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdDispatch(commandBuffer, groupCountX, groupCountY, groupCountZ); -} - -VKAPI_ATTR VkDeviceAddress VKAPI_CALL -vkGetBufferDeviceAddress(VkDevice _device, const VkBufferDeviceAddressInfo *pInfo) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.GetBufferDeviceAddress(_device, pInfo); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize size, - uint32_t data) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdFillBuffer(commandBuffer, dstBuffer, dstOffset, size, data); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdDispatchIndirect(commandBuffer, buffer, offset); -} diff --git a/src/amd/vulkan/radix_sort/radv_radix_sort.h b/src/amd/vulkan/radix_sort/radv_radix_sort.h deleted file mode 100644 index a0990610b9f..00000000000 --- a/src/amd/vulkan/radix_sort/radv_radix_sort.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright © 2022 Konstantin Seurer - * - * SPDX-License-Identifier: MIT - */ - -#ifndef RADV_RADIX_SORT_H -#define RADV_RADIX_SORT_H - -#include "radix_sort_vk_devaddr.h" - -radix_sort_vk_t *radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc); - -#endif diff --git a/src/amd/vulkan/radix_sort/shaders/meson.build b/src/amd/vulkan/radix_sort/shaders/meson.build deleted file mode 100644 index 7b5545696b2..00000000000 --- a/src/amd/vulkan/radix_sort/shaders/meson.build +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright © 2022 Konstantin Seurer -# SPDX-License-Identifier: MIT - -radix_sort_shaders = [ - 'init.comp', - 'fill.comp', - 'histogram.comp', - 'prefix.comp', - 'scatter_0_even.comp', - 'scatter_0_odd.comp', - 'scatter_1_even.comp', - 'scatter_1_odd.comp' -] - -shader_include_dir = dir_source_root + '/src/amd/vulkan/radix_sort/targets/u64' - -shader_include_files = files( - 'bufref.h', - 'prefix_limits.h', - 'prefix.h', - 'push.h', - 'scatter.glsl', - dir_source_root + '/src/amd/vulkan/radix_sort/targets/u64/config.h' -) - -radix_sort_spv = [] -foreach s : radix_sort_shaders - _name = f'@s@.spv.h' - radix_sort_spv += custom_target( - _name, - input : s, - output : _name, - command : [ - prog_glslang, '-V', '-I' + shader_include_dir, '--target-env', 'spirv1.3', - '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_quiet, glslang_depfile, - ], - depfile : f'@_name@.d', - 
depend_files : shader_include_files, - ) -endforeach diff --git a/src/amd/vulkan/radix_sort/shaders/prefix.h b/src/amd/vulkan/radix_sort/shaders/prefix.h deleted file mode 100644 index f9d470bb3f5..00000000000 --- a/src/amd/vulkan/radix_sort/shaders/prefix.h +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2021 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ -#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ - -// -// Requires several defines -// -#ifndef RS_PREFIX_LIMITS -#error "Error: \"prefix_limits.h\" not loaded" -#endif - -#ifndef RS_PREFIX_ARGS -#error "Error: RS_PREFIX_ARGS undefined" -#endif - -#ifndef RS_PREFIX_LOAD -#error "Error: RS_PREFIX_LOAD undefined" -#endif - -#ifndef RS_PREFIX_STORE -#error "Error: RS_PREFIX_STORE undefined" -#endif - -#ifndef RS_SUBGROUP_SIZE -#error "Error: RS_SUBGROUP_SIZE undefined" -#endif - -#ifndef RS_WORKGROUP_SIZE -#error "Error: RS_WORKGROUP_SIZE undefined" -#endif - -#ifndef RS_WORKGROUP_SUBGROUPS -#error "Error: RS_WORKGROUP_SUBGROUPS undefined" -#endif - -// -// Optional switches: -// -// * Disable holding original inclusively scanned histogram values in registers. -// -// #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS -// - -// -// Compute exclusive prefix of uint32_t[256] -// -void -rs_prefix(RS_PREFIX_ARGS) -{ -#if (RS_WORKGROUP_SUBGROUPS == 1) - // - // Workgroup is a single subgroup so no shared memory is required. - // - - // - // Exclusive scan-add the histogram - // - const uint32_t h0 = RS_PREFIX_LOAD(0); - const uint32_t h0_inc = subgroupInclusiveAdd(h0); - RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - - RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive - - // - // Each iteration is dependent on the previous so no unrolling. The - // compiler is free to hoist the loads upward though. - // - for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; // - ii < RS_RADIX_SIZE; - ii += RS_SUBGROUP_SIZE) - { - const uint32_t h = RS_PREFIX_LOAD(ii); - const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last; - h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); - - RS_PREFIX_STORE(ii) = h_inc - h; // exclusive - } - -#else - // - // Workgroup is multiple subgroups and uses shared memory to store - // the scan's intermediate results. - // - // Assumes a power-of-two subgroup, workgroup and radix size. - // - // Downsweep: Repeatedly scan reductions until they fit in a single - // subgroup. - // - // Upsweep: Then uniformly apply reductions to each subgroup. 
- // - // - // Subgroup Size | 4 | 8 | 16 | 32 | 64 | - // --------------+----+----+----+----+----+ - // Sweep 0 | 64 | 32 | 16 | 8 | 4 | sweep_0[] - // Sweep 1 | 16 | 4 | - | - | - | sweep_1[] - // Sweep 2 | 4 | - | - | - | - | sweep_2[] - // --------------+----+----+----+----+----+ - // Total dwords | 84 | 36 | 16 | 8 | 4 | - // --------------+----+----+----+----+----+ - // -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - uint32_t h_exc[RS_H_COMPONENTS]; -#endif - - // - // Downsweep 0 - // - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - const uint32_t h_inc = subgroupInclusiveAdd(h); - - const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); - - // -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - h_exc[ii] = h_inc - h; -#else - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h; -#endif - } - - barrier(); - - // - // Skip generalizing these sweeps for all possible subgroups -- just - // write them directly. - // -#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16)) - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 - // -#if (RS_SWEEP_0_SIZE != RS_SUBGROUP_SIZE) - if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // subgroup has inactive invocations -#endif - { - const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; - } - -#elif (RS_SUBGROUP_SIZE == 8) - -#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 and Downsweep 1 - // - if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations - { - const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } - -#else - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 and Downsweep 1 - // - [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; - const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } - -#endif - - barrier(); - - // - // Scan 1 - // - if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations - { - const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); - const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); - - RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; - } - -#elif (RS_SUBGROUP_SIZE == 4) - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 and Downsweep 1 - // -#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) - - if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations - { - const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(gl_SubgroupID) = 
subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } - -#else - - [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; - const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } -#endif - - barrier(); - - // - // Scan 1 and Downsweep 2 - // -#if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE) - if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations - { - const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); - const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); - - RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; - RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); - } - -#else - - [[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations - { - const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; - const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1); - const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); - - RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red; - RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); - } - -#endif - - barrier(); - - // - // Scan 2 - // - // 4 invocations - // - if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE) - { - const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x); - const uint32_t h2_inc = subgroupInclusiveAdd(h2_red); - - RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red; - } - -#else -#error "Error: Unsupported subgroup size" -#endif - - barrier(); - - ////////////////////////////////////////////////////////////////////// - // - // Final upsweep 0 - // -#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16)) - - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - // clang format issue -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0); -#else - const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0); -#endif - } - -#elif (RS_SUBGROUP_SIZE == 8) - - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; - -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); -#else - const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); -#endif - } - -#elif (RS_SUBGROUP_SIZE == 4) - - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; - const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE; - -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); -#else - const uint32_t 
h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); -#endif - } - -#else -#error "Error: Unsupported subgroup size" -#endif - -#endif -} - -// -// -// - -#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ diff --git a/src/amd/vulkan/radix_sort/targets/u64/config.h b/src/amd/vulkan/radix_sort/targets/u64/config.h deleted file mode 100644 index fa1a51eb017..00000000000 --- a/src/amd/vulkan/radix_sort/targets/u64/config.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2021 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_ -#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_ - -// -// -// - -// clang-format off -#define RS_KEYVAL_DWORDS 2 - -#define RS_FILL_WORKGROUP_SIZE_LOG2 7 -#define RS_FILL_BLOCK_ROWS 8 - -#define RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 8 -#define RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 6 -#define RS_HISTOGRAM_BLOCK_ROWS 14 - -#define RS_PREFIX_WORKGROUP_SIZE_LOG2 8 -#define RS_PREFIX_SUBGROUP_SIZE_LOG2 6 - -#define RS_SCATTER_WORKGROUP_SIZE_LOG2 8 -#define RS_SCATTER_SUBGROUP_SIZE_LOG2 6 -#define RS_SCATTER_BLOCK_ROWS 14 -// clang-format on - -// -// -// - -#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_ diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index dbb5595494f..c6fe70528b6 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -4,16 +4,12 @@ * SPDX-License-Identifier: MIT */ -#include "radv_sqtt.h" - #include "meta/radv_meta.h" #include "nir_builder.h" #include "radv_cs.h" #include "radv_entrypoints.h" -#include "radix_sort/common/vk/barrier.h" -#include "radix_sort/radv_radix_sort.h" -#include "radix_sort/shaders/push.h" +#include "radix_sort/radix_sort_u64.h" #include "bvh/build_interface.h" #include "bvh/bvh.h" @@ -21,30 +17,6 @@ #include "vk_acceleration_structure.h" #include "vk_common_entrypoints.h" -static const uint32_t leaf_spv[] = { -#include "bvh/leaf.spv.h" -}; - -static const uint32_t leaf_always_active_spv[] = { -#include "bvh/leaf_always_active.spv.h" -}; - -static const uint32_t morton_spv[] = { -#include "bvh/morton.spv.h" -}; - -static const uint32_t lbvh_main_spv[] = { -#include "bvh/lbvh_main.spv.h" -}; - -static const uint32_t lbvh_generate_ir_spv[] = { -#include "bvh/lbvh_generate_ir.spv.h" -}; - -static const uint32_t ploc_spv[] = { -#include "bvh/ploc_internal.spv.h" -}; - static const uint32_t copy_spv[] = { #include "bvh/copy.spv.h" }; @@ -65,21 +37,6 @@ static const uint32_t update_spv[] = { #include "bvh/update.spv.h" }; -#define KEY_ID_PAIR_SIZE 8 -#define MORTON_BIT_SIZE 24 - -enum internal_build_type { - INTERNAL_BUILD_TYPE_LBVH, - INTERNAL_BUILD_TYPE_PLOC, - INTERNAL_BUILD_TYPE_UPDATE, -}; - -struct build_config { - enum internal_build_type internal_type; - bool compact; - bool updateable; -}; - struct acceleration_structure_layout { uint32_t geometry_info_offset; uint32_t bvh_offset; @@ -89,71 +46,23 @@ struct acceleration_structure_layout { }; struct scratch_layout { - uint32_t size; uint32_t update_size; - uint32_t header_offset; - - /* Used for UPDATE only. 
*/ - uint32_t internal_ready_count_offset; - - /* Used for BUILD only. */ - - uint32_t sort_buffer_offset[2]; - uint32_t sort_internal_offset; - - uint32_t ploc_prefix_sum_partition_offset; - uint32_t lbvh_node_offset; - - uint32_t ir_offset; - uint32_t internal_node_offset; }; -static struct build_config -build_config(uint32_t leaf_count, const VkAccelerationStructureBuildGeometryInfoKHR *build_info) -{ - struct build_config config = {0}; - - if (leaf_count <= 4) - config.internal_type = INTERNAL_BUILD_TYPE_LBVH; - else if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) - config.internal_type = INTERNAL_BUILD_TYPE_PLOC; - else if (!(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_BUILD_BIT_KHR) && - !(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR)) - config.internal_type = INTERNAL_BUILD_TYPE_PLOC; - else - config.internal_type = INTERNAL_BUILD_TYPE_LBVH; - - if (build_info->mode == VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR && - build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) - config.internal_type = INTERNAL_BUILD_TYPE_UPDATE; - - if ((build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR) && - build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) - config.updateable = true; - - if (build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) - config.compact = true; - - return config; -} +enum radv_encode_key_bits { + RADV_ENCODE_KEY_COMPACT = 1, +}; static void -get_build_layout(struct radv_device *device, uint32_t leaf_count, - const VkAccelerationStructureBuildGeometryInfoKHR *build_info, - struct acceleration_structure_layout *accel_struct, struct scratch_layout *scratch) +radv_get_acceleration_structure_layout(struct radv_device *device, uint32_t leaf_count, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + struct acceleration_structure_layout *accel_struct) { uint32_t internal_count = MAX2(leaf_count, 2) - 1; - VkGeometryTypeKHR geometry_type = VK_GEOMETRY_TYPE_TRIANGLES_KHR; - - if (build_info->geometryCount) { - if (build_info->pGeometries) - geometry_type = build_info->pGeometries[0].geometryType; - else - geometry_type = build_info->ppGeometries[0]->geometryType; - } + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); uint32_t bvh_leaf_size; switch (geometry_type) { @@ -170,92 +79,52 @@ get_build_layout(struct radv_device *device, uint32_t leaf_count, unreachable("Unknown VkGeometryTypeKHR"); } - if (accel_struct) { - uint64_t bvh_size = bvh_leaf_size * leaf_count + sizeof(struct radv_bvh_box32_node) * internal_count; - uint32_t offset = 0; - offset += sizeof(struct radv_accel_struct_header); + uint64_t bvh_size = bvh_leaf_size * leaf_count + sizeof(struct radv_bvh_box32_node) * internal_count; + uint32_t offset = 0; + offset += sizeof(struct radv_accel_struct_header); - if (device->rra_trace.accel_structs) { - accel_struct->geometry_info_offset = offset; - offset += sizeof(struct radv_accel_struct_geometry_info) * build_info->geometryCount; - } - /* Parent links, which have to go directly before bvh_offset as we index them using negative - * offsets from there. */ - offset += bvh_size / 64 * 4; - - /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. 
*/ - offset = ALIGN(offset, 64); - accel_struct->bvh_offset = offset; - - /* root node */ - offset += sizeof(struct radv_bvh_box32_node); - - accel_struct->leaf_nodes_offset = offset; - offset += bvh_leaf_size * leaf_count; - - accel_struct->internal_nodes_offset = offset; - /* Factor out the root node. */ - offset += sizeof(struct radv_bvh_box32_node) * (internal_count - 1); - - accel_struct->size = offset; + if (device->rra_trace.accel_structs) { + accel_struct->geometry_info_offset = offset; + offset += sizeof(struct radv_accel_struct_geometry_info) * build_info->geometryCount; } + /* Parent links, which have to go directly before bvh_offset as we index them using negative + * offsets from there. */ + offset += bvh_size / 64 * 4; - if (scratch) { - radix_sort_vk_memory_requirements_t requirements = { - 0, - }; - if (radv_device_init_accel_struct_build_state(device) == VK_SUCCESS) - radix_sort_vk_get_memory_requirements(device->meta_state.accel_struct_build.radix_sort, leaf_count, - &requirements); + /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. */ + offset = ALIGN(offset, 64); + accel_struct->bvh_offset = offset; - uint32_t offset = 0; + /* root node */ + offset += sizeof(struct radv_bvh_box32_node); - uint32_t ploc_scratch_space = 0; - uint32_t lbvh_node_space = 0; + accel_struct->leaf_nodes_offset = offset; + offset += bvh_leaf_size * leaf_count; - struct build_config config = build_config(leaf_count, build_info); + accel_struct->internal_nodes_offset = offset; + /* Factor out the root node. */ + offset += sizeof(struct radv_bvh_box32_node) * (internal_count - 1); - if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) - ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition); - else - lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count; + accel_struct->size = offset; +} - scratch->header_offset = offset; - offset += sizeof(struct radv_ir_header); +static void +radv_get_scratch_layout(struct radv_device *device, uint32_t leaf_count, struct scratch_layout *scratch) +{ + uint32_t internal_count = MAX2(leaf_count, 2) - 1; - scratch->sort_buffer_offset[0] = offset; - offset += requirements.keyvals_size; + uint32_t offset = 0; - scratch->sort_buffer_offset[1] = offset; - offset += requirements.keyvals_size; + scratch->header_offset = offset; + offset += sizeof(struct vk_ir_header); - scratch->sort_internal_offset = offset; - /* Internal sorting data is not needed when PLOC/LBVH are invoked, - * save space by aliasing them */ - scratch->ploc_prefix_sum_partition_offset = offset; - scratch->lbvh_node_offset = offset; - offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space); + uint32_t update_offset = 0; - scratch->ir_offset = offset; - offset += sizeof(struct radv_ir_node) * leaf_count; + update_offset += sizeof(vk_aabb) * leaf_count; + scratch->internal_ready_count_offset = update_offset; - scratch->internal_node_offset = offset; - offset += sizeof(struct radv_ir_box_node) * internal_count; - - scratch->size = offset; - - if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) { - uint32_t update_offset = 0; - - update_offset += sizeof(radv_aabb) * leaf_count; - scratch->internal_ready_count_offset = update_offset; - - update_offset += sizeof(uint32_t) * internal_count; - scratch->update_size = update_offset; - } else { - scratch->update_size = offset; - } - } + update_offset += sizeof(uint32_t) * internal_count; + scratch->update_size = update_offset; } 
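(A short aside on the scratch arithmetic in the hunk above, as a minimal standalone sketch rather than anything from the patch: the update scratch is one bounding box per leaf followed by one ready counter per internal node, and a binary tree over leaf_count leaves has MAX2(leaf_count, 2) - 1 internal nodes. The 24-byte struct aabb below is a hypothetical stand-in for vk_aabb, and the helper name is made up for illustration.)

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for vk_aabb: 3 float mins + 3 float maxs. */
struct aabb {
   float min[3];
   float max[3];
};

/* Mirrors the shape of radv_get_scratch_layout(): cached leaf AABBs first,
 * then one "ready count" per internal node; N leaves -> MAX2(N, 2) - 1
 * internal nodes. */
static uint32_t
update_scratch_size(uint32_t leaf_count, uint32_t *ready_count_offset)
{
   uint32_t internal_count = (leaf_count >= 2 ? leaf_count : 2) - 1;
   uint32_t offset = 0;

   offset += sizeof(struct aabb) * leaf_count;   /* cached leaf bounds */
   *ready_count_offset = offset;

   offset += sizeof(uint32_t) * internal_count;  /* per-internal-node ready counters */
   return offset;
}

int
main(void)
{
   uint32_t ready_offset;
   uint32_t size = update_scratch_size(1024, &ready_offset);
   printf("ready counters at %u, update scratch %u bytes\n",
          (unsigned)ready_offset, (unsigned)size);
   return 0;
}

Only the update path stays driver-side here; the build-time scratch layout (IR nodes, sort buffers, PLOC/LBVH space) is handled by the shared vk_acceleration_structure code, which is presumably why those offsets disappear from struct scratch_layout.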
VKAPI_ATTR void VKAPI_CALL @@ -272,17 +141,11 @@ radv_GetAccelerationStructureBuildSizesKHR(VkDevice _device, VkAccelerationStruc STATIC_ASSERT(sizeof(struct radv_bvh_box16_node) == 64); STATIC_ASSERT(sizeof(struct radv_bvh_box32_node) == 128); - uint32_t leaf_count = 0; - for (uint32_t i = 0; i < pBuildInfo->geometryCount; i++) - leaf_count += pMaxPrimitiveCounts[i]; + if (radv_device_init_accel_struct_build_state(device) != VK_SUCCESS) + return; - struct acceleration_structure_layout accel_struct; - struct scratch_layout scratch; - get_build_layout(device, leaf_count, pBuildInfo, &accel_struct, &scratch); - - pSizeInfo->accelerationStructureSize = accel_struct.size; - pSizeInfo->updateScratchSize = scratch.update_size; - pSizeInfo->buildScratchSize = scratch.size; + vk_get_as_build_sizes(_device, buildType, pBuildInfo, pMaxPrimitiveCounts, pSizeInfo, + &device->meta_state.accel_struct_build.build_args); } VKAPI_ATTR VkResult VKAPI_CALL @@ -319,24 +182,13 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device) struct vk_device_dispatch_table *dispatch = &device->vk.dispatch_table; dispatch->DestroyPipeline(_device, state->accel_struct_build.copy_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.ploc_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.lbvh_generate_ir_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.lbvh_main_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.leaf_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.leaf_updateable_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_compact_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.header_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.morton_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.update_pipeline, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.copy_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.ploc_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.lbvh_generate_ir_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.lbvh_main_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.leaf_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.encode_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.header_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.morton_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.update_p_layout, &state->alloc); if (state->accel_struct_build.radix_sort) @@ -492,7 +344,7 @@ radv_device_init_null_accel_struct(struct radv_device *device) }; for (uint32_t child = 0; child < 4; child++) { - root.coords[child] = (radv_aabb){ + root.coords[child] = (vk_aabb){ .min.x = NAN, .min.y = NAN, .min.z = NAN, @@ -524,6 +376,328 @@ radv_device_init_null_accel_struct(struct radv_device *device) return VK_SUCCESS; } +static VkDeviceSize +radv_get_as_size(VkDevice _device, const 
VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, uint32_t leaf_count) +{ + VK_FROM_HANDLE(radv_device, device, _device); + + struct acceleration_structure_layout accel_struct; + radv_get_acceleration_structure_layout(device, leaf_count, pBuildInfo, &accel_struct); + return accel_struct.size; +} + +static VkDeviceSize +radv_get_update_scratch_size(struct vk_device *vk_device, uint32_t leaf_count) +{ + struct radv_device *device = container_of(vk_device, struct radv_device, vk); + + struct scratch_layout scratch; + radv_get_scratch_layout(device, leaf_count, &scratch); + return scratch.update_size; +} + +static uint32_t +radv_get_encode_key(VkAccelerationStructureTypeKHR type, VkBuildAccelerationStructureFlagBitsKHR flags) +{ + if (flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) + return RADV_ENCODE_KEY_COMPACT; + + return 0; +} + +static VkResult +radv_encode_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + bool compact = key & RADV_ENCODE_KEY_COMPACT; + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + compact ? device->meta_state.accel_struct_build.encode_compact_pipeline + : device->meta_state.accel_struct_build.encode_pipeline); + + return VK_SUCCESS; +} + +static void +radv_encode_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, uint32_t leaf_count, uint32_t key, + struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + if (key & RADV_ENCODE_KEY_COMPACT) { + uint32_t dst_offset = layout.internal_nodes_offset - layout.bvh_offset; + radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, dst_node_offset), + &dst_offset, sizeof(uint32_t)); + } + + const struct encode_args args = { + .intermediate_bvh = intermediate_as_addr, + .output_bvh = vk_acceleration_structure_get_va(dst) + layout.bvh_offset, + .header = intermediate_header_addr, + .output_bvh_offset = layout.bvh_offset, + .leaf_node_count = leaf_count, + .geometry_type = vk_get_as_geometry_type(build_info), + }; + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); + + struct radv_dispatch_info dispatch = { + .unaligned = true, + .ordered = true, + .blocks = {leaf_count, 1, 1}, + }; + + radv_compute_dispatch(cmd_buffer, &dispatch); +} + +static VkResult +radv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + if (!(key & RADV_ENCODE_KEY_COMPACT)) + return VK_SUCCESS; + + /* Wait for encoding to finish. 
*/ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_WRITE_BIT, NULL, NULL) | + radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, NULL, NULL); + + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.accel_struct_build.header_pipeline); + + return VK_SUCCESS; +} + +static void +radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, VkDeviceAddress intermediate_header_addr, uint32_t leaf_count, + uint32_t key, struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + size_t base = offsetof(struct radv_accel_struct_header, compacted_size); + + uint64_t instance_count = build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR ? leaf_count : 0; + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + if (key & RADV_ENCODE_KEY_COMPACT) { + base = offsetof(struct radv_accel_struct_header, geometry_count); + + struct header_args args = { + .src = intermediate_header_addr, + .dst = vk_acceleration_structure_get_va(dst), + .bvh_offset = layout.bvh_offset, + .instance_count = instance_count, + }; + + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.header_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); + + radv_unaligned_dispatch(cmd_buffer, 1, 1, 1); + } + + struct radv_accel_struct_header header; + + header.instance_offset = layout.bvh_offset + sizeof(struct radv_bvh_box32_node); + header.instance_count = instance_count; + header.compacted_size = layout.size; + + header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64); + header.copy_dispatch_size[1] = 1; + header.copy_dispatch_size[2] = 1; + + header.serialization_size = + header.compacted_size + + align(sizeof(struct radv_accel_struct_serialization_header) + sizeof(uint64_t) * header.instance_count, 128); + + header.size = header.serialization_size - sizeof(struct radv_accel_struct_serialization_header) - + sizeof(uint64_t) * header.instance_count; + + header.build_flags = build_info->flags; + header.geometry_count = build_info->geometryCount; + + radv_update_buffer_cp(cmd_buffer, vk_acceleration_structure_get_va(dst) + base, (const char *)&header + base, + sizeof(header) - base); + + if (device->rra_trace.accel_structs) { + uint64_t geometry_infos_size = build_info->geometryCount * sizeof(struct radv_accel_struct_geometry_info); + + struct radv_accel_struct_geometry_info *geometry_infos = malloc(geometry_infos_size); + if (!geometry_infos) + return; + + for (uint32_t i = 0; i < build_info->geometryCount; i++) { + const VkAccelerationStructureGeometryKHR *geometry = + build_info->pGeometries ? 
&build_info->pGeometries[i] : build_info->ppGeometries[i]; + geometry_infos[i].type = geometry->geometryType; + geometry_infos[i].flags = geometry->flags; + geometry_infos[i].primitive_count = build_range_infos[i].primitiveCount; + } + + radv_CmdUpdateBuffer(commandBuffer, dst->buffer, dst->offset + layout.geometry_info_offset, geometry_infos_size, + geometry_infos); + } +} + +static void +radv_init_update_scratch(VkCommandBuffer commandBuffer, VkDeviceAddress scratch, uint32_t leaf_count, + struct vk_acceleration_structure *src_as, struct vk_acceleration_structure *dst_as) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + struct scratch_layout layout; + radv_get_scratch_layout(device, leaf_count, &layout); + + /* Prepare ready counts for internal nodes */ + radv_fill_buffer(cmd_buffer, NULL, NULL, scratch + layout.internal_ready_count_offset, + layout.update_size - layout.internal_ready_count_offset, 0x0); +} + +static void +radv_update_bind_pipeline(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + /* Wait for update scratch initialization to finish.. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_WRITE_BIT, NULL, NULL) | + radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, NULL, NULL); + + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.accel_struct_build.update_pipeline); +} + +static uint32_t +pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) +{ + uint32_t geometry_id_and_flags = geometry_id; + if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) + geometry_id_and_flags |= RADV_GEOMETRY_OPAQUE; + + return geometry_id_and_flags; +} + +static void +radv_update_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, uint32_t leaf_count, + struct vk_acceleration_structure *dst, struct vk_acceleration_structure *src) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + if (src != dst) { + VK_FROM_HANDLE(radv_buffer, src_as_buffer, src->buffer); + VK_FROM_HANDLE(radv_buffer, dst_as_buffer, dst->buffer); + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + /* Copy header/metadata */ + radv_copy_buffer(cmd_buffer, src_as_buffer->bo, dst_as_buffer->bo, src_as_buffer->offset + src->offset, + dst_as_buffer->offset + dst->offset, layout.bvh_offset); + } + + struct scratch_layout layout; + radv_get_scratch_layout(device, leaf_count, &layout); + + struct update_args update_consts = { + .src = vk_acceleration_structure_get_va(src), + .dst = vk_acceleration_structure_get_va(dst), + .leaf_bounds = build_info->scratchData.deviceAddress, + .internal_ready_count = build_info->scratchData.deviceAddress + layout.internal_ready_count_offset, + .leaf_node_count = leaf_count, + }; + + uint32_t first_id = 0; + for (uint32_t i = 0; i < build_info->geometryCount; i++) { + const VkAccelerationStructureGeometryKHR *geom = + build_info->pGeometries ? 
&build_info->pGeometries[i] : build_info->ppGeometries[i]; + + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &build_range_infos[i]; + + update_consts.geom_data = vk_fill_geometry_data(build_info->type, first_id, i, geom, build_range_info); + + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.update_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(update_consts), &update_consts); + radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); + + first_id += build_range_info->primitiveCount; + } +} + +static const struct radix_sort_vk_target_config radix_sort_config = { + .keyval_dwords = 2, + .fill.workgroup_size_log2 = 7, + .fill.block_rows = 8, + .histogram.workgroup_size_log2 = 8, + .histogram.subgroup_size_log2 = 6, + .histogram.block_rows = 14, + .prefix.workgroup_size_log2 = 8, + .prefix.subgroup_size_log2 = 6, + .scatter.workgroup_size_log2 = 8, + .scatter.subgroup_size_log2 = 6, + .scatter.block_rows = 14, +}; + +static const struct vk_acceleration_structure_build_ops build_ops = { + .get_as_size = radv_get_as_size, + .get_update_scratch_size = radv_get_update_scratch_size, + .get_encode_key[0] = radv_get_encode_key, + .get_encode_key[1] = radv_get_encode_key, + .encode_bind_pipeline[0] = radv_encode_bind_pipeline, + .encode_bind_pipeline[1] = radv_init_header_bind_pipeline, + .encode_as[0] = radv_encode_as, + .encode_as[1] = radv_init_header, + .init_update_scratch = radv_init_update_scratch, + .update_bind_pipeline[0] = radv_update_bind_pipeline, + .update_as[0] = radv_update_as, +}; + +static void +radv_write_buffer_cp(VkCommandBuffer commandBuffer, VkDeviceAddress addr, void *data, uint32_t size) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_update_buffer_cp(cmd_buffer, addr, data, size); +} + +static void +radv_flush_buffer_write_cp(VkCommandBuffer commandBuffer) +{ +} + +static void +radv_cmd_dispatch_unaligned(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_unaligned_dispatch(cmd_buffer, x, y, z); +} + +static void +radv_cmd_fill_buffer_addr(VkCommandBuffer commandBuffer, VkDeviceAddress addr, VkDeviceSize size, uint32_t data) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_fill_buffer(cmd_buffer, NULL, NULL, addr, size, data); +} + VkResult radv_device_init_accel_struct_build_state(struct radv_device *device) { @@ -533,38 +707,6 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) if (device->meta_state.accel_struct_build.radix_sort) goto exit; - result = create_build_pipeline_spv(device, leaf_always_active_spv, sizeof(leaf_always_active_spv), - sizeof(struct leaf_args), - &device->meta_state.accel_struct_build.leaf_updateable_pipeline, - &device->meta_state.accel_struct_build.leaf_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, leaf_spv, sizeof(leaf_spv), sizeof(struct leaf_args), - &device->meta_state.accel_struct_build.leaf_pipeline, - &device->meta_state.accel_struct_build.leaf_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, lbvh_main_spv, sizeof(lbvh_main_spv), sizeof(struct lbvh_main_args), - &device->meta_state.accel_struct_build.lbvh_main_pipeline, - &device->meta_state.accel_struct_build.lbvh_main_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, lbvh_generate_ir_spv, 
sizeof(lbvh_generate_ir_spv), - sizeof(struct lbvh_generate_ir_args), - &device->meta_state.accel_struct_build.lbvh_generate_ir_pipeline, - &device->meta_state.accel_struct_build.lbvh_generate_ir_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, ploc_spv, sizeof(ploc_spv), sizeof(struct ploc_args), - &device->meta_state.accel_struct_build.ploc_pipeline, - &device->meta_state.accel_struct_build.ploc_p_layout); - if (result != VK_SUCCESS) - goto exit; - result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args), &device->meta_state.accel_struct_build.encode_pipeline, &device->meta_state.accel_struct_build.encode_p_layout); @@ -584,20 +726,33 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) if (result != VK_SUCCESS) goto exit; - result = create_build_pipeline_spv(device, morton_spv, sizeof(morton_spv), sizeof(struct morton_args), - &device->meta_state.accel_struct_build.morton_pipeline, - &device->meta_state.accel_struct_build.morton_p_layout); - if (result != VK_SUCCESS) - goto exit; - result = create_build_pipeline_spv(device, update_spv, sizeof(update_spv), sizeof(struct update_args), &device->meta_state.accel_struct_build.update_pipeline, &device->meta_state.accel_struct_build.update_p_layout); if (result != VK_SUCCESS) goto exit; - device->meta_state.accel_struct_build.radix_sort = - radv_create_radix_sort_u64(radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache); + device->meta_state.accel_struct_build.radix_sort = vk_create_radix_sort_u64( + radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache, radix_sort_config); + + result = vk_meta_device_init(&device->vk, &device->meta_state.device); + if (result != VK_SUCCESS) + goto exit; + + device->meta_state.device.pipeline_cache = device->meta_state.cache; + + device->vk.as_build_ops = &build_ops; + device->vk.write_buffer_cp = radv_write_buffer_cp; + device->vk.flush_buffer_write_cp = radv_flush_buffer_write_cp; + device->vk.cmd_dispatch_unaligned = radv_cmd_dispatch_unaligned; + device->vk.cmd_fill_buffer_addr = radv_cmd_fill_buffer_addr; + + struct vk_acceleration_structure_build_args *build_args = &device->meta_state.accel_struct_build.build_args; + build_args->subgroup_size = 64; + build_args->bvh_bounds_offset = offsetof(struct radv_accel_struct_header, aabb); + build_args->emit_markers = device->sqtt.bo; + build_args->radix_sort = device->meta_state.accel_struct_build.radix_sort; + exit: mtx_unlock(&device->meta_state.mtx); return result; @@ -616,727 +771,6 @@ radv_device_init_accel_struct_copy_state(struct radv_device *device) return result; } -struct bvh_state { - uint32_t node_count; - uint32_t scratch_offset; - - uint32_t leaf_node_count; - uint32_t internal_node_count; - uint32_t leaf_node_size; - - struct acceleration_structure_layout accel_struct; - struct scratch_layout scratch; - struct build_config config; - - /* Radix sort state */ - uint32_t scatter_blocks; - uint32_t count_ru_scatter; - uint32_t histo_blocks; - uint32_t count_ru_histo; - struct rs_push_scatter push_scatter; -}; - -struct radv_bvh_batch_state { - bool any_compact; - bool any_non_compact; - bool any_ploc; - bool any_lbvh; - bool any_updateable; - bool any_non_updateable; - bool any_update; -}; - -static uint32_t -pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) -{ - uint32_t geometry_id_and_flags = geometry_id; - if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) - 
geometry_id_and_flags |= RADV_GEOMETRY_OPAQUE; - - return geometry_id_and_flags; -} - -static struct radv_bvh_geometry_data -fill_geometry_data(VkAccelerationStructureTypeKHR type, struct bvh_state *bvh_state, uint32_t geom_index, - const VkAccelerationStructureGeometryKHR *geometry, - const VkAccelerationStructureBuildRangeInfoKHR *build_range_info) -{ - struct radv_bvh_geometry_data data = { - .first_id = bvh_state->node_count, - .geometry_id = pack_geometry_id_and_flags(geom_index, geometry->flags), - .geometry_type = geometry->geometryType, - }; - - switch (geometry->geometryType) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); - - data.data = geometry->geometry.triangles.vertexData.deviceAddress + - build_range_info->firstVertex * geometry->geometry.triangles.vertexStride; - data.indices = geometry->geometry.triangles.indexData.deviceAddress; - - if (geometry->geometry.triangles.indexType == VK_INDEX_TYPE_NONE_KHR) - data.data += build_range_info->primitiveOffset; - else - data.indices += build_range_info->primitiveOffset; - - data.transform = geometry->geometry.triangles.transformData.deviceAddress; - if (data.transform) - data.transform += build_range_info->transformOffset; - - data.stride = geometry->geometry.triangles.vertexStride; - data.vertex_format = geometry->geometry.triangles.vertexFormat; - data.index_format = geometry->geometry.triangles.indexType; - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); - - data.data = geometry->geometry.aabbs.data.deviceAddress + build_range_info->primitiveOffset; - data.stride = geometry->geometry.aabbs.stride; - break; - case VK_GEOMETRY_TYPE_INSTANCES_KHR: - assert(type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR); - - data.data = geometry->geometry.instances.data.deviceAddress + build_range_info->primitiveOffset; - - if (geometry->geometry.instances.arrayOfPointers) - data.stride = 8; - else - data.stride = sizeof(VkAccelerationStructureInstanceKHR); - break; - default: - unreachable("Unknown geometryType"); - } - - return data; -} - -static void -build_leaves(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, struct bvh_state *bvh_states, - bool updateable) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "leaves"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - updateable ? 
device->meta_state.accel_struct_build.leaf_updateable_pipeline - : device->meta_state.accel_struct_build.leaf_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - if (bvh_states[i].config.updateable != updateable) - continue; - - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - - struct leaf_args leaf_consts = { - .ir = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .bvh = vk_acceleration_structure_get_va(accel_struct) + bvh_states[i].accel_struct.leaf_nodes_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], - }; - - for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geom = - pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; - - const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; - - leaf_consts.geom_data = fill_geometry_data(pInfos[i].type, &bvh_states[i], j, geom, build_range_info); - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.leaf_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(leaf_consts), &leaf_consts); - radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); - - bvh_states[i].leaf_node_count += build_range_info->primitiveCount; - bvh_states[i].node_count += build_range_info->primitiveCount; - } - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -morton_generate(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "morton"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.morton_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - const struct morton_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.morton_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - radv_unaligned_dispatch(cmd_buffer, bvh_states[i].node_count, 1, 1); - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); - - cmd_buffer->state.flush_bits |= flush_bits; -} - -static void -morton_sort(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) -{ - /* Copyright 2019 The Fuchsia Authors. 
*/ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "sort"); - - radix_sort_vk_t *rs = device->meta_state.accel_struct_build.radix_sort; - - /* - * OVERVIEW - * - * 1. Pad the keyvals in `scatter_even`. - * 2. Zero the `histograms` and `partitions`. - * --- BARRIER --- - * 3. HISTOGRAM is dispatched before PREFIX. - * --- BARRIER --- - * 4. PREFIX is dispatched before the first SCATTER. - * --- BARRIER --- - * 5. One or more SCATTER dispatches. - * - * Note that the `partitions` buffer can be zeroed anytime before the first - * scatter. - */ - - /* How many passes? */ - uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t); - uint32_t keyval_bits = keyval_bytes * 8; - uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits); - uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2; - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].node_count) - bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1]; - else - bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[0]; - } - - /* - * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS - * - * Pad fractional blocks with max-valued keyvals. - * - * Zero the histograms and partitions buffer. - * - * This assumes the partitions follow the histograms. - */ - - /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */ - - /* How many scatter blocks? */ - uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2; - uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows; - - /* - * How many histogram blocks? - * - * Note that it's OK to have more max-valued digits counted by the histogram - * than sorted by the scatters because the sort is stable. - */ - uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2; - uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows; - - uint32_t pass_idx = (keyval_bytes - passes); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - bvh_states[i].scatter_blocks = (bvh_states[i].node_count + scatter_block_kvs - 1) / scatter_block_kvs; - bvh_states[i].count_ru_scatter = bvh_states[i].scatter_blocks * scatter_block_kvs; - - bvh_states[i].histo_blocks = (bvh_states[i].count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs; - bvh_states[i].count_ru_histo = bvh_states[i].histo_blocks * histo_block_kvs; - - /* Fill with max values */ - if (bvh_states[i].count_ru_histo > bvh_states[i].node_count) { - radv_fill_buffer(cmd_buffer, NULL, NULL, keyvals_even_addr + bvh_states[i].node_count * keyval_bytes, - (bvh_states[i].count_ru_histo - bvh_states[i].node_count) * keyval_bytes, 0xFFFFFFFF); - } - - /* - * Zero histograms and invalidate partitions. - * - * Note that the partition invalidation only needs to be performed once - * because the even/odd scatter dispatches rely on the the previous pass to - * leave the partitions in an invalid state. - * - * Note that the last workgroup doesn't read/write a partition so it doesn't - * need to be initialized. 
- */ - uint32_t histo_partition_count = passes + bvh_states[i].scatter_blocks - 1; - - uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); - - radv_fill_buffer(cmd_buffer, NULL, NULL, internal_addr + rs->internal.histograms.offset + fill_base, - histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)), 0); - } - - /* - * Pipeline: HISTOGRAM - * - * TODO(allanmac): All subgroups should try to process approximately the same - * number of blocks in order to minimize tail effects. This was implemented - * and reverted but should be reimplemented and benchmarked later. - */ - vk_barrier_transfer_w_to_compute_r(commandBuffer); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - rs->pipelines.named.histogram); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - /* Dispatch histogram */ - struct rs_push_histogram push_histogram = { - .devaddr_histograms = internal_addr + rs->internal.histograms.offset, - .devaddr_keyvals = keyvals_even_addr, - .passes = passes, - }; - - vk_common_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(push_histogram), &push_histogram); - - vk_common_CmdDispatch(commandBuffer, bvh_states[i].histo_blocks, 1, 1); - } - - /* - * Pipeline: PREFIX - * - * Launch one workgroup per pass. - */ - vk_barrier_compute_w_to_compute_r(commandBuffer); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - struct rs_push_prefix push_prefix = { - .devaddr_histograms = internal_addr + rs->internal.histograms.offset, - }; - - vk_common_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(push_prefix), &push_prefix); - - vk_common_CmdDispatch(commandBuffer, passes, 1, 1); - } - - /* Pipeline: SCATTER */ - vk_barrier_compute_w_to_compute_r(commandBuffer); - - uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); - - for (uint32_t i = 0; i < infoCount; i++) { - uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; - uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - bvh_states[i].push_scatter = (struct rs_push_scatter){ - .devaddr_keyvals_even = keyvals_even_addr, - .devaddr_keyvals_odd = keyvals_odd_addr, - .devaddr_partitions = internal_addr + rs->internal.partitions.offset, - .devaddr_histograms = internal_addr + rs->internal.histograms.offset + histogram_offset, - }; - } - - bool is_even = true; - - while (true) { - uint32_t pass_dword = pass_idx / 4; - - /* Bind new pipeline */ - VkPipeline p = - is_even ? 
rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); - - /* Update push constants that changed */ - VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even - : rs->pipeline_layouts.named.scatter[pass_dword].odd; - - for (uint32_t i = 0; i < infoCount; i++) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - bvh_states[i].push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; - - vk_common_CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct rs_push_scatter), - &bvh_states[i].push_scatter); - - vk_common_CmdDispatch(commandBuffer, bvh_states[i].scatter_blocks, 1, 1); - - bvh_states[i].push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); - } - - /* Continue? */ - if (++pass_idx >= keyval_bytes) - break; - - vk_barrier_compute_w_to_compute_r(commandBuffer); - - is_even ^= true; - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); - - cmd_buffer->state.flush_bits |= flush_bits; -} - -static void -lbvh_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "lbvh"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.lbvh_main_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) - continue; - - uint32_t src_scratch_offset = bvh_states[i].scratch_offset; - uint32_t internal_node_count = MAX2(bvh_states[i].node_count, 2) - 1; - - const struct lbvh_main_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .src_ids = pInfos[i].scratchData.deviceAddress + src_scratch_offset, - .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, - .id_count = bvh_states[i].node_count, - .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.lbvh_main_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - radv_unaligned_dispatch(cmd_buffer, internal_node_count, 1, 1); - bvh_states[i].node_count = internal_node_count; - bvh_states[i].internal_node_count = internal_node_count; - } - - cmd_buffer->state.flush_bits |= flush_bits; - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.lbvh_generate_ir_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) - continue; - - const struct lbvh_generate_ir_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .internal_node_base = bvh_states[i].scratch.internal_node_offset - 
bvh_states[i].scratch.ir_offset, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.lbvh_generate_ir_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - radv_unaligned_dispatch(cmd_buffer, bvh_states[i].internal_node_count, 1, 1); - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -ploc_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "ploc"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.ploc_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_PLOC) - continue; - - uint32_t src_scratch_offset = bvh_states[i].scratch_offset; - uint32_t dst_scratch_offset = (src_scratch_offset == bvh_states[i].scratch.sort_buffer_offset[0]) - ? bvh_states[i].scratch.sort_buffer_offset[1] - : bvh_states[i].scratch.sort_buffer_offset[0]; - - const struct ploc_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .ids_0 = pInfos[i].scratchData.deviceAddress + src_scratch_offset, - .ids_1 = pInfos[i].scratchData.deviceAddress + dst_scratch_offset, - .prefix_scan_partitions = - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ploc_prefix_sum_partition_offset, - .internal_node_offset = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.ploc_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - vk_common_CmdDispatch(commandBuffer, MAX2(DIV_ROUND_UP(bvh_states[i].node_count, PLOC_WORKGROUP_SIZE), 1), 1, 1); - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -encode_nodes(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, bool compact) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "encode"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - compact ? device->meta_state.accel_struct_build.encode_compact_pipeline - : device->meta_state.accel_struct_build.encode_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (compact != bvh_states[i].config.compact) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - - VkGeometryTypeKHR geometry_type = VK_GEOMETRY_TYPE_TRIANGLES_KHR; - - /* If the geometry count is 0, then the size does not matter - * because it will be multiplied with 0. - */ - if (pInfos[i].geometryCount) - geometry_type = - pInfos[i].pGeometries ? 
pInfos[i].pGeometries[0].geometryType : pInfos[i].ppGeometries[0]->geometryType; - - if (bvh_states[i].config.compact) { - uint32_t dst_offset = bvh_states[i].accel_struct.internal_nodes_offset - bvh_states[i].accel_struct.bvh_offset; - radv_update_buffer_cp(cmd_buffer, - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset + - offsetof(struct radv_ir_header, dst_node_offset), - &dst_offset, sizeof(uint32_t)); - } - - const struct encode_args args = { - .intermediate_bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .output_bvh = vk_acceleration_structure_get_va(accel_struct) + bvh_states[i].accel_struct.bvh_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .output_bvh_offset = bvh_states[i].accel_struct.bvh_offset, - .leaf_node_count = bvh_states[i].leaf_node_count, - .geometry_type = geometry_type, - }; - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); - - struct radv_dispatch_info dispatch = { - .unaligned = true, - .ordered = true, - .va = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset + - offsetof(struct radv_ir_header, ir_internal_node_count), - }; - - radv_compute_dispatch(cmd_buffer, &dispatch); - } - /* This is the final access to the leaf nodes, no need to flush */ - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -init_header(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - struct radv_bvh_batch_state *batch_state) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - if (batch_state->any_compact) { - radv_write_user_event_marker(cmd_buffer, UserEventPush, "header"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.header_pipeline); - } - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - size_t base = offsetof(struct radv_accel_struct_header, compacted_size); - - uint64_t instance_count = - pInfos[i].type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR ? 
bvh_states[i].leaf_node_count : 0; - - if (bvh_states[i].config.compact) { - base = offsetof(struct radv_accel_struct_header, geometry_count); - - struct header_args args = { - .src = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .dst = vk_acceleration_structure_get_va(accel_struct), - .bvh_offset = bvh_states[i].accel_struct.bvh_offset, - .instance_count = instance_count, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.header_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); - - radv_unaligned_dispatch(cmd_buffer, 1, 1, 1); - } - - struct radv_accel_struct_header header; - - header.instance_offset = bvh_states[i].accel_struct.bvh_offset + sizeof(struct radv_bvh_box32_node); - header.instance_count = instance_count; - header.compacted_size = bvh_states[i].accel_struct.size; - - header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64); - header.copy_dispatch_size[1] = 1; - header.copy_dispatch_size[2] = 1; - - header.serialization_size = - header.compacted_size + - align(sizeof(struct radv_accel_struct_serialization_header) + sizeof(uint64_t) * header.instance_count, 128); - - header.size = header.serialization_size - sizeof(struct radv_accel_struct_serialization_header) - - sizeof(uint64_t) * header.instance_count; - - header.build_flags = pInfos[i].flags; - header.geometry_count = pInfos[i].geometryCount; - - radv_update_buffer_cp(cmd_buffer, vk_acceleration_structure_get_va(accel_struct) + base, - (const char *)&header + base, sizeof(header) - base); - } - - if (batch_state->any_compact) - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -init_geometry_infos(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos) -{ - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - - uint64_t geometry_infos_size = pInfos[i].geometryCount * sizeof(struct radv_accel_struct_geometry_info); - - struct radv_accel_struct_geometry_info *geometry_infos = malloc(geometry_infos_size); - if (!geometry_infos) - continue; - - for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geometry = - pInfos[i].pGeometries ? 
pInfos[i].pGeometries + j : pInfos[i].ppGeometries[j]; - geometry_infos[j].type = geometry->geometryType; - geometry_infos[j].flags = geometry->flags; - geometry_infos[j].primitive_count = ppBuildRangeInfos[i][j].primitiveCount; - } - - radv_CmdUpdateBuffer(commandBuffer, accel_struct->buffer, - accel_struct->offset + bvh_states[i].accel_struct.geometry_info_offset, geometry_infos_size, - geometry_infos); - - free(geometry_infos); - } -} - -static void -update(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, struct bvh_state *bvh_states) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "update"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.update_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint32_t leaf_node_count = 0; - for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { - leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; - } - - VK_FROM_HANDLE(vk_acceleration_structure, src_bvh, pInfos[i].srcAccelerationStructure); - VK_FROM_HANDLE(vk_acceleration_structure, dst_bvh, pInfos[i].dstAccelerationStructure); - struct update_args update_consts = { - .src = vk_acceleration_structure_get_va(src_bvh), - .dst = vk_acceleration_structure_get_va(dst_bvh), - .leaf_bounds = pInfos[i].scratchData.deviceAddress, - .internal_ready_count = - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.internal_ready_count_offset, - .leaf_node_count = leaf_node_count, - }; - - for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geom = - pInfos[i].pGeometries ? 
&pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; - - const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; - - update_consts.geom_data = fill_geometry_data(pInfos[i].type, &bvh_states[i], j, geom, build_range_info); - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.update_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(update_consts), &update_consts); - radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); - - bvh_states[i].leaf_node_count += build_range_info->primitiveCount; - bvh_states[i].node_count += build_range_info->primitiveCount; - } - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - VKAPI_ATTR void VKAPI_CALL radv_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, @@ -1352,132 +786,14 @@ radv_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t i return; } - enum radv_cmd_flush_bits flush_bits = RADV_CMD_FLAG_CS_PARTIAL_FLUSH | - radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_2_SHADER_WRITE_BIT, NULL, NULL) | - radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_2_SHADER_READ_BIT, NULL, NULL); - radv_meta_save(&saved_state, cmd_buffer, RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); - struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state)); - - radv_describe_begin_accel_struct_build(cmd_buffer, infoCount); - - struct radv_bvh_batch_state batch_state = {0}; - - for (uint32_t i = 0; i < infoCount; ++i) { - uint32_t leaf_node_count = 0; - for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { - leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; - } - - get_build_layout(device, leaf_node_count, pInfos + i, &bvh_states[i].accel_struct, &bvh_states[i].scratch); - - struct build_config config = build_config(leaf_node_count, pInfos + i); - bvh_states[i].config = config; - - if (config.compact) - batch_state.any_compact = true; - else - batch_state.any_non_compact = true; - - if (config.updateable) - batch_state.any_updateable = true; - else - batch_state.any_non_updateable = true; - - if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) { - batch_state.any_ploc = true; - } else if (config.internal_type == INTERNAL_BUILD_TYPE_LBVH) { - batch_state.any_lbvh = true; - } else if (config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) { - batch_state.any_update = true; - } else { - unreachable("Unknown internal_build_type"); - } - - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) { - /* The internal node count is updated in lbvh_build_internal for LBVH - * and from the PLOC shader for PLOC. 
*/ - struct radv_ir_header header = { - .min_bounds = {0x7fffffff, 0x7fffffff, 0x7fffffff}, - .max_bounds = {0x80000000, 0x80000000, 0x80000000}, - .dispatch_size_y = 1, - .dispatch_size_z = 1, - .sync_data = - { - .current_phase_end_counter = TASK_INDEX_INVALID, - /* Will be updated by the first PLOC shader invocation */ - .task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID}, - }, - }; - - radv_update_buffer_cp(cmd_buffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - &header, sizeof(header)); - } else { - /* Prepare ready counts for internal nodes */ - radv_fill_buffer(cmd_buffer, NULL, NULL, - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.internal_ready_count_offset, - bvh_states[i].scratch.update_size - bvh_states[i].scratch.internal_ready_count_offset, 0x0); - if (pInfos[i].srcAccelerationStructure != pInfos[i].dstAccelerationStructure) { - VK_FROM_HANDLE(vk_acceleration_structure, src_as, pInfos[i].srcAccelerationStructure); - VK_FROM_HANDLE(vk_acceleration_structure, dst_as, pInfos[i].dstAccelerationStructure); - - VK_FROM_HANDLE(radv_buffer, src_as_buffer, src_as->buffer); - VK_FROM_HANDLE(radv_buffer, dst_as_buffer, dst_as->buffer); - - /* Copy header/metadata */ - radv_copy_buffer(cmd_buffer, src_as_buffer->bo, dst_as_buffer->bo, src_as_buffer->offset + src_as->offset, - dst_as_buffer->offset + dst_as->offset, bvh_states[i].accel_struct.bvh_offset); - } - } - } cmd_buffer->state.current_event_type = EventInternalUnknown; - if (batch_state.any_lbvh || batch_state.any_ploc) { - if (batch_state.any_non_updateable) - build_leaves(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states, false); - if (batch_state.any_updateable) - build_leaves(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states, true); + vk_cmd_build_acceleration_structures(commandBuffer, &device->vk, &device->meta_state.device, infoCount, pInfos, + ppBuildRangeInfos, &device->meta_state.accel_struct_build.build_args); - cmd_buffer->state.flush_bits |= flush_bits; - - morton_generate(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); - - morton_sort(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); - - cmd_buffer->state.flush_bits |= flush_bits; - - if (batch_state.any_lbvh) - lbvh_build_internal(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); - - if (batch_state.any_ploc) - ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states); - - cmd_buffer->state.flush_bits |= flush_bits; - - if (batch_state.any_non_compact) - encode_nodes(commandBuffer, infoCount, pInfos, bvh_states, false); - - if (batch_state.any_compact) - encode_nodes(commandBuffer, infoCount, pInfos, bvh_states, true); - - cmd_buffer->state.flush_bits |= flush_bits; - } - - init_header(commandBuffer, infoCount, pInfos, bvh_states, &batch_state); - - if (device->rra_trace.accel_structs) - init_geometry_infos(commandBuffer, infoCount, pInfos, bvh_states, ppBuildRangeInfos); - - if (batch_state.any_update) - update(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states); - - radv_describe_end_accel_struct_build(cmd_buffer); - - free(bvh_states); radv_meta_restore(&saved_state, cmd_buffer); } diff --git a/src/amd/vulkan/radv_device.h b/src/amd/vulkan/radv_device.h index e6e432e4771..dbd7f961c38 100644 --- a/src/amd/vulkan/radv_device.h +++ b/src/amd/vulkan/radv_device.h @@ -24,7 +24,9 @@ #include "radv_rra.h" #include "radv_shader.h" +#include "vk_acceleration_structure.h" #include "vk_device.h" +#include "vk_meta.h" #include 
"vk_texcompress_astc.h" #include "vk_texcompress_etc2.h" @@ -302,17 +304,6 @@ struct radv_meta_state { } dcc_retile; struct { - VkPipelineLayout leaf_p_layout; - VkPipeline leaf_pipeline; - VkPipeline leaf_updateable_pipeline; - VkPipelineLayout morton_p_layout; - VkPipeline morton_pipeline; - VkPipelineLayout lbvh_main_p_layout; - VkPipeline lbvh_main_pipeline; - VkPipelineLayout lbvh_generate_ir_p_layout; - VkPipeline lbvh_generate_ir_pipeline; - VkPipelineLayout ploc_p_layout; - VkPipeline ploc_pipeline; VkPipelineLayout encode_p_layout; VkPipeline encode_pipeline; VkPipeline encode_compact_pipeline; @@ -324,6 +315,7 @@ struct radv_meta_state { VkPipeline copy_pipeline; struct radix_sort_vk *radix_sort; + struct vk_acceleration_structure_build_args build_args; struct { VkBuffer buffer; @@ -340,6 +332,8 @@ struct radv_meta_state { VkDescriptorSetLayout ds_layout; VkPipelineLayout p_layout; } dgc_prepare; + + struct vk_meta_device device; }; struct radv_memory_trace_data { diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c index 79e7802915d..c7adbf20e81 100644 --- a/src/amd/vulkan/radv_rra.c +++ b/src/amd/vulkan/radv_rra.c @@ -542,7 +542,7 @@ rra_transcode_triangle_node(struct rra_transcoding_context *ctx, const struct ra } static void -rra_transcode_aabb_node(struct rra_transcoding_context *ctx, const struct radv_bvh_aabb_node *src, radv_aabb bounds) +rra_transcode_aabb_node(struct rra_transcoding_context *ctx, const struct radv_bvh_aabb_node *src, vk_aabb bounds) { struct rra_aabb_node *dst = (struct rra_aabb_node *)(ctx->dst + ctx->dst_leaf_offset); ctx->dst_leaf_offset += sizeof(struct rra_aabb_node); @@ -580,7 +580,7 @@ rra_transcode_instance_node(struct rra_transcoding_context *ctx, const struct ra } static uint32_t rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, - radv_aabb bounds); + vk_aabb bounds); static void rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_bvh_box16_node *src) @@ -597,7 +597,7 @@ rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_ continue; } - radv_aabb bounds = { + vk_aabb bounds = { .min = { _mesa_half_to_float(src->coords[i][0][0]), @@ -653,7 +653,7 @@ get_geometry_id(const void *node, uint32_t node_type) } static uint32_t -rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, radv_aabb bounds) +rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, vk_aabb bounds) { uint32_t node_type = src_id & 7; uint32_t src_offset = (src_id & (~7u)) << 3; diff --git a/src/amd/vulkan/bvh/lbvh_generate_ir.comp b/src/vulkan/runtime/bvh/lbvh_generate_ir.comp similarity index 58% rename from src/amd/vulkan/bvh/lbvh_generate_ir.comp rename to src/vulkan/runtime/bvh/lbvh_generate_ir.comp index 18821d13a79..818e568b4c1 100644 --- a/src/amd/vulkan/bvh/lbvh_generate_ir.comp +++ b/src/vulkan/runtime/bvh/lbvh_generate_ir.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Bas Nieuwenhuizen * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above 
copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #version 460 @@ -18,9 +35,9 @@ #extension GL_EXT_buffer_reference2 : require #extension GL_KHR_memory_scope_semantics : require -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +#include "vk_build_interface.h" -#include "build_interface.h" +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; TYPE(lbvh_node_info, 4); @@ -36,8 +53,8 @@ main(void) uint32_t idx = global_id; - uint32_t previous_id = RADV_BVH_INVALID_NODE; - radv_aabb previous_bounds; + uint32_t previous_id = VK_BVH_INVALID_NODE; + vk_aabb previous_bounds; previous_bounds.min = vec3(INFINITY); previous_bounds.max = vec3(-INFINITY); @@ -58,13 +75,13 @@ main(void) * parents, which is a requirement of the encoder. */ uint32_t dst_idx = - atomicAdd(DEREF(REF(radv_ir_header)(args.header)).ir_internal_node_count, 1); + atomicAdd(DEREF(REF(vk_ir_header)(args.header)).ir_internal_node_count, 1); - uint32_t current_offset = args.internal_node_base + dst_idx * SIZEOF(radv_ir_box_node); - uint32_t current_id = pack_ir_node_id(current_offset, radv_ir_node_internal); + uint32_t current_offset = args.internal_node_base + dst_idx * SIZEOF(vk_ir_box_node); + uint32_t current_id = pack_ir_node_id(current_offset, vk_ir_node_internal); - REF(radv_ir_box_node) node = REF(radv_ir_box_node)(OFFSET(args.bvh, current_offset)); - radv_aabb bounds = previous_bounds; + REF(vk_ir_box_node) node = REF(vk_ir_box_node)(OFFSET(args.bvh, current_offset)); + vk_aabb bounds = previous_bounds; lbvh_node_info info = DEREF(INDEX(lbvh_node_info, args.node_info, idx)); @@ -78,10 +95,10 @@ main(void) previous_child_index = 1; if (previous_child_index == -1) { - if (children[0] != RADV_BVH_INVALID_NODE) { + if (children[0] != VK_BVH_INVALID_NODE) { uint32_t child_offset = ir_id_to_offset(children[0]); - REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset)); - radv_aabb child_bounds = DEREF(child).aabb; + REF(vk_ir_node) child = REF(vk_ir_node)(OFFSET(args.bvh, child_offset)); + vk_aabb child_bounds = DEREF(child).aabb; bounds.min = min(bounds.min, child_bounds.min); bounds.max = max(bounds.max, child_bounds.max); } @@ -89,23 +106,23 @@ main(void) } /* Fetch the non-cached child */ - if (children[1 - previous_child_index] != RADV_BVH_INVALID_NODE) { + if (children[1 - previous_child_index] != VK_BVH_INVALID_NODE) { uint32_t child_offset = ir_id_to_offset(children[1 - previous_child_index]); - REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset)); - radv_aabb child_bounds = DEREF(child).aabb; + REF(vk_ir_node) child = REF(vk_ir_node)(OFFSET(args.bvh, child_offset)); + vk_aabb child_bounds = DEREF(child).aabb; bounds.min = min(bounds.min, child_bounds.min); bounds.max = max(bounds.max, child_bounds.max); } - radv_ir_box_node node_value; + vk_ir_box_node node_value; node_value.base.aabb = bounds; - node_value.bvh_offset = 
RADV_UNKNOWN_BVH_OFFSET; + node_value.bvh_offset = VK_UNKNOWN_BVH_OFFSET; node_value.children = children; DEREF(node) = node_value; - if (info.parent == RADV_BVH_INVALID_NODE) + if (info.parent == VK_BVH_INVALID_NODE) break; idx = info.parent & ~LBVH_RIGHT_CHILD_BIT; diff --git a/src/amd/vulkan/bvh/lbvh_main.comp b/src/vulkan/runtime/bvh/lbvh_main.comp similarity index 76% rename from src/amd/vulkan/bvh/lbvh_main.comp rename to src/vulkan/runtime/bvh/lbvh_main.comp index c6c51280985..c79a3164eb9 100644 --- a/src/amd/vulkan/bvh/lbvh_main.comp +++ b/src/vulkan/runtime/bvh/lbvh_main.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Bas Nieuwenhuizen * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #version 460 @@ -17,9 +34,9 @@ #extension GL_EXT_buffer_reference : require #extension GL_EXT_buffer_reference2 : require -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +#include "vk_build_interface.h" -#include "build_interface.h" +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; TYPE(lbvh_node_info, 4); @@ -74,11 +91,11 @@ main() { if (args.id_count <= 1) { REF(lbvh_node_info) dst = REF(lbvh_node_info)(args.node_info); - DEREF(dst).parent = RADV_BVH_INVALID_NODE; + DEREF(dst).parent = VK_BVH_INVALID_NODE; DEREF(dst).path_count = 2; DEREF(dst).children[0] = - args.id_count == 1 ? DEREF(INDEX(key_id_pair, args.src_ids, 0)).id : RADV_BVH_INVALID_NODE; - DEREF(dst).children[1] = RADV_BVH_INVALID_NODE; + args.id_count == 1 ? 
DEREF(INDEX(key_id_pair, args.src_ids, 0)).id : VK_BVH_INVALID_NODE; + DEREF(dst).children[1] = VK_BVH_INVALID_NODE; return; } @@ -136,5 +153,5 @@ main() DEREF(dst).children[0] = DEREF(INDEX(key_id_pair, args.src_ids, left)).id; DEREF(dst).children[1] = DEREF(INDEX(key_id_pair, args.src_ids, right)).id; if (id == 0) - DEREF(dst).parent = RADV_BVH_INVALID_NODE; + DEREF(dst).parent = VK_BVH_INVALID_NODE; } diff --git a/src/vulkan/runtime/bvh/leaf.comp b/src/vulkan/runtime/bvh/leaf.comp new file mode 100644 index 00000000000..85f0756204a --- /dev/null +++ b/src/vulkan/runtime/bvh/leaf.comp @@ -0,0 +1,250 @@ +/* + * Copyright © 2022 Konstantin Seurer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require + +#include "vk_build_interface.h" + +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; + +layout(push_constant) uniform CONSTS { + leaf_args args; +}; + +/* A GLSL-adapted copy of VkAccelerationStructureInstanceKHR. */ +struct AccelerationStructureInstance { + mat3x4 transform; + uint32_t custom_instance_and_mask; + uint32_t sbt_offset_and_flags; + uint64_t accelerationStructureReference; +}; +TYPE(AccelerationStructureInstance, 8); + +bool +build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id) +{ + bool is_valid = true; + triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id); + + triangle_vertices vertices = load_vertices(geom_data.data, indices, geom_data.vertex_format, geom_data.stride); + + /* An inactive triangle is one for which the first (X) component of any vertex is NaN. If any + * other vertex component is NaN, and the first is not, the behavior is undefined. 
If the vertex + * format does not have a NaN representation, then all triangles are considered active. + */ + if (isnan(vertices.vertex[0].x) || isnan(vertices.vertex[1].x) || isnan(vertices.vertex[2].x)) +#if ALWAYS_ACTIVE + is_valid = false; +#else + return false; +#endif + + if (geom_data.transform != NULL) { + mat4 transform = mat4(1.0); + + for (uint32_t col = 0; col < 4; col++) + for (uint32_t row = 0; row < 3; row++) + transform[col][row] = DEREF(INDEX(float, geom_data.transform, col + row * 4)); + + for (uint32_t i = 0; i < 3; i++) + vertices.vertex[i] = transform * vertices.vertex[i]; + } + + REF(vk_ir_triangle_node) node = REF(vk_ir_triangle_node)(dst_ptr); + + bounds.min = vec3(INFINITY); + bounds.max = vec3(-INFINITY); + + for (uint32_t coord = 0; coord < 3; coord++) + for (uint32_t comp = 0; comp < 3; comp++) { + DEREF(node).coords[coord][comp] = vertices.vertex[coord][comp]; + bounds.min[comp] = min(bounds.min[comp], vertices.vertex[coord][comp]); + bounds.max[comp] = max(bounds.max[comp], vertices.vertex[coord][comp]); + } + + DEREF(node).base.aabb = bounds; + DEREF(node).triangle_id = global_id; + DEREF(node).geometry_id_and_flags = geom_data.geometry_id; + DEREF(node).id = 9; + + return is_valid; +} + +bool +build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) +{ + bool is_valid = true; + REF(vk_ir_aabb_node) node = REF(vk_ir_aabb_node)(dst_ptr); + + for (uint32_t vec = 0; vec < 2; vec++) + for (uint32_t comp = 0; comp < 3; comp++) { + float coord = DEREF(INDEX(float, src_ptr, comp + vec * 3)); + + if (vec == 0) + bounds.min[comp] = coord; + else + bounds.max[comp] = coord; + } + + /* An inactive AABB is one for which the minimum X coordinate is NaN. If any other component is + * NaN, and the first is not, the behavior is undefined. + */ + if (isnan(bounds.min.x)) +#if ALWAYS_ACTIVE + is_valid = false; +#else + return false; +#endif + + DEREF(node).base.aabb = bounds; + DEREF(node).primitive_id = global_id; + DEREF(node).geometry_id_and_flags = geometry_id; + + return is_valid; +} + +vk_aabb +calculate_instance_node_bounds(uint64_t base_ptr, mat3x4 otw_matrix) +{ + vk_aabb aabb; + + vk_aabb blas_aabb = DEREF(REF(vk_aabb)(base_ptr + BVH_BOUNDS_OFFSET)); + + for (uint32_t comp = 0; comp < 3; ++comp) { + aabb.min[comp] = otw_matrix[comp][3]; + aabb.max[comp] = otw_matrix[comp][3]; + for (uint32_t col = 0; col < 3; ++col) { + aabb.min[comp] += + min(otw_matrix[comp][col] * blas_aabb.min[col], otw_matrix[comp][col] * blas_aabb.max[col]); + aabb.max[comp] += + max(otw_matrix[comp][col] * blas_aabb.min[col], otw_matrix[comp][col] * blas_aabb.max[col]); + } + } + return aabb; +} + +bool +build_instance(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id) +{ + REF(vk_ir_instance_node) node = REF(vk_ir_instance_node)(dst_ptr); + + AccelerationStructureInstance instance = DEREF(REF(AccelerationStructureInstance)(src_ptr)); + + /* An inactive instance is one whose acceleration structure handle is VK_NULL_HANDLE. Since the active terminology is + * only relevant for BVH updates, which we do not implement, we can also skip instances with mask == 0. 
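+ * The mask occupies the top 8 bits of custom_instance_and_mask (instanceCustomIndex uses the low 24 bits), so any value below (1u << 24u) means the mask is 0.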
+ */ + if (instance.accelerationStructureReference == 0 || instance.custom_instance_and_mask < (1u << 24u)) + return false; + + DEREF(node).base_ptr = instance.accelerationStructureReference; + + mat4 transform = mat4(instance.transform); + DEREF(node).otw_matrix = mat3x4(transform); + + bounds = calculate_instance_node_bounds(instance.accelerationStructureReference, mat3x4(transform)); + + DEREF(node).base.aabb = bounds; + DEREF(node).custom_instance_and_mask = instance.custom_instance_and_mask; + DEREF(node).sbt_offset_and_flags = instance.sbt_offset_and_flags; + DEREF(node).instance_id = global_id; + + return true; +} + +void +main(void) +{ + uint32_t global_id = gl_GlobalInvocationID.x; + uint32_t primitive_id = args.geom_data.first_id + global_id; + + REF(key_id_pair) id_ptr = INDEX(key_id_pair, args.ids, primitive_id); + uint32_t src_offset = global_id * args.geom_data.stride; + + uint32_t dst_stride; + uint32_t node_type; + if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { + dst_stride = SIZEOF(vk_ir_triangle_node); + node_type = vk_ir_node_triangle; + } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { + dst_stride = SIZEOF(vk_ir_aabb_node); + node_type = vk_ir_node_aabb; + } else { + dst_stride = SIZEOF(vk_ir_instance_node); + node_type = vk_ir_node_instance; + } + + uint32_t dst_offset = primitive_id * dst_stride; + VOID_REF dst_ptr = OFFSET(args.bvh, dst_offset); + + vk_aabb bounds; + bool is_active; + if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { + is_active = build_triangle(bounds, dst_ptr, args.geom_data, global_id); + } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { + VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); + is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, global_id); + } else { + VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); + /* arrayOfPointers */ + if (args.geom_data.stride == 8) { + src_ptr = DEREF(REF(VOID_REF)(src_ptr)); + } + + is_active = build_instance(bounds, src_ptr, dst_ptr, global_id); + } + +#if ALWAYS_ACTIVE + if (!is_active && args.geom_data.geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) { + bounds.min = vec3(0.0); + bounds.max = vec3(0.0); + is_active = true; + } +#endif + + DEREF(id_ptr).id = is_active ? 
pack_ir_node_id(dst_offset, node_type) : VK_BVH_INVALID_NODE; + + uvec4 ballot = subgroupBallot(is_active); + if (subgroupElect()) + atomicAdd(DEREF(args.header).active_leaf_count, subgroupBallotBitCount(ballot)); + + atomicMin(DEREF(args.header).min_bounds[0], to_emulated_float(bounds.min.x)); + atomicMin(DEREF(args.header).min_bounds[1], to_emulated_float(bounds.min.y)); + atomicMin(DEREF(args.header).min_bounds[2], to_emulated_float(bounds.min.z)); + atomicMax(DEREF(args.header).max_bounds[0], to_emulated_float(bounds.max.x)); + atomicMax(DEREF(args.header).max_bounds[1], to_emulated_float(bounds.max.y)); + atomicMax(DEREF(args.header).max_bounds[2], to_emulated_float(bounds.max.z)); +} diff --git a/src/vulkan/runtime/bvh/meson.build b/src/vulkan/runtime/bvh/meson.build new file mode 100644 index 00000000000..a2d751c295c --- /dev/null +++ b/src/vulkan/runtime/bvh/meson.build @@ -0,0 +1,81 @@ +# Copyright © 2022 Konstantin Seurer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# source file, output name, defines +bvh_shaders = [ + [ + 'lbvh_generate_ir.comp', + 'lbvh_generate_ir', + [], + ], + [ + 'lbvh_main.comp', + 'lbvh_main', + [], + ], + [ + 'leaf.comp', + 'leaf', + ['ALWAYS_ACTIVE=0'], + ], + [ + 'leaf.comp', + 'leaf_always_active', + ['ALWAYS_ACTIVE=1'], + ], + [ + 'morton.comp', + 'morton', + [], + ], + [ + 'ploc_internal.comp', + 'ploc_internal', + [], + ], +] + +vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh' + +vk_bvh_includes = files( + 'vk_build_helpers.h', + 'vk_build_interface.h', + 'vk_bvh.h', +) + +bvh_spv = [] +foreach s : bvh_shaders + command = [ + prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + ] + (with_mesa_debug ? 
['-g'] : []) + command += glslang_quiet + + foreach define : s[2] + command += '-D' + define + endforeach + + bvh_spv += custom_target( + s[1] + '.spv.h', + input : s[0], + output : s[1] + '.spv.h', + command : command, + depend_files: vk_bvh_includes + ) +endforeach diff --git a/src/amd/vulkan/bvh/morton.comp b/src/vulkan/runtime/bvh/morton.comp similarity index 62% rename from src/amd/vulkan/bvh/morton.comp rename to src/vulkan/runtime/bvh/morton.comp index f795297a11c..75a6f15baf3 100644 --- a/src/amd/vulkan/bvh/morton.comp +++ b/src/vulkan/runtime/bvh/morton.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Konstantin Seurer * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
*/ #version 460 @@ -17,9 +34,9 @@ #extension GL_EXT_buffer_reference : require #extension GL_EXT_buffer_reference2 : require -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +#include "vk_build_interface.h" -#include "build_interface.h" +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; layout(push_constant) uniform CONSTS { morton_args args; @@ -56,11 +73,11 @@ main(void) uint32_t id = DEREF(key_id).id; uint32_t key; - if (id != RADV_BVH_INVALID_NODE) { - radv_aabb bounds = DEREF(REF(radv_ir_node)OFFSET(args.bvh, ir_id_to_offset(id))).aabb; + if (id != VK_BVH_INVALID_NODE) { + vk_aabb bounds = DEREF(REF(vk_ir_node)OFFSET(args.bvh, ir_id_to_offset(id))).aabb; vec3 center = (bounds.min + bounds.max) * 0.5; - radv_aabb bvh_bounds; + vk_aabb bvh_bounds; bvh_bounds.min.x = from_emulated_float(DEREF(args.header).min_bounds[0]); bvh_bounds.min.y = from_emulated_float(DEREF(args.header).min_bounds[1]); bvh_bounds.min.z = from_emulated_float(DEREF(args.header).min_bounds[2]); diff --git a/src/amd/vulkan/bvh/ploc_internal.comp b/src/vulkan/runtime/bvh/ploc_internal.comp similarity index 76% rename from src/amd/vulkan/bvh/ploc_internal.comp rename to src/vulkan/runtime/bvh/ploc_internal.comp index 50fc40edc93..0ecf7d38d82 100644 --- a/src/amd/vulkan/bvh/ploc_internal.comp +++ b/src/vulkan/runtime/bvh/ploc_internal.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Bas Nieuwenhuizen * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #version 460 @@ -24,7 +41,7 @@ layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in; #define USE_GLOBAL_SYNC -#include "build_interface.h" +#include "vk_build_interface.h" TYPE(ploc_prefix_scan_partition, 4); @@ -34,7 +51,8 @@ layout(push_constant) uniform CONSTS }; shared uint32_t exclusive_prefix_sum; -shared uint32_t aggregate_sums[PLOC_WORKGROUP_SIZE / 64]; +shared uint32_t aggregate_sums[PLOC_SUBGROUPS_PER_WORKGROUP]; +shared uint32_t aggregate_sums2[PLOC_SUBGROUPS_PER_WORKGROUP]; /* * Global prefix scan over all workgroups to find out the index of the collapsed node to write. 
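The cross-workgroup half of this scan amounts to an exclusive prefix sum over the per-partition aggregates: partition i starts writing at the total count produced by partitions 0..i-1. A minimal single-threaded C sketch of that relationship (illustrative names only, not part of the shader):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of what the shader accumulates across workgroups:
     * exclusive[i] is the sum of aggregate[0..i-1], i.e. the base output
     * index for partition i. The shader derives the same value by walking
     * earlier partitions and adding their published inclusive_sum (once
     * complete) or aggregate. */
    static void
    partition_exclusive_prefix(const uint32_t *aggregate, uint32_t *exclusive, size_t n)
    {
       uint32_t sum = 0;
       for (size_t i = 0; i < n; i++) {
          exclusive[i] = sum;
          sum += aggregate[i];
       }
    }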
@@ -45,8 +63,7 @@ uint32_t prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t task_index) { if (gl_LocalInvocationIndex == 0) { - /* Temporary copy of exclusive_prefix_sum to avoid reading+writing LDS each addition */ - uint32_t local_exclusive_prefix_sum = 0; + exclusive_prefix_sum = 0; if (task_index >= gl_WorkGroupSize.x) { REF(ploc_prefix_scan_partition) current_partition = REF(ploc_prefix_scan_partition)(INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x)); @@ -58,28 +75,55 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t if (atomicLoad(DEREF(previous_partition).inclusive_sum, gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsAcquire | gl_SemanticsMakeVisible) != 0xFFFFFFFF) { - local_exclusive_prefix_sum += DEREF(previous_partition).inclusive_sum; + atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).inclusive_sum); break; } else { - local_exclusive_prefix_sum += DEREF(previous_partition).aggregate; + atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).aggregate); previous_partition -= 1; } } /* Set the inclusive sum for the next workgroups */ atomicStore(DEREF(current_partition).inclusive_sum, - DEREF(current_partition).aggregate + local_exclusive_prefix_sum, gl_ScopeDevice, + DEREF(current_partition).aggregate + exclusive_prefix_sum, gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsRelease | gl_SemanticsMakeAvailable); } - exclusive_prefix_sum = local_exclusive_prefix_sum; } if (subgroupElect()) aggregate_sums[gl_SubgroupID] = subgroupBallotBitCount(ballot); barrier(); - if (gl_LocalInvocationID.x < PLOC_WORKGROUP_SIZE / 64) { - aggregate_sums[gl_LocalInvocationID.x] = - exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]); + if (PLOC_SUBGROUPS_PER_WORKGROUP <= SUBGROUP_SIZE) { + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) { + aggregate_sums[gl_LocalInvocationID.x] = + exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]); + } + } else { + /* If the length of aggregate_sums[] is larger than SUBGROUP_SIZE, + * the prefix scan can't be done simply by subgroupExclusiveAdd. 
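+ * Instead, a workgroup-wide Hillis-Steele scan over LDS is used below, doubling the stride each iteration until it covers all subgroup aggregates.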
+ */ + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) + aggregate_sums2[gl_LocalInvocationID.x] = aggregate_sums[gl_LocalInvocationID.x]; + barrier(); + + /* Hillis Steele inclusive scan on aggregate_sums2 */ + for (uint32_t stride = 1; stride < PLOC_SUBGROUPS_PER_WORKGROUP; stride *= 2) { + uint32_t value = 0; + if (gl_LocalInvocationID.x >= stride && gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) + value = aggregate_sums2[gl_LocalInvocationID.x - stride]; + barrier(); + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) + aggregate_sums2[gl_LocalInvocationID.x] += value; + barrier(); + } + + /* Adapt to exclusive and add the prefix_sum from previous workgroups */ + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) { + if (gl_LocalInvocationID.x == 0) + aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum; + else + aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum + aggregate_sums2[gl_LocalInvocationID.x - 1]; + } } barrier(); @@ -90,20 +134,20 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t #define BVH_LEVEL_COST 0.2 uint32_t -push_node(uint32_t children[2], radv_aabb bounds[2]) +push_node(uint32_t children[2], vk_aabb bounds[2]) { uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1); - uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node); - uint32_t dst_id = pack_ir_node_id(dst_offset, radv_ir_node_internal); - REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset)); + uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node); + uint32_t dst_id = pack_ir_node_id(dst_offset, vk_ir_node_internal); + REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset)); - radv_aabb total_bounds; + vk_aabb total_bounds; total_bounds.min = vec3(INFINITY); total_bounds.max = vec3(-INFINITY); for (uint i = 0; i < 2; ++i) { VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i])); - REF(radv_ir_node) child = REF(radv_ir_node)(node); + REF(vk_ir_node) child = REF(vk_ir_node)(node); total_bounds.min = min(total_bounds.min, bounds[i].min); total_bounds.max = max(total_bounds.max, bounds[i].max); @@ -112,7 +156,7 @@ push_node(uint32_t children[2], radv_aabb bounds[2]) } DEREF(dst_node).base.aabb = total_bounds; - DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET; + DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET; return dst_id; } @@ -136,7 +180,7 @@ decode_neighbour_offset(uint32_t encoded_offset) #define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD -shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS]; +shared vk_aabb shared_bounds[NUM_PLOC_LDS_ITEMS]; shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS]; uint32_t @@ -155,11 +199,11 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base, for (uint32_t i = task_index - 2 * neighbourhood_overlap; i < search_bound; i += gl_WorkGroupSize.x) { uint32_t id = load_id(ids, iter, i); - if (id == RADV_BVH_INVALID_NODE) + if (id == VK_BVH_INVALID_NODE) continue; VOID_REF addr = OFFSET(args.bvh, ir_id_to_offset(id)); - REF(radv_ir_node) node = REF(radv_ir_node)(addr); + REF(vk_ir_node) node = REF(vk_ir_node)(addr); shared_bounds[i - lds_base] = DEREF(node).aabb; } @@ -168,7 +212,7 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base, float combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j) { - radv_aabb 
combined_bounds; + vk_aabb combined_bounds; combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min); combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max); return aabb_surface_area(combined_bounds); @@ -187,10 +231,10 @@ main(void) if (DEREF(args.header).active_leaf_count <= 2) { if (gl_GlobalInvocationID.x == 0) { uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1); - uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node); - REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset)); + uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node); + REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset)); - radv_aabb total_bounds; + vk_aabb total_bounds; total_bounds.min = vec3(INFINITY); total_bounds.max = vec3(-INFINITY); @@ -198,10 +242,10 @@ main(void) for (; i < DEREF(args.header).active_leaf_count; i++) { uint32_t child_id = DEREF(INDEX(key_id_pair, src_ids, i)).id; - if (child_id != RADV_BVH_INVALID_NODE) { + if (child_id != VK_BVH_INVALID_NODE) { VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(child_id)); - REF(radv_ir_node) child = REF(radv_ir_node)(node); - radv_aabb bounds = DEREF(child).aabb; + REF(vk_ir_node) child = REF(vk_ir_node)(node); + vk_aabb bounds = DEREF(child).aabb; total_bounds.min = min(total_bounds.min, bounds.min); total_bounds.max = max(total_bounds.max, bounds.max); @@ -210,10 +254,10 @@ main(void) DEREF(dst_node).children[i] = child_id; } for (; i < 2; i++) - DEREF(dst_node).children[i] = RADV_BVH_INVALID_NODE; + DEREF(dst_node).children[i] = VK_BVH_INVALID_NODE; DEREF(dst_node).base.aabb = total_bounds; - DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET; + DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET; } return; } @@ -329,11 +373,11 @@ main(void) if (task_index < neighbour_index) { uint32_t neighbour_id = load_id(src_ids, iter, neighbour_index); uint32_t children[2] = {id, neighbour_id}; - radv_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]}; + vk_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]}; DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = push_node(children, bounds); DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, neighbour_index))) = - RADV_BVH_INVALID_NODE; + VK_BVH_INVALID_NODE; } else { /* We still store in the other case so we don't destroy the node id needed to * create the internal node */ @@ -381,14 +425,14 @@ main(void) uint32_t id = task_index < current_task_count ? 
DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) - : RADV_BVH_INVALID_NODE; - uvec4 ballot = subgroupBallot(id != RADV_BVH_INVALID_NODE); + : VK_BVH_INVALID_NODE; + uvec4 ballot = subgroupBallot(id != VK_BVH_INVALID_NODE); uint32_t new_offset = prefix_scan(ballot, partitions, task_index); if (task_index >= current_task_count) continue; - if (id != RADV_BVH_INVALID_NODE) { + if (id != VK_BVH_INVALID_NODE) { DEREF(REF(uint32_t)(INDEX(uint32_t, src_ids, new_offset))) = id; ++new_offset; } diff --git a/src/vulkan/runtime/bvh/vk_build_helpers.h b/src/vulkan/runtime/bvh/vk_build_helpers.h new file mode 100644 index 00000000000..0a178adea14 --- /dev/null +++ b/src/vulkan/runtime/bvh/vk_build_helpers.h @@ -0,0 +1,522 @@ +/* + * Copyright © 2022 Konstantin Seurer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef VK_BVH_BUILD_HELPERS_H +#define VK_BVH_BUILD_HELPERS_H + +#include "vk_bvh.h" + +#define VK_FORMAT_UNDEFINED 0 +#define VK_FORMAT_R4G4_UNORM_PACK8 1 +#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2 +#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3 +#define VK_FORMAT_R5G6B5_UNORM_PACK16 4 +#define VK_FORMAT_B5G6R5_UNORM_PACK16 5 +#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6 +#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7 +#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8 +#define VK_FORMAT_R8_UNORM 9 +#define VK_FORMAT_R8_SNORM 10 +#define VK_FORMAT_R8_USCALED 11 +#define VK_FORMAT_R8_SSCALED 12 +#define VK_FORMAT_R8_UINT 13 +#define VK_FORMAT_R8_SINT 14 +#define VK_FORMAT_R8_SRGB 15 +#define VK_FORMAT_R8G8_UNORM 16 +#define VK_FORMAT_R8G8_SNORM 17 +#define VK_FORMAT_R8G8_USCALED 18 +#define VK_FORMAT_R8G8_SSCALED 19 +#define VK_FORMAT_R8G8_UINT 20 +#define VK_FORMAT_R8G8_SINT 21 +#define VK_FORMAT_R8G8_SRGB 22 +#define VK_FORMAT_R8G8B8_UNORM 23 +#define VK_FORMAT_R8G8B8_SNORM 24 +#define VK_FORMAT_R8G8B8_USCALED 25 +#define VK_FORMAT_R8G8B8_SSCALED 26 +#define VK_FORMAT_R8G8B8_UINT 27 +#define VK_FORMAT_R8G8B8_SINT 28 +#define VK_FORMAT_R8G8B8_SRGB 29 +#define VK_FORMAT_B8G8R8_UNORM 30 +#define VK_FORMAT_B8G8R8_SNORM 31 +#define VK_FORMAT_B8G8R8_USCALED 32 +#define VK_FORMAT_B8G8R8_SSCALED 33 +#define VK_FORMAT_B8G8R8_UINT 34 +#define VK_FORMAT_B8G8R8_SINT 35 +#define VK_FORMAT_B8G8R8_SRGB 36 +#define VK_FORMAT_R8G8B8A8_UNORM 37 +#define VK_FORMAT_R8G8B8A8_SNORM 38 +#define VK_FORMAT_R8G8B8A8_USCALED 39 +#define VK_FORMAT_R8G8B8A8_SSCALED 40 +#define VK_FORMAT_R8G8B8A8_UINT 41 +#define VK_FORMAT_R8G8B8A8_SINT 42 +#define VK_FORMAT_R8G8B8A8_SRGB 43 +#define VK_FORMAT_B8G8R8A8_UNORM 44 +#define VK_FORMAT_B8G8R8A8_SNORM 45 +#define VK_FORMAT_B8G8R8A8_USCALED 46 +#define VK_FORMAT_B8G8R8A8_SSCALED 47 +#define VK_FORMAT_B8G8R8A8_UINT 48 +#define VK_FORMAT_B8G8R8A8_SINT 49 +#define VK_FORMAT_B8G8R8A8_SRGB 50 +#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51 +#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52 +#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53 +#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54 +#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55 +#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56 +#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57 +#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58 +#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59 +#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60 +#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61 +#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62 +#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63 +#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64 +#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65 +#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66 +#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67 +#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68 +#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69 +#define VK_FORMAT_R16_UNORM 70 +#define VK_FORMAT_R16_SNORM 71 +#define VK_FORMAT_R16_USCALED 72 +#define VK_FORMAT_R16_SSCALED 73 +#define VK_FORMAT_R16_UINT 74 +#define VK_FORMAT_R16_SINT 75 +#define VK_FORMAT_R16_SFLOAT 76 +#define VK_FORMAT_R16G16_UNORM 77 +#define VK_FORMAT_R16G16_SNORM 78 +#define VK_FORMAT_R16G16_USCALED 79 +#define VK_FORMAT_R16G16_SSCALED 80 +#define VK_FORMAT_R16G16_UINT 81 +#define VK_FORMAT_R16G16_SINT 82 +#define VK_FORMAT_R16G16_SFLOAT 83 +#define VK_FORMAT_R16G16B16_UNORM 84 +#define VK_FORMAT_R16G16B16_SNORM 85 +#define VK_FORMAT_R16G16B16_USCALED 86 +#define VK_FORMAT_R16G16B16_SSCALED 87 +#define VK_FORMAT_R16G16B16_UINT 88 +#define VK_FORMAT_R16G16B16_SINT 89 +#define VK_FORMAT_R16G16B16_SFLOAT 90 +#define 
VK_FORMAT_R16G16B16A16_UNORM 91 +#define VK_FORMAT_R16G16B16A16_SNORM 92 +#define VK_FORMAT_R16G16B16A16_USCALED 93 +#define VK_FORMAT_R16G16B16A16_SSCALED 94 +#define VK_FORMAT_R16G16B16A16_UINT 95 +#define VK_FORMAT_R16G16B16A16_SINT 96 +#define VK_FORMAT_R16G16B16A16_SFLOAT 97 +#define VK_FORMAT_R32_UINT 98 +#define VK_FORMAT_R32_SINT 99 +#define VK_FORMAT_R32_SFLOAT 100 +#define VK_FORMAT_R32G32_UINT 101 +#define VK_FORMAT_R32G32_SINT 102 +#define VK_FORMAT_R32G32_SFLOAT 103 +#define VK_FORMAT_R32G32B32_UINT 104 +#define VK_FORMAT_R32G32B32_SINT 105 +#define VK_FORMAT_R32G32B32_SFLOAT 106 +#define VK_FORMAT_R32G32B32A32_UINT 107 +#define VK_FORMAT_R32G32B32A32_SINT 108 +#define VK_FORMAT_R32G32B32A32_SFLOAT 109 +#define VK_FORMAT_R64_UINT 110 +#define VK_FORMAT_R64_SINT 111 +#define VK_FORMAT_R64_SFLOAT 112 +#define VK_FORMAT_R64G64_UINT 113 +#define VK_FORMAT_R64G64_SINT 114 +#define VK_FORMAT_R64G64_SFLOAT 115 +#define VK_FORMAT_R64G64B64_UINT 116 +#define VK_FORMAT_R64G64B64_SINT 117 +#define VK_FORMAT_R64G64B64_SFLOAT 118 +#define VK_FORMAT_R64G64B64A64_UINT 119 +#define VK_FORMAT_R64G64B64A64_SINT 120 +#define VK_FORMAT_R64G64B64A64_SFLOAT 121 + +#define VK_INDEX_TYPE_UINT16 0 +#define VK_INDEX_TYPE_UINT32 1 +#define VK_INDEX_TYPE_NONE_KHR 1000165000 +#define VK_INDEX_TYPE_UINT8_EXT 1000265000 + +#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0 +#define VK_GEOMETRY_TYPE_AABBS_KHR 1 +#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2 + +#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1 +#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2 +#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4 +#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8 + +#define TYPE(type, align) \ + layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref \ + { \ + type value; \ + }; + +#define REF(type) type##_ref +#define VOID_REF uint64_t +#define NULL 0 +#define DEREF(var) var.value + +#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1)) + +#define OFFSET(ptr, offset) (uint64_t(ptr) + offset) + +#define INFINITY (1.0 / 0.0) +#define NAN (0.0 / 0.0) + +#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type))) + +TYPE(int8_t, 1); +TYPE(uint8_t, 1); +TYPE(int16_t, 2); +TYPE(uint16_t, 2); +TYPE(int32_t, 4); +TYPE(uint32_t, 4); +TYPE(int64_t, 8); +TYPE(uint64_t, 8); + +TYPE(float, 4); + +TYPE(vec2, 4); +TYPE(vec3, 4); +TYPE(vec4, 4); + +TYPE(uvec4, 16); + +TYPE(VOID_REF, 8); + +/* copied from u_math.h */ +uint32_t +align(uint32_t value, uint32_t alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + +int32_t +to_emulated_float(float f) +{ + int32_t bits = floatBitsToInt(f); + return f < 0 ? -2147483648 - bits : bits; +} + +float +from_emulated_float(int32_t bits) +{ + return intBitsToFloat(bits < 0 ? 
-2147483648 - bits : bits); +} + +TYPE(vk_aabb, 4); + +struct key_id_pair { + uint32_t id; + uint32_t key; +}; +TYPE(key_id_pair, 4); + +TYPE(vk_accel_struct_serialization_header, 8); + +TYPE(vk_ir_header, 4); +TYPE(vk_ir_node, 4); +TYPE(vk_ir_box_node, 4); +TYPE(vk_ir_triangle_node, 4); +TYPE(vk_ir_aabb_node, 4); +TYPE(vk_ir_instance_node, 8); + +TYPE(vk_global_sync_data, 4); + +uint32_t +ir_id_to_offset(uint32_t id) +{ + return id & (~3u); +} + +uint32_t +ir_id_to_type(uint32_t id) +{ + return id & 3u; +} + +uint32_t +pack_ir_node_id(uint32_t offset, uint32_t type) +{ + return offset | type; +} + +float +aabb_surface_area(vk_aabb aabb) +{ + vec3 diagonal = aabb.max - aabb.min; + return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z; +} + +/* Just a wrapper for 3 uints. */ +struct triangle_indices { + uint32_t index[3]; +}; + +triangle_indices +load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id) +{ + triangle_indices result; + + uint32_t index_base = global_id * 3; + + switch (index_format) { + case VK_INDEX_TYPE_UINT16: { + result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0)); + result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1)); + result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2)); + break; + } + case VK_INDEX_TYPE_UINT32: { + result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0)); + result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1)); + result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2)); + break; + } + case VK_INDEX_TYPE_NONE_KHR: { + result.index[0] = index_base + 0; + result.index[1] = index_base + 1; + result.index[2] = index_base + 2; + break; + } + case VK_INDEX_TYPE_UINT8_EXT: { + result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0)); + result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1)); + result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2)); + break; + } + } + + return result; +} + +/* Just a wrapper for 3 vec4s. 
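+ * The w component is initialized to 1.0 in load_vertices so the optional geometry transform in build_triangle can be applied with a single mat4 multiply.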
*/ +struct triangle_vertices { + vec4 vertex[3]; +}; + +TYPE(float16_t, 2); + +triangle_vertices +load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride) +{ + triangle_vertices result; + + for (uint32_t i = 0; i < 3; i++) { + VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride); + vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0); + + switch (vertex_format) { + case VK_FORMAT_R32G32_SFLOAT: + vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); + break; + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); + vertex.z = DEREF(INDEX(float, vertex_ptr, 2)); + break; + case VK_FORMAT_R16G16_SFLOAT: + vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); + break; + case VK_FORMAT_R16G16B16_SFLOAT: + case VK_FORMAT_R16G16B16A16_SFLOAT: + vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); + vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2)); + break; + case VK_FORMAT_R16G16_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); + vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); + break; + case VK_FORMAT_R16G16B16A16_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); + vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); + vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF)); + break; + case VK_FORMAT_R8G8_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); + vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); + break; + case VK_FORMAT_R8G8B8A8_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); + vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); + vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F)); + break; + case VK_FORMAT_R16G16_UNORM: + vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); + vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); + break; + case VK_FORMAT_R16G16B16A16_UNORM: + vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); + vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); + vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF); + break; + case VK_FORMAT_R8G8_UNORM: + vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); + vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); + break; + case VK_FORMAT_R8G8B8A8_UNORM: + vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); + vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); + vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF); + break; + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: { + uint32_t data = DEREF(REF(uint32_t)(vertex_ptr)); + vertex.x = float(data & 0x3FF) / 0x3FF; + vertex.y = float((data >> 10) & 0x3FF) / 0x3FF; + vertex.z = float((data >> 20) & 0x3FF) / 0x3FF; + break; + } + } + + result.vertex[i] = vertex; + } + + return result; +} + +/** Compute ceiling of integer quotient of A divided by B. + From macros.h */ +#define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B)) + +#ifdef USE_GLOBAL_SYNC + +/* There might be more invocations available than tasks to do. 
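load_vertices widens every supported vertex format to a vec4, and the SNORM cases divide by the type's positive maximum and clamp at -1.0 so the most negative code does not fall below -1. A small C sketch of that decode rule; the helper names are illustrative:

#include <stdint.h>
#include <stdio.h>

/* SNORM16 -> float, matching the shader: value / 0x7FFF, clamped to >= -1.0,
 * so -32768 and -32767 both decode to exactly -1.0. */
static float snorm16_to_float(int16_t v)
{
   float f = (float)v / 32767.0f;
   return f < -1.0f ? -1.0f : f;
}

/* UNORM8 -> float: value / 0xFF, already in [0, 1]. */
static float unorm8_to_float(uint8_t v)
{
   return (float)v / 255.0f;
}

int main(void)
{
   printf("%f %f %f\n", snorm16_to_float(-32768), snorm16_to_float(32767),
          unorm8_to_float(255));   /* -1.000000 1.000000 1.000000 */
   return 0;
}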
+ * In that case, the fetched task index is greater than the + * counter offset for the next phase. To avoid out-of-bounds + * accessing, phases will be skipped until the task index is + * is in-bounds again. */ +uint32_t num_tasks_to_skip = 0; +uint32_t phase_index = 0; +bool should_skip = false; +shared uint32_t global_task_index; + +shared uint32_t shared_phase_index; + +uint32_t +task_count(REF(vk_ir_header) header) +{ + uint32_t phase_index = DEREF(header).sync_data.phase_index; + return DEREF(header).sync_data.task_counts[phase_index & 1]; +} + +/* Sets the task count for the next phase. */ +void +set_next_task_count(REF(vk_ir_header) header, uint32_t new_count) +{ + uint32_t phase_index = DEREF(header).sync_data.phase_index; + DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count; +} + +/* + * This function has two main objectives: + * Firstly, it partitions pending work among free invocations. + * Secondly, it guarantees global synchronization between different phases. + * + * After every call to fetch_task, a new task index is returned. + * fetch_task will also set num_tasks_to_skip. Use should_execute_phase + * to determine if the current phase should be executed or skipped. + * + * Since tasks are assigned per-workgroup, there is a possibility of the task index being + * greater than the total task count. + */ +uint32_t +fetch_task(REF(vk_ir_header) header, bool did_work) +{ + /* Perform a memory + control barrier for all buffer writes for the entire workgroup. + * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished + * and their results are written to memory. */ + controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + if (gl_LocalInvocationIndex == 0) { + if (did_work) + atomicAdd(DEREF(header).sync_data.task_done_counter, 1); + global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1); + + do { + /* Perform a memory barrier to refresh the current phase's end counter, in case + * another workgroup changed it. */ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + + /* The first invocation of the first workgroup in a new phase is responsible to initiate the + * switch to a new phase. It is only possible to switch to a new phase if all tasks of the + * previous phase have been completed. Switching to a new phase and incrementing the phase + * end counter in turn notifies all invocations for that phase that it is safe to execute. + */ + if (global_task_index == DEREF(header).sync_data.current_phase_end_counter && + DEREF(header).sync_data.task_done_counter == DEREF(header).sync_data.current_phase_end_counter) { + if (DEREF(header).sync_data.next_phase_exit_flag != 0) { + DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID; + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + } else { + atomicAdd(DEREF(header).sync_data.phase_index, 1); + DEREF(header).sync_data.current_phase_start_counter = DEREF(header).sync_data.current_phase_end_counter; + /* Ensure the changes to the phase index and start/end counter are visible for other + * workgroup waiting in the loop. 
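task_count() and set_next_task_count() double-buffer the per-phase task counts in sync_data.task_counts[2], indexed by the parity of phase_index, so the running phase can be consumed while the following phase is being sized. A reduced C sketch of that ping-pong; the struct is trimmed to the two fields involved and the main() driver is illustrative:

#include <assert.h>
#include <stdint.h>

struct sync_data {
   uint32_t task_counts[2];
   uint32_t phase_index;
};

static uint32_t task_count(const struct sync_data *s)
{
   return s->task_counts[s->phase_index & 1];
}

static void set_next_task_count(struct sync_data *s, uint32_t count)
{
   s->task_counts[(s->phase_index + 1) & 1] = count;
}

int main(void)
{
   struct sync_data s = { .task_counts = { 100, 0 }, .phase_index = 0 };

   /* While phase 0 works through 100 tasks, it publishes 50 for phase 1. */
   set_next_task_count(&s, 50);
   assert(task_count(&s) == 100);

   s.phase_index++;          /* the phase switch is done atomically on the GPU */
   assert(task_count(&s) == 50);
   return 0;
}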
*/ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + atomicAdd(DEREF(header).sync_data.current_phase_end_counter, + DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x)); + } + break; + } + + /* If other invocations have finished all nodes, break out; there is no work to do */ + if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) { + break; + } + } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter); + + shared_phase_index = DEREF(header).sync_data.phase_index; + } + + barrier(); + if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) + return TASK_INDEX_INVALID; + + num_tasks_to_skip = shared_phase_index - phase_index; + + uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter; + return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x; +} + +bool +should_execute_phase() +{ + if (num_tasks_to_skip > 0) { + /* Skip to next phase. */ + ++phase_index; + --num_tasks_to_skip; + return false; + } + return true; +} + +#define PHASE(header) \ + for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true)) +#endif + +#endif diff --git a/src/vulkan/runtime/bvh/vk_build_interface.h b/src/vulkan/runtime/bvh/vk_build_interface.h new file mode 100644 index 00000000000..0d2f1fed21c --- /dev/null +++ b/src/vulkan/runtime/bvh/vk_build_interface.h @@ -0,0 +1,103 @@ +/* + * Copyright © 2022 Konstantin Seurer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef VK_BVH_BUILD_INTERFACE_H +#define VK_BVH_BUILD_INTERFACE_H + +#ifdef VULKAN +#include "vk_build_helpers.h" +#else +#include +#include "vk_bvh.h" +#define REF(type) uint64_t +#define VOID_REF uint64_t +#endif + +#define SUBGROUP_SIZE_ID 0 +#define BVH_BOUNDS_OFFSET_ID 1 +#ifdef VULKAN +layout (constant_id = SUBGROUP_SIZE_ID) const int SUBGROUP_SIZE = 64; +layout (constant_id = BVH_BOUNDS_OFFSET_ID) const int BVH_BOUNDS_OFFSET = 0; +#endif + +struct leaf_args { + VOID_REF bvh; + REF(vk_ir_header) header; + REF(key_id_pair) ids; + + vk_bvh_geometry_data geom_data; +}; + +struct morton_args { + VOID_REF bvh; + REF(vk_ir_header) header; + REF(key_id_pair) ids; +}; + +#define LBVH_RIGHT_CHILD_BIT_SHIFT 29 +#define LBVH_RIGHT_CHILD_BIT (1 << LBVH_RIGHT_CHILD_BIT_SHIFT) + +struct lbvh_node_info { + /* Number of children that have been processed (or are invalid/leaves) in + * the lbvh_generate_ir pass. + */ + uint32_t path_count; + + uint32_t children[2]; + uint32_t parent; +}; + +struct lbvh_main_args { + VOID_REF bvh; + REF(key_id_pair) src_ids; + VOID_REF node_info; + uint32_t id_count; + uint32_t internal_node_base; +}; + +struct lbvh_generate_ir_args { + VOID_REF bvh; + VOID_REF node_info; + VOID_REF header; + uint32_t internal_node_base; +}; + +struct ploc_prefix_scan_partition { + uint32_t aggregate; + uint32_t inclusive_sum; +}; + +#define PLOC_WORKGROUP_SIZE 1024 +#define PLOC_SUBGROUPS_PER_WORKGROUP \ + (DIV_ROUND_UP(PLOC_WORKGROUP_SIZE, SUBGROUP_SIZE)) + +struct ploc_args { + VOID_REF bvh; + VOID_REF prefix_scan_partitions; + REF(vk_ir_header) header; + VOID_REF ids_0; + VOID_REF ids_1; + uint32_t internal_node_offset; +}; + +#endif diff --git a/src/vulkan/runtime/bvh/vk_bvh.h b/src/vulkan/runtime/bvh/vk_bvh.h new file mode 100644 index 00000000000..f393fa443d4 --- /dev/null +++ b/src/vulkan/runtime/bvh/vk_bvh.h @@ -0,0 +1,156 @@ +/* + * Copyright © 2021 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
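PLOC_SUBGROUPS_PER_WORKGROUP in the interface header above is derived from the fixed 1024-invocation PLOC workgroup and the SUBGROUP_SIZE specialization constant through the DIV_ROUND_UP macro copied from macros.h. A tiny C check of that arithmetic; the subgroup widths are example values:

#include <assert.h>
#include <stdint.h>

#define DIV_ROUND_UP(A, B) (((A) + (B) - 1) / (B))
#define PLOC_WORKGROUP_SIZE 1024

int main(void)
{
   /* Wave64 (the default in the interface header) and wave32 devices. */
   assert(DIV_ROUND_UP(PLOC_WORKGROUP_SIZE, 64) == 16);
   assert(DIV_ROUND_UP(PLOC_WORKGROUP_SIZE, 32) == 32);
   /* Counts that are not a multiple of the divisor round up. */
   assert(DIV_ROUND_UP(1000, 64) == 16);
   return 0;
}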
+ */ + +#ifndef BVH_VK_BVH_H +#define BVH_VK_BVH_H + +#define vk_ir_node_triangle 0 +#define vk_ir_node_internal 1 +#define vk_ir_node_instance 2 +#define vk_ir_node_aabb 3 + +#define VK_GEOMETRY_OPAQUE (1u << 31) + +#ifdef VULKAN +#define VK_UUID_SIZE 16 +#else +#include +typedef struct vk_ir_node vk_ir_node; +typedef struct vk_global_sync_data vk_global_sync_data; +typedef struct vk_bvh_geometry_data vk_bvh_geometry_data; + +typedef struct { + float values[3][4]; +} mat3x4; + +typedef struct { + float x; + float y; + float z; +} vec3; + +typedef struct vk_aabb vk_aabb; +#endif + +struct vk_aabb { + vec3 min; + vec3 max; +}; + +/* This is the header structure for serialized acceleration structures, as + * defined by the Vulkan spec. + */ +struct vk_accel_struct_serialization_header { + uint8_t driver_uuid[VK_UUID_SIZE]; + uint8_t accel_struct_compat[VK_UUID_SIZE]; + uint64_t serialization_size; + uint64_t deserialization_size; + uint64_t instance_count; +#ifndef VULKAN + uint64_t instances[]; +#endif +}; + +struct vk_global_sync_data { + uint32_t task_counts[2]; + uint32_t task_started_counter; + uint32_t task_done_counter; + uint32_t current_phase_start_counter; + uint32_t current_phase_end_counter; + uint32_t phase_index; + /* If this flag is set, the shader should exit + * instead of executing another phase */ + uint32_t next_phase_exit_flag; +}; + +struct vk_ir_header { + int32_t min_bounds[3]; + int32_t max_bounds[3]; + uint32_t active_leaf_count; + /* Indirect dispatch dimensions for the encoder. + * ir_internal_node_count is the thread count in the X dimension, + * while Y and Z are always set to 1. */ + uint32_t ir_internal_node_count; + uint32_t dispatch_size_y; + uint32_t dispatch_size_z; + vk_global_sync_data sync_data; + uint32_t dst_node_offset; +}; + +struct vk_ir_node { + vk_aabb aabb; +}; + +#define VK_UNKNOWN_BVH_OFFSET 0xFFFFFFFF +#define VK_NULL_BVH_OFFSET 0xFFFFFFFE + +struct vk_ir_box_node { + vk_ir_node base; + uint32_t children[2]; + uint32_t bvh_offset; +}; + +struct vk_ir_aabb_node { + vk_ir_node base; + uint32_t primitive_id; + uint32_t geometry_id_and_flags; +}; + +struct vk_ir_triangle_node { + vk_ir_node base; + float coords[3][3]; + uint32_t triangle_id; + uint32_t id; + uint32_t geometry_id_and_flags; +}; + +struct vk_ir_instance_node { + vk_ir_node base; + /* See radv_bvh_instance_node */ + uint64_t base_ptr; + uint32_t custom_instance_and_mask; + uint32_t sbt_offset_and_flags; + mat3x4 otw_matrix; + uint32_t instance_id; +}; + +#define VK_BVH_INVALID_NODE 0xFFFFFFFF + +/* If the task index is set to this value, there is no + * more work to do. 
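The vk_ir_node_* values above are the 2-bit type tags that pack_ir_node_id/ir_id_to_offset from the build helpers combine with a 4-byte-aligned node offset. A short C sketch of that packing; the sample offset is arbitrary:

#include <assert.h>
#include <stdint.h>

#define vk_ir_node_triangle 0
#define vk_ir_node_internal 1
#define vk_ir_node_instance 2
#define vk_ir_node_aabb     3

static uint32_t pack_ir_node_id(uint32_t offset, uint32_t type)
{
   /* The offset must be 4-byte aligned so the two low bits are free. */
   assert((offset & 3u) == 0);
   return offset | type;
}

static uint32_t ir_id_to_offset(uint32_t id) { return id & ~3u; }
static uint32_t ir_id_to_type(uint32_t id)   { return id & 3u; }

int main(void)
{
   uint32_t id = pack_ir_node_id(0x1000, vk_ir_node_instance);
   assert(ir_id_to_offset(id) == 0x1000);
   assert(ir_id_to_type(id) == vk_ir_node_instance);
   return 0;
}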
*/ +#define TASK_INDEX_INVALID 0xFFFFFFFF + +struct vk_bvh_geometry_data { + uint64_t data; + uint64_t indices; + uint64_t transform; + + uint32_t geometry_id; + uint32_t geometry_type; + uint32_t first_id; + uint32_t stride; + uint32_t vertex_format; + uint32_t index_format; +}; + +#endif diff --git a/src/vulkan/runtime/meson.build b/src/vulkan/runtime/meson.build index b325ebe6f3d..9d34ae432f0 100644 --- a/src/vulkan/runtime/meson.build +++ b/src/vulkan/runtime/meson.build @@ -7,7 +7,6 @@ vulkan_lite_runtime_files = files( 'rmv/vk_rmv_common.c', 'rmv/vk_rmv_exporter.c', - 'vk_acceleration_structure.c', 'vk_blend.c', 'vk_buffer.c', 'vk_buffer_view.c', @@ -277,6 +276,8 @@ vulkan_runtime_deps = [ ] if prog_glslang.found() + subdir('radix_sort') + subdir('bvh') vulkan_runtime_files += files('vk_texcompress_astc.c') vulkan_runtime_files += custom_target( 'astc_spv.h', @@ -288,6 +289,10 @@ if prog_glslang.found() ], depfile : 'astc_spv.h.d', ) + vulkan_runtime_files += files('vk_acceleration_structure.c') + vulkan_runtime_files += radix_sort_files + vulkan_runtime_files += bvh_spv + vulkan_runtime_files += radix_sort_spv endif libvulkan_runtime = static_library( @@ -320,7 +325,10 @@ else ) endif -idep_vulkan_runtime_headers = idep_vulkan_lite_runtime_headers +idep_vulkan_runtime_headers = [idep_vulkan_lite_runtime_headers] +idep_vulkan_runtime_headers += declare_dependency( + include_directories : include_directories('bvh'), +) idep_vulkan_runtime = declare_dependency( dependencies : [ diff --git a/src/amd/vulkan/radix_sort/LICENSE b/src/vulkan/runtime/radix_sort/LICENSE similarity index 100% rename from src/amd/vulkan/radix_sort/LICENSE rename to src/vulkan/runtime/radix_sort/LICENSE diff --git a/src/amd/vulkan/radix_sort/common/macros.h b/src/vulkan/runtime/radix_sort/common/macros.h similarity index 100% rename from src/amd/vulkan/radix_sort/common/macros.h rename to src/vulkan/runtime/radix_sort/common/macros.h diff --git a/src/amd/vulkan/radix_sort/common/util.c b/src/vulkan/runtime/radix_sort/common/util.c similarity index 100% rename from src/amd/vulkan/radix_sort/common/util.c rename to src/vulkan/runtime/radix_sort/common/util.c diff --git a/src/amd/vulkan/radix_sort/common/util.h b/src/vulkan/runtime/radix_sort/common/util.h similarity index 100% rename from src/amd/vulkan/radix_sort/common/util.h rename to src/vulkan/runtime/radix_sort/common/util.h diff --git a/src/amd/vulkan/radix_sort/common/vk/barrier.c b/src/vulkan/runtime/radix_sort/common/vk/barrier.c similarity index 81% rename from src/amd/vulkan/radix_sort/common/vk/barrier.c rename to src/vulkan/runtime/radix_sort/common/vk/barrier.c index 58134dbd11a..e0865f6b770 100644 --- a/src/amd/vulkan/radix_sort/common/vk/barrier.c +++ b/src/vulkan/runtime/radix_sort/common/vk/barrier.c @@ -7,6 +7,8 @@ // #include "barrier.h" +#include "vulkan/runtime/vk_device.h" +#include "vulkan/runtime/vk_command_buffer.h" // // @@ -15,6 +17,10 @@ void vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -23,7 +29,7 @@ vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_SHADER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -42,6 +48,10 @@ 
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb) void vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -50,7 +60,7 @@ vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, @@ -69,6 +79,10 @@ vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb) void vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -77,7 +91,7 @@ vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_SHADER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -96,6 +110,10 @@ vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb) void vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -104,7 +122,7 @@ vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -123,6 +141,10 @@ vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb) void vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -132,7 +154,7 @@ vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb) VK_ACCESS_SHADER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -151,6 +173,10 @@ vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb) void vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -160,7 +186,7 @@ vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, @@ -179,6 +205,10 @@ vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb) void vk_barrier_compute_w_to_host_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -187,7 +217,7 @@ 
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_HOST_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, @@ -206,6 +236,10 @@ vk_barrier_compute_w_to_host_r(VkCommandBuffer cb) void vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -214,7 +248,7 @@ vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_HOST_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, @@ -237,12 +271,16 @@ vk_memory_barrier(VkCommandBuffer cb, VkPipelineStageFlags dst_stage, VkAccessFlags dst_mask) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = NULL, .srcAccessMask = src_mask, .dstAccessMask = dst_mask }; - vkCmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL); + disp->CmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL); } // @@ -252,6 +290,10 @@ vk_memory_barrier(VkCommandBuffer cb, void vk_barrier_debug(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -288,7 +330,7 @@ vk_barrier_debug(VkCommandBuffer cb) VK_ACCESS_HOST_WRITE_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, diff --git a/src/amd/vulkan/radix_sort/common/vk/barrier.h b/src/vulkan/runtime/radix_sort/common/vk/barrier.h similarity index 100% rename from src/amd/vulkan/radix_sort/common/vk/barrier.h rename to src/vulkan/runtime/radix_sort/common/vk/barrier.h diff --git a/src/vulkan/runtime/radix_sort/meson.build b/src/vulkan/runtime/radix_sort/meson.build new file mode 100644 index 00000000000..138c0c9369a --- /dev/null +++ b/src/vulkan/runtime/radix_sort/meson.build @@ -0,0 +1,37 @@ +# Copyright © 2022 Konstantin Seurer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
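The barrier helpers now resolve the driver's dispatch table from the VkCommandBuffer handle instead of calling the global vkCmd* entry points, which lets the shared radix-sort code run inside any Mesa Vulkan driver without a loader dependency. The pattern, reduced to a sketch; vk_barrier_example and the chosen access masks are illustrative:

#include "vk_command_buffer.h"
#include "vk_device.h"

/* Resolve the driver's dispatch table from the handle and issue the barrier
 * through it, exactly as the converted helpers in barrier.c do. */
static void
vk_barrier_example(VkCommandBuffer cb)
{
   VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb);
   const struct vk_device_dispatch_table *disp =
      &cmd_buffer->base.device->dispatch_table;

   const VkMemoryBarrier mb = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
      .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
      .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
   };

   disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
                            1, &mb, 0, NULL, 0, NULL);
}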
+ +subdir('shaders') + +radix_sort_files = files( + 'common/vk/barrier.c', + 'common/vk/barrier.h', + 'common/macros.h', + 'common/util.c', + 'common/util.h', + 'shaders/push.h', + 'radix_sort_u64.c', + 'radix_sort_u64.h', + 'radix_sort_vk_devaddr.h', + 'radix_sort_vk_ext.h', + 'radix_sort_vk.c', + 'radix_sort_vk.h', + 'target.h' +) diff --git a/src/vulkan/runtime/radix_sort/radix_sort_u64.c b/src/vulkan/runtime/radix_sort/radix_sort_u64.c new file mode 100644 index 00000000000..0d5f9217656 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/radix_sort_u64.c @@ -0,0 +1,59 @@ +/* + * Copyright © 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#include "radix_sort_u64.h" +#include + +static const uint32_t init_spv[] = { +#include "radix_sort/shaders/init.comp.spv.h" +}; + +static const uint32_t fill_spv[] = { +#include "radix_sort/shaders/fill.comp.spv.h" +}; + +static const uint32_t histogram_spv[] = { +#include "radix_sort/shaders/histogram.comp.spv.h" +}; + +static const uint32_t prefix_spv[] = { +#include "radix_sort/shaders/prefix.comp.spv.h" +}; + +static const uint32_t scatter_0_even_spv[] = { +#include "radix_sort/shaders/scatter_0_even.comp.spv.h" +}; + +static const uint32_t scatter_0_odd_spv[] = { +#include "radix_sort/shaders/scatter_0_odd.comp.spv.h" +}; + +static const uint32_t scatter_1_even_spv[] = { +#include "radix_sort/shaders/scatter_1_even.comp.spv.h" +}; + +static const uint32_t scatter_1_odd_spv[] = { +#include "radix_sort/shaders/scatter_1_odd.comp.spv.h" +}; + + +radix_sort_vk_t * +vk_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, + VkPipelineCache pc, + struct radix_sort_vk_target_config config) +{ + assert(config.keyval_dwords == 2); + + const uint32_t *spv[8] = { + init_spv, fill_spv, histogram_spv, prefix_spv, + scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv, + }; + const uint32_t spv_sizes[8] = { + sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv), sizeof(prefix_spv), + sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv), sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv), + }; + return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, config); +} + diff --git a/src/vulkan/runtime/radix_sort/radix_sort_u64.h b/src/vulkan/runtime/radix_sort/radix_sort_u64.h new file mode 100644 index 00000000000..8bb37fe2082 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/radix_sort_u64.h @@ -0,0 +1,24 @@ +/* + * Copyright © 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#ifndef VK_RADIX_SORT_U64 +#define VK_RADIX_SORT_U64 + +#include "radix_sort_vk.h" + +#ifdef __cplusplus +extern "C" { +#endif + +radix_sort_vk_t * +vk_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, + VkPipelineCache pc, + struct radix_sort_vk_target_config config); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk.c b/src/vulkan/runtime/radix_sort/radix_sort_vk.c similarity index 83% rename from src/amd/vulkan/radix_sort/radix_sort_vk.c rename to src/vulkan/runtime/radix_sort/radix_sort_vk.c index 70253884fc4..31efd3d4a75 100644 --- a/src/amd/vulkan/radix_sort/radix_sort_vk.c +++ b/src/vulkan/runtime/radix_sort/radix_sort_vk.c @@ -11,6 +11,10 @@ #include "common/vk/barrier.h" #include "radix_sort_vk_devaddr.h" #include "shaders/push.h" +#include "shaders/config.h" + +#include "vk_command_buffer.h" +#include "vk_device.h" // // @@ -100,14 +104,41 @@ radix_sort_vk_get_memory_requirements(radix_sort_vk_t const * rs, // NOTE: Assumes 
.histograms are before .partitions. // // Last scatter workgroup skips writing to a partition. + // Each RS_RADIX_LOG2 (8) bit pass has a zero-initialized histogram. This + // is one RS_RADIX_SIZE histogram per keyval byte. // - // One histogram per (keyval byte + partitions) + // The last scatter workgroup skips writing to a partition so it doesn't + // need to be allocated. // - uint32_t const partitions = scatter_blocks - 1; + // If the device doesn't support "sequential dispatch" of workgroups, then + // we need a zero-initialized dword counter per radix pass in the keyval + // to atomically acquire a virtual workgroup id. On sequentially + // dispatched devices, this is simply `gl_WorkGroupID.x`. + // + // The "internal" memory map looks like this: + // + // +---------------------------------+ <-- 0 + // | histograms[keyval_size] | + // +---------------------------------+ <-- keyval_size * histo_size + // | partitions[scatter_blocks_ru-1] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + // | workgroup_ids[keyval_size] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + workgroup_ids_size + // + // The `.workgroup_ids[]` are located after the last partition. + // + VkDeviceSize const histo_size = RS_RADIX_SIZE * sizeof(uint32_t); - mr->internal_size = (mr->keyval_size + partitions) * (RS_RADIX_SIZE * sizeof(uint32_t)); + mr->internal_size = (mr->keyval_size + scatter_blocks - 1) * histo_size; mr->internal_alignment = internal_sg_size * sizeof(uint32_t); + // + // Support for nonsequential dispatch can be disabled. + // + VkDeviceSize const workgroup_ids_size = mr->keyval_size * sizeof(uint32_t); + + mr->internal_size += workgroup_ids_size; + // // Indirect // @@ -185,13 +216,17 @@ rs_pipeline_count(struct radix_sort_vk const * rs) } radix_sort_vk_t * -radix_sort_vk_create(VkDevice device, +radix_sort_vk_create(VkDevice _device, VkAllocationCallbacks const * ac, VkPipelineCache pc, const uint32_t* const* spv, const uint32_t* spv_sizes, struct radix_sort_vk_target_config config) { + VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + // // Allocate radix_sort_vk // @@ -244,6 +279,38 @@ radix_sort_vk_create(VkDevice device, .size = sizeof(struct rs_push_scatter) }, // scatter_1_odd }; + uint32_t spec_constants[] = { + [RS_FILL_WORKGROUP_SIZE] = 1u << config.fill.workgroup_size_log2, + [RS_FILL_BLOCK_ROWS] = config.fill.block_rows, + [RS_HISTOGRAM_WORKGROUP_SIZE] = 1u << config.histogram.workgroup_size_log2, + [RS_HISTOGRAM_SUBGROUP_SIZE_LOG2] = config.histogram.subgroup_size_log2, + [RS_HISTOGRAM_BLOCK_ROWS] = config.histogram.block_rows, + [RS_PREFIX_WORKGROUP_SIZE] = 1u << config.prefix.workgroup_size_log2, + [RS_PREFIX_SUBGROUP_SIZE_LOG2] = config.prefix.subgroup_size_log2, + [RS_SCATTER_WORKGROUP_SIZE] = 1u << config.scatter.workgroup_size_log2, + [RS_SCATTER_SUBGROUP_SIZE_LOG2] = config.scatter.subgroup_size_log2, + [RS_SCATTER_BLOCK_ROWS] = config.scatter.block_rows, + [RS_SCATTER_NONSEQUENTIAL_DISPATCH] = config.nonsequential_dispatch, + }; + + VkSpecializationMapEntry spec_map[ARRAY_LENGTH_MACRO(spec_constants)]; + + for (uint32_t ii = 0; ii < ARRAY_LENGTH_MACRO(spec_constants); ii++) + { + spec_map[ii] = (VkSpecializationMapEntry) { + .constantID = ii, + .offset = sizeof(uint32_t) * ii, + .size = sizeof(uint32_t), + }; + } + + VkSpecializationInfo spec_info = { + .mapEntryCount = ARRAY_LENGTH_MACRO(spec_map), + 
.pMapEntries = spec_map, + .dataSize = sizeof(spec_constants), + .pData = spec_constants, + }; + VkPipelineLayoutCreateInfo plci = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, @@ -259,7 +326,7 @@ radix_sort_vk_create(VkDevice device, { plci.pPushConstantRanges = pcr + ii; - if (vkCreatePipelineLayout(device, &plci, NULL, rs->pipeline_layouts.handles + ii) != VK_SUCCESS) + if (disp->CreatePipelineLayout(_device, &plci, NULL, rs->pipeline_layouts.handles + ii) != VK_SUCCESS) goto fail_layout; } @@ -282,7 +349,7 @@ radix_sort_vk_create(VkDevice device, smci.codeSize = spv_sizes[ii]; smci.pCode = spv[ii]; - if (vkCreateShaderModule(device, &smci, ac, sms + ii) != VK_SUCCESS) + if (disp->CreateShaderModule(_device, &smci, ac, sms + ii) != VK_SUCCESS) goto fail_shader; } @@ -323,11 +390,11 @@ radix_sort_vk_create(VkDevice device, .flags = 0, \ .stage = { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, \ .pNext = NULL, \ - .flags = 0, \ + .flags = VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT, \ .stage = VK_SHADER_STAGE_COMPUTE_BIT, \ .module = sms[idx_], \ .pName = "main", \ - .pSpecializationInfo = NULL }, \ + .pSpecializationInfo = &spec_info }, \ \ .layout = rs->pipeline_layouts.handles[idx_], \ .basePipelineHandle = VK_NULL_HANDLE, \ @@ -358,7 +425,7 @@ radix_sort_vk_create(VkDevice device, // // Create the compute pipelines // - if (vkCreateComputePipelines(device, pc, pipeline_count, cpcis, ac, rs->pipelines.handles) != VK_SUCCESS) + if (disp->CreateComputePipelines(_device, pc, pipeline_count, cpcis, ac, rs->pipelines.handles) != VK_SUCCESS) goto fail_pipeline; // @@ -366,7 +433,7 @@ radix_sort_vk_create(VkDevice device, // for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyShaderModule(device, sms[ii], ac); + disp->DestroyShaderModule(_device, sms[ii], ac); } #ifdef RS_VK_ENABLE_DEBUG_UTILS @@ -397,17 +464,17 @@ radix_sort_vk_create(VkDevice device, fail_pipeline: for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipeline(device, rs->pipelines.handles[ii], ac); + disp->DestroyPipeline(_device, rs->pipelines.handles[ii], ac); } fail_shader: for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyShaderModule(device, sms[ii], ac); + disp->DestroyShaderModule(_device, sms[ii], ac); } fail_layout: for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipelineLayout(device, rs->pipeline_layouts.handles[ii], ac); + disp->DestroyPipelineLayout(_device, rs->pipeline_layouts.handles[ii], ac); } free(rs); @@ -420,18 +487,22 @@ fail_layout: void radix_sort_vk_destroy(struct radix_sort_vk * rs, VkDevice d, VkAllocationCallbacks const * const ac) { + VK_FROM_HANDLE(vk_device, device, d); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + uint32_t const pipeline_count = rs_pipeline_count(rs); // destroy pipelines for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipeline(d, rs->pipelines.handles[ii], ac); + disp->DestroyPipeline(d, rs->pipelines.handles[ii], ac); } // destroy pipeline layouts for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipelineLayout(d, rs->pipeline_layouts.handles[ii], ac); + disp->DestroyPipelineLayout(d, rs->pipeline_layouts.handles[ii], ac); } free(rs); @@ -441,8 +512,12 @@ radix_sort_vk_destroy(struct radix_sort_vk * rs, VkDevice d, VkAllocationCallbac // // static VkDeviceAddress -rs_get_devaddr(VkDevice device, VkDescriptorBufferInfo const * dbi) +rs_get_devaddr(VkDevice _device, VkDescriptorBufferInfo const * dbi) { + 
VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + VkBufferDeviceAddressInfo const bdai = { .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, @@ -450,7 +525,7 @@ rs_get_devaddr(VkDevice device, VkDescriptorBufferInfo const * dbi) .buffer = dbi->buffer }; - VkDeviceAddress const devaddr = vkGetBufferDeviceAddress(device, &bdai) + dbi->offset; + VkDeviceAddress const devaddr = disp->GetBufferDeviceAddress(_device, &bdai) + dbi->offset; return devaddr; } @@ -465,13 +540,17 @@ rs_ext_cmd_write_timestamp(struct radix_sort_vk_ext_timestamps * ext_timestamps, VkCommandBuffer cb, VkPipelineStageFlagBits pipeline_stage) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + if ((ext_timestamps != NULL) && (ext_timestamps->timestamps_set < ext_timestamps->timestamp_count)) { - vkCmdWriteTimestamp(cb, - pipeline_stage, - ext_timestamps->timestamps, - ext_timestamps->timestamps_set++); + disp->CmdWriteTimestamp(cb, + pipeline_stage, + ext_timestamps->timestamps, + ext_timestamps->timestamps_set++); } } @@ -497,10 +576,14 @@ struct radix_sort_vk_ext_base void radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, radix_sort_vk_sort_devaddr_info_t const * info, - VkDevice device, + VkDevice _device, VkCommandBuffer cb, VkDeviceAddress * keyvals_sorted) { + VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + // // Anything to do? // @@ -557,16 +640,13 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, // Label the command buffer // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdBeginDebugUtilsLabelEXT != NULL) - { - VkDebugUtilsLabelEXT const label = { - .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, - .pNext = NULL, - .pLabelName = "radix_sort_vk_sort", - }; + VkDebugUtilsLabelEXT const label = { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pNext = NULL, + .pLabelName = "radix_sort_vk_sort", + }; - pfn_vkCmdBeginDebugUtilsLabelEXT(cb, &label); - } + disp->CmdBeginDebugUtilsLabelEXT(cb, &label); #endif // @@ -679,16 +759,16 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, .passes = passes }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_histogram), &push_histogram); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); - vkCmdDispatch(cb, histo_blocks, 1, 1); + disp->CmdDispatch(cb, histo_blocks, 1, 1); //////////////////////////////////////////////////////////////////////// // @@ -707,16 +787,16 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, .devaddr_histograms = devaddr_histograms, }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_prefix), &push_prefix); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - vkCmdDispatch(cb, passes, 1, 1); + disp->CmdDispatch(cb, passes, 1, 1); //////////////////////////////////////////////////////////////////////// // @@ -746,14 +826,14 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, { uint32_t const pass_dword = pass_idx / 4; - vkCmdPushConstants(cb, + 
disp->CmdPushConstants(cb, rs->pipeline_layouts.named.scatter[pass_dword].even, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_scatter), &push_scatter); - vkCmdBindPipeline(cb, + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.scatter[pass_dword].even); } @@ -762,7 +842,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, while (true) { - vkCmdDispatch(cb, scatter_blocks, 1, 1); + disp->CmdDispatch(cb, scatter_blocks, 1, 1); // // Continue? @@ -788,7 +868,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, // VkPipelineLayout const pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even // : rs->pipeline_layouts.named.scatter[pass_dword].odd; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, pl, VK_SHADER_STAGE_COMPUTE_BIT, OFFSETOF_MACRO(struct rs_push_scatter, devaddr_histograms), @@ -801,7 +881,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, VkPipeline const p = is_even ? rs->pipelines.named.scatter[pass_dword].even // : rs->pipelines.named.scatter[pass_dword].odd; - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); } #ifdef RS_VK_ENABLE_EXTENSIONS @@ -812,10 +892,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, // End the label // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdEndDebugUtilsLabelEXT != NULL) - { - pfn_vkCmdEndDebugUtilsLabelEXT(cb); - } + disp->CmdEndDebugUtilsLabelEXT(cb); #endif } @@ -825,10 +902,14 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, void radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * rs, radix_sort_vk_sort_indirect_devaddr_info_t const * info, - VkDevice device, + VkDevice _device, VkCommandBuffer cb, VkDeviceAddress * keyvals_sorted) { + VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + // // Anything to do? 
// @@ -886,16 +967,13 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * // Label the command buffer // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdBeginDebugUtilsLabelEXT != NULL) - { - VkDebugUtilsLabelEXT const label = { - .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, - .pNext = NULL, - .pLabelName = "radix_sort_vk_sort_indirect", - }; + VkDebugUtilsLabelEXT const label = { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pNext = NULL, + .pLabelName = "radix_sort_vk_sort_indirect", + }; - pfn_vkCmdBeginDebugUtilsLabelEXT(cb, &label); - } + disp->CmdBeginDebugUtilsLabelEXT(cb, &label); #endif // @@ -938,16 +1016,16 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .passes = passes }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.init, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_init), &push_init); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.init); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.init); - vkCmdDispatch(cb, 1, 1, 1); + disp->CmdDispatch(cb, 1, 1, 1); } #ifdef RS_VK_ENABLE_EXTENSIONS @@ -967,14 +1045,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .dword = 0xFFFFFFFF }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.fill, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_pad), &push_pad); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); info->dispatch_indirect(cb, &info->indirect, offsetof(struct rs_indirect_info, dispatch.pad)); } @@ -992,14 +1070,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .dword = 0 }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.fill, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_zero), &push_zero); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); info->dispatch_indirect(cb, &info->indirect, offsetof(struct rs_indirect_info, dispatch.zero)); } @@ -1021,14 +1099,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .passes = passes }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_histogram), &push_histogram); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); info->dispatch_indirect(cb, &info->indirect, @@ -1049,16 +1127,16 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .devaddr_histograms = devaddr_histograms, }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_prefix), &push_prefix); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - vkCmdDispatch(cb, passes, 1, 1); + disp->CmdDispatch(cb, passes, 1, 1); } #ifdef RS_VK_ENABLE_EXTENSIONS @@ -1088,14 +1166,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * { uint32_t const pass_dword = pass_idx / 4; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.scatter[pass_dword].even, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_scatter), 
&push_scatter); - vkCmdBindPipeline(cb, + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.scatter[pass_dword].even); } @@ -1134,7 +1212,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * VkPipelineLayout const pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even // : rs->pipeline_layouts.named.scatter[pass_dword].odd; - vkCmdPushConstants( + disp->CmdPushConstants( cb, pl, VK_SHADER_STAGE_COMPUTE_BIT, @@ -1148,7 +1226,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * VkPipeline const p = is_even ? rs->pipelines.named.scatter[pass_dword].even // : rs->pipelines.named.scatter[pass_dword].odd; - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); } } @@ -1160,10 +1238,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * // End the label // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdEndDebugUtilsLabelEXT != NULL) - { - pfn_vkCmdEndDebugUtilsLabelEXT(cb); - } + disp->CmdEndDebugUtilsLabelEXT(cb); #endif } @@ -1177,7 +1252,11 @@ radix_sort_vk_fill_buffer(VkCommandBuffer cb, VkDeviceSize size, uint32_t data) { - vkCmdFillBuffer(cb, buffer_info->buffer, buffer_info->offset + offset, size, data); + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + + disp->CmdFillBuffer(cb, buffer_info->buffer, buffer_info->offset + offset, size, data); } // @@ -1221,7 +1300,11 @@ radix_sort_vk_dispatch_indirect(VkCommandBuffer cb, radix_sort_vk_buffer_info_t const * buffer_info, VkDeviceSize offset) { - vkCmdDispatchIndirect(cb, buffer_info->buffer, buffer_info->offset + offset); + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + + disp->CmdDispatchIndirect(cb, buffer_info->buffer, buffer_info->offset + offset); } // diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk.h b/src/vulkan/runtime/radix_sort/radix_sort_vk.h similarity index 100% rename from src/amd/vulkan/radix_sort/radix_sort_vk.h rename to src/vulkan/runtime/radix_sort/radix_sort_vk.h diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h b/src/vulkan/runtime/radix_sort/radix_sort_vk_devaddr.h similarity index 100% rename from src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h rename to src/vulkan/runtime/radix_sort/radix_sort_vk_devaddr.h diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk_ext.h b/src/vulkan/runtime/radix_sort/radix_sort_vk_ext.h similarity index 100% rename from src/amd/vulkan/radix_sort/radix_sort_vk_ext.h rename to src/vulkan/runtime/radix_sort/radix_sort_vk_ext.h diff --git a/src/amd/vulkan/radix_sort/shaders/bufref.h b/src/vulkan/runtime/radix_sort/shaders/bufref.h similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/bufref.h rename to src/vulkan/runtime/radix_sort/shaders/bufref.h diff --git a/src/vulkan/runtime/radix_sort/shaders/config.h b/src/vulkan/runtime/radix_sort/shaders/config.h new file mode 100644 index 00000000000..702f1649605 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/shaders/config.h @@ -0,0 +1,33 @@ +// Copyright 2024 Valve Corporation +// SPDX-License-Identifier: MIT + +#ifdef VULKAN +#define CONFIG(_name, _id, default_val) layout (constant_id = _id) const int _name = default_val; +#else +enum rs_config { +#define CONFIG(_name, _id, default_val) _name = _id, +#endif + +#define RS_FILL_WORKGROUP_SIZE_ID 0 +CONFIG(RS_FILL_WORKGROUP_SIZE, 
RS_FILL_WORKGROUP_SIZE_ID, 7) +CONFIG(RS_FILL_BLOCK_ROWS, 1, 8) + +#define RS_HISTOGRAM_WORKGROUP_SIZE_ID 2 +CONFIG(RS_HISTOGRAM_WORKGROUP_SIZE, RS_HISTOGRAM_WORKGROUP_SIZE_ID, 7) +CONFIG(RS_HISTOGRAM_SUBGROUP_SIZE_LOG2, 3, 7) +CONFIG(RS_HISTOGRAM_BLOCK_ROWS, 4, 8) + +#define RS_PREFIX_WORKGROUP_SIZE_ID 5 +CONFIG(RS_PREFIX_WORKGROUP_SIZE, RS_PREFIX_WORKGROUP_SIZE_ID, 8) +CONFIG(RS_PREFIX_SUBGROUP_SIZE_LOG2, 6, 6) + +#define RS_SCATTER_WORKGROUP_SIZE_ID 7 +CONFIG(RS_SCATTER_WORKGROUP_SIZE, RS_SCATTER_WORKGROUP_SIZE_ID, 8) +CONFIG(RS_SCATTER_SUBGROUP_SIZE_LOG2, 8, 6) +CONFIG(RS_SCATTER_BLOCK_ROWS, 9, 14) + +CONFIG(RS_SCATTER_NONSEQUENTIAL_DISPATCH, 10, 0) + +#ifndef VULKAN +}; +#endif diff --git a/src/amd/vulkan/radix_sort/shaders/fill.comp b/src/vulkan/runtime/radix_sort/shaders/fill.comp similarity index 89% rename from src/amd/vulkan/radix_sort/shaders/fill.comp rename to src/vulkan/runtime/radix_sort/shaders/fill.comp index 76b446d8c5d..c85d650d0ff 100644 --- a/src/amd/vulkan/radix_sort/shaders/fill.comp +++ b/src/vulkan/runtime/radix_sort/shaders/fill.comp @@ -49,23 +49,11 @@ layout(push_constant) uniform block_push // RS_STRUCT_INDIRECT_INFO_FILL(); -// -// Check all switches are defined -// -#ifndef RS_FILL_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_FILL_WORKGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_FILL_BLOCK_ROWS -#error "Undefined: RS_FILL_BLOCK_ROWS" -#endif - // // Local macros // // clang-format off -#define RS_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_FILL_WORKGROUP_SIZE) #define RS_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_WORKGROUP_SIZE) #define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1) // clang-format on @@ -73,7 +61,7 @@ RS_STRUCT_INDIRECT_INFO_FILL(); // // // -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_FILL_WORKGROUP_SIZE_ID) in; // // diff --git a/src/amd/vulkan/radix_sort/shaders/histogram.comp b/src/vulkan/runtime/radix_sort/shaders/histogram.comp similarity index 78% rename from src/amd/vulkan/radix_sort/shaders/histogram.comp rename to src/vulkan/runtime/radix_sort/shaders/histogram.comp index 7d554630fe5..0eb078807b7 100644 --- a/src/amd/vulkan/radix_sort/shaders/histogram.comp +++ b/src/vulkan/runtime/radix_sort/shaders/histogram.comp @@ -61,26 +61,11 @@ layout(push_constant) uniform block_push #error "Undefined: RS_KEYVAL_DWORDS" #endif -// -#ifndef RS_HISTOGRAM_BLOCK_ROWS -#error "Undefined: RS_HISTOGRAM_BLOCK_ROWS" -#endif - -// -#ifndef RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_HISTOGRAM_WORKGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 -#error "Undefined: RS_HISTOGRAM_SUBGROUP_SIZE_LOG2" -#endif - // // Local macros // // clang-format off -#define RS_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_HISTOGRAM_WORKGROUP_SIZE) #define RS_SUBGROUP_SIZE (1 << RS_HISTOGRAM_SUBGROUP_SIZE_LOG2) #define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE) #define RS_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_WORKGROUP_SIZE) @@ -104,11 +89,8 @@ layout(push_constant) uniform block_push // #define RS_HISTOGRAM_BASE(pass_) ((RS_RADIX_SIZE * 4) * pass_) -#if (RS_WORKGROUP_SUBGROUPS == 1) -#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_SubgroupInvocationID * 4) -#else -#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_LocalInvocationID.x * 4) -#endif +#define RS_HISTOGRAM_OFFSET(pass_) \ + RS_HISTOGRAM_BASE(pass_) + (RS_WORKGROUP_SUBGROUPS == 1 ? 
gl_SubgroupInvocationID : gl_LocalInvocationID.x) * 4 // // Assumes (RS_RADIX_LOG2 == 8) @@ -167,7 +149,7 @@ shared rs_histogram_smem smem; // // // -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_HISTOGRAM_WORKGROUP_SIZE_ID) in; // // @@ -196,41 +178,38 @@ rs_histogram_zero() // // Zero SMEM histogram // -#if (RS_WORKGROUP_SUBGROUPS == 1) - - const uint32_t smem_offset = gl_SubgroupInvocationID; - - [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - smem.histogram[smem_offset + ii] = 0; - } + const uint32_t smem_offset = gl_SubgroupInvocationID; -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - - const uint32_t smem_offset = gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - smem.histogram[smem_offset + ii] = 0; - } - - const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); - - if (smem_idx < RS_RADIX_SIZE) + [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - smem.histogram[smem_idx] = 0; + smem.histogram[smem_offset + ii] = 0; + } + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + const uint32_t smem_offset = gl_LocalInvocationID.x; + + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) + { + smem.histogram[smem_offset + ii] = 0; } -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif - { - smem.histogram[gl_LocalInvocationID.x] = 0; - } - -#endif + if (smem_idx < RS_RADIX_SIZE) + { + smem.histogram[smem_idx] = 0; + } + } + else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + { + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + smem.histogram[gl_LocalInvocationID.x] = 0; + } + } } // @@ -242,50 +221,47 @@ rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms) // // Store to GMEM // -#if (RS_WORKGROUP_SUBGROUPS == 1) - - const uint32_t smem_offset = gl_SubgroupInvocationID; - - [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - const uint32_t count = smem.histogram[smem_offset + ii]; + const uint32_t smem_offset = gl_SubgroupInvocationID; - atomicAdd(rs_histograms.extent[ii], count); - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - - const uint32_t smem_offset = gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - const uint32_t count = smem.histogram[smem_offset + ii]; - - atomicAdd(rs_histograms.extent[ii], count); - } - - const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); - - if (smem_idx < RS_RADIX_SIZE) + [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - const uint32_t count = smem.histogram[smem_idx]; + const uint32_t count = smem.histogram[smem_offset + ii]; - atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)], - count); + atomicAdd(rs_histograms.extent[ii], count); + } + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + const uint32_t smem_offset = gl_LocalInvocationID.x; + + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) + { + const uint32_t count 
= smem.histogram[smem_offset + ii]; + + atomicAdd(rs_histograms.extent[ii], count); } -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif - { - const uint32_t count = smem.histogram[gl_LocalInvocationID.x]; + if (smem_idx < RS_RADIX_SIZE) + { + const uint32_t count = smem.histogram[smem_idx]; - atomicAdd(rs_histograms.extent[0], count); - } + atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)], + count); + } + } + else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + { + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + const uint32_t count = smem.histogram[gl_LocalInvocationID.x]; -#endif + atomicAdd(rs_histograms.extent[0], count); + } + } } #endif @@ -298,21 +274,19 @@ rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms) void rs_histogram_atomic_after_write() { -#if (RS_WORKGROUP_SUBGROUPS == 1) - subgroupMemoryBarrierShared(); -#else - barrier(); -#endif + if (RS_WORKGROUP_SUBGROUPS == 1) + subgroupMemoryBarrierShared(); + else + barrier(); } void rs_histogram_read_after_atomic() { -#if (RS_WORKGROUP_SUBGROUPS == 1) - subgroupMemoryBarrierShared(); -#else - barrier(); -#endif + if (RS_WORKGROUP_SUBGROUPS == 1) + subgroupMemoryBarrierShared(); + else + barrier(); } #endif diff --git a/src/amd/vulkan/radix_sort/shaders/init.comp b/src/vulkan/runtime/radix_sort/shaders/init.comp similarity index 76% rename from src/amd/vulkan/radix_sort/shaders/init.comp rename to src/vulkan/runtime/radix_sort/shaders/init.comp index 1ffd48d79df..5865be65488 100644 --- a/src/amd/vulkan/radix_sort/shaders/init.comp +++ b/src/vulkan/runtime/radix_sort/shaders/init.comp @@ -53,9 +53,9 @@ RS_STRUCT_INDIRECT_INFO(); // Local macros // // clang-format off -#define RS_FILL_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2) -#define RS_SCATTER_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2) -#define RS_HISTOGRAM_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2) +#define RS_FILL_WORKGROUP_SIZE (RS_FILL_WORKGROUP_SIZE) +#define RS_SCATTER_WORKGROUP_SIZE (RS_SCATTER_WORKGROUP_SIZE) +#define RS_HISTOGRAM_WORKGROUP_SIZE (RS_HISTOGRAM_WORKGROUP_SIZE) #define RS_FILL_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE) #define RS_SCATTER_BLOCK_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE) @@ -150,12 +150,34 @@ main() // 256-dword partitions directly follow the 256-dword histograms, we // can dispatch just one FILL. // + // The "internal" memory map looks like this: + // + // +---------------------------------+ <-- 0 + // | histograms[keyval_size] | + // +---------------------------------+ <-- keyval_size * histo_dwords + // | partitions[scatter_blocks_ru-1] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords + // | workgroup_ids[keyval_size] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords + keyval_size + // + // NOTE(allanmac): The `.block_offset` and `.dword_offset_min` + // parameters are zeroes because the host can offset the buffer + // device address since the number of passes is known by the host. + // If we ever wanted to supported an indirect number of "key" bits + // in the sort, then this would need to change. 
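Both radix_sort_vk_get_memory_requirements and this init shader describe the same internal memory map: one histogram per keyval byte, the scatter partitions, and, when nonsequential dispatch is enabled, one workgroup-id counter per pass. A C sketch of the size computation for the u64 configuration; RS_RADIX_SIZE of 256 and the 8-byte keyval follow the patch, while the scatter block count is an example value:

#include <stdint.h>
#include <stdio.h>

#define RS_RADIX_SIZE 256u   /* 2^RS_RADIX_LOG2, one 8-bit digit per pass */

static uint64_t
internal_size(uint32_t keyval_size, uint32_t scatter_blocks)
{
   const uint64_t histo_size = RS_RADIX_SIZE * sizeof(uint32_t);

   /* histograms[keyval_size] followed by partitions[scatter_blocks - 1]... */
   uint64_t size = (keyval_size + scatter_blocks - 1) * histo_size;

   /* ...plus workgroup_ids[keyval_size] for nonsequential dispatch. */
   size += keyval_size * sizeof(uint32_t);
   return size;
}

int main(void)
{
   /* 8-byte keyvals (the u64 sort) and, say, 100 scatter workgroups. */
   printf("%llu bytes\n", (unsigned long long)internal_size(8, 100));
   return 0;
}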
+ // + // NOTE(allanmac): The `.workgroup_ids[]` are only used if + // nonsequential dispatch isn't supported by the device. + // rs_indirect_info_fill zero; zero.block_offset = 0; zero.dword_offset_min = 0; zero.dword_offset_max_minus_min = (push.passes + scatter_ru_blocks - 1) * RS_RADIX_SIZE; + if (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0) + zero.dword_offset_max_minus_min += (RS_KEYVAL_DWORDS * 4); // one pass per byte + const uint32_t zero_ru_blocks = RS_COUNT_RU_BLOCKS(zero.dword_offset_max_minus_min, RS_FILL_BLOCK_DWORDS); diff --git a/src/vulkan/runtime/radix_sort/shaders/meson.build b/src/vulkan/runtime/radix_sort/shaders/meson.build new file mode 100644 index 00000000000..4152735b730 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/shaders/meson.build @@ -0,0 +1,53 @@ +# Copyright © 2022 Konstantin Seurer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +radix_sort_shaders = [ + 'init.comp', + 'fill.comp', + 'histogram.comp', + 'prefix.comp', + 'scatter_0_even.comp', + 'scatter_0_odd.comp', + 'scatter_1_even.comp', + 'scatter_1_odd.comp' +] + +shader_include_files = files( + 'bufref.h', + 'prefix_limits.h', + 'prefix.h', + 'push.h', + 'scatter.glsl', + 'config.h', +) + +defines = ['-DRS_KEYVAL_DWORDS=2'] + +radix_sort_spv = [] +foreach s : radix_sort_shaders + radix_sort_spv += custom_target( + s + '.spv.h', + input : s, + output : s + '.spv.h', + command : [ + prog_glslang, '-V', '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + ] + defines + glslang_quiet + (with_mesa_debug ? 
['-g'] : []), + depend_files: shader_include_files) +endforeach diff --git a/src/amd/vulkan/radix_sort/shaders/prefix.comp b/src/vulkan/runtime/radix_sort/shaders/prefix.comp similarity index 69% rename from src/amd/vulkan/radix_sort/shaders/prefix.comp rename to src/vulkan/runtime/radix_sort/shaders/prefix.comp index aae88869a6e..650d3305fd6 100644 --- a/src/amd/vulkan/radix_sort/shaders/prefix.comp +++ b/src/vulkan/runtime/radix_sort/shaders/prefix.comp @@ -46,41 +46,20 @@ layout(push_constant) uniform block_push #define RS_SUBGROUP_UNIFORM #endif -// -// Check all switches are defined -// -// -#ifndef RS_PREFIX_SUBGROUP_SIZE_LOG2 -#error "Undefined: RS_PREFIX_SUBGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_PREFIX_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_PREFIX_WORKGROUP_SIZE_LOG2" -#endif - // // Local macros // // clang-format off #define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4) -#define RS_WORKGROUP_SIZE (1 << RS_PREFIX_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_PREFIX_WORKGROUP_SIZE) #define RS_SUBGROUP_SIZE (1 << RS_PREFIX_SUBGROUP_SIZE_LOG2) #define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE) // clang-format on // -// There is no purpose in having a workgroup size larger than the -// radix size. -// -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) -#error "Error: (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)" -#endif - // // -// -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_PREFIX_WORKGROUP_SIZE_ID) in; // // Histogram buffer reference @@ -95,34 +74,23 @@ layout(buffer_reference, std430) buffer buffer_rs_histograms // #include "prefix_limits.h" -// -// If multi-subgroup then define shared memory -// -#if (RS_WORKGROUP_SUBGROUPS > 1) - //---------------------------------------- shared uint32_t smem_sweep0[RS_SWEEP_0_SIZE]; #define RS_PREFIX_SWEEP0(idx_) smem_sweep0[idx_] //---------------------------------------- -#if (RS_SWEEP_1_SIZE > 0) //---------------------------------------- shared uint32_t smem_sweep1[RS_SWEEP_1_SIZE]; #define RS_PREFIX_SWEEP1(idx_) smem_sweep1[idx_] //---------------------------------------- -#endif -#if (RS_SWEEP_2_SIZE > 0) //---------------------------------------- shared uint32_t smem_sweep2[RS_SWEEP_2_SIZE]; #define RS_PREFIX_SWEEP2(idx_) smem_sweep2[idx_] //---------------------------------------- -#endif - -#endif // // Define function arguments @@ -151,37 +119,21 @@ main() // // Define buffer reference to read histograms // -#if (RS_WORKGROUP_SUBGROUPS == 1) - // - // Define histograms bufref for single subgroup - // // NOTE(allanmac): The histogram buffer reference could be adjusted // on the host to save a couple instructions at the cost of added // complexity. // + const uint32_t invocation_id = RS_WORKGROUP_SUBGROUPS == 1 ? 
gl_SubgroupInvocationID : gl_LocalInvocationID.x; + RS_SUBGROUP_UNIFORM const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE); - const uint32_t histograms_offset = (histograms_base + gl_SubgroupInvocationID) * 4; + const uint32_t histograms_offset = (histograms_base + invocation_id) * 4; RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, rs_histograms, push.devaddr_histograms, histograms_offset); -#else - // - // Define histograms bufref for workgroup - // - RS_SUBGROUP_UNIFORM - const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE); - const uint32_t histograms_offset = (histograms_base + gl_LocalInvocationID.x) * 4; - - RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, - rs_histograms, - push.devaddr_histograms, - histograms_offset); - -#endif // // Compute exclusive prefix of uint32_t[256] diff --git a/src/vulkan/runtime/radix_sort/shaders/prefix.h b/src/vulkan/runtime/radix_sort/shaders/prefix.h new file mode 100644 index 00000000000..f9582da0067 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/shaders/prefix.h @@ -0,0 +1,356 @@ +// Copyright 2021 The Fuchsia Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ +#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ + +// +// Requires several defines +// +#ifndef RS_PREFIX_LIMITS +#error "Error: \"prefix_limits.h\" not loaded" +#endif + +#ifndef RS_PREFIX_ARGS +#error "Error: RS_PREFIX_ARGS undefined" +#endif + +#ifndef RS_PREFIX_LOAD +#error "Error: RS_PREFIX_LOAD undefined" +#endif + +#ifndef RS_PREFIX_STORE +#error "Error: RS_PREFIX_STORE undefined" +#endif + +// +// Optional switches: +// +// * Disable holding original inclusively scanned histogram values in registers. +// +// #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS +// + +// +// Compute exclusive prefix of uint32_t[256] +// +void +rs_prefix(RS_PREFIX_ARGS) +{ + if (RS_WORKGROUP_SUBGROUPS == 1) + { + // + // Workgroup is a single subgroup so no shared memory is required. + // + + // + // Exclusive scan-add the histogram + // + const uint32_t h0 = RS_PREFIX_LOAD(0); + const uint32_t h0_inc = subgroupInclusiveAdd(h0); + RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + + RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive + + // + // Each iteration is dependent on the previous so no unrolling. The + // compiler is free to hoist the loads upward though. + // + for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; // + ii < RS_RADIX_SIZE; + ii += RS_SUBGROUP_SIZE) + { + const uint32_t h = RS_PREFIX_LOAD(ii); + const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last; + h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); + + RS_PREFIX_STORE(ii) = h_inc - h; // exclusive + } + } + else + { + // + // Workgroup is multiple subgroups and uses shared memory to store + // the scan's intermediate results. + // + // Assumes a power-of-two subgroup, workgroup and radix size. + // + // Downsweep: Repeatedly scan reductions until they fit in a single + // subgroup. + // + // Upsweep: Then uniformly apply reductions to each subgroup. 
+ // + // + // Subgroup Size | 4 | 8 | 16 | 32 | 64 | 128 | + // --------------+----+----+----+----+----+-----+ + // Sweep 0 | 64 | 32 | 16 | 8 | 4 | 2 | sweep_0[] + // Sweep 1 | 16 | 4 | - | - | - | - | sweep_1[] + // Sweep 2 | 4 | - | - | - | - | - | sweep_2[] + // --------------+----+----+----+----+----+-----+ + // Total dwords | 84 | 36 | 16 | 8 | 4 | 2 | + // --------------+----+----+----+----+----+-----+ + // +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + uint32_t h_exc[RS_H_COMPONENTS]; +#endif + + // + // Downsweep 0 + // + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + const uint32_t h_inc = subgroupInclusiveAdd(h); + + const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); + + // +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + h_exc[ii] = h_inc - h; +#else + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h; +#endif + } + + barrier(); + + // + // Skip generalizing these sweeps for all possible subgroups -- just + // write them directly. + // + if (RS_SUBGROUP_SIZE == 128) + { + // There are only two elements in SWEEP0 per subgroup. The scan is + // trivial so we fold it into the upsweep. + } + else if (RS_SUBGROUP_SIZE >= 16) + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 + // + if (RS_SWEEP_0_SIZE != RS_WORKGROUP_SIZE && // workgroup has inactive components + gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) + { + const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; + } + + barrier(); + } + else if (RS_SUBGROUP_SIZE == 8) + { + if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 and Downsweep 1 + // + if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations + { + const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + else + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 and Downsweep 1 + // + [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; + const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + + barrier(); + + // + // Scan 1 + // + if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations + { + const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); + const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); + + RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; + } + + barrier(); + } + else if (RS_SUBGROUP_SIZE == 4) + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 and Downsweep 1 + // + if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) + { + if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations + { + const uint32_t h0_red = 
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + else + { + [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; + const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + + barrier(); + + // + // Scan 1 and Downsweep 2 + // + if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE) + { + if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations + { + const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); + const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); + + RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; + RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); + } + } + else + { + [[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations + { + const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; + const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1); + const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); + + RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red; + RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); + } + } + + barrier(); + + // + // Scan 2 + // + // 4 invocations + // + if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE) + { + const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x); + const uint32_t h2_inc = subgroupInclusiveAdd(h2_red); + + RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red; + } + + barrier(); + } + + ////////////////////////////////////////////////////////////////////// + // + // Final upsweep 0 + // + if (RS_SUBGROUP_SIZE == 128) + { + // There must be more than one subgroup per workgroup, but the maximum + // workgroup size is 256 so there must be exactly two subgroups per + // workgroup and RS_H_COMPONENTS must be 1. +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(0) = h_exc[0] + (gl_SubgroupID > 0 ? RS_PREFIX_SWEEP0(0) : 0); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(0); + + RS_PREFIX_STORE(0) = h_exc + (gl_SubgroupID > 0 ? 
RS_PREFIX_SWEEP0(0) : 0); +#endif + } + else if (RS_SUBGROUP_SIZE >= 16) + { + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + // clang format issue +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0); +#endif + } + } + else if (RS_SUBGROUP_SIZE == 8) + { + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; + +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); +#endif + } + } + else if (RS_SUBGROUP_SIZE == 4) + { + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; + const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE; + +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); +#endif + } + } + } +} + +// +// +// + +#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ diff --git a/src/amd/vulkan/radix_sort/shaders/prefix_limits.h b/src/vulkan/runtime/radix_sort/shaders/prefix_limits.h similarity index 71% rename from src/amd/vulkan/radix_sort/shaders/prefix_limits.h rename to src/vulkan/runtime/radix_sort/shaders/prefix_limits.h index a98e554ad4a..4d0e89fb9c2 100644 --- a/src/amd/vulkan/radix_sort/shaders/prefix_limits.h +++ b/src/vulkan/runtime/radix_sort/shaders/prefix_limits.h @@ -10,17 +10,12 @@ // #define RS_PREFIX_LIMITS -// -// Multi-subgroup prefix requires shared memory. -// -#if (RS_WORKGROUP_SUBGROUPS > 1) - // clang-format off #define RS_H_COMPONENTS (RS_RADIX_SIZE / RS_WORKGROUP_SIZE) -#define RS_SWEEP_0_SIZE (RS_RADIX_SIZE / RS_SUBGROUP_SIZE) -#define RS_SWEEP_1_SIZE (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE) -#define RS_SWEEP_2_SIZE (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE) +#define RS_SWEEP_0_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_RADIX_SIZE / RS_SUBGROUP_SIZE)) +#define RS_SWEEP_1_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE)) +#define RS_SWEEP_2_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE)) #define RS_SWEEP_SIZE (RS_SWEEP_0_SIZE + RS_SWEEP_1_SIZE + RS_SWEEP_2_SIZE) @@ -32,15 +27,6 @@ #define RS_SWEEP_2_OFFSET (RS_SWEEP_1_OFFSET + RS_SWEEP_1_SIZE) // clang-format on -// -// Single subgroup prefix doesn't use shared memory. 
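Stripped of the subgroup machinery, rs_prefix() above computes a plain exclusive prefix sum over the 256-entry digit histogram; the downsweep/upsweep passes are just a work-efficient way to get the same answer in parallel. A scalar C model of the result it produces, for reference (illustrative code, not part of the patch):

   #include <stdint.h>

   #define RS_RADIX_SIZE 256

   /* Scalar model of rs_prefix(): replace each digit's count with the
    * number of keyvals whose digit is strictly smaller. */
   static void
   rs_prefix_scalar(uint32_t histogram[RS_RADIX_SIZE])
   {
      uint32_t sum = 0;

      for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii++) {
         const uint32_t count = histogram[ii];

         histogram[ii] = sum;   /* exclusive prefix */
         sum += count;
      }
   }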
-// -#else - -#define RS_SWEEP_SIZE 0 - -#endif - // // // diff --git a/src/amd/vulkan/radix_sort/shaders/push.h b/src/vulkan/runtime/radix_sort/shaders/push.h similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/push.h rename to src/vulkan/runtime/radix_sort/shaders/push.h diff --git a/src/amd/vulkan/radix_sort/shaders/scatter.glsl b/src/vulkan/runtime/radix_sort/shaders/scatter.glsl similarity index 58% rename from src/amd/vulkan/radix_sort/shaders/scatter.glsl rename to src/vulkan/runtime/radix_sort/shaders/scatter.glsl index b57d9e80850..bacd44682f5 100644 --- a/src/amd/vulkan/radix_sort/shaders/scatter.glsl +++ b/src/vulkan/runtime/radix_sort/shaders/scatter.glsl @@ -84,21 +84,6 @@ layout(push_constant) uniform block_push #error "Undefined: RS_SCATTER_KEYVAL_DWORD_BASE" #endif -// -#ifndef RS_SCATTER_BLOCK_ROWS -#error "Undefined: RS_SCATTER_BLOCK_ROWS" -#endif - -// -#ifndef RS_SCATTER_SUBGROUP_SIZE_LOG2 -#error "Undefined: RS_SCATTER_SUBGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_SCATTER_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_SCATTER_WORKGROUP_SIZE_LOG2" -#endif - // // Status masks are defined differently for the scatter_even and // scatter_odd shaders. @@ -140,7 +125,7 @@ layout(push_constant) uniform block_push // // clang-format off #define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4) -#define RS_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_SCATTER_WORKGROUP_SIZE) #define RS_SUBGROUP_SIZE (1 << RS_SCATTER_SUBGROUP_SIZE_LOG2) #define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE) #define RS_SUBGROUP_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SUBGROUP_SIZE) @@ -148,13 +133,6 @@ layout(push_constant) uniform block_push #define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1) // clang-format on -// -// Validate number of keyvals fit in a uint16_t. -// -#if (RS_BLOCK_KEYVALS >= 65536) -#error "Error: (RS_BLOCK_KEYVALS >= 65536)" -#endif - // // Keyval type // @@ -181,9 +159,7 @@ layout(push_constant) uniform block_push // Determine at compile time the base of the final iteration for // workgroups smaller than RS_RADIX_SIZE. // -#if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) #define RS_WORKGROUP_BASE_FINAL ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE) -#endif // // Max macro @@ -291,7 +267,7 @@ layout(push_constant) uniform block_push // // // -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_SCATTER_WORKGROUP_SIZE_ID) in; // // @@ -325,48 +301,55 @@ shared rs_scatter_smem smem; // The shared memory barrier is either subgroup-wide or // workgroup-wide. 
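Since the workgroup sizes are now supplied through local_size_x_id rather than baked in with the *_WORKGROUP_SIZE_LOG2 defines, the former preprocessor branches become ordinary if statements on values that are constant once the pipeline is specialized, and the compiler can fold them away. A host-side sketch of how such a specialization constant is supplied (the constant ID and the size value below are placeholders; the real ones come from the runtime's config headers):

   #include <stdint.h>
   #include <vulkan/vulkan.h>

   /* Provide the scatter workgroup size as a specialization constant.
    * constantID 1 and the value 64 are assumptions for illustration. */
   static VkSpecializationInfo
   rs_scatter_spec_info(void)
   {
      static const uint32_t workgroup_size = 64;

      static const VkSpecializationMapEntry entry = {
         .constantID = 1,               /* RS_SCATTER_WORKGROUP_SIZE_ID (assumed) */
         .offset     = 0,
         .size       = sizeof(uint32_t),
      };

      const VkSpecializationInfo info = {
         .mapEntryCount = 1,
         .pMapEntries   = &entry,
         .dataSize      = sizeof(workgroup_size),
         .pData         = &workgroup_size,
      };

      return info;
   }

The returned struct would be passed as pSpecializationInfo in the VkPipelineShaderStageCreateInfo used to build the scatter pipelines.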
// -#if (RS_WORKGROUP_SUBGROUPS == 1) -#define RS_BARRIER() subgroupBarrier() -#else -#define RS_BARRIER() barrier() -#endif +void rsBarrier() +{ + if (RS_WORKGROUP_SUBGROUPS == 1) + subgroupBarrier(); + else + barrier(); +} // // If multi-subgroup then define shared memory // -#if (RS_WORKGROUP_SUBGROUPS > 1) //---------------------------------------- #define RS_PREFIX_SWEEP0(idx_) smem.extent[RS_SMEM_PREFIX_OFFSET + RS_SWEEP_0_OFFSET + (idx_)] //---------------------------------------- -#if (RS_SWEEP_1_SIZE > 0) //---------------------------------------- #define RS_PREFIX_SWEEP1(idx_) smem.extent[RS_SMEM_PREFIX_OFFSET + RS_SWEEP_1_OFFSET + (idx_)] //---------------------------------------- -#endif -#if (RS_SWEEP_2_SIZE > 0) //---------------------------------------- #define RS_PREFIX_SWEEP2(idx_) smem.extent[RS_SMEM_PREFIX_OFFSET + RS_SWEEP_2_OFFSET + (idx_)] //---------------------------------------- -#endif -#endif +uint32_t +invocation_id() +{ + return RS_WORKGROUP_SUBGROUPS == 1 ? gl_SubgroupID : gl_LocalInvocationID.x; +} // // Define prefix load/store functions // // clang-format off -#if (RS_WORKGROUP_SUBGROUPS == 1) -#define RS_PREFIX_LOAD(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID + (idx_)] -#define RS_PREFIX_STORE(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID + (idx_)] -#else -#define RS_PREFIX_LOAD(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x + (idx_)] -#define RS_PREFIX_STORE(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x + (idx_)] -#endif +#define RS_PREFIX_LOAD(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + invocation_id() + (idx_)] +#define RS_PREFIX_STORE(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + invocation_id() + (idx_)] // clang-format on +layout(buffer_reference, std430) buffer buffer_rs_workgroup_id +{ + uint32_t x[RS_KEYVAL_DWORDS * 4]; +}; + +#define RS_IS_FIRST_LOCAL_INVOCATION() (RS_WORKGROUP_SUBGROUPS == 1 ? gl_SubgroupInvocationID == 0 : gl_LocalInvocationID.x == 0) + +RS_SUBGROUP_UNIFORM uint32_t rs_gl_workgroup_id_x; + +#define RS_GL_WORKGROUP_ID_X (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0 ? 
rs_gl_workgroup_id_x : gl_WorkGroupID.x) + // // Load the prefix function // @@ -383,45 +366,43 @@ shared rs_scatter_smem smem; void rs_histogram_zero() { -#if (RS_WORKGROUP_SUBGROUPS == 1) - - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - smem.extent[smem_offset + ii] = 0; - } + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - smem.extent[smem_offset + ii] = 0; - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - smem.histogram[smem_offset_final] = 0; + smem.extent[smem_offset + ii] = 0; } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { - smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x] = 0; + smem.extent[smem_offset + ii] = 0; } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - RS_BARRIER(); + if (smem_offset_final < RS_RADIX_SIZE) + { + smem.extent[smem_offset_final] = 0; + } + } + } + else + { + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x] = 0; + } + } + + rsBarrier(); } // @@ -450,11 +431,6 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], //---------------------------------------------------------------------- #ifdef RS_SCATTER_ENABLE_NV_MATCH - // - // 32 - // -#if (RS_SUBGROUP_SIZE == 32) - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { // @@ -470,13 +446,6 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); } - // - // Undefined! - // -#else -#error "Error: rs_histogram_rank() undefined for subgroup size" -#endif - //---------------------------------------------------------------------- // // Default is to emulate a `match` operation with ballots. @@ -484,79 +453,32 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], //---------------------------------------------------------------------- #elif !defined(RS_SCATTER_ENABLE_BROADCAST_MATCH) - // - // 64 - // -#if (RS_SUBGROUP_SIZE == 64) - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - u32vec2 match; + u32vec4 match; { - const bool is_one = RS_BIT_IS_ONE(digit, 0); - const u32vec2 ballot = subgroupBallot(is_one).xy; - const uint32_t mask = is_one ? 0 : 0xFFFFFFFF; + const bool is_one = RS_BIT_IS_ONE(digit, 0); + const u32vec4 ballot = subgroupBallot(is_one); + const u32vec4 mask = u32vec4(is_one ? 
0 : 0xFFFFFFFF); - match.x = (ballot.x ^ mask); - match.y = (ballot.y ^ mask); + match = ballot ^ mask; } [[unroll]] for (int32_t bit = 1; bit < RS_RADIX_LOG2; bit++) { - const bool is_one = RS_BIT_IS_ONE(digit, bit); - const u32vec2 ballot = subgroupBallot(is_one).xy; - const uint32_t mask = is_one ? 0 : 0xFFFFFFFF; + const bool is_one = RS_BIT_IS_ONE(digit, bit); + const u32vec4 ballot = subgroupBallot(is_one); + const u32vec4 mask = u32vec4(is_one ? 0 : 0xFFFFFFFF); - match.x &= (ballot.x ^ mask); - match.y &= (ballot.y ^ mask); + match &= ballot ^ mask; } - kr[ii] = ((bitCount(match.x) + bitCount(match.y)) << 16) | - (bitCount(match.x & gl_SubgroupLeMask.x) + // - bitCount(match.y & gl_SubgroupLeMask.y)); + kr[ii] = (subgroupBallotBitCount(match) << 16) | subgroupBallotInclusiveBitCount(match); } - // - // <= 32 - // -#elif ((RS_SUBGROUP_SIZE <= 32) && !defined(RS_SCATTER_ENABLE_NV_MATCH)) - - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) - { - const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - - uint32_t match; - - { - const bool is_one = RS_BIT_IS_ONE(digit, 0); - const uint32_t ballot = subgroupBallot(is_one).x; - const uint32_t mask = is_one ? 0 : RS_SUBGROUP_MASK; - - match = (ballot ^ mask); - } - - [[unroll]] for (int32_t bit = 1; bit < RS_RADIX_LOG2; bit++) - { - const bool is_one = RS_BIT_IS_ONE(digit, bit); - const uint32_t ballot = subgroupBallot(is_one).x; - const uint32_t mask = is_one ? 0 : RS_SUBGROUP_MASK; - - match &= (ballot ^ mask); - } - - kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); - } - - // - // Undefined! - // -#else -#error "Error: rs_histogram_rank() undefined for subgroup size" -#endif - //---------------------------------------------------------------------- // // Emulate a `match` operation with broadcasts. @@ -569,69 +491,58 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], // // 64 // -#if (RS_SUBGROUP_SIZE == 64) - - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) + if (RS_SUBGROUP_SIZE == 64) { - const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - - u32vec2 match; - - // subgroup invocation 0 + [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { - match[0] = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; - } + const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - // subgroup invocations 1-31 - [[unroll]] for (int32_t jj = 1; jj < 32; jj++) + u32vec2 match; + + // subgroup invocation 0 + { + match[0] = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; + } + + // subgroup invocations 1-31 + [[unroll]] for (int32_t jj = 1; jj < 32; jj++) + { + match[0] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; + } + + // subgroup invocation 32 + { + match[1] = (subgroupBroadcast(digit, 32) == digit) ? (1u << 0) : 0; + } + + // subgroup invocations 33-63 + [[unroll]] for (int32_t jj = 1; jj < 32; jj++) + { + match[1] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; + } + + kr[ii] = ((bitCount(match.x) + bitCount(match.y)) << 16) | + (bitCount(match.x & gl_SubgroupLeMask.x) + // + bitCount(match.y & gl_SubgroupLeMask.y)); + } + } else if (RS_SUBGROUP_SIZE <= 32) { + [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { - match[0] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; - } + const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - // subgroup invocation 32 - { - match[1] = (subgroupBroadcast(digit, 32) == digit) ? 
(1u << 0) : 0; - } + // subgroup invocation 0 + uint32_t match = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; - // subgroup invocations 33-63 - [[unroll]] for (int32_t jj = 1; jj < 32; jj++) - { - match[1] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; - } + // subgroup invocations 1-(RS_SUBGROUP_SIZE-1) + [[unroll]] for (int32_t jj = 1; jj < RS_SUBGROUP_SIZE; jj++) + { + match |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; + } - kr[ii] = ((bitCount(match.x) + bitCount(match.y)) << 16) | - (bitCount(match.x & gl_SubgroupLeMask.x) + // - bitCount(match.y & gl_SubgroupLeMask.y)); + kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); + } } - // - // <= 32 - // -#elif ((RS_SUBGROUP_SIZE <= 32) && !defined(RS_SCATTER_ENABLE_NV_MATCH)) - - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) - { - const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - - // subgroup invocation 0 - uint32_t match = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; - - // subgroup invocations 1-(RS_SUBGROUP_SIZE-1) - [[unroll]] for (int32_t jj = 1; jj < RS_SUBGROUP_SIZE; jj++) - { - match |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; - } - - kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); - } - - // - // Undefined! - // -#else -#error "Error: rs_histogram_rank() undefined for subgroup size" -#endif - #endif // @@ -660,7 +571,7 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], } } - RS_BARRIER(); + rsBarrier(); } } @@ -677,110 +588,103 @@ rs_first_prefix_store(restrict buffer_rs_partitions rs_partitions) // // Define the histogram reference // -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t hist_offset = gl_SubgroupInvocationID * 4; -#else - const uint32_t hist_offset = gl_LocalInvocationID.x * 4; -#endif + const uint32_t hist_offset = invocation_id() * 4; readonly RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histogram, rs_histogram, push.devaddr_histograms, hist_offset); -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - const uint32_t exc = rs_histogram.extent[ii]; - const uint32_t red = smem.extent[smem_offset_h + ii]; + const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; + const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - smem.extent[smem_offset_l + ii] = exc; - - const uint32_t inc = exc + red; - - atomicStore(rs_partitions.extent[ii], - inc | RS_PARTITION_MASK_PREFIX, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; - const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - const uint32_t exc = rs_histogram.extent[ii]; - const uint32_t red = smem.extent[smem_offset_h + ii]; - - smem.extent[smem_offset_l + ii] = exc; - - 
const uint32_t inc = exc + red; - - atomicStore(rs_partitions.extent[ii], - inc | RS_PARTITION_MASK_PREFIX, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final_h = smem_offset_h + RS_WORKGROUP_BASE_FINAL; - const uint32_t smem_offset_final_l = smem_offset_l + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - const uint32_t exc = rs_histogram.extent[RS_WORKGROUP_BASE_FINAL]; - const uint32_t red = smem.extent[smem_offset_final_h]; + const uint32_t exc = rs_histogram.extent[ii]; + const uint32_t red = smem.extent[smem_offset_h + ii]; - smem.extent[smem_offset_final_l] = exc; + smem.extent[smem_offset_l + ii] = exc; const uint32_t inc = exc + red; - atomicStore(rs_partitions.extent[RS_WORKGROUP_BASE_FINAL], + atomicStore(rs_partitions.extent[ii], inc | RS_PARTITION_MASK_PREFIX, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; + const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { - const uint32_t exc = rs_histogram.extent[0]; - const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + const uint32_t exc = rs_histogram.extent[ii]; + const uint32_t red = smem.extent[smem_offset_h + ii]; - smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + smem.extent[smem_offset_l + ii] = exc; const uint32_t inc = exc + red; - atomicStore(rs_partitions.extent[0], + atomicStore(rs_partitions.extent[ii], inc | RS_PARTITION_MASK_PREFIX, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final_h = smem_offset_h + RS_WORKGROUP_BASE_FINAL; + const uint32_t smem_offset_final_l = smem_offset_l + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final_h < RS_RADIX_SIZE) + { + const uint32_t exc = rs_histogram.extent[RS_WORKGROUP_BASE_FINAL]; + const uint32_t red = smem.extent[smem_offset_final_h]; + + smem.extent[smem_offset_final_l] = exc; + + const uint32_t inc = exc + red; + + atomicStore(rs_partitions.extent[RS_WORKGROUP_BASE_FINAL], + inc | RS_PARTITION_MASK_PREFIX, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } + } + else + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + const uint32_t exc = rs_histogram.extent[0]; + const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + + smem.extent[RS_SMEM_LOOKBACK_OFFSET + 
gl_LocalInvocationID.x] = exc; + + const uint32_t inc = exc + red; + + atomicStore(rs_partitions.extent[0], + inc | RS_PARTITION_MASK_PREFIX, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } } // @@ -790,76 +694,77 @@ void rs_reduction_store(restrict buffer_rs_partitions rs_partitions, RS_SUBGROUP_UNIFORM const uint32_t partition_base) { -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - const uint32_t red = smem.extent[smem_offset + ii]; + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SUBGROUPS == 1) + // + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - atomicStore(rs_partitions.extent[partition_base + ii], - red | RS_PARTITION_MASK_REDUCTION, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - const uint32_t red = smem.extent[smem_offset + ii]; - - atomicStore(rs_partitions.extent[partition_base + ii], - red | RS_PARTITION_MASK_REDUCTION, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - const uint32_t red = smem.extent[smem_offset_final]; + const uint32_t red = smem.extent[smem_offset + ii]; - atomicStore(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + atomicStore(rs_partitions.extent[partition_base + ii], red | RS_PARTITION_MASK_REDUCTION, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { - const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + const uint32_t red = smem.extent[smem_offset + ii]; - atomicStore(rs_partitions.extent[partition_base], + atomicStore(rs_partitions.extent[partition_base + ii], red | RS_PARTITION_MASK_REDUCTION, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t 
smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final < RS_RADIX_SIZE) + { + const uint32_t red = smem.extent[smem_offset_final]; + + atomicStore(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + red | RS_PARTITION_MASK_REDUCTION, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } + } + else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + + atomicStore(rs_partitions.extent[partition_base], + red | RS_PARTITION_MASK_REDUCTION, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } } // @@ -875,120 +780,15 @@ void rs_lookback_store(restrict buffer_rs_partitions rs_partitions, RS_SUBGROUP_UNIFORM const uint32_t partition_base) { -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - + //////////////////////////////////////////////////////////////////////////// // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. + // (RS_WORKGROUP_SUBGROUPS == 1) // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // - // Otherwise, save the exclusive scan and atomically transform - // the reduction into an inclusive prefix status math: - // - // reduction + 1 = prefix - // - smem.extent[smem_offset + ii] = exc; - - atomicAdd(rs_partitions.extent[partition_base + ii], - exc | (1 << 30), - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease); - break; - } - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - - // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. 
- // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); - - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // - // Otherwise, save the exclusive scan and atomically transform - // the reduction into an inclusive prefix status math: - // - // reduction + 1 = prefix - // - smem.extent[smem_offset + ii] = exc; - - atomicAdd(rs_partitions.extent[partition_base + ii], - exc | (1 << 30), - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease); - break; - } - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_SMEM_LOOKBACK_OFFSET + RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; uint32_t exc = 0; @@ -1002,7 +802,7 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + gl_SemanticsAcquire | gl_SemanticsMakeVisible); // spin until valid if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) @@ -1027,7 +827,7 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, // smem.extent[smem_offset + ii] = exc; - atomicAdd(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + atomicAdd(rs_partitions.extent[partition_base + ii], exc | (1 << 30), gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, @@ -1035,16 +835,16 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, break; } } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; uint32_t exc = 0; @@ -1055,10 +855,10 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, // while (true) { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + gl_SemanticsAcquire | gl_SemanticsMakeVisible); // spin until valid if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) @@ -1081,9 +881,9 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, // // reduction + 1 = prefix // - smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + smem.extent[smem_offset + ii] = exc; - atomicAdd(rs_partitions.extent[partition_base], + atomicAdd(rs_partitions.extent[partition_base + ii], 
exc | (1 << 30), gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, @@ -1092,7 +892,113 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, } } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final < RS_SMEM_LOOKBACK_OFFSET + RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. + // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + RS_WORKGROUP_BASE_FINAL], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // + // Otherwise, save the exclusive scan and atomically transform + // the reduction into an inclusive prefix status math: + // + // reduction + 1 = prefix + // + smem.extent[smem_offset + RS_WORKGROUP_BASE_FINAL] = exc; + + atomicAdd(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + exc | (1 << 30), + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease); + break; + } + } + } + } + else + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. 
+ // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // + // Otherwise, save the exclusive scan and atomically transform + // the reduction into an inclusive prefix status math: + // + // reduction + 1 = prefix + // + smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + + atomicAdd(rs_partitions.extent[partition_base], + exc | (1 << 30), + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease); + break; + } + } + } } // @@ -1105,98 +1011,15 @@ void rs_lookback_skip_store(restrict buffer_rs_partitions rs_partitions, RS_SUBGROUP_UNIFORM const uint32_t partition_base) { -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - + //////////////////////////////////////////////////////////////////////////// // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. + // (RS_WORKGROUP_SUBGROUPS == 1) // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // Otherwise, save the exclusive scan. - smem.extent[smem_offset + ii] = exc; - break; - } - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - - // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. 
- // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); - - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // Otherwise, save the exclusive scan. - smem.extent[smem_offset + ii] = exc; - break; - } - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; uint32_t exc = 0; @@ -1207,56 +1030,10 @@ rs_lookback_skip_store(restrict buffer_rs_partitions rs_partitions, // while (true) { - const uint32_t prev = - atomicLoad(rs_partitions.extent[partition_base_prev + RS_WORKGROUP_BASE_FINAL], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); - - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // Otherwise, save the exclusive scan. - smem.extent[smem_offset_final] = exc; - break; - } - } -#endif - -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif - { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - - // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. - // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + gl_SemanticsAcquire | gl_SemanticsMakeVisible); // spin until valid if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) @@ -1274,12 +1051,142 @@ rs_lookback_skip_store(restrict buffer_rs_partitions rs_partitions, } // Otherwise, save the exclusive scan. - smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + smem.extent[smem_offset + ii] = exc; + break; + } + } + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; + + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. 
+ // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // Otherwise, save the exclusive scan. + smem.extent[smem_offset + ii] = exc; break; } } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final < RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. + // + while (true) + { + const uint32_t prev = + atomicLoad(rs_partitions.extent[partition_base_prev + RS_WORKGROUP_BASE_FINAL], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // Otherwise, save the exclusive scan. + smem.extent[smem_offset_final] = exc; + break; + } + } + } + } + else + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. + // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // Otherwise, save the exclusive scan. + smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + break; + } + } + } } // @@ -1302,7 +1209,7 @@ rs_rank_to_local(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], // // Reordering phase will overwrite histogram span. 
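The loops above are the consumer half of a decoupled-lookback scan: each digit walks backwards through earlier partitions, accumulating REDUCTION entries until it reaches a PREFIX entry, which already includes everything before it. A scalar C model of that accumulation, for reference (the mask encodings are illustrative; the real masks differ between the scatter_even and scatter_odd shaders, and the first partition never performs a lookback):

   #include <stdatomic.h>
   #include <stdint.h>

   #define RS_RADIX_SIZE  256
   #define MASK_COUNT     0x3fffffffu   /* illustrative encodings */
   #define MASK_STATUS    0xc0000000u
   #define MASK_INVALID   0x00000000u
   #define MASK_PREFIX    0x80000000u

   /* Exclusive count of `digit` over all partitions before `my_partition`. */
   static uint32_t
   rs_lookback_scalar(const _Atomic uint32_t *partitions,
                      uint32_t                my_partition,
                      uint32_t                digit)
   {
      uint32_t exc = 0;

      for (uint32_t p = my_partition; p-- > 0;) {
         uint32_t prev;

         do {
            prev = atomic_load(&partitions[p * RS_RADIX_SIZE + digit]);
         } while ((prev & MASK_STATUS) == MASK_INVALID);   /* spin until published */

         exc += prev & MASK_COUNT;

         if ((prev & MASK_STATUS) == MASK_PREFIX)
            break;                  /* prefix already covers all earlier partitions */
      }

      return exc;
   }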
// - RS_BARRIER(); + rsBarrier(); } // @@ -1333,13 +1240,7 @@ rs_rank_to_global(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], void rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_SCATTER_BLOCK_ROWS]) { - // clang-format off -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_SubgroupInvocationID; -#else - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_LocalInvocationID.x; -#endif - // clang-format on + const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + invocation_id(); [[unroll]] for (uint32_t ii = 0; ii < RS_KEYVAL_DWORDS; ii++) { @@ -1353,7 +1254,7 @@ rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_ smem.extent[smem_idx] = RS_KV_DWORD(kv[jj], ii); } - RS_BARRIER(); + rsBarrier(); // // Load keyval dword from sorted location @@ -1363,7 +1264,7 @@ rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_ RS_KV_DWORD(kv[jj], ii) = smem.extent[smem_base + jj * RS_WORKGROUP_SIZE]; } - RS_BARRIER(); + rsBarrier(); } // @@ -1376,7 +1277,7 @@ rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_ smem.extent[smem_idx] = uint32_t(kr[ii]); } - RS_BARRIER(); + rsBarrier(); // // Load kr[] from sorted location -- we only need the rank. @@ -1395,13 +1296,7 @@ void rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_SCATTER_BLOCK_ROWS]) { - // clang-format off -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_SubgroupInvocationID; -#else - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_LocalInvocationID.x; -#endif - // clang-format on + const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + invocation_id(); [[unroll]] for (uint32_t ii = 0; ii < RS_KEYVAL_DWORDS; ii++) { @@ -1415,7 +1310,7 @@ rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], smem.extent[smem_idx] = RS_KV_DWORD(kv[jj], ii); } - RS_BARRIER(); + rsBarrier(); // // Load keyval dword from sorted location @@ -1425,7 +1320,7 @@ rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], RS_KV_DWORD(kv[jj], ii) = smem.extent[smem_base + jj * RS_WORKGROUP_SIZE]; } - RS_BARRIER(); + rsBarrier(); } // @@ -1438,7 +1333,7 @@ rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], smem.extent[smem_idx] = uint32_t(kr[ii]); } - RS_BARRIER(); + rsBarrier(); // // Load kr[] from sorted location -- we only need the rank. @@ -1459,7 +1354,7 @@ rs_load(out RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS]) // // Set up buffer reference // - const uint32_t kv_in_offset_keys = gl_WorkGroupID.x * RS_BLOCK_KEYVALS + + const uint32_t kv_in_offset_keys = RS_GL_WORKGROUP_ID_X * RS_BLOCK_KEYVALS + gl_SubgroupID * RS_SUBGROUP_KEYVALS + gl_SubgroupInvocationID; u32vec2 kv_in_offset; @@ -1530,6 +1425,58 @@ rs_store(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], const uint32_t kr[RS_SC void main() { + // + // If this is a nonsequential dispatch device then acquire a virtual + // workgroup id. + // + // This is only run once and is a special compile-time-enabled case + // so we leverage the existing `push.devaddr_partitions` address + // instead of altering the push constant structure definition. 
+ // + if (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0) + { + if (RS_IS_FIRST_LOCAL_INVOCATION()) + { + // The "internal" memory map looks like this: + // + // +---------------------------------+ <-- 0 + // | histograms[keyval_size] | + // +---------------------------------+ <-- keyval_size * histo_size + // | partitions[scatter_blocks_ru-1] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + // | workgroup_ids[keyval_size] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + workgroup_ids_size + // + // Extended multiply to avoid 4GB overflow + // + u32vec2 workgroup_id_offset; + + umulExtended((gl_NumWorkGroups.x - 1), // virtual workgroup ids follow partitions[] + 4 * RS_RADIX_SIZE, // sizeof(uint32_t) * 256 + workgroup_id_offset.y, // msb + workgroup_id_offset.x); // lsb + + RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_workgroup_id, + rs_workgroup_id, + push.devaddr_partitions, + workgroup_id_offset); + + const uint32_t x_idx = RS_SCATTER_KEYVAL_DWORD_BASE * 4 + (push.pass_offset / RS_RADIX_LOG2); + + smem.extent[0] = atomicAdd(rs_workgroup_id.x[x_idx], + 1, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease); + } + + rsBarrier(); + + rs_gl_workgroup_id_x = smem.extent[0]; + + rsBarrier(); + } + // // Load keyvals // @@ -1568,7 +1515,7 @@ main() [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { - rs_kv_out.extent[gl_WorkGroupID.x * RS_BLOCK_KEYVALS + ii * RS_WORKGROUP_SIZE] = kr[ii]; + rs_kv_out.extent[RS_GL_WORKGROUP_ID_X * RS_BLOCK_KEYVALS + ii * RS_WORKGROUP_SIZE] = kr[ii]; } return; @@ -1594,11 +1541,7 @@ main() // // Define partitions bufref // -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t partition_offset = gl_SubgroupInvocationID * 4; -#else - const uint32_t partition_offset = gl_LocalInvocationID.x * 4; -#endif + const uint32_t partition_offset = invocation_id() * 4; RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_partitions, rs_partitions, @@ -1608,7 +1551,7 @@ main() // // The first partition is a special case. // - if (gl_WorkGroupID.x == 0) + if (RS_GL_WORKGROUP_ID_X == 0) { // // Other workgroups may lookback on this partition. @@ -1623,12 +1566,12 @@ main() // // Otherwise, this is not the first workgroup. // - RS_SUBGROUP_UNIFORM const uint32_t partition_base = gl_WorkGroupID.x * RS_RADIX_SIZE; + RS_SUBGROUP_UNIFORM const uint32_t partition_base = RS_GL_WORKGROUP_ID_X * RS_RADIX_SIZE; // // The last partition is a special case. // - if (gl_WorkGroupID.x + 1 < gl_NumWorkGroups.x) + if (RS_GL_WORKGROUP_ID_X + 1 < gl_NumWorkGroups.x) { // // Atomically store the reduction to the global partition. @@ -1667,7 +1610,7 @@ main() // // Barrier before reading prefix scanned histogram. // - RS_BARRIER(); + rsBarrier(); // // Convert keyval's rank to a local index @@ -1686,7 +1629,7 @@ main() // // Wait for lookback to complete. 
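For the nonsequential-dispatch path above, the virtual workgroup counters are simply appended after partitions[], so their base sits (gl_NumWorkGroups.x - 1) * 4 * RS_RADIX_SIZE bytes past push.devaddr_partitions, and each scatter pass bumps its own counter. A small host-side C sketch of the same arithmetic (a plain 64-bit multiply on the CPU where the shader needs umulExtended; the helper names are illustrative):

#include <stdint.h>

#define RS_RADIX_SIZE  256u   /* per the "sizeof(uint32_t) * 256" comment above */
#define RS_RADIX_LOG2  8u     /* assumed: log2(RS_RADIX_SIZE)                   */

/* Byte offset of workgroup_ids[] relative to devaddr_partitions:
 * partitions[] holds (num_workgroups - 1) slots of RS_RADIX_SIZE uint32_t
 * counters, and the virtual ids start right after them. */
static uint64_t
workgroup_id_base_offset(uint32_t num_workgroups)
{
   return (uint64_t)(num_workgroups - 1) * 4u * RS_RADIX_SIZE;
}

/* Which counter a given scatter pass increments, mirroring the x_idx
 * computation in main() above. */
static uint32_t
workgroup_id_index(uint32_t keyval_dword_base, uint32_t pass_offset)
{
   return keyval_dword_base * 4u + pass_offset / RS_RADIX_LOG2;
}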
// - RS_BARRIER(); + rsBarrier(); #endif // diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_0_even.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_0_even.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_0_even.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_0_even.comp diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_0_odd.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_0_odd.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_0_odd.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_0_odd.comp diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_1_even.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_1_even.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_1_even.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_1_even.comp diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_1_odd.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_1_odd.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_1_odd.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_1_odd.comp diff --git a/src/amd/vulkan/radix_sort/target.h b/src/vulkan/runtime/radix_sort/target.h similarity index 94% rename from src/amd/vulkan/radix_sort/target.h rename to src/vulkan/runtime/radix_sort/target.h index 2164389757d..1ddac0ccc8e 100644 --- a/src/amd/vulkan/radix_sort/target.h +++ b/src/vulkan/runtime/radix_sort/target.h @@ -27,6 +27,7 @@ struct radix_sort_vk_target_config struct { uint32_t workgroup_size_log2; + uint32_t block_rows; } fill; struct @@ -48,6 +49,8 @@ struct radix_sort_vk_target_config uint32_t subgroup_size_log2; uint32_t block_rows; } scatter; + + bool nonsequential_dispatch; }; // diff --git a/src/vulkan/runtime/vk_acceleration_structure.c b/src/vulkan/runtime/vk_acceleration_structure.c index 074b94ea85c..ccea927f559 100644 --- a/src/vulkan/runtime/vk_acceleration_structure.c +++ b/src/vulkan/runtime/vk_acceleration_structure.c @@ -27,7 +27,41 @@ #include "vk_alloc.h" #include "vk_common_entrypoints.h" #include "vk_device.h" +#include "vk_command_buffer.h" #include "vk_log.h" +#include "vk_meta.h" + +#include "bvh/vk_build_interface.h" +#include "bvh/vk_bvh.h" + +#include "radix_sort/common/vk/barrier.h" +#include "radix_sort/shaders/push.h" + +#include "util/u_string.h" + +static const uint32_t leaf_spv[] = { +#include "bvh/leaf.spv.h" +}; + +static const uint32_t leaf_always_active_spv[] = { +#include "bvh/leaf_always_active.spv.h" +}; + +static const uint32_t morton_spv[] = { +#include "bvh/morton.spv.h" +}; + +static const uint32_t lbvh_main_spv[] = { +#include "bvh/lbvh_main.spv.h" +}; + +static const uint32_t lbvh_generate_ir_spv[] = { +#include "bvh/lbvh_generate_ir.spv.h" +}; + +static const uint32_t ploc_spv[] = { +#include "bvh/ploc_internal.spv.h" +}; VkDeviceAddress vk_acceleration_structure_get_va(struct vk_acceleration_structure *accel_struct) @@ -92,3 +126,1122 @@ vk_common_GetAccelerationStructureDeviceAddressKHR( VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfo->accelerationStructure); return vk_acceleration_structure_get_va(accel_struct); } + +#define KEY_ID_PAIR_SIZE 8 +#define MORTON_BIT_SIZE 24 + +enum internal_build_type { + INTERNAL_BUILD_TYPE_LBVH, + INTERNAL_BUILD_TYPE_PLOC, + INTERNAL_BUILD_TYPE_UPDATE, +}; + +struct build_config { + enum internal_build_type internal_type; + bool updateable; + uint32_t encode_key[MAX_ENCODE_PASSES]; +}; + +struct scratch_layout { + uint32_t size; 
+ uint32_t update_size; + + uint32_t header_offset; + + /* Used for BUILD only. */ + + uint32_t sort_buffer_offset[2]; + uint32_t sort_internal_offset; + + uint32_t ploc_prefix_sum_partition_offset; + uint32_t lbvh_node_offset; + + uint32_t ir_offset; + uint32_t internal_node_offset; +}; + +static struct build_config +build_config(uint32_t leaf_count, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const struct vk_acceleration_structure_build_ops *ops) +{ + struct build_config config = {0}; + + if (leaf_count <= 4) + config.internal_type = INTERNAL_BUILD_TYPE_LBVH; + else if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) + config.internal_type = INTERNAL_BUILD_TYPE_PLOC; + else if (!(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_BUILD_BIT_KHR) && + !(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR)) + config.internal_type = INTERNAL_BUILD_TYPE_PLOC; + else + config.internal_type = INTERNAL_BUILD_TYPE_LBVH; + + if (build_info->mode == VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR && + build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && + ops->update_as[0]) + config.internal_type = INTERNAL_BUILD_TYPE_UPDATE; + + if ((build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR) && + build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && + ops->update_as[0]) + config.updateable = true; + + for (unsigned i = 0; i < ARRAY_SIZE(config.encode_key); i++) { + if (!ops->get_encode_key[i]) + break; + config.encode_key[i] = ops->get_encode_key[i](leaf_count, build_info->flags); + } + + return config; +} + +static void +get_scratch_layout(struct vk_device *device, + uint32_t leaf_count, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const struct vk_acceleration_structure_build_args *args, + struct scratch_layout *scratch) +{ + uint32_t internal_count = MAX2(leaf_count, 2) - 1; + + radix_sort_vk_memory_requirements_t requirements = { + 0, + }; + radix_sort_vk_get_memory_requirements(args->radix_sort, leaf_count, + &requirements); + + uint32_t ir_leaf_size; + switch (vk_get_as_geometry_type(build_info)) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + ir_leaf_size = sizeof(struct vk_ir_triangle_node); + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + ir_leaf_size = sizeof(struct vk_ir_aabb_node); + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + ir_leaf_size = sizeof(struct vk_ir_instance_node); + break; + default: + unreachable("Unknown VkGeometryTypeKHR"); + } + + + uint32_t offset = 0; + + uint32_t ploc_scratch_space = 0; + uint32_t lbvh_node_space = 0; + + struct build_config config = build_config(leaf_count, build_info, + device->as_build_ops); + + if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) + ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition); + else + lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count; + + scratch->header_offset = offset; + offset += sizeof(struct vk_ir_header); + + scratch->sort_buffer_offset[0] = offset; + offset += requirements.keyvals_size; + + scratch->sort_buffer_offset[1] = offset; + offset += requirements.keyvals_size; + + scratch->sort_internal_offset = offset; + /* Internal sorting data is not needed when PLOC/LBVH are invoked, + * save space by aliasing them */ + scratch->ploc_prefix_sum_partition_offset = offset; + scratch->lbvh_node_offset = offset; + offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space); + + 
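In other words, the BUILD scratch being laid out here ends up, in order: the IR header, the two key/id buffers used to ping-pong the sort, one region that aliases the radix sort's internal data with either the PLOC prefix-scan partitions or the LBVH node info (only one of the three is ever live at a time), then the IR leaves and the IR internal nodes. A self-contained sketch with made-up sizes, just to make the ordering concrete (the real numbers come from the vk_ir_* structures and radix_sort_vk_get_memory_requirements()):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
   /* Hypothetical BLAS with 1000 triangle leaves -> 999 internal nodes. */
   uint32_t offset = 0;

   uint32_t header       = offset; offset += 128;        /* vk_ir_header, made-up size          */
   uint32_t keyvals_even = offset; offset += 1000 * 8;   /* 8-byte key/id pairs (sort buffer 0) */
   uint32_t keyvals_odd  = offset; offset += 1000 * 8;   /* sort buffer 1                       */
   uint32_t aliased      = offset; offset += 16384;      /* MAX3(sort internal, PLOC partitions,
                                                          * LBVH node info), made-up size       */
   uint32_t ir_leaves    = offset; offset += 1000 * 64;  /* vk_ir_triangle_node, made-up size   */
   uint32_t ir_internal  = offset; offset += 999 * 128;  /* vk_ir_box_node, made-up size        */

   printf("header=%u keyvals=%u/%u aliased=%u leaves=%u internal=%u size=%u\n",
          header, keyvals_even, keyvals_odd, aliased, ir_leaves, ir_internal, offset);
   return 0;
}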
scratch->ir_offset = offset; + offset += ir_leaf_size * leaf_count; + + scratch->internal_node_offset = offset; + offset += sizeof(struct vk_ir_box_node) * internal_count; + + scratch->size = offset; + + if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && + device->as_build_ops->update_as[0]) { + scratch->update_size = + device->as_build_ops->get_update_scratch_size(device, leaf_count); + } else { + scratch->update_size = offset; + } +} + +struct bvh_state { + uint32_t scratch_offset; + + uint32_t leaf_node_count; + uint32_t internal_node_count; + uint32_t leaf_node_size; + + struct scratch_layout scratch; + struct build_config config; + + /* Radix sort state */ + uint32_t scatter_blocks; + uint32_t count_ru_scatter; + uint32_t histo_blocks; + uint32_t count_ru_histo; + struct rs_push_scatter push_scatter; + + uint32_t last_encode_pass; +}; + +struct bvh_batch_state { + bool any_updateable; + bool any_non_updateable; + bool any_ploc; + bool any_lbvh; + bool any_update; +}; + +static VkResult +get_pipeline_spv(struct vk_device *device, struct vk_meta_device *meta, + const char *name, const uint32_t *spv, uint32_t spv_size, + unsigned push_constant_size, + const struct vk_acceleration_structure_build_args *args, + VkPipeline *pipeline, VkPipelineLayout *layout) +{ + size_t key_size = strlen(name); + + VkResult result = vk_meta_get_pipeline_layout( + device, meta, NULL, + &(VkPushConstantRange){ + VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constant_size + }, + name, key_size, layout); + + if (result != VK_SUCCESS) + return result; + + VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(meta, name, key_size); + if (pipeline_from_cache != VK_NULL_HANDLE) { + *pipeline = pipeline_from_cache; + return VK_SUCCESS; + } + + VkShaderModuleCreateInfo module_info = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = NULL, + .flags = 0, + .codeSize = spv_size, + .pCode = spv, + }; + + VkSpecializationMapEntry spec_map[2] = { + { + .constantID = SUBGROUP_SIZE_ID, + .offset = 0, + .size = sizeof(args->subgroup_size), + }, + { + .constantID = BVH_BOUNDS_OFFSET_ID, + .offset = sizeof(args->subgroup_size), + .size = sizeof(args->bvh_bounds_offset), + }, + }; + + uint32_t spec_constants[2] = { + args->subgroup_size, + args->bvh_bounds_offset + }; + + VkSpecializationInfo spec_info = { + .mapEntryCount = ARRAY_SIZE(spec_map), + .pMapEntries = spec_map, + .dataSize = sizeof(spec_constants), + .pData = spec_constants, + }; + + VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT rssci = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = &module_info, + .requiredSubgroupSize = args->subgroup_size, + }; + + VkPipelineShaderStageCreateInfo shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &rssci, + .flags = VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + .pSpecializationInfo = &spec_info, + }; + + VkComputePipelineCreateInfo pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = shader_stage, + .flags = 0, + .layout = *layout, + }; + + return vk_meta_create_compute_pipeline(device, meta, &pipeline_info, + name, key_size, pipeline); +} + +static uint32_t +pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) +{ + uint32_t geometry_id_and_flags = geometry_id; + if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) + geometry_id_and_flags |= VK_GEOMETRY_OPAQUE; + + return 
geometry_id_and_flags; +} + +struct vk_bvh_geometry_data +vk_fill_geometry_data(VkAccelerationStructureTypeKHR type, uint32_t first_id, uint32_t geom_index, + const VkAccelerationStructureGeometryKHR *geometry, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info) +{ + struct vk_bvh_geometry_data data = { + .first_id = first_id, + .geometry_id = pack_geometry_id_and_flags(geom_index, geometry->flags), + .geometry_type = geometry->geometryType, + }; + + switch (geometry->geometryType) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); + + data.data = geometry->geometry.triangles.vertexData.deviceAddress + + build_range_info->firstVertex * geometry->geometry.triangles.vertexStride; + data.indices = geometry->geometry.triangles.indexData.deviceAddress; + + if (geometry->geometry.triangles.indexType == VK_INDEX_TYPE_NONE_KHR) + data.data += build_range_info->primitiveOffset; + else + data.indices += build_range_info->primitiveOffset; + + data.transform = geometry->geometry.triangles.transformData.deviceAddress; + if (data.transform) + data.transform += build_range_info->transformOffset; + + data.stride = geometry->geometry.triangles.vertexStride; + data.vertex_format = geometry->geometry.triangles.vertexFormat; + data.index_format = geometry->geometry.triangles.indexType; + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); + + data.data = geometry->geometry.aabbs.data.deviceAddress + build_range_info->primitiveOffset; + data.stride = geometry->geometry.aabbs.stride; + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + assert(type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR); + + data.data = geometry->geometry.instances.data.deviceAddress + build_range_info->primitiveOffset; + + if (geometry->geometry.instances.arrayOfPointers) + data.stride = 8; + else + data.stride = sizeof(VkAccelerationStructureInstanceKHR); + break; + default: + unreachable("Unknown geometryType"); + } + + return data; +} + +static void +vk_cmd_begin_debug_marker(VkCommandBuffer commandBuffer, const char *format, ...) +{ + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer); + struct vk_device *device = cmd_buffer->base.device; + + va_list ap; + va_start(ap, format); + + char *name; + if (vasprintf(&name, format, ap) == -1) + return; + + va_end(ap); + + VkDebugMarkerMarkerInfoEXT marker = { + .sType = VK_STRUCTURE_TYPE_DEBUG_MARKER_MARKER_INFO_EXT, + .pMarkerName = name, + }; + + device->dispatch_table.CmdDebugMarkerBeginEXT(commandBuffer, &marker); +} + +static void +vk_cmd_end_debug_marker(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer); + struct vk_device *device = cmd_buffer->base.device; + + device->dispatch_table.CmdDebugMarkerEndEXT(commandBuffer); +} + +static VkResult +build_leaves(VkCommandBuffer commandBuffer, + struct vk_device *device, struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + struct bvh_state *bvh_states, + bool updateable) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + + /* Many apps are broken and will make inactive primitives active when + * updating, even though this is disallowed by the spec. 
To handle this, + * we use a different variant for updateable acceleration structures when + * the driver implements an update pass. This passes through inactive leaf + * nodes as if they were active, with an empty bounding box. It's then the + * driver or HW's responsibility to filter out inactive nodes. + */ + VkResult result; + if (updateable) { + result = get_pipeline_spv(device, meta, "leaves_always_active", + leaf_always_active_spv, + sizeof(leaf_always_active_spv), + sizeof(struct leaf_args), args, &pipeline, &layout); + } else { + result = get_pipeline_spv(device, meta, "leaves", leaf_spv, sizeof(leaf_spv), + sizeof(struct leaf_args), args, &pipeline, &layout); + } + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "build_leaves"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + if (bvh_states[i].config.updateable != updateable) + continue; + + struct leaf_args leaf_consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], + }; + + for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { + const VkAccelerationStructureGeometryKHR *geom = + pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; + + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; + + leaf_consts.geom_data = vk_fill_geometry_data(pInfos[i].type, bvh_states[i].leaf_node_count, j, geom, build_range_info); + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(leaf_consts), &leaf_consts); + device->cmd_dispatch_unaligned(commandBuffer, build_range_info->primitiveCount, 1, 1); + + bvh_states[i].leaf_node_count += build_range_info->primitiveCount; + } + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +static VkResult +morton_generate(VkCommandBuffer commandBuffer, struct vk_device *device, + struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + struct bvh_state *bvh_states) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + + VkResult result = + get_pipeline_spv(device, meta, "morton", morton_spv, sizeof(morton_spv), + sizeof(struct morton_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "morton_generate"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + const struct morton_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], + }; + + disp->CmdPushConstants(commandBuffer, 
layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + device->cmd_dispatch_unaligned(commandBuffer, bvh_states[i].leaf_node_count, 1, 1); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +static void +morton_sort(VkCommandBuffer commandBuffer, struct vk_device *device, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) +{ + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "morton_sort"); + + /* Copyright 2019 The Fuchsia Authors. */ + const radix_sort_vk_t *rs = args->radix_sort; + + /* + * OVERVIEW + * + * 1. Pad the keyvals in `scatter_even`. + * 2. Zero the `histograms` and `partitions`. + * --- BARRIER --- + * 3. HISTOGRAM is dispatched before PREFIX. + * --- BARRIER --- + * 4. PREFIX is dispatched before the first SCATTER. + * --- BARRIER --- + * 5. One or more SCATTER dispatches. + * + * Note that the `partitions` buffer can be zeroed anytime before the first + * scatter. + */ + + /* How many passes? */ + uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t); + uint32_t keyval_bits = keyval_bytes * 8; + uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits); + uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2; + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].leaf_node_count) + bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1]; + else + bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[0]; + } + + /* + * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS + * + * Pad fractional blocks with max-valued keyvals. + * + * Zero the histograms and partitions buffer. + * + * This assumes the partitions follow the histograms. + */ + + /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */ + + /* How many scatter blocks? */ + uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2; + uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows; + + /* + * How many histogram blocks? + * + * Note that it's OK to have more max-valued digits counted by the histogram + * than sorted by the scatters because the sort is stable. 
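To make the pass math above concrete: assuming the u64 radix-sort target (a 32-bit Morton code packed with a 32-bit id, so keyval_dwords == 2) and the 8-bit digits implied by RS_RADIX_SIZE == 256, only the three Morton-code bytes are actually sorted. A tiny standalone check of that arithmetic, which also shows why the sorted output lands in sort_buffer_offset[passes & 1]:

#include <assert.h>
#include <stdint.h>

#define MORTON_BIT_SIZE 24
#define RS_RADIX_LOG2   8   /* assumed: 256-entry digit histograms */

int
main(void)
{
   const uint32_t keyval_dwords = 2;                                /* u64 keyvals */
   const uint32_t keyval_bytes  = keyval_dwords * sizeof(uint32_t); /* 8           */
   const uint32_t keyval_bits   = keyval_bytes * 8;                 /* 64          */
   const uint32_t key_bits      = MORTON_BIT_SIZE < keyval_bits ? MORTON_BIT_SIZE : keyval_bits;
   const uint32_t passes        = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2;

   assert(passes == 3);                  /* one pass per Morton-code byte           */
   assert(keyval_bytes - passes == 5);   /* pass_idx skips the 5 bytes never sorted */
   assert((passes & 1) == 1);            /* odd pass count: the results end up in
                                          * the "odd" buffer, sort_buffer_offset[1] */
   return 0;
}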
+ */ + uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2; + uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows; + + uint32_t pass_idx = (keyval_bytes - passes); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + bvh_states[i].scatter_blocks = (bvh_states[i].leaf_node_count + scatter_block_kvs - 1) / scatter_block_kvs; + bvh_states[i].count_ru_scatter = bvh_states[i].scatter_blocks * scatter_block_kvs; + + bvh_states[i].histo_blocks = (bvh_states[i].count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs; + bvh_states[i].count_ru_histo = bvh_states[i].histo_blocks * histo_block_kvs; + + /* Fill with max values */ + if (bvh_states[i].count_ru_histo > bvh_states[i].leaf_node_count) { + device->cmd_fill_buffer_addr(commandBuffer, keyvals_even_addr + + bvh_states[i].leaf_node_count * keyval_bytes, + (bvh_states[i].count_ru_histo - bvh_states[i].leaf_node_count) * keyval_bytes, + 0xFFFFFFFF); + } + + /* + * Zero histograms and invalidate partitions. + * + * Note that the partition invalidation only needs to be performed once + * because the even/odd scatter dispatches rely on the the previous pass to + * leave the partitions in an invalid state. + * + * Note that the last workgroup doesn't read/write a partition so it doesn't + * need to be initialized. + */ + uint32_t histo_partition_count = passes + bvh_states[i].scatter_blocks - 1; + + uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); + + device->cmd_fill_buffer_addr(commandBuffer, + internal_addr + rs->internal.histograms.offset + fill_base, + histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)) + keyval_bytes * sizeof(uint32_t), 0); + } + + /* + * Pipeline: HISTOGRAM + * + * TODO(allanmac): All subgroups should try to process approximately the same + * number of blocks in order to minimize tail effects. This was implemented + * and reverted but should be reimplemented and benchmarked later. + */ + vk_barrier_transfer_w_to_compute_r(commandBuffer); + + disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + rs->pipelines.named.histogram); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + /* Dispatch histogram */ + struct rs_push_histogram push_histogram = { + .devaddr_histograms = internal_addr + rs->internal.histograms.offset, + .devaddr_keyvals = keyvals_even_addr, + .passes = passes, + }; + + disp->CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push_histogram), &push_histogram); + + disp->CmdDispatch(commandBuffer, bvh_states[i].histo_blocks, 1, 1); + } + + /* + * Pipeline: PREFIX + * + * Launch one workgroup per pass. 
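Continuing the same example (passes == 3, pass_idx == 5), the single fill above clears only the histograms that will actually be used plus every partition slot that follows them; with, say, 10 scatter blocks that is 12 slots of RS_RADIX_SIZE counters starting 5 KiB into the histograms region. A short check of the sizing, with the block count made up:

#include <assert.h>
#include <stdint.h>

#define RS_RADIX_SIZE 256u

int
main(void)
{
   const uint32_t passes         = 3;    /* from the pass-count sketch above */
   const uint32_t pass_idx       = 5;
   const uint32_t scatter_blocks = 10;   /* hypothetical                     */

   /* Skip the histograms of the keyval bytes that are never sorted... */
   const uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t));

   /* ...then clear the used histograms and the partitions[] that follow
    * them in one fill (the last scatter block needs no partition). */
   const uint32_t histo_partition_count = passes + scatter_blocks - 1;

   assert(fill_base == 5 * 1024);
   assert(histo_partition_count == 12);
   return 0;
}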
+ */ + vk_barrier_compute_w_to_compute_r(commandBuffer); + + disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + rs->pipelines.named.prefix); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + struct rs_push_prefix push_prefix = { + .devaddr_histograms = internal_addr + rs->internal.histograms.offset, + }; + + disp->CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push_prefix), &push_prefix); + + disp->CmdDispatch(commandBuffer, passes, 1, 1); + } + + /* Pipeline: SCATTER */ + vk_barrier_compute_w_to_compute_r(commandBuffer); + + uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); + + for (uint32_t i = 0; i < infoCount; i++) { + uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; + uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + bvh_states[i].push_scatter = (struct rs_push_scatter){ + .devaddr_keyvals_even = keyvals_even_addr, + .devaddr_keyvals_odd = keyvals_odd_addr, + .devaddr_partitions = internal_addr + rs->internal.partitions.offset, + .devaddr_histograms = internal_addr + rs->internal.histograms.offset + histogram_offset, + }; + } + + bool is_even = true; + + while (true) { + uint32_t pass_dword = pass_idx / 4; + + /* Bind new pipeline */ + VkPipeline p = + is_even ? rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; + disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); + + /* Update push constants that changed */ + VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even + : rs->pipeline_layouts.named.scatter[pass_dword].odd; + + for (uint32_t i = 0; i < infoCount; i++) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + bvh_states[i].push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; + + disp->CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct rs_push_scatter), + &bvh_states[i].push_scatter); + + disp->CmdDispatch(commandBuffer, bvh_states[i].scatter_blocks, 1, 1); + + bvh_states[i].push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); + } + + /* Continue? 
*/ + if (++pass_idx >= keyval_bytes) + break; + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + is_even ^= true; + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); +} + +static VkResult +lbvh_build_internal(VkCommandBuffer commandBuffer, + struct vk_device *device, struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + + VkResult result = + get_pipeline_spv(device, meta, "lbvh_main", lbvh_main_spv, + sizeof(lbvh_main_spv), + sizeof(struct lbvh_main_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "lbvh_build_internal"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) + continue; + + uint32_t src_scratch_offset = bvh_states[i].scratch_offset; + uint32_t internal_node_count = MAX2(bvh_states[i].leaf_node_count, 2) - 1; + + const struct lbvh_main_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .src_ids = pInfos[i].scratchData.deviceAddress + src_scratch_offset, + .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, + .id_count = bvh_states[i].leaf_node_count, + .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, + }; + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + device->cmd_dispatch_unaligned(commandBuffer, internal_node_count, 1, 1); + bvh_states[i].internal_node_count = internal_node_count; + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + result = + get_pipeline_spv(device, meta, "lbvh_generate_ir", lbvh_generate_ir_spv, + sizeof(lbvh_generate_ir_spv), + sizeof(struct lbvh_generate_ir_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) + continue; + + const struct lbvh_generate_ir_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, + }; + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + device->cmd_dispatch_unaligned(commandBuffer, bvh_states[i].internal_node_count, 1, 1); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +static VkResult +ploc_build_internal(VkCommandBuffer commandBuffer, + struct vk_device *device, struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + 
+ VkResult result = + get_pipeline_spv(device, meta, "ploc", ploc_spv, + sizeof(ploc_spv), + sizeof(struct ploc_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "ploc_build_internal"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_PLOC) + continue; + + uint32_t src_scratch_offset = bvh_states[i].scratch_offset; + uint32_t dst_scratch_offset = (src_scratch_offset == bvh_states[i].scratch.sort_buffer_offset[0]) + ? bvh_states[i].scratch.sort_buffer_offset[1] + : bvh_states[i].scratch.sort_buffer_offset[0]; + + const struct ploc_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .ids_0 = pInfos[i].scratchData.deviceAddress + src_scratch_offset, + .ids_1 = pInfos[i].scratchData.deviceAddress + dst_scratch_offset, + .prefix_scan_partitions = + pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ploc_prefix_sum_partition_offset, + .internal_node_offset = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, + }; + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + disp->CmdDispatch(commandBuffer, MAX2(DIV_ROUND_UP(bvh_states[i].leaf_node_count, PLOC_WORKGROUP_SIZE), 1), 1, 1); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +void +vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer, + struct vk_device *device, + struct vk_meta_device *meta, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + const struct vk_acceleration_structure_build_args *args) +{ + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer); + const struct vk_acceleration_structure_build_ops *ops = device->as_build_ops; + + struct bvh_batch_state batch_state = {0}; + + struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state)); + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "vkCmdBuildAccelerationStructuresKHR(%u)", infoCount); + + for (uint32_t i = 0; i < infoCount; ++i) { + uint32_t leaf_node_count = 0; + for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { + leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; + } + + get_scratch_layout(device, leaf_node_count, pInfos + i, args, &bvh_states[i].scratch); + + struct build_config config = build_config(leaf_node_count, pInfos + i, + device->as_build_ops); + bvh_states[i].config = config; + + if (config.updateable) + batch_state.any_updateable = true; + else + batch_state.any_non_updateable = true; + + if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) { + batch_state.any_ploc = true; + } else if (config.internal_type == INTERNAL_BUILD_TYPE_LBVH) { + batch_state.any_lbvh = true; + } else if (config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) { + batch_state.any_update = true; + } else { + unreachable("Unknown internal_build_type"); + } + + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) { + /* The internal node count is updated in lbvh_build_internal for LBVH + * and from the PLOC 
shader for PLOC. */ + struct vk_ir_header header = { + .min_bounds = {0x7fffffff, 0x7fffffff, 0x7fffffff}, + .max_bounds = {0x80000000, 0x80000000, 0x80000000}, + .dispatch_size_y = 1, + .dispatch_size_z = 1, + .sync_data = + { + .current_phase_end_counter = TASK_INDEX_INVALID, + /* Will be updated by the first PLOC shader invocation */ + .task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID}, + }, + }; + + device->write_buffer_cp(commandBuffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + &header, sizeof(header)); + } else { + VK_FROM_HANDLE(vk_acceleration_structure, src_as, pInfos[i].srcAccelerationStructure); + VK_FROM_HANDLE(vk_acceleration_structure, dst_as, pInfos[i].dstAccelerationStructure); + + ops->init_update_scratch(commandBuffer, pInfos[i].scratchData.deviceAddress, + leaf_node_count, src_as, dst_as); + } + } + + /* Wait for the write_buffer_cp to land before using in compute shaders */ + device->flush_buffer_write_cp(commandBuffer); + device->dispatch_table.CmdPipelineBarrier(commandBuffer, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, /* dependencyFlags */ + 1, + &(VkMemoryBarrier) { + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, + }, 0, NULL, 0, NULL); + + if (batch_state.any_lbvh || batch_state.any_ploc) { + VkResult result; + + if (batch_state.any_non_updateable) { + result = + build_leaves(commandBuffer, device, meta, args, infoCount, pInfos, + ppBuildRangeInfos, bvh_states, false); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + if (batch_state.any_updateable) { + result = + build_leaves(commandBuffer, device, meta, args, infoCount, pInfos, + ppBuildRangeInfos, bvh_states, true); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + result = + morton_generate(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + morton_sort(commandBuffer, device, args, infoCount, pInfos, bvh_states); + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + if (batch_state.any_lbvh) { + result = + lbvh_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + if (batch_state.any_ploc) { + result = + ploc_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states); + + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + vk_barrier_compute_w_to_indirect_compute_r(commandBuffer); + } + + for (unsigned pass = 0; pass < ARRAY_SIZE(ops->encode_as); pass++) { + if (!ops->encode_as[pass] && !ops->update_as[pass]) + break; + + bool progress; + do { + progress = false; + + bool update; + uint32_t encode_key; + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].last_encode_pass == pass + 1) + continue; + + if (!progress) { + update = (bvh_states[i].config.internal_type == + INTERNAL_BUILD_TYPE_UPDATE); + if (update && !ops->update_as[pass]) + continue; + if (!update && !ops->encode_as[pass]) + continue; + encode_key = 
bvh_states[i].config.encode_key[pass]; + progress = true; + if (update) + ops->update_bind_pipeline[pass](commandBuffer); + else + ops->encode_bind_pipeline[pass](commandBuffer, encode_key); + } else { + if (update != (bvh_states[i].config.internal_type == + INTERNAL_BUILD_TYPE_UPDATE) || + encode_key != bvh_states[i].config.encode_key[pass]) + continue; + } + + VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); + + if (update) { + VK_FROM_HANDLE(vk_acceleration_structure, src, pInfos[i].srcAccelerationStructure); + ops->update_as[pass](commandBuffer, + &pInfos[i], + ppBuildRangeInfos[i], + bvh_states[i].leaf_node_count, + src, + accel_struct); + + } else { + ops->encode_as[pass](commandBuffer, + &pInfos[i], + ppBuildRangeInfos[i], + pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + bvh_states[i].leaf_node_count, + encode_key, + accel_struct); + } + + bvh_states[i].last_encode_pass = pass + 1; + } + } while (progress); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + free(bvh_states); +} + +void +vk_get_as_build_sizes(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + const uint32_t *pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo, + const struct vk_acceleration_structure_build_args *args) +{ + VK_FROM_HANDLE(vk_device, device, _device); + + uint32_t leaf_count = 0; + for (uint32_t i = 0; i < pBuildInfo->geometryCount; i++) + leaf_count += pMaxPrimitiveCounts[i]; + + struct scratch_layout scratch; + + get_scratch_layout(device, leaf_count, pBuildInfo, args, &scratch); + + pSizeInfo->accelerationStructureSize = + device->as_build_ops->get_as_size(_device, pBuildInfo, leaf_count); + pSizeInfo->updateScratchSize = scratch.update_size; + pSizeInfo->buildScratchSize = scratch.size; +} + +/* Return true if the common framework supports using this format for loading + * vertices. Must match the formats handled by load_vertices() on the GPU. 
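The encode loop above is driven entirely by the vk_acceleration_structure_build_ops vtable declared in vk_acceleration_structure.h below: the shared code builds the vk_ir_* BVH, and the driver only supplies per-stage callbacks plus device->as_build_ops. A deliberately minimal, hypothetical sketch of that wiring (single encode pass, no update support, stub bodies; names are illustrative, not radv's actual implementation):

#include "vk_acceleration_structure.h"
#include "vk_device.h"

static VkDeviceSize
drv_get_as_size(VkDevice device,
                const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
                uint32_t leaf_count)
{
   /* A real driver returns its HW-specific size: header plus worst-case
    * node storage for leaf_count leaves.  Placeholder numbers here. */
   return 4096 + (VkDeviceSize)leaf_count * 128;
}

static VkResult
drv_encode_bind_pipeline(VkCommandBuffer cmd_buffer, uint32_t key)
{
   /* A real driver binds the encode compute pipeline selected by `key`. */
   return VK_SUCCESS;
}

static void
drv_encode_as(VkCommandBuffer cmd_buffer,
              const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
              const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos,
              VkDeviceAddress intermediate_as_addr,
              VkDeviceAddress intermediate_header_addr,
              uint32_t leaf_count, uint32_t key,
              struct vk_acceleration_structure *dst)
{
   /* A real driver pushes the IR and header addresses and dispatches its
    * encode shader to translate vk_ir_* nodes into the HW layout. */
}

static const struct vk_acceleration_structure_build_ops drv_as_build_ops = {
   .get_as_size = drv_get_as_size,
   .encode_bind_pipeline = { drv_encode_bind_pipeline },
   .encode_as = { drv_encode_as },
};

void
drv_init_as_build_ops(struct vk_device *device)
{
   device->as_build_ops = &drv_as_build_ops;
}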
+ */ +bool +vk_acceleration_struct_vtx_format_supported(VkFormat format) +{ + switch (format) { + case VK_FORMAT_R32G32_SFLOAT: + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + case VK_FORMAT_R16G16_SFLOAT: + case VK_FORMAT_R16G16B16_SFLOAT: + case VK_FORMAT_R16G16B16A16_SFLOAT: + case VK_FORMAT_R16G16_SNORM: + case VK_FORMAT_R16G16_UNORM: + case VK_FORMAT_R16G16B16A16_SNORM: + case VK_FORMAT_R16G16B16A16_UNORM: + case VK_FORMAT_R8G8_SNORM: + case VK_FORMAT_R8G8_UNORM: + case VK_FORMAT_R8G8B8A8_SNORM: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + return true; + default: + return false; + } +} + diff --git a/src/vulkan/runtime/vk_acceleration_structure.h b/src/vulkan/runtime/vk_acceleration_structure.h index bcc2eff4660..b34d177cbfe 100644 --- a/src/vulkan/runtime/vk_acceleration_structure.h +++ b/src/vulkan/runtime/vk_acceleration_structure.h @@ -26,6 +26,11 @@ #define VK_ACCELERATION_STRUCTURE_H #include "vk_object.h" +#include "radix_sort/radix_sort_vk.h" + +#ifdef __cplusplus +extern "C" { +#endif struct vk_acceleration_structure { struct vk_object_base base; @@ -40,4 +45,88 @@ VkDeviceAddress vk_acceleration_structure_get_va(struct vk_acceleration_structur VK_DEFINE_NONDISP_HANDLE_CASTS(vk_acceleration_structure, base, VkAccelerationStructureKHR, VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR) +#define MAX_ENCODE_PASSES 2 +#define MAX_UPDATE_PASSES 2 + +struct vk_acceleration_structure_build_ops { + VkDeviceSize (*get_as_size)(VkDevice device, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + uint32_t leaf_count); + VkDeviceSize (*get_update_scratch_size)(struct vk_device *device, uint32_t leaf_count); + uint32_t (*get_encode_key[MAX_ENCODE_PASSES])(VkAccelerationStructureTypeKHR type, + VkBuildAccelerationStructureFlagBitsKHR flags); + VkResult (*encode_bind_pipeline[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer, + uint32_t key); + void (*encode_as[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, + uint32_t leaf_count, + uint32_t key, + struct vk_acceleration_structure *dst); + void (*init_update_scratch)(VkCommandBuffer cmd_buffer, + VkDeviceAddress scratch, + uint32_t leaf_count, + struct vk_acceleration_structure *src_as, + struct vk_acceleration_structure *dst_as); + void (*update_bind_pipeline[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer); + void (*update_as[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + uint32_t leaf_count, + struct vk_acceleration_structure *dst, + struct vk_acceleration_structure *src); +}; + +struct vk_acceleration_structure_build_args { + uint32_t subgroup_size; + uint32_t bvh_bounds_offset; + bool emit_markers; + const radix_sort_vk_t *radix_sort; +}; + +struct vk_meta_device; + +void vk_cmd_build_acceleration_structures(VkCommandBuffer cmdbuf, + struct vk_device *device, + struct vk_meta_device *meta, + uint32_t info_count, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + const struct vk_acceleration_structure_build_args *args); + +void vk_get_as_build_sizes(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, + const 
VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + const uint32_t *pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo, + const struct vk_acceleration_structure_build_args *args); + +bool vk_acceleration_struct_vtx_format_supported(VkFormat format); + +static inline VkGeometryTypeKHR +vk_get_as_geometry_type(const VkAccelerationStructureBuildGeometryInfoKHR *build_info) +{ + if (build_info->geometryCount) { + if (build_info->pGeometries) + return build_info->pGeometries[0].geometryType; + else + return build_info->ppGeometries[0]->geometryType; + } + + /* If there are no geometries, the geometry type shouldn't matter, but + * return something. + */ + return VK_GEOMETRY_TYPE_TRIANGLES_KHR; +} + +struct vk_bvh_geometry_data +vk_fill_geometry_data(VkAccelerationStructureTypeKHR type, uint32_t first_id, uint32_t geom_index, + const VkAccelerationStructureGeometryKHR *geometry, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/vulkan/runtime/vk_device.h b/src/vulkan/runtime/vk_device.h index 4d7220f832f..83d41afab0c 100644 --- a/src/vulkan/runtime/vk_device.h +++ b/src/vulkan/runtime/vk_device.h @@ -37,6 +37,7 @@ extern "C" { #endif +struct vk_acceleration_structure_build_ops; struct vk_command_buffer_ops; struct vk_device_shader_ops; struct vk_sync; @@ -134,6 +135,9 @@ struct vk_device { /** Shader vtable for VK_EXT_shader_object and common pipelines */ const struct vk_device_shader_ops *shader_ops; + /** Acceleration structure build vtable for common BVH building. */ + const struct vk_acceleration_structure_build_ops *as_build_ops; + /** * Write data to a buffer from the command processor. This is simpler than * setting up a staging buffer and faster for small writes, but is not