diff --git a/src/intel/vulkan/grl/gpu/AABB.h b/src/intel/vulkan/grl/gpu/AABB.h new file mode 100644 index 00000000000..11d848e3c09 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/AABB.h @@ -0,0 +1,450 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "shared.h" +#include "intrinsics.h" +#ifndef __OPENCL_VERSION__ +#include "stdio.h" +#endif + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +/* ====== QUAD ENCODING config ====== */ + +#define QUAD_GEOMID_BITS 27 // dxr limit is 2^24 geos... we have headroom +#define QUAD_PRIMID_DIFF_BITS (32 - QUAD_GEOMID_BITS) +#define QUAD_GEOMID_MASK ((1<lower = (float4)(INFINITY, INFINITY, INFINITY, 0); + aabb->upper = -(float4)(INFINITY, INFINITY, INFINITY, 0); +} + +GRL_INLINE uint PRIMREF_geomID( PrimRef* aabb) +{ + const uint v = as_uint(aabb->lower.w); + return v & QUAD_GEOMID_MASK; +} + +GRL_INLINE uint PRIMREF_primID0( PrimRef* aabb) +{ + return as_uint( aabb->upper.w ) & QUAD_PRIMID_MASK; +} + +GRL_INLINE uint PRIMREF_primID1( PrimRef* aabb) +{ + const uint v = as_uint(aabb->lower.w); + const uint primID0 = as_uint(aabb->upper.w) & QUAD_PRIMID_MASK; + const uint deltaID = v >> QUAD_GEOMID_BITS; + const uint primID1 = primID0 + deltaID; + return primID1; +} + +GRL_INLINE uint PRIMREF_geomFlags( PrimRef* aabb ) +{ + const uint v = as_uint( aabb->upper.w ); + return (v >> QUAD_PRIMID_BITS) ; +} + +GRL_INLINE uint PRIMREF_instanceIndex( PrimRef* aabb ) +{ + return as_uint(aabb->lower.w) & INSTANCE_ID_MASK; +} + +GRL_INLINE uchar PRIMREF_instanceMask( PrimRef* aabb ) +{ + return as_uint(aabb->lower.w) >> INSTANCE_ID_BITS; +} + +GRL_INLINE void PRIMREF_setProceduralMetaData( PrimRef* primref, uint geomID, uint primID, uint geomFlags ) +{ + /* encode geomID, primID */ + uint flags = (geomFlags << QUAD_PRIMID_BITS); + primref->lower.w = as_float( geomID ); + primref->upper.w = as_float( primID | flags ); +} + +GRL_INLINE void PRIMREF_setQuadMetaData( PrimRef* primref, uint primID0, uint primID1, uint geomID, uint geomFlags ) +{ + const uint primID_diff = primID1 - primID0; + uint flags = geomFlags << QUAD_PRIMID_BITS; + + primref->lower.w = as_float( geomID | (primID_diff << QUAD_GEOMID_BITS) ); + primref->upper.w = as_float( (primID0 | flags) ); +} + +GRL_INLINE void PRIMREF_setAABB( PrimRef* primref, float3 lower, float3 upper ) +{ + primref->lower.xyz = lower.xyz; + primref->upper.xyz = upper.xyz; +} + +GRL_INLINE PrimRef PRIMREF_set_instance( float3 lower, float3 upper, uint instanceIndex, uint instanceMask, uint rootOffset, bool is_procedural ) +{ + PrimRef new_ref; + new_ref.lower.xyz = lower; + new_ref.lower.w = as_float(instanceIndex | (instanceMask << 24)); + new_ref.upper.xyz = upper; + new_ref.upper.w = as_float(rootOffset + (is_procedural? 
0x80000000 : 0)); + return new_ref; +} + +GRL_INLINE bool PRIMREF_isProceduralInstance( PrimRef* primref ) +{ + return (as_uint(primref->upper.w) & 0x80000000) != 0; +} + +GRL_INLINE uint PRIMREF_instanceRootNodeOffset(PrimRef* primref) +{ + return (as_uint(primref->upper.w) & 0x7fffffff); +} + +GRL_INLINE float3 PRIMREF_lower( PrimRef* primref ) +{ + return primref->lower.xyz; +} +GRL_INLINE float3 PRIMREF_upper( PrimRef* primref ) +{ + return primref->upper.xyz; +} + +GRL_INLINE void AABB_extend(struct AABB *aabb, struct AABB *v) +{ + aabb->lower = min(aabb->lower, v->lower); + aabb->upper = max(aabb->upper, v->upper); +} + +GRL_INLINE void AABB_extend_point(struct AABB *aabb, const float4 p) +{ + aabb->lower = min(aabb->lower, p); + aabb->upper = max(aabb->upper, p); +} + +GRL_INLINE void AABB_extendlu(struct AABB *aabb, const float4 lower, const float4 upper) +{ + aabb->lower = min(aabb->lower, lower); + aabb->upper = max(aabb->upper, upper); +} + +GRL_INLINE struct AABB AABB_enlarge(struct AABB *aabb, const float v) +{ + struct AABB box; + box.lower = aabb->lower - (float4)v; + box.upper = aabb->upper + (float4)v; + return box; +} + +GRL_INLINE void AABB_intersect(struct AABB *aabb, struct AABB *v) +{ + aabb->lower = max(aabb->lower, v->lower); + aabb->upper = min(aabb->upper, v->upper); +} + +GRL_INLINE float4 AABB_size(struct AABB *aabb) +{ + return aabb->upper - aabb->lower; +} + +GRL_INLINE float4 AABB_centroid2(struct AABB *aabb) +{ + return aabb->lower + aabb->upper; +} + +GRL_INLINE float AABB_halfArea(struct AABB *aabb) +{ + const float4 d = AABB_size(aabb); + return halfarea(d.xyz); +} + +GRL_INLINE float AABB_intersecion_size(struct AABB* aabb, struct AABB* v) +{ + struct AABB temp = *aabb; + AABB_intersect(&temp, v); + float4 len = AABB_size(&temp); + float ret = 0.0f; + if (len.x >= 0.0f && len.y >= 0.0f && len.z >= 0.0f) { + float3 v = { len.x, len.y, len.z }; + ret = halfarea(v); + } + return ret; +} + +GRL_INLINE bool AABB_subset(struct AABB* small, struct AABB* big) +{ + const int4 b0 = small->lower >= big->lower; + const int4 b1 = small->upper <= big->upper; + const int4 b = b0 & b1; + return b.x & b.y & b.z; +} + +GRL_INLINE struct AABB AABBfromAABB3f(const struct AABB3f box) +{ + struct AABB box4d = { + {box.lower[0], box.lower[1], box.lower[2], 0.0f}, + {box.upper[0], box.upper[1], box.upper[2], 0.0f} + }; + return box4d; +} + +GRL_INLINE struct AABB3f AABB3fFromAABB(const struct AABB box) +{ + struct AABB3f box3d = { + {box.lower[0], box.lower[1], box.lower[2]}, + {box.upper[0], box.upper[1], box.upper[2]} + }; + return box3d; +} + +GRL_INLINE bool AABB_verify(struct AABB* aabb) +{ + bool error = false; + if (aabb->lower.x > aabb->upper.x) + error = true; + if (aabb->lower.y > aabb->upper.y) + error = true; + if (aabb->lower.z > aabb->upper.z) + error = true; + if (!isfinite(aabb->lower.x)) + error = true; + if (!isfinite(aabb->lower.y)) + error = true; + if (!isfinite(aabb->lower.z)) + error = true; + if (!isfinite(aabb->upper.x)) + error = true; + if (!isfinite(aabb->upper.y)) + error = true; + if (!isfinite(aabb->upper.z)) + error = true; + return error; +} + +GRL_INLINE void AABB_print(struct AABB* aabb) +{ + printf("AABB {\n area = %f\n lower = %f\n upper = %f\n geomID = %i primID0 = %i primID1 = %i\n aabb->lower.w = %x aabb->upper.w = %x }\n", + AABB_halfArea(aabb), + aabb->lower.xyz, + aabb->upper.xyz, + PRIMREF_geomID(aabb), + PRIMREF_primID0(aabb), + PRIMREF_primID1(aabb), + as_uint(aabb->lower.w), + as_uint(aabb->upper.w)); +} + +#ifdef 
__OPENCL_VERSION__ + +GRL_INLINE PrimRef PrimRef_sub_group_shuffle(PrimRef* primRef, const uint slotID) +{ + PrimRef shuffledPrimref; + shuffledPrimref.lower.x = intel_sub_group_shuffle(primRef->lower.x, slotID); + shuffledPrimref.lower.y = intel_sub_group_shuffle(primRef->lower.y, slotID); + shuffledPrimref.lower.z = intel_sub_group_shuffle(primRef->lower.z, slotID); + shuffledPrimref.lower.w = intel_sub_group_shuffle(primRef->lower.w, slotID); + shuffledPrimref.upper.x = intel_sub_group_shuffle(primRef->upper.x, slotID); + shuffledPrimref.upper.y = intel_sub_group_shuffle(primRef->upper.y, slotID); + shuffledPrimref.upper.z = intel_sub_group_shuffle(primRef->upper.z, slotID); + shuffledPrimref.upper.w = intel_sub_group_shuffle(primRef->upper.w, slotID); + return shuffledPrimref; +} + +GRL_INLINE struct AABB AABB_sub_group_broadcast(struct AABB *aabb, const uint slotID) +{ + struct AABB bounds; + bounds.lower.x = sub_group_broadcast(aabb->lower.x, slotID); + bounds.lower.y = sub_group_broadcast(aabb->lower.y, slotID); + bounds.lower.z = sub_group_broadcast(aabb->lower.z, slotID); + bounds.lower.w = 0; + bounds.upper.x = sub_group_broadcast(aabb->upper.x, slotID); + bounds.upper.y = sub_group_broadcast(aabb->upper.y, slotID); + bounds.upper.z = sub_group_broadcast(aabb->upper.z, slotID); + bounds.upper.w = 0; + return bounds; +} +GRL_INLINE struct AABB AABB_sub_group_shuffle(struct AABB* aabb, const uint slotID) +{ + struct AABB bounds; + bounds.lower.x = intel_sub_group_shuffle(aabb->lower.x, slotID); + bounds.lower.y = intel_sub_group_shuffle(aabb->lower.y, slotID); + bounds.lower.z = intel_sub_group_shuffle(aabb->lower.z, slotID); + bounds.lower.w = 0; + bounds.upper.x = intel_sub_group_shuffle(aabb->upper.x, slotID); + bounds.upper.y = intel_sub_group_shuffle(aabb->upper.y, slotID); + bounds.upper.z = intel_sub_group_shuffle(aabb->upper.z, slotID); + bounds.upper.w = 0; + return bounds; +} + +GRL_INLINE uint AABB_sub_group_shuffle_coordPerLane(struct AABB* aabb, const uint slotID) +{ + float coordData[8] = { + sub_group_broadcast(aabb->lower.x, slotID), + sub_group_broadcast(aabb->lower.y, slotID), + sub_group_broadcast(aabb->lower.z, slotID), + sub_group_broadcast(aabb->lower.w, slotID), + sub_group_broadcast(aabb->upper.x, slotID), + sub_group_broadcast(aabb->upper.y, slotID), + sub_group_broadcast(aabb->upper.z, slotID), + sub_group_broadcast(aabb->upper.w, slotID) }; + + uint coordDataFiltered; + const uint lane = get_sub_group_local_id(); + if (lane < 8) coordDataFiltered = as_uint(coordData[lane]); + return coordDataFiltered; +} + +GRL_INLINE struct AABB AABB_sub_group_reduce(struct AABB *aabb) +{ + struct AABB bounds; + bounds.lower.x = sub_group_reduce_min(aabb->lower.x); + bounds.lower.y = sub_group_reduce_min(aabb->lower.y); + bounds.lower.z = sub_group_reduce_min(aabb->lower.z); + bounds.lower.w = 0; + bounds.upper.x = sub_group_reduce_max(aabb->upper.x); + bounds.upper.y = sub_group_reduce_max(aabb->upper.y); + bounds.upper.z = sub_group_reduce_max(aabb->upper.z); + bounds.upper.w = 0; + return bounds; +} + + +GRL_INLINE struct AABB AABB_sub_group_reduce_N6( struct AABB* aabb ) +{ + float3 l = aabb->lower.xyz; + float3 u = aabb->upper.xyz; + l = min( l, intel_sub_group_shuffle_down( l, l, 4 ) ); + l = min( l, intel_sub_group_shuffle_down( l, l, 2 ) ); + l = min( l, intel_sub_group_shuffle_down( l, l, 1 ) ); + u = max( u, intel_sub_group_shuffle_down( u, u, 4 ) ); + u = max( u, intel_sub_group_shuffle_down( u, u, 2 ) ); + u = max( u, intel_sub_group_shuffle_down( u, u, 1 ) 
); + + struct AABB bounds; + bounds.lower.x = l.x; + bounds.lower.y = l.y; + bounds.lower.z = l.z; + bounds.lower.w = 0; + bounds.upper.x = u.x; + bounds.upper.y = u.y; + bounds.upper.z = u.z; + bounds.upper.w = 0; + return bounds; +} + + +GRL_INLINE struct AABB AABB_work_group_reduce(struct AABB *aabb) +{ + struct AABB bounds; + bounds.lower.x = work_group_reduce_min(aabb->lower.x); + bounds.lower.y = work_group_reduce_min(aabb->lower.y); + bounds.lower.z = work_group_reduce_min(aabb->lower.z); + bounds.upper.x = work_group_reduce_max(aabb->upper.x); + bounds.upper.y = work_group_reduce_max(aabb->upper.y); + bounds.upper.z = work_group_reduce_max(aabb->upper.z); + return bounds; +} + +GRL_INLINE struct AABB AABB_sub_group_scan_exclusive_min_max(struct AABB *aabb) +{ + struct AABB bounds; + bounds.lower.x = sub_group_scan_exclusive_min(aabb->lower.x); + bounds.lower.y = sub_group_scan_exclusive_min(aabb->lower.y); + bounds.lower.z = sub_group_scan_exclusive_min(aabb->lower.z); + bounds.lower.w = 0; + bounds.upper.x = sub_group_scan_exclusive_max(aabb->upper.x); + bounds.upper.y = sub_group_scan_exclusive_max(aabb->upper.y); + bounds.upper.z = sub_group_scan_exclusive_max(aabb->upper.z); + bounds.upper.w = 0; + return bounds; +} + +GRL_INLINE struct AABB AABB_sub_group_scan_inclusive_min_max(struct AABB *aabb) +{ + struct AABB bounds; + bounds.lower.x = sub_group_scan_inclusive_min(aabb->lower.x); + bounds.lower.y = sub_group_scan_inclusive_min(aabb->lower.y); + bounds.lower.z = sub_group_scan_inclusive_min(aabb->lower.z); + bounds.lower.w = 0; + bounds.upper.x = sub_group_scan_inclusive_max(aabb->upper.x); + bounds.upper.y = sub_group_scan_inclusive_max(aabb->upper.y); + bounds.upper.z = sub_group_scan_inclusive_max(aabb->upper.z); + bounds.upper.w = 0; + return bounds; +} + +GRL_INLINE void AABB_global_atomic_merge(global struct AABB *global_aabb, struct AABB *aabb) +{ + atomic_min((volatile __global float *)&global_aabb->lower + 0, aabb->lower.x); + atomic_min((volatile __global float *)&global_aabb->lower + 1, aabb->lower.y); + atomic_min((volatile __global float *)&global_aabb->lower + 2, aabb->lower.z); + atomic_max((volatile __global float *)&global_aabb->upper + 0, aabb->upper.x); + atomic_max((volatile __global float *)&global_aabb->upper + 1, aabb->upper.y); + atomic_max((volatile __global float *)&global_aabb->upper + 2, aabb->upper.z); +} + +GRL_INLINE void AABB_global_atomic_merge_lu(global struct AABB* global_aabb, float3 lower, float3 upper ) +{ + atomic_min((volatile __global float*) & global_aabb->lower + 0, lower.x); + atomic_min((volatile __global float*) & global_aabb->lower + 1, lower.y); + atomic_min((volatile __global float*) & global_aabb->lower + 2, lower.z); + atomic_max((volatile __global float*) & global_aabb->upper + 0, upper.x); + atomic_max((volatile __global float*) & global_aabb->upper + 1, upper.y); + atomic_max((volatile __global float*) & global_aabb->upper + 2, upper.z); +} + +GRL_INLINE void AABB_global_atomic_merge_sub_group_lu(uniform global struct AABB* aabb, float3 lower, float3 upper) +{ + uint lane = get_sub_group_local_id(); + float l[3]; + l[0] = sub_group_reduce_min(lower.x); + l[1] = sub_group_reduce_min(lower.y); + l[2] = sub_group_reduce_min(lower.z); + float u[3]; + u[0] = sub_group_reduce_max(upper.x); + u[1] = sub_group_reduce_max(upper.y); + u[2] = sub_group_reduce_max(upper.z); + + if (lane < 3) + { + atomic_min((global float*)&aabb->lower + lane, l[lane]); + atomic_max((global float*)&aabb->upper + lane, u[lane]); + } +} + + 
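A minimal usage sketch, assuming an OpenCL caller of the helpers above (the kernel name and the `prim_bounds`/`num_prims` parameters are illustrative, not part of GRL): each work-item folds one primitive's bounds into a private AABB, the sub-group reduces once, and a single lane publishes the result with the atomic merge.

// Illustrative caller of the AABB helpers defined above; not GRL code.
kernel void reduce_scene_bounds( global struct AABB* scene_bounds,
                                 global struct AABB* prim_bounds,
                                 uint num_prims )
{
    uint tid = get_global_id(0);

    struct AABB box;
    AABB_init( &box );                    // empty box: +inf lower, -inf upper

    if ( tid < num_prims )
    {
        struct AABB prim = prim_bounds[tid];  // copy to private memory
        AABB_extend( &box, &prim );           // grow by this work-item's primitive
    }

    // one min/max reduction per sub-group, then a single global atomic merge
    struct AABB sg_box = AABB_sub_group_reduce( &box );
    if ( get_sub_group_local_id() == 0 )
        AABB_global_atomic_merge( scene_bounds, &sg_box );
}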
+GRL_INLINE void AABB_local_atomic_merge(local struct AABB *aabb, const float4 lower, const float4 upper) +{ + if (lower.x < aabb->lower.x) + atomic_min((local float *)&aabb->lower + 0, lower.x); + if (lower.y < aabb->lower.y) + atomic_min((local float *)&aabb->lower + 1, lower.y); + if (lower.z < aabb->lower.z) + atomic_min((local float *)&aabb->lower + 2, lower.z); + if (upper.x > aabb->upper.x) + atomic_max((local float *)&aabb->upper + 0, upper.x); + if (upper.y > aabb->upper.y) + atomic_max((local float *)&aabb->upper + 1, upper.y); + if (upper.z > aabb->upper.z) + atomic_max((local float *)&aabb->upper + 2, upper.z); +} +#endif + +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/api_interface.h b/src/intel/vulkan/grl/gpu/api_interface.h new file mode 100644 index 00000000000..71a1fff6327 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/api_interface.h @@ -0,0 +1,840 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once +#include "GRLStructs.h" +#include "shared.h" +#include "libs/lsc_intrinsics.h" + +typedef struct Geo GRL_RAYTRACING_GEOMETRY_DESC; + +typedef struct GRL_RAYTRACING_AABB +{ + float MinX; + float MinY; + float MinZ; + float MaxX; + float MaxY; + float MaxZ; +} GRL_RAYTRACING_AABB; + +GRL_INLINE void GLR_set_raytracing_aabb(GRL_RAYTRACING_AABB* dest, struct AABB* source) +{ + dest->MinX = source->lower.x; + dest->MinY = source->lower.y; + dest->MinZ = source->lower.z; + dest->MaxX = source->upper.x; + dest->MaxY = source->upper.y; + dest->MaxZ = source->upper.z; +} + +GRL_INLINE uint3 GRL_load_triangle(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint triID) +{ + global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer; + uint index_format = geomDesc->Desc.Triangles.IndexFormat; + + if (index_format == INDEX_FORMAT_R32_UINT) + { + const uint* data = (const uint*)(indices + triID * 3 * 4); + return (uint3)(data[0], data[1], data[2]); + } + else if (index_format == INDEX_FORMAT_NONE) + { + return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2); + } + else + { + const ushort* data = (const ushort*)(indices + triID * 3 * 2); + return (uint3)(data[0], data[1], data[2]); + } +} + +GRL_INLINE uint3 GRL_load_indices_from_buffer(global char* indices, const uint index_format, const uint triID) +{ + if (index_format == INDEX_FORMAT_R32_UINT) + { + return load_uint3_L1C_L3C((global uint3*)(indices + triID * 3 * 4), 0); + } + else if (index_format == INDEX_FORMAT_NONE) + { + return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2); + } + else + { + const ushort* data = (const ushort*)(indices + triID * 3 * 2); + return (uint3)(data[0], data[1], data[2]); + } +} + +// Load all 3 indices from one triangle, and a single index from another +GRL_INLINE uint4 GRL_load_quad_indices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint triID, uint triID_1, ushort fourth_vert) +{ + global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer; + uint index_format = geomDesc->Desc.Triangles.IndexFormat; + + if (index_format == INDEX_FORMAT_R32_UINT) + { + const uint* data0 = (const uint*)(indices + triID * 3 * 4); + const uint* data1 = (const uint*)(indices + triID_1 * 3 * 4); + return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]); + } + else if (index_format == INDEX_FORMAT_NONE) + { + return (uint4)(triID * 3, triID * 3 + 1, triID * 3 + 2, triID_1 * 3 + fourth_vert); + } + else + { + const ushort* data0 = (const 
ushort*)(indices + triID * 3 * 2); + const ushort* data1 = (const ushort*)(indices + triID_1 * 3 * 2); + return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]); + } +} + +GRL_INLINE void GRL_set_Type(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, GeometryType type) +{ + geomDesc->Type = type; +} + +GRL_INLINE GeometryType GRL_get_Type(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Type; +} + +GRL_INLINE void GRL_set_Flags(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint8_t flags) +{ + geomDesc->Flags = flags; +} + +GRL_INLINE uint8_t GRL_get_Flags(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Flags; +} + +GRL_INLINE void GRL_set_triangles_Transform(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t transform) +{ + geomDesc->Desc.Triangles.pTransformBuffer = transform; +} + +GRL_INLINE gpuva_t GRL_get_triangles_Transform(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.pTransformBuffer; +} + +GRL_INLINE void GRL_set_triangles_IndexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, IndexFormat format) +{ + geomDesc->Desc.Triangles.IndexFormat = format; +} + +GRL_INLINE IndexFormat GRL_get_triangles_IndexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.IndexFormat; +} + +GRL_INLINE void GRL_set_triangles_VertexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, VertexFormat format) +{ + geomDesc->Desc.Triangles.VertexFormat = format; +} + +GRL_INLINE VertexFormat GRL_get_triangles_VertexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.VertexFormat; +} + +GRL_INLINE void GRL_set_triangles_IndexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count) +{ + geomDesc->Desc.Triangles.IndexCount = count; +} + +GRL_INLINE dword GRL_get_triangles_IndexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.IndexCount; +} + +GRL_INLINE void GRL_set_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count) +{ + geomDesc->Desc.Triangles.VertexCount = count; +} + +GRL_INLINE dword GRL_get_triangles_VertexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.VertexCount; +} + +GRL_INLINE void GRL_set_triangles_IndexBuffer(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t buffer) +{ + geomDesc->Desc.Triangles.pIndexBuffer = buffer; +} + +GRL_INLINE gpuva_t GRL_get_triangles_IndexBuffer(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.pIndexBuffer; +} + +GRL_INLINE void GRL_set_triangles_VertexBuffer_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address) +{ + geomDesc->Desc.Triangles.pVertexBuffer = address; +} + +GRL_INLINE gpuva_t GRL_get_triangles_VertexBuffer_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.pVertexBuffer; +} + +GRL_INLINE void GRL_set_triangles_VertexBuffer_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, unsigned long stride) +{ + geomDesc->Desc.Triangles.VertexBufferByteStride = stride; +} + +GRL_INLINE unsigned long GRL_get_triangles_VertexBuffer_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.VertexBufferByteStride; +} + +GRL_INLINE unsigned long GRL_get_triangles_IndexFormatSizeInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return (unsigned long)(geomDesc->Desc.Triangles.IndexFormat); +} + +GRL_INLINE void GRL_set_procedurals_AABBCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count) +{ + 
geomDesc->Desc.Procedural.AABBCount = count; +} + +GRL_INLINE dword GRL_get_procedurals_AABBCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Procedural.AABBCount; +} + +GRL_INLINE void GRL_set_procedurals_AABBs_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address) +{ + geomDesc->Desc.Procedural.pAABBs_GPUVA = address; +} + +GRL_INLINE gpuva_t GRL_get_procedurals_AABBs_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Procedural.pAABBs_GPUVA; +} + +GRL_INLINE void GRL_set_procedurals_AABBs_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, qword stride) +{ + geomDesc->Desc.Procedural.AABBByteStride = stride; +} + +GRL_INLINE qword GRL_get_procedurals_AABBs_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Procedural.AABBByteStride; +} + +GRL_INLINE uint GRL_is_procedural(GRL_RAYTRACING_GEOMETRY_DESC* desc) +{ + return desc->Type == (unsigned char)GEOMETRY_TYPE_PROCEDURAL; +} + +GRL_INLINE uint GRL_is_triangle(GRL_RAYTRACING_GEOMETRY_DESC* desc) +{ + return desc->Type != (unsigned char)GEOMETRY_TYPE_PROCEDURAL; +} + +GRL_INLINE unsigned int GRL_get_ShaderIndex_Mask(GRL_RAYTRACING_GEOMETRY_DESC* desc) +{ + return 0x00FFFFFF; +} + +GRL_INLINE dword GRL_atomic_add_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* desc, dword value) +{ + return atomic_add((global uint*) & desc->Desc.Triangles.VertexCount, value); +} + +GRL_INLINE unsigned int GRL_get_primitive_count(GRL_RAYTRACING_GEOMETRY_DESC* desc) +{ + if (GRL_is_triangle(desc)) + { + if (desc->Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE) + { + return desc->Desc.Triangles.VertexCount / 3; + } + else + { + return desc->Desc.Triangles.IndexCount / 3; + } + } + else + { + return desc->Desc.Procedural.AABBCount; + } +} + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable // to leaf half values + +GRL_INLINE float snorm_to_float(short v) +{ + return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 32767.0f))); // FIXME: do we have intrinsic for this? +} + +GRL_INLINE float snorm8_to_float(signed char v) +{ + return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 127.0f))); // FIXME: do we have intrinsic for this? +} + +GRL_INLINE float unorm_to_float(unsigned short v) +{ + return min(1.0f, max(0.0f, ((float)v) * (1.0f / 65535.0f))); // FIXME: do we have intrinsic for this? +} + +//only lower 10 bits of v are used +GRL_INLINE float unorm10_to_float(unsigned v) +{ + const unsigned short mask = (unsigned short)((1u << 10u) - 1u); + const unsigned short v10 = (unsigned short)v & mask; + return min(1.0f, max(0.0f, ((float)v10) * (1.0f / 1023.0f))); // FIXME: do we have intrinsic for this? +} + +GRL_INLINE float unorm8_to_float(unsigned char v) +{ + return min(1.0f, max(0.0f, ((float)v) * (1.0f / 255.0f))); // FIXME: do we have intrinsic for this? 
+} + +GRL_INLINE float4 GRL_load_vertex(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint vtxID) +{ + float4 v = (float4)(0, 0, 0, 0); + global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer; + uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride; + uint vertex_format = geomDesc->Desc.Triangles.VertexFormat; + + if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) + { + const float* data = (const float*)(vertices + vtxID * vertex_stride); + v = (float4)(data[0], data[1], data[2], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT) + { + const float* data = (const float*)(vertices + vtxID * vertex_stride); + v = (float4)(data[0], data[1], 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) + { + const half* data = (const half*)(vertices + vtxID * vertex_stride); + v = (float4)(data[0], data[1], data[2], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT) + { + const half* data = (const half*)(vertices + vtxID * vertex_stride); + v = (float4)(data[0], data[1], 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) + { + const short* data = (const short*)(vertices + vtxID * vertex_stride); + v = (float4)(snorm_to_float(data[0]), + snorm_to_float(data[1]), + snorm_to_float(data[2]), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM) + { + const short* data = (const short*)(vertices + vtxID * vertex_stride); + v = (float4)(snorm_to_float(data[0]), + snorm_to_float(data[1]), + 0.0f, + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) + { + const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm_to_float(data[0]), + unorm_to_float(data[1]), + unorm_to_float(data[2]), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM) + { + const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm_to_float(data[0]), + unorm_to_float(data[1]), + 0.0f, + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) + { + const unsigned data = *(const unsigned*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm10_to_float(data), + unorm10_to_float((data >> 10)), + unorm10_to_float((data >> 20)), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) + { + const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm8_to_float(data[0]), + unorm8_to_float(data[1]), + unorm8_to_float(data[2]), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM) + { + const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm8_to_float(data[0]), + unorm8_to_float(data[1]), + 0.0f, + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) + { + const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); + v = (float4)(snorm8_to_float(data[0]), + snorm8_to_float(data[1]), + snorm8_to_float(data[2]), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM) + { + const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); + v = (float4)(snorm8_to_float(data[0]), + snorm8_to_float(data[1]), + 0.0f, + 0.0f); + } + + /* perform vertex transformation */ + if (geomDesc->Desc.Triangles.pTransformBuffer) + { + global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer; + const float x = xfm[0] * v.x + xfm[1] * v.y + xfm[2] * v.z + 
xfm[3]; + const float y = xfm[4] * v.x + xfm[5] * v.y + xfm[6] * v.z + xfm[7]; + const float z = xfm[8] * v.x + xfm[9] * v.y + xfm[10] * v.z + xfm[11]; + v = (float4)(x, y, z, 0.0f); + } + + return v; +} + +GRL_INLINE void GRL_load_triangle_vertices(global char* vertices, const uint vertex_format, const uint vertex_stride, global float* transform_buffer, const uint vtx0ID, const uint vtx1ID, const uint vtx2ID, float4* out) +{ + if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) + { + const float3 data0 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx0ID * vertex_stride), 0)); + const float3 data1 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx1ID * vertex_stride), 0)); + const float3 data2 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx2ID * vertex_stride), 0)); + out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f); + out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f); + out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT) + { + const float* data0 = (const float*)(vertices + vtx0ID * vertex_stride); + const float* data1 = (const float*)(vertices + vtx1ID * vertex_stride); + const float* data2 = (const float*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f); + out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f); + out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) + { + const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride); + const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride); + const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f); + out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f); + out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT) + { + const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride); + const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride); + const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f); + out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f); + out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) + { + const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride); + const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride); + const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]), 0.0f); + out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]), 0.0f); + out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM) + { + const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride); + const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride); + const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f, 0.0f); + out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f, 0.0f); + out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) + { + const 
unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride); + const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride); + const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]), 0.0f); + out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]), 0.0f); + out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM) + { + const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride); + const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride); + const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f, 0.0f); + out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f, 0.0f); + out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) + { + const unsigned data0 = *(const unsigned*)(vertices + vtx0ID * vertex_stride); + const unsigned data1 = *(const unsigned*)(vertices + vtx1ID * vertex_stride); + const unsigned data2 = *(const unsigned*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm10_to_float(data0), unorm10_to_float(data0 >> 10), unorm10_to_float(data0 >> 20), 0.0f); + out[1] = (float4)(unorm10_to_float(data1), unorm10_to_float(data1 >> 10), unorm10_to_float(data1 >> 20), 0.0f); + out[2] = (float4)(unorm10_to_float(data2), unorm10_to_float(data2 >> 10), unorm10_to_float(data2 >> 20), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); + const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); + const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]), 0.0f); + out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]), 0.0f); + out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); + const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); + const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f, 0.0f); + out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f, 0.0f); + out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); + const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); + const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]), 0.0f); + out[1] = 
(float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]), 0.0f); + out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); + const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); + const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f, 0.0f); + out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f, 0.0f); + out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f, 0.0f); + } + + /* perform vertex transformation */ + if (transform_buffer) + { + global float* xfm = (global float*)transform_buffer; + for (uint i = 0; i < 3; ++i) + { + const float x = xfm[0] * out[i].x + xfm[1] * out[i].y + xfm[2] * out[i].z + xfm[3]; + const float y = xfm[4] * out[i].x + xfm[5] * out[i].y + xfm[6] * out[i].z + xfm[7]; + const float z = xfm[8] * out[i].x + xfm[9] * out[i].y + xfm[10] * out[i].z + xfm[11]; + out[i] = (float4)(x, y, z, 0.0f); + } + } +} + +GRL_INLINE void GRL_load_quad_vertices_no_stride(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + float3* out0, float3* out1, float3* out2, float3* out3, + const uint4 vtxID, const uint vertex_format, global char* vertices) +{ + float3 v0, v1, v2, v3; + + if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) + { + const float* data0 = (const float*)(vertices + vtxID.x); + const float* data1 = (const float*)(vertices + vtxID.y); + const float* data2 = (const float*)(vertices + vtxID.z); + const float* data3 = (const float*)(vertices + vtxID.w); + v0 = (float3)(data0[0], data0[1], data0[2]); + v1 = (float3)(data1[0], data1[1], data1[2]); + v2 = (float3)(data2[0], data2[1], data2[2]); + v3 = (float3)(data3[0], data3[1], data3[2]); + } + else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT) + { + const float* data0 = (const float*)(vertices + vtxID.x); + const float* data1 = (const float*)(vertices + vtxID.y); + const float* data2 = (const float*)(vertices + vtxID.z); + const float* data3 = (const float*)(vertices + vtxID.w); + v0 = (float3)(data0[0], data0[1], 0.0f); + v1 = (float3)(data1[0], data1[1], 0.0f); + v2 = (float3)(data2[0], data2[1], 0.0f); + v3 = (float3)(data3[0], data3[1], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) + { + const half* data0 = (const half*)(vertices + vtxID.x); + const half* data1 = (const half*)(vertices + vtxID.y); + const half* data2 = (const half*)(vertices + vtxID.z); + const half* data3 = (const half*)(vertices + vtxID.w); + v0 = (float3)(data0[0], data0[1], data0[2]); + v1 = (float3)(data1[0], data1[1], data1[2]); + v2 = (float3)(data2[0], data2[1], data2[2]); + v3 = (float3)(data3[0], data3[1], data3[2]); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT) + { + const half* data0 = (const half*)(vertices + vtxID.x); + const half* data1 = (const half*)(vertices + vtxID.y); + const half* data2 = (const half*)(vertices + vtxID.z); + const half* data3 = (const half*)(vertices + vtxID.w); + v0 = (float3)(data0[0], data0[1], 0.0f); + v1 = (float3)(data1[0], data1[1], 0.0f); + v2 = (float3)(data2[0], data2[1], 0.0f); + v3 = (float3)(data3[0], data3[1], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) + { + const short* data0 = (const short*)(vertices + 
vtxID.x); + const short* data1 = (const short*)(vertices + vtxID.y); + const short* data2 = (const short*)(vertices + vtxID.z); + const short* data3 = (const short*)(vertices + vtxID.w); + v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2])); + v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2])); + v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2])); + v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), snorm_to_float(data3[2])); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM) + { + const short* data0 = (const short*)(vertices + vtxID.x); + const short* data1 = (const short*)(vertices + vtxID.y); + const short* data2 = (const short*)(vertices + vtxID.z); + const short* data3 = (const short*)(vertices + vtxID.w); + v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f); + v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f); + v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f); + v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) + { + const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x); + const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y); + const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z); + const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w); + v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2])); + v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2])); + v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2])); + v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), unorm_to_float(data3[2])); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM) + { + const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x); + const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y); + const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z); + const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w); + v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f); + v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f); + v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f); + v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) + { + const unsigned data0 = *(const unsigned*)(vertices + vtxID.x); + const unsigned data1 = *(const unsigned*)(vertices + vtxID.y); + const unsigned data2 = *(const unsigned*)(vertices + vtxID.z); + const unsigned data3 = *(const unsigned*)(vertices + vtxID.w); + v0 = (float3)(unorm10_to_float(data0), unorm10_to_float((data0 >> 10)), unorm10_to_float((data0 >> 20))); + v1 = (float3)(unorm10_to_float(data1), unorm10_to_float((data1 >> 10)), unorm10_to_float((data1 >> 20))); + v2 = (float3)(unorm10_to_float(data2), unorm10_to_float((data2 >> 10)), unorm10_to_float((data2 >> 20))); + v3 = (float3)(unorm10_to_float(data3), unorm10_to_float((data3 >> 10)), unorm10_to_float((data3 >> 20))); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x); + const unsigned char* data1 = (const unsigned 
char*)(vertices + vtxID.y); + const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z); + const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w); + v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2])); + v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2])); + v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2])); + v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), unorm8_to_float(data3[2])); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x); + const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y); + const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z); + const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w); + v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f); + v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f); + v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f); + v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) + { + const signed char* data0 = (const signed char*)(vertices + vtxID.x); + const signed char* data1 = (const signed char*)(vertices + vtxID.y); + const signed char* data2 = (const signed char*)(vertices + vtxID.z); + const signed char* data3 = (const signed char*)(vertices + vtxID.w); + v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2])); + v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2])); + v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2])); + v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), snorm8_to_float(data3[2])); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM) + { + const signed char* data0 = (const signed char*)(vertices + vtxID.x); + const signed char* data1 = (const signed char*)(vertices + vtxID.y); + const signed char* data2 = (const signed char*)(vertices + vtxID.z); + const signed char* data3 = (const signed char*)(vertices + vtxID.w); + v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f); + v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f); + v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f); + v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), 0.0f); + } + else + { + v0 = (float3)(0.0f, 0.0f, 0.0f); + v1 = (float3)(0.0f, 0.0f, 0.0f); + v2 = (float3)(0.0f, 0.0f, 0.0f); + v3 = (float3)(0.0f, 0.0f, 0.0f); + } + + + /* perform vertex transformation */ + if (geomDesc->Desc.Triangles.pTransformBuffer) + { + global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer; + + v0.xyz = (float3)( + xfm[0] * v0.x + xfm[1] * v0.y + xfm[2] * v0.z + xfm[3], + xfm[4] * v0.x + xfm[5] * v0.y + xfm[6] * v0.z + xfm[7], + xfm[8] * v0.x + xfm[9] * v0.y + xfm[10] * v0.z + xfm[11] + ); + + v1.xyz = (float3)( + xfm[0] * v1.x + xfm[1] * v1.y + xfm[2] * v1.z + xfm[3], + xfm[4] * v1.x + xfm[5] * v1.y + xfm[6] * v1.z + xfm[7], + xfm[8] * v1.x + xfm[9] * v1.y + xfm[10] * v1.z + xfm[11] + ); + + v2.xyz = (float3)( + xfm[0] * v2.x + xfm[1] * v2.y + xfm[2] * v2.z + xfm[3], + xfm[4] * v2.x + xfm[5] * v2.y + xfm[6] * v2.z + xfm[7], + xfm[8] * v2.x + xfm[9] * v2.y + 
xfm[10] * v2.z + xfm[11] + ); + + v3.xyz = (float3)( + xfm[0] * v3.x + xfm[1] * v3.y + xfm[2] * v3.z + xfm[3], + xfm[4] * v3.x + xfm[5] * v3.y + xfm[6] * v3.z + xfm[7], + xfm[8] * v3.x + xfm[9] * v3.y + xfm[10] * v3.z + xfm[11] + ); + } + + *out0 = v0; + *out1 = v1; + *out2 = v2; + *out3 = v3; +} + + +GRL_INLINE void GRL_load_quad_vertices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + float3* out0, float3* out1, float3* out2, float3* out3, + uint4 vtxID) +{ + global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer; + uint vertex_format = geomDesc->Desc.Triangles.VertexFormat; + uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride; + + vtxID *= vertex_stride; + + GRL_load_quad_vertices_no_stride(geomDesc, out0, out1, out2, out3, + vtxID, vertex_format, vertices); +} + + +GRL_INLINE GRL_RAYTRACING_AABB GRL_load_aabb(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint primID) +{ + global char* aabb0 = (global char*)geomDesc->Desc.Procedural.pAABBs_GPUVA; + global char* aabb = aabb0 + (primID * geomDesc->Desc.Procedural.AABBByteStride); + return *(global GRL_RAYTRACING_AABB*)aabb; +} + +// same as for d3d12 +typedef struct GRL_RAYTRACING_INSTANCE_DESC +{ + float Transform[12]; + // unsigned int InstanceID : 24; + // unsigned int InstanceMask : 8; + uint32_t DW0; + // unsigned int InstanceContributionToHitGroupIndex : 24; + // unsigned int Flags : 8; + uint32_t DW1; + global char* AccelerationStructure; +} GRL_RAYTRACING_INSTANCE_DESC; + +GRL_INLINE float GRL_get_transform(const GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column) +{ + return d->Transform[row * 4 + column]; +} + +GRL_INLINE uint32_t GRL_get_instanceID(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return d->DW0 & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t GRL_get_InstanceMask(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return d->DW0 >> 24; +} + +GRL_INLINE uint32_t GRL_get_InstanceContributionToHitGroupIndex(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return d->DW1 & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t GRL_get_InstanceFlags(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return d->DW1 >> 24; +} + +GRL_INLINE gpuva_t GRL_get_AccelerationStructure(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return (gpuva_t)d->AccelerationStructure; +} + +GRL_INLINE void GRL_set_transform(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column, float value) +{ + d->Transform[row * 4 + column] = value; +} + +GRL_INLINE void GRL_set_instanceID(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t id) +{ + d->DW0 &= 255 << 24; + d->DW0 |= id & ((1 << 24) - 1); +} + +GRL_INLINE void GRL_set_InstanceMask(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t mask) +{ + d->DW0 &= ((1 << 24) - 1); + d->DW0 |= mask << 24; +} + +GRL_INLINE void GRL_set_InstanceContributionToHitGroupIndex(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t contribution) +{ + d->DW1 &= 255 << 24; + d->DW1 |= contribution & ((1 << 24) - 1); +} + +GRL_INLINE void GRL_set_InstanceFlags(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t flags) +{ + d->DW1 &= ((1 << 24) - 1); + d->DW1 |= flags << 24; +} + +GRL_INLINE void GRL_set_AccelerationStructure(GRL_RAYTRACING_INSTANCE_DESC* d, gpuva_t address) +{ + d->AccelerationStructure = (global char*)address; +} diff --git a/src/intel/vulkan/grl/gpu/atomic_update.cl b/src/intel/vulkan/grl/gpu/atomic_update.cl new file mode 100644 index 00000000000..5171a122dc1 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/atomic_update.cl @@ -0,0 +1,1112 @@ +// +// Copyright (C) 
2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "GRLGen12.h" + +#include "bvh_build_refit.h" +#include "bvh_build_treelet_refit.h" + + +struct RefitScratch +{ + float lower[3]; + uint mask; + float upper[3]; + uint _pad; + +}; + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel +init_refit_scratch( + global struct BVHBase* bvh, + global struct RefitScratch* scratch ) +{ + uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); + + if ( tid < BVHBase_GetNumInternalNodes(bvh) ) + { + float4 v = (float4) (FLT_MAX,FLT_MAX,FLT_MAX,0); + store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 0, as_uint4(v) ); + store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 1, as_uint4(v) ); + } +} + +bool is_fat_leaf( InternalNode* curNode ) +{ + return curNode->nodeType != BVH_INTERNAL_NODE; // TODO: Not enough for traversal shaders!! if ts enabled need to check child types +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel +build_fatleaf_table( + global struct BVHBase* bvh ) +{ + uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); + + if ( tid < BVHBase_GetNumInternalNodes(bvh) ) + { + InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid; + + if ( is_fat_leaf(curNode) ) + { + uint offs = atomic_inc_global( &bvh->fatLeafCount ); + + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + uint bp = *InnerNode_GetBackPointer(backPointers, tid); + + LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+offs; + leaf->backpointer = bp; + leaf->inner_node_index = tid; + leaf->leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart; + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel +build_fatleaf_table_new_update( + global struct Globals *globals, + global struct BVHBase* bvh ) +{ + uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); + + if ( tid < BVHBase_GetNumInternalNodes(bvh) ) + { + InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid; + + if ( is_fat_leaf(curNode) ) + { + // This implementation uses fatleaf table structure but it is actually quad table + // Also tested implementation that process 2 fatleafs per SIMD line as we iterate over the children + // but performance was worse + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + uint bp = *InnerNode_GetBackPointer(backPointers, tid); + uint fatLeafTableStart = bvh->fatLeafTableStart; + + uint leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart; + uint numChildren = (bp >> 3) & 0x7; + + uint quad_leaf_table_index = leaf_index; + + // Check if num children is outside of the % 256 work group + // If so, move these cases to the offset after numQuads and push them to the leftovers part + // where fatleaves are stored every 8th pos with additional padding + // This way we will not have the case in leftovers table where single fatleaf has children in 2 separate work groups + + uint prev_group = leaf_index & 255; + uint next_group = (leaf_index + (numChildren - 1)) & 255; + uint slm_pos = prev_group; + bool is_leftover = prev_group > next_group; + + if(is_leftover) + { + LeafTableEntry* leafBase = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index)); + uint numQuads_aligned_256 = (globals->numPrimitives + 255) & ~255; + + uint leftovers_offset = atomic_add_global( &bvh->quadLeftoversCountNewAtomicUpdate, 8 ); + + for(uint 
i = 0; i < BVH_NODE_N6; i++) + { + uint pos = (i < numChildren) ? i : 0; + LeafTableEntry* leaf_null = &leafBase[pos]; + leaf_null->leaf_index = -1 << 3; + } + + quad_leaf_table_index = numQuads_aligned_256 + leftovers_offset; + slm_pos = leftovers_offset & 255; + } + + LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index)); + + for(uint i = 0; i < BVH_NODE_N6; i++) + { + uint pos = (i < numChildren) ? i : 0; + LeafTableEntry* leafCur = &leaf[pos]; + leafCur->backpointer = bp; + leafCur->inner_node_index = (tid << 8) | slm_pos; + leafCur->leaf_index = (leaf_index << 3) | pos; + } + + // Need to clean the unused area where we pad to 8 for leftovers + if(is_leftover) + { + for(uint i = 1; i < 8; i++) + { + uint pos = (i >= numChildren) ? i : 7; + LeafTableEntry* leafCur = &leaf[pos]; + leafCur->leaf_index = -1 << 3; + } + } + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel +build_innernode_table( + global struct BVHBase* bvh ) +{ + uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); + + if ( tid < BVHBase_GetNumInternalNodes(bvh) ) + { + InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid; + + if ( !is_fat_leaf( curNode ) ) + { + uint offs = atomic_inc_global( &bvh->innerCount ); + + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + uint bp = *InnerNode_GetBackPointer(backPointers, tid); + + InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh)+offs; + inner->node_index_and_numchildren = (tid<<3) | ((bp>>3) &7); + inner->first_child = tid + curNode->childOffset; + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) void kernel +fixup_quad_table( + global struct BVHBase* bvh ) +{ + // This kernel has 2 work groups that set the magic number for unused data in + // fatleaf table. One work group for thelast group of the first part where quads are packed, + // second one for the last group of the part where quads are stored padded + + uint numQuads = BVHBase_GetNumQuads(bvh); + uint numQuadLeftovers = bvh->quadLeftoversCountNewAtomicUpdate; + uint numQuadLeftovers_aligned_256 = (numQuadLeftovers + 255) & ~255; + + uint numQuads_aligned_256 = (numQuads + 255) & ~255; + uint quadOffsetEnd = numQuads_aligned_256 + get_group_id(0) * numQuadLeftovers_aligned_256; + uint quadOffsetStart = quadOffsetEnd - 256; + + uint quads_number_last_group = (get_group_id(0) == 0) ? numQuads : numQuads_aligned_256 + numQuadLeftovers; + + uint leftovers = quadOffsetEnd - quads_number_last_group; + + uint tid = get_local_id(0) > (255 - leftovers) ? 
get_local_id(0) : 256 - leftovers; + + if(leftovers != 0) + { + LeafTableEntry* leafBvh = BVHBase_GetFatLeafTable(bvh); + + LeafTableEntry* leaf = &leafBvh[quadOffsetStart + tid]; + leaf->leaf_index = -1 << 3; + } + + if(get_group_id(0) == 1 && get_local_id(0) == 0) + bvh->quadTableSizeNewAtomicUpdate = quadOffsetEnd; +} + + +// updates one quad leaf and gets BBOX contatining it +GRL_INLINE void refit_bottom_child_quad_WB( + global struct QuadLeaf* quad, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + struct AABB* childAABB) +{ + /* get the geomID and primID0/1 for both quad triangles */ + const uint geomID = PrimLeaf_GetGeoIndex(&quad->leafDesc); + const uint primID0 = quad->primIndex0; + const uint primID1 = primID0 + QuadLeaf_GetPrimIndexDelta(quad); + ushort fourth_vert = 0; + + if (primID1 != primID0) + { + ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(quad); + fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert; + fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert; + } + + global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc + geomID; + + uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert); + + // read the indices of the 4 verts we want + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices); + + childAABB->lower.xyz = min( min( vtx0, vtx1 ), min(vtx2,vtx3) ); + childAABB->upper.xyz = max( max( vtx0, vtx1 ), max(vtx2,vtx3) ); + + float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x ); + float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y ); + float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z ); + + global uint4* dst_verts = (global uint4*) &(quad->v[0][0]); + store_uint4_L1WB_L3WB( dst_verts, 0, as_uint4(pack0) ); + store_uint4_L1WB_L3WB( dst_verts, 1, as_uint4(pack1) ); + store_uint4_L1WB_L3WB( dst_verts, 2, as_uint4(pack2) ); +} + +inline uchar4 uchar4_shuffle_down( uchar4 v, uint offs ) +{ + uint vi = as_uint(v); + return as_uchar4(intel_sub_group_shuffle_down(vi,vi,offs)); +} +inline uchar4 uchar4_broadcast( uchar4 v, uint offs ) +{ + uint vi = as_uint(v); + return as_uchar4(sub_group_broadcast(vi,offs)); +} + +GRL_INLINE void sg_InternalNode_setFields( + struct InternalNode* node, + struct AABB reduced_aabb, + const int offset, const uint nodeType, struct AABB* input_aabb, + const uint numChildren, const uchar nodeMask ) +{ + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + struct AABB conservative_aabb = conservativeAABB(&reduced_aabb); + const float3 org = conservative_aabb.lower.xyz; + + const float3 len = AABB_size(&conservative_aabb).xyz * up; + int3 exp; + const float3 mant = frexp_vec3(len, &exp); + exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); + + uchar4 lower_uchar = 0x80; + uchar4 upper_uchar = 0; + + ushort lane = get_sub_group_local_id(); + ushort simd8_id = lane/8; + ushort logical_lane = lane%8; + + if( logical_lane < numChildren ) + { + struct AABB child_aabb = conservativeAABB( input_aabb ); // conservative ??? 
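+            // Quantize this child's box into the parent's local 8-bit grid: offsets are
+            // taken relative to 'org' and scaled by the per-axis exponents computed above,
+            // rounding the lower bound down and the upper bound up so the quantized box
+            // remains conservative.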
+ + float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) ); + lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) ); + float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) ); + upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) ); + lower_uchar.xyz = convert_uchar3_rtn( lower ); + upper_uchar.xyz = convert_uchar3_rtp( upper ); + } + + uchar4 lo0 = lower_uchar; + uchar4 lo1 = uchar4_shuffle_down( lower_uchar, 1 ); + uchar4 lo2 = uchar4_shuffle_down( lower_uchar, 2 ); + uchar4 lo3 = uchar4_shuffle_down( lower_uchar, 3 ); + uchar4 lo4 = uchar4_shuffle_down( lower_uchar, 4 ); + uchar4 lo5 = uchar4_shuffle_down( lower_uchar, 5 ); + + uchar4 hi0 = upper_uchar; + uchar4 hi1 = uchar4_shuffle_down( upper_uchar,1 ); + uchar4 hi2 = uchar4_shuffle_down( upper_uchar,2 ); + uchar4 hi3 = uchar4_shuffle_down( upper_uchar,3 ); + uchar4 hi4 = uchar4_shuffle_down( upper_uchar,4 ); + uchar4 hi5 = uchar4_shuffle_down( upper_uchar,5 ); + + if( logical_lane == 0 ) + { + uchar childBlockStride = 0x01 + (uint)(nodeType == NODE_TYPE_INSTANCE); + + uint4 block0 = (uint4)(as_uint(org.x), as_uint(org.y), as_uint(org.z), offset); + + char3 exp_char = (char3)(exp.x,exp.y,exp.z); + + uint4 block1 = (uint4)( + as_uint((uchar4)(nodeType, 0 /* padding */, exp_char.x, exp_char.y)), + as_uint((uchar4)(exp_char.z, nodeMask, childBlockStride, childBlockStride)) , + as_uint((uchar4)(childBlockStride, childBlockStride, childBlockStride, childBlockStride)) , + as_uint((uchar4)(lo0.x,lo1.x,lo2.x,lo3.x)) + ); + + uint4 block2 = (uint4)( + as_uint((uchar4)(lo4.x,lo5.x,hi0.x,hi1.x)) , + as_uint((uchar4)(hi2.x,hi3.x,hi4.x,hi5.x)) , + as_uint((uchar4)(lo0.y,lo1.y,lo2.y,lo3.y)) , + as_uint((uchar4)(lo4.y,lo5.y,hi0.y,hi1.y)) + ); + + uint4 block3 = (uint4)( + as_uint((uchar4)(hi2.y,hi3.y,hi4.y,hi5.y)), + as_uint((uchar4)(lo0.z,lo1.z,lo2.z,lo3.z)), + as_uint((uchar4)(lo4.z,lo5.z,hi0.z,hi1.z)), + as_uint((uchar4)(hi2.z,hi3.z,hi4.z,hi5.z)) + ); + + global uint4* pNode = (global uint4*)node; + +#if 0 + printf( + "block0 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" + "block1 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" + "block2 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" + "block3 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" , + block0.x,block0.y,block0.z,block0.w, + pNode[0].x, pNode[0].y, pNode[0].z, pNode[0].w, + block1.x,block1.y,block1.z,block1.w, + pNode[1].x, pNode[1].y, pNode[1].z, pNode[1].w, + block2.x,block2.y,block2.z,block2.w, + pNode[2].x, pNode[2].y, pNode[2].z, pNode[2].w , + block3.x,block3.y,block3.z,block3.w, + pNode[3].x, pNode[3].y, pNode[3].z, pNode[3].w ); +#endif + + store_uint4_L1WB_L3WB( pNode, 0, block0 ); + store_uint4_L1WB_L3WB( pNode, 1, block1 ); + store_uint4_L1WB_L3WB( pNode, 2, block2 ); + store_uint4_L1WB_L3WB( pNode, 3, block3 ); + } + +} + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +void kernel +traverse_aabbs_quad( + global struct BVHBase* bvh, + global struct RefitScratch* scratch, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc + ) +{ + + uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh); + varying ushort lane = get_sub_group_local_id(); + + uniform uint num_leaves = bvh->fatLeafCount; + + local struct RefitScratch local_scratch[256]; + if( get_local_id(0) < min(num_nodes,256u) ) + { + for( uint i=0; i<3; i++ ){ + local_scratch[get_local_id(0)].lower[i] = FLT_MAX; + local_scratch[get_local_id(0)].upper[i] = FLT_MAX; + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + + 
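+    // Lane layout for the fat-leaf refit below: each sub-group is split into
+    // logical SIMD8 slices (two per SIMD16 sub-group).  One slice handles one
+    // fat leaf; its first numChildren logical lanes (at most 6 quad children)
+    // each refit one quad, and AABB_sub_group_reduce_N6 then folds the per-child
+    // boxes into the fat-leaf bounds before the bottom-up atomic_min pass.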
ushort SIMD8_PER_SG = get_sub_group_size()/8; + ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG; + ushort simd8_local_id = get_sub_group_local_id()/8; + ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id; + ushort logical_lane = lane%8; + + uniform uint fatleaf_index = simd8_id + get_group_id(0)*SIMD8_PER_WG; + + + if ( fatleaf_index < num_leaves ) + { + LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+fatleaf_index; + uint innerNodeIdx = leaf->inner_node_index; + uint bp = leaf->backpointer; + uint leaf_index = leaf->leaf_index; + + varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx; + varying QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index; + + uint childOffs = (((char*)quad) - ((char*)curNode))/64; + + varying struct AABB childrenBox; + AABB_init(&childrenBox); + + uint numChildren = (bp >> 3) & 0x7; + if (logical_lane < numChildren) + { + refit_bottom_child_quad_WB( + (global struct QuadLeaf*) &quad[logical_lane], + geomDesc, + &childrenBox ); + } + + struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == simd8_local_id; + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + childOffs, + NODE_TYPE_QUAD, + &childrenBox, + numChildren, + 0xff ); + + // atomic min operation vectorized across 6 lanes + // [ lower.xyz ][-][upper.xyz][-] + // + // Lanes 3 and 7 are inactive. 'upper' is negated + bool atomic_mask = (1<> 6); + + // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= 256 + if(atomic_mask && parent != 0x03FFFFFF) + { + while( parent >= 256 ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( ((global float*) &(scratch[innerNodeIdx]))+logical_lane, v ); + parent = bp >> 6; + } + while( parent != 0x03FFFFFF ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( ((local float*) &(local_scratch[innerNodeIdx]))+logical_lane, v ); + parent = bp >> 6; + } + } + + } + + + barrier( CLK_LOCAL_MEM_FENCE ); + num_nodes = min(num_nodes,256u); + + local float* in = (local float*)&local_scratch[0]; + global float* out = (global float*)&scratch[0]; + + for (uint i = get_local_id(0); i < num_nodes*6; i += 256 ) + { + // since we want to save [ lower.xyz ][-][upper.xyz][-] i.e 0,1,2, 4,5,6 etc. 
we need to offset +1 for every triplet + uint idx = i + (i/3); + + float v = in[idx]; + if( v != FLT_MAX ) + atomic_min( out + idx , v ); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) +void kernel +write_inner_nodes( + global struct BVHBase* bvh, + global struct RefitScratch* scratch + ) +{ + uint SIMD8_PER_SG = get_sub_group_size()/8; + uniform uint node_id = SIMD8_PER_SG * get_sub_group_global_id() + (get_sub_group_local_id()/8); + varying ushort lane = get_sub_group_local_id() % 8; + varying uint num_inners = bvh->innerCount; + + if ( node_id < num_inners ) + { + InnerNodeTableEntry* entry = BVHBase_GetInnerNodeTable(bvh) + node_id; + uint node_index = entry->node_index_and_numchildren>>3; + uint numChildren = entry->node_index_and_numchildren & 7; + uint first_child = entry->first_child; + + varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+node_index; + + varying struct AABB childAABB; + AABB_init(&childAABB); + + if( lane < numChildren ) + { + uint child = first_child + lane; + childAABB.lower.x = scratch[child].lower[0]; + childAABB.lower.y = scratch[child].lower[1]; + childAABB.lower.z = scratch[child].lower[2]; + childAABB.upper.x = -scratch[child].upper[0]; + childAABB.upper.y = -scratch[child].upper[1]; + childAABB.upper.z = -scratch[child].upper[2]; + } + + varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8); + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + first_child - node_index, + NODE_TYPE_INTERNAL, + &childAABB, + numChildren, + 0xff ); + + } + + if (node_id == 0 && lane == 0 ) + { + bvh->Meta.bounds.lower[0] = scratch[0].lower[0]; + bvh->Meta.bounds.lower[1] = scratch[0].lower[1]; + bvh->Meta.bounds.lower[2] = scratch[0].lower[2]; + bvh->Meta.bounds.upper[0] = -scratch[0].upper[0]; + bvh->Meta.bounds.upper[1] = -scratch[0].upper[1]; + bvh->Meta.bounds.upper[2] = -scratch[0].upper[2]; + } + +} + + + +#if 1 +#define SLM_BOX_COUNT 1024 + +struct AABB load_box( uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes ) +{ + if( place < SLM_BOX_COUNT ) + return local_boxes[place]; + else + return extra_boxes[place-SLM_BOX_COUNT]; +} + +void store_box( struct AABB box, uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes ) +{ + if (place < SLM_BOX_COUNT) + { + local_boxes[place] = box; + } + else + { + global uint4* ptr = (global uint4*)&extra_boxes[place-SLM_BOX_COUNT]; + store_uint4_L1WB_L3WB( ptr, 0, as_uint4(box.lower) ); + store_uint4_L1WB_L3WB( ptr+1, 0, as_uint4(box.upper) ); + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(512, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel +update_single_group_quads( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct AABB* extra_boxes +) +{ + uniform uint tid = get_sub_group_global_id(); + uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh); + uniform uint num_leaves = bvh->fatLeafCount; + uniform uint num_inners = bvh->innerCount; + 
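+    // Single-work-group refit: boxes for the first SLM_BOX_COUNT (1024) nodes
+    // live in SLM, the rest spill to the global 'extra_boxes' buffer via
+    // load_box/store_box.  Upper bounds are kept negated so the bottom-up pass
+    // can use one vectorized atomic_min for both lower and upper bounds.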
+ varying ushort lane = get_sub_group_local_id(); + + local struct AABB local_boxes[SLM_BOX_COUNT]; // == 32KB + + // initialize nodes + for (uint i = get_local_id( 0 ); i < num_nodes; i+= get_local_size(0)) + { + struct AABB tmp; + AABB_init(&tmp); + tmp.upper = -tmp.upper; + store_box( tmp, i, local_boxes, extra_boxes ); + } + + + if( num_nodes > SLM_BOX_COUNT ) + mem_fence_workgroup_default(); + + barrier( CLK_LOCAL_MEM_FENCE ); + + + ushort SIMD8_PER_SG = get_sub_group_size()/8; + ushort NUM_SIMD8 = get_num_sub_groups()*SIMD8_PER_SG; + ushort simd8_local_id = get_sub_group_local_id()/8; + ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id; + ushort logical_lane = lane%8; + + + for ( uint i = simd8_id; i < num_leaves; i+= NUM_SIMD8 ) + { + LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+i; + uint innerNodeIdx = leaf->inner_node_index; + uint bp = leaf->backpointer; + uint leaf_index = leaf->leaf_index; + + varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx; + QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index; + + uint childOffs = (((char*)quad) - ((char*)curNode))/64; + + varying struct AABB childrenBox; + AABB_init(&childrenBox); + + uint numChildren = (bp >> 3) & 0x7; + if (logical_lane < numChildren) + { + + refit_bottom_child_quad_WB( + (global struct QuadLeaf*) &quad[logical_lane], + geomDesc, + &childrenBox ); + } + + struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == simd8_local_id; + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + + if( logical_lane == 0 ) + { + struct AABB negated = reduce_bounds; + negated.upper = -negated.upper; + store_box( negated, innerNodeIdx, local_boxes, extra_boxes ); + } + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + childOffs, + NODE_TYPE_QUAD, + &childrenBox, + numChildren, + 0xff ); + + + // atomic min operation vectorized across 6 lanes + // [ lower.xyz ][-][upper.xyz][-] + // + // Lanes 3 and 7 are inactive. 'upper' is negated + uint lmod = logical_lane % 4; + uint ldiv = logical_lane / 4; + float vlo = reduce_bounds.lower.x; + float vhi = reduce_bounds.upper.x; + vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo; + vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi; + vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo; + vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi; + float v = (ldiv == 0) ? 
vlo : -vhi; + bool atomic_mask = (1<> 6); + + // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= SLM_BOX_COUNT + if(atomic_mask && parent != 0x03FFFFFF) + { + while( parent >= SLM_BOX_COUNT ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( ((global float*) &(extra_boxes[innerNodeIdx-SLM_BOX_COUNT]))+logical_lane, v ); + parent = bp >> 6; + } + while( parent != 0x03FFFFFF ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( ((local float*) &(local_boxes[innerNodeIdx]))+logical_lane, v ); + parent = bp >> 6; + } + } + + } + + if( num_nodes > SLM_BOX_COUNT ) + mem_fence_workgroup_default(); + + barrier( CLK_LOCAL_MEM_FENCE ); + + for ( uint i = simd8_id; i < num_inners; i+= NUM_SIMD8 ) + { + InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh) + i; + uint node_index = inner->node_index_and_numchildren>>3; + uint numChildren = inner->node_index_and_numchildren & 7; + uint first_child = inner->first_child; + + varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+ node_index; + + //if (curNode->nodeType == BVH_INTERNAL_NODE) // TODO: Needs updating for traversal shaders + { // TODO: Consider using an inner node table or UC load to avoid polluting LSC with these reads + uint child = first_child + logical_lane; + + bool child_valid = (logical_lane < numChildren); + + struct AABB childAABB; + AABB_init(&childAABB); + if (child_valid) + { + childAABB = load_box( child, local_boxes, extra_boxes ); + childAABB.upper = -childAABB.upper; + } + + varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8); + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + first_child - node_index, + NODE_TYPE_INTERNAL, + &childAABB, + numChildren, + 0xff ); + } + } + + + if (get_sub_group_id() == 0 && lane == 0 ) + { + bvh->Meta.bounds.lower[0] = local_boxes[0].lower.x; + bvh->Meta.bounds.lower[1] = local_boxes[0].lower.y; + bvh->Meta.bounds.lower[2] = local_boxes[0].lower.z; + bvh->Meta.bounds.upper[0] = -local_boxes[0].upper.x; + bvh->Meta.bounds.upper[1] = -local_boxes[0].upper.y; + bvh->Meta.bounds.upper[2] = -local_boxes[0].upper.z; + } + +} +#endif + +GRL_INLINE void traverse_aabbs_new_update_func( + global struct BVHBase* bvh, + global char* vertices, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct RefitScratch* scratch, + uint vertex_format, + local struct AABB3f* children_AABBs, + local uint* num_fat_leaves, + local struct LeafTableEntry* leafTable_local, + const bool single_geo + ) +{ + // The first part of the kernel with vertices loads/stores is executed with quad per work item, + // using previously prepared QuadDataIndices to get the quad data and vert indices + // Second part of the kernel that does the reduction, update fatleaf ain bvh and bottom up is + // executed per simd. 
+ // For bottom up tested also with local part (using local scratch) but since there is not enough SLM additional + // barriers were needed to clean and reuse SLM, which curretnly kills performance. Could be worth to revisit + // on future gens. + + varying uint lid = get_local_id(0); + varying uint tid = lid + get_group_id(0)*get_local_size(0); + + num_fat_leaves[0] = 0; + leafTable_local[lid].leaf_index = -1 << 3; + + LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * bvh->fatLeafTableStart + 12 * tid)); + uint innerNodeIdx_mem = leaf->inner_node_index; + uint bp = leaf->backpointer; + uint leaf_index_mem = leaf->leaf_index; + + uint numChildren = (bp >> 3) & 0x7; + + uint leaf_index = leaf_index_mem >> 3; + uint slm_child_offset = leaf_index_mem & 0x7; + + uint innerNodeIdx = innerNodeIdx_mem >> 8; + uint slm_pos_main = innerNodeIdx_mem & 0xFF; + + uint first_el_of_group = get_group_id(0)*get_local_size(0); + uint quadsNum = BVHBase_GetNumQuads(bvh); + uint expected_tid = first_el_of_group < quadsNum ? first_el_of_group : quadsNum - 1; + + // Skip writes when not all children for single fatleaf are present in this work group + bool skip_tid = leaf_index == 0x1FFFFFFF; + leaf_index = skip_tid ? expected_tid : leaf_index; + + // Compute bounding box for quads + varying struct AABB3f childrenBox; + + tid = leaf_index + slm_child_offset; + + // Read vertex indices and quad header from separate buffer + uint quadIndicesStart = bvh->quadIndicesDataStart; + varying struct QuadDataIndices* vertex_indice_ptr = (QuadDataIndices*)(((char*)bvh) + (64u * quadIndicesStart + 32 * tid)); + QuadDataIndices vertexMap = vertex_indice_ptr[0]; + + varying global uint4* bounds = (global uint4*)((char*)bvh + (64*bvh->quadLeafStart + 64*tid) ); + uint4 quad_data = (uint4)(vertexMap.header_data[0], vertexMap.header_data[1], vertexMap.header_data[2], vertexMap.header_data[3]); + uint4 indices = (uint4)(vertexMap.vert_idx[0], vertexMap.vert_idx[1], vertexMap.vert_idx[2], vertexMap.vert_idx[3]); + + global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc; + + if(!single_geo) + { + uint geomID = vertexMap.header_data[0] & 0xFFFFFF; + desc += geomID; + vertices = (global char*)desc->Desc.Triangles.pVertexBuffer; + vertex_format = desc->Desc.Triangles.VertexFormat; + } + + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices_no_stride(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices, vertex_format, vertices); + + for(uint i = 0; i < 3; i++) + childrenBox.lower[i] = min( min( vtx0[i], vtx1[i] ), min(vtx2[i],vtx3[i]) ); + + for(uint i = 0; i < 3; i++) + childrenBox.upper[i] = max( max( vtx0[i], vtx1[i] ), max(vtx2[i],vtx3[i]) ); + + float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x ); + float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y ); + float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z ); + + // Store quad data in bvh + // Make sure this goes without partial writes to get best perf + store_uint4_L1WB_L3WB( bounds, 0, quad_data ); + store_uint4_L1WB_L3WB( bounds, 1, as_uint4(pack0) ); + store_uint4_L1WB_L3WB( bounds, 2, as_uint4(pack1) ); + store_uint4_L1WB_L3WB( bounds, 3, as_uint4(pack2) ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + struct AABB reduce_bounds; + + if(!skip_tid) + { + // Store AABB in SLM, to be used later for children quantization in fatleaf + children_AABBs[slm_pos_main + slm_child_offset] = childrenBox; + + if(slm_child_offset == 0) + { + uint offset = atomic_inc_local(&num_fat_leaves[0]); + leafTable_local[offset].inner_node_index = innerNodeIdx_mem; + 
leafTable_local[offset].backpointer = bp; + leafTable_local[offset].leaf_index = leaf_index_mem; + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + varying ushort lane = get_sub_group_local_id(); + ushort SIMD8_PER_SG = get_sub_group_size()/8; + ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG; + ushort simd8_local_id = get_sub_group_local_id()/8; + ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id; + ushort logical_lane = lane%8; + + uint fatleaves_aligned_32 = (num_fat_leaves[0] + 31) & ~31; + + for(uint offset = 0; offset < fatleaves_aligned_32; offset += 32) + { + uniform uint fatleaf_index = simd8_id + offset; + uint innerNodeIdx_mem = leafTable_local[fatleaf_index].inner_node_index; + uint bp = leafTable_local[fatleaf_index].backpointer; + uint leaf_index_mem = leafTable_local[fatleaf_index].leaf_index; + + uint numChildren = (bp >> 3) & 0x7; + + uint leaf_index = leaf_index_mem >> 3; + uint slm_child_offset = leaf_index_mem & 0x7; + + uint innerNodeIdx = innerNodeIdx_mem >> 8; + uint slm_pos_main = innerNodeIdx_mem & 0xFF; + + bool skip_tid = leaf_index == 0x1FFFFFFF; + bool active_lane = (logical_lane < numChildren); + uint lane_children = active_lane ? logical_lane : 0; + + fatleaf_index = leaf_index; + + varying InternalNode* curNode = (InternalNode*)(((char*)bvh) + (BVH_ROOT_NODE_OFFSET + 64 * innerNodeIdx)); + + global struct Quad *quads = (global struct Quad *)((char*)bvh + 64*bvh->quadLeafStart ); + + varying struct AABB childrenBox_bu; + AABB_init(&childrenBox_bu); + + if(!skip_tid) + childrenBox_bu = AABBfromAABB3f(children_AABBs[slm_pos_main + lane_children]); + + struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox_bu); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == simd8_local_id; + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + if(!skip_tid) + { + uint quad_offset = 64u * bvh->quadLeafStart + 64 * fatleaf_index; + varying QuadLeaf* quad = (QuadLeaf*)(((char*)bvh) + quad_offset); + uint childOffs = (((char*)quad) - ((char*)curNode))/64; + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + childOffs, + NODE_TYPE_QUAD, + &childrenBox_bu, + numChildren, + 0xff ); + + bool atomic_mask = (1<> 6); + + global float* parent_v = (global float*) &(scratch[parent]) + logical_lane; + + if(atomic_mask && (*parent_v >= v) && (parent != 0x03FFFFFF)) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( parent_v, v ); + parent = bp >> 6; + + if(parent != 0x03FFFFFF) + { + while( parent != 0x03FFFFFF ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + + global float* parent_v_global = (global float*) &(scratch[innerNodeIdx]) + logical_lane; + if(*parent_v_global >= v) + atomic_min( parent_v_global, v ); + else + break; + + parent = bp >> 6; + } + } + } + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +void kernel +traverse_aabbs_new_update( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct RefitScratch* scratch + ) +{ + varying uint lid = get_local_id(0); + 
varying uint tid = lid + get_group_id(0)*get_local_size(0); + + local struct AABB3f children_AABBs[256]; + local struct LeafTableEntry leafTable_local[256]; + local uint num_fat_leaves; + + traverse_aabbs_new_update_func(bvh, (global char*)geomDesc /* not used */, geomDesc, scratch, (uint)-1 /* not used */, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], false); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +void kernel +traverse_aabbs_new_update_single_geo( + global struct BVHBase* bvh, + global char* vertices, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct RefitScratch* scratch, + const uint vertex_format + ) +{ + varying uint lid = get_local_id(0); + varying uint tid = lid + get_group_id(0)*get_local_size(0); + + local struct AABB3f children_AABBs[256]; + local struct LeafTableEntry leafTable_local[256]; + local uint num_fat_leaves; + + if(vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32B32_FLOAT, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R32G32_FLOAT) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32_FLOAT, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_FLOAT, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16_FLOAT) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_FLOAT, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_SNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16_SNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_SNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R10G10B10A2_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R8G8_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, 
VERTEX_FORMAT_R8G8B8A8_SNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R8G8_SNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_SNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, (uint)-1, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); +} diff --git a/src/intel/vulkan/grl/gpu/atomic_update.grl b/src/intel/vulkan/grl/gpu/atomic_update.grl new file mode 100644 index 00000000000..9e1d6923d4a --- /dev/null +++ b/src/intel/vulkan/grl/gpu/atomic_update.grl @@ -0,0 +1,198 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module atomic_update; + +kernel_module atomic_update ("atomic_update.cl") +{ + links lsc_intrinsics; + kernel init_refit_scratch < kernelFunction = "init_refit_scratch" >; + kernel traverse_aabbs_quad < kernelFunction = "traverse_aabbs_quad" >; + kernel write_inner_nodes < kernelFunction = "write_inner_nodes" >; + kernel build_fatleaf_table < kernelFunction = "build_fatleaf_table" >; + kernel build_innernode_table < kernelFunction = "build_innernode_table" >; + + kernel update_single_group_quads < kernelFunction = "update_single_group_quads" >; + + kernel build_fatleaf_table_new_update < kernelFunction = "build_fatleaf_table_new_update" >; + kernel fixup_quad_table < kernelFunction = "fixup_quad_table" >; + kernel traverse_aabbs_new_update < kernelFunction = "traverse_aabbs_new_update" >; + kernel traverse_aabbs_new_update_single_geo < kernelFunction = "traverse_aabbs_new_update_single_geo" >; +} + +import struct MKBuilderState "structs.grl"; + +// this metakernel only initializes registers for use in a batching loop by "init_refit_scratch" +metakernel init_refit_scratch_metakernel_registers() +{ + REG0.hi = 0; + REG1 = 3; + REG2 = 63; + REG3 = 4; + REG4 = 2; + + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; +} + +metakernel init_refit_scratch( qword bvh_base, qword scratch)//, dword max_inner_nodes ) +{ + REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!! + define C_3 REG1; + define C_63 REG2; + define C_4 REG3; + define C_2 REG4; + + REG0 = REG0 - C_3; // nodedataCurr - fixed offset + REG0 = REG0 + C_63; // + 63 + REG0 = REG0 >> C_4; // >> 4 + REG0 = REG0 >> C_2; // >> 2 == >> 6 == /64 + + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect init_refit_scratch//( (max_inner_nodes+63)/64, 1, 1 ) + args(bvh_base,scratch); + +} + +metakernel build_node_tables( qword bvh_base ) +{ + REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!! + REG1 = 2; + REG2 = 63; + REG3 = 4; + REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!! + + REG0 = REG0 - REG4; // nodedataCurr - fixed offset + REG0 = REG0 + REG2; // + 63 + REG0 = REG0 >> REG3; // >> 4 + REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64 + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect build_fatleaf_table//( (max_inner_nodes+63)/64, 1, 1 ) + args(bvh_base); + dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 ) + args(bvh_base); +} + +metakernel build_node_tables_new_update( MKBuilderState state, qword bvh_base ) +{ + REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!! + REG1 = 2; + REG2 = 63; + REG3 = 4; + REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!! 
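+    // The sequence below computes ceil((nodedataCurr - fixed_offset) / 64), i.e.
+    // one 64-thread work group per 64 internal nodes.  The divide is split into
+    // >>4 then >>2 (== >>6) because the command streamer only supports
+    // power-of-two shift amounts (see the note in build_refit.grl).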
+ + REG0 = REG0 - REG4; // nodedataCurr - fixed offset + REG0 = REG0 + REG2; // + 63 + REG0 = REG0 >> REG3; // >> 4 + REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64 + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect build_fatleaf_table_new_update//( (max_inner_nodes+63)/64, 1, 1 ) + args(state.build_globals, bvh_base); + dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 ) + args(bvh_base); +} + +metakernel fixup_quad_table( qword bvh_base ) +{ + dispatch fixup_quad_table(2,1,1) + args(bvh_base); +} + +// this metakernel only initializes registers for use in a batching loop by "traverse_aabbs_quad" and "write_inner_nodes" +metakernel init_traverse_aabbs_quad_and_write_inner_nodes() +{ + REG0.hi = 0; + REG1 = 1; + REG2 = 31; + REG3 = 4; + REG4 = 2; + REG5 = 7; + REG6 = 255; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; +} + +metakernel traverse_aabbs_quad( qword bvh_base, qword scratch, qword geos)//, dword max_inner_nodes ) +{ + + REG0.lo = load_dword( bvh_base + 64 ); // TODO: DOn't hardcode! + define C_1 REG1; + define C_31 REG2; + define C_4 REG3; + + REG0 = REG0 + C_31; // + 31 + REG0 = REG0 >> C_4; // >> 4 + REG0 = REG0 >> C_1; // >> 1 == >> 5 == /32 + + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect traverse_aabbs_quad//( (max_inner_nodes+32)/32, 1, 1 ) + args(bvh_base,scratch,geos); +} + +metakernel write_inner_nodes( qword bvh_base, qword scratch )//, dword max_inner_nodes ) +{ + REG0.lo = load_dword( bvh_base + 68 ); // TODO: DOn't hardcode! + define C_1 REG1; + define C_2 REG4; + define C_7 REG5; + + REG0 = REG0 + C_7; // + 7 + REG0 = REG0 >> C_2; // >> 2 + REG0 = REG0 >> C_1; // >> 1 ==> >> 3 (/8) + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect write_inner_nodes//( (max_inner_nodes+7)/8, 1, 1 ) + args(bvh_base,scratch); +} + +metakernel update_single_group_quads( qword bvh_base, qword geos, qword aabbs ) +{ + dispatch update_single_group_quads(1,1,1) //( (max_inner_nodes+1)/2, 1, 1 ) + args(bvh_base,geos,aabbs); +} + +metakernel traverse_aabbs_new_update( qword bvh_base, qword geos, qword scratch ) +{ + REG0.lo = load_dword( bvh_base + 84 ); // TODO: DOn't hardcode! + define C_255 REG6; + define C_4 REG3; + + REG0 = REG0 + C_255; // + 255 + REG0 = REG0 >> C_4; // >> 4 + REG0 = REG0 >> C_4; // >> 4 == >> 8 == /32 + + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect traverse_aabbs_new_update//( (max_inner_nodes+255)/256, 1, 1 ) + args(bvh_base, geos, scratch); +} + +metakernel traverse_aabbs_new_update_single_geo( qword bvh_base, qword vertices, qword geos, qword scratch, dword vertex_format ) +{ + REG0.lo = load_dword( bvh_base + 84 ); // TODO: DOn't hardcode! 
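+    // Same dispatch math as traverse_aabbs_new_update above: round the count
+    // loaded from the BVH header up to a multiple of 256, then shift right by 4
+    // twice (== >>8, i.e. divide by 256) to launch one 256-wide work group per
+    // 256 work items (the kernel processes one quad per work item).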
+ define C_255 REG6; + define C_4 REG3; + + REG0 = REG0 + C_255; // + 255 + REG0 = REG0 >> C_4; // >> 4 + REG0 = REG0 >> C_4; // >> 4 == >> 8 == /32 + + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect traverse_aabbs_new_update_single_geo//( (max_inner_nodes+255)/256, 1, 1 ) + args(bvh_base, vertices, geos, scratch, vertex_format); +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/binned_sah_shared.h b/src/intel/vulkan/grl/gpu/binned_sah_shared.h new file mode 100644 index 00000000000..8b22f6612cd --- /dev/null +++ b/src/intel/vulkan/grl/gpu/binned_sah_shared.h @@ -0,0 +1,265 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file contains structure definitions shared by GRL OCL kernels and host code +// + +#include "GRLGen12.h" +#pragma once + +#define BFS_NUM_BINS 16 +#define BFS_NUM_VCONTEXTS 256 +#define BFS_MAX_DEPTH 32 + +#define TRIVIAL_BUILD_THRESHOLD 6 +#define SINGLE_WG_BUILD_THRESHOLD 256 + +#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384 + + +typedef uchar vcontext_id_t; + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(GPUBVHBuilder) + +struct BFS_Split +{ + float sah; + int dim; + int pos; +}; + + +struct BFS_BinInfo +{ + float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6] + // The 6 are lower(xyz) and -upper(xyz) + // bins use negated-max so that we can use vectorized mins instead of min/max pairs + uint counts[3 * BFS_NUM_BINS]; +}; + +enum_uint8(SAHBuildFlags) +{ + SAH_FLAG_NEED_BACKPOINTERS = 1, // identifies a mixed internal node where each child can have a different type + SAH_FLAG_NEED_MASKS = 2 +}; + +struct SAHBuildGlobals +{ + qword p_primref_index_buffers; + qword p_primrefs_buffer; + qword p_bvh2; + qword p_globals; // TODO: deprecate this + qword p_bvh_base; + gpuva_t p_qnode_root_buffer; + + dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks' + dword num_primrefs; + dword leaf_size; + dword leaf_type; + + dword root_buffer_num_produced; + dword root_buffer_num_produced_hi; + dword root_buffer_num_consumed; + dword root_buffer_num_consumed_hi; + dword root_buffer_num_to_consume; + dword root_buffer_num_to_consume_hi; +}; + +struct SAHBuildBuffersInfo +{ + gpuva_t p_globals; + gpuva_t p_primref_index_buffers; + gpuva_t p_primrefs_buffer; + gpuva_t p_bvh2; + gpuva_t p_bvh_base; + gpuva_t p_qnode_root_buffer; + dword sah_globals_flags; + dword _pad; + gpuva_t _pad2; +}; + +typedef union LRBounds +{ + struct + { + struct AABB3f left_centroid_bounds; + struct AABB3f left_geom_bounds; + struct AABB3f right_centroid_bounds; + struct AABB3f right_geom_bounds; + } boxes; + struct + { + float Array[24]; + } scalars; +} LRBounds; + + +struct VContext +{ + uint dispatch_primref_begin; // range of primrefs for this task + uint dispatch_primref_end; + uint bvh2_root; // BVH2 root node for this task + uint tree_depth; // depth of this node in the tree + uint num_left; // primref counts + uint num_right; + uint lr_mask; // lower 8b : left mask. 
upper 8b : right mask + uint batch_index; + + // pass1 global working state and output + struct BFS_Split split; + struct BFS_BinInfo global_bin_info; + + // pass2 global working state and output + LRBounds lr_bounds; +}; + + + +struct BFSDispatchRecord +{ + ushort batch_index; + ushort context_id; +}; + + +struct BFSDispatchQueue +{ + uint num_dispatches; + uint wg_count[BFS_NUM_VCONTEXTS]; + struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS]; +}; + +struct BFS1SpillStackEntry +{ + uint primref_begin; + uint primref_end; + uint bvh2_root; + ushort tree_depth; + ushort batch_index; +}; + +struct BFS1SpillStack +{ + uint size; + struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH]; +}; + +struct QNodeGlobalRootBufferEntry +{ + uint bvh2_node; + uint qnode; + uint build_idx; + uint _pad; +}; + +struct QNodeGlobalRootBuffer +{ + uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM + struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2]; +}; + +struct DFSDispatchRecord +{ + uint primref_base; + uint bvh2_base; + uint batch_index; + ushort num_primrefs; + ushort tree_depth; +}; + + +struct DFSDispatchQueue +{ + struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2]; +}; + +#define VCONTEXT_STATE_EXECUTING 0 +#define VCONTEXT_STATE_UNALLOCATED 1 + +union SchedulerUnion +{ + struct VContextScheduler + { + ///////////////////////////////////////////////////////////// + // State data used for communication with command streamer + // NOTE: This part must match definition in 'new_sah_builder.grl' + ///////////////////////////////////////////////////////////// + + dword num_bfs_wgs; + dword num_dfs_wgs; + + dword scheduler_postsync; + dword _pad1; + + dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). + dword num_single_builds; // number of single-wg builds (#primrefs < threshold) + + dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass + dword batched_build_loop_mask; // value is 0 if #builds <= #contexts. else 1 command streamer uses this as a loop condition + + ///////////////////////////////////////////////////////////// + + dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer + dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer + + dword vcontext_state[BFS_NUM_VCONTEXTS]; + + struct BFSDispatchQueue bfs_queue; + struct DFSDispatchQueue dfs_queue; + + struct VContext contexts[BFS_NUM_VCONTEXTS]; + + struct BFS1SpillStack bfs2_spill_stack; + } vContextScheduler; + + struct QnodeScheduler + { + dword num_qnode_grb_curr_entries; + dword num_qnode_grb_new_entries; + + dword scheduler_postsync; + dword _pad1; + + dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). + dword num_single_builds; // number of single-wg builds (#primrefs < threshold) + + dword batched_builds_to_process; + dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer + + ///////////////////////////////////////////////////////////// + + dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer + dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer + + struct QNodeGlobalRootBuffer qnode_global_root_buffer; + } qnodeScheduler; +}; + + +struct BVH2Node +{ + struct AABB3f box; + uint meta_u; // leaf: primref start. 
inner: offset from node to its first child + uint meta_ss; + //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes + //uchar is_inner; // 1 if inner, 0 if leaf + //uchar mask; +}; + +struct BVH2 +{ + uint num_nodes; + uint _pad[7]; // align to 32B +}; + + +GRL_NAMESPACE_END(GPUBVHBuilder) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/build_leaf.grl b/src/intel/vulkan/grl/gpu/build_leaf.grl new file mode 100644 index 00000000000..7b154d03b43 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/build_leaf.grl @@ -0,0 +1,206 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module leaf_builder; + +kernel_module leaf_kernels ("bvh_build_leaf.cl") +{ + links lsc_intrinsics; + + kernel opencl_kernel_primref_to_quads < kernelFunction="primref_to_quads" >; + kernel opencl_kernel_primref_to_procedurals < kernelFunction="primref_to_procedurals" >; + kernel opencl_kernel_create_HW_instance_nodes < kernelFunction="create_HW_instance_nodes" >; + kernel opencl_kernel_create_HW_instance_nodes_pointers < kernelFunction="create_HW_instance_nodes_pointers" >; +} + +import struct MKBuilderState "structs.grl"; +import struct MKSizeEstimate "structs.grl"; + +const Instances_GROUPSIZE = 16; + +metakernel buildLeafDXR_instances( + MKBuilderState state, + qword build_primref_index_buffers, + qword srcInstanceDescrArray, + dword stride, + dword offset, + dword numPrims) +{ + define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE; + dispatch opencl_kernel_create_HW_instance_nodes(num_groups,1,1) args( + state.build_globals, + build_primref_index_buffers, + state.build_primref_buffer, + state.bvh_buffer, + srcInstanceDescrArray, + stride, + offset); +} + +metakernel buildLeafDXR_instances_indirect( + MKBuilderState state, + qword build_primref_index_buffers, + qword srcInstanceDescrArray, + qword indirectBuildRangeInfo, + dword stride, + dword offset) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // Instances_GROUPSIZE - 1 + C_4 = 4; // log_2(Instances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_create_HW_instance_nodes args( + state.build_globals, + build_primref_index_buffers, + state.build_primref_buffer, + state.bvh_buffer, + srcInstanceDescrArray, + stride, + offset); +} + +metakernel buildLeafDXR_instances_pointers( + MKBuilderState state, + qword build_primref_index_buffers, + qword srcInstanceDescrArrayPtr, + dword stride, + dword offset, + dword numPrims) +{ + define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE; + dispatch opencl_kernel_create_HW_instance_nodes_pointers(num_groups,1,1) args( + state.build_globals, + build_primref_index_buffers, + state.build_primref_buffer, + state.bvh_buffer, + srcInstanceDescrArrayPtr, + stride, + offset); +} + +metakernel buildLeafDXR_instances_pointers_indirect( + MKBuilderState state, + qword build_primref_index_buffers, + qword srcInstanceDescrArrayPtr, + qword indirectBuildRangeInfo, + dword stride, + dword offset) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = 
load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // Instances_GROUPSIZE - 1 + C_4 = 4; // log_2(Instances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_create_HW_instance_nodes_pointers args( + state.build_globals, + build_primref_index_buffers, + state.build_primref_buffer, + state.bvh_buffer, + srcInstanceDescrArrayPtr, + stride, + offset); +} + +metakernel buildLeafDXR_procedurals( + MKBuilderState state, + qword build_primref_index_buffers, + dword stride, + dword offset, + qword p_numPrimitives) +{ + define C_1 REG0; + define REG_PRIMS_PER_WG REG1; + define REG_PRIMS_PER_WG_SHR REG2; + + C_1 = 1; + REG_PRIMS_PER_WG = 16; + REG_PRIMS_PER_WG_SHR = 4;// We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements) + + define reg_numPrimitives REG3; + define reg_num_wgs REG4; + + reg_numPrimitives = load_dword(p_numPrimitives); + reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG; + reg_num_wgs = reg_num_wgs - C_1; + reg_num_wgs = reg_num_wgs >> REG_PRIMS_PER_WG_SHR; + + DISPATCHDIM_X = reg_num_wgs; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_primref_to_procedurals args( + state.build_globals, + state.build_primref_buffer, + build_primref_index_buffers, + state.bvh_buffer, + state.geomDesc_buffer, + stride, + offset); +} + +metakernel buildLeafDXR_quads( + MKBuilderState state, + qword build_primref_index_buffers, + dword stride, + dword offset, + qword p_numPrimitives, + dword allow_update) +{ + define C_1 REG0; + define REG_PRIMS_PER_WG REG1; + define SHIFT REG2; + + C_1 = 1; + REG_PRIMS_PER_WG = 32; + SHIFT = 4;// We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements) + + define reg_numPrimitives REG3; + define reg_num_wgs REG4; + + reg_numPrimitives = load_dword(p_numPrimitives); + reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG; + reg_num_wgs = reg_num_wgs - C_1; + reg_num_wgs = reg_num_wgs >> SHIFT; + reg_num_wgs = reg_num_wgs >> C_1; + + DISPATCHDIM_X = reg_num_wgs; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_primref_to_quads args( + state.build_globals, + state.build_primref_buffer, + build_primref_index_buffers, + state.bvh_buffer, + state.geomDesc_buffer, + stride, + offset, + allow_update); +} diff --git a/src/intel/vulkan/grl/gpu/build_primref.grl b/src/intel/vulkan/grl/gpu/build_primref.grl new file mode 100644 index 00000000000..33728bd01f6 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/build_primref.grl @@ -0,0 +1,229 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module build_primref; + +kernel_module primref_kernels ("bvh_build_primref.cl") +{ + links lsc_intrinsics; + + kernel opencl_kernel_primrefs_from_DXR_instances < kernelFunction="primrefs_from_DXR_instances" >; + kernel opencl_kernel_primrefs_from_DXR_instances_indirect < kernelFunction="primrefs_from_DXR_instances_indirect" >; + kernel opencl_kernel_primrefs_from_DXR_instances_pointers < kernelFunction="primrefs_from_DXR_instances_pointers" >; + kernel opencl_kernel_primrefs_from_DXR_instances_pointers_indirect < kernelFunction="primrefs_from_DXR_instances_pointers_indirect" >; + + kernel opencl_kernel_triangles_to_primrefs < kernelFunction="triangles_to_primrefs" >; + kernel opencl_kernel_triangles_to_primrefs_indirect < 
kernelFunction="triangles_to_primrefs_indirect" >; + kernel opencl_kernel_procedurals_to_primrefs < kernelFunction="procedurals_to_primrefs" >; + kernel opencl_kernel_procedurals_to_primrefs_indirect < kernelFunction="procedurals_to_primrefs_indirect" >; +} + +import struct MKBuilderState "structs.grl"; +import struct MKSizeEstimate "structs.grl"; + + +const PrimirefsFromInstances_GROUPSIZE = 16; + +metakernel buildPrimirefsFromInstances( + qword instanceDescBuff, + MKSizeEstimate estimate, + MKBuilderState build_state, + dword allowUpdate) +{ + define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE); + dispatch opencl_kernel_primrefs_from_DXR_instances(num_groups,1,1) args( + build_state.build_globals, + build_state.bvh_buffer, + instanceDescBuff, + estimate.numPrimitives, + build_state.build_primref_buffer, + allowUpdate); +} + +metakernel buildPrimirefsFromInstancesIndirect( + qword instanceDescBuff, + qword indirectBuildRangeInfo, + MKBuilderState build_state, + dword allowUpdate) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 + C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + instanceDescBuff, + indirectBuildRangeInfo, + build_state.build_primref_buffer, + allowUpdate); +} + +metakernel buildPrimirefsFromInstancesArrOfPtrs( + qword instanceDescPtrArrayBuff, + MKSizeEstimate estimate, + MKBuilderState build_state, + dword allowUpdate) +{ + define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE); + dispatch opencl_kernel_primrefs_from_DXR_instances_pointers(num_groups,1,1) args( + build_state.build_globals, + build_state.bvh_buffer, + instanceDescPtrArrayBuff, + estimate.numPrimitives, + build_state.build_primref_buffer, + allowUpdate); +} + +metakernel buildPrimirefsFromInstancesArrOfPtrsIndirect( + qword instanceDescPtrArrayBuff, + qword indirectBuildRangeInfo, + MKSizeEstimate estimate, + MKBuilderState build_state, + dword allowUpdate) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 + C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_pointers_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + instanceDescPtrArrayBuff, + build_state.build_primref_buffer, + indirectBuildRangeInfo, + allowUpdate); +} + + + + +metakernel primrefs_from_tris( + MKBuilderState build_state, + MKSizeEstimate estimate, + qword geo_ptr, + dword geom_id, + dword geom_flags, + dword num_prims) +{ + define num_threads ((num_prims+15)/16); + dispatch opencl_kernel_triangles_to_primrefs(num_threads,1,1) args( 
+ build_state.build_globals, + build_state.bvh_buffer, + build_state.build_primref_buffer, + geo_ptr, + (geom_id & 0x00ffffff) + (geom_flags<<24), + num_prims); +} + +metakernel primrefs_from_tris_indirect( + MKBuilderState build_state, + MKSizeEstimate estimate, + qword geo_ptr, + qword indirectBuildRangeInfo, + dword geom_id, + dword geom_flags) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 + C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_triangles_to_primrefs_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.build_primref_buffer, + geo_ptr, + indirectBuildRangeInfo, + (geom_id & 0x00ffffff) + (geom_flags << 24)); +} + +metakernel primrefs_from_proc( + MKBuilderState build_state, + MKSizeEstimate estimate, + qword geo_ptr, + dword geom_id, + dword geom_flags, + dword num_prims) +{ + define num_threads ((num_prims+15)/16); + dispatch opencl_kernel_procedurals_to_primrefs(num_threads,1,1) args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.build_primref_buffer, + geo_ptr, + (geom_id & 0x00ffffff) + (geom_flags<<24), + num_prims); +} + +metakernel primrefs_from_proc_indirect( + MKBuilderState build_state, + MKSizeEstimate estimate, + qword geo_ptr, + qword indirectBuildRangeInfo, + dword geom_id, + dword geom_flags) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 + C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_procedurals_to_primrefs_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.build_primref_buffer, + geo_ptr, + indirectBuildRangeInfo, + (geom_id & 0x00ffffff) + (geom_flags<<24)); +} diff --git a/src/intel/vulkan/grl/gpu/build_refit.grl b/src/intel/vulkan/grl/gpu/build_refit.grl new file mode 100644 index 00000000000..46d6e76add2 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/build_refit.grl @@ -0,0 +1,324 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module build_refit; + +kernel_module morton_kernels ("bvh_build_refit.cl") +{ + links lsc_intrinsics; + + kernel update_instance_leaves < kernelFunction="update_instance_leaves" >; + kernel refit_indirect_sg < kernelFunction="Refit_indirect_sg" >; + kernel update_instance_leaves_indirect < kernelFunction="update_instance_leaves_indirect" >; + + +} + +const INSTANCE_LEAF_GROUP_SIZE = 16; +const REFIT_GROUP_SIZE = 8; + +metakernel update_instance_leaves( + qword bvh, + qword dxrInstancesArray, + qword dxrInstancesPtrArray, + qword instance_leaf_aabbs, + dword num_instances ) +{ + define num_groups (num_instances + INSTANCE_LEAF_GROUP_SIZE - 1) / INSTANCE_LEAF_GROUP_SIZE; + + dispatch update_instance_leaves(num_groups, 1, 1) args( + bvh, + 
dxrInstancesArray, + dxrInstancesPtrArray, + instance_leaf_aabbs); +} + +metakernel update_instance_leaves_indirect( + qword bvh, + qword dxrInstancesArray, + qword dxrInstancesPtrArray, + qword instance_leaf_aabbs, + qword indirectBuildRangeInfo) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // INSTANCE_LEAF_GROUP_SIZE - 1 + C_4 = 4; // log_2(INSTANCE_LEAF_GROUP_SIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / INSTANCE_LEAF_GROUP_SIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect update_instance_leaves_indirect args( + bvh, + dxrInstancesArray, + dxrInstancesPtrArray, + instance_leaf_aabbs, + indirectBuildRangeInfo); +} + +/* +metakernel refit( + qword bvh, + qword geomDesc, + qword instance_aabbs, + dword dispatchSize ) +{ + define num_groups (dispatchSize + REFIT_GROUP_SIZE - 1) / REFIT_GROUP_SIZE; + + dispatch refit(num_groups, 1, 1) args( + bvh, + geomDesc, + instance_aabbs); +} + +const REFIT_SIMD_SIZE = 8; +const REFIT_SIMD_SIZE_SHIFT = 3; + +metakernel refit_indirect( + qword bvh, + qword bvh_inner_nodes_start_value, + qword bvh_inner_nodes_end, + qword geomDesc, + qword instance_aabbs ) +{ + define cRoundingSIMD REG4; + define TWO REG3; + define ONE REG5; + cRoundingSIMD = (REFIT_SIMD_SIZE - 1); + + TWO = 2; + ONE = 1; + + REG0 = bvh_inner_nodes_start_value; + REG1 = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + cRoundingSIMD; + REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer + REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect refit_indirect args( + bvh, + geomDesc, + instance_aabbs); + +} +*/ + +metakernel refit_indirect_sg( + qword bvh, + qword bvh_inner_nodes_start_value, + qword bvh_inner_nodes_end, + qword geomDesc, + qword instance_aabbs ) +{ + + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect refit_indirect_sg args( + bvh, + geomDesc, + instance_aabbs); + +} +/* +//////////////////////////////////////////////////////////////// +// constructing treelets +// phase 1: mark nodes that will be roots of bottom treelets +// also for each node leave a number of startpoints that are under it and max depth of the path from the node +metakernel find_refit_treelets( + qword bvh, + qword treelet_node_data, + qword scratch_startpoints, + qword startpointAlloc, + qword bvh_inner_nodes_start_value, + qword bvh_inner_nodes_end ) +{ + define cRoundingSIMD REG4; + define TWO REG3; + define ONE REG5; + cRoundingSIMD = (REFIT_SIMD_SIZE - 1); + + TWO = 2; + ONE = 1; + + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + cRoundingSIMD; + REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer + REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. 
+ + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect find_refit_treelets args( + bvh, + treelet_node_data, + scratch_startpoints, + startpointAlloc); +} + + +//////////////////////////////////////////////////////////////// +// constructing treelets +// phase 2 totally parallel, run threads up to assign startpoints to given treelet +// +metakernel assign_refit_startpoints_to_treelets( + qword bvh, + qword treelet_node_data, + qword scratch_startpoints, + qword bvh_inner_nodes_start_value, + qword bvh_inner_nodes_end ) +{ + define cRoundingSIMD REG4; + define TWO REG3; + define ONE REG5; + cRoundingSIMD = (REFIT_SIMD_SIZE - 1); + + TWO = 2; + ONE = 1; + + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + cRoundingSIMD; + REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer + REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect assign_refit_startpoints_to_treelets args( + bvh, + treelet_node_data, + scratch_startpoints); +} + + +//////////////////////////////////////////////////////////////// +// constructing treelets +// phase 3 local work: group per treelet, sort the startpoints in treelets ?// by length of the path +metakernel finalize_treelets_in_groups( + qword bvh, + qword scratch_startpoints, + qword ptrNumTreelets ) +{ + REG0 = load_qword(ptrNumTreelets); + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect finalize_treelets_in_groups args( + bvh, + scratch_startpoints); +} + + +//////////////////////////////////////////////////////////////// +// Updating treelets +// phase 1 update vertex and generate boxes for vertices +// + +const PER_GROUP_ELEMENTS_ROUNDING = 15; +const PER_GROUP_ELEMENTS_SHIFT = 4; + +metakernel init_treelets_refit(qword pSquashGroupsCountToReset) +{ + REG1 = 0; + store_qword(pSquashGroupsCountToReset, REG1); + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + //REG4 = PER_GROUP_ELEMENTS_SHIFT; + //REG5.hi = PER_GROUP_ELEMENTS_ROUNDING; + //REG5.lo = 0; +} + +metakernel update_quads( + qword scratch_box, + qword bvh, + qword input, + dword numPrimsDividedBy32, + qword bigSquashInput) +{ + //REG0 = load_qword(quads_nodes_begin_end_pair); + //REG1.hi = REG0.lo; // this holds inner nodes begin + //REG2 = REG0 - REG1; + //REG2 = REG2 + REG5; + //REG2 = REG2 >> REG4; + //DISPATCHDIM_X = REG2.hi; + + dispatch refit_quads(numPrimsDividedBy32, 1, 1) args( + bvh, + input, + scratch_box, + numPrimsDividedBy32, + bigSquashInput ); +} + +// +//////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////// +// +// phase 1 or 2 - update primitives as well as bottom up refit internal nodes +// in single dispatch (in single group per tree) +metakernel refit_tree_by_group_including_quads( + qword squashed_inputs, + dword numBuilds +) +{ + dispatch refit_tree_per_group(numBuilds, 1, 1) args( + squashed_inputs); +} +// +//////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////// +// +// phase 2 bottom up refit internal nodes +// +metakernel refit_treelet_per_group( + qword bigSquashInput, + qword ptrNumTreelets) +{ + DISPATCHDIM_X = load_dword(ptrNumTreelets); + + dispatch_indirect refit_treelet_per_group args( + 
bigSquashInput); +} +// +//////////////////////////////////////////////////////////////// + +#endif +*/ diff --git a/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl new file mode 100644 index 00000000000..d72f192056e --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl @@ -0,0 +1,4823 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "binned_sah_shared.h" + +#include "libs/lsc_intrinsics.h" +#include "intrinsics.h" +#include "AABB.h" +#include "AABB3f.h" + +#include "qbvh6.h" +#include "common.h" + +#include "libs/lsc_intrinsics.h" + +#define SGPRINT_16x(prefix,fmt,type,val) {\ + type v0 = sub_group_broadcast( val, 0 );\ + type v1 = sub_group_broadcast( val, 1 );\ + type v2 = sub_group_broadcast( val, 2 );\ + type v3 = sub_group_broadcast( val, 3 );\ + type v4 = sub_group_broadcast( val, 4 );\ + type v5 = sub_group_broadcast( val, 5 );\ + type v6 = sub_group_broadcast( val, 6 );\ + type v7 = sub_group_broadcast( val, 7 );\ + type v8 = sub_group_broadcast( val, 8 );\ + type v9 = sub_group_broadcast( val, 9 );\ + type v10 = sub_group_broadcast( val, 10 );\ + type v11 = sub_group_broadcast( val, 11 );\ + type v12 = sub_group_broadcast( val, 12 );\ + type v13 = sub_group_broadcast( val, 13 );\ + type v14 = sub_group_broadcast( val, 14 );\ + type v15 = sub_group_broadcast( val, 15 );\ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ + if( get_sub_group_local_id() == 0 ) { \ + printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \ + fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \ + v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}} + + +#define SGPRINT_6x(prefix,fmt,type,val) {\ + type v0 = sub_group_broadcast( val, 0 );\ + type v1 = sub_group_broadcast( val, 1 );\ + type v2 = sub_group_broadcast( val, 2 );\ + type v3 = sub_group_broadcast( val, 3 );\ + type v4 = sub_group_broadcast( val, 4 );\ + type v5 = sub_group_broadcast( val, 5 );\ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ + if( get_sub_group_local_id() == 0 ) { \ + printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \ + v0,v1,v2,v3,v4,v5);}} + +#define BFS_WG_SIZE 512 + +#define BFS_NUM_VCONTEXTS 256 // must be multiple of 64 + +#define TREE_ARITY 6 + +#define DFS_WG_SIZE 256 +#define DFS_THRESHOLD 256 + + +void BFSDispatchQueue_print(struct BFSDispatchQueue* q, uint n) +{ + for (uint i = 0; i < q->num_dispatches; i++) + printf(" %u,ctx=%u,batch=%u\n", q->wg_count[i], q->records[i].context_id, q->records[i].batch_index); +} + +void VContextScheduler_print(struct VContextScheduler* scheduler) +{ + if (get_local_id(0) == 0) + { + printf("SCHEDULER:\n"); + printf(" bfs=%u dfs=%u\n", scheduler->num_bfs_wgs, scheduler->num_dfs_wgs); + + printf("BFS QUEUE:\n"); + BFSDispatchQueue_print(&scheduler->bfs_queue, scheduler->num_bfs_wgs); + + + printf("DFS QUEUE\n"); + for (uint i = 0; i < scheduler->num_dfs_wgs; i++) + { + struct DFSDispatchRecord* r = &scheduler->dfs_queue.records[i]; + printf(" (%u-%u) root=%u depth=%u batch_index=%u\n", + r->primref_base, r->primref_base + r->num_primrefs, + r->bvh2_base, r->tree_depth, r->batch_index); + } + + printf("CONTEXTS:\n"); + for (uint i = 0; i < BFS_NUM_VCONTEXTS; i++) + { + if (scheduler->vcontext_state[i] != VCONTEXT_STATE_UNALLOCATED) + { + printf(" context: %u state=%u\n", i, scheduler->vcontext_state[i]); + printf(" prims: %u-%u\n", scheduler->contexts[i].dispatch_primref_begin, scheduler->contexts[i].dispatch_primref_end); + printf(" depth: %u\n", scheduler->contexts[i].tree_depth); + printf(" root: %u\n", 
scheduler->contexts[i].bvh2_root); + printf(" batch: %u\n", scheduler->contexts[i].batch_index); + } + } + + + + } + +} + + +inline float3 select_min(float3 v, bool mask) +{ + return (float3)(mask ? v.x : (float)(INFINITY), + mask ? v.y : (float)(INFINITY), + mask ? v.z : (float)(INFINITY)); +} +inline float3 select_max(float3 v, bool mask) +{ + return (float3)(mask ? v.x : -(float)(INFINITY), + mask ? v.y : -(float)(INFINITY), + mask ? v.z : -(float)(INFINITY)); +} + +/////////////////////////////////////////////////////////////////////////// + +// The 'LRBounds' structure uses negated-max to allow +// both atomic_min and atomic_max to be issued fused into one message + +struct AABB3f LRBounds_get_left_centroid( LRBounds* b ) +{ + struct AABB3f* pbox = &b->boxes.left_centroid_bounds; + return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); +} +struct AABB3f LRBounds_get_right_centroid( LRBounds* b ) +{ + struct AABB3f* pbox = &b->boxes.right_centroid_bounds; + return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); +} +struct AABB3f LRBounds_get_left_geom( LRBounds* b ) +{ + struct AABB3f* pbox = &b->boxes.left_geom_bounds; + return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); +} +struct AABB3f LRBounds_get_right_geom( LRBounds* b ) +{ + struct AABB3f* pbox = &b->boxes.right_geom_bounds; + return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); +} + + +void LRBounds_merge_left( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax ) +{ + // All of the input vectors have come from sub-group reductions and are thus uniform + // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs + // The code below should result in 1 atomic_min message and a simularly large stack of movs + + float mergeVal0 = INFINITY; + float mergeVal1 = INFINITY; + uint i = get_sub_group_local_id(); + + // insert the various merge values into one register + // We use two parallel variables here to enable some ILP + + uint imod = (i>=6) ? (i-6) : i; + mergeVal0 = (imod==0) ? CMin.x : mergeVal0; + mergeVal1 = (imod==0) ? GMin.x : mergeVal1; + + mergeVal0 = (imod==1) ? CMin.y : mergeVal0; + mergeVal1 = (imod==1) ? GMin.y : mergeVal1; + + mergeVal0 = (imod==2) ? CMin.z : mergeVal0; + mergeVal1 = (imod==2) ? GMin.z : mergeVal1; + + mergeVal0 = (imod==3) ? -CMax.x : mergeVal0; + mergeVal1 = (imod==3) ? -GMax.x : mergeVal1; + + mergeVal0 = (imod==4) ? -CMax.y : mergeVal0; + mergeVal1 = (imod==4) ? -GMax.y : mergeVal1; + + mergeVal0 = (imod==5) ? -CMax.z : mergeVal0; + mergeVal1 = (imod==5) ? -GMax.z : mergeVal1; + + float merge = (i<6) ? 
mergeVal0 : mergeVal1; + if( i < 12 ) + atomic_min( &b->scalars.Array[i], merge ); + + //atomic_min( &b->boxes.left_centroid_bounds.lower[0], CMin.x ); + //atomic_min( &b->boxes.left_centroid_bounds.lower[1], CMin.y ); + //atomic_min( &b->boxes.left_centroid_bounds.lower[2], CMin.z ); + //atomic_min( &b->boxes.left_centroid_bounds.upper[0], -CMax.x ); + //atomic_min( &b->boxes.left_centroid_bounds.upper[1], -CMax.y ); + //atomic_min( &b->boxes.left_centroid_bounds.upper[2], -CMax.z ); + //atomic_min( &b->boxes.left_geom_bounds.lower[0], GMin.x ); + //atomic_min( &b->boxes.left_geom_bounds.lower[1], GMin.y ); + //atomic_min( &b->boxes.left_geom_bounds.lower[2], GMin.z ); + //atomic_min( &b->boxes.left_geom_bounds.upper[0], -GMax.x ); + //atomic_min( &b->boxes.left_geom_bounds.upper[1], -GMax.y ); + //atomic_min( &b->boxes.left_geom_bounds.upper[2], -GMax.z ); +} + +void LRBounds_merge_right( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax ) +{ + // All of the input vectors have come from sub-group reductions and are thus uniform + // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs + // The code below should result in 1 atomic_min message and a simularly large stack of movs + + float mergeVal0 = INFINITY; + float mergeVal1 = INFINITY; + uint i = get_sub_group_local_id(); + + // insert the various merge values into one register + // We use two parallel variables here to enable some ILP + + uint imod = (i>=6) ? (i-6) : i; + mergeVal0 = (imod==0) ? CMin.x : mergeVal0; + mergeVal1 = (imod==0) ? GMin.x : mergeVal1; + + mergeVal0 = (imod==1) ? CMin.y : mergeVal0; + mergeVal1 = (imod==1) ? GMin.y : mergeVal1; + + mergeVal0 = (imod==2) ? CMin.z : mergeVal0; + mergeVal1 = (imod==2) ? GMin.z : mergeVal1; + + mergeVal0 = (imod==3) ? -CMax.x : mergeVal0; + mergeVal1 = (imod==3) ? -GMax.x : mergeVal1; + + mergeVal0 = (imod==4) ? -CMax.y : mergeVal0; + mergeVal1 = (imod==4) ? -GMax.y : mergeVal1; + + mergeVal0 = (imod==5) ? -CMax.z : mergeVal0; + mergeVal1 = (imod==5) ? -GMax.z : mergeVal1; + + float merge = (i<6) ? 
mergeVal0 : mergeVal1; + if( i < 12 ) + atomic_min( &b->scalars.Array[i+12], merge ); + + //atomic_min( &b->boxes.right_centroid_bounds.lower[0], CMin.x ); + //atomic_min( &b->boxes.right_centroid_bounds.lower[1], CMin.y ); + //atomic_min( &b->boxes.right_centroid_bounds.lower[2], CMin.z ); + //atomic_min( &b->boxes.right_centroid_bounds.upper[0], -CMax.x ); + //atomic_min( &b->boxes.right_centroid_bounds.upper[1], -CMax.y ); + //atomic_min( &b->boxes.right_centroid_bounds.upper[2], -CMax.z ); + //atomic_min( &b->boxes.right_geom_bounds.lower[0], GMin.x ); + //atomic_min( &b->boxes.right_geom_bounds.lower[1], GMin.y ); + //atomic_min( &b->boxes.right_geom_bounds.lower[2], GMin.z ); + //atomic_min( &b->boxes.right_geom_bounds.upper[0], -GMax.x ); + //atomic_min( &b->boxes.right_geom_bounds.upper[1], -GMax.y ); + //atomic_min( &b->boxes.right_geom_bounds.upper[2], -GMax.z ); +} + +void LRBounds_merge( global LRBounds* globalBounds, local LRBounds* localBounds ) +{ + uint i = get_local_id(0); + if( i < 24 ) + atomic_min(&globalBounds->scalars.Array[i], localBounds->scalars.Array[i] ); +} + + +void LRBounds_init( LRBounds* bounds ) +{ + uint i = get_local_id(0) * 4; + if( i < 24 ) + { + // compiler should merge it into a 4xdword send + bounds->scalars.Array[i+0] = INFINITY; + bounds->scalars.Array[i+1] = INFINITY; + bounds->scalars.Array[i+2] = INFINITY; + bounds->scalars.Array[i+3] = INFINITY; + } + +} + + +inline void LRBounds_init_subgroup( LRBounds* bounds) +{ + uint sg_size = get_sub_group_size(); + uint lane = get_sub_group_local_id(); + + for (uint i = lane * 4; i < 24; i += sg_size * 4) + { + // compiler should merge it into a 4xdword send + bounds->scalars.Array[i+0] = INFINITY; + bounds->scalars.Array[i+1] = INFINITY; + bounds->scalars.Array[i+2] = INFINITY; + bounds->scalars.Array[i+3] = INFINITY; + } + +} + +/////////////////////////////////////////////////////////////////////////// + +inline void BinInfo_init(struct BFS_BinInfo* bin_info) +{ + for (uint id = get_local_id(0) * 4; id < 18 * BFS_NUM_BINS; id += get_local_size(0) * 4) + { + float inf = INFINITY; + // compiler should merge it into a 4xdword send + bin_info->min_max[id+0] = inf; + bin_info->min_max[id+1] = inf; + bin_info->min_max[id+2] = inf; + bin_info->min_max[id+3] = inf; + } + for (uint id = get_local_id(0) * 4; id < 3 * BFS_NUM_BINS; id += get_local_size(0) * 4) + { + // compiler should merge it into a 4xdword send + bin_info->counts[id+0] = 0; + bin_info->counts[id+1] = 0; + bin_info->counts[id+2] = 0; + bin_info->counts[id+3] = 0; + } +} + + +// copy global to local +inline void BinInfo_copy( local struct BFS_BinInfo* local_bin_info, global struct BFS_BinInfo* global_bin_info ) +{ + for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0)) + { + float inf = INFINITY ; + float f = global_bin_info->min_max[id]; + local_bin_info->min_max[id] = f; + } + for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0)) + { + local_bin_info->counts[id] = global_bin_info->counts[id]; + } +} + +inline void BinInfo_init_subgroup(struct BFS_BinInfo* bin_info) +{ + uint sg_size = get_sub_group_size(); + uint lane = get_sub_group_local_id(); + + for (uint i = lane * 4; i < 3 * BFS_NUM_BINS; i += sg_size * 4) + { + // compiler should merge it into a 4xdword send + bin_info->counts[i+0] = 0; + bin_info->counts[i+1] = 0; + bin_info->counts[i+2] = 0; + bin_info->counts[i+3] = 0; + } + + + for (uint i = lane * 4; i < 18 * BFS_NUM_BINS; i += sg_size * 4) + { + // compiler should merge it into 
a 4xdword send + bin_info->min_max[i+0] = INFINITY; + bin_info->min_max[i+1] = INFINITY; + bin_info->min_max[i+2] = INFINITY; + bin_info->min_max[i+3] = INFINITY; + } + +} + +float3 shuffle_down_float3( float3 a, float3 b, uint delta ) +{ + return (float3)( + intel_sub_group_shuffle_down( a.x, b.x, delta ), + intel_sub_group_shuffle_down( a.y, b.y, delta ), + intel_sub_group_shuffle_down( a.z, b.z, delta ) + ); +} + + + + +void BinInfo_primref_ballot_loop( local struct BFS_BinInfo* bin_info, uint axis, uint bin, float3 lower, float3 upper, bool active_lane ) +{ + local float* bins_min = &bin_info->min_max[0]; + local float* bins_max = &bin_info->min_max[3]; + + varying uint place = (bin + axis*BFS_NUM_BINS); + varying uint lane = get_sub_group_local_id(); + + uniform uint active_mask = intel_sub_group_ballot(active_lane); + + while( active_mask ) + { + uniform uint leader = ctz( active_mask ); + uniform uint lead_place = intel_sub_group_shuffle( place, leader ); + varying bool matching_bin = lead_place == place && active_lane; + + varying float3 lo = (float3)(INFINITY,INFINITY,INFINITY); + varying float3 hi = (float3)(-INFINITY,-INFINITY,-INFINITY); + if (matching_bin) + { + lo = lower.xyz; + hi = upper.xyz; + } + + lo = sub_group_reduce_min_float3( lo ); + hi = sub_group_reduce_max_float3( hi ); + + { + // atomic min operation vectorized across 6 lanes + // [ lower.xyz ][-][upper.xyz][-] + // + // Lanes 3 and 7 are inactive + + uint lmod = lane % 4; + uint ldiv = lane / 4; + float vlo = lo.x; + float vhi = hi.x; + vlo = (lmod == 1) ? lo.y : vlo; + vhi = (lmod == 1) ? hi.y : vhi; + vlo = (lmod == 2) ? lo.z : vlo; + vhi = (lmod == 2) ? hi.z : vhi; + + float v = (ldiv == 0) ? vlo : -vhi; + + if( (1<min_max[ 6*lead_place + lmod + 3*ldiv ], v ); + } + + //if( lane == 0 ) + // atomic_add_local(&bin_info->counts[lead_place], popcount(active_mask & intel_sub_group_ballot(matching_bin)) ); + + active_mask = active_mask & intel_sub_group_ballot(!matching_bin); + } +} + +inline void BinInfo_add_primref(struct BinMapping* binMapping, local struct BFS_BinInfo* bin_info, PrimRef* primref, bool active_lane ) +{ + + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4( (p - binMapping->ofs) * binMapping->scale ); + + BinInfo_primref_ballot_loop( bin_info, 0, i.x, lower.xyz, upper.xyz, active_lane ); + BinInfo_primref_ballot_loop( bin_info, 1, i.y, lower.xyz, upper.xyz, active_lane ); + BinInfo_primref_ballot_loop( bin_info, 2, i.z, lower.xyz, upper.xyz, active_lane ); + + if (active_lane) + { + atomic_inc_local( &bin_info->counts[i.x + 0 * BFS_NUM_BINS] ); + atomic_inc_local( &bin_info->counts[i.y + 1 * BFS_NUM_BINS] ); + atomic_inc_local( &bin_info->counts[i.z + 2 * BFS_NUM_BINS] ); + } +} + +inline void BinInfo_merge(global struct BFS_BinInfo* global_info, local struct BFS_BinInfo* local_info) +{ + uint id = get_local_id(0); + for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0)) + { + float v = local_info->min_max[id]; + if( v != INFINITY ) + atomic_min(&global_info->min_max[id], v); + } + for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0)) + { + uint c = local_info->counts[id]; + if( c ) + atomic_add_global(&global_info->counts[id], c); + } +} + +inline struct AABB3f BinInfo_get_AABB(struct BFS_BinInfo* bin_info, ushort bin, ushort axis) +{ + float* min = &bin_info->min_max[6*(bin + axis*BFS_NUM_BINS)]; + float* max = min + 3; + struct AABB3f box; + for (uint 
i = 0; i < 3; i++) + { + box.lower[i] = min[i]; + box.upper[i] = -max[i]; + } + + return box; +} + +inline uint3 BinInfo_get_counts(struct BFS_BinInfo* bin_info, ushort bin) +{ + uint3 counts; + counts.x = bin_info->counts[bin + 0 * BFS_NUM_BINS]; // TODO: block load these + counts.y = bin_info->counts[bin + 1 * BFS_NUM_BINS]; + counts.z = bin_info->counts[bin + 2 * BFS_NUM_BINS]; + return counts; +} +inline uint BinInfo_get_count(struct BFS_BinInfo* bin_info, ushort bin, ushort axis) +{ + return bin_info->counts[bin + axis * BFS_NUM_BINS]; +} + + +void BVH2_Initialize( struct BVH2* bvh ) +{ + bvh->num_nodes = 1; +} + +inline bool BVH2_IsInnerNode( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + return (n->meta_ss & 0x10000) != 0; +} +inline uint BVH2_GetRoot( struct BVH2* bvh ) +{ + return 0; +} + +////////////////////////////////////////////// +// BVH2NodeMetaData funcs +////////////////////////////////////////////// +struct BVH2NodeMetaData +{ + uint meta_u; // leaf: primref start. inner: offset from node to its first child + uint meta_ss; +}; + +inline struct BVH2NodeMetaData BVH2_GetNodeMetaData( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + struct BVH2NodeMetaData meta; + meta.meta_u = n->meta_u; + meta.meta_ss = n->meta_ss; + return meta; +} + +inline bool BVH2NodeMetaData_IsInnerNode( struct BVH2NodeMetaData* meta ) +{ + return (meta->meta_ss & 0x10000) != 0; +} + +inline ushort BVH2NodeMetaData_GetLeafPrimCount( struct BVH2NodeMetaData* meta ) +{ + return meta->meta_ss & 0xffff; +} + +inline uint BVH2NodeMetaData_GetLeafPrimStart( struct BVH2NodeMetaData* meta ) +{ + return meta->meta_u; +} + +inline uint BVH2NodeMetaData_GetMask( struct BVH2NodeMetaData* meta ) +{ + return (meta->meta_ss>>24); +} + +////////////////////////////////////////////// + +inline ushort BVH2_GetLeafPrimCount( struct BVH2* bvh, uint node_index ) +{ + struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; + return n->meta_ss & 0xffff; +} +inline uint BVH2_GetLeafPrimStart( struct BVH2* bvh, uint node_index ) +{ + struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; + return n->meta_u; +} +inline uint2 BVH2_GetChildIndices( struct BVH2* bvh, uint node_index ) +{ + struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; + uint2 idx; + idx.x = n->meta_u; + idx.y = idx.x + (n->meta_ss & 0xffff); + return idx; +} + +inline float BVH2_GetNodeArea( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + return AABB3f_halfArea( &n->box ); +} + + +inline struct AABB3f BVH2_GetNodeBox( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + return n->box; +} +inline void BVH2_SetNodeBox( global struct BVH2* bvh, uint node_index, struct AABB3f* box ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + n->box = *box; +} + +inline void BVH2_SetNodeBox_lu( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + AABB3f_set( &n->box, lower, upper ); +} + +inline void BVH2_InitNodeBox( struct BVH2* bvh, uint node_index ) +{ + struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; + AABB3f_init( &n->box ); +} + +inline struct AABB BVH2_GetAABB( 
global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + struct AABB r; + r.lower.xyz = AABB3f_load_lower( &n->box ); + r.upper.xyz = AABB3f_load_upper( &n->box ); + return r; +} + +inline void BVH2_WriteInnerNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint2 child_offsets, uint mask ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + n->box = *box; + n->meta_u = child_offsets.x; + n->meta_ss = 0x10000 + (child_offsets.y - child_offsets.x) + (mask<<24); + // n->is_inner = true; +} + +inline void BVH2_WriteLeafNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint prim_start, uint prim_count, uint mask ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + n->box = *box; + n->meta_u = prim_start; + n->meta_ss = prim_count + (mask<<24); + // n->is_inner = true; +} + +inline uint BVH2_GetMask( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + return (n->meta_ss>>24); +} + + +uint BVH2_AllocateNodes( global struct BVH2* bvh, uint num_nodes ) +{ + return atomic_add_global( &bvh->num_nodes, num_nodes ); +} + +inline void BVH2_AtomicMergeNodeBox( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + AABB3f_atomic_merge_global_lu( &n->box, lower, upper ); +} + + +void BVH2_print( global struct BVH2* bvh, uint start_node ) +{ + if ( get_local_id( 0 ) == 0 && get_sub_group_id() == 0 ) + { + uint num_nodes = bvh->num_nodes; + + uint2 stack[BFS_MAX_DEPTH * 2]; + uint sp = 0; + + printf( "allocated_nodes=%u\n", num_nodes ); + + stack[sp++] = (uint2)(start_node, 0); + while ( sp > 0 ) + { + uint2 data = stack[--sp]; + uint node = data.x; + uint depth = data.y; + + for ( uint i = 0; i < depth; i++ ) + printf( " " ); + + if ( BVH2_IsInnerNode( bvh, node ) ) + { + uint2 kids = BVH2_GetChildIndices( bvh, node ); + printf( " %5u: inner: %u %u \n", node, kids.x, kids.y ); + stack[sp++] = (uint2)(kids.y, depth + 1); + stack[sp++] = (uint2)(kids.x, depth + 1); + + struct AABB3f l = BVH2_GetNodeBox( bvh, kids.x ); + struct AABB3f r = BVH2_GetNodeBox( bvh, kids.y ); + struct AABB3f p = BVH2_GetNodeBox( bvh, node ); + + float3 pl = AABB3f_load_lower( &p ); + float3 pu = AABB3f_load_upper( &p ); + float3 ll = AABB3f_load_lower( &l ); + float3 lu = AABB3f_load_upper( &l ); + float3 rl = AABB3f_load_lower( &r ); + float3 ru = AABB3f_load_upper( &r ); + if ( any( ll < pl ) || any( rl < pl ) || + any( lu > pu ) || any( ru > pu ) ) + { + for ( uint i = 0; i < depth; i++ ) + printf( " " ); + + printf( "BAD_BOUNDS!!!!!!!! %u\n", node ); + } + + + } + else + { + + uint start = BVH2_GetLeafPrimStart( bvh, node ); + uint count = BVH2_GetLeafPrimCount( bvh, node ); + printf( " %5u: leaf: start=%u count=%u\n ",node,start,count ); + + } + } + } + barrier( CLK_LOCAL_MEM_FENCE ); +} + + +global uint* SAHBuildGlobals_GetPrimrefIndices_In( struct SAHBuildGlobals* globals, bool odd_pass ) +{ + uint num_refs = globals->num_primrefs; + global uint* ib = (global uint*) globals->p_primref_index_buffers; + return ib + (odd_pass ? 
num_refs : 0); +} + +global uint* SAHBuildGlobals_GetPrimrefIndices_Out( struct SAHBuildGlobals* globals, bool odd_pass ) +{ + uint num_refs = globals->num_primrefs; + global uint* ib = (global uint*) globals->p_primref_index_buffers; + return ib + (odd_pass ? 0 : num_refs); +} + +global PrimRef* SAHBuildGlobals_GetPrimrefs( struct SAHBuildGlobals* globals ) +{ + return (global PrimRef*) globals->p_primrefs_buffer; +} + +global struct BVH2* SAHBuildGlobals_GetBVH2( struct SAHBuildGlobals* globals ) +{ + return (global struct BVH2*)globals->p_bvh2; +} + +uint SAHBuildGlobals_GetLeafSizeInBytes( struct SAHBuildGlobals* globals ) +{ + return globals->leaf_size; +} + +uint SAHBuildGlobals_GetLeafType( struct SAHBuildGlobals* globals ) +{ + return globals->leaf_type; +} + +uint SAHBuildGlobals_GetInternalNodeType( struct SAHBuildGlobals* globals ) +{ + return NODE_TYPE_INTERNAL; +} + +global struct BVHBase* SAHBuildGlobals_GetBVHBase( struct SAHBuildGlobals* globals ) +{ + return (global struct BVHBase*) globals->p_bvh_base; +} + +uint SAHBuildGlobals_GetTotalPrimRefs( struct SAHBuildGlobals* globals ) +{ + return globals->num_primrefs; +} + +inline bool SAHBuildGlobals_NeedBackPointers( struct SAHBuildGlobals* globals ) +{ + return globals->flags & SAH_FLAG_NEED_BACKPOINTERS; +} +inline bool SAHBuildGlobals_NeedMasks( struct SAHBuildGlobals* globals ) +{ + return globals->flags & SAH_FLAG_NEED_MASKS; +} + + +void SAHBuildGlobals_print( struct SAHBuildGlobals* globals ) +{ + if ( get_local_id( 0 ) == 0 ) + { + printf( "SAHBuildGlobals: %p\n", globals ); + printf( " p_primref_index_buffers =%p\n", globals->p_primref_index_buffers ); + printf( " p_primrefs_buffer =%p\n", globals->p_primrefs_buffer ); + printf( " p_bvh2 =%p\n", globals->p_bvh2 ); + printf( " p_globals =%p\n", globals->p_globals ); + printf( " p_bvh_base =%p\n", globals->p_bvh_base ); + printf( " num_primrefs = %u\n", globals->num_primrefs ); + printf( " leaf_size = %u\n", globals->leaf_size ); + printf( " leaf_type = %u\n", globals->leaf_type ); + printf( " p_qnode_buffer = %p\n", globals->p_qnode_root_buffer); + } + + barrier( CLK_LOCAL_MEM_FENCE ); +} + + +uint get_num_wgs(uint thread_count, uint WG_SIZE) +{ + return (thread_count + WG_SIZE - 1) / WG_SIZE; +} + + + + + +struct BFSDispatchArgs +{ + global struct VContextScheduler* scheduler; + global struct VContext* context; + global struct BVH2* bvh2; + global uint* primref_index_in; + global uint* primref_index_out; + global PrimRef* primref_buffer; + + uint wg_primref_begin; + uint wg_primref_end; + uint dispatch_primref_begin; + uint dispatch_primref_end; + uint context_id; + uint num_wgs; + uint bvh2_root; + uint global_num_primrefs; + bool do_mask_processing; +}; + + + + +// TODO_OPT: Enable larger WGs +// We need a way to do this in a portable fashion. 
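+//
+// Note on the primref index buffers referenced by BFSDispatchArgs above: the
+// SAHBuildGlobals_GetPrimrefIndices_In/Out helpers select opposite halves of a single
+// index allocation, so successive BFS partitioning passes ping-pong between the two
+// halves as the tree depth alternates between even and odd. A minimal sketch of that
+// selection, assuming the allocation holds 2 * num_primrefs indices (names here are
+// illustrative):
+//
+//    global uint* in  = index_base + (odd_pass ? num_primrefs : 0);
+//    global uint* out = index_base + (odd_pass ? 0 : num_primrefs);
+//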
+// Gen12 can support larger WGs than Gen9 can +// +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) +kernel void +begin( global struct VContextScheduler* scheduler, + dword leaf_size, + dword leaf_type, + global uint* primref_index_buffers, + global PrimRef* primref_buffer, + global struct BVH2* bvh2, + global struct BVHBase* bvh_base, + global struct Globals* globals, + global struct SAHBuildGlobals* sah_globals, + global uint2* qnode_root_buffer, + dword sah_globals_flags + ) +{ + dword num_primrefs = globals->numPrimitives; + if ( get_local_id( 0 ) == 0 ) + { + sah_globals->p_primrefs_buffer = (qword) primref_buffer; + sah_globals->p_primref_index_buffers = (qword)primref_index_buffers; + sah_globals->p_bvh2 = (qword) bvh2; + sah_globals->p_bvh_base = (qword) bvh_base; + sah_globals->leaf_size = leaf_size; + sah_globals->leaf_type = leaf_type; + sah_globals->num_primrefs = num_primrefs; + sah_globals->p_globals = (qword) globals; + sah_globals->p_qnode_root_buffer = (gpuva_t) qnode_root_buffer; + sah_globals->flags = sah_globals_flags; + + // initialize the spill stack + scheduler->bfs2_spill_stack.size = 0; + + // initialize BVH2 node counter + BVH2_Initialize( bvh2 ); + + // configure first vcontext for first build + scheduler->contexts[0].dispatch_primref_begin = 0; + scheduler->contexts[0].dispatch_primref_end = num_primrefs; + scheduler->contexts[0].bvh2_root = BVH2_GetRoot( bvh2 ); + scheduler->contexts[0].tree_depth = 0; + scheduler->contexts[0].batch_index = 0; + + scheduler->bfs_queue.records[0].context_id = 0; + + scheduler->contexts[0].num_left = 0; + scheduler->contexts[0].num_right = 0; + scheduler->contexts[0].lr_mask = 0; + + // copy centroid bounds into the BVH2 root node' + BVH2_SetNodeBox_lu( bvh2, BVH2_GetRoot( bvh2 ), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz ); + + // zero the trivial build counters.. these are only used by the batch-build path + // but single-wg QNode path (if used) depends on them + scheduler->num_trivial_builds = 0; + scheduler->num_single_builds = 0; + + // initialize the root-buffer counters + sah_globals->root_buffer_num_produced = 0; + sah_globals->root_buffer_num_produced_hi = 0; + sah_globals->root_buffer_num_consumed = 0; + sah_globals->root_buffer_num_consumed_hi = 0; + } + + // initialize vcontext states + for ( uint i = get_local_id( 0 ); i < BFS_NUM_VCONTEXTS; i += get_local_size( 0 ) ) + scheduler->vcontext_state[i] = (i==0) ? VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED; + + // initialize global bin info in vcontext - only context[0] will be used in first iteration + BinInfo_init( &scheduler->contexts[0].global_bin_info ); + LRBounds_init( &scheduler->contexts[0].lr_bounds ); + + // barrier( CLK_GLOBAL_MEM_FENCE ); // lsc flush ... driver now does these as part of COMPUTE_WALKER +} + +// TODO_OPT: Enable larger WGs +// We need a way to do this in a portable fashion. +// Gen12 can support larger WGs than Gen9 can +// + + +// TODO_OPT: Enable larger WGs +// We need a way to do this in a portable fashion. +// Gen12 can support larger WGs than Gen9 can +// +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(512, 1, 1))) +kernel void +categorize_builds_and_init_scheduler( + global struct VContextScheduler* scheduler, + global gpuva_t* globals_ptrs, // OCL-C does not allow kernel parameters to be pointer-to-pointer, so we trick it... 
+ global struct SAHBuildBuffersInfo* buffers_info, + global struct SAHBuildGlobals* builds_out, + dword num_builds +) +{ + local uint num_trivial; + local uint num_single; + local uint num_full; + + if (get_group_id(0) == 0) // first workgroup performs build categorization + { + if (get_local_id(0) == 0) + { + num_trivial = 0; + num_single = 0; + num_full = 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // first pass, count builds of each type + uint triv = 0; + uint single = 0; + uint full = 0; + for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0)) + { + global struct Globals* globals = (global struct Globals*) globals_ptrs[i]; + dword num_refs = globals->numPrimitives; + + if (num_refs <= TRIVIAL_BUILD_THRESHOLD) + triv++; + else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD) + single++; + else + full++; + } + + // merge counts across work-group. These variables are now offsets into this thread's ranges + triv = atomic_add_local(&num_trivial, triv); + single = atomic_add_local(&num_single, single); + full = atomic_add_local(&num_full, full); + + barrier(CLK_LOCAL_MEM_FENCE); + + global struct SAHBuildGlobals* trivial_builds_out = builds_out; + global struct SAHBuildGlobals* single_builds_out = builds_out + num_trivial; + global struct SAHBuildGlobals* full_builds_out = builds_out + num_trivial + num_single; + + for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0)) + { + global struct Globals* globals = (global struct Globals*) globals_ptrs[i]; + global struct SAHBuildBuffersInfo* buffers = &buffers_info[i]; + + dword num_refs = globals->numPrimitives; + dword leaf_type = globals->leafPrimType; + dword leaf_size = globals->leafSize; + + global struct SAHBuildGlobals* place; + if (num_refs <= TRIVIAL_BUILD_THRESHOLD) + place = trivial_builds_out + (triv++); + else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD) + place = single_builds_out + (single++); + else + place = full_builds_out + (full++); + + place->p_primref_index_buffers = buffers->p_primref_index_buffers; + place->p_primrefs_buffer = buffers->p_primrefs_buffer; + place->p_bvh2 = buffers->p_bvh2; + place->p_bvh_base = buffers->p_bvh_base; + place->p_globals = (gpuva_t)globals; + place->num_primrefs = num_refs; + place->leaf_size = leaf_size; + place->leaf_type = leaf_type; + place->flags = buffers->sah_globals_flags; + place->p_qnode_root_buffer = buffers->p_qnode_root_buffer; + + // only initialize BVH2 if it will actually be used by the build + // trivial passes will not use it + if( num_refs > SINGLE_WG_BUILD_THRESHOLD ) + { + // initialize BVH2 node counter + global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(place); + BVH2_Initialize(bvh2); + + // copy centroid bounds into the BVH2 root node' + BVH2_SetNodeBox_lu(bvh2, BVH2_GetRoot(bvh2), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz); + } + } + + if (get_local_id(0) == 0) + { + scheduler->num_trivial_builds = num_trivial; + scheduler->num_single_builds = num_single; + scheduler->batched_build_offset = num_trivial + num_single; + scheduler->batched_build_count = num_full; + } + } + else // second workgroup initializes the scheduler + { + // initialize vcontext states + for (uint i = get_local_id(0); i < BFS_NUM_VCONTEXTS; i += get_local_size(0)) + scheduler->vcontext_state[i] = (i == 0) ? 
VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED; + + // initialize global bin info in vcontexts + for (uint i = get_sub_group_id(); i < BFS_NUM_VCONTEXTS; i += get_num_sub_groups()) + BinInfo_init_subgroup(&scheduler->contexts[i].global_bin_info); + + // initialize the spill stack + if (get_local_id(0) == 0) + scheduler->bfs2_spill_stack.size = 0; + } + + //barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE );// lsc flush ... driver now does these as part of COMPUTE_WALKER +} + + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BFS_NUM_VCONTEXTS, 1, 1))) +kernel void +begin_batchable( + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* sah_globals +) +{ + ushort scheduler_build_offset = scheduler->batched_build_offset; + ushort scheduler_num_builds = scheduler->batched_build_count; + + ushort num_builds = min( scheduler_num_builds, (ushort)BFS_NUM_VCONTEXTS ); + + uint num_wgs = 0; + + ushort tid = get_local_id(0); + if ( tid < num_builds ) + { + ushort batch_index = scheduler_build_offset + tid; + + uint num_primrefs = sah_globals[batch_index].num_primrefs; + + // configure first vcontext for first build + scheduler->contexts[tid].dispatch_primref_begin = 0; + scheduler->contexts[tid].dispatch_primref_end = num_primrefs; + scheduler->contexts[tid].bvh2_root = BVH2_GetRoot( SAHBuildGlobals_GetBVH2(&sah_globals[batch_index]) ); + scheduler->contexts[tid].tree_depth = 0; + scheduler->contexts[tid].batch_index = batch_index; + scheduler->vcontext_state[tid] = VCONTEXT_STATE_EXECUTING; + + scheduler->contexts[tid].num_left = 0; + scheduler->contexts[tid].num_right = 0; + scheduler->contexts[tid].lr_mask = 0; + + num_wgs = get_num_wgs( num_primrefs, BFS_WG_SIZE ); + + scheduler->bfs_queue.wg_count[tid] = num_wgs; + scheduler->bfs_queue.records[tid].batch_index = batch_index; + scheduler->bfs_queue.records[tid].context_id = tid; + } + + num_wgs = work_group_reduce_add(num_wgs); + + if (tid == 0) + { + // write out build count and offset for next BFS iteration + scheduler->batched_build_offset = scheduler_build_offset + num_builds; + scheduler->batched_build_count = scheduler_num_builds - num_builds; + + // write out initial WG count and loop termination mask for command streamer to consume + scheduler->batched_build_wg_count = num_wgs; + scheduler->batched_build_loop_mask = (scheduler_num_builds > num_builds) ? 
1 : 0; + + scheduler->bfs_queue.num_dispatches = num_builds; + } + + for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() ) + BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info ); + + for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() ) + LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds ); +} + + + +bool is_leaf( uint num_refs ) +{ + return num_refs <= TREE_ARITY; +} + +bool is_dfs( uint num_refs ) +{ + return num_refs > TREE_ARITY&& num_refs <= DFS_THRESHOLD; +} + +bool is_bfs( uint num_refs ) +{ + return num_refs > DFS_THRESHOLD; +} + +int2 is_leaf_2( uint2 num_refs ) +{ + return num_refs.xy <= TREE_ARITY; +} +int2 is_bfs_2( uint2 num_refs ) +{ + return num_refs.xy > DFS_THRESHOLD; +} + +int2 is_dfs_2( uint2 num_refs ) +{ + return num_refs.xy > TREE_ARITY && num_refs.xy <= DFS_THRESHOLD; +} + +#if 0 +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +sg_scheduler( global struct VContextScheduler* scheduler ) +{ + local struct BFS1SpillStackEntry SLM_local_spill_stack[BFS_NUM_VCONTEXTS]; + local uchar SLM_context_state[BFS_NUM_VCONTEXTS]; + local vcontext_id_t SLM_free_list[BFS_NUM_VCONTEXTS]; + local vcontext_id_t SLM_exec_list[BFS_NUM_VCONTEXTS]; + + + varying ushort lane = get_sub_group_local_id(); + + uniform uint free_list_size = 0; + uniform uint exec_list_size = 0; + + // read context states, build lists of free and executing contexts + for (varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size()) + { + uchar state = scheduler->vcontext_state[i]; + SLM_context_state[i] = state; + + uniform ushort exec_mask = intel_sub_group_ballot(state == VCONTEXT_STATE_EXECUTING); + + varying ushort prefix_exec = subgroup_bit_prefix_exclusive(exec_mask); + varying ushort prefix_free = lane - prefix_exec; + varying ushort exec_list_pos = exec_list_size + prefix_exec; + varying ushort free_list_pos = free_list_size + prefix_free; + + if (state == VCONTEXT_STATE_EXECUTING) + SLM_exec_list[exec_list_pos] = i; + else + SLM_free_list[free_list_pos] = i; + + uniform ushort num_exec = popcount(exec_mask); + exec_list_size += num_exec; + free_list_size += get_sub_group_size() - num_exec; + } + + uniform uint total_bfs_dispatches = 0; + uniform uint total_dfs_dispatches = 0; + uniform uint bfs_spill_stack_size = 0; + uniform uint total_bfs_wgs = 0; + + // process executing context. accumulate bfs/dfs dispatches and free-list entries + for (uint i = 0; i < exec_list_size; i+= get_sub_group_size() ) + { + varying ushort num_dfs_dispatches = 0; + varying ushort num_bfs_spills = 0; + + varying ushort num_bfs_children; + varying ushort context_id; + struct VContext* context; + varying uint num_left ; + varying uint num_right ; + varying uint primref_begin ; + varying uint primref_end ; + varying uint depth ; + + bool active_lane = (i + lane) < exec_list_size; + if ( active_lane ) + { + context_id = SLM_exec_list[i + lane]; + context = &scheduler->contexts[context_id]; + + num_left = context->num_left; + num_right = context->num_right; + primref_begin = context->dispatch_primref_begin; + primref_end = context->dispatch_primref_end; + depth = context->tree_depth; + + // get dispatch counts + + num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right); + num_bfs_children = is_bfs(num_left) + is_bfs(num_right); + num_bfs_spills = (num_bfs_children == 2) ? 
1 : 0; + } + + // allocate space for DFS, BFS dispatches, and BFS spills + varying uint dfs_pos = total_dfs_dispatches + sub_group_scan_exclusive_add(num_dfs_dispatches); + varying ushort mask_bfs_spills = intel_sub_group_ballot(num_bfs_children & 2); // spill if #children == 2 + varying ushort mask_bfs_dispatches = intel_sub_group_ballot(num_bfs_children & 3); // dispatch if #children == 1 or 2 + varying uint bfs_spill_pos = bfs_spill_stack_size + subgroup_bit_prefix_exclusive(mask_bfs_spills); + varying uint bfs_dispatch_pos = total_bfs_dispatches + subgroup_bit_prefix_exclusive(mask_bfs_dispatches); + + total_dfs_dispatches += sub_group_reduce_add(num_dfs_dispatches); + bfs_spill_stack_size += popcount(mask_bfs_spills); + total_bfs_dispatches += popcount(mask_bfs_dispatches); + + varying uint num_bfs_wgs = 0; + if (active_lane) + { + if (num_dfs_dispatches) + { + if (is_dfs(num_left)) + { + scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin; + scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left; + scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->left_bvh2_root; + scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; + dfs_pos++; + } + if (is_dfs(num_right)) + { + scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left; + scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right; + scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->right_bvh2_root; + scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; + } + } + + uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right); + if (num_bfs_children == 2) + { + // spill the right child.. push an entry onto local spill stack + SLM_local_spill_stack[bfs_spill_pos].primref_begin = primref_begin + num_left; + SLM_local_spill_stack[bfs_spill_pos].primref_end = primref_end; + SLM_local_spill_stack[bfs_spill_pos].bvh2_root = context->right_bvh2_root; + SLM_local_spill_stack[bfs_spill_pos].tree_depth = depth + 1; + + // setup BFS1 dispatch for left child + context->dispatch_primref_end = primref_begin + num_left; + context->bvh2_root = context->left_bvh2_root; + context->tree_depth = depth + 1; + num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE); + + scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs; + scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id; + } + else if (num_bfs_children == 1) + { + // setup BFS1 dispatch for whichever child wants it + if (is_bfs(num_left)) + { + // bfs on left child + context->dispatch_primref_end = context->dispatch_primref_begin + num_left; + context->bvh2_root = context->left_bvh2_root; + context->tree_depth = depth + 1; + num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE); + } + else + { + // bfs on right child + context->dispatch_primref_begin = context->dispatch_primref_begin + num_left; + context->bvh2_root = context->right_bvh2_root; + context->tree_depth = depth + 1; + num_bfs_wgs = get_num_wgs(num_right, BFS_WG_SIZE); + } + + scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs; + scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id; + } + else + { + // no bfs dispatch.. 
this context is now free + SLM_context_state[context_id] = VCONTEXT_STATE_UNALLOCATED; + } + } + + // count bfs work groups + total_bfs_wgs += sub_group_reduce_add(num_bfs_wgs); + + // add newly deallocated contexts to the free list + uniform uint free_mask = intel_sub_group_ballot( active_lane && num_bfs_children == 0); + varying uint free_list_pos = free_list_size + subgroup_bit_prefix_exclusive(free_mask); + free_list_size += popcount(free_mask); + + if ( free_mask & (1<bfs2_spill_stack.size; + + if(bfs_spill_stack_size < free_list_size && memory_spill_stack_size > 0 ) + { + uniform uint read_count = min(free_list_size - bfs_spill_stack_size, memory_spill_stack_size); + + for (varying uint i = lane; i < read_count; i+= get_sub_group_size()) + SLM_local_spill_stack[bfs_spill_stack_size + i] = scheduler->bfs2_spill_stack.entries[memory_spill_stack_size - 1 - i]; + + bfs_spill_stack_size += read_count; + memory_spill_stack_size -= read_count; + } + + // steal pending BFS work and assign it to free contexts + uniform uint num_steals = min(bfs_spill_stack_size, free_list_size); + + for (uniform uint i = 0; i < num_steals; i += get_sub_group_size()) + { + varying uint num_bfs_wgs = 0; + + if (i + lane < num_steals) + { + uint context_id = SLM_free_list[i+lane]; + struct VContext* context = &scheduler->contexts[context_id]; + struct BFS1SpillStackEntry entry = SLM_local_spill_stack[i+lane]; + + context->dispatch_primref_begin = entry.primref_begin; + context->dispatch_primref_end = entry.primref_end; + context->bvh2_root = entry.bvh2_root; + context->tree_depth = entry.tree_depth; + + num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE); + + scheduler->bfs_queue.wg_count[total_bfs_dispatches + i + lane] = num_bfs_wgs; + scheduler->bfs_queue.records[total_bfs_dispatches + i + lane].context_id = context_id; + + SLM_context_state[context_id] = VCONTEXT_STATE_EXECUTING; + } + + total_bfs_wgs += sub_group_reduce_add( num_bfs_wgs ); + } + + total_bfs_dispatches += num_steals; + + // write out excess spills to global spill stack + uniform uint extra_spills = bfs_spill_stack_size - num_steals; + for (varying uint i = lane; i < extra_spills; i += get_sub_group_size()) + { + scheduler->bfs2_spill_stack.entries[memory_spill_stack_size + i] = SLM_local_spill_stack[num_steals+i]; + } + + + // write out modified context states + for ( varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size()) + scheduler->vcontext_state[i] = SLM_context_state[i]; + + + if (get_local_id(0) == 0) + { + // write out new memory stack size + scheduler->bfs2_spill_stack.size = memory_spill_stack_size + extra_spills; + + // store workgroup counters + scheduler->bfs_queue.num_dispatches = total_bfs_dispatches; + scheduler->num_bfs_wgs = total_bfs_wgs; + scheduler->num_dfs_wgs = total_dfs_dispatches; + } + + // barrier(CLK_GLOBAL_MEM_FENCE); // make memory writes globally visible// lsc flush ... 
driver now does these as part of COMPUTE_WALKER +} +#endif + +#define SCHEDULER_SG_SIZE 16 +#define SCHEDULER_WG_SIZE BFS_NUM_VCONTEXTS +#define SCHEDULER_NUM_SGS (SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE) + + +struct BFSDispatchArgs get_bfs_args_from_record_batchable( + struct BFSDispatchRecord* record, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ); + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(SCHEDULER_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(SCHEDULER_SG_SIZE))) +kernel void +scheduler(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS1SpillStackEntry SLM_local_spill_stack[2 * BFS_NUM_VCONTEXTS]; + local uint SLM_local_spill_stack_size; + local uint SLM_dfs_dispatch_count; + + if (get_local_id(0) == 0) + { + SLM_local_spill_stack_size = 0; + SLM_dfs_dispatch_count = 0; + } + + uint context_id = get_local_id(0); + uint state = scheduler->vcontext_state[context_id]; + uint initial_state = state; + + uint batch_index = 0; + global struct VContext* context = &scheduler->contexts[context_id]; + + barrier(CLK_LOCAL_MEM_FENCE); + + + uint global_spill_stack_size = scheduler->bfs2_spill_stack.size; + + + if (state == VCONTEXT_STATE_EXECUTING) + { + uint left_bvh2_root; + uint right_bvh2_root; + + uint num_left = context->num_left; + uint num_right = context->num_right; + + uint primref_begin = context->dispatch_primref_begin; + uint primref_end = context->dispatch_primref_end; + + uint depth = context->tree_depth; + uint batch_index = context->batch_index; + + struct BFSDispatchRecord record; + record.context_id = context_id; + record.batch_index = context->batch_index; + + struct BFSDispatchArgs args = get_bfs_args_from_record_batchable( &record, scheduler, sah_globals); + + // do cleanup of bfs_pass2 + { + // compute geom bounds + struct AABB3f left_geom_bounds; + struct AABB3f right_geom_bounds; + struct AABB3f left_centroid_bounds; + struct AABB3f right_centroid_bounds; + uint2 lr_counts = (uint2)(num_left, num_right); + + { + left_centroid_bounds = LRBounds_get_left_centroid( &context->lr_bounds ); + left_geom_bounds = LRBounds_get_left_geom( &context->lr_bounds ); + right_centroid_bounds = LRBounds_get_right_centroid( &context->lr_bounds ); + right_geom_bounds = LRBounds_get_right_geom( &context->lr_bounds ); + } + + int2 v_is_leaf = is_leaf_2( lr_counts ); + int2 v_is_dfs = is_dfs_2( lr_counts ); + int2 v_is_bfs = is_bfs_2( lr_counts ); + uint left_mask = args.do_mask_processing ? context->lr_mask & 0xff : 0xff; + uint right_mask = args.do_mask_processing ? (context->lr_mask & 0xff00) >> 8 : 0xff; + + // how many BVH2 nodes do we need to allocate? For DFS, we need to pre-allocate full subtree + uint2 lr_node_counts = select( (uint2)(1,1), (2*lr_counts-1), v_is_dfs ); + uint left_node_count = lr_node_counts.x; + uint right_node_count = lr_node_counts.y; + + // allocate the nodes + uint first_node = BVH2_AllocateNodes( args.bvh2, left_node_count + right_node_count ); + + // point our root node at its children + left_bvh2_root = first_node; + right_bvh2_root = first_node + left_node_count; + + // store combined geom bounds in the root node's AABB.. 
we previously stored centroid bounds there + // but node creation requires geom bounds + struct AABB3f geom_bounds = left_geom_bounds; + AABB3f_extend(&geom_bounds, &right_geom_bounds); + BVH2_WriteInnerNode( args.bvh2, args.bvh2_root, &geom_bounds, (uint2)(left_bvh2_root,right_bvh2_root), left_mask | right_mask ); + +// printf(" node: %u mask: %x\n", args.bvh2_root, left_mask|right_mask ); + + // store the appropriate AABBs in the child nodes + // - BFS passes need centroid bounds + // - DFS passes need geom bounds + // Here we also write leaf connectivity information (prim start+count) + // this will be overwritten later if we are creating an inner node + struct AABB3f left_box, right_box; + left_box = AABB3f_select( left_geom_bounds, left_centroid_bounds, v_is_bfs.xxx ); + right_box = AABB3f_select( right_geom_bounds, right_centroid_bounds, v_is_bfs.yyy ); + + uint left_start = primref_begin; + uint right_start = primref_begin + num_left; + BVH2_WriteLeafNode( args.bvh2, left_bvh2_root, &left_box, left_start, num_left, left_mask ); + BVH2_WriteLeafNode( args.bvh2, right_bvh2_root, &right_box, right_start, num_right, right_mask ); + + // make input and output primref index buffers consistent in the event we're creating a leaf + // There should only ever be one leaf created, otherwise we'd have done a DFS pass sooner + if (any( v_is_leaf.xy )) + { + uint start = v_is_leaf.x ? left_start : right_start; + uint num_refs = v_is_leaf.x ? num_left : num_right; + + for(uint i = 0; i < num_refs; i++) + { + args.primref_index_in[start + i] = args.primref_index_out[start + i]; + } + } + } + + // when BFS2 finishes, we need to dispatch two child tasks. + // DFS dispatches can run free and do not need a context + // BFS dispatches need a context. + // In the case where both of the child nodes are BFS, the current context can immediately run one of the child dispatches + // and the other is spilled for an unallocated context to pick up + + uint num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right); + if (num_dfs_dispatches) + { + uint dfs_pos = atomic_add_local(&SLM_dfs_dispatch_count, num_dfs_dispatches); + if (is_dfs(num_left)) + { + scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin; + scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left; + scheduler->dfs_queue.records[dfs_pos].bvh2_base = left_bvh2_root; + scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; + scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index; + dfs_pos++; + } + if (is_dfs(num_right)) + { + scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left; + scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right; + scheduler->dfs_queue.records[dfs_pos].bvh2_base = right_bvh2_root; + scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; + scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index; + } + } + + uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right); + if (num_bfs_children) + { + uint place = atomic_add_local(&SLM_local_spill_stack_size, num_bfs_children); + if (is_bfs(num_left)) + { + SLM_local_spill_stack[place].primref_begin = primref_begin; + SLM_local_spill_stack[place].primref_end = primref_begin + num_left; + SLM_local_spill_stack[place].bvh2_root = left_bvh2_root; + SLM_local_spill_stack[place].tree_depth = depth + 1; + SLM_local_spill_stack[place].batch_index = batch_index; + place++; + } + if (is_bfs(num_right)) + { + SLM_local_spill_stack[place].primref_begin = primref_begin + num_left; + 
SLM_local_spill_stack[place].primref_end = primref_end; + SLM_local_spill_stack[place].bvh2_root = right_bvh2_root; + SLM_local_spill_stack[place].tree_depth = depth + 1; + SLM_local_spill_stack[place].batch_index = batch_index; + place++; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint local_spill_stack_size = SLM_local_spill_stack_size; + + struct BFS1SpillStackEntry entry; + state = VCONTEXT_STATE_UNALLOCATED; + if (context_id < local_spill_stack_size) + { + // pull BFS work from the local spill stack if there's enough work there + entry = SLM_local_spill_stack[context_id]; + state = VCONTEXT_STATE_EXECUTING; + } + else if ((context_id - local_spill_stack_size) < (global_spill_stack_size)) + { + // if there isn't enough work on the local stack, consume from the global one + uint global_pos = (global_spill_stack_size - 1) - (context_id - local_spill_stack_size); + entry = scheduler->bfs2_spill_stack.entries[global_pos]; + state = VCONTEXT_STATE_EXECUTING; + } + + // contexts which received work set themselves up for the next BFS1 dispatch + uint num_bfs_wgs = 0; + uint num_bfs_dispatches = 0; + if (state == VCONTEXT_STATE_EXECUTING) + { + context->dispatch_primref_begin = entry.primref_begin; + context->dispatch_primref_end = entry.primref_end; + context->bvh2_root = entry.bvh2_root; + context->tree_depth = entry.tree_depth; + context->batch_index = entry.batch_index; + + context->num_left = 0; + context->num_right = 0; + context->lr_mask = 0; + + batch_index = entry.batch_index; + num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE); + num_bfs_dispatches = 1; + } + + + if (local_spill_stack_size > BFS_NUM_VCONTEXTS) + { + // write out additional spills if we produced more work than we can consume + uint excess_spills = local_spill_stack_size - BFS_NUM_VCONTEXTS; + uint write_base = global_spill_stack_size; + uint lid = get_local_id(0); + if (lid < excess_spills) + scheduler->bfs2_spill_stack.entries[write_base + lid] = SLM_local_spill_stack[BFS_NUM_VCONTEXTS + lid]; + + if (lid == 0) + scheduler->bfs2_spill_stack.size = global_spill_stack_size + excess_spills; + } + else if (global_spill_stack_size > 0) + { + // otherwise, if we consumed any spills from the global stack, update the stack size + if (get_local_id(0) == 0) + { + uint global_spills_consumed = min(global_spill_stack_size, BFS_NUM_VCONTEXTS - local_spill_stack_size); + scheduler->bfs2_spill_stack.size = global_spill_stack_size - global_spills_consumed; + } + } + + + // Do various WG reductions.. 
the code below is a hand-written version of the following: + // + // uint bfs_dispatch_queue_pos = work_group_scan_exclusive_add( num_bfs_dispatches ); + // uint reduce_num_bfs_wgs = work_group_reduce_add(num_bfs_wgs); + // uint reduce_num_bfs_dispatches = work_group_reduce_add(num_bfs_dispatches); + uint bfs_dispatch_queue_pos; + uint reduce_num_bfs_dispatches; + uint reduce_num_bfs_wgs; + local uint partial_dispatches[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE]; + local uint partial_wgs[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE]; + { + partial_dispatches[get_sub_group_id()] = sub_group_reduce_add(num_bfs_dispatches); + partial_wgs[get_sub_group_id()] = sub_group_reduce_add(num_bfs_wgs); + + uint sg_prefix = sub_group_scan_exclusive_add(num_bfs_dispatches); + + uint prefix_dispatches = 0; + uint total_dispatches = 0; + uint total_wgs = 0; + ushort lane = get_sub_group_local_id(); + + barrier(CLK_LOCAL_MEM_FENCE); + + for (ushort i = 0; i < SCHEDULER_NUM_SGS; i += SCHEDULER_SG_SIZE) // this loop is intended to be fully unrolled after compilation + { + uint p_dispatch = partial_dispatches[i + lane]; + uint p_wg = partial_wgs[i + lane]; + + prefix_dispatches += (i + lane < get_sub_group_id()) ? p_dispatch : 0; + total_dispatches += p_dispatch; + total_wgs += p_wg; + } + + bfs_dispatch_queue_pos = sg_prefix + sub_group_reduce_add(prefix_dispatches); + reduce_num_bfs_dispatches = sub_group_reduce_add(total_dispatches); + reduce_num_bfs_wgs = sub_group_reduce_add(total_wgs); + } + + // insert records into BFS queue + if (num_bfs_dispatches) + { + scheduler->bfs_queue.wg_count[bfs_dispatch_queue_pos] = num_bfs_wgs; + scheduler->bfs_queue.records[bfs_dispatch_queue_pos].context_id = context_id; + scheduler->bfs_queue.records[bfs_dispatch_queue_pos].batch_index = batch_index; + } + + + // store modified vcontext state if it has changed + if (initial_state != state) + scheduler->vcontext_state[context_id] = state; + + + // store workgroup counters + if (get_local_id(0) == 0) + { + scheduler->bfs_queue.num_dispatches = reduce_num_bfs_dispatches; + scheduler->num_bfs_wgs = reduce_num_bfs_wgs; + scheduler->num_dfs_wgs = SLM_dfs_dispatch_count; + } + + const uint contexts_to_clear = min( (uint)BFS_NUM_VCONTEXTS, (uint)(local_spill_stack_size+global_spill_stack_size) ); + + for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() ) + BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info ); + + for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() ) + LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds ); +} + +#if 0 +uint record_search( struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue ) +{ + uint group = get_group_id(0); + ushort lane = get_sub_group_local_id(); + uint num_dispatches = queue->num_dispatches; + uint base = 0; + for (uint i = 0; i < num_dispatches; i += get_sub_group_size()) + { + uint counts = intel_sub_group_block_read(&queue->wg_count[i]); + + for (uint j = 0; j < get_sub_group_size(); j++) + { + uint n = sub_group_broadcast(counts, j); + if (group < n) + { + *record_out = queue->records[i + j]; + return group; + } + group -= n; + } + } + + return 0; // NOTE: unreachable in practice +} +#endif + + +uint record_search(struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue) +{ + uint group = get_group_id(0); + + uint num_dispatches = queue->num_dispatches; + + uint dispatch_id = 0; + uint local_id = 0; + uint i = 0; + do + { + uint counts = 
intel_sub_group_block_read(&queue->wg_count[i]); + uint prefix = sub_group_scan_exclusive_add(counts); + + uint g = group - prefix; + uint ballot = intel_sub_group_ballot(g < counts); + if (ballot) + { + uint lane = ctz(ballot); + dispatch_id = i + lane; + local_id = intel_sub_group_shuffle(g, lane); + break; + } + + group -= sub_group_broadcast(prefix + counts, get_sub_group_size() - 1); + + i += get_sub_group_size(); + } while (i < num_dispatches); + + + *record_out = queue->records[dispatch_id]; + return local_id; +} + + + + +struct BFSDispatchArgs get_bfs_args(struct BFSDispatchRecord* record, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals, uint local_group_id) +{ + uint context_id = record->context_id; + struct VContext* context = &scheduler->contexts[context_id]; + bool odd_pass = context->tree_depth & 1; + + struct BFSDispatchArgs args; + args.scheduler = scheduler; + args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, odd_pass ); + args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, odd_pass ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); + args.wg_primref_begin = context->dispatch_primref_begin + local_group_id * BFS_WG_SIZE; + args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, context->dispatch_primref_end ); + args.dispatch_primref_begin = context->dispatch_primref_begin; + args.dispatch_primref_end = context->dispatch_primref_end; + args.context_id = context_id; + args.context = &scheduler->contexts[context_id]; + args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE; + args.bvh2_root = context->bvh2_root; + args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); + args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); + args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals ); + return args; +} + +struct BFSDispatchArgs get_bfs_args_queue( global struct BFSDispatchQueue* queue, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals ) +{ + + // TODO_OPT: Load this entire prefix array into SLM instead of searching.. + // Or use sub-group ops + + struct BFSDispatchRecord record; + uint local_group_id = record_search(&record, queue); + + return get_bfs_args(&record, scheduler, globals, local_group_id); +} + + +struct BFSDispatchArgs get_bfs_args_from_record( struct BFSDispatchRecord* record, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals ) +{ + return get_bfs_args(record, scheduler, globals, 0); +} + + +struct BFSDispatchArgs get_bfs_args_batchable( + global struct BFSDispatchQueue* queue, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ) +{ + + // TODO_OPT: Load this entire prefix array into SLM instead of searching.. 
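+    // (record_search below walks the wg_count prefix sums with sub-group scans to map
+    //  this work-group's flat group id onto a dispatch record plus a group-local id.
+    //  Worked example, purely illustrative: with wg_count = {3, 5, 2}, global group 6
+    //  falls past the first 3 groups of dispatch 0 and resolves to dispatch 1 with
+    //  local group id 3.)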
+ // Or use sub-group ops + + struct BFSDispatchRecord record; + uint local_group_id = record_search(&record, queue); + + global struct SAHBuildGlobals* globals = globals_buffer + record.batch_index; + + return get_bfs_args(&record, scheduler, globals, local_group_id); +} + + +struct BFSDispatchArgs get_bfs_args_from_record_batchable( + struct BFSDispatchRecord* record, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ) +{ + global struct SAHBuildGlobals* globals = globals_buffer + record->batch_index; + + return get_bfs_args(record, scheduler, globals, 0); +} + +struct BFSDispatchArgs get_bfs_args_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals ) +{ + uint context_id = 0; + + uint num_refs = SAHBuildGlobals_GetTotalPrimRefs( globals ); + + struct BFSDispatchArgs args; + args.scheduler = scheduler; + args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, false ); + args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, false ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); + args.wg_primref_begin = get_group_id(0) * BFS_WG_SIZE; + args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, num_refs ); + args.dispatch_primref_begin = 0; + args.dispatch_primref_end = num_refs; + args.context_id = context_id; + args.context = &scheduler->contexts[context_id]; + args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE; + args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); + args.bvh2_root = BVH2_GetRoot( args.bvh2 ); + args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); + args.do_mask_processing = SAHBuildGlobals_NeedMasks(globals); + return args; +} + + +inline void BinMapping_init( struct BinMapping* binMapping, struct AABB3f* centBounds, const uint bins ) +{ + const float4 eps = 1E-34f; + const float4 omega = 1E+34f; + float3 l = AABB3f_load_lower( centBounds ); + float3 u = AABB3f_load_upper( centBounds ); + float4 diag; + diag.xyz = max( eps.xyz, u - l ); + diag.w = 0; + float4 scale = (float4)(0.99f * (float)bins) / diag; + scale = select( (float4)(0.0f), scale, diag > eps ); + scale = select( (float4)(0.0f), scale, diag < omega ); + binMapping->scale = scale; + binMapping->ofs.xyz = l.xyz; + binMapping->ofs.w = 0; +} + + +inline ulong getBestSplit( float3 sah, uint ID, const float4 scale, const ulong defaultSplit ) +{ + ulong splitX = (((ulong)as_uint( sah.x )) << 32) | ((uint)ID << 2) | 0; + ulong splitY = (((ulong)as_uint( sah.y )) << 32) | ((uint)ID << 2) | 1; + ulong splitZ = (((ulong)as_uint( sah.z )) << 32) | ((uint)ID << 2) | 2; + /* ignore zero sized dimensions */ + splitX = select( splitX, defaultSplit, (ulong)(scale.x == 0) ); + splitY = select( splitY, defaultSplit, (ulong)(scale.y == 0) ); + splitZ = select( splitZ, defaultSplit, (ulong)(scale.z == 0) ); + ulong bestSplit = min( min( splitX, splitY ), splitZ ); + bestSplit = sub_group_reduce_min( bestSplit ); + return bestSplit; +} + + + +inline float left_to_right_area16( struct AABB3f* low ) +{ + struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low ); + return halfArea_AABB3f( &low_prefix ); +} + +inline uint left_to_right_counts16( uint low ) +{ + return sub_group_scan_exclusive_add( low ); +} + +inline float right_to_left_area16( struct AABB3f* low ) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 
- subgroupLocalID; + struct AABB3f low_reverse = AABB3f_sub_group_shuffle( low, ID ); + struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse ); + const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID ); + return low_area; +} + +inline uint right_to_left_counts16( uint low ) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + const uint low_reverse = intel_sub_group_shuffle( low, ID ); + const uint low_prefix = sub_group_scan_inclusive_add( low_reverse ); + return intel_sub_group_shuffle( low_prefix, ID ); +} + +inline float2 left_to_right_area32( struct AABB3f* low, struct AABB3f* high ) +{ + struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low ); + struct AABB3f low_reduce = AABB3f_sub_group_reduce( low ); + struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max( high ); + AABB3f_extend( &high_prefix, &low_reduce ); + const float low_area = halfArea_AABB3f( &low_prefix ); + const float high_area = halfArea_AABB3f( &high_prefix ); + return (float2)(low_area, high_area); +} + +inline uint2 left_to_right_counts32( uint low, uint high ) +{ + const uint low_prefix = sub_group_scan_exclusive_add( low ); + const uint low_reduce = sub_group_reduce_add( low ); + const uint high_prefix = sub_group_scan_exclusive_add( high ); + return (uint2)(low_prefix, low_reduce + high_prefix); +} + +inline float2 right_to_left_area32( struct AABB3f* low, struct AABB3f* high ) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + struct AABB3f low_reverse = AABB3f_sub_group_shuffle( high, ID ); + struct AABB3f high_reverse = AABB3f_sub_group_shuffle( low, ID ); + struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse ); + struct AABB3f low_reduce = AABB3f_sub_group_reduce( &low_reverse ); + struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max( &high_reverse ); + AABB3f_extend( &high_prefix, &low_reduce ); + const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &high_prefix ), ID ); + const float high_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID ); + return (float2)(low_area, high_area); +} + +inline uint2 right_to_left_counts32( uint low, uint high ) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + const uint low_reverse = intel_sub_group_shuffle( high, ID ); + const uint high_reverse = intel_sub_group_shuffle( low, ID ); + const uint low_prefix = sub_group_scan_inclusive_add( low_reverse ); + const uint low_reduce = sub_group_reduce_add( low_reverse ); + const uint high_prefix = sub_group_scan_inclusive_add( high_reverse ) + low_reduce; + return (uint2)(intel_sub_group_shuffle( high_prefix, ID ), intel_sub_group_shuffle( low_prefix, ID )); +} + +inline uint fastDivideBy6_uint( uint v ) +{ +#if 1 + const ulong u = (ulong)v >> 1; + return (uint)((u * 0x55555556ul) >> 32); +#else + return v / 6; +#endif +} + +inline uint3 fastDivideBy6_uint3( uint3 v ) +{ + return (uint3)(fastDivideBy6_uint( v.x ), fastDivideBy6_uint( v.y ), fastDivideBy6_uint( v.z )); +} + +#define SAH_LOG_BLOCK_SHIFT 2 + +inline struct BFS_Split BinInfo_reduce( struct BFS_BinInfo* binInfo, const float4 scale ) +{ + const uint subgroupLocalID = 
get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + struct AABB3f boundsX = BinInfo_get_AABB( binInfo, subgroupLocalID, 0 ); + + const float lr_areaX = left_to_right_area16( &boundsX ); + const float rl_areaX = right_to_left_area16( &boundsX ); + + struct AABB3f boundsY = BinInfo_get_AABB( binInfo, subgroupLocalID, 1 ); + + const float lr_areaY = left_to_right_area16( &boundsY ); + const float rl_areaY = right_to_left_area16( &boundsY ); + + struct AABB3f boundsZ = BinInfo_get_AABB( binInfo, subgroupLocalID, 2 ); + + const float lr_areaZ = left_to_right_area16( &boundsZ ); + const float rl_areaZ = right_to_left_area16( &boundsZ ); + + const uint3 counts = BinInfo_get_counts( binInfo, subgroupLocalID ); + + const uint lr_countsX = left_to_right_counts16( counts.x ); + const uint rl_countsX = right_to_left_counts16( counts.x ); + const uint lr_countsY = left_to_right_counts16( counts.y ); + const uint rl_countsY = right_to_left_counts16( counts.y ); + const uint lr_countsZ = left_to_right_counts16( counts.z ); + const uint rl_countsZ = right_to_left_counts16( counts.z ); + + const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ); + const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ); + + const uint3 lr_count = fastDivideBy6_uint3( (uint3)(lr_countsX, lr_countsY, lr_countsZ) + 6 - 1 ); + const uint3 rl_count = fastDivideBy6_uint3( (uint3)(rl_countsX, rl_countsY, rl_countsZ) + 6 - 1 ); + float3 sah = fma( lr_area, convert_float3( lr_count ), rl_area * convert_float3( rl_count ) ); + + /* first bin is invalid */ + sah.x = select( (float)(INFINITY), sah.x, subgroupLocalID != 0 ); + sah.y = select( (float)(INFINITY), sah.y, subgroupLocalID != 0 ); + sah.z = select( (float)(INFINITY), sah.z, subgroupLocalID != 0 ); + + const ulong defaultSplit = (((ulong)as_uint( (float)(INFINITY) )) << 32); + + const ulong bestSplit = getBestSplit( sah, subgroupLocalID, scale, defaultSplit ); + + struct BFS_Split split; + split.sah = as_float( (uint)(bestSplit >> 32) ); + split.dim = (uint)bestSplit & 3; + split.pos = (uint)bestSplit >> 2; + + return split; +} + + +struct BFS_BinInfoReduce3_SLM +{ + uint sah[3*BFS_NUM_BINS]; +}; + + + +inline struct BFS_Split BinInfo_reduce3( local struct BFS_BinInfoReduce3_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale ) +{ + // process each bin/axis combination across sub-groups + for (uint i = get_sub_group_id(); i < 3 * BFS_NUM_BINS; i += get_num_sub_groups()) + { + uint my_bin = i % BFS_NUM_BINS; + uint my_axis = i / BFS_NUM_BINS; + + float3 left_lower = (float3)(INFINITY,INFINITY,INFINITY); + float3 left_upper = -left_lower; + float3 right_lower = (float3)(INFINITY,INFINITY,INFINITY); + float3 right_upper = -right_lower; + + // load the other bins and assign them to the left or to the right + // of this subgroup's bin + uint lane = get_sub_group_local_id(); + struct AABB3f sg_bins = BinInfo_get_AABB(binInfo,lane,my_axis); + + bool is_left = (lane < my_bin); + float3 lower = AABB3f_load_lower(&sg_bins); + float3 upper = AABB3f_load_upper(&sg_bins); + + float3 lower_l = select_min( lower, is_left ); + float3 upper_l = select_max( upper, is_left ); + float3 lower_r = select_min( lower, !is_left ); + float3 upper_r = select_max( upper, !is_left ); + + lower_l = sub_group_reduce_min_float3( lower_l ); + lower_r = sub_group_reduce_min_float3( lower_r ); + upper_l = sub_group_reduce_max_float3( upper_l ); + upper_r = sub_group_reduce_max_float3( upper_r ); + float3 dl = upper_l - lower_l; + float3 dr = upper_r - lower_r; + 
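+        // Half-area of a box with extents d = (dx, dy, dz) is dx*dy + dx*dz + dy*dz,
+        // i.e. the SAH surface-area term up to a constant factor of 2; the two lines
+        // below factor it as dx*(dy + dz) + dy*dz to save one multiply.
+        // Worked example: a 1x2x3 box gives 1*2 + 1*3 + 2*3 = 11, half of its surface area 22.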
float area_l = dl.x* (dl.y + dl.z) + (dl.y * dl.z); + float area_r = dr.x* (dr.y + dr.z) + (dr.y * dr.z); + + // get the counts + uint sg_bin_count = BinInfo_get_count(binInfo, lane, my_axis); + uint count_l = (is_left) ? sg_bin_count : 0; + uint count_r = (is_left) ? 0 : sg_bin_count; + count_l = sub_group_reduce_add(count_l); + count_r = sub_group_reduce_add(count_r); + + // compute sah + count_l = fastDivideBy6_uint(count_l + 6 - 1); + count_r = fastDivideBy6_uint(count_r + 6 - 1); + float lr_partial = area_l * count_l; + float rl_partial = area_r * count_r; + float sah = lr_partial + rl_partial; + + // first bin is invalid + sah = select((float)(INFINITY), sah, my_bin != 0); + + // ignore zero sized dimensions + sah = select( sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) ); + sah = select( sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) ); + sah = select( sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) ); + + // tuck the axis into the bottom bits of sah cost. + // The result is an integer between 0 and +inf (7F800000) + // If we have 3 axes with infinite sah cost, we will select axis 0 + slm->sah[i] = (as_uint(sah)&~0x3) | my_axis; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // reduce split candidates down to one subgroup + // sah is strictly positive, so integer compares can be used + // which results in a faster sub_group_reduce_min() + // + uint best_sah = 0xffffffff; + + uint lid = get_sub_group_local_id(); + if (lid < BFS_NUM_BINS) + { + best_sah = slm->sah[lid]; + lid += BFS_NUM_BINS; + best_sah = min( best_sah, slm->sah[lid] ); + lid += BFS_NUM_BINS; + best_sah = min( best_sah, slm->sah[lid] ); + } + + uint reduced_bestsah = sub_group_reduce_min( best_sah ); + uint best_bin = ctz(intel_sub_group_ballot(best_sah == reduced_bestsah)); + uint best_axis = as_uint(reduced_bestsah) & 0x3; + + struct BFS_Split ret; + ret.sah = as_float(reduced_bestsah); + ret.dim = best_axis; + ret.pos = best_bin; + return ret; +} + + +struct BFS_BinInfoReduce_SLM +{ + struct + { + float sah; + uint bin; + } axisInfo[3]; +}; + + + +inline struct BFS_Split BinInfo_reduce2( local struct BFS_BinInfoReduce_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale, uint num_primrefs) +{ + ushort my_axis = get_sub_group_id(); + ushort my_bin = get_sub_group_local_id(); + + if (my_axis < 3) + { + struct AABB3f aabb = BinInfo_get_AABB(binInfo, my_bin, my_axis); + uint count = BinInfo_get_count(binInfo, my_bin, my_axis); + + float lr_area = left_to_right_area16(&aabb); + float rl_area = right_to_left_area16(&aabb); + + uint lr_count = sub_group_scan_exclusive_add(count); + uint rl_count = num_primrefs - lr_count; + + lr_count = fastDivideBy6_uint(lr_count + 6 - 1); + rl_count = fastDivideBy6_uint(rl_count + 6 - 1); + float lr_partial = lr_area * lr_count; + float rl_partial = rl_area * rl_count; + float sah = lr_partial + rl_partial; + + // first bin is invalid + sah = select((float)(INFINITY), sah, my_bin != 0); + + float best_sah = sub_group_reduce_min( sah ); + uint best_bin = ctz(intel_sub_group_ballot(sah == best_sah)); + + // ignore zero sized dimensions + best_sah = select( best_sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) ); + best_sah = select( best_sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) ); + best_sah = select( best_sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) ); + + if (get_sub_group_local_id() == 0) + { + slm->axisInfo[my_axis].sah = best_sah; + slm->axisInfo[my_axis].bin = best_bin; + } + } + barrier( CLK_LOCAL_MEM_FENCE ); + + float sah = 
(float)(INFINITY); + if( get_sub_group_local_id() < 3 ) + sah = slm->axisInfo[get_sub_group_local_id()].sah; + + float bestsah = min(sub_group_broadcast(sah, 0), min(sub_group_broadcast(sah, 1), sub_group_broadcast(sah, 2))); + uint bestAxis = ctz( intel_sub_group_ballot(bestsah == sah) ); + + struct BFS_Split split; + split.sah = bestsah; + split.dim = bestAxis; + split.pos = slm->axisInfo[bestAxis].bin; + return split; +} + + +inline bool is_left( struct BinMapping* binMapping, struct BFS_Split* split, struct AABB* primref ) +{ + const uint dim = split->dim; + const float lower = primref->lower[dim]; + const float upper = primref->upper[dim]; + const float c = lower + upper; + const uint pos = convert_uint_rtz( (c - binMapping->ofs[dim]) * binMapping->scale[dim] ); + return pos < split->pos; +} + +struct BFS_Pass1_SLM +{ + struct BFS_BinInfo bin_info; +// struct BFS_BinInfoReduce3_SLM reduce3; +}; + + +void DO_BFS_pass1( local struct BFS_Pass1_SLM* slm, + uint thread_primref_id, + bool thread_primref_valid, + struct BFSDispatchArgs args + ) +{ + local struct BFS_BinInfo* local_bin_info = &slm->bin_info; + global struct VContext* context = args.context; + struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root ); // root AABB is initialized to centroid bounds + + struct BinMapping bin_mapping; + BinMapping_init( &bin_mapping, ¢roid_bounds, BFS_NUM_BINS ); + + // fetch this thread's primref + PrimRef ref; + if ( thread_primref_valid ) + ref = args.primref_buffer[thread_primref_id]; + + // init bin info + BinInfo_init( local_bin_info ); + + // fence on local bin-info init + barrier( CLK_LOCAL_MEM_FENCE ); + + // merge this thread's primref into local bin info + BinInfo_add_primref( &bin_mapping, local_bin_info, &ref, thread_primref_valid ); + + // fence on local bin-info update + barrier( CLK_LOCAL_MEM_FENCE ); + + BinInfo_merge(&context->global_bin_info, local_bin_info); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size(BFS_WG_SIZE,1,1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass1_indexed( + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS_Pass1_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals ); + + bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end; + uint thread_primref_id = 0; + if ( thread_primref_valid ) + thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )]; + + DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args ); +} + + +__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass1_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS_Pass1_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals ); + + uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 ); + bool thread_primref_valid = thread_primref_id < args.wg_primref_end; + + DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass1_indexed_batchable( + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ) +{ + local struct BFS_Pass1_SLM slm; + struct 
BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer ); + + bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end; + uint thread_primref_id = 0; + if (thread_primref_valid) + thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)]; + + DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass1_initial_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) +{ + local struct BFS_Pass1_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer ); + + uint thread_primref_id = args.wg_primref_begin + get_local_id(0); + bool thread_primref_valid = thread_primref_id < args.wg_primref_end; + + DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// BVH2 construction -- BFS Phase Pass2 +/// +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct BFS_Pass2_SLM +{ + struct BFS_BinInfoReduce3_SLM reduce3; + //struct AABB3f left_centroid_bounds; + //struct AABB3f right_centroid_bounds; + //struct AABB3f left_geom_bounds; + //struct AABB3f right_geom_bounds; + LRBounds lr_bounds; + uint left_count; + uint right_count; + uint lr_mask; + uint left_primref_base; + uint right_primref_base; +// uint num_wgs; + +// uint output_indices[BFS_WG_SIZE]; +}; + + + + + + + +void DO_BFS_pass2( + local struct BFS_Pass2_SLM* slm, + uint thread_primref_id, + bool thread_primref_valid, + struct BFSDispatchArgs args +) +{ + global struct VContext* context = args.context; + + struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root ); + + // load the thread's primref + PrimRef ref; + if ( thread_primref_valid ) + ref = args.primref_buffer[thread_primref_id]; + + struct BinMapping bin_mapping; + BinMapping_init( &bin_mapping, ¢roid_bounds, BFS_NUM_BINS ); + + // initialize working SLM space + LRBounds_init(&slm->lr_bounds); + if(get_local_id(0) == 0) + { + slm->left_count = 0; + slm->right_count = 0; + + if( args.do_mask_processing ) + slm->lr_mask = 0; + } + + // compute split - every workgroup does the same computation + // local barrier inside BinInfo_reduce3 + struct BFS_Split split = BinInfo_reduce3( &slm->reduce3, &context->global_bin_info,bin_mapping.scale ); + + uint wg_prim_count = args.wg_primref_end - args.wg_primref_begin; + + // partition primrefs into L/R subsets... + bool go_left = false; + if (split.sah == (float)(INFINITY)) // no valid split, split in the middle.. 
This can happen due to floating-point limit cases in huge scenes + go_left = get_local_id(0) < (wg_prim_count / 2); + else + go_left = is_left( &bin_mapping, &split, &ref ); + + // assign this primref a position in the output array, and expand corresponding centroid-bounds + uint local_index; + { + float3 centroid = ref.lower.xyz + ref.upper.xyz; + + uint l_ballot = intel_sub_group_ballot( go_left && thread_primref_valid ); + uint r_ballot = intel_sub_group_ballot( !go_left && thread_primref_valid ); + if (l_ballot) + { + bool active_lane = l_ballot & (1 << get_sub_group_local_id()); + float3 Cmin, Cmax, Gmin, Gmax; + Cmin = select_min( centroid.xyz, active_lane ); + Cmax = select_max( centroid.xyz, active_lane ); + Gmin = select_min( ref.lower.xyz, active_lane ); + Gmax = select_max( ref.upper.xyz, active_lane ); + + Cmin = sub_group_reduce_min_float3( Cmin ); + Cmax = sub_group_reduce_max_float3( Cmax ); + Gmin = sub_group_reduce_min_float3( Gmin ); + Gmax = sub_group_reduce_max_float3( Gmax ); + + LRBounds_merge_left( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax ); + } + + if (r_ballot) + { + bool active_lane = r_ballot & (1 << get_sub_group_local_id()); + float3 Cmin, Cmax, Gmin, Gmax; + Cmin = select_min(centroid.xyz, active_lane); + Cmax = select_max(centroid.xyz, active_lane); + Gmin = select_min(ref.lower.xyz, active_lane); + Gmax = select_max(ref.upper.xyz, active_lane); + + Cmin = sub_group_reduce_min_float3(Cmin); + Cmax = sub_group_reduce_max_float3(Cmax); + Gmin = sub_group_reduce_min_float3(Gmin); + Gmax = sub_group_reduce_max_float3(Gmax); + + LRBounds_merge_right( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax ); + } + + if( args.do_mask_processing ) + { + uint mask =0; + if (thread_primref_valid) + { + mask = PRIMREF_instanceMask(&ref) ; + mask = go_left ? mask : mask<<8; + } + + // TODO OPT: there is no 'sub_group_reduce_or' and IGC does not do the reduction trick + // for atomics on sub-group uniform addresses + for( uint i= get_sub_group_size()/2; i>0; i/= 2) + mask = mask | intel_sub_group_shuffle_down(mask,mask,i); + if( get_sub_group_local_id() == 0 ) + atomic_or_local( &slm->lr_mask, mask ); + } + + uint l_base = 0; + uint r_base = 0; + if( get_sub_group_local_id() == 0 && l_ballot ) + l_base = atomic_add_local( &slm->left_count, popcount(l_ballot) ); + if( get_sub_group_local_id() == 0 && r_ballot ) + r_base = atomic_add_local( &slm->right_count, popcount(r_ballot) ); + + sub_group_barrier( CLK_LOCAL_MEM_FENCE ); + l_base = sub_group_broadcast(l_base,0); + r_base = sub_group_broadcast(r_base,0); + + l_base = l_base + subgroup_bit_prefix_exclusive( l_ballot ); + r_base = r_base + subgroup_bit_prefix_exclusive( r_ballot ); + + local_index = (go_left) ? 
l_base : r_base; + } + + + barrier( CLK_LOCAL_MEM_FENCE ); + + // merge local into global + // TODO_OPT: Look at spreading some of this across subgroups + if ( get_sub_group_id() == 0 ) + { + // allocate primref space for this wg and merge local/global centroid bounds + uint num_left = slm->left_count; + { + if (num_left && get_sub_group_local_id() == 0) + { + num_left = atomic_add_global( &context->num_left, num_left ); + slm->left_primref_base = args.dispatch_primref_begin + num_left; + } + } + uint num_right = slm->right_count; + { + if (num_right && get_sub_group_local_id() == 0) + { + num_right = atomic_add_global( &context->num_right, num_right ); + slm->right_primref_base = (args.dispatch_primref_end - 1) - num_right; + } + } + + if( args.do_mask_processing && get_sub_group_local_id() == 0 ) + atomic_or_global( &context->lr_mask, slm->lr_mask ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + LRBounds_merge( &context->lr_bounds, &slm->lr_bounds ); + + // move thread's primref ID into correct position in output index buffer + if (thread_primref_valid) + { + uint pos = go_left ? slm->left_primref_base + local_index + : slm->right_primref_base - local_index; + + args.primref_index_out[pos] = thread_primref_id; + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +BFS_pass2_indexed( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS_Pass2_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals ); + + bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end; + uint thread_primref_id = 0; + if ( thread_primref_valid ) + thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )]; + + DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +BFS_pass2_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS_Pass2_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals ); + + uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 ); + bool thread_primref_valid = thread_primref_id < args.wg_primref_end; + + DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args ); +} + + +__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass2_indexed_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) +{ + local struct BFS_Pass2_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer ); + + bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end; + uint thread_primref_id = 0; + if (thread_primref_valid) + thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)]; + + DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args); + +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass2_initial_batchable(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* 
globals_buffer) +{ + local struct BFS_Pass2_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer ); + + uint thread_primref_id = args.wg_primref_begin + get_local_id(0); + bool thread_primref_valid = thread_primref_id < args.wg_primref_end; + + DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args); +} + + + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// BVH2 construction -- DFS Phase +/// +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct DFSArgs +{ + uint primref_base; + uint global_bvh2_base; + bool do_mask_processing; + ushort num_primrefs; + global uint* primref_indices_in; + global uint* primref_indices_out; + global PrimRef* primref_buffer; + global struct BVH2* global_bvh2; +}; + + +struct DFSPrimRefAABB +{ + half lower[3]; + half upper[3]; +}; + +void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb ) +{ + bb->lower[0] = 1; + bb->lower[1] = 1; + bb->lower[2] = 1; + bb->upper[0] = 0; + bb->upper[1] = 0; + bb->upper[2] = 0; +} + +void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v ) +{ + aabb->lower[0] = min( aabb->lower[0], v->lower[0] ); + aabb->lower[1] = min( aabb->lower[1], v->lower[1] ); + aabb->lower[2] = min( aabb->lower[2], v->lower[2] ); + aabb->upper[0] = max( aabb->upper[0], v->upper[0] ); + aabb->upper[1] = max( aabb->upper[1], v->upper[1] ); + aabb->upper[2] = max( aabb->upper[2], v->upper[2] ); +} + +half DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb ) +{ + const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]); + return fma( d.x, (d.y + d.z), d.y * d.z ); +} + +struct DFSPrimRef +{ + struct DFSPrimRefAABB aabb; + ushort2 meta; +}; + +void DFSPrimRef_SetBVH2Root( struct DFSPrimRef* ref, ushort root ) +{ + ref->meta.y = root; +} + +uint DFSPrimRef_GetInputIndex( struct DFSPrimRef* ref ) +{ + return ref->meta.x; +} + +uint DFSPrimRef_GetBVH2Parent( struct DFSPrimRef* ref ) +{ + return ref->meta.y; +} + + +struct PrimRefSet +{ + struct DFSPrimRefAABB AABB[DFS_WG_SIZE]; + ushort2 meta[DFS_WG_SIZE]; + uint input_indices[DFS_WG_SIZE]; +}; + + + + +local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id ) +{ + return &refs->AABB[id]; +} +struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id ) +{ + struct DFSPrimRef r; + r.aabb = refs->AABB[id]; + r.meta = refs->meta[id]; + return r; +} +void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id ) +{ + refs->AABB[id] = ref.aabb; + refs->meta[id] = ref.meta; +} + +void PrimRefSet_SetPrimRef_FullPrecision( struct AABB3f* root_aabb, local struct PrimRefSet* refs, PrimRef ref, ushort id ) +{ + float3 root_l = AABB3f_load_lower( root_aabb ); + float3 root_u = AABB3f_load_upper( root_aabb ); + float3 d = root_u - root_l; + float scale = 1.0f / max( d.x, max( d.y, d.z ) ); + + float3 l = ref.lower.xyz; + float3 u = ref.upper.xyz; + half3 lh = convert_half3_rtz( (l - root_l) * scale ); + half3 uh = convert_half3_rtp( (u - root_l) * scale ); + + refs->AABB[id].lower[0] = lh.x; + refs->AABB[id].lower[1] = lh.y; + refs->AABB[id].lower[2] = lh.z; + refs->AABB[id].upper[0] = uh.x; + refs->AABB[id].upper[1] = uh.y; + refs->AABB[id].upper[2] = uh.z; + refs->meta[id].x = id; + refs->meta[id].y = 0; +} + + + +void DFS_CreatePrimRefSet( struct DFSArgs 
args, + local struct PrimRefSet* prim_refs ) +{ + ushort id = get_local_id( 0 ); + ushort num_primrefs = args.num_primrefs; + + struct AABB3f box = BVH2_GetNodeBox( args.global_bvh2, args.global_bvh2_base ); + if ( id < num_primrefs ) + { + PrimRef ref = args.primref_buffer[args.primref_indices_in[id]]; + prim_refs->input_indices[id] = args.primref_indices_in[id]; + PrimRefSet_SetPrimRef_FullPrecision( &box, prim_refs, ref, id ); + } +} + +struct ThreadRangeInfo +{ + uchar start; + uchar local_num_prims; + uchar bvh2_root; + bool active; +}; + +struct BVHBuildLocals // size: ~3.8K +{ + uchar2 axis_and_left_count[ DFS_WG_SIZE ]; + struct ThreadRangeInfo range[ DFS_WG_SIZE ]; + uint sah[ DFS_WG_SIZE ]; +}; + +#define LOCAL_BVH2_NODE_COUNT (2*(DFS_WG_SIZE) -1) + +struct LocalBVH2 +{ + uint nodes[LOCAL_BVH2_NODE_COUNT]; + uint num_nodes; + + // bit layout is for a node is + // uchar child_ptr; // this is right_child_index >> 1. right child's msb is always 0 + // uchar primref_base; // index of the node's first primref. will be 0 at the root + // uchar parent_dist; // distance in nodes from this node to its parent + // uchar prim_counter; // number of prims in this subtree. For a complete tree (256 prims), the root may be off by 1 + + // for a WG size of 256, 8b is enough for parent distance, because the tree is built in level order + // the maximum distance between parent and child occurs for a complete tree. + // in this scenario the left-most leaf has index 255, its parent has index 127, the deltas to the children are 128 and 129 +}; + + +void LocalBVH2_Initialize( struct LocalBVH2* bvh2, ushort num_prims ) +{ + bvh2->num_nodes = 1; + bvh2->nodes[0] = min(num_prims,(ushort)255); +} + + + +void LocalBVH2_Initialize_Presplit(struct LocalBVH2* bvh2, ushort num_prims, ushort left_count, ushort right_count ) +{ + bvh2->num_nodes = 3; + bvh2->nodes[0] = min(num_prims, (ushort)255); + + ushort bvh2_root = 0; + ushort child_place = 1; + + uint child_ptr = (child_place + 1) >> 1; + bvh2->nodes[bvh2_root] |= (child_ptr) << 24; + + uint parent_dist = child_place - bvh2_root; + + // initialize child nodes + ushort primref_base_left = 0; + ushort primref_base_right = left_count; + uint left = (primref_base_left << 16) + ((parent_dist << 8)) + left_count; + uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8) + right_count; + bvh2->nodes[child_place] = left; + bvh2->nodes[child_place + 1] = right; +} + + +void LocalBVH2_CreateInnerNode( local struct LocalBVH2* bvh2, ushort bvh2_root, uint primref_base_left, uint primref_base_right ) +{ + ushort child_place = atomic_add_local( &(bvh2-> num_nodes), 2 ); + + uint child_ptr = (child_place + 1) >> 1; + bvh2->nodes[bvh2_root] |= (child_ptr) << 24; + + uint parent_dist = child_place - bvh2_root; + + // initialize child nodes + uint left = (primref_base_left << 16) + ((parent_dist << 8)); + uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8); + bvh2->nodes[child_place] = left; + bvh2->nodes[child_place + 1] = right; +} + +ushort2 LocalBVH2_GetChildIndices( struct LocalBVH2* bvh2, ushort bvh2_root ) +{ + ushort right_idx = (bvh2->nodes[bvh2_root] & 0xff000000) >> 23; + return (ushort2)(right_idx - 1, right_idx); +} + + +ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* bvh2, ushort bvh2_root ) +{ + // increment only the lower 8 bits. 
Algorithm will not overflow by design + return atomic_inc_local( &bvh2->nodes[bvh2_root] ) & 0xff; +} + +ushort LocalBVH2_SetLeafPrimCount(local struct LocalBVH2* bvh2, ushort bvh2_root, ushort count) +{ + return bvh2->nodes[bvh2_root] |= (count& 0xff); +} + +bool LocalBVH2_IsRoot( struct LocalBVH2* bvh2, ushort node_id ) +{ + return node_id == 0; +} + +ushort LocalBVH2_GetLeafPrimrefStart( struct LocalBVH2* bvh2, ushort bvh2_node_id ) +{ + return (bvh2->nodes[bvh2_node_id] >> 16) & 255; +} + +bool LocalBVH2_IsLeftChild( struct LocalBVH2* bvh2, ushort parent_node, ushort current_node ) +{ + return (current_node & 1); // nodes are allocated in pairs. first node is root, left child is an odd index +} + +ushort LocalBVH2_GetParent( struct LocalBVH2* bvh2, ushort node ) +{ + return node - ((bvh2->nodes[node] >> 8) & 255); +} + +uint LocalBVH2_GetNodeCount( struct LocalBVH2* bvh2 ) +{ + return bvh2->num_nodes; +} + +bool LocalBVH2_IsLeaf( struct LocalBVH2* bvh2, ushort node_index ) +{ + return (bvh2->nodes[node_index] & 255) <= TREE_ARITY; +} + +ushort LocalBVH2_GetLeafPrimCount( struct LocalBVH2* bvh2, ushort node_index ) +{ + return (bvh2->nodes[node_index] & 255); +} + +void DFS_ConstructBVH2( local struct LocalBVH2* bvh2, + local struct PrimRefSet* prim_refs, + ushort bvh2_root, + ushort prim_range_start, + ushort local_num_prims, + ushort global_num_prims, + local struct BVHBuildLocals* locals, + local uint* num_active_threads ) +{ + ushort tid = get_local_id( 0 ); + ushort primref_position = tid; + + bool active_thread = tid < global_num_prims; + + // Handle cases where initial binner creates leaves + if ( active_thread && local_num_prims <= TREE_ARITY ) + { + struct DFSPrimRef ref = PrimRefSet_GetPrimRef(prim_refs, primref_position); + DFSPrimRef_SetBVH2Root(&ref, bvh2_root); + PrimRefSet_SetPrimRef(prim_refs, ref, primref_position); + active_thread = false; + if (primref_position == prim_range_start) + atomic_sub_local(num_active_threads, local_num_prims); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + locals->range[ tid ].start = prim_range_start; + locals->range[ tid ].local_num_prims = local_num_prims; + locals->range[ tid ].bvh2_root = bvh2_root; + locals->range[ tid ].active = active_thread; + + do + { + if(active_thread && prim_range_start == primref_position) + locals->sah[primref_position] = UINT_MAX; + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( active_thread ) + { + local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); + + // each thread evaluates a possible split candidate. 
Scan primrefs and compute sah cost + // do this axis-by-axis to keep register pressure low + float best_sah = INFINITY; + ushort best_axis = 3; + ushort best_count = 0; + + struct DFSPrimRefAABB box_left[3]; + struct DFSPrimRefAABB box_right[3]; + float CSplit[3]; + ushort count_left[3]; + + for ( ushort axis = 0; axis < 3; axis++ ) + { + DFSPrimRefAABB_init( &box_left[axis] ); + DFSPrimRefAABB_init( &box_right[axis] ); + + CSplit[axis] = my_box->lower[axis] + my_box->upper[axis]; + count_left[axis] = 0; + } + + // scan primrefs in our subtree and partition using this thread's prim as a split plane + { + struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start ); + + for ( ushort p = 1; p < local_num_prims; p++ ) + { + struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration + + for( ushort axis = 0; axis < 3; axis++ ) + { + float c = box.lower[axis] + box.upper[axis]; + + if ( c < CSplit[axis] ) + { + // this primitive is to our left. + DFSPrimRefAABB_extend( &box_left[axis], &box ); + count_left[axis]++; + } + else + { + // this primitive is to our right + DFSPrimRefAABB_extend( &box_right[axis], &box ); + } + } + + box = next_box; + } + + // last iteration without preloading box + for( ushort axis = 0; axis < 3; axis++ ) + { + float c = box.lower[axis] + box.upper[axis]; + + if ( c < CSplit[axis] ) + { + // this primitive is to our left. + DFSPrimRefAABB_extend( &box_left[axis], &box ); + count_left[axis]++; + } + else + { + // this primitive is to our right + DFSPrimRefAABB_extend( &box_right[axis], &box ); + } + } + + } + + for ( ushort axis = 0; axis < 3; axis++ ) + { + float Al = DFSPrimRefAABB_halfArea( &box_left[axis] ); + float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] ); + + // Avoid NANs in SAH calculation in the corner case where all prims go right + // In this case we set Al=Ar, because such a split will only be selected if all primrefs + // are co-incident.. 
In that case, we will fall back to split-in-the-middle and both subtrees + // should store the same quantized area value + if ( count_left[axis] == 0 ) + Al = Ar; + + // compute sah cost + ushort count_right = local_num_prims - count_left[axis]; + float sah = Ar * count_right + Al * count_left[axis]; + + // keep this split if it is better than the previous one, or if the previous one was a corner-case + if ( sah < best_sah || best_count == 0 ) + { + // yes, keep it + best_axis = axis; + best_sah = sah; + best_count = count_left[axis]; + } + } + + // write split information to SLM + locals->axis_and_left_count[primref_position].x = best_axis; + locals->axis_and_left_count[primref_position].y = best_count; + uint sah = as_uint(best_sah); + // break ties by axis to ensure deterministic split selection + // otherwise builder can produce non-deterministic tree structure run to run + // based on the ordering of primitives (which can vary due to non-determinism in atomic counters) + // Embed split axis and index into sah value; compute min over sah and max over axis + sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | tid ); + + // reduce on split candidates in our local subtree and decide the best one + atomic_min_local( &locals->sah[ prim_range_start ], sah); + } + + + barrier( CLK_LOCAL_MEM_FENCE ); + + ushort split_index = locals->sah[ prim_range_start ] & 255; + ushort split_axis = locals->axis_and_left_count[split_index].x; + ushort split_left_count = locals->axis_and_left_count[split_index].y; + + if ( (primref_position == split_index) && active_thread ) + { + // first thread in a given subtree creates the inner node + ushort start_left = prim_range_start; + ushort start_right = prim_range_start + split_left_count; + if ( split_left_count == 0 ) + start_right = start_left + (local_num_prims / 2); // handle split-in-the-middle case + + LocalBVH2_CreateInnerNode( bvh2, bvh2_root, start_left, start_right ); + } + + + barrier( CLK_LOCAL_MEM_FENCE ); + + struct DFSPrimRef ref; + ushort new_primref_position; + + if ( active_thread ) + { + ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); + bool go_left; + + if ( split_left_count == 0 ) + { + // We chose a split with no left-side prims + // This will only happen if all primrefs are located in the exact same position + // In that case, fall back to split-in-the-middle + split_left_count = (local_num_prims / 2); + go_left = (primref_position - prim_range_start < split_left_count); + } + else + { + // determine what side of the split this thread's primref belongs on + local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); + local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index ); + float c = my_box->lower[split_axis] + my_box->upper[split_axis]; + float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis]; + go_left = c < Csplit; + } + + // adjust state variables for next loop iteration + bvh2_root = (go_left) ? kids.x : kids.y; + local_num_prims = (go_left) ? split_left_count : (local_num_prims - split_left_count); + prim_range_start = (go_left) ? 
prim_range_start : prim_range_start + split_left_count; + + // determine the new primref position by incrementing a counter in the destination subtree + new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root ); + + // load our primref from its previous position + ref = PrimRefSet_GetPrimRef( prim_refs, primref_position ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( active_thread ) + { + // write our primref into its sorted position and note which node it went in + DFSPrimRef_SetBVH2Root( &ref, bvh2_root ); + PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position ); + primref_position = new_primref_position; + + + // deactivate all threads whose subtrees are small enough to form a leaf + if ( local_num_prims <= TREE_ARITY ) + { + active_thread = false; + if( primref_position == prim_range_start ) + atomic_sub_local( num_active_threads, local_num_prims ); + } + + locals->range[ primref_position ].start = prim_range_start; + locals->range[ primref_position ].local_num_prims = local_num_prims; + locals->range[ primref_position ].bvh2_root = bvh2_root; + locals->range[ primref_position ].active = active_thread; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // if we'll have next iteration then load from SLM + if(*num_active_threads) + { + prim_range_start = locals->range[ tid ].start; + local_num_prims = locals->range[ tid ].local_num_prims; + bvh2_root = locals->range[ tid ].bvh2_root; + active_thread = locals->range[ tid ].active; + primref_position = tid; + } + else + { + break; + } + + } while ( true ); + +} + + +#define REFIT_BIT_DWORDS (LOCAL_BVH2_NODE_COUNT - DFS_WG_SIZE)/32 + +struct RefitBits +{ + uint bits[REFIT_BIT_DWORDS]; +}; + +struct DFS_SLM +{ + union + { + struct LocalBVH2 bvh2; + struct { + struct AABB3f centroid_bounds; + uint left_count; + uint right_count; + struct BFS_BinInfo bins; + struct BFS_BinInfoReduce3_SLM reduce3; + } binning; + + } u1; + + union + { + struct { + struct PrimRefSet prim_refs; + struct BVHBuildLocals locals; + } pass0; + + struct AABB3f node_boxes[LOCAL_BVH2_NODE_COUNT]; + + } u2; + + union + { + uchar bytes[DFS_WG_SIZE]; + uint dwords[DFS_WG_SIZE/4]; + } mask_info; + + struct RefitBits refit_bits; + +}; + + +void DFS_InitialBinningPass( + local struct BFS_BinInfo* bins, + local struct BFS_BinInfoReduce3_SLM* reduce3, + uniform local struct AABB3f* centroid_bounds, + local struct PrimRefSet* refs, + local uint* left_counter, + local uint* right_counter, + ushort num_refs ) +{ + uint tid = get_local_id(0); + + // initialize SLM structures + if (tid == 0) + { + AABB3f_init(centroid_bounds); + *left_counter = 0; + *right_counter = 0; + } + + BinInfo_init(bins); + + PrimRef ref; + struct DFSPrimRef dfs_ref; + + if (tid < num_refs) + { + dfs_ref = PrimRefSet_GetPrimRef(refs, tid); + struct DFSPrimRefAABB box = dfs_ref.aabb; + ref.lower.xyz = (float3)(box.lower[0], box.lower[1], box.lower[2]); + ref.upper.xyz = (float3)(box.upper[0], box.upper[1], box.upper[2]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // compute centroid bounds so that we can bin + if (tid < num_refs) + { + float3 centroid = ref.lower.xyz + ref.upper.xyz; + Uniform_AABB3f_atomic_merge_local_sub_group_lu(centroid_bounds, centroid, centroid); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // add primrefs to bins + struct BinMapping mapping; + BinMapping_init(&mapping, centroid_bounds, BFS_NUM_BINS); + + BinInfo_add_primref( &mapping, bins, &ref, tidu1.bvh2; + + global struct BVH2* global_bvh2 = args.global_bvh2; + + PrimRef ref; + uint parent_node; + + { 
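+        // Pass 0: build the local BVH2 topology in SLM.
+        //  - DFS_CreatePrimRefSet quantizes this WG's primrefs into half-precision boxes,
+        //  - an initial binned split seeds the tree when the primref count exceeds 32,
+        //  - DFS_ConstructBVH2 then partitions the primrefs top-down into LocalBVH2 leaves.
+        // On exit each thread keeps its full-precision PrimRef and the leaf it landed in
+        // (parent_node) for the bottom-up refit that follows this scope.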
+ local struct BVHBuildLocals* locals = &slm->u2.pass0.locals; + local struct PrimRefSet* prim_refs = &slm->u2.pass0.prim_refs; + + DFS_CreatePrimRefSet(args, prim_refs); + + uint local_id = get_local_id(0); + + ushort bvh2_root = 0; + ushort prim_range_start = 0; + ushort local_num_prims = args.num_primrefs; + + if(local_id == 0) + *num_active_threads = local_num_prims; + + // barrier for DFS_CreatePrimRefSet and num_active_threads + barrier(CLK_LOCAL_MEM_FENCE); + + // initial binning pass if number of primrefs is large + if( args.num_primrefs > 32 ) + { + DFS_InitialBinningPass(&slm->u1.binning.bins, &slm->u1.binning.reduce3, &slm->u1.binning.centroid_bounds, prim_refs, + &slm->u1.binning.left_count, &slm->u1.binning.right_count, args.num_primrefs); + + barrier(CLK_LOCAL_MEM_FENCE); + + ushort left_count = slm->u1.binning.left_count; + ushort right_count = args.num_primrefs - left_count; + if (get_local_id(0) == 0) + LocalBVH2_Initialize_Presplit(bvh2, args.num_primrefs, left_count, right_count); + + bvh2_root = (local_id < left_count) ? 1 : 2; + local_num_prims = (local_id < left_count) ? left_count : right_count; + prim_range_start = (local_id < left_count) ? 0 : left_count; + } + else + { + if (get_local_id(0) == 0) + LocalBVH2_Initialize(bvh2, args.num_primrefs); + } + + DFS_ConstructBVH2( bvh2, prim_refs, bvh2_root, prim_range_start, local_num_prims, args.num_primrefs, locals, num_active_threads); + + // move the prim refs into their sorted position + // keep this thread's primref around for later use + if ( local_id < args.num_primrefs ) + { + struct DFSPrimRef dfs_ref = PrimRefSet_GetPrimRef( prim_refs, local_id ); + + uint input_id = DFSPrimRef_GetInputIndex( &dfs_ref ); + + parent_node = DFSPrimRef_GetBVH2Parent( &dfs_ref ); + + uint primref_index = prim_refs->input_indices[input_id]; + ref = args.primref_buffer[primref_index]; + args.primref_indices_out[local_id] = primref_index; + args.primref_indices_in[local_id] = primref_index; + // these buffers are not read again until the end of kernel + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + } + + + // initialize flags for determining when subtrees are done refit + if ( get_local_id( 0 ) < REFIT_BIT_DWORDS ) + slm->refit_bits.bits[get_local_id( 0 )] = 0; + + + // stash full-precision primref AABBs in slm storage + local struct AABB3f* slm_boxes = &slm->u2.node_boxes[0]; + bool active_thread = get_local_id( 0 ) < args.num_primrefs; + if( active_thread ) + { + AABB3f_set( &slm_boxes[get_local_id( 0 )], ref.lower.xyz, ref.upper.xyz ); + + // stash instance masks in SLM storage + if( args.do_mask_processing ) + slm->mask_info.bytes[get_local_id(0)] = PRIMREF_instanceMask( &ref ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // Refit leaf nodes + uint box_index; + if ( active_thread ) + { + // the thread for the first primref in every leaf is the one that will ascend + // remaining threads merge their AABB/mask into the first one and terminate + uint first_ref = LocalBVH2_GetLeafPrimrefStart( bvh2, parent_node ); + if ( first_ref != get_local_id( 0 ) ) + { + AABB3f_atomic_merge_local_lu( &slm_boxes[first_ref], ref.lower.xyz, ref.upper.xyz ); + + if( args.do_mask_processing ) + { + uint dword_index = first_ref/4; + uint shift = (first_ref%4)*8; + uint mask = PRIMREF_instanceMask(&ref) << shift; + atomic_or_local( &slm->mask_info.dwords[dword_index], mask ); + } + active_thread = false; // switch off all primref threads except the first one + } + + box_index = first_ref; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( active_thread ) + { + 
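+        // One surviving thread per leaf (the owner of the leaf's first primref) writes the
+        // leaf node out to the global BVH2 and then starts the upward walk.
+        // Each parent has a flag bit in refit_bits that its children toggle with an atomic XOR:
+        // the first child to arrive sees the bit clear and retires, the second sees it set and
+        // proceeds to merge both child boxes at the parent, so every inner node is refit once.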
uint current_node = parent_node; + parent_node = LocalBVH2_GetParent( bvh2, current_node ); + + // write out the leaf node's AABB + uint num_prims = LocalBVH2_GetLeafPrimCount( bvh2, current_node ); + uint prim_offs = args.primref_base + LocalBVH2_GetLeafPrimrefStart( bvh2, current_node ); + + uint mask = 0xff; + if( args.do_mask_processing ) + mask = slm->mask_info.bytes[box_index]; + + BVH2_WriteLeafNode( global_bvh2, args.global_bvh2_base + current_node, &slm_boxes[box_index], prim_offs, num_prims, mask ); + + // we no longer need the BVH2 bits for this node, so re-purpose the memory to store the AABB index + bvh2->nodes[current_node] = box_index; + + // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed + uint thread_mask = (1 << (parent_node % 32)); + if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], thread_mask ) & thread_mask) == 0 ) + active_thread = false; + } + + // count how many active threads in sub_group we have and increment wg's number of active threads + uint sg_active = sub_group_reduce_add(active_thread ? 1 : 0); + if(get_sub_group_local_id() == 0) + { + atomic_add_local(num_active_threads, sg_active); + } + + // refit internal nodes: + // walk up the tree and refit AABBs + + do + { + barrier( CLK_LOCAL_MEM_FENCE ); // we need this barrier because we need to make sure all threads read num_active_threads before modifying it + if ( active_thread ) + { + uint current_node = parent_node; + parent_node = LocalBVH2_GetParent( bvh2, current_node ); + + // pull left/right box indices from current node + ushort2 kids = LocalBVH2_GetChildIndices( bvh2, current_node ); + + uint left_box = bvh2->nodes[kids.x]; + uint right_box = bvh2->nodes[kids.y]; + + struct AABB3f left = slm_boxes[left_box]; + struct AABB3f right = slm_boxes[right_box]; + AABB3f_extend( &left, &right ); + + uint2 child_offsets = (uint2)( + args.global_bvh2_base + kids.x, + args.global_bvh2_base + kids.y); + + uint mask = 0xff; + if( args.do_mask_processing ) + { + mask = slm->mask_info.bytes[left_box] + | slm->mask_info.bytes[right_box]; + slm->mask_info.bytes[left_box] = mask; + } + + BVH2_WriteInnerNode( args.global_bvh2, args.global_bvh2_base+current_node, &left, child_offsets, mask ); + + slm_boxes[left_box] = left; + bvh2->nodes[current_node] = left_box; + + // stop at the root + if ( LocalBVH2_IsRoot( bvh2, current_node ) ) + { + active_thread = false; + atomic_dec_local(num_active_threads); + } + else + { + // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed + uint mask = (1 << (parent_node % 32)); + if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], mask ) & mask) == 0 ) + { + active_thread = false; + atomic_dec_local(num_active_threads); + } + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + } while ( *num_active_threads > 0 ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size(DFS_WG_SIZE,1,1) )) +__attribute__( (intel_reqd_sub_group_size(16)) ) +kernel void +DFS( global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ) +{ + local struct DFS_SLM slm; + local struct DFSDispatchRecord record; + local uint num_active_threads; + + if ( get_local_id( 0 ) == 0 ) + { + // pop an entry off the DFS dispatch queue + //uint wg_index = atomic_dec_global( &scheduler->num_dfs_wgs ) - 1; + //record = scheduler->dfs_queue.records[wg_index]; + + // TODO: The version above races, but is considerably faster... 
investigate + uint wg_index = get_group_id(0); + record = scheduler->dfs_queue.records[wg_index]; + write_mem_fence( CLK_LOCAL_MEM_FENCE ); + atomic_dec_global( &scheduler->num_dfs_wgs ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + + bool odd_pass = record.tree_depth & 1; + + global struct SAHBuildGlobals* sah_globals = globals_buffer + record.batch_index; + + struct DFSArgs args; + args.num_primrefs = record.num_primrefs; + args.primref_indices_in = SAHBuildGlobals_GetPrimrefIndices_In( sah_globals, odd_pass ); + args.primref_indices_out = SAHBuildGlobals_GetPrimrefIndices_Out( sah_globals, odd_pass ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( sah_globals ); + args.global_bvh2 = SAHBuildGlobals_GetBVH2( sah_globals ); + args.primref_indices_in += record.primref_base; + args.primref_indices_out += record.primref_base; + args.primref_base = record.primref_base; + args.global_bvh2_base = record.bvh2_base; + args.do_mask_processing = SAHBuildGlobals_NeedMasks( sah_globals ); + + Do_DFS( args, &slm, &num_active_threads ); + +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// BVH2 to BVH6 +/// +///////////////////////////////////////////////////////////////////////////////////////////////// + + + +struct BuildFlatTreeArgs +{ + ushort leaf_size_in_bytes; + ushort leaf_type; + ushort inner_node_type; + bool do_mask_processing; + + global uint* primref_indices; + global PrimRef* primref_buffer; + global struct Globals* globals; + global struct BVHBase* bvh_base; + global struct BVH2* bvh2; +}; + + +// lane i in the return value is the index of the ith largest primref in the input +// the return value can be used with shuffle() to move data into its sorted position +// the elements of 'key' must be unique.. only the first 6 elements are sorted +varying ushort SUBGROUP_get_sort_indices_N6( varying uint key ) +{ + // each lane computes the number of items larger than it + // this is its position in the descending order + // TODO_OPT: Compiler can vectorize these uint16 adds by packing into lower and upper halves of same GPR.... make sure it does it + // if compiler is not generating optimal code, consider moving to Cm + + varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0; + varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0; + varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0; + varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0; + varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0; + varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0; + varying ushort a = cmp0 + cmp2 + cmp4; + varying ushort b = cmp1 + cmp3 + cmp5; + varying ushort num_larger = a + b; + + // each lane determines which of the input elements it should pull + varying ushort lane = get_sub_group_local_id(); + a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0; + b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0; + a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0; + b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0; + a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0; + b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0; + return a + b; +} + +uint SUBGROUP_area_to_sort_key( varying float area, uniform ushort num_children ) +{ + varying ushort lane = get_sub_group_local_id(); + area = (lane < num_children) ? 
area : 0; // put inactive nodes last + + // drop LSBs and break ties by lane number to ensure unique keys + // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal. + // If we do not do this it can lead to non-deterministic tree structure + return (as_uint(area) & 0xffffff80) + (lane^(get_sub_group_size()-1)); +} + +// lane i in the return value is the index of the ith largest primref in the input +// the return value can be used with shuffle() to move data into its sorted position +// the elements of 'key' must be unique.. only the first 6 elements are sorted +varying ushort SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16( varying uint key ) +{ + // each lane computes the number of items larger than it + // this is its position in the descending order + // TODO_OPT: Compiler can vectorize these uint16 adds by packing into lower and upper halves of same GPR.... make sure it does it + // if compiler is not generating optimal code, consider moving to Cm + + varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0; + varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0; + varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0; + varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0; + varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0; + varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0; + varying ushort a = cmp0 + cmp2 + cmp4; + varying ushort b = cmp1 + cmp3 + cmp5; + varying ushort num_larger = a + b; + + varying ushort cmp0_1 = (sub_group_broadcast(key, 8) > key) ? 1 : 0; + varying ushort cmp1_1 = (sub_group_broadcast(key, 9) > key) ? 1 : 0; + varying ushort cmp2_1 = (sub_group_broadcast(key, 10) > key) ? 1 : 0; + varying ushort cmp3_1 = (sub_group_broadcast(key, 11) > key) ? 1 : 0; + varying ushort cmp4_1 = (sub_group_broadcast(key, 12) > key) ? 1 : 0; + varying ushort cmp5_1 = (sub_group_broadcast(key, 13) > key) ? 1 : 0; + varying ushort a_1 = cmp0_1 + cmp2_1 + cmp4_1; + varying ushort b_1 = cmp1_1 + cmp3_1 + cmp5_1; + varying ushort num_larger_1 = a_1 + b_1; + + // each lane determines which of the input elements it should pull + varying ushort lane = get_sub_group_local_id(); + if(lane < 8) + { + a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0; + b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0; + a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0; + b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0; + a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0; + b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0; + } + else + { + a = (sub_group_broadcast(num_larger_1, 8) == lane-8) ? 8 : 8; + b = (sub_group_broadcast(num_larger_1, 9) == lane-8) ? 1 : 0; + a += (sub_group_broadcast(num_larger_1, 10) == lane-8) ? 2 : 0; + b += (sub_group_broadcast(num_larger_1, 11) == lane-8) ? 3 : 0; + a += (sub_group_broadcast(num_larger_1, 12) == lane-8) ? 4 : 0; + b += (sub_group_broadcast(num_larger_1, 13) == lane-8) ? 5 : 0; + } + + return a + b; +} + +uint SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16( varying float area, uniform ushort num_children ) +{ + varying ushort lane = get_sub_group_local_id() % 8; + area = (lane < num_children) ? area : 0; // put inactive nodes last + + // drop LSBs and break ties by lane number to ensure unique keys + // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal. 
+ // If we do not do this it can lead to non-deterministic tree structure + return (as_uint(area) & 0xffffff80) + (lane^7); +} + +ushort SUBGROUP_BuildFlatTreeNode( + uniform struct BuildFlatTreeArgs args, + uniform uint bvh2_root, + uniform struct InternalNode* qnode, + uniform uint qnode_index, + varying uint3* sg_children_out // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z) + // if a leaf is created, receives number of primrefs (z) +) // return value is the number of child nodes or 0 for a leaf +{ + global struct BVH2* bvh2 = args.bvh2; + varying ushort lane = get_sub_group_local_id(); + + global struct BVHBase* base = args.bvh_base; + + + if ( !BVH2_IsInnerNode( bvh2, bvh2_root ) ) + { + uniform ushort num_prims = BVH2_GetLeafPrimCount( bvh2, bvh2_root ); + uniform uint primref_start = BVH2_GetLeafPrimStart( bvh2, bvh2_root ); + varying uint primref_index = primref_start + ((lane < num_prims) ? lane : 0); + + varying uint ref_id = args.primref_indices[primref_index]; + varying PrimRef ref = args.primref_buffer[ref_id]; + uniform char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); + uniform char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes; + + uniform int offset = (int)(leaf_mem - (char*)qnode); + offset = offset >> 6; + + varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&ref), num_prims ); + varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key); + ref = PrimRef_sub_group_shuffle(&ref, sort_index); + ref_id = intel_sub_group_shuffle(ref_id, sort_index); + + if (lane < num_prims) + args.primref_indices[primref_index] = ref_id; + + uint global_num_prims = args.globals->numPrimitives; + char* bvh_mem = (char*) args.bvh_base; + + if(lane < num_prims) + args.primref_indices[primref_index + global_num_prims] = qnode - (struct InternalNode*)bvh_mem; + + if (args.leaf_type == NODE_TYPE_INSTANCE) + subgroup_setInstanceQBVHNodeN( offset, &ref, num_prims, (struct QBVHNodeN*)qnode, lane < num_prims ? PRIMREF_instanceMask(&ref) : 0 ); + else + subgroup_setQBVHNodeN( offset, args.leaf_type, &ref, num_prims, (struct QBVHNodeN*)qnode, BVH_NODE_DEFAULT_MASK ); + + sg_children_out->z = num_prims; + return 0; + } + else + { + // collapse BVH2 into BVH6. + // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough + uniform ushort num_children = 2; + + uniform uint2 kids = BVH2_GetChildIndices( bvh2, bvh2_root ); + varying uint sg_bvh2_node = kids.x; + if ( lane == 1 ) + sg_bvh2_node = kids.y; + + do + { + // choose the inner node with maximum area to replace. + // Its left child goes in its old location. Its right child goes in a new lane + + // TODO_OPT: We re-read the AABBs again and again to compute area + // ... store per-lane boxes instead and pre-compute areas + + varying float sg_area = BVH2_GetNodeArea( bvh2, sg_bvh2_node ); + varying bool sg_is_inner = BVH2_IsInnerNode( bvh2, sg_bvh2_node ); + sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf + + uniform float max_area = sub_group_reduce_max_N6( sg_area ); + varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner; + uniform uint mask = intel_sub_group_ballot( sg_reducable ); + + // TODO_OPT: Some of these ops seem redundant.. 
look at trimming further + + if ( mask == 0 ) + break; + + // choose the inner node with maximum area to replace + uniform ushort victim_child = ctz( mask ); + uniform uint victim_node = sub_group_broadcast( sg_bvh2_node, victim_child ); + kids = BVH2_GetChildIndices( bvh2, victim_node ); + + if ( lane == victim_child ) + sg_bvh2_node = kids.x; + else if ( lane == num_children ) + sg_bvh2_node = kids.y; + + num_children++; + + } while ( num_children < TREE_ARITY ); + + // allocate inner node space + uniform uint kids_offset; + if (get_sub_group_local_id() == 0) + kids_offset = allocate_inner_nodes( args.bvh_base, num_children ); + kids_offset = sub_group_broadcast(kids_offset, 0); + + uniform struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset); + uniform int offset = (int)((char*)kid - (char*)qnode) >> 6; + +#if 0 + uniform uint kids_offset; + if ( get_sub_group_local_id() == 0 ) + kids_offset = alloc_node_mem( args.globals, sizeof( struct QBVHNodeN ) * num_children ); + kids_offset = sub_group_broadcast( kids_offset, 0 ); + + + // create inner node + uniform struct QBVHNodeN* kid = (struct QBVHNodeN*) ((char*)(args.bvh_base) + kids_offset); + uniform int offset = (int)((char*)kid - (char*)qnode) >> 6; +#endif + uniform uint child_type = args.inner_node_type; + + // sort child nodes in descending order by AABB area + varying struct AABB box = BVH2_GetAABB( bvh2, sg_bvh2_node ); + varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&box), num_children ); + varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key); + box = AABB_sub_group_shuffle(&box, sort_index); + sg_bvh2_node = intel_sub_group_shuffle(sg_bvh2_node, sort_index); + + uniform uint node_mask = (args.do_mask_processing) ? BVH2_GetMask( bvh2, bvh2_root ) : 0xff; + + subgroup_setQBVHNodeN( offset, child_type, &box, num_children, (struct QBVHNodeN*)qnode, node_mask ); + + // return child information + *sg_children_out = (uint3)(sg_bvh2_node, qnode_index + offset + get_sub_group_local_id(), num_children ); + return num_children; + } +} + +ushort SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16( + uniform struct BuildFlatTreeArgs args, + varying uint bvh2_root, + varying struct InternalNode* qnode_base, + varying uint qnode_index, + varying uint3* sg_children_out, // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z) + // if a leaf is created, receives number of primrefs (z) + bool active_lane +) // return value is the number of child nodes or 0 for a leaf +{ + global struct BVH2* bvh2 = args.bvh2; + varying ushort SIMD16_lane = get_sub_group_local_id(); + varying ushort SIMD8_lane = get_sub_group_local_id() % 8; + varying ushort SIMD8_id = get_sub_group_local_id() / 8; + varying ushort lane = get_sub_group_local_id(); + global struct BVHBase* base = args.bvh_base; + + struct BVH2NodeMetaData nodeMetaData = BVH2_GetNodeMetaData( bvh2, bvh2_root ); + + bool is_leaf = active_lane && !BVH2NodeMetaData_IsInnerNode( &nodeMetaData ); + bool is_inner = active_lane && BVH2NodeMetaData_IsInnerNode( &nodeMetaData ); + + uchar mask = BVH_NODE_DEFAULT_MASK; + if(is_inner) + mask = (args.do_mask_processing) ? 
BVH2NodeMetaData_GetMask( &nodeMetaData ) : 0xff; + + int offset; + + varying struct InternalNode* qnode = qnode_base + qnode_index; + // TOOD: we don't need unions, I left them only for readability + union { + uint num_prims; + uint num_children; + } lane_num_data; + + union { + PrimRef ref; // this is in fact AABB + struct AABB box; + } lane_box_data; + + union { + uint ref_id; + uint sg_bvh2_node; + } lane_id_data; + + // for leafs + varying uint primref_index; + + if(is_leaf) + { + lane_num_data.num_prims = BVH2NodeMetaData_GetLeafPrimCount( &nodeMetaData ); + uint primref_start = BVH2NodeMetaData_GetLeafPrimStart( &nodeMetaData ); + primref_index = primref_start + ((SIMD8_lane < lane_num_data.num_prims) ? SIMD8_lane : 0); + + lane_id_data.ref_id = args.primref_indices[primref_index]; + lane_box_data.ref = args.primref_buffer[lane_id_data.ref_id]; + char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); + char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes; + + offset = (int)(leaf_mem - (char*)qnode); + offset = offset >> 6; + } + + + if(intel_sub_group_ballot(is_inner)) + { + // collapse BVH2 into BVH6. + // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough + + uint2 kids; + if(is_inner) + { + lane_num_data.num_children = 2; + kids = BVH2_GetChildIndices( bvh2, bvh2_root ); + + lane_id_data.sg_bvh2_node = kids.x; + if ( SIMD8_lane == 1 ) + lane_id_data.sg_bvh2_node = kids.y; + } + + bool active = is_inner; + do + { + // choose the inner node with maximum area to replace. + // Its left child goes in its old location. Its right child goes in a new lane + + // TODO_OPT: We re-read the AABBs again and again to compute area + // ... store per-lane boxes instead and pre-compute areas + + varying float sg_area = 0; + varying bool sg_is_inner = false; + if(active) + { + sg_area = BVH2_GetNodeArea( bvh2, lane_id_data.sg_bvh2_node ); + sg_is_inner = BVH2_IsInnerNode( bvh2, lane_id_data.sg_bvh2_node ); + sg_area = (sg_is_inner && SIMD8_lane < lane_num_data.num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf + } + + float max_area = sub_group_reduce_max_N6_2xSIMD8_in_SIMD16( sg_area ); + varying bool sg_reducable = max_area == sg_area && sg_is_inner && (SIMD8_lane < lane_num_data.num_children); + uint mask = intel_sub_group_ballot( sg_reducable ) & (0xFF << SIMD8_id * 8); // we'll end up with two different masks for two SIMD8 in SIMD16 due to bits masking + + // TODO_OPT: Some of these ops seem redundant.. look at trimming further + + if ( mask == 0 ) + active = false; + + // choose the inner node with maximum area to replace + ushort victim_child = ctz( mask ); + uint victim_node = intel_sub_group_shuffle( lane_id_data.sg_bvh2_node, victim_child ); + if(active) + { + kids = BVH2_GetChildIndices( bvh2, victim_node ); + + if ( SIMD16_lane == victim_child ) // we use SIMD16_lane, cause victim_child was calculated based on SIMD16 i.e. second node will have victim from 8..13 + lane_id_data.sg_bvh2_node = kids.x; + else if ( SIMD8_lane == lane_num_data.num_children ) + lane_id_data.sg_bvh2_node = kids.y; + + lane_num_data.num_children++; + + if(lane_num_data.num_children >= TREE_ARITY) + active = false; + } + + } while ( intel_sub_group_ballot(active) ); // if any active, then continue + + // sum children from both halfs of SIMD16 to allocate nodes only once per sub_group + uniform ushort num_children = is_inner ? 
lane_num_data.num_children : 0; + uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); + uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); + + num_children = first_SIMD8_num_children + second_SIMD8_num_children; + uint kids_offset; + + // allocate inner node space + if(num_children && SIMD16_lane == 0) + kids_offset = allocate_inner_nodes( args.bvh_base, num_children ); + kids_offset = sub_group_broadcast(kids_offset, 0); + if((is_inner)) + { + kids_offset += SIMD8_id * first_SIMD8_num_children; + + struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset); + + offset = (int)((char*)kid - (char*)qnode) >> 6; + lane_box_data.box = BVH2_GetAABB( bvh2, lane_id_data.sg_bvh2_node ); + } + } + + // sort child nodes in descending order by AABB area + varying uint key = SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16(AABB_halfArea(&lane_box_data.box), lane_num_data.num_children ); + varying ushort sort_index = SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16(key); + lane_box_data.box = PrimRef_sub_group_shuffle(&lane_box_data.box, sort_index); + lane_id_data.sg_bvh2_node = intel_sub_group_shuffle(lane_id_data.sg_bvh2_node, sort_index); + + char* bvh_mem = (char*) args.bvh_base; + if (is_leaf && SIMD8_lane < lane_num_data.num_prims) + { + args.primref_indices[primref_index] = lane_id_data.ref_id; + args.primref_indices[primref_index + args.globals->numPrimitives] = qnode - (struct InternalNode*)bvh_mem; + } + + bool degenerated = false; + uint node_type = is_leaf ? args.leaf_type : args.inner_node_type; + + if(args.leaf_type == NODE_TYPE_INSTANCE) + degenerated = subgroup_setInstanceBox_2xSIMD8_in_SIMD16(&lane_box_data.box, lane_num_data.num_children, &mask, SIMD8_lane < lane_num_data.num_prims ? PRIMREF_instanceMask(&lane_box_data.ref) : 0, is_leaf); + + subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, node_type, &lane_box_data.box, lane_num_data.num_children, mask, (struct QBVHNodeN*)(qnode), degenerated, active_lane); + + // return child information + if(is_inner) + { + sg_children_out->x = lane_id_data.sg_bvh2_node; + sg_children_out->y = qnode_index + offset + SIMD8_lane; + } + + sg_children_out->z = lane_num_data.num_children; + + return is_inner ? 
lane_num_data.num_children : 0; +} + +void check_primref_integrity( global struct SAHBuildGlobals* globals ) +{ + global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); + global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, 0 ); + dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); + if ( get_local_id( 0 ) == 0 ) + { + for ( uint i = 0; i < num_primrefs; i++ ) + { + primref_out[i] = 0; + } + + for ( uint i = 0; i < num_primrefs; i++ ) + primref_out[primref_in[i]]++; + + for ( uint i = 0; i < num_primrefs; i++ ) + if ( primref_out[i] != 1 ) + printf( "Foo: %u %u\n", i, primref_out[i] ); + } +} + + + + +void check_bvh2(global struct SAHBuildGlobals* globals ) +{ + global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(globals); + global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); + global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out(globals, 0); + dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs(globals); + + if (get_local_id(0) == 0) + { + for (uint i = 0; i < num_primrefs; i++) + primref_out[i] = 0; + + uint stack[256]; + uint sp=0; + uint r = BVH2_GetRoot(bvh2); + stack[sp++] = r; + while (sp) + { + r = stack[--sp]; + if (BVH2_IsInnerNode(bvh2,r)) + { + uint2 kids = BVH2_GetChildIndices( bvh2, r); + if (kids.x >= bvh2->num_nodes || kids.y >= bvh2->num_nodes) + { + printf("BVH2!! Bad node index found!\n"); + return; + } + + stack[sp++] = kids.x; + stack[sp++] = kids.y; + } + else + { + uint ref = BVH2_GetLeafPrimStart(bvh2,r); + uint count = BVH2_GetLeafPrimCount(bvh2,r); + if( count == 0 ) + { + printf("BVH2!! Empty leaf found!\n"); + return; + } + for (uint i = 0; i < count; i++) + { + if (ref + i > num_primrefs) + { + printf("BVH2!! Bad leaf range!\n"); + return; + } + uint c = primref_out[ref+i]; + if (c != 0) + { + printf("BVH2!! overlapped prim ranges\n"); + return; + } + primref_out[ref+i] = 1; + if (primref_in[ref + i] >= num_primrefs) + { + printf("BAD PRIMREF ID FOUND!\n"); + return; + } + } + } + } + } + + printf("bvh2 is ok!\n"); +} + + +#if 0 +// TODO_OPT: Enable larger WGs. 
WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size(256,1,1)) ) +__attribute__( (intel_reqd_sub_group_size(8) ) ) +kernel void +build_qnodes( global struct SAHBuildGlobals* globals, global struct VContextScheduler* scheduler ) +{ + globals = globals + (scheduler->num_trivial_builds + scheduler->num_single_builds); + globals = globals + get_group_id(0); + + + struct BuildFlatTreeArgs args; + args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals ); + args.leaf_type = SAHBuildGlobals_GetLeafType( globals ); + args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals ); + args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); + args.bvh_base = SAHBuildGlobals_GetBVHBase( globals ); + args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); + args.globals = (global struct Globals*) globals->p_globals; + args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals ); + + dword alloc_backpointers = SAHBuildGlobals_NeedBackPointers( globals ); + global uint2* root_buffer = (global uint2*) globals->p_qnode_root_buffer; + global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base ); + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); + + local uint nodes_produced; + if ( get_sub_group_id() == 0 ) + { + // allocate first node + if (get_sub_group_local_id() == 0) + allocate_inner_nodes( args.bvh_base, 1 ); + + // first subgroup does first node + varying uint3 children_info; + uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, BVH2_GetRoot(args.bvh2), qnodes, 0, &children_info ); + + if ( get_sub_group_local_id() < num_children ) + root_buffer[get_sub_group_local_id()] = children_info.xy; + + if ( alloc_backpointers ) + { + // set root's backpointer + if( get_sub_group_local_id() == 0 ) + back_pointers[0] = (0xffffffc0) | (children_info.z << 3); + + // point child backpointers at the parent + if( get_sub_group_local_id() < num_children ) + back_pointers[children_info.y] = 0; + } + + if ( get_sub_group_local_id() == 0 ) + nodes_produced = num_children; + } + + barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE ); + + + uniform uint buffer_index = get_sub_group_id(); + uniform bool sg_active = buffer_index < nodes_produced; + + while ( work_group_any( sg_active ) ) + { + if( sg_active ) + { + uniform uint bvh2_node = root_buffer[buffer_index].x; + uniform uint qnode_index = root_buffer[buffer_index].y; + + // build a node + varying uint3 children_info; + uniform ushort num_children = SUBGROUP_BuildFlatTreeNode( args, bvh2_node, qnodes + qnode_index, qnode_index, &children_info ); + + // handle backpointers + if ( alloc_backpointers ) + { + // update this node's backpointer with child count + if ( get_sub_group_local_id() == 0 ) + back_pointers[qnode_index] |= (children_info.z << 3); + + // point child backpointers at parent + if ( get_sub_group_local_id() < num_children ) + back_pointers[children_info.y] = (qnode_index << 6); + } + + if ( num_children ) + { + // allocate space in the child buffer + uint root_buffer_position = 0; + if ( get_sub_group_local_id() == 0 ) + root_buffer_position = atomic_add_local( &nodes_produced, num_children ); + root_buffer_position = sub_group_broadcast( root_buffer_position, 0 ); + + // store child indices in root buffer + if ( get_sub_group_local_id() < num_children ) + 
root_buffer[root_buffer_position + get_sub_group_local_id()] = children_info.xy; + } + } + + // sync everyone + work_group_barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, + memory_scope_work_group ); + + + if( sg_active ) + buffer_index += get_num_sub_groups(); + + sg_active = (buffer_index < nodes_produced); + } +} +#endif + + + + + + + +inline bool buffer_may_overflow( uint capacity, uint current_size, uint elements_processed_per_sub_group ) +{ + uint num_consumed = min( get_num_sub_groups() * elements_processed_per_sub_group, current_size ); + uint space_available = (capacity - current_size) + num_consumed; + uint space_needed = TREE_ARITY * num_consumed; + return space_available < space_needed; +} + +inline uint build_qnodes_pc( + global struct SAHBuildGlobals* globals, + bool alloc_backpointers, + bool process_masks, + uint first_qnode, + uint first_bvh2_node, + + local uint2* SLM_local_root_buffer, + local uint* SLM_ring_tail, + const uint RING_SIZE +) + +{ + struct BuildFlatTreeArgs args; + args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals ); + args.leaf_type = SAHBuildGlobals_GetLeafType( globals ); + args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals ); + args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); + args.bvh_base = SAHBuildGlobals_GetBVHBase( globals ); + args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); + args.globals = (global struct Globals*) globals->p_globals; + args.do_mask_processing = process_masks; + + global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base ); + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); + + // first subgroup adds first node + if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0) + { + SLM_local_root_buffer[0].x = first_bvh2_node; + SLM_local_root_buffer[0].y = first_qnode; + *SLM_ring_tail = 1; + + } + + uint ring_head = 0; + uint ring_tail = 1; + uint ring_size = 1; + + barrier( CLK_LOCAL_MEM_FENCE ); + + const uniform uint elements_processed_in_sg = 2; + + while ( ring_size > 0 && !buffer_may_overflow( RING_SIZE, ring_size, elements_processed_in_sg ) ) + { + ushort SIMD16_lane = get_sub_group_local_id(); + + // SIMD16 as 2xSIMD8 + ushort SIMD8_lane = get_sub_group_local_id() % 8; + ushort SIMD8_id = get_sub_group_local_id() / 8; + bool active_lane; + + uniform uint nodes_consumed = min( get_num_sub_groups() * elements_processed_in_sg, ring_size ); // times two because we process two nodes in subgroup + uniform bool sg_active = get_sub_group_id() * elements_processed_in_sg < nodes_consumed; + ushort num_children = 0; + varying uint3 children_info = 0; + + uint bvh2_node = 0; + uint qnode_index = 0; + + if (sg_active) + { + ushort consumed_pos = get_sub_group_id() * elements_processed_in_sg + SIMD8_id; + active_lane = consumed_pos < nodes_consumed ? true : false; + consumed_pos = consumed_pos < nodes_consumed ? 
consumed_pos : consumed_pos-1; + + uint buffer_index = (ring_head + consumed_pos) % RING_SIZE; + + bvh2_node = SLM_local_root_buffer[buffer_index].x; + qnode_index = SLM_local_root_buffer[buffer_index].y; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + if (sg_active) + { + // build a node + num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, bvh2_node, qnodes, qnode_index, &children_info, active_lane); + + // handle backpointers + // TODO_OPT: This should be separate shaders not a runtime branch + // doing it this way for now because GRLTLK does not make dynamic shader selection on host very easy. + // this needs to change... GRLTLK should + + if (alloc_backpointers && active_lane) + { + // update this node's backpointer with child count + if (SIMD8_lane == 0) + back_pointers[qnode_index] |= (children_info.z << 3); + + // point child backpointers at parent + if (SIMD8_lane < num_children) + back_pointers[children_info.y] = (qnode_index << 6); + } + + // save data + + uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); + uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); + uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children; + + uint root_buffer_position = 0; + + // allocate space in the child buffer + if (SIMD16_lane == 0 && SIMD16_num_children) + root_buffer_position = atomic_add_local(SLM_ring_tail, SIMD16_num_children); + + root_buffer_position = sub_group_broadcast( root_buffer_position, 0 ); + root_buffer_position += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16 + + // store child indices in root buffer + if (SIMD8_lane < num_children) + { + uint store_pos = (root_buffer_position + SIMD8_lane) % RING_SIZE; + SLM_local_root_buffer[store_pos] = children_info.xy; + } + } + + // sync everyone + barrier( CLK_LOCAL_MEM_FENCE ); + + ring_head += nodes_consumed; + ring_tail = *SLM_ring_tail; + ring_size = ring_tail - ring_head; + } + + return ring_head; +} + + + + +inline void amplify_and_spill( + global struct SAHBuildGlobals* globals, + dword alloc_backpointers, + uint first_qnode, + uint first_bvh2_node, + global uint2* global_root_buffer, + local uint* root_buffer_counter, + const uint RING_SIZE +) + +{ + struct BuildFlatTreeArgs args; + args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals); + args.leaf_type = SAHBuildGlobals_GetLeafType(globals); + args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals); + args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals); + args.bvh_base = SAHBuildGlobals_GetBVHBase(globals); + args.bvh2 = SAHBuildGlobals_GetBVH2(globals); + args.globals = (global struct Globals*) globals->p_globals; + + global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base); + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base); + + + varying uint3 children_info; + uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, first_bvh2_node, qnodes + first_qnode, first_qnode, &children_info); + + if (alloc_backpointers) + { + // set first node's backpointer + if (get_sub_group_local_id() == 0) + { + // if first node is root, use root sentinel in backpointer + // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread) + uint bp = 0xffffffc0; + if (first_qnode != 0) + bp = 
back_pointers[first_qnode]; + bp |= (children_info.z << 3); + + back_pointers[first_qnode] = bp; + } + + // point child backpointers at the parent + if (get_sub_group_local_id() < num_children) + back_pointers[children_info.y] = (first_qnode << 6); + } + + if (num_children) + { + uint spill_pos = 0; + if (get_sub_group_local_id() == 0) + spill_pos = atomic_add_local(root_buffer_counter,num_children); + + spill_pos = sub_group_broadcast(spill_pos, 0); + + if (get_sub_group_local_id() < num_children) + global_root_buffer[spill_pos+get_sub_group_local_id()] = children_info.xy; + } + +} + + + + +inline void build_qnodes_pc_kickoff_func( + global struct SAHBuildGlobals* globals, + global uint2* root_buffer, + bool alloc_backpointers, + bool process_masks, + + local uint2* SLM_local_root_buffer, + local uint* SLM_spill_pos, + local uint* SLM_ring_tail, + int RING_SIZE +) +{ + // allocate first node + if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0 ) + allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(globals), 1 ); + + *SLM_spill_pos=0; + + uint ring_head = build_qnodes_pc( globals, alloc_backpointers, process_masks, + 0, BVH2_GetRoot(SAHBuildGlobals_GetBVH2(globals)), SLM_local_root_buffer, SLM_ring_tail, RING_SIZE ); + + + uint n = *SLM_ring_tail - ring_head; + if (n > 0) + { +#if 0 + // do an additional round of amplification so we can get more nodes into the root buffer and go wider in the next phase + /// JDB TODO: this is causing hangs on DG2 for metro, so disabling for now... + for (uint i = get_sub_group_id(); i < n; i+= get_num_sub_groups() ) + { + uint consume_pos = (ring_head + i) % RING_SIZE; + uniform uint bvh2_root = SLM_local_root_buffer[consume_pos].x; + uniform uint qnode_root = SLM_local_root_buffer[consume_pos].y; + + amplify_and_spill( globals, alloc_backpointers, qnode_root, bvh2_root, root_buffer, SLM_spill_pos, RING_SIZE ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); +#else + for (uint i = get_local_id(0); i < n; i += get_local_size(0)) + root_buffer[i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; +#endif + + if (get_local_id(0) == 0) + { + globals->root_buffer_num_produced = n; + globals->root_buffer_num_produced_hi = 0; + globals->root_buffer_num_consumed = 0; + globals->root_buffer_num_consumed_hi = 0; + } + } +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 256, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +build_qnodes_pc_kickoff( + global struct SAHBuildGlobals* globals, + global uint2* root_buffer, + dword sah_flags +) +{ + bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; + bool process_masks = sah_flags & SAH_FLAG_NEED_MASKS; + + + const int RING_SIZE = 64; + + local uint2 SLM_local_root_buffer[RING_SIZE]; + local uint SLM_spill_pos; + local uint SLM_ring_tail; + + build_qnodes_pc_kickoff_func(globals, + root_buffer, + alloc_backpointers, + process_masks, + SLM_local_root_buffer, + &SLM_spill_pos, + &SLM_ring_tail, + RING_SIZE + ); +} + + + + +inline void build_qnodes_pc_amplify_func( + global struct SAHBuildGlobals* globals, + global uint2* root_buffer, + bool alloc_backpointers, + bool process_masks, + + local uint2* SLM_local_root_buffer, + local uint* SLM_broadcast, + local uint* SLM_ring_tail, + int RING_SIZE + ) +{ + // TODO_OPT: Probably don't need this atomic.. 
could clear 'num_consumed' every time + // and just use get_group_id() + // + + if (get_local_id(0) == 0) + *SLM_broadcast = atomic_inc_global(&globals->root_buffer_num_consumed); + + barrier( CLK_LOCAL_MEM_FENCE ); + + uniform uint consume_pos = *SLM_broadcast; + uniform uint bvh2_root = root_buffer[consume_pos].x; + uniform uint qnode_root = root_buffer[consume_pos].y; + + uint ring_head = build_qnodes_pc(globals, alloc_backpointers,process_masks, + qnode_root, bvh2_root, SLM_local_root_buffer, SLM_ring_tail, RING_SIZE); + + // TODO_OPT: Instead of spilling the nodes, do one more round of amplification and write + // generated children directly into the root buffer. This should allow faster amplification + + // spill root buffer contents + uint n = *SLM_ring_tail - ring_head; + if (n > 0) + { + + if (get_local_id(0) == 0) + *SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n); + + barrier( CLK_LOCAL_MEM_FENCE ); + uint produce_pos = *SLM_broadcast; + + for (uint i = get_local_id(0); i < n; i += get_local_size(0)) + root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; + } +} + + + + + +// Process two nodes per wg during amplification phase. +// DOing it this way ensures maximum parallelism +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +build_qnodes_pc_amplify( + global struct SAHBuildGlobals* globals, + global uint2* root_buffer, + dword sah_flags ) +{ + bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; + + struct BuildFlatTreeArgs args; + args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals); + args.leaf_type = SAHBuildGlobals_GetLeafType(globals); + args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals); + args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals); + args.bvh_base = SAHBuildGlobals_GetBVHBase(globals); + args.bvh2 = SAHBuildGlobals_GetBVH2(globals); + args.globals = (global struct Globals*) globals->p_globals; + args.do_mask_processing = sah_flags & SAH_FLAG_NEED_MASKS; + + global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base); + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base); + + ushort SIMD16_lane = get_sub_group_local_id(); + + // SIMD16 as 2xSIMD8 + ushort SIMD8_lane = get_sub_group_local_id() % 8; + ushort SIMD8_id = get_sub_group_local_id() / 8; + bool active_lane = false; + + uint consume_pos; + consume_pos = globals->root_buffer_num_consumed + get_group_id(0) * 2; // times 2 because we process two nodes in workgroup + consume_pos += SIMD8_id; + + active_lane = consume_pos < globals->root_buffer_num_to_consume ? true : false; + consume_pos = consume_pos < globals->root_buffer_num_to_consume ? 
consume_pos : consume_pos-1; + + uint first_bvh2_node = root_buffer[consume_pos].x; + uint first_qnode = root_buffer[consume_pos].y; + + varying uint3 children_info; + ushort num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, first_bvh2_node, qnodes, first_qnode, &children_info, active_lane); + + if (alloc_backpointers && active_lane) + { + // set first node's backpointer + if (SIMD8_lane == 0) + { + // if first node is root, use root sentinel in backpointer + // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread) + uint bp = 0xffffffc0; + if (first_qnode != 0) + bp = back_pointers[first_qnode]; + bp |= (children_info.z << 3); + + back_pointers[first_qnode] = bp; + } + + // point child backpointers at the parent + if (SIMD8_lane < num_children) + back_pointers[children_info.y] = (first_qnode << 6); + } + + // save data + { + // sum children from both halfs of SIMD16 to do only one atomic per sub_group + uint produce_pos; + uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); + uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); + uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children; + + if (SIMD16_lane == 0 && SIMD16_num_children) + produce_pos = atomic_add_global(&globals->root_buffer_num_produced, SIMD16_num_children); + + produce_pos = sub_group_broadcast(produce_pos, 0); + produce_pos += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16 + + if (SIMD8_lane < num_children) + { + root_buffer[produce_pos + SIMD8_lane] = children_info.xy; + } + } +} + + +////////// +// +// Batched version of qnode creation +// +////////// + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +kernel void +build_qnodes_init_scheduler_batched(global struct QnodeScheduler* scheduler, dword num_builds, dword num_max_qnode_global_root_buffer_entries) +{ + + scheduler->batched_build_offset = scheduler->num_trivial_builds + scheduler->num_single_builds; + scheduler->batched_build_count = num_builds - scheduler->batched_build_offset; + scheduler->num_max_qnode_global_root_buffer_entries = num_max_qnode_global_root_buffer_entries; + + const uint num_builds_to_process = scheduler->batched_build_count; + const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; + + scheduler->batched_builds_to_process = num_builds_to_process; + scheduler->num_qnode_grb_curr_entries = (num_builds_to_process + 15) / 16; // here we store number of workgroups for "build_qnodes_begin_batchable" kernel + scheduler->num_qnode_grb_new_entries = num_builds_to_process; + scheduler->qnode_global_root_buffer.curr_entries_offset = max_qnode_grb_entries; +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +build_qnodes_begin_batchable(global struct QnodeScheduler* scheduler, + global struct SAHBuildGlobals* builds_globals) +{ + const uint tid = get_group_id(0) * get_local_size(0) + get_local_id(0); + + const uint num_builds_to_process = scheduler->batched_builds_to_process; + + if(tid < num_builds_to_process) + { + const uint build_idx = scheduler->batched_build_offset + tid; + + uint bvh2_node = BVH2_GetRoot(SAHBuildGlobals_GetBVH2(&builds_globals[build_idx])); + uint qnode = 0; + struct QNodeGlobalRootBufferEntry entry = { bvh2_node, qnode, build_idx, 1}; + 
scheduler->qnode_global_root_buffer.entries[tid] = entry; + + builds_globals[build_idx].root_buffer_num_produced = 0; + builds_globals[build_idx].root_buffer_num_produced_hi = 0; + builds_globals[build_idx].root_buffer_num_consumed = 0; + builds_globals[build_idx].root_buffer_num_consumed_hi = 0; + + // allocate first node for this build + //allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx]), 1 ); + SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx])->nodeDataCur++; + } +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 1, 1, 1 )) ) +kernel void +build_qnodes_scheduler(global struct QnodeScheduler* scheduler) +{ + const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; + + uint new_entries = min(scheduler->num_qnode_grb_new_entries, max_qnode_grb_entries); + + scheduler->num_qnode_grb_curr_entries = new_entries; + scheduler->num_qnode_grb_new_entries = 0; + scheduler->qnode_global_root_buffer.curr_entries_offset = scheduler->qnode_global_root_buffer.curr_entries_offset ? 0 : max_qnode_grb_entries; +} + + + + +// TODO_OPT: Enable larger WGs. WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 32, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +build_qnodes_pc_amplify_batched( + global struct SAHBuildGlobals* builds_globals, + global struct QnodeScheduler* scheduler + ) +{ + const uint group_id = get_group_id(0); + + global struct QNodeGlobalRootBuffer* global_root_buffer = &scheduler->qnode_global_root_buffer; + const uint curr_entries_offset = global_root_buffer->curr_entries_offset; + struct QNodeGlobalRootBufferEntry entry = global_root_buffer->entries[curr_entries_offset + group_id]; + + const uint build_id = entry.build_idx; + + global struct SAHBuildGlobals* globals = &builds_globals[build_id]; + global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer; + bool alloc_backpointers = SAHBuildGlobals_NeedBackPointers(globals); + bool process_masks = SAHBuildGlobals_NeedMasks(globals); + + const int RING_SIZE = 32; // for 2 SGs, 16 should result in 2 rounds: one SG produces 6, then 2 SGs consume 2 and produce 12 + // for 4 SGs, 32 results in 2 rounds: one SG produces 6, 4 SGs consume 4 and produce 24, resulting in 26 + + local uint2 SLM_local_root_buffer[RING_SIZE]; + local uint SLM_broadcast; + local uint SLM_ring_tail; + local uint SLM_grb_broadcast; + + + //// This below can be moved to separate function if needed for TLAS //// + + uniform uint bvh2_root = entry.bvh2_node; + uniform uint qnode_root = entry.qnode; + + uint ring_head = build_qnodes_pc(globals, alloc_backpointers, process_masks, + qnode_root, bvh2_root, SLM_local_root_buffer, &SLM_ring_tail, RING_SIZE); + + // spill root buffer contents + uint n = SLM_ring_tail - ring_head; + if (n > 0) + { + const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; + + if (get_local_id(0) == 0) + { + SLM_grb_broadcast = atomic_add_global(&scheduler->num_qnode_grb_new_entries, n); + + if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, then make space in build's root_buffer + SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n); + else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then make space in build's root_buffer + SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n - 
(max_qnode_grb_entries - SLM_grb_broadcast)); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + uint produce_pos = SLM_broadcast; + + uint grb_produce_num = n; // grb stands for global_root_buffer + uint lrb_produce_num = 0; // lrb stands for local root buffer, meaning this build's root_buffer + + if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, don't write to it + { + grb_produce_num = 0; + lrb_produce_num = n; + } + else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then decrease amount of entries and store rest in build's root buffer + { + grb_produce_num = max_qnode_grb_entries - SLM_grb_broadcast; + lrb_produce_num = n - grb_produce_num; + } + + // save data to global_root_buffer + for(uint i = get_local_id(0); i < grb_produce_num; i += get_local_size(0)) + { + const uint2 slm_record = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; + + struct QNodeGlobalRootBufferEntry new_entry; + new_entry.bvh2_node = slm_record.x; + new_entry.qnode = slm_record.y; + new_entry.build_idx = entry.build_idx; + + const uint new_entries_offset = curr_entries_offset ? 0 : max_qnode_grb_entries; + global_root_buffer->entries[new_entries_offset + SLM_grb_broadcast + i] = new_entry; + } + + // if anything left, write to build's root buffer + for (uint i = get_local_id(0); i < lrb_produce_num; i += get_local_size(0)) + root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i + grb_produce_num) % RING_SIZE]; + } +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +build_qnodes_try_to_fill_grb_batched( + global struct SAHBuildGlobals* builds_globals, + global struct QnodeScheduler* scheduler + ) +{ + const uint build_id = scheduler->batched_build_offset + get_group_id(0); + global struct SAHBuildGlobals* globals = &builds_globals[build_id]; + global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer; + + global struct QNodeGlobalRootBuffer* qnode_root_buffer = (global struct QNodeGlobalRootBuffer*)&scheduler->qnode_global_root_buffer; + + const uint num_produced = globals->root_buffer_num_produced; + const uint num_consumed = globals->root_buffer_num_consumed; + const uint entries = num_produced - num_consumed; // entries to build's root buffer + + if(!entries) + return; + + uint global_root_buffer_offset; + if(get_local_id(0) == 0) + global_root_buffer_offset = atomic_add_global(&scheduler->num_qnode_grb_new_entries, entries); + + global_root_buffer_offset = sub_group_broadcast(global_root_buffer_offset, 0); + + const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; + + if(global_root_buffer_offset >= max_qnode_grb_entries) // if global_root_buffer is full, then return + return; + + uint global_root_buffer_produce_num = entries; + if(global_root_buffer_offset + entries >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then reduce number of entries to push + global_root_buffer_produce_num = max_qnode_grb_entries - global_root_buffer_offset; + + for(uint i = get_local_id(0); i < global_root_buffer_produce_num; i += get_local_size(0)) + { + const uint2 entry = root_buffer[num_consumed + i]; + + struct QNodeGlobalRootBufferEntry new_entry; + new_entry.bvh2_node = entry.x; + new_entry.qnode = entry.y; + new_entry.build_idx = build_id; + + const uint new_entries_offset = qnode_root_buffer->curr_entries_offset ? 
0 : max_qnode_grb_entries; + qnode_root_buffer->entries[new_entries_offset + global_root_buffer_offset + i] = new_entry; + } + + if(get_local_id(0) == 0) + globals->root_buffer_num_consumed += global_root_buffer_produce_num; +} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl new file mode 100644 index 00000000000..1f64ef3fbe2 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl @@ -0,0 +1,2025 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "intrinsics.h" +#include "AABB3f.h" +#include "AABB.h" +#include "GRLGen12.h" +#include "quad.h" +#include "common.h" +#include "instance.h" + +#include "api_interface.h" + +#include "binned_sah_shared.h" + + +#if 0 +#define LOOP_TRIPWIRE_INIT uint _loop_trip=0; + +#define LOOP_TRIPWIRE_INCREMENT(max_iterations) \ + _loop_trip++;\ + if ( _loop_trip > max_iterations )\ + {\ + if( get_local_id(0) == 0 )\ + printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!! group=%u\n", get_group_id(0) );\ + break;\ + } +#else + +#define LOOP_TRIPWIRE_INIT +#define LOOP_TRIPWIRE_INCREMENT(max_iterations) + +#endif + + +// ========================================================= +// DFS +// ========================================================= + +// there are 128 threads x SIMD16 == 2048 lanes in a DSS +// There is 128KB of SLM. Upper limit of 64KB per WG, so target is 2 groups of 1024 lanes @ 64K each +// --> Full occupancy requires using less than 64B per lane +// +// Groups of 256 lanes gives us 16KB per group +// + +// We use subgroups very heavily here in order to avoid +// use of per-thread scratch space for intermediate values + +#define DFS_WG_SIZE 256 +#define DFS_NUM_SUBGROUPS 16 +#define DFS_BVH2_NODE_COUNT (2*(DFS_WG_SIZE)-1) +#define TREE_ARITY 6 + +// FlatTree node limits: +// these are the derivations if we always collapse to one primitive and pack nodes as tightly as possible +// If BVH2 construction is allowed to terminate early and place multiple prims in a leaf, these numbers will be too low +#if 0 + +// maximum flattree size is the number of inner nodes in a full M-ary tree with one leaf per primitive +// This is given by I = (L-1)/(M-1) +// For a 256 thread workgroup, L=256, M=6, this gives: 51 +#define DFS_MAX_FLATTREE_NODES 51 + + +// A flattree leaf is a node which contains only primitives. +// +// The maximum number of leaves is related to the number of nodes as: +// L(N) = ((M-1)*N + 1) / M +// +#define DFS_MAX_FLATTREE_LEAFS 43 // = 43 for 256 thread WG (L=256, M=6) + +#else + +// This is the result of estimate_qbvh6_nodes(256) + +#define DFS_MAX_FLATTREE_LEAFS 256 +#define DFS_MAX_FLATTREE_NODES 307 // 256 fat-leaves + 51 inner nodes. 
51 = ceil(256/5) +#define DFS_MAX_FLATTREE_DEPTH 52 // number of inner nodes in the worst-case tree + +#endif + +#define uniform +#define varying + + +struct DFSArgs +{ + global struct BVHBase* bvh_base; + global PrimRef* primref_buffer; + ushort leaf_node_type; + ushort inner_node_type; + ushort leaf_size_in_bytes; + bool need_backpointers; + bool need_masks; + ushort num_primrefs; + global uint* primref_index_buffer; +}; + + +struct DFSPrimRefAABB +{ + half lower[3]; + half upper[3]; +}; + +GRL_INLINE void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb ) +{ + bb->lower[0] = 1; + bb->lower[1] = 1; + bb->lower[2] = 1; + bb->upper[0] = 0; + bb->upper[1] = 0; + bb->upper[2] = 0; +} + +GRL_INLINE void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v ) +{ + aabb->lower[0] = min( aabb->lower[0], v->lower[0] ); + aabb->lower[1] = min( aabb->lower[1], v->lower[1] ); + aabb->lower[2] = min( aabb->lower[2], v->lower[2] ); + aabb->upper[0] = max( aabb->upper[0], v->upper[0] ); + aabb->upper[1] = max( aabb->upper[1], v->upper[1] ); + aabb->upper[2] = max( aabb->upper[2], v->upper[2] ); +} + +GRL_INLINE float DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb ) +{ + const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]); + return fma( d.x, (d.y + d.z), d.y * d.z ); +} + +GRL_INLINE struct DFSPrimRefAABB DFSPrimRefAABB_sub_group_reduce( struct DFSPrimRefAABB* aabb ) +{ + struct DFSPrimRefAABB bounds; + bounds.lower[0] = sub_group_reduce_min( aabb->lower[0] ); + bounds.lower[1] = sub_group_reduce_min( aabb->lower[1] ); + bounds.lower[2] = sub_group_reduce_min( aabb->lower[2] ); + bounds.upper[0] = sub_group_reduce_max( aabb->upper[0] ); + bounds.upper[1] = sub_group_reduce_max( aabb->upper[1] ); + bounds.upper[2] = sub_group_reduce_max( aabb->upper[2] ); + return bounds; +} + +struct DFSPrimRef +{ + struct DFSPrimRefAABB aabb; + uint2 meta; +}; + +struct PrimRefMeta +{ + uchar2 meta; +}; + +GRL_INLINE uint PrimRefMeta_GetInputIndex( struct PrimRefMeta* it ) +{ + return it->meta.x; +} +GRL_INLINE uint PrimRefMeta_GetInstanceMask( struct PrimRefMeta* it ) +{ + return it->meta.y; +} + + +struct PrimRefSet +{ + struct AABB3f root_aabb; + struct DFSPrimRefAABB AABB[DFS_WG_SIZE]; + uint2 meta[DFS_WG_SIZE]; + +}; + +GRL_INLINE local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id ) +{ + return &refs->AABB[id]; +} + +GRL_INLINE float PrimRefSet_GetMaxAABBArea( local struct PrimRefSet* refs ) +{ + float3 root_l = AABB3f_load_lower( &refs->root_aabb ); + float3 root_u = AABB3f_load_upper( &refs->root_aabb ); + float3 d = root_u - root_l; + float scale = 1.0f / max( d.x, max( d.y, d.z ) ); + + half3 dh = convert_half3_rtp( d * scale ); + return fma( dh.x, (dh.y + dh.z), dh.y * dh.z ); +} + +GRL_INLINE float3 ulp3( float3 v ) { + + return fabs(v) * FLT_EPSILON; +} + +GRL_INLINE struct AABB PrimRefSet_ConvertAABB( local struct PrimRefSet* refs, struct DFSPrimRefAABB* box ) +{ + float3 root_l = AABB3f_load_lower( &refs->root_aabb ); + float3 root_u = AABB3f_load_upper( &refs->root_aabb ); + float3 d = root_u - root_l; + float scale = max( d.x, max( d.y, d.z ) ); + + float3 l = convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) ); + float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) ); + l = l * scale + root_l ; + u = u * scale + root_l ; + + // clamping is necessary in case that a vertex lies exactly in the upper AABB plane. 
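+    // (assuming the root AABB bounds every primref, as it is constructed to, the min() clamp below can
+    //  only trim rounded-up overshoot past the root bounds; it never cuts into the original box.)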
+ // If we use unclamped values, roundoff error in the scale factor calculation can cause us + // to snap to a flattened AABB that lies outside of the original one, resulting in missed geometry. + u = min( u, root_u ); + l = min( l, root_u ); + + struct AABB r; + r.lower.xyz = l.xyz; + r.upper.xyz = u.xyz; + return r; +} + +GRL_INLINE PrimRef PrimRefSet_GetFullPrecisionAABB( local struct PrimRefSet* refs, ushort id ) +{ + struct AABB r; + r = PrimRefSet_ConvertAABB( refs, &refs->AABB[id] ); + r.lower.w = 0; + r.upper.w = 0; + return r; +} + +GRL_INLINE uint PrimRefSet_GetInputIndex( local struct PrimRefSet* refs, ushort id ) +{ + return refs->meta[id].x; +} + +GRL_INLINE uint PrimRefSet_GetInstanceMask( local struct PrimRefSet* refs, ushort id ) +{ + return refs->meta[id].y; +} +GRL_INLINE struct PrimRefMeta PrimRefSet_GetMeta( local struct PrimRefSet* refs, ushort id ) +{ + struct PrimRefMeta meta; + meta.meta.x = refs->meta[id].x; + meta.meta.y = refs->meta[id].y; + return meta; +} + + +GRL_INLINE struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id ) +{ + struct DFSPrimRef r; + r.aabb = refs->AABB[id]; + r.meta = refs->meta[id]; + return r; +} + + +GRL_INLINE void PrimRefSet_SetPrimRef_FullPrecision( local struct PrimRefSet* refs, PrimRef ref, ushort id ) +{ + + float3 root_l = AABB3f_load_lower( &refs->root_aabb ); + float3 root_u = AABB3f_load_upper( &refs->root_aabb ); + float3 d = root_u - root_l; + float scale = 1.0f / max(d.x, max(d.y,d.z)); + + float3 l = ref.lower.xyz; + float3 u = ref.upper.xyz; + half3 lh = convert_half3_rtz( (l - root_l) * scale ); + half3 uh = convert_half3_rtp( (u - root_l) * scale ); + + refs->AABB[id].lower[0] = lh.x; + refs->AABB[id].lower[1] = lh.y; + refs->AABB[id].lower[2] = lh.z; + refs->AABB[id].upper[0] = uh.x; + refs->AABB[id].upper[1] = uh.y; + refs->AABB[id].upper[2] = uh.z; + refs->meta[id].x = id; + refs->meta[id].y = PRIMREF_instanceMask(&ref); + + +} + +GRL_INLINE void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id ) +{ + refs->AABB[id] = ref.aabb; + refs->meta[id] = ref.meta; +} + +GRL_INLINE struct AABB3f PrimRefSet_GetRootAABB( local struct PrimRefSet* refs ) +{ + return refs->root_aabb; +} + +GRL_INLINE void SUBGROUP_PrimRefSet_Initialize( local struct PrimRefSet* refs ) +{ + if ( get_sub_group_local_id() == 0 ) + AABB3f_init( &refs->root_aabb ); // TODO_OPT: subgroup-vectorized version of AABB3f_init +} + + +GRL_INLINE void PrimRefSet_Printf( local struct PrimRefSet* refs, ushort num_prims ) +{ + + barrier( CLK_LOCAL_MEM_FENCE ); + if ( get_local_id( 0 ) == 0 ) + { + printf( "Scene AABB:\n" ); + struct AABB3f rootBox = PrimRefSet_GetRootAABB( refs ); + AABB3f_print( &rootBox ); + + float ma = PrimRefSet_GetMaxAABBArea( refs ); + + for ( uint i = 0; i < num_prims; i++ ) + { + printf( "Ref: %u\n", i ); + struct AABB r = PrimRefSet_GetFullPrecisionAABB( refs, i ); + AABB_print( &r ); + + float a = DFSPrimRefAABB_halfArea( PrimRefSet_GetAABBPointer( refs, i ) ); + printf( "Scaled Area: %f / %f = %f \n", a, ma, a / ma ); + + } + } + barrier( CLK_LOCAL_MEM_FENCE ); +} + + + +GRL_INLINE void PrimRefSet_CheckBounds( local struct PrimRefSet* refs, ushort num_prims, PrimRef* primref_buffer ) +{ + + barrier( CLK_LOCAL_MEM_FENCE ); + if ( get_local_id( 0 ) == 0 ) + { + + for ( uint i = 0; i < num_prims; i++ ) + { + PrimRef ref = primref_buffer[i]; + struct AABB r2 = PrimRefSet_GetFullPrecisionAABB( refs, i ); + + struct DFSPrimRefAABB* box = &refs->AABB[i]; + float3 l = 
convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) ); + float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) ); + + printf( " halfs:{%x,%x,%x}{%x,%x,%x}\n", as_uint(l.x), as_uint(l.y), as_uint(l.z), as_uint(u.x), as_uint(u.y), as_uint(u.z) ); + + printf( " {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%u,%u,%u,%u,%u,%u}\n", + ref.lower.x, ref.lower.y, ref.lower.z, r2.lower.x, r2.lower.y, r2.lower.z, + ref.upper.x, ref.upper.y, ref.upper.z, r2.upper.x, r2.upper.y, r2.upper.z, + r2.lower.x <= ref.lower.x, + r2.lower.y <= ref.lower.y, + r2.lower.z <= ref.lower.z, + + r2.upper.x >= ref.upper.x, + r2.upper.y >= ref.upper.y, + r2.upper.z >= ref.upper.z ); + + } + + } + barrier( CLK_LOCAL_MEM_FENCE ); +} + + + +struct LocalBVH2 +{ + uint num_nodes; + uint nodes[DFS_BVH2_NODE_COUNT]; + + // nodes are a bitfield: + // bits 8:0 (9b) ==> number of primrefs in this subtree + // + // bits 17:9 (9b) ==> for an inner node: contains offset to a pair of children + // ==> for a leaf node: contains index of the first primref in this leaf + // + // bits 30:18 (13b) ==> quantized AABB area (relative to root box) + // bit 31 (1b) ==> is_inner flag + // + // NOTE: The left child offset of any node is always odd.. therefore, it is possible to recover a bit if we need it + // by storing only the 8 MSBs +}; + +#define DFS_BVH2_AREA_QUANT 8191.0f + + + +GRL_INLINE void SUBGROUP_LocalBVH2_Initialize( local struct LocalBVH2* tree, ushort num_prims ) +{ + tree->num_nodes = 1; // include the root node + tree->nodes[0] = num_prims; // initialize root node as a leaf containing the full subtree + +} + +GRL_INLINE void LocalBVH2_CreateInnerNode( local struct LocalBVH2* tree, ushort node_index, + ushort start_left, ushort start_right, + ushort quantized_left_area, ushort quantized_right_area ) +{ + uint child_pos = atomic_add_local( &tree->num_nodes, 2 ); + + // set the inner node flag and child position in the parent + // leave the other bits intact + uint parent_node = tree->nodes[node_index]; + parent_node |= 0x80000000; + parent_node = (parent_node & ~(0x1ff<<9)) | (child_pos << 9); + tree->nodes[node_index] = parent_node; + + // setup children as leaf nodes with prim-count zero + uint left_child = (convert_uint(start_left) << 9) | (convert_uint( quantized_left_area ) << 18); + uint right_child = (convert_uint(start_right) << 9) | (convert_uint( quantized_right_area ) << 18); + tree->nodes[child_pos] = left_child; + tree->nodes[child_pos + 1] = right_child; + +} + +GRL_INLINE ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* tree, ushort node_index ) +{ + // increment only the lower bits. 
Given correct tree construction algorithm this will not overflow into MSBs + return (atomic_inc_local( &tree->nodes[node_index] )) & 0x1ff; +} + +GRL_INLINE ushort LocalBVH2_GetNodeArea( local struct LocalBVH2* tree, ushort nodeID ) +{ + return (tree->nodes[nodeID] >> 18) & 0x1FFF; +} + +GRL_INLINE bool LocalBVH2_IsInnerNode( local struct LocalBVH2* tree, ushort nodeID ) +{ + return (tree->nodes[nodeID] & 0x80000000) != 0; +} + + +GRL_INLINE ushort2 LocalBVH2_GetChildIndices( local struct LocalBVH2* tree, ushort nodeID ) +{ + ushort idx = ((tree->nodes[nodeID] >> 9) & 0x1FF); + return (ushort2)(idx, idx + 1); +} + +GRL_INLINE ushort LocalBVH2_GetSubtreePrimCount( local struct LocalBVH2* tree, ushort node ) +{ + return tree->nodes[node] & 0x1FF; +} + +GRL_INLINE ushort LocalBVH2_GetLeafPrimStart( local struct LocalBVH2* tree, ushort node ) +{ + return ((tree->nodes[node] >> 9) & 0x1FF); +} + + +GRL_INLINE void LocalBVH2_Printf( local struct LocalBVH2* tree ) +{ + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( get_local_id( 0 ) == 0 ) + { + printf( "Nodes: %u\n", tree->num_nodes ); + + for ( uint i = 0; i < tree->num_nodes; i++ ) + { + uint num_prims = LocalBVH2_GetSubtreePrimCount( tree, i ); + printf( "%3u : 0x%08x %3u 0x%04x ", i, tree->nodes[i], num_prims, LocalBVH2_GetNodeArea(tree,i) ); + if ( LocalBVH2_IsInnerNode( tree, i ) ) + { + ushort2 kids = LocalBVH2_GetChildIndices( tree, i ); + printf( " INNER ( %3u %3u )\n", kids.x, kids.y ); + } + else + { + printf( " LEAF {" ); + for ( uint j = 0; j < num_prims; j++ ) + printf( " %3u ", LocalBVH2_GetLeafPrimStart( tree, i ) + j ); + printf( "}\n" ); + } + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); +} + +struct FlatTreeInnerNode +{ + uint DW0; // lower 16b are index of corresponding LocalBVH2 node.. Bits 30:16 are an atomic flag used during refit. 
Bit 31 is a leaf marker + ushort parent_index; + ushort first_child; + uchar index_in_parent; + uchar num_children; + + //struct DFSPrimRefAABB AABB; +}; + +struct FlatTree +{ + uint num_nodes; + uint qnode_byte_offset; // byte offset from the BVHBase to the flat-tree's first QNode + uint qnode_base_index; + + struct FlatTreeInnerNode nodes[DFS_MAX_FLATTREE_NODES]; + uchar primref_back_pointers[DFS_WG_SIZE]; +}; + +GRL_INLINE void FlatTree_Printf( local struct FlatTree* flat_tree ) +{ + barrier( CLK_LOCAL_MEM_FENCE ); + if ( get_local_id( 0 ) == 0 ) + { + printf( "NumNodes: %u\n", flat_tree->num_nodes ); + for ( uint i = 0; i < flat_tree->num_nodes; i++ ) + { + ushort bvh2_node = flat_tree->nodes[i].DW0 & 0xffff; + printf( "%2u Parent: %2u Index_in_parent: %u, NumKids: %u FirstKid: %3u bvh2: %3u DW0: 0x%x\n", + i, + flat_tree->nodes[i].parent_index, + flat_tree->nodes[i].index_in_parent, + flat_tree->nodes[i].num_children, + flat_tree->nodes[i].first_child, + bvh2_node, + flat_tree->nodes[i].DW0 ); + } + } + barrier( CLK_LOCAL_MEM_FENCE ); +} + + + + +GRL_INLINE ushort FlatTree_GetNodeCount( local struct FlatTree* flat_tree ) +{ + return flat_tree->num_nodes; +} + +GRL_INLINE uint FlatTree_GetParentIndex( local struct FlatTree* flat_tree, ushort id ) +{ + return flat_tree->nodes[id].parent_index; +} + +GRL_INLINE ushort FlatTree_GetBVH2Root( local struct FlatTree* flat_tree, ushort node_index ) +{ + return (flat_tree->nodes[node_index].DW0) & 0xffff; +} + +GRL_INLINE ushort FlatTree_GetNumChildren( local struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->nodes[node_index].num_children; +} + +GRL_INLINE bool FlatTree_IsLeafNode( local struct FlatTree* flat_tree, ushort node_index ) +{ + return (flat_tree->nodes[node_index].DW0 & 0x80000000) != 0; +} + + +GRL_INLINE uint FlatTree_GetQNodeByteOffset( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->qnode_byte_offset + node_index * sizeof(struct QBVHNodeN); +} + +GRL_INLINE uint FlatTree_GetQNodeIndex( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->qnode_base_index + node_index; +} + +GRL_INLINE void FlatTree_AllocateQNodes( struct FlatTree* flat_tree, struct DFSArgs args ) +{ + uint node_base = 64*allocate_inner_nodes( args.bvh_base, flat_tree->num_nodes ); + flat_tree->qnode_base_index = (node_base - BVH_ROOT_NODE_OFFSET) / sizeof( struct QBVHNodeN ); + flat_tree->qnode_byte_offset = node_base; +} + +GRL_INLINE ushort FlatTree_GetFirstChild( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->nodes[node_index].first_child; +} + +GRL_INLINE ushort FlatTree_GetPrimRefStart( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->nodes[node_index].first_child; +} +GRL_INLINE ushort FlatTree_GetPrimRefCount( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->nodes[node_index].num_children; +} + +GRL_INLINE uint FlatTree_BuildBackPointer( local struct FlatTree* flat_tree, ushort node_index ) +{ + uint parent_index = flat_tree->nodes[node_index].parent_index + flat_tree->qnode_base_index; + parent_index = (parent_index << 6) | (FlatTree_GetNumChildren( flat_tree, node_index ) << 3); + return parent_index; +} + + +GRL_INLINE void SUBGROUP_FlatTree_Initialize( uniform local struct FlatTree* flat_tree, struct DFSArgs args ) +{ + if ( get_sub_group_local_id() == 0 ) + { + flat_tree->num_nodes = 1; + flat_tree->nodes[0].DW0 = 0; // point first node at BVH2 root node, which is assumed to be at index zero + } + +} +/* +GRL_INLINE void 
SUBGROUP_FlatTree_ReduceAndSetAABB( uniform local struct FlatTree* flat_tree, + uniform ushort node_index, + varying local struct DFSPrimRefAABB* box ) +{ + // TODO_OPT: Replace this with an optimized reduction which exploits the fact that we only ever have 6 active lanes + // Try using the "negated max" trick here to compute min/max simultaneously, with max in top 6 lanes + // This will replace 6 reductions with 3 + + // TODO_OPT: This only utilizes up to 6 SIMD lanes. We can use up to 12 of them by putting + // min into even lanes, and -max into odd lanes, and using a manual min-reduction on pairs of lanes + + struct DFSPrimRefAABB bb = DFSPrimRefAABB_sub_group_reduce( box ); + if( get_sub_group_local_id() ) + flat_tree->nodes[node_index].AABB = bb; +} +*/ + +GRL_INLINE void SUBGROUP_FlatTree_CreateInnerNode( uniform local struct FlatTree* flat_tree, + uniform ushort flat_tree_root, + varying ushort sg_child_bvh2_root, + uniform ushort num_children ) +{ + uniform uint lane = get_sub_group_local_id(); + + // increment counter to allocate new nodes.. set required root node fields + uniform uint child_base; + if ( lane == 0 ) + { + child_base = atomic_add_local( &flat_tree->num_nodes, num_children ); + flat_tree->nodes[flat_tree_root].first_child = (uchar) child_base; + flat_tree->nodes[flat_tree_root].num_children = num_children; + + // initialize mask bits for this node's live children + uint child_mask = ((1 << num_children) - 1) << 16; + flat_tree->nodes[flat_tree_root].DW0 |= child_mask; + } + + child_base = sub_group_broadcast( child_base, 0 ); + + // initialize child nodes + if ( lane < num_children ) + { + varying uint child = child_base + lane; + flat_tree->nodes[child].DW0 = sg_child_bvh2_root; + flat_tree->nodes[child].index_in_parent = lane; + flat_tree->nodes[child].parent_index = flat_tree_root; + } + +} + + + +GRL_INLINE void SUBGROUP_FlatTree_CreateLeafNode( uniform local struct FlatTree* flat_tree, + uniform ushort flat_tree_root, + uniform ushort primref_start, + uniform ushort num_prims ) +{ + ushort lane = get_sub_group_local_id(); + if ( lane < num_prims ) + { + flat_tree->primref_back_pointers[primref_start + lane] = (uchar) flat_tree_root; + if ( lane == 0 ) + { + flat_tree->nodes[flat_tree_root].first_child = (uchar) primref_start; + flat_tree->nodes[flat_tree_root].num_children = (uchar) num_prims; + flat_tree->nodes[flat_tree_root].DW0 |= 0x80000000; + } + } +} + + +GRL_INLINE uniform bool SUBGROUP_FlatTree_SignalRefitComplete( uniform local struct FlatTree* flat_tree, uniform ushort* p_node_index ) +{ + uniform ushort node_index = *p_node_index; + uniform ushort parent = flat_tree->nodes[node_index].parent_index; + uniform ushort index_in_parent = flat_tree->nodes[node_index].index_in_parent; + + // clear the corresponding mask bit in the parent node + uniform uint child_mask = (0x10000 << index_in_parent); + uniform uint old_mask_bits = 0; + if( get_sub_group_local_id() == 0 ) + old_mask_bits = atomic_xor( &flat_tree->nodes[parent].DW0, child_mask ); + + old_mask_bits = sub_group_broadcast( old_mask_bits, 0 ); + + // if we cleared the last mask bit, this subgroup proceeds up the tree and refits the next node + // otherwise, it looks for something else to do + if ( ((old_mask_bits^child_mask) & 0xffff0000) == 0 ) + { + *p_node_index = parent; + return true; + } + + return false; +} + +/* +GRL_INLINE local struct DFSPrimRefAABB* FlatTree_GetChildAABB( local struct FlatTree* flat_tree, + local struct PrimRefSet* prim_refs, + ushort node_index, ushort child_index ) 
+{ + ushort child_id = FlatTree_GetFirstChild( flat_tree, node_index ) + child_index; + + if( !FlatTree_IsLeafNode( flat_tree, node_index ) ) + return &flat_tree->nodes[child_id].AABB; + else + return PrimRefSet_GetAABBPointer( prim_refs, child_id ); +} +*/ +GRL_INLINE uint FlatTree_GetPrimRefBackPointer( local struct FlatTree* flat_tree, ushort primref_index ) +{ + return flat_tree->primref_back_pointers[primref_index] * sizeof(struct QBVHNodeN) + flat_tree->qnode_byte_offset; +} + + +GRL_INLINE void FlatTree_check_boxes(local struct FlatTree* flat_tree, + global struct AABB* primref_buffer, + local struct AABB3f* boxes, + local struct PrimRefMeta* meta ) + +{ + barrier(CLK_LOCAL_MEM_FENCE); + if (get_local_id(0) == 0) + { + printf("checking flattree bounds...\n"); + + for (uint i = 0; i < flat_tree->num_nodes; i++) + { + struct AABB rb; + rb.lower.xyz = AABB3f_load_lower(&boxes[i]); + rb.upper.xyz = AABB3f_load_upper(&boxes[i]); + + uint offs = FlatTree_GetFirstChild( flat_tree, i ); + uint count = FlatTree_GetNumChildren( flat_tree, i ); + + for (uint c = 0; c < count; c++) + { + struct AABB lb; + if (FlatTree_IsLeafNode( flat_tree, i )) + { + lb = primref_buffer[ PrimRefMeta_GetInputIndex( &meta[offs+c] ) ]; + } + else + { + lb.lower.xyz = AABB3f_load_lower(&boxes[ offs+c ]); + lb.upper.xyz = AABB3f_load_upper(&boxes[ offs+c ]); + } + + if( !AABB_subset( &lb, &rb ) ) + printf("Bad bounds!! child %u of %u %f : %f %f : %f %f : %f %f : %f %f : %f %f : %f \n", + c, i , + rb.lower.x, rb.upper.x, rb.lower.y, rb.upper.y, rb.lower.z, rb.upper.z, + lb.lower.x, lb.upper.x, lb.lower.y, lb.upper.y, lb.lower.z, lb.upper.z + ); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); +} + + +struct FlatTreeScheduler +{ + int num_leafs; + uint writeout_produce_count; + uint writeout_consume_count; + uint active_subgroups; + uint num_built_nodes; + uint num_levels; // number of depth levels in the tree + + //uchar leaf_indices[DFS_MAX_FLATTREE_LEAFS]; // indices of leaf FlatTree nodes to be refitted + //uchar writeout_indices[DFS_MAX_FLATTREE_NODES]; // indices of flattree nodes to be written out or collapsed + + ushort level_ordered_nodes[DFS_MAX_FLATTREE_NODES]; // node indices sorted by depth (pre-order, high depth before low depth) + ushort level_start[DFS_MAX_FLATTREE_DEPTH]; // first node at given level in the level-ordered node array + uint level_count[DFS_MAX_FLATTREE_DEPTH]; // number of nodes at given level +}; + +GRL_INLINE void SUBGROUP_FlatTreeScheduler_Initialize( uniform local struct FlatTreeScheduler* scheduler ) +{ + scheduler->num_built_nodes = 0; + scheduler->num_leafs = 0; + scheduler->writeout_produce_count = 0; + scheduler->writeout_consume_count = 0; + scheduler->active_subgroups = DFS_NUM_SUBGROUPS; +} +/* +GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueLeafForRefit( uniform local struct FlatTreeScheduler* scheduler, + uniform ushort leaf ) +{ + if ( get_sub_group_local_id() == 0 ) + scheduler->leaf_indices[atomic_inc( &scheduler->num_leafs )] = leaf; +}*/ + +GRL_INLINE void SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node ) +{ + if ( get_sub_group_local_id() == 0 ) + atomic_inc_local( &scheduler->num_built_nodes ); +} + +GRL_INLINE uint FlatTreeScheduler_GetNumBuiltNodes( uniform local struct FlatTreeScheduler* scheduler ) +{ + return scheduler->num_built_nodes; +} + +/* +GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node ) +{ + if ( 
get_sub_group_local_id() == 0 ) + scheduler->writeout_indices[atomic_inc( &scheduler->writeout_produce_count )] = node; +}*/ + +/* +GRL_INLINE bool SUBGROUP_FlatTreeScheduler_GetRefitTask( uniform local struct FlatTreeScheduler* scheduler, uniform ushort* leaf_idx ) +{ + // schedule the leaves in reverse order to ensure that later leaves + // complete before earlier ones.. This prevents contention during the WriteOut stage + // + // There is a barrier between this function and 'QueueLeafForRefit' so we can safely decrement the same counter + // that we incremented earlier + varying int idx = 0; + if( get_sub_group_local_id() == 0 ) + idx = atomic_dec( &scheduler->num_leafs ); + + sub_group_barrier( CLK_LOCAL_MEM_FENCE ); + idx = sub_group_broadcast( idx, 0 ); + + if ( idx <= 0 ) + return false; + + *leaf_idx = scheduler->leaf_indices[idx-1]; + return true; +}*/ + +/* +// Signal the scheduler that a subgroup has reached the DONE state. +// Return true if this is the last subgroup to be done +void SUBGROUP_FlatTreeScheduler_SubGroupDone( local struct FlatTreeScheduler* scheduler ) +{ + if ( get_sub_group_local_id() == 0 ) + atomic_dec( &scheduler->active_subgroups ); +} +*/ + +/* + +#define STATE_SCHEDULE_REFIT 0x1234 +#define STATE_SCHEDULE_WRITEOUT 0x5679 +#define STATE_REFIT 0xabcd +#define STATE_WRITEOUT 0xefef +#define STATE_DONE 0xaabb + +// Get a flattree node to write out. Returns the new scheduler state +GRL_INLINE ushort SUBGROUP_FlatTreeScheduler_GetWriteOutTask( uniform local struct FlatTreeScheduler* scheduler, + uniform ushort num_nodes, + uniform ushort* node_idx ) +{ + uniform ushort return_state = STATE_WRITEOUT; + uniform ushort idx = 0; + if ( get_sub_group_local_id() == 0 ) + { + idx = atomic_inc( &scheduler->writeout_consume_count ); + + if ( idx >= scheduler->writeout_produce_count ) + { + // more consumers than there are produced tasks.... 
+ + if ( scheduler->writeout_produce_count == num_nodes ) + { + // if all nodes have been written out, flattening is done + return_state = STATE_DONE; + } + else + { + // some writeout tasks remain, and have not been produced by refit threads yet + // we need to put this one back + atomic_dec( &scheduler->writeout_consume_count ); + return_state = STATE_SCHEDULE_WRITEOUT; + } + } + else + { + // scheduled successfully + idx = scheduler->writeout_indices[idx]; + } + } + + *node_idx = sub_group_broadcast( idx, 0 ); + return sub_group_broadcast( return_state, 0 ); + +} +*/ + + +/* +GRL_INLINE void FlatTreeScheduler_Printf( local struct FlatTreeScheduler* scheduler ) +{ + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( get_local_id( 0 ) == 0 ) + { + printf( "***SCHEDULER***\n" ); + printf( "built_nodes=%u active_sgs=%u leafs=%u wo_p=%u wo_c=%u\n", scheduler->num_built_nodes, scheduler->active_subgroups, scheduler->num_leafs, + scheduler->writeout_produce_count, scheduler->writeout_consume_count ); + printf( "leafs for refit: {" ); + + int nleaf = max( scheduler->num_leafs, 0 ); + + for ( uint i = 0; i < nleaf; i++ ) + printf( "%u ", scheduler->leaf_indices[i] ); + printf( "}\n" ); + + printf( "writeout queue: %u:%u {", scheduler->writeout_produce_count, scheduler->writeout_consume_count ); + for ( uint i = 0; i < scheduler->writeout_produce_count; i++ ) + printf( "%u ", scheduler->writeout_indices[i] ); + printf( "}\n" ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + +} +*/ + + +GRL_INLINE void SUBGROUP_BuildFlatTreeNode( local struct LocalBVH2* bvh2, + local struct FlatTree* flat_tree, + local struct FlatTreeScheduler* scheduler, + uniform ushort flat_tree_root ) +{ + varying ushort lane = get_sub_group_local_id(); + varying ushort bvh2_root = FlatTree_GetBVH2Root( flat_tree, flat_tree_root ); + + if ( !LocalBVH2_IsInnerNode( bvh2, bvh2_root ) ) + { + uniform ushort num_prims = LocalBVH2_GetSubtreePrimCount( bvh2, bvh2_root ); + uniform ushort primref_start = LocalBVH2_GetLeafPrimStart( bvh2, bvh2_root ); + + SUBGROUP_FlatTree_CreateLeafNode( flat_tree, flat_tree_root, primref_start, num_prims ); + } + else + { + // collapse BVH2 into BVH6. + // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough + uniform ushort num_children = 2; + + uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); + varying ushort sg_bvh2_node = kids.x; + if ( lane == 1 ) + sg_bvh2_node = kids.y; + + do + { + // choose the inner node with maximum area to replace. + // Its left child goes in its old location. Its right child goes in a new lane + + varying ushort sg_area = LocalBVH2_GetNodeArea( bvh2, sg_bvh2_node ); + varying bool sg_is_inner = LocalBVH2_IsInnerNode( bvh2, sg_bvh2_node ); + sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf + + uniform ushort max_area = sub_group_reduce_max( sg_area ); + varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner; + uniform uint mask = intel_sub_group_ballot( sg_reducable ); + + // TODO_OPT: Some of these ops seem redundant.. look at trimming further + // TODO_OPT: sub_group_reduce_max results in too many instructions...... unroll the loop and specialize it.. 
+ // or ask IGC to give us a version that declares a static maximum number of subgroups to use + + if ( mask == 0 ) + break; + + // choose the inner node with maximum area to replace + uniform ushort victim_child = ctz( mask ); + uniform ushort victim_node = sub_group_broadcast( sg_bvh2_node, victim_child ); + uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, victim_node ); + + if ( lane == victim_child ) + sg_bvh2_node = kids.x; + else if ( lane == num_children ) + sg_bvh2_node = kids.y; + + + num_children++; + + + }while ( num_children < TREE_ARITY ); + + SUBGROUP_FlatTree_CreateInnerNode( flat_tree, flat_tree_root, sg_bvh2_node, num_children ); + } + +} + + +GRL_INLINE void SUBGROUP_DFS_BuildFlatTree( uniform local struct LocalBVH2* bvh2, + uniform local struct FlatTree* flat_tree, + uniform local struct FlatTreeScheduler* scheduler + ) +{ + + uniform ushort flat_tree_node_index = get_sub_group_id(); + uniform ushort num_nodes = 1; + uniform ushort num_built = 0; + + uint tid = get_local_id(0); + if (tid < DFS_MAX_FLATTREE_DEPTH) + { + scheduler->level_start[tid] = DFS_MAX_FLATTREE_NODES; + scheduler->level_count[tid] = 0; + scheduler->num_levels = 0; + } + + LOOP_TRIPWIRE_INIT; + + do + { + // process one flat tree node per sub group, as many as are available + // + // The first pass will only run one sub-group, the second up to 6, the third up to 36, and so on + // nodes will be processed in breadth-first order, but they are not guaranteed to be stored in this order + // due to use of atomic counters for node allocation + // + if ( flat_tree_node_index < num_nodes ) + { + SUBGROUP_BuildFlatTreeNode( bvh2, flat_tree, scheduler, flat_tree_node_index ); + SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( scheduler, flat_tree_node_index ); + flat_tree_node_index += get_num_sub_groups(); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // bump up the node count if new nodes were created + // stop as soon as all flattree nodes have been processed + num_nodes = FlatTree_GetNodeCount( flat_tree ); + num_built = FlatTreeScheduler_GetNumBuiltNodes( scheduler ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + LOOP_TRIPWIRE_INCREMENT( 300 ); + + } while ( num_built < num_nodes ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + + // determine depth of each node, compute node ranges and counts for each depth level, + // and prepare a depth-ordered node index array + uint depth = 0; + uint level_pos = 0; + for( uint i=tid; ilevel_count[depth] ); + + // compute total number of levels + atomic_max_local( &scheduler->num_levels, depth+1 ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + for( uint i=tid; ilevel_count[d]; + + scheduler->level_start[depth] = level_start; + + // scatter node indices into level-ordered node array + scheduler->level_ordered_nodes[level_start + level_pos] = tid; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + +} + +/* +GRL_INLINE bool SUBGROUP_RefitNode( uniform local struct FlatTree* flat_tree, + uniform local struct PrimRefSet* prim_refs, + uniform ushort* p_node_index ) +{ + + // fetch and reduce child AABBs across the subgroup + uniform ushort node_index = *p_node_index; + uniform ushort num_kids = FlatTree_GetNumChildren( flat_tree, node_index ); + varying ushort sg_child_index = (get_sub_group_local_id() < num_kids) ? 
get_sub_group_local_id() : 0; + + varying local struct DFSPrimRefAABB* box = FlatTree_GetChildAABB( flat_tree, prim_refs, node_index, sg_child_index ); + + SUBGROUP_FlatTree_ReduceAndSetAABB( flat_tree, node_index, box ); + + if ( node_index == 0 ) + return false; // if we just refitted the root, we can stop now + + // signal the parent node that this node was refitted. If this was the last child to be refitted + // returns true and sets 'node_index' to the parent node, so that this thread can continue refitting + return SUBGROUP_FlatTree_SignalRefitComplete( flat_tree, p_node_index ); +}*/ + +GRL_INLINE struct QBVHNodeN* qnode_ptr( BVHBase* bvh_mem, uint byte_offset ) +{ + return (struct QBVHNodeN*)(((char*)bvh_mem) + byte_offset); +} + +GRL_INLINE void SUBGROUP_WriteQBVHNode( + uniform local struct FlatTree* flat_tree, + uniform local struct PrimRefMeta* primref_meta, + uniform local struct AABB3f* boxes, + uniform ushort flat_tree_root, + uniform struct DFSArgs args, + uniform local uchar* masks + ) +{ + + + uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root ); + uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root ); + + varying ushort lane = get_sub_group_local_id(); + varying ushort sg_child_index = (lane < num_children) ? lane : 0; + + uniform ushort child_base = FlatTree_GetFirstChild( flat_tree, flat_tree_root ); + + varying struct AABB sg_box4; + if (FlatTree_IsLeafNode( flat_tree, flat_tree_root )) + { + // fetch AABBs for primrefs + sg_box4 = args.primref_buffer[ PrimRefMeta_GetInputIndex( &primref_meta[child_base + sg_child_index] ) ]; + + } + else + { + // fetch AABBs for child nodes + sg_box4.lower.xyz = AABB3f_load_lower( &boxes[child_base+sg_child_index] ); + sg_box4.upper.xyz = AABB3f_load_upper( &boxes[child_base+sg_child_index] ); + } + + + struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) ); + + uniform int offset; + uniform uint child_type; + if ( is_leaf ) + { + char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); + + leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes; + + offset = (int)(leaf_mem - (char*)qnode); + child_type = args.leaf_node_type; + } + else + { + struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) ); + offset = (int) ((char*)kid - (char*)qnode); + child_type = args.inner_node_type; + } + offset = offset >> 6; + + if (child_type == NODE_TYPE_INSTANCE) + { + uint instanceMask = PrimRefMeta_GetInstanceMask( &primref_meta[child_base + sg_child_index] ); + subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? 
instanceMask : 0 ); + } + else + { + uint mask = BVH_NODE_DEFAULT_MASK; + if( args.need_masks ) + mask = masks[flat_tree_root]; + + subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode, mask ); + } + + if ( args.need_backpointers ) + { + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); + uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root ); + uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root ); + back_pointers[idx] = bp; + } + + /* + // TODO_OPT: Eventually this section should also handle leaf splitting due to mixed primref types + // For now this is done by the leaf creation pipeline, but that path should probably be refactored + // such that all inner node creation is done in one place + + uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root ); + uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root ); + + varying ushort lane = get_sub_group_local_id(); + varying ushort sg_child_index = (lane < num_children) ? lane : 0; + + varying local struct DFSPrimRefAABB* sg_box = FlatTree_GetChildAABB( flat_tree, prim_refs, flat_tree_root, sg_child_index ); + + varying struct AABB sg_box4 = PrimRefSet_ConvertAABB( prim_refs, sg_box ); + + struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) ); + + uniform int offset; + uniform uint child_type; + if ( is_leaf ) + { + char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); + + leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes; + + offset = (int)(leaf_mem - (char*)qnode); + child_type = args.leaf_node_type; + } + else + { + struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) ); + offset = (int) ((char*)kid - (char*)qnode); + child_type = args.inner_node_type; + } + offset = offset >> 6; + + if (child_type == NODE_TYPE_INSTANCE) + { + uint instanceMask = PrimRefSet_GetInstanceMask( prim_refs, FlatTree_GetPrimRefStart(flat_tree, flat_tree_root) + lane ); + subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? 
instanceMask : 0 ); + } + else + subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode ); + + if ( args.need_backpointers ) + { + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); + uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root ); + uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root ); + back_pointers[idx] = bp; + } + */ +} + +/* +GRL_INLINE void SUBGROUP_DFS_RefitAndWriteOutFlatTree( + uniform local struct FlatTree* flat_tree, + uniform local struct PrimRefSet* prim_refs, + uniform local struct FlatTreeScheduler* scheduler, + uniform struct DFSArgs args) +{ + + uniform ushort state = STATE_SCHEDULE_REFIT; + uniform ushort node_index = 0; + uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree); + + { + LOOP_TRIPWIRE_INIT; + + bool active = true; + bool continue_refit = false; + while (1) + { + if (active) + { + if (continue_refit || SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index)) + { + continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index); + } + else + { + active = false; + if (get_sub_group_local_id() == 0) + atomic_dec(&scheduler->active_subgroups); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); // finish all atomics + if (scheduler->active_subgroups == 0) + break; + barrier(CLK_LOCAL_MEM_FENCE); // finish all checks.. prevent race between thread which loops around and thread which doesn't + + LOOP_TRIPWIRE_INCREMENT(200); + } + } + + for (uint i = get_sub_group_id(); i < num_nodes; i += get_num_sub_groups()) + SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, i, args); + + barrier(CLK_LOCAL_MEM_FENCE); + + + // JDB: Version below attempts to interleave refit and qnode write-out + // This could theoretically reduce thread idle time, but it is more complex and does more atomics for scheduling + +#if 0 + // after we've constructed the flat tree (phase 1), there are two things that need to happen: + // PHASE 2: Refit the flat tree, computing all of the node ABBs + // PHASE 3: Write the nodes out to memory + // + // all of this is sub-group centric. Different subgroups can execute phases 2 and 3 concurrently + // + + // TODO_OPT: The scheduling algorithm might need to be re-thought. + // Fused EUs are very hard to reason about. It's possible that by scheduling independent + // SGs in this way we would lose a lot of performance due to fused EU serialization. 
+ // Needs to be tested experimentally if such a thing is possible + + uniform ushort state = STATE_SCHEDULE_REFIT; + uniform ushort node_index = 0; + uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree); + + LOOP_TRIPWIRE_INIT; + + do + { + // barrier necessary to protect access to scheduler->active_subgroups + barrier(CLK_LOCAL_MEM_FENCE); + + if (state == STATE_SCHEDULE_REFIT) + { + if (SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index)) + state = STATE_REFIT; + else + state = STATE_SCHEDULE_WRITEOUT; // fallthrough + } + if (state == STATE_SCHEDULE_WRITEOUT) + { + state = SUBGROUP_FlatTreeScheduler_GetWriteOutTask(scheduler, num_nodes, &node_index); + if (state == STATE_DONE) + SUBGROUP_FlatTreeScheduler_SubGroupDone(scheduler); + } + + + // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask' + // Note that in theory we could have the write-out tasks spin until the refit tasks clear, which would make this barrier unnecessary + // However, we cannot do this safely on SKUs which do not support independent subgroup forward progress. + barrier(CLK_LOCAL_MEM_FENCE); + + if (state == STATE_REFIT) + { + uniform ushort prev_node = node_index; + uniform bool continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index); + + SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut(scheduler, prev_node); + + if (!continue_refit) + state = STATE_SCHEDULE_REFIT; + } + else if (state == STATE_WRITEOUT) + { + SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, node_index, args); + state = STATE_SCHEDULE_WRITEOUT; + } + // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask' + barrier(CLK_LOCAL_MEM_FENCE); + + LOOP_TRIPWIRE_INCREMENT(200); + + } while (scheduler->active_subgroups > 0); + +#endif +} +*/ + +GRL_INLINE void DFS_CreatePrimRefSet( struct DFSArgs args, + local struct PrimRefSet* prim_refs ) +{ + ushort id = get_local_id( 0 ); + ushort num_primrefs = args.num_primrefs; + + + PrimRef ref; + struct AABB3f local_aabb; + if ( id < num_primrefs ) + { + ref = args.primref_buffer[id]; + AABB3f_set_lower( &local_aabb, ref.lower.xyz ); + AABB3f_set_upper( &local_aabb, ref.upper.xyz ); + } + else + { + AABB3f_init( &local_aabb ); + } + + AABB3f_atomic_merge_localBB_nocheck( &prim_refs->root_aabb, &local_aabb ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( id < num_primrefs ) + PrimRefSet_SetPrimRef_FullPrecision( prim_refs, ref, id ); +} + + + +struct BVHBuildLocals +{ + float Al[DFS_WG_SIZE]; + float Ar[DFS_WG_SIZE]; + uchar2 axis_and_left_count[ DFS_WG_SIZE ]; + uint sah[DFS_WG_SIZE]; + uint num_active_threads; +}; + + +GRL_INLINE void DFS_ConstructBVH2( local struct LocalBVH2* bvh2, + local struct PrimRefSet* prim_refs, + ushort num_prims, + local struct BVHBuildLocals* locals ) +{ + ushort tid = get_local_id( 0 ); + + ushort bvh2_root = 0; + ushort prim_range_start = 0; + ushort primref_position = tid; + + bool active_thread = tid < num_prims; + float root_area = PrimRefSet_GetMaxAABBArea( prim_refs ); + float area_scale = DFS_BVH2_AREA_QUANT / root_area; + + locals->num_active_threads = num_prims; + barrier( CLK_LOCAL_MEM_FENCE ); + + LOOP_TRIPWIRE_INIT; + + do + { + if(active_thread && prim_range_start == primref_position) + locals->sah[primref_position] = UINT_MAX; + + if ( active_thread ) + { + local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); + + // each thread evaluates a possible split candidate. 
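+            // (Reader's note -- sketch of the scheme, not normative.)  Every live thread owns one primref, and all
+            // threads of a subtree share the contiguous range [prim_range_start, prim_range_start + num_prims).
+            // Each thread proposes its own primref as the split candidate: per axis the (unnormalized) centroid
+            // c = lower[axis] + upper[axis] is the splitting coordinate, the range is partitioned around it, and
+            // the candidate is scored with the SAH surrogate
+            //     cost = Al * Nl + Ar * Nr
+            // (Al/Ar = half-areas of the left/right bounds, Nl/Nr = prim counts).  The cheapest candidate in the
+            // range wins via the atomic_min further below; ties are broken deterministically by packing the axis
+            // and the candidate's position into the low bits of the float bit pattern.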
Scan primrefs and compute sah cost + // do this axis-by-axis to keep register pressure low + float best_sah = INFINITY; + ushort best_axis = 3; + ushort best_count = 0; + float best_al = INFINITY; + float best_ar = INFINITY; + + struct DFSPrimRefAABB box_left[3]; + struct DFSPrimRefAABB box_right[3]; + float CSplit[3]; + ushort count_left[3]; + + for ( ushort axis = 0; axis < 3; axis++ ) + { + DFSPrimRefAABB_init( &box_left[axis] ); + DFSPrimRefAABB_init( &box_right[axis] ); + + CSplit[axis] = my_box->lower[axis] + my_box->upper[axis]; + count_left[axis] = 0; + } + + // scan primrefs in our subtree and partition using this thread's prim as a split plane + { + struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start ); + + for ( ushort p = 1; p < num_prims; p++ ) + { + struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration + + for( ushort axis = 0; axis < 3; axis++ ) + { + float c = box.lower[axis] + box.upper[axis]; + + if ( c < CSplit[axis] ) + { + // this primitive is to our left. + DFSPrimRefAABB_extend( &box_left[axis], &box ); + count_left[axis]++; + } + else + { + // this primitive is to our right + DFSPrimRefAABB_extend( &box_right[axis], &box ); + } + } + + box = next_box; + } + + // last iteration without preloading box + for( ushort axis = 0; axis < 3; axis++ ) + { + float c = box.lower[axis] + box.upper[axis]; + + if ( c < CSplit[axis] ) + { + // this primitive is to our left. + DFSPrimRefAABB_extend( &box_left[axis], &box ); + count_left[axis]++; + } + else + { + // this primitive is to our right + DFSPrimRefAABB_extend( &box_right[axis], &box ); + } + } + } + + for ( ushort axis = 0; axis < 3; axis++ ) + { + float Al = DFSPrimRefAABB_halfArea( &box_left[axis] ); + float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] ); + + // Avoid NANs in SAH calculation in the corner case where all prims go right + // In this case we set Al=Ar, because such a split will only be selected if all primrefs + // are co-incident.. 
In that case, we will fall back to split-in-the-middle and both subtrees + // should store the same quantized area value + if ( count_left[axis] == 0 ) + Al = Ar; + + // compute sah cost + ushort count_right = num_prims - count_left[axis]; + float sah = Ar * count_right + Al * count_left[axis]; + + // keep this split if it is better than the previous one, or if the previous one was a corner-case + if ( sah < best_sah || best_count == 0 ) + { + // yes, keep it + best_axis = axis; + best_sah = sah; + best_count = count_left[axis]; + best_al = Al; + best_ar = Ar; + } + } + + + // write split information to SLM + locals->Al[primref_position] = best_al; + locals->Ar[primref_position] = best_ar; + locals->axis_and_left_count[primref_position].x = best_axis; + locals->axis_and_left_count[primref_position].y = best_count; + + uint sah = as_uint(best_sah); + // break ties by axis to ensure deterministic split selection + // otherwise builder can produce non-deterministic tree structure run to run + // based on the ordering of primitives (which can vary due to non-determinism in atomic counters) + // Embed split axis and index into sah value; compute min over sah and max over axis + sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | primref_position ); + + // reduce on split candidates in our local subtree and decide the best one + atomic_min_local( &locals->sah[ prim_range_start ], sah); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + ushort split_index = locals->sah[ prim_range_start ] & 255; + ushort split_axis = locals->axis_and_left_count[split_index].x; + ushort split_left_count = locals->axis_and_left_count[split_index].y; + float split_al = locals->Al[split_index]; + float split_ar = locals->Ar[split_index]; + + if ( (primref_position == prim_range_start) && active_thread ) + { + // first thread in a given subtree creates the inner node + ushort quantized_left_area = convert_ushort_rtn( split_al * area_scale ); + ushort quantized_right_area = convert_ushort_rtn( split_ar * area_scale ); + ushort start_left = prim_range_start; + ushort start_right = prim_range_start + split_left_count; + if ( split_left_count == 0 ) + start_right = start_left + (num_prims / 2); // handle split-in-the-middle case + + LocalBVH2_CreateInnerNode( bvh2, bvh2_root, + start_left, start_right, + quantized_left_area, quantized_right_area ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + struct DFSPrimRef ref; + ushort new_primref_position; + + if ( active_thread ) + { + ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); + bool go_left; + + if ( split_left_count == 0 ) + { + // We chose a split with no left-side prims + // This will only happen if all primrefs are located in the exact same position + // In that case, fall back to split-in-the-middle + split_left_count = (num_prims / 2); + go_left = (primref_position - prim_range_start < split_left_count); + } + else + { + // determine what side of the split this thread's primref belongs on + local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); + local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index ); + float c = my_box->lower[split_axis] + my_box->upper[split_axis]; + float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis]; + go_left = c < Csplit; + } + + // adjust state variables for next loop iteration + bvh2_root = (go_left) ? kids.x : kids.y; + num_prims = (go_left) ? split_left_count : (num_prims - split_left_count); + prim_range_start = (go_left) ? 
prim_range_start : prim_range_start + split_left_count; + + // determine the new primref position by incrementing a counter in the destination subtree + new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root ); + + // load our primref from its previous position + ref = PrimRefSet_GetPrimRef( prim_refs, primref_position ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( active_thread ) + { + // write our primref into its sorted position + PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position ); + primref_position = new_primref_position; + + // deactivate all threads whose subtrees are small enough to form a leaf + if ( num_prims <= TREE_ARITY ) + { + active_thread = false; + atomic_dec_local( &locals->num_active_threads ); + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + LOOP_TRIPWIRE_INCREMENT( 50 ); + + + } while ( locals->num_active_threads > 0 ); + + +} + + + +// fast path for #prims <= TREE_ARITY +GRL_INLINE void Trivial_DFS( struct DFSArgs args ) +{ + + ushort tid = get_local_id( 0 ); + + PrimRef myRef; + AABB_init( &myRef ); + if( tid < args.num_primrefs ) + myRef = args.primref_buffer[tid]; + + uint node_offset; + if ( tid == 0 ) + node_offset = 64*allocate_inner_nodes( args.bvh_base, 1 ); + node_offset = sub_group_broadcast(node_offset,0); + + char* bvh_mem = (char*) args.bvh_base; + struct QBVHNodeN* qnode = (struct QBVHNodeN*) (bvh_mem + node_offset); + + uint child_type = args.leaf_node_type; + uint prim_base = args.bvh_base->quadLeafStart*64 ; + + char* leaf_mem = bvh_mem + prim_base; + int offset = (int)( leaf_mem - (char*)qnode ); + + if (child_type == NODE_TYPE_INSTANCE) + { + subgroup_setInstanceQBVHNodeN( offset >> 6, &myRef, args.num_primrefs, qnode, tid < args.num_primrefs ? PRIMREF_instanceMask(&myRef) : 0 ); + } + else + subgroup_setQBVHNodeN( offset >> 6, child_type, &myRef, args.num_primrefs, qnode, BVH_NODE_DEFAULT_MASK ); + + if ( tid < args.num_primrefs ) + { + global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs; + uint bp = node_offset; + + // TODO_OPT: Leaf creation pipeline can be made simpler by having a sideband buffer containing + // fatleaf index + position in fatleaf for each primref, instead of forcing leaf creation shader to reconstruct it + // should also probably do the fat-leaf splitting here + args.primref_buffer[tid] = myRef; + args.primref_index_buffer[tid] = tid; + + primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN); + + if ( tid == 0 && args.need_backpointers ) + { + uint bp = ((uint)-1) << 6; + bp |= (args.num_primrefs) << 3; + *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) = bp; + } + } +} + + + + + +void SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( uniform local struct FlatTree* flat_tree, + uniform local struct FlatTreeScheduler* flat_scheduler, + uniform local struct AABB3f* boxes, + uniform local struct PrimRefMeta* primref_meta, + uniform global struct AABB* primref_buffer, + uniform local uchar* masks, + bool need_masks ) + +{ + uniform int num_levels = (int) flat_scheduler->num_levels; + varying ushort lane = get_sub_group_local_id(); + + // iterate over depth levels in the tree... 
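+    // (Reader's note, non-normative.)  level_ordered_nodes / level_start / level_count were filled at the end of
+    // SUBGROUP_DFS_BuildFlatTree by walking each node's parent_index chain to find its depth and bucketing the
+    // nodes per level.  Walking the buckets from the deepest level upward (with the work group synchronizing
+    // between levels) means every child box -- or, for leaf nodes, primref box -- is final before its parent is
+    // reduced, so the levels act as a bottom-up refit schedule,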
deepest to shallowest + for (uniform int level = num_levels - 1; level >= 0; level--) + { + // loop over a range of flattree nodes at this level, one node per sub-group + // TODO_OPT: Try and enable this code to process two nodes in a SIMD16 subgroup + uniform ushort level_start = flat_scheduler->level_start[level]; + uniform ushort level_node_count = flat_scheduler->level_count[level]; + + for (uniform ushort i = get_sub_group_id(); i < level_node_count; i += get_num_sub_groups()) + { + uniform ushort node_index = flat_scheduler->level_ordered_nodes[ level_start + i ]; + + varying struct AABB box; + AABB_init(&box); + + uniform uint child_base = FlatTree_GetFirstChild( flat_tree, node_index ); + uniform uint num_children = FlatTree_GetNumChildren( flat_tree, node_index ); + varying uint child_index = child_base + ((laneflat_tree, args ); + else if ( get_sub_group_id() == 2 ) + SUBGROUP_LocalBVH2_Initialize( &slm->u.s1.bvh2, args.num_primrefs ); + else if ( get_sub_group_id() == 4 ) + SUBGROUP_FlatTreeScheduler_Initialize( &slm->flat_scheduler ); + else if ( get_sub_group_id() == 6 ) + SUBGROUP_PrimRefSet_Initialize( &slm->u.s1.prim_refs ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + // load the PrimRefs + DFS_CreatePrimRefSet( args, &slm->u.s1.prim_refs ); + + // build the BVH2 + DFS_ConstructBVH2( &slm->u.s1.bvh2, &slm->u.s1.prim_refs, args.num_primrefs, &slm->u.s1.bvh2_locals ); + + // copy out metadata for primrefs now that they have been sorted + if( tid < args.num_primrefs ) + { + slm->primitive_meta[tid] = PrimRefSet_GetMeta( &slm->u.s1.prim_refs, tid ); + } + barrier( CLK_LOCAL_MEM_FENCE ); + + // collapse into a FlatTree + SUBGROUP_DFS_BuildFlatTree( &slm->u.s1.bvh2, &slm->flat_tree, &slm->flat_scheduler ); + + // allocate output QBVH6 nodes + if ( get_local_id( 0 ) == 0 ) + FlatTree_AllocateQNodes( &slm->flat_tree, args ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( &slm->flat_tree, &slm->flat_scheduler, &slm->u.s2.boxes[0], slm->primitive_meta, args.primref_buffer, slm->u.s2.masks, args.need_masks ); + + //FlatTree_Printf( &slm->flat_tree ); + //FlatTree_check_boxes ( &slm->flat_tree, args.primref_buffer, &slm->u.s2.boxes[0], slm->primitive_meta ); + + SUBGROUP_DFS_WriteNodes( &slm->flat_tree, &slm->u.s2.boxes[0], slm->primitive_meta, args, slm->u.s2.masks ); + + + // generate sorted primref index buffer and backpointers to feed the leaf creation pipeilne + if ( tid < args.num_primrefs ) + { + uint input_index = PrimRefMeta_GetInputIndex(&slm->primitive_meta[tid]); + + uint bp = FlatTree_GetPrimRefBackPointer( &slm->flat_tree, tid ); + global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs; + + args.primref_index_buffer[tid] = input_index; + + primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN); + + if ( tid == 0 && args.need_backpointers ) + { + *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) |= ((uint)-1) << 6; + } + } +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void DFS( global struct Globals* globals, + global char* bvh_mem, + global PrimRef* primref_buffer, + global uint* primref_index_buffer, + uint alloc_backpointers + ) +{ + struct DFSArgs args; + args.bvh_base = (global struct BVHBase*) bvh_mem; + args.leaf_node_type = globals->leafPrimType; + args.inner_node_type = NODE_TYPE_INTERNAL; + args.leaf_size_in_bytes = globals->leafSize; + args.primref_buffer = 
primref_buffer; + args.need_backpointers = alloc_backpointers != 0; + args.num_primrefs = globals->numPrimitives; + args.primref_index_buffer = primref_index_buffer; + args.need_masks = args.leaf_node_type == NODE_TYPE_INSTANCE; + + if ( args.num_primrefs <= TREE_ARITY ) + { + // TODO_OPT: This decision should be made using indirect dispatch + if( get_sub_group_id() == 0 ) + Trivial_DFS( args ); + return; + } + + local struct Single_WG_build_SLM slm; + + execute_single_WG_build( args, &slm ); +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void DFS_single_wg( + global struct Globals* globals, + global char* bvh_mem, + global PrimRef* primref_buffer, + global uint* primref_index_buffer, + uint sah_flags +) +{ + struct DFSArgs args; + args.bvh_base = (global struct BVHBase*) bvh_mem; + args.leaf_node_type = globals->leafPrimType; + args.inner_node_type = NODE_TYPE_INTERNAL; + args.leaf_size_in_bytes = globals->leafSize; + args.primref_buffer = primref_buffer; + args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; + args.num_primrefs = globals->numPrimitives; + args.primref_index_buffer = primref_index_buffer; + args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS; + + local struct Single_WG_build_SLM slm; + + execute_single_WG_build( args, &slm ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void DFS_trivial( + global struct Globals* globals, + global char* bvh_mem, + global PrimRef* primref_buffer, + global uint* primref_index_buffer, + uint sah_flags +) +{ + struct DFSArgs args; + args.bvh_base = (global struct BVHBase*) bvh_mem; + args.leaf_node_type = globals->leafPrimType; + args.inner_node_type = NODE_TYPE_INTERNAL; + args.leaf_size_in_bytes = globals->leafSize; + args.primref_buffer = primref_buffer; + args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; + args.num_primrefs = globals->numPrimitives; + args.primref_index_buffer = primref_index_buffer; + args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS; + + Trivial_DFS( args ); +} + + +struct DFSArgs dfs_args_from_sah_globals( global struct SAHBuildGlobals* sah_globals ) +{ + struct DFSArgs args; + args.bvh_base = (global struct BVHBase*) sah_globals->p_bvh_base; + args.leaf_node_type = sah_globals->leaf_type; + args.inner_node_type = NODE_TYPE_INTERNAL; + args.leaf_size_in_bytes = sah_globals->leaf_size; + args.primref_buffer = (global PrimRef*) sah_globals->p_primrefs_buffer; + args.need_backpointers = sah_globals->flags & SAH_FLAG_NEED_BACKPOINTERS; + args.num_primrefs = sah_globals->num_primrefs; + args.primref_index_buffer = (global uint*) sah_globals->p_primref_index_buffers; + args.need_masks = sah_globals->flags & SAH_FLAG_NEED_MASKS; + + return args; +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(DFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void DFS_single_wg_batchable( + global struct SAHBuildGlobals* globals_buffer, + global struct VContextScheduler* scheduler +) +{ + global struct SAHBuildGlobals* sah_globals = globals_buffer + scheduler->num_trivial_builds + get_group_id(0); + + struct DFSArgs args = dfs_args_from_sah_globals( sah_globals ); + + local struct Single_WG_build_SLM slm; + + execute_single_WG_build(args, &slm); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) 
+__attribute__((intel_reqd_sub_group_size(16))) +kernel void DFS_trivial_batchable( + global struct SAHBuildGlobals* globals_buffer +) +{ + global struct SAHBuildGlobals* sah_globals = globals_buffer + get_group_id(0); + + struct DFSArgs args = dfs_args_from_sah_globals(sah_globals); + + Trivial_DFS(args); +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl b/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl new file mode 100644 index 00000000000..bb220b30612 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl @@ -0,0 +1,357 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" +#include "instance.h" + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(32, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel +primref_to_quads(global struct Globals *globals, + global struct AABB *primref, + global char *primref_index, + global char *bvh_mem, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + const uint stride, + const uint offset, + const uint allow_update) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart ); + uint quadIndicesStart = bvh->quadIndicesDataStart; + + const uint numPrimitives = globals->numPrimitives; + uint i = get_group_id( 0 ) * get_local_size( 0 ) + get_local_id(0); + if (i < numPrimitives) + { + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + + const uint primrefID = *(uint *)(primref_index + i * stride + offset); + + const uint geomID = PRIMREF_geomID(&primref[primrefID]); + const uint primID0 = PRIMREF_primID0(&primref[primrefID]); + const uint primID1 = PRIMREF_primID1(&primref[primrefID]); + const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]); + + const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0); + const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1); + + const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1); + + uint vertex_stride = geomDesc[geomID].Desc.Triangles.VertexBufferByteStride; + + const uint4 indices = q.a; + + const uint mask = 0xff; // FIXME: hardcoded mask + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices); + + uint j0 = q.lb.x; + uint j1 = q.lb.y; + uint j2 = q.lb.z; + uint shaderIndex = (mask << 24) | geomID; + uint geomIndex = geomID | (geomFlags << 30); + uint primIndex0 = primID0; + const uint delta = primID1 - primID0; + const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4)); + uint primIndex1Delta = delta | (j << 16) | (1 << 22); + + uint4 pack0 = (uint4)(shaderIndex, geomIndex, primIndex0, primIndex1Delta); + float4 pack1 = (float4)(vtx0.x, vtx0.y, vtx0.z, vtx1.x); + float4 pack2 = (float4)(vtx1.y, vtx1.z, vtx2.x, vtx2.y); + float4 pack3 = (float4)(vtx2.z, vtx3.x, vtx3.y, vtx3.z); + + global uint4* dst = (global uint4*)&quads[i]; + store_uint4_L1WB_L3WB(dst, 0, pack0); + store_uint4_L1WB_L3WB(dst, 1, as_uint4(pack1)); + store_uint4_L1WB_L3WB(dst, 2, as_uint4(pack2)); + store_uint4_L1WB_L3WB(dst, 3, as_uint4(pack3)); + + if(allow_update) + { + global uint4* vertex_indice_ptr = (global uint4*)(((char*)bvh) + (64u * quadIndicesStart + 32 * i)); + + uint4 pack_indices = (uint4) ( indices.x , indices.y, indices.z, indices.w ); + + store_uint4_L1WB_L3WB( vertex_indice_ptr, 0, pack0 ); + store_uint4_L1WB_L3WB( vertex_indice_ptr, 1, 
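+            // (reader's note) ... second dword4 of the side-band record: the four vertex indices scaled to byte
+            // offsets (index * VertexBufferByteStride), stored next to the pack0 header above -- presumably what
+            // a later update/refit pass re-reads to fetch fresh vertex positions when allow_update is set.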
pack_indices * vertex_stride); + } + + if (i == 0) + bvh->quadLeafCur += numPrimitives ; + } + + + +#if 0 + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart ); + + const uint numPrimitives = globals->numPrimitives; + const uint startID = get_group_id( 0 ) * get_local_size( 0 ); + const uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives); + + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + const uint primrefID = *(uint *)(primref_index + i * stride + offset); + + const uint geomID = PRIMREF_geomID(&primref[primrefID]); + const uint primID0 = PRIMREF_primID0(&primref[primrefID]); + const uint primID1 = PRIMREF_primID1(&primref[primrefID]); + const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]); + + const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0); + const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1); + + const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1); + + const uint4 indices = q.a; + const uint mask = 0xff; // FIXME: hardcoded mask + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices); + + setQuad(&quads[i], (float4)(vtx0,0), (float4)(vtx1,0), (float4)(vtx2,0), (float4)(vtx3,0), q.lb.x, q.lb.y, q.lb.z, geomID, primID0, primID1, mask, geomFlags ); + } + + if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0) + bvh->quadLeafCur += numPrimitives ; +#endif +} + +GRL_INLINE void create_procedural_leaf(global struct Globals *globals, + global struct AABB *primref, + local uint *primrefids, + uint numProcedurals, + struct QBVHNodeN *qnode, + global char *bvh_mem, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + if (get_local_id(0) >= 8) + return; + + global struct BVHBase* bvh_base = (global struct BVHBase*)bvh_mem; + + /* first read geomID of all primitives */ + uint primrefID = -1; + uint geomID = -1; + uint geomFlags = 0; + if (get_local_id(0) < numProcedurals) + { + primrefID = primrefids[get_local_id(0)]; + geomID = PRIMREF_geomID(&primref[primrefID]); + geomFlags = PRIMREF_geomFlags( &primref[primrefID] ); + } + + // cannot sort by geomID as bounds in parent node are then wrong + //ulong geomID_primrefID = (((ulong)geomID) << 32) | ((ulong)primrefID); + //geomID_primrefID = sort8_ascending_ulong(geomID_primrefID); + //geomID = geomID_primrefID >> 32; + //primrefID = geomID_primrefID; + + /* We have to split at geomID boundaries into multiple leaves. This + * block calculates the lane where a leaf starts and ends. */ + const uint geomIDprev = intel_sub_group_shuffle_up(0xFFFFFFFFu, geomID, 1u); + const uint geomIDnext = intel_sub_group_shuffle_down(geomID, 0xFFFFFFFFu, 1u); + const uint leaf_start = geomIDprev != geomID; + const uint leaf_end = geomIDnext != geomID; + const uint leaf_start_next = intel_sub_group_shuffle_down(leaf_start, 0u, 1u); + + /* This computes which leaf a lane processes. E.g. form geomID = + * [3,3,4,4,4,0] we get leaf_id = [0,0,1,1,1,2] */ + //const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); // FIXME: exclusive? + + /* This computes the n'th primitive a lane processes inside its + * leaf. For the example above we compute leaf_prim = + * [0,1,0,1,2,0]. */ + const uint leaf_prim = get_local_id(0) - sub_group_scan_inclusive_max(leaf_start ? 
get_local_id(0) : 0); + + /* from here on we allocate data and write to memory, thus only + * lanes that process a primitive should continue. */ + if (get_local_id(0) >= numProcedurals) + return; + + /* Here we allocate a single memory block for each required + * ProceduralLeaf node. We do this from a single lane to ensure + * the allocation is contiguous. */ + uint leaf_base_offset = 0; + uint n_leafs = sub_group_reduce_add(leaf_start); + if (get_local_id(0) == 0) + leaf_base_offset = allocate_procedural_leaves( bvh_base, n_leafs ); + leaf_base_offset = sub_group_broadcast(leaf_base_offset, 0); + + /* Compute the leaf offset for each lane. */ + uint leaf_offset = leaf_base_offset + sub_group_scan_inclusive_add(leaf_start) - 1; + + struct ProceduralLeaf *pleaf = ((global struct ProceduralLeaf *)(bvh_mem)) + leaf_offset; + + /* write the procedural leaf headers */ + if (leaf_end) + { + pleaf->leafDesc.shaderIndex_geomMask = 0xFF000000 | (geomID & 0x00FFFFFF); // FIXME: use accessor function. Future extensions may have shaderIndex != geomID + pleaf->leafDesc.geomIndex_flags = geomID | (geomFlags<<30); // FIXME: Use setter function + pleaf->DW1 = 0xFFFFFFF0 | (leaf_prim + 1); // !!! + } + /* write the procedural leaf primIDs */ + pleaf->_primIndex[leaf_prim] = PRIMREF_primID0(&primref[primrefID]); + + /* update leaf node offset inside parent node */ + if (get_local_id(0) == 0) + { + QBVH6Node_set_offset(qnode, pleaf); + QBVH6Node_set_type(qnode, NODE_TYPE_PROCEDURAL); + } + + /* Let parent node children point to proper procedural leaf block + * and primitive. */ + qnode->childData[get_local_id(0)] = leaf_start_next | (leaf_prim << 2); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +GRL_ANNOTATE_BIG_REG_REQ +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +primref_to_procedurals(global struct Globals *globals, + global struct AABB *primref, + global char *primref_index, + global char *bvh_mem, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + const uint stride, + const uint offset) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + + const uint numPrimitives = globals->numPrimitives; + uint startID = get_group_id( 0 ) * get_local_size( 0 ); + uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives); + + uint offset1 = stride * globals->numPrimitives; + if (stride == 8) + offset1 = 4; + + uint prev_start_back_pointer = startID == 0 ? -1 : *(uint *)(primref_index + (startID-1) * stride + offset1); + /* start at leaf start */ + while (startID < numPrimitives) + { + const uint back_pointer = *(uint *)(primref_index + startID * stride + offset1); + if (back_pointer != prev_start_back_pointer) + break; + startID++; + } + + uint prev_end_back_pointer = *(uint *)(primref_index + (endID-1) * stride + offset1); + /* end at next leaf start */ + while (endID < numPrimitives) + { + const uint back_pointer = *(uint *)(primref_index + endID * stride + offset1); + if (back_pointer != prev_end_back_pointer) + break; + endID++; + } + + local uint procedurals[16]; + + for (uint lid = startID + get_local_id(0); lid < endID + get_local_id(0);) + { + /* load leaf start points and back_pointer */ + const uint primrefID = *(uint *)(primref_index + lid * stride + offset); + uint back_pointer = *(uint *)(primref_index + lid * stride + offset1); + uint prev_back_pointer = get_local_id(0) == 0 ? 
-1 : *(uint *)(primref_index + (lid-1) * stride + offset1); + + const uint leaf_start = back_pointer != prev_back_pointer; + uint leaf_start_back_pointer = sub_group_broadcast(back_pointer, 0); + + /* compute number of primitives inside the leaf starting at lid */ + const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); + uint numPrimitives = 0; + if (back_pointer == leaf_start_back_pointer && lid < endID) + numPrimitives = sub_group_reduce_add(1); + numPrimitives = sub_group_broadcast(numPrimitives, 0); + + procedurals[get_local_id(0)] = primrefID; + + struct QBVHNodeN *qnode = (struct QBVHNodeN *)bvh_mem + back_pointer; + + create_procedural_leaf(globals, primref, procedurals, numPrimitives, qnode, bvh_mem, geomDesc); + + lid += numPrimitives; + } +} + +GRL_INLINE void create_HW_instance_leaf( + global struct BVHBase* bvh, + global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc, + uint dstLeafId, + uint instanceIndex, + uint rootNodeByteOffset, + uint instanceMask) +{ + /* convert DXR instance to instance leaf node */ + global struct HwInstanceLeaf* leaves = (__global struct HwInstanceLeaf*)BVHBase_quadLeaves(bvh); + HwInstanceLeaf_Constructor(&leaves[dstLeafId], instDesc, instanceIndex, rootNodeByteOffset, instanceMask); +} + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel create_HW_instance_nodes( + global const struct Globals *globals, + global char *primref_index, + global struct AABB *primref, + global struct BVHBase *bvh, + global struct GRL_RAYTRACING_INSTANCE_DESC *src_instances, + uint32_t stride, + uint32_t offset) +{ + uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id(); + uint num_prims = globals->numPrimitives; + if (dstLeafId >= num_prims) + return; + if( dstLeafId == 0 ) + bvh->instanceLeafEnd += 2*num_prims; + + /* get instance ID */ + const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset); + const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]); + const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]); + const uint instMask = PRIMREF_instanceMask(&primref[primrefID]); + create_HW_instance_leaf(bvh, &src_instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask ); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel create_HW_instance_nodes_pointers( + global const struct Globals *globals, + global char *primref_index, + global struct AABB *primref, + global struct BVHBase *bvh, + global void *instances_in, + uint32_t stride, + uint32_t offset) +{ + uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id(); + uint num_prims = globals->numPrimitives; + if (dstLeafId >= num_prims) + return; + if (dstLeafId == 0) + bvh->instanceLeafEnd += 2 * num_prims; + + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + /* get instance ID */ + const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset); + const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]); + const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]); + const uint instMask = PRIMREF_instanceMask(&primref[primrefID]); + create_HW_instance_leaf(bvh, instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask ); +} diff --git 
a/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl b/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl new file mode 100644 index 00000000000..bc9cf590f51 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl @@ -0,0 +1,556 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" + +#define GRID_SIZE 1024 + +/* + This presplit item contains for each primitive a number of splits to + perform (priority) and the primref index. + */ + +struct PresplitItem +{ + unsigned int index; + float priority; +}; + +/* + + This function splits a line v0->v1 at position pos in dimension dim + and merges the bounds for the left and right line segments into + lbounds and rbounds. + + */ + +GRL_INLINE void splitLine(const uint dim, + const float pos, + const float4 v0, + const float4 v1, + struct AABB *lbounds, + struct AABB *rbounds) +{ + const float v0d = v0[dim]; + const float v1d = v1[dim]; + + /* this point is on left side */ + if (v0d <= pos) + AABB_extend_point(lbounds, v0); + + /* this point is on right side */ + if (v0d >= pos) + AABB_extend_point(rbounds, v0); + + /* the edge crosses the splitting location */ + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) + { + const float f = (pos - v0d) / (v1d - v0d); + const float4 c = f * (v1 - v0) + v0; + AABB_extend_point(lbounds, c); + AABB_extend_point(rbounds, c); + } +} + +/* + + This function splits a clipped triangle v0,v1,v2 with bounds prim at + position pos in dimension dim and merges the bounds for the left and + right clipped triangle fragments into lbounds and rbounds. + + */ + +GRL_INLINE void splitTriangle(struct AABB *prim, + const uint dim, + const float pos, + const float4 v0, + const float4 v1, + const float4 v2, + struct AABB *lbounds, + struct AABB *rbounds) +{ + /* clip each triangle edge */ + splitLine(dim, pos, v0, v1, lbounds, rbounds); + splitLine(dim, pos, v1, v2, lbounds, rbounds); + splitLine(dim, pos, v2, v0, lbounds, rbounds); + + /* the triangle itself was clipped already, thus clip against triangle bounds */ + AABB_intersect(lbounds, prim); + AABB_intersect(rbounds, prim); +} + +float calculate_priority(struct AABB *prim, global GRL_RAYTRACING_GEOMETRY_DESC *geom) +{ + /* calculate projected area of first triangles */ + const uint primID0 = PRIMREF_primID0(prim); + const uint3 tri0 = GRL_load_triangle(geom, primID0); + const float4 av0 = GRL_load_vertex(geom, tri0.x); + const float4 av1 = GRL_load_vertex(geom, tri0.y); + const float4 av2 = GRL_load_vertex(geom, tri0.z); + const float area_tri0 = areaProjectedTriangle(av0, av1, av2); + + /* calculate projected area of second triangle */ + const uint primID1 = PRIMREF_primID1(prim); + const uint3 tri1 = GRL_load_triangle(geom, primID1); + const float4 bv0 = GRL_load_vertex(geom, tri1.x); + const float4 bv1 = GRL_load_vertex(geom, tri1.y); + const float4 bv2 = GRL_load_vertex(geom, tri1.z); + const float area_tri1 = areaProjectedTriangle(bv0, bv1, bv2); + + /* as priority we use the AABB area */ + const float area_aabb = AABB_halfArea(prim); + float priority = area_aabb; + + /* prefer triangles with a large potential SAH gain. 
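+       (Reader's note, non-normative.)  Concretely:
+           priority = area(primref AABB) * min( 4, area(primref AABB) / max(1e-12, area(tri0) + area(tri1)) )
+       so a quad whose box is much larger than the projected area of its two triangles (long, thin or diagonal
+       geometry, where splitting recovers the most surface area) gets boosted, capped at 4x.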
*/ + const float area_tris = area_tri0 + area_tri1; + const float area_ratio = min(4.0f, area_aabb / max(1E-12f, area_tris)); + priority *= area_ratio; + + /* ignore too small primitives */ + //const float4 size = AABB_size(prim); + //const float max_size = max(size.x,max(size.y,size.z)); + //if (max_size < 0.5f*max_scene_size/GRID_SIZE) + // priority = 0.0f; + + return priority; +} + +/* + + This kernel calculates for each primitive an estimated splitting priority. + + */ + + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel compute_num_presplits(global struct Globals *globals, + global struct BVHBase* bvh_base, + global struct AABB *primref, + global struct PresplitItem *presplit, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + //assert(sizeof(PresplitItem) == sizeof_PresplitItem); + + /* calculate the range of primitives each work group should process */ + const uint numPrimitives = globals->numPrimitives; + const uint startID = (get_group_id(0) + 0) * numPrimitives / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numPrimitives / get_num_groups(0); + + /* get scene bounding box size */ + const float3 scene_size = AABB3f_size(&bvh_base->Meta.bounds); + const float max_scene_size = max(scene_size.x, max(scene_size.y, scene_size.z)); + + /* each work group iterates over its range of primitives */ + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + const uint geomID = PRIMREF_geomID(&primref[i]); + + /* splitting heuristic for triangles */ + if (GRL_is_triangle(&geomDesc[geomID])) + { + presplit[i].index = i; + presplit[i].priority = calculate_priority(&primref[i], &geomDesc[geomID]); + } + + /* splitting of procedurals is not supported */ + else if (GRL_is_procedural(&geomDesc[geomID])) + { + presplit[i].index = i; + presplit[i].priority = 0.0f; + } + + else + { + //assert(false); + } + } + + if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0) + globals->numOriginalPrimitives = globals->numPrimitives; +} + +/* + + This kernel computes the sum of all priorities. 
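+  Each work item serially accumulates a contiguous block of roughly N/J
+  priorities and the partial sums are combined with work_group_reduce_add.
+  The result is stored directly to globals->presplitPrioritySum, so the kernel
+  is presumably meant to be launched as a single work group.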
+ + */ + + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +priority_sum(global struct Globals *globals, + global struct PresplitItem *presplit, + uint numPrimitivesToSplit) +{ + const uint N = globals->numPrimitives; + const uint j = get_local_id(0); + const uint J = get_local_size(0); + const uint BLOCKSIZE = (N + J - 1) / J; + const uint start = min((j + 0) * BLOCKSIZE, N); + const uint end = min((j + 1) * BLOCKSIZE, N); + + float prioritySum = 0; + for (uint i = start; i < end; i++) + prioritySum += presplit[i].priority; + + prioritySum = work_group_reduce_add(prioritySum); + globals->presplitPrioritySum = prioritySum; + +#if 0 + work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + float scale = 1.0f; + for (uint i = 0; i < 10; i++) + { + //if (j == 0) + //printf("prioritySum = %f\n",scale*prioritySum); + + uint numSplits = 0; + for (uint i = start; i < end; i++) + numSplits += presplit[i].priority / (scale*prioritySum)*numPrimitivesToSplit; + + numSplits = work_group_reduce_add(numSplits); + + if (numSplits > numPrimitivesToSplit) + break; + + //if (j == 0) + // printf("numSplits = %i (%i)\n",numSplits,numPrimitivesToSplit); + + globals->presplitPrioritySum = scale * prioritySum; + scale -= 0.05f; + } +#endif +} + +GRL_INLINE void heapify_down(struct AABB *array, uint size) +{ + /* we start at the root */ + uint cur_node_id = 0; + struct AABB *cur_node = array; + + while (true) + { + int larger_node_id = cur_node_id; + struct AABB *larger_node = cur_node; + + /* check if left child is largest */ + const int left_node_id = 2 * cur_node_id + 1; + struct AABB *left_node = &array[left_node_id]; + if (left_node_id < size && AABB_halfArea(left_node) > AABB_halfArea(larger_node)) + { + larger_node_id = left_node_id; + larger_node = left_node; + } + + /* check if right child is largest */ + const int right_node_id = 2 * cur_node_id + 2; + struct AABB *right_node = &array[right_node_id]; + if (right_node_id < size && AABB_halfArea(right_node) > AABB_halfArea(larger_node)) + { + larger_node_id = right_node_id; + larger_node = right_node; + } + + /* if current node is largest heap property is fulfilled and we are done */ + if (larger_node_id == cur_node_id) + break; + + /* otherwise we swap cur and largest */ + struct AABB tmp = *cur_node; + *cur_node = *larger_node; + *larger_node = tmp; + + /* we continue downwards with the largest node */ + cur_node_id = larger_node_id; + cur_node = larger_node; + } +} + +GRL_INLINE void heapify_up(struct AABB *array, uint cur_node_id) +{ + /* stop if we start at the root */ + if (cur_node_id == 0) + return; + + struct AABB *cur_node = &array[cur_node_id]; + + /* we loop until we reach the root node */ + while (cur_node_id) + { + /* get parent node */ + uint parent_node_id = (cur_node_id - 1) / 2; + struct AABB *parent_node = &array[parent_node_id]; + + /* if parent is larger then current we fulfill the heap property and can terminate */ + if (AABB_halfArea(parent_node) > AABB_halfArea(cur_node)) + break; + + /* otherwise we swap cur and parent */ + struct AABB tmp = *cur_node; + *cur_node = *parent_node; + *parent_node = tmp; + + /* and continue upwards */ + cur_node_id = parent_node_id; + cur_node = parent_node; + } +} + +/* splits a quad primref */ +GRL_INLINE void splitQuadPrimRef(global GRL_RAYTRACING_GEOMETRY_DESC *geom, + struct AABB *cur, uint dim, float fsplit, + struct AABB *left, struct AABB *right) +{ + /* left and right bounds 
to compute */ + AABB_init(left); + AABB_init(right); + + /* load first triangle and split it */ + const uint primID0 = PRIMREF_primID0(cur); + const uint3 tri0 = GRL_load_triangle(geom, primID0); + const float4 av0 = GRL_load_vertex(geom, tri0.x); + const float4 av1 = GRL_load_vertex(geom, tri0.y); + const float4 av2 = GRL_load_vertex(geom, tri0.z); + splitTriangle(cur, dim, fsplit, av0, av1, av2, left, right); + + /* load second triangle and split it */ + const uint primID1 = PRIMREF_primID1(cur); + const uint3 tri1 = GRL_load_triangle(geom, primID1); + const float4 bv0 = GRL_load_vertex(geom, tri1.x); + const float4 bv1 = GRL_load_vertex(geom, tri1.y); + const float4 bv2 = GRL_load_vertex(geom, tri1.z); + splitTriangle(cur, dim, fsplit, bv0, bv1, bv2, left, right); + + /* copy the PrimRef payload into left and right */ + left->lower.w = cur->lower.w; + left->upper.w = cur->upper.w; + right->lower.w = cur->lower.w; + right->upper.w = cur->upper.w; +} + +/* + + This kernel performs the actual pre-splitting. It selects split + locations based on an implicit octree over the scene. + + */ + +#define USE_HEAP 0 +#define HEAP_SIZE 32u + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +//__attribute__((intel_reqd_sub_group_size(16))) +void kernel +perform_presplits(global struct Globals *globals, + global struct BVHBase* bvh_base, + global struct AABB *primref, + global struct PresplitItem *presplit, + global char *bvh_mem, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + uint numPrimitivesToSplit) +{ + /* calculate the range of primitives each work group should process */ + const uint numPrimitives = globals->numPrimitives; + int pstart = globals->numOriginalPrimitives - numPrimitivesToSplit; + pstart = max(0, pstart); + const uint numPrimitivesToProcess = globals->numPrimitives - pstart; + const uint startID = (get_group_id(0) + 0) * numPrimitivesToProcess / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numPrimitivesToProcess / get_num_groups(0); + + /* calculates the 3D grid */ + float4 grid_base; + grid_base.xyz = AABB3f_load_lower( &bvh_base->Meta.bounds ); + grid_base.w = 0; + + float4 grid_extend; + grid_extend.xyz = AABB3f_size(&bvh_base->Meta.bounds); + grid_extend.w=0; + + grid_extend = max(grid_extend.x, max(grid_extend.y, grid_extend.z)); + const float4 grid_scale = select(GRID_SIZE / grid_extend, 0.0f, grid_extend == 0.0f); + const float inv_grid_size = 1.0f / GRID_SIZE; + + /* we have to update centroid bounds */ + struct AABB centroidBounds; + AABB_init(¢roidBounds); + + /* initialize heap */ + struct AABB heap[HEAP_SIZE]; + uint heap_size = 0; + + /* each work group iterates over its range of primitives */ + for (uint j = startID + get_local_id(0); j < endID; j += get_local_size(0)) + { + /* array is in ascending order */ + //const uint ID = numPrimitives-1-j; + const uint ID = pstart + j; + const float prob = presplit[ID].priority; + const uint i = presplit[ID].index; + const uint geomID = PRIMREF_geomID(&primref[i]); + + /* do not split primitives with low splitting priority */ + if (prob <= 0.0f) + continue; + + /* we support splitting only for triangles */ + if (!GRL_is_triangle(&geomDesc[geomID])) + continue; + + /* compute number of split primitives to produce */ + uint numSplitPrims = prob / globals->presplitPrioritySum * numPrimitivesToSplit; + numSplitPrims = min(HEAP_SIZE, numSplitPrims); + + /* stop if not splits have to get performed */ + if (numSplitPrims <= 1) + continue; + + /* add primref to heap 
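+           and keep splitting until numSplitPrims fragments exist: each
+           iteration takes the fragment at heap_pos, snaps its bounds to the
+           GRID_SIZE grid, and uses the highest differing bit of the two
+           corners' morton codes to pick the finest implicit-octree boundary
+           separating them (level = diff / 3, axis = diff % 3), then splits the
+           fragment at that plane and keeps both halves. Fragments whose
+           corners land in the same grid cell (equal codes) cannot be split
+           further and are skipped. With USE_HEAP disabled the "heap" is just a
+           flat array walked round-robin via heap_pos.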
*/ + heap[0] = primref[i]; + heap_size = 1; + uint heap_pos = 0; + + /* iterate until all splits are done */ + uint prims = 1; + uint last_heap_size = heap_size; + while (prims < numSplitPrims) + { + /* map the primitive bounds to the grid */ + const float4 lower = heap[heap_pos].lower; + const float4 upper = heap[heap_pos].upper; + const float4 glower = (lower - grid_base) * grid_scale + 0.2f; + const float4 gupper = (upper - grid_base) * grid_scale - 0.2f; + uint4 ilower = convert_uint4_rtz(glower); + uint4 iupper = convert_uint4_rtz(gupper); + + /* this ignores dimensions that are empty */ + if (glower.x >= gupper.x) + iupper.x = ilower.x; + if (glower.y >= gupper.y) + iupper.y = ilower.y; + if (glower.z >= gupper.z) + iupper.z = ilower.z; + + /* Now we compute a morton code for the lower and upper grid + * coordinates. */ + const uint lower_code = bitInterleave3D(ilower); + const uint upper_code = bitInterleave3D(iupper); + + /* if all bits are equal then we cannot split */ + if (lower_code == upper_code) + { +#if !USE_HEAP + prims++; // !!!!!!! + + heap_pos++; + if (heap_pos == last_heap_size) + { + heap_pos = 0; + last_heap_size = heap_size; + } + continue; +#else + if (heap_size == 1) + break; + + const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1); + primref[offset] = heap[heap_pos]; + + presplit[offset].index = offset; + presplit[offset].priority = calculate_priority(&heap[heap_pos], &geomDesc[geomID]); + + heap[0] = heap[--heap_size]; + heapify_down(heap, heap_size); + continue; +#endif + } + + /* We find the bit position of the first differing bit from the + * top down. This bit indicates a split position inside an + * implicit octree. */ + const uint diff = 31 - clz(lower_code ^ upper_code); + + /* compute octree level and dimension to perform the split in */ + const uint level = diff / 3; + const uint dim = diff % 3; + + /* now we compute the grid position of the split */ + const uint isplit = iupper[dim] & ~((1 << level) - 1); + + /* compute world space position of split */ + const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend[dim]; + + /* split primref into left and right part */ + struct AABB left, right; + splitQuadPrimRef(&geomDesc[geomID], &heap[heap_pos], dim, fsplit, &left, &right); + prims++; + + /* update centroid bounds */ + AABB_extend_point(¢roidBounds, AABB_centroid2(&left)); + AABB_extend_point(¢roidBounds, AABB_centroid2(&right)); + +#if !USE_HEAP + + heap[heap_pos] = left; + heap[heap_size] = right; + heap_size++; + + heap_pos++; + if (heap_pos == last_heap_size) + { + heap_pos = 0; + last_heap_size = heap_size; + } +#else + + /* insert left element into heap */ + heap[0] = left; + heapify_down(heap, heap_size); + + /* insert right element into heap */ + heap[heap_size] = right; + heapify_up(heap, heap_size); + + heap_size++; +#endif + } + + /* copy primities to primref array */ + primref[i] = heap[0]; + + presplit[ID].index = i; + presplit[ID].priority = calculate_priority(&heap[0], &geomDesc[geomID]); + + for (uint k = 1; k < heap_size; k++) + { + const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1); + primref[offset] = heap[k]; + + presplit[offset].index = offset; + presplit[offset].priority = calculate_priority(&heap[k], &geomDesc[geomID]); + } + } + + /* merge centroid bounds into global bounds */ + centroidBounds = AABB_sub_group_reduce(¢roidBounds); + if (get_sub_group_local_id() == 0) + AABB_global_atomic_merge(&globals->centroidBounds, ¢roidBounds); + + 
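+    // Every work item in this group must finish splitting and the sub-group
+    // bounds merges above before the group takes part in the Globals_OnFinish
+    // handshake below; presumably only the last group to arrive sees it return
+    // true and then publishes the grown primitive count.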
work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + /* update number of primitives on finish */ + if (Globals_OnFinish(globals)) + { + globals->numPrimitives = globals->numPrimitives + globals->numSplittedPrimitives; + globals->numSplittedPrimitives = 0; + + /* update first build record */ // FIXME: should be done in builder itself + global struct BuildRecord *record = (global struct BuildRecord *)(bvh_mem + bvh_base->quadLeafStart*64); + record->end = globals->numPrimitives; + } +} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.cl b/src/intel/vulkan/grl/gpu/bvh_build_primref.cl new file mode 100644 index 00000000000..1dd9a3cdd92 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_primref.cl @@ -0,0 +1,674 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" +#include "instance.h" + +#include "bvh_build_primref.h" + +//#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable +//int sub_group_non_uniform_any(int predicate); + +#define WINDOW_SIZE 16 + +/* Representation of two merged triangles. */ +struct QuadIndices +{ + uint primID0, primID1; + uint v0, v1, v2, v3; +}; + +/* + + This function calculates a PrimRef from a merged quad and writes + this PrimRef to memory. + + */ +GRL_INLINE void create_prim_ref(const uint geomID, + const struct QuadIndices quad, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + struct AABB *geometryBounds, + struct AABB *centroidBounds, + global uint *numPrimitives, + global struct AABB *primref) +{ + + /* load quad vertices */ + const float4 vtx0 = GRL_load_vertex(geomDesc, quad.v0); // FIXME: these multiple load_vertex calls should get merged + const float4 vtx1 = GRL_load_vertex(geomDesc, quad.v1); + const float4 vtx2 = GRL_load_vertex(geomDesc, quad.v2); + const float4 vtx3 = GRL_load_vertex(geomDesc, quad.v3); + + /* calculate bounds for quad */ + float4 lower = min(min(vtx0, vtx1), min(vtx2, vtx3)); + float4 upper = max(max(vtx0, vtx1), max(vtx2, vtx3)); + + /* extend geometry and centroid bounds */ + const float4 centroid2 = lower + upper; + AABB_extendlu(geometryBounds, lower, upper); + AABB_extendlu(centroidBounds, centroid2, centroid2); + + PrimRef ref; + PRIMREF_setAABB( &ref, lower.xyz, upper.xyz ); + PRIMREF_setQuadMetaData( &ref, quad.primID0, quad.primID1, geomID, GRL_get_Flags( geomDesc ) ); + + /* store primref to memory */ + const uint offset = atomic_add_global(numPrimitives, 1); + primref[offset] = ref; +} + +/* + + This function calculates a PrimRef from a procedural and writes + this PrimRef to memory. 
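+  As in the quad path above, the output slot is reserved with an atomic
+  increment of numPrimitives, so primrefs end up tightly packed but in a
+  nondeterministic order.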
+ + */ +GRL_INLINE void create_prim_ref_procedural(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + const uint geomID, + const uint primID, + struct AABB *geometryBounds, + struct AABB *centroidBounds, + global uint *numPrimitives, + global struct AABB *primref) +{ + /* load aabb from memory */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); + + /* extend geometry and centroid bounds */ + float4 lower = (float4)(aabb.MinX, aabb.MinY, aabb.MinZ, 0.0f); + float4 upper = (float4)(aabb.MaxX, aabb.MaxY, aabb.MaxZ, 0.0f); + const float4 centroid2 = lower + upper; + AABB_extendlu(geometryBounds, lower, upper); + AABB_extendlu(centroidBounds, centroid2, centroid2); + + /* encode geomID, primID */ + uint geomFlags = GRL_get_Flags(&geomDesc[geomID]); + + PrimRef ref; + PRIMREF_setAABB( &ref, lower.xyz, upper.xyz ); + PRIMREF_setProceduralMetaData( &ref, geomID, primID, geomFlags ); + + /* store primref to memory */ + const uint offset = atomic_add_global(numPrimitives, 1); + primref[offset] = ref; +} + +/* + + This function performs a binary search to calculate the geomID and + primID of the i'th primitive of the scene. For the search a + prefix_sum array is used that stores for each location j the sum of + the number of primitives of all meshes k with k 1) + { + const uint m = (l + r) / 2; + k = prefix_sum[m]; + if (k <= i) + { + l = m; + } + else if (i < k) + { + r = m; + } + } + + struct GeomPrimID id; + id.geomID = l; + id.primID = i - prefix_sum[l]; + return id; +} + +/* + + Checks if a vertex contains only finite floating point numbers. + + */ + +GRL_INLINE bool isfinite_vertex(float4 vtx) +{ + return isfinite(vtx.x) && isfinite(vtx.y) && isfinite(vtx.z); +} + + +/* + Create primrefs from array of instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +primrefs_from_DXR_instances(global struct Globals *globals, + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + uint numInstances, + global struct AABB *primrefs, + uint allowUpdate) +{ + const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < numInstances) + { + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + 0, + allowUpdate); + } +} + +/* + Create primrefs from array of instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel +primrefs_from_DXR_instances_indirect(global struct Globals *globals, + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + global struct IndirectBuildRangeInfo* indirect_data, + global struct AABB *primrefs, + uint allowUpdate) +{ + // TODO: On DG2, we have 8 dwords of 'inline data' which can be pushed + // directly to the kernel. THe rest of the kernel args are pulled using + // loads from memory. 
It may be more efficient to put 'numInstances' and + // 'allowUpdate' into 'globals' + + const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; + + if (instanceIndex < indirect_data->primitiveCount) + { + instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instances) + indirect_data->primitiveOffset); + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + 0, + allowUpdate); + } +} + +/* + Create primrefs from array of pointers to instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +primrefs_from_DXR_instances_pointers(global struct Globals *globals, + global struct BVHBase* bvh, + global void *instances_in, + uint numInstances, + global struct AABB *primrefs, + uint allowUpdate) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < numInstances) + { + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + 0, + allowUpdate); + } +} + +/* + Create primrefs from array of pointers to instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel +primrefs_from_DXR_instances_pointers_indirect(global struct Globals *globals, + global struct BVHBase* bvh, + global void *instances_in, + global struct AABB *primrefs, + global struct IndirectBuildRangeInfo* indirect_data, + uint allowUpdate) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; + + if (instanceIndex < indirect_data->primitiveCount) + { + instances = (global const struct GRL_RAYTRACING_INSTANCE_DESC**) + (((global char*)instances) + indirect_data->primitiveOffset); + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + 0, + allowUpdate); + } +} + + +/////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// + +bool can_pair( uint3 a, uint3 b ) +{ + bool match0 = any( a.xxx == b.xyz ) ? 1 : 0; + bool match1 = any( a.yyy == b.xyz ) ? 1 : 0; + bool match2 = any( a.zzz == b.xyz ) ? 1 : 0; + return (match0 + match1 + match2) >= 2; +} + +void reduce_bounds( + float3 lower, + float3 upper, + global struct Globals* globals, + global struct BVHBase* bvh ) +{ + + // reduce centroid bounds... 
make sure to exclude lanes with invalid AABBs + float3 cent = lower + upper; + float3 cent_lower = select( (float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper); + float3 cent_upper = select(-(float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper); + + // reduce geo bounds + AABB3f_atomic_merge_global_sub_group_lu( &bvh->Meta.bounds, lower, upper ); + AABB_global_atomic_merge_sub_group_lu(&globals->centroidBounds, cent_lower, cent_upper ); +} + + +struct TriState +{ + bool valid; + uint prim_index; + uint pairing; + uint3 indices; + float3 lower; + float3 upper; +}; + +#define NOT_PAIRED 0xffffffff + +void load_triangle_data(uniform global char* index_buffer, + uniform const uint index_format, + uniform global char* vertex_buffer, + uniform const uint vertex_format, + uniform const uint vertex_stride, + uniform global float* transform_buffer, + uniform uint total_vert_count, + struct TriState* state, + float4* v) +{ + state->indices = GRL_load_indices_from_buffer(index_buffer, index_format, state->prim_index ); + + const uint last_vertex = total_vert_count - 1; + const uint x = min(state->indices.x, last_vertex); + const uint y = min(state->indices.y, last_vertex); + const uint z = min(state->indices.z, last_vertex); + + GRL_load_triangle_vertices(vertex_buffer, vertex_format, vertex_stride, transform_buffer, x, y, z, v); +} + +struct TriState load_triangle( uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uniform uint base, + uniform uint num_prims, + uniform uint total_vert_count ) +{ + + struct TriState state; + state.pairing = NOT_PAIRED; + state.valid = false; + state.prim_index = base + get_sub_group_local_id(); + state.lower = (float3)(INFINITY, INFINITY, INFINITY); + state.upper = -(float3)(INFINITY, INFINITY, INFINITY); + + if (state.prim_index < num_prims) + { + state.valid = true; + float4 v[3]; + load_triangle_data((global char*)geomDesc->Desc.Triangles.pIndexBuffer, + geomDesc->Desc.Triangles.IndexFormat, + (global char*)geomDesc->Desc.Triangles.pVertexBuffer, + geomDesc->Desc.Triangles.VertexFormat, + geomDesc->Desc.Triangles.VertexBufferByteStride, + (global float*)geomDesc->Desc.Triangles.pTransformBuffer, + total_vert_count, + &state, + v); + + if (state.indices.x >= total_vert_count || state.indices.y >= total_vert_count || state.indices.z >= total_vert_count || + !isfinite_vertex(v[0]) || !isfinite_vertex(v[1]) || !isfinite_vertex(v[2]) || + state.indices.x == state.indices.y || state.indices.x == state.indices.z || state.indices.y == state.indices.z) + { + state.valid = false; + } + else + { + state.lower.xyz = min(v[2].xyz, min(v[1].xyz, v[0].xyz)); + state.upper.xyz = max(v[2].xyz, max(v[1].xyz, v[0].xyz)); + } + } + return state; +} + +void broadcast_triangles_local( struct TriState* state ) +{ + varying uint my_prim = state->prim_index; + varying uint my_pairing = state->pairing; + varying float3 my_lower = state->lower; + varying float3 my_upper = state->upper; + varying bool valid = state->valid; + varying uint3 indices = state->indices; + + for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++) + { + // don't broadcast invalid prims + if ( !sub_group_broadcast( valid, broadcast_lane ) ) + continue; + + uint broadcast_pairing = sub_group_broadcast(my_pairing, broadcast_lane); + uint broadcast_prim = sub_group_broadcast(my_prim, broadcast_lane); + + if (broadcast_pairing == NOT_PAIRED) + { + // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it + bool pairable = 
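+                // a lane other than the broadcaster can accept it only if it
+                // is itself valid, still unpaired, and shares at least two
+                // vertex indices with the broadcast triangle (see can_pair)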
false; + uint3 other_indices = sub_group_broadcast_uint3( indices, broadcast_lane ); + if (broadcast_prim != my_prim && my_pairing == NOT_PAIRED && valid ) + { + pairable = can_pair( indices, other_indices ); + } + + + uint pairable_lane = ctz(intel_sub_group_ballot(pairable)); + if (valid && pairable_lane < get_sub_group_size()) + { + // pair the broadcast primitive with the first lane that can accept it + float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane); + float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane); + if (get_sub_group_local_id() == pairable_lane) + { + my_pairing = broadcast_prim; + my_lower.xyz = min(my_lower.xyz, broadcast_lower); + my_upper.xyz = max(my_upper.xyz, broadcast_upper); + } + + // pair the broadcast primitive with the same that was paired to it + uint pairable_prim = sub_group_broadcast(my_pairing, pairable_lane); + if (get_sub_group_local_id() == broadcast_lane) + { + my_pairing = pairable_prim; + } + } + } + else + { + // + // if this lane was already paired with the broadcasting tri + // in an earlier loop iteration, then record the pairing in this lane's registers + float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane); + float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane); + if (broadcast_pairing == my_prim) + { + my_pairing = broadcast_prim; + my_lower.xyz = min(my_lower.xyz, broadcast_lower); + my_upper.xyz = max(my_upper.xyz, broadcast_upper); + } + } + } + + state->pairing = my_pairing; + state->lower = my_lower; + state->upper = my_upper; +} + + +void broadcast_triangles_nonlocal(struct TriState* state, const struct TriState* other ) +{ + varying uint my_prim = state->prim_index; + varying uint my_pairing = state->pairing; + varying float3 my_lower = state->lower; + varying float3 my_upper = state->upper; + varying bool valid = state->valid; + varying uint3 indices = state->indices; + + for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++) + { + // don't broadcast invalid prims + if (!sub_group_broadcast(other->valid, broadcast_lane)) + continue; + + uint broadcast_pairing = sub_group_broadcast(other->pairing, broadcast_lane); + uint broadcast_prim = sub_group_broadcast(other->prim_index, broadcast_lane); + + if (broadcast_pairing == NOT_PAIRED) + { + // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it + bool pairable = false; + if ( my_pairing == NOT_PAIRED && valid ) + { + uint3 other_indices = sub_group_broadcast_uint3(other->indices, broadcast_lane); + pairable = can_pair(indices, other_indices); + } + + // pair the broadcast primitive with the first lane that can accept it + uint pairable_mask = intel_sub_group_ballot(pairable); + if (valid && (ctz(pairable_mask) == get_sub_group_local_id())) + { + my_pairing = broadcast_prim; + my_lower.xyz = min(my_lower.xyz, sub_group_broadcast_float3(other->lower.xyz, broadcast_lane)); + my_upper.xyz = max(my_upper.xyz, sub_group_broadcast_float3(other->upper.xyz, broadcast_lane)); + } + } + + } + + state->pairing = my_pairing; + state->lower = my_lower; + state->upper = my_upper; +} + +GRL_INLINE void do_triangles_to_primrefs( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uint geomID_and_flags, + const uint num_prims) +{ + uint geomID = geomID_and_flags & 0x00ffffff; + uint geom_flags = geomID_and_flags >> 24; + 
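+    // geomID_and_flags packs the geometry index into the low 24 bits and the
+    // GRL geometry flags into the top 8 bits, e.g. geomID 5 with flags 0x1
+    // arrives as 0x01000005. Each 16-lane sub-group then loads 16 consecutive
+    // triangles and attempts to pair them into quads before emitting primrefs.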
uint prim_base = get_group_id(0) * get_local_size(0); + uint total_vert_count = GRL_get_triangles_VertexCount(geomDesc); + + struct TriState tri = load_triangle( geomDesc, prim_base, num_prims, total_vert_count ); + broadcast_triangles_local( &tri ); + + + // we will produce output if the lane creates a triangle (my_pairing == NOT_PAIRED) + // or for the lane corresponding to the larger of two triangles + bool will_write = (tri.pairing > tri.prim_index) && tri.valid; + uint write_mask = intel_sub_group_ballot(will_write); + uint write_offs = subgroup_bit_prefix_exclusive( write_mask ); + uint write_count = popcount(write_mask); + + // allocate space in primref buffer + uint write_base; + if( get_sub_group_local_id() == 0 ) + write_base = atomic_add_global( &globals->numPrimitives, write_count ); + write_offs += sub_group_broadcast( write_base, 0 ); + + uint primID0 = tri.prim_index; + uint primID1 = (tri.pairing != NOT_PAIRED) ? tri.pairing : tri.prim_index; + + if (will_write) + { + PrimRef ref; + PRIMREF_setAABB(&ref, tri.lower.xyz, tri.upper.xyz); + PRIMREF_setQuadMetaData(&ref, primID0, primID1, geomID, geom_flags); + uint8 val = (uint8)( + as_uint(ref.lower.x), as_uint(ref.lower.y), as_uint(ref.lower.z), as_uint(ref.lower.w), + as_uint(ref.upper.x), as_uint(ref.upper.y), as_uint(ref.upper.z), as_uint(ref.upper.w)); + store_uint8_L1WB_L3WB((global uint8*)(primref + write_offs), 0, val); + } + + reduce_bounds( tri.lower, tri.upper, globals, bvh ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +triangles_to_primrefs( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uint geomID_and_flags, + uint num_prims + ) +{ + do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +triangles_to_primrefs_indirect( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct IndirectBuildRangeInfo* indirect_data, + uint geomID_and_flags) +{ + const uint num_prims = indirect_data->primitiveCount; + do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); +} + +GRL_INLINE void do_procedurals_to_primrefs( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uint geomID_and_flags, + const uint num_prims) +{ + uint geomID = geomID_and_flags & 0x00ffffff; + uint geomFlags = geomID_and_flags >> 24; + + uint primID = get_group_id(0) * get_local_size(0) + get_sub_group_local_id(); + + bool create_primref = false; + float3 lower = (float3)(INFINITY, INFINITY, INFINITY); + float3 upper = -(float3)(INFINITY, INFINITY, INFINITY); + if (primID < num_prims) + { + /* check if procedural is valid */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(geomDesc, primID); + const bool valid_min = isfinite(aabb.MinX) && isfinite(aabb.MinY) && isfinite(aabb.MinZ); + const bool valid_max = isfinite(aabb.MaxX) && isfinite(aabb.MaxY) && isfinite(aabb.MaxZ); + if (valid_min & valid_max) + { + /* load aabb from memory */ + float3 l = (float3)(aabb.MinX, aabb.MinY, aabb.MinZ); + float3 u = (float3)(aabb.MaxX, aabb.MaxY, aabb.MaxZ); + + // convert degenerate boxes to points at the box 
centroid + lower = min( l, u ); + upper = max( l, u ); + + create_primref = true; + } + } + + uint write_mask = intel_sub_group_ballot(create_primref); + uint write_offs = subgroup_bit_prefix_exclusive(write_mask); + uint write_count = popcount(write_mask); + + // allocate space in primref buffer + uint write_base; + if (get_sub_group_local_id() == 0) + write_base = atomic_add_global(&globals->numPrimitives, write_count); + write_offs += sub_group_broadcast(write_base, 0); + + // write the primref + if (create_primref) + { + PrimRef ref; + PRIMREF_setAABB(&ref, lower.xyz, upper.xyz); + PRIMREF_setProceduralMetaData(&ref, geomID, primID, geomFlags); + primref[write_offs] = ref; + } + + reduce_bounds(lower, upper, globals, bvh); + +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +procedurals_to_primrefs( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uint geomID_and_flags, + uint num_prims + ) +{ + do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +procedurals_to_primrefs_indirect( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global const struct IndirectBuildRangeInfo* indirect_data, + uint geomID_and_flags + ) +{ + const uint num_prims = indirect_data->primitiveCount; + do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); +} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.h b/src/intel/vulkan/grl/gpu/bvh_build_primref.h new file mode 100644 index 00000000000..25e2d3df194 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_primref.h @@ -0,0 +1,246 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#if 0 +/* + +Create primrefs from array of instance descriptors. + +*/ + +void store_instance_primref( + global struct BVHBase* top_bvh, + global struct Globals* globals, + global PrimRef* primrefs, + bool alloc_primref, + PrimRef new_primref ) +{ + uint allocatePrimref = alloc_primref ? 1 : 0; + uint index = 0; + uint numAllocations = sub_group_reduce_add(allocatePrimref); + + if (get_sub_group_local_id() == 0) + { + index = atomic_add_global(&globals->numPrimitives, numAllocations); + } + + index = sub_group_broadcast(index, 0); + index = index + sub_group_scan_exclusive_add(allocatePrimref); + + if (allocatePrimref) + { + primrefs[index] = new_primref; + } + + struct AABB centroidBounds; + centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref); + struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref); + struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(¢roidBounds); + + if (get_sub_group_local_id() == 0) + { + AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz); + AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds); + } +} + + + +// Compute transformed blas AABB. 
Returns false if instance is degenerate +bool create_instance_primref( + PrimRef* ref_out, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, + global struct BVHBase* bvh, + uint instanceMask, + uint instanceIndex + ) +{ + struct AABB3f bbox; + bool alloc_primref = false; + uint rootNodeOffset = NO_NODE_OFFSET; + if (bvh != 0) + { + alloc_primref = true; + AABB3f AS_bounds = BVHBase_GetRootAABB(bvh); + + const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]); + const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]); + + if (!valid_min || !valid_max || instanceMask == 0) + { + // degenerated instance case + + // TODO this should be under if ( allocate backpointers ) + { + // we have to allocate the primref because this instance can be updated to non-degenerated + // take the origin of the instance as a bounding box. + + bbox.lower[0] = instance->Transform[3]; + bbox.lower[1] = instance->Transform[7]; + bbox.lower[2] = instance->Transform[11]; + bbox.upper[0] = instance->Transform[3]; + bbox.upper[1] = instance->Transform[7]; + bbox.upper[2] = instance->Transform[11]; + instanceMask = 0; + } + } + else + { + rootNodeOffset = BVH_ROOT_NODE_OFFSET; + float transformOverhead = 0.0f; + bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead); + } + } + + *ref_out = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, 0); + return alloc_primref; +} + +GRL_INLINE void primrefs_from_instances( + global struct Globals* globals, + global struct BVHBase* top_bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, + uint instanceIndex, + global struct AABB* primrefs) +{ + bool alloc_primref = false; + PrimRef new_primref; + AABB_init(&new_primref); + + if (instance) + { + uint mask = GRL_get_InstanceMask(instance); + global struct BVHBase* bvh = (global struct BVHBase*)instance->AccelerationStructure; + alloc_primref = create_instance_primref(&new_primref, instance, bvh, mask, instanceIndex); + } + + store_instance_primref(top_bvh, globals, primrefs, alloc_primref, new_primref); +} +#endif + +#if 1 +GRL_INLINE void primrefs_from_instances( + global struct Globals* globals, + global struct BVHBase* top_bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, + uint instanceIndex, + global struct AABB* primrefs, + global GRL_RAYTRACING_AABB* procedural_aabb, + uint allowUpdate + ) +{ + struct AABB3f bbox; + uint allocatePrimref = 0; + + uint rootNodeOffset = NO_NODE_OFFSET; + uint instanceMask = 0; + + bool is_procedural = (procedural_aabb != 0); + + if( instance ) + { + instanceMask = GRL_get_InstanceMask(instance) ; + if ( is_procedural ) + { + // procedural instance primref + allocatePrimref = 1; + + float3 lower = (float3)(procedural_aabb->MinX, procedural_aabb->MinY, procedural_aabb->MinZ); + float3 upper = (float3)(procedural_aabb->MaxX, procedural_aabb->MaxY, procedural_aabb->MaxZ); + + if (instanceMask == 0 || any(lower > upper)) + { + bbox.lower[0] = instance->Transform[3]; + bbox.lower[1] = instance->Transform[7]; + bbox.lower[2] = instance->Transform[11]; + bbox.upper[0] = instance->Transform[3]; + bbox.upper[1] = instance->Transform[7]; + bbox.upper[2] = instance->Transform[11]; + instanceMask = 0; + } + else + { + bbox = transform_aabb(lower, upper, instance->Transform); + } + } + else + { + 
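+            // For a non-procedural instance the primref is the BLAS root AABB
+            // transformed into world space. Degenerate instances (non-finite
+            // bounds or mask 0) only get a primref when updates are allowed;
+            // they are collapsed to a point at the instance origin with mask 0
+            // and no root offset, so traversal ignores them but a later update
+            // can restore them.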
// HW-instance primref + + global struct BVHBase* bvh = instance ? + (global struct BVHBase*)instance->AccelerationStructure : + 0; + + if (bvh != 0) + { + AABB3f AS_bounds = BVHBase_GetRootAABB(bvh); + + const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]); + const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]); + + + if (valid_min && valid_max && instanceMask != 0) + { + allocatePrimref = 1; + rootNodeOffset = BVH_ROOT_NODE_OFFSET; + float transformOverhead = 0.0f; + bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead); + } + else if (allowUpdate) + { + // degenerated instance case + // we have to allocate the primref because this instance can be updated to non-degenerated + // take the origin of the instance as a bounding box. + allocatePrimref = 1; + bbox.lower[0] = instance->Transform[3]; + bbox.lower[1] = instance->Transform[7]; + bbox.lower[2] = instance->Transform[11]; + bbox.upper[0] = instance->Transform[3]; + bbox.upper[1] = instance->Transform[7]; + bbox.upper[2] = instance->Transform[11]; + instanceMask = 0; + } + } + } + } + + uint index = 0; + uint numAllocations = sub_group_reduce_add(allocatePrimref); + + if (get_sub_group_local_id() == 0) + { + index = atomic_add_global(&globals->numPrimitives, numAllocations); + } + + index = sub_group_broadcast(index, 0); + index = index + sub_group_scan_exclusive_add(allocatePrimref); + + struct AABB new_primref; + struct AABB centroidBounds; + if (allocatePrimref) + { + new_primref = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, is_procedural); + primrefs[index] = new_primref; + centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref); + } + else + { + AABB_init(&new_primref); + AABB_init(¢roidBounds); + } + + + struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref); + struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(¢roidBounds); + + if (get_sub_group_local_id() == 0) + { + AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz); + AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds); + } +} +#endif diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.cl b/src/intel/vulkan/grl/gpu/bvh_build_refit.cl new file mode 100644 index 00000000000..bcda2fa54ec --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_refit.cl @@ -0,0 +1,491 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "bvh_build_refit.h" +#include "api_interface.h" +#include "common.h" + + + + + +#if 0 +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) +void kernel +update_instance_leaves( global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch +) +{ + uint num_leaves = BVHBase_GetNumHWInstanceLeaves( bvh ); + uint id = get_local_id( 0 ) + get_local_size( 0 ) * get_group_id( 0 ); + if ( id >= num_leaves ) + return; + + global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray = + (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray; + global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray = + (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr; + + global struct HwInstanceLeaf* leafs = (global 
struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh ); + + /* iterate over all children of the instance node and get their bounds */ + + uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex( &leafs[id] ); + global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL; + if ( dxrInstancesArray != NULL ) + instance = &instancesArray[instanceIdx]; + else + instance = instancesPtrArray[instanceIdx]; + + struct AffineSpace3f xfm = AffineSpace3f_load_row_major( instance->Transform ); + global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure; + struct AABB3f newSubtreeBounds = instanceBvh->Meta.bounds; + struct AABB3f bbox = AABB3f_transform( xfm, newSubtreeBounds ); // JDB TODO: Use faster abs-matrix method + + const bool valid_min = isfinite( bbox.lower[0] ) && isfinite( bbox.lower[1] ) && isfinite( bbox.lower[2] ); + const bool valid_max = isfinite( bbox.upper[0] ) && isfinite( bbox.upper[1] ) && isfinite( bbox.upper[2] ); + + uint mask = GRL_get_InstanceMask(instance); + + uint offset = instanceBvh->rootNodeOffset; + if ( !valid_min || !valid_max ) + { + bbox.lower[0] = xfm.p.x; + bbox.lower[1] = xfm.p.y; + bbox.lower[2] = xfm.p.z; + bbox.upper[0] = xfm.p.x; + bbox.upper[1] = xfm.p.y; + bbox.upper[2] = xfm.p.z; + offset = NO_NODE_OFFSET; + mask = 0; + } + + instance_aabb_scratch[id] = bbox; + + HwInstanceLeaf_Constructor( &leafs[id], instance, instanceIdx, offset, mask ); // TODO: No instance opening for refittable BVH +} +#endif + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +update_instance_leaves(global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch +) +{ + uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh); + uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); + if (id >= num_leaves) + return; + + DO_update_instance_leaves( + bvh, + dxrInstancesArray, + dxrInstancesPtr, + instance_aabb_scratch, + id, + 0 ); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +update_instance_leaves_indirect(global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch, + global struct IndirectBuildRangeInfo* indirect_data) +{ + uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh); + uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); + if (id >= num_leaves) + return; + + DO_update_instance_leaves( + bvh, + dxrInstancesArray + indirect_data->primitiveOffset, + dxrInstancesPtr, + instance_aabb_scratch, + id, + 0 ); +} + +#if 0 +/* + + This kernel refit a BVH. The algorithm iterates over all BVH nodes + to find all leaf nodes, which is where refitting starts. For these + leaf nodes bounds get recalculated and then propagates up the tree. + + One kernel instance considers a range of inner nodes as startpoints. 
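+  Propagation relies on the per-node backpointers: bits 6 and up hold the
+  parent index, bits 3..5 the child count, and bits 0..2 a counter of already
+  refitted children that refit_bottom_up bumps atomically, so only the work
+  item that finishes the last child of a node walks up to its parent and each
+  node is written exactly once.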
+ */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(8, 1, 1))) void kernel refit( + global struct BVHBase *bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, + global struct AABB3f* instance_leaf_aabbs ) +{ + /* here we temporarily store the bounds for the children of a node */ + struct AABB childrenAABB[BVH_NODE_N6]; + + /* get pointer to inner nodes and back pointers */ + global struct QBVHNodeN *inner_nodes = BVHBase_rootNode(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + /* construct range of nodes that each work group will process */ + const uint numInnerNodes = BVHBase_numNodes(bvh); + const uint startID = (get_group_id(0) + 0) * numInnerNodes / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numInnerNodes / get_num_groups(0); + + /* each workgroup iterates over its range of nodes */ + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + global struct QBVHNodeN* curNode = &inner_nodes[i]; + uint numChildren = refit_bottom(bvh, geosArray, + instance_leaf_aabbs, + curNode, + childrenAABB, + *InnerNode_GetBackPointer(backPointers, i)); + if (numChildren != 0) + { + /* update bounds of node */ + QBVHNodeN_setBounds(curNode, childrenAABB, numChildren); + + /* refit upper parts of the BVH */ + // TODO: this will not gonna work for mixed nodes + refit_bottom_up(curNode, bvh, childrenAABB, numChildren); + } + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(8, 1, 1))) +void kernel Find_refit_treelets( + global struct BVHBase* bvh, + global TreeletNodeData* treelets, + global uint* scratchStartpoints, + global uint* startpointAlloc) +{ + find_refit_treelets(bvh, + treelets, + scratchStartpoints, + startpointAlloc); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel Assign_refit_startpoints_to_treelets( + global struct BVHBase* bvh, + global TreeletNodeData* treelets, + global uint* scratchStartpoints) +{ + assign_refit_startpoints_to_treelets(bvh, treelets, scratchStartpoints); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(128, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel Finalize_treelets_in_groups( + global struct BVHBase* bvh, + global uint* scratchStartpoints ) +{ + local uint depths[FINALIZE_TREELETS_SLM_DEPTHS_SPACE]; + + finalize_treelets_in_groups(bvh, scratchStartpoints, depths); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel Refit_quads_tree_per_group(global SquashedInput* psqinputs) +{ + uint group_id = get_group_id(0); + SquashedInput sqinput = psqinputs[group_id]; + global struct BVHBase* bvh = sqinput.pBvh; + uint numLeaves = BVHBase_GetNumQuads(bvh); + global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh); + + global void* input = sqinput.pInput; + global struct AABB* bbox_scratch = sqinput.bbox_scratch; + + uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64; + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; + uint id = get_local_id(0); + + for (uint leaf_id = id; leaf_id < numLeaves; leaf_id += get_local_size(0)) + { + struct AABB theAABB; + refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB); + theAABB.lower.w = as_float(0xABBADEFFu); + bbox_scratch[leafsIndexOffset + leaf_id] = theAABB; + } +} + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL 
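+// Refit_quads: the quad-leaf refit work is spread over numGroupsExecuted
+// groups, each recomputing the AABBs of a slice of leaves into bbox_scratch
+// (tagged with 0xABBADEFF in lower.w). The first sub-group of group 0 also
+// reserves and fills one SquashedInputGroupDesc per bottom treelet so the
+// per-treelet pass that follows can be dispatched from that list.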
+__attribute__((reqd_work_group_size(32, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel Refit_quads( + global struct BVHBase* bvh, + global void* input, + global struct AABB* bbox_scratch, + uint numGroupsExecuted, + global SquashedInputGroupDesc* sqinput) +{ + uint numLeafs = BVHBase_GetNumQuads(bvh); + if (numLeafs == 0) return; + global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh); + + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; + uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64; + + uint numLeafsPerGr = (numLeafs + (numGroupsExecuted - 1)) / numGroupsExecuted; + + uint id_start = get_group_id(0) * numLeafsPerGr + get_local_id(0); + uint id_end = min(id_start + numLeafsPerGr, numLeafs); + for (uint id = id_start; id < id_end; id+= get_local_size(0)) + { + struct AABB theAABB; + refit_bottom_child_quad(leafs + id, geosArray, &theAABB); + theAABB.lower.w = as_float(0xABBADEFFu); + bbox_scratch[leafsIndexOffset + id] = theAABB; + } + + if (get_group_id(0) == 0 && get_local_id(0) < 16) + { + + uint groupnr; + uint treeletCnt = *BVHBase_GetRefitTreeletCntPtr(bvh); + if (get_sub_group_local_id() == 0) { + groupnr = atomic_add_global(&sqinput->totalNumGroups, treeletCnt); + } + groupnr = sub_group_broadcast(groupnr, 0); + for (uint subtree = get_sub_group_local_id(); subtree < treeletCnt; subtree += get_sub_group_size()) + { + uint gr = groupnr + subtree; + //printf("tree %llx, treelet %d/%d, grId %d, numStartpoints %d\n", bvh, subtree,treeletCnt, gr, BVHBase_GetRefitTreeletDescs(bvh)[subtree].numStartpoints); + sqinput[gr].bvh = (qword)bvh; + sqinput[gr].scratch = (qword)bbox_scratch; + sqinput[gr].groupInTree = subtree; + } + //if (get_local_id(0)==0 && treeletCnt > 1) + //{ + // printf("tree %llx, tip treelet %d/%d = numStartpoints %d depth %d\n", bvh, treeletCnt, treeletCnt, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].numStartpoints, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].maxDepth); + //} + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel +Refit_tree_per_group_quad( + global SquashedInput* psqinputs) +{ + uint group_id = get_group_id(0); + SquashedInput sqinput = psqinputs[group_id]; + global struct BVHBase* bvh = sqinput.pBvh; + global struct AABB* bbox_scratch = sqinput.bbox_scratch; + global void* pInput = sqinput.pInput; + local Treelet_by_single_group_locals loc; + + if (*BVHBase_GetRefitTreeletCntPtr(bvh) == 0) + return; + +#if REFIT_DEBUG_CHECKS + uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh); + if (bottoms_cnt != 1) { + if (get_local_id(0) == 0) + { + printf("Error: this tree has more than 1 treelets!\n"); + } + return; + } +#endif + + /* get pointer to inner nodes and back pointers */ + uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); + + // uniform per group + uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh); + + uint numLeafs = bvh->quadLeafCur - bvh->quadLeafStart; + + if (numLeafs == 0) { return; } + + uint numLeafsByOneThread = (numLeafs + (get_local_size(0) - 1)) / get_local_size(0); + + update_quads(bvh, pInput, bbox_scratch, get_local_id(0), numLeafsByOneThread); + + mem_fence_workgroup_default(); work_group_barrier(0); + + RefitTreelet trltDsc = *pTrltDsc; + + refit_treelet_by_single_group( + bbox_scratch, + &loc, + bvh, + trltDsc, + false, + true); + + if (trltDsc.maxDepth > 0) + { + 
mem_fence_workgroup_default(); work_group_barrier(0); + post_refit_encode_qnode_tree_per_group(bbox_scratch,bvh); + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel +Refit_treelet_per_group( + global SquashedInputGroupDesc* sqinput) +{ + uint group_id = get_group_id(0); + global struct AABB* bbox_scratch = (global struct AABB* )sqinput[group_id].scratch; + global struct BVHBase* bvh = (global struct BVHBase* )sqinput[group_id].bvh; + group_id = sqinput[group_id].groupInTree; + + /* get pointer to inner nodes and back pointers */ + uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); + + uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh); + + // uniform per group + uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh); + + bool should_we_process_treetip = true; + local Treelet_by_single_group_locals loc; + local bool* l_should_we_process_treetip = (local bool*)&loc; +#if REFIT_VERBOSE_LOG + if (group_id != 0) return; +#endif + + if (bottoms_cnt > 1) + { +#if REFIT_VERBOSE_LOG + for (; group_id < bottoms_cnt; group_id++) + { + if (get_local_id(0) == 0) { printf("\n ====== treelet %d ====== \n", group_id); } + work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, memory_scope_device); +#endif + bool rootProcThread = refit_treelet_by_single_group( + bbox_scratch, + &loc, + bvh, + pTrltDsc[group_id], + true, + false); + + // we have to make last group that finishes go up and process the treetip + if (rootProcThread) + { + + mem_fence_gpu_invalidate(); + uint finished_cnt = atomic_inc_global((global uint*) & bvh->refitTreeletCnt2); + should_we_process_treetip = finished_cnt + 1 == bottoms_cnt; + + * l_should_we_process_treetip = should_we_process_treetip; + + if (should_we_process_treetip) mem_fence_gpu_invalidate(); + } +#if REFIT_VERBOSE_LOG + } +#endif + work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group); + + should_we_process_treetip = *l_should_we_process_treetip; + } + + if (should_we_process_treetip) + { + //this group will process treetip + if (get_local_id(0) == 0) { bvh->refitTreeletCnt2 = 0; } + if (bottoms_cnt == 1) { bottoms_cnt = 0; } + refit_treelet_by_single_group( + bbox_scratch, + &loc, + bvh, + pTrltDsc[bottoms_cnt], + true, + true); + } +} + +/* + This kernel refit a BVH. The algorithm iterates over all BVH nodes + to find all leaf nodes, which is where refitting starts. For these + leaf nodes bounds get recalculated and then propagates up the tree. + + One kernel instance considers exactly one inner_node startpoint. + not range of inner nodes. 
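+  The dispatch is sized to the inner-node count and each work item handles a
+  single node; work items whose node is not a usable bottom startpoint (that
+  is, refit_bottom reports zero children) fall through without touching the
+  tree.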
+ */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(8, 1, 1))) void kernel +Refit_per_one_startpoint( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, + global struct AABB3f* instance_leaf_aabbs ) +{ + /* here we temporarily store the bounds for the children of a node */ + struct AABB childrenAABB[BVH_NODE_N6]; + + /* get pointer to inner nodes and back pointers */ + global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + /* get the inner node that we will consider as a bottom startpoint */ + const uint numInnerNodes = BVHBase_numNodes(bvh); + const uint innerNodeIdx = (get_group_id(0) + 0) * get_local_size(0) + get_local_id(0); + + if (innerNodeIdx >= numInnerNodes) return; + + global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx]; + uint numChildren = refit_bottom( + bvh, + geosArray, + instance_leaf_aabbs, + curNode, + childrenAABB, + *InnerNode_GetBackPointer(backPointers, innerNodeIdx)); + + if (numChildren != 0) + { + /* update bounds of node */ + QBVHNodeN_setBounds(curNode, childrenAABB, numChildren); + + /* refit upper parts of the BVH */ + /* TODO: this will not gonna work for mixed nodes */ + refit_bottom_up(curNode, bvh, childrenAABB, numChildren); + } +} + +#endif + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel +Refit_indirect_sg( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, + global struct AABB3f* instance_leaf_aabbs) +{ + DO_Refit_per_one_startpoint_sg(bvh, geosArray, instance_leaf_aabbs, 0); + +} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_refit.h new file mode 100644 index 00000000000..522a44b23a7 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_refit.h @@ -0,0 +1,546 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "common.h" +#include "api_interface.h" +#include "instance.h" +#include "GRLGen12.h" +#include "libs/lsc_intrinsics.h" + + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +DO_update_instance_leaves(global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch, + uint id , + global struct GRL_RAYTRACING_AABB* procedural_box +) +{ + + global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray = + (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray; + global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray = + (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr; + + global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh); + + + /* iterate over all children of the instance node and get their bounds */ + + uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex(&leafs[id]); + global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL; + if (dxrInstancesArray != NULL) + instance = &instancesArray[instanceIdx]; + else + instance = instancesPtrArray[instanceIdx]; + + uint mask = GRL_get_InstanceMask(instance); + uint offset = NO_NODE_OFFSET; + + struct AffineSpace3f xfm = AffineSpace3f_load_row_major(instance->Transform); + struct AABB3f bbox; + + if (procedural_box != 0) + { + bbox.lower[0] = procedural_box->MinX; + bbox.lower[1] = procedural_box->MinY; + bbox.lower[2] = procedural_box->MinZ; + bbox.upper[0] = procedural_box->MaxX; + bbox.upper[1] = 
procedural_box->MaxY; + bbox.upper[2] = procedural_box->MaxZ; + } + else + { + global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure; + bbox = instanceBvh->Meta.bounds; + offset = BVH_ROOT_NODE_OFFSET; + } + + + const bool valid_min = isfinite(bbox.lower[0]) && isfinite(bbox.lower[1]) && isfinite(bbox.lower[2]); + const bool valid_max = isfinite(bbox.upper[0]) && isfinite(bbox.upper[1]) && isfinite(bbox.upper[2]); + + if (!valid_min || !valid_max ) + { + bbox.lower[0] = xfm.p.x; + bbox.lower[1] = xfm.p.y; + bbox.lower[2] = xfm.p.z; + bbox.upper[0] = xfm.p.x; + bbox.upper[1] = xfm.p.y; + bbox.upper[2] = xfm.p.z; + offset = NO_NODE_OFFSET; + mask = 0; + } + else + { + bbox = AABB3f_transform(xfm, bbox); // JDB TODO: Use faster abs-matrix method + } + + instance_aabb_scratch[id] = bbox; + + HwInstanceLeaf_Constructor(&leafs[id], instance, instanceIdx, offset, mask); // TODO: No instance opening for refittable BVH +} + +/* + This function starts at some BVH node and refits all nodes upwards + to the root. At some node the algorithm only proceeds upwards if + all children of the current node have already been processed. This + is checked as each time a node is reached an atomic counter is + incremented, which will reach the number of children of the node at + some time. + */ + +GRL_INLINE void refit_bottom_up(global struct QBVHNodeN *qnode_start, // start node to refit (already processed) + global struct BVHBase *bvh, // pointer to BVH + struct AABB *childrenAABB, // temporary data to use + uint numChildrenTotal) +{ + global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + /* compute the index of the start node */ + uint curNodeIndex = qnode_start - nodeData; + + /* the start node got already processed, thus go to its parent node */ + curNodeIndex = *InnerNode_GetBackPointer(backPointers,curNodeIndex) >> 6; + + /* end at root node */ + while (curNodeIndex != 0x03FFFFFF) + { + /* increment refit counter that counts refitted children of current node */ + const uint parentPointer = 1 + atomic_inc_global( (__global uint *) InnerNode_GetBackPointer(backPointers, curNodeIndex)); + + /* if all children got refitted, then continue */ + const uint numChildrenRefitted = (parentPointer >> 0) & 0x7; + numChildrenTotal = (parentPointer >> 3) & 0x7; + if (numChildrenRefitted != numChildrenTotal) + return; + + /* reset refit counter for next refit */ + *InnerNode_GetBackPointer(backPointers, curNodeIndex) &= 0xfffffff8; + + /* get bounds of all children from child nodes directly */ + global struct QBVHNodeN *qnode = nodeData + curNodeIndex; + global struct QBVHNodeN *qnode_child = (global struct QBVHNodeN *)QBVHNodeN_childrenPointer(qnode); + for (uint k = 0; k < numChildrenTotal; k++) + childrenAABB[k] = getAABB_QBVHNodeN(qnode_child + k); + + /* update node bounds of all children */ + QBVHNodeN_setBounds(qnode, childrenAABB, numChildrenTotal); + + write_mem_fence(CLK_GLOBAL_MEM_FENCE); + + /* make parent node the current node */ + curNodeIndex = parentPointer >> 6; + } + + /* update QBVH6 bounds */ + struct AABB bounds; + AABB_init(&bounds); + + for (uint i = 0; i < numChildrenTotal; i++) + AABB_extend(&bounds, &childrenAABB[i]); + + setBVHBaseBounds(bvh, &bounds); +} + + +GRL_INLINE void SUBGROUP_refit_bottom_up( + uniform global struct QBVHNodeN* qnode_start, // start node to refit (already processed) + uniform global struct BVHBase* bvh, // pointer to BVH + varying struct AABB reduce_bounds, + 
uniform uint numChildrenTotal, + varying ushort lane, + varying ushort head_lane) +{ + uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + + /* compute the index of the start node */ + uniform uint curNodeIndex = qnode_start - nodeData; + + /* the start node got already processed, thus go to its parent node */ + uniform curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6; + + varying struct AABB childrenAABB; + + /* end at root node */ + while ( curNodeIndex != 0x03FFFFFF ) + { + mem_fence_gpu_invalidate(); + + /* increment refit counter that counts refitted children of current node */ + uniform uint parentPointer = 1; + if (lane == 0) + { + // acquire fence ensures that all previous writes complete before the atomic starts + parentPointer += atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, curNodeIndex)); + } + + parentPointer = intel_sub_group_shuffle( parentPointer, head_lane ); + + /* if all children got refitted, then continue */ + uniform uint numChildrenRefitted = (parentPointer >> 0) & 0x7; + numChildrenTotal = (parentPointer >> 3) & 0x7; + if ( numChildrenRefitted != numChildrenTotal ) + return; + + /* reset refit counter for next refit */ + if (lane == 0) + { + *InnerNode_GetBackPointer(backPointers, curNodeIndex) = (parentPointer & 0xfffffff8); + } + + /* get bounds of all children from child nodes directly */ + global struct QBVHNodeN* qnode = nodeData + curNodeIndex; + global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode ); + + varying ushort child_idx = (lane < numChildrenTotal) ? lane : 0; + childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx ); + + /* update node bounds of all children */ + reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB ); + reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, head_lane ); + + subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildrenTotal, lane); + + /* update node mask */ + uchar childrenMask = qnode_child[child_idx].instMask; + + qnode->instMask = sub_group_reduce_or_N6(childrenMask); + + /* make parent node the current node */ + curNodeIndex = parentPointer >> 6; + } + + /* update QBVH6 bounds */ + + if( lane == 0 ) + setBVHBaseBounds( bvh, &reduce_bounds ); +} + + +GRL_INLINE void quadCopyVertices( + const struct QuadLeaf* pQuad, + struct QuadLeaf* newQuad) +{ + const uint4* s = (const uint4*) & (pQuad->v[0][0]); + uint4* d = (uint4*) & (newQuad->v[0][0]); + const uint8* s2 = (const uint8*)(s+1); + uint8* d2 = (uint8*)(d+1); + *d = *s; + *d2 = *s2; +} + + +GRL_INLINE void get_updated_quad( + global const struct QuadLeaf* pQuad, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDescs, + struct QuadLeaf* newQuad) +{ + struct QuadLeaf tempQuad; + + // fetch non vtx data; + { + uint4* tempQuad4U = (uint4*)&tempQuad; + global const uint4* pQuad4U = (global const uint4*)pQuad; + *tempQuad4U = *pQuad4U; + } + + /* get the geomID and primID0/1 for both quad triangles */ + const uint geomID = PrimLeaf_GetGeoIndex(&tempQuad.leafDesc); + const uint primID0 = tempQuad.primIndex0; + const uint primID1 = tempQuad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&tempQuad); + ushort fourth_vert = 0; + + if (primID1 != primID0) + { + ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(&tempQuad); + fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert; + fourth_vert = ((packed_indices & 0x30) == 0x30) ? 
2 : fourth_vert; + } + + global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDescs + geomID; + + uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert); + + // read the indices of the 4 verts we want + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices); + + QuadLeaf_SetVertices(&tempQuad, vtx0, vtx1, vtx2, vtx3); + + *newQuad = tempQuad; +} + +// This calculates children BBs for innerNode having *all* children leafs. +// mixed nodes will be updated by passing through bottom-up thread. +GRL_INLINE uint refit_bottom( global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct AABB3f* instance_leaf_aabbs, + global struct QBVHNodeN* curNode, + struct AABB *childrenAABB, + uint backPointer) +{ + uint numChildren = 0; + + /* we start refit at leaf nodes, this case is for quad nodes */ + if (curNode->type == BVH_QUAD_NODE) + { + global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode); + + /* iterate over all quads of the quad node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + for (uint k = 0; k < numChildren; k++) + { + struct QuadLeaf Q; + get_updated_quad(&quads[k], geomDesc, &Q); + quadCopyVertices(&Q, &quads[k]); + childrenAABB[k] = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad + } + } + + /* we start refit at leaf nodes, this case is for procedural nodes */ + else if (curNode->type == BVH_PROCEDURAL_NODE) + { + global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode); + + /* iterate over all children of the procedural node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + for (uint k = 0; k < numChildren; k++) + { + /* extract geomID and primID from leaf */ + const uint startPrim = QBVHNodeN_startPrim(curNode, k); + const uint geomID = ProceduralLeaf_geomIndex(leaf); + const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf! + + /* read bounds from geometry descriptor */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); + childrenAABB[k].lower.x = aabb.MinX; + childrenAABB[k].lower.y = aabb.MinY; + childrenAABB[k].lower.z = aabb.MinZ; + childrenAABB[k].upper.x = aabb.MaxX; + childrenAABB[k].upper.y = aabb.MaxY; + childrenAABB[k].upper.z = aabb.MaxZ; + + /* advance leaf pointer to next child */ + leaf += QBVHNodeN_blockIncr(curNode, k); + } + } + + /* we start refit at leaf nodes, this case is for instance nodes */ + else if (curNode->type == BVH_INSTANCE_NODE) + { + global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode); + global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh ); + + /* iterate over all children of the instance node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + for (uint k = 0; k < numChildren; k++) + { + uint leafindex = (instancesLeaves + k) - leafBase; + childrenAABB[k].lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] ); + childrenAABB[k].upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] ); + } + } + + return numChildren; +} + + + + + +// This calculates children BBs for innerNode having *all* children leafs. +// mixed nodes will be updated by passing through bottom-up thread. 
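/*
   A minimal illustrative sketch (hypothetical helper, not part of this patch):
   refit_bottom above and the subgroup variant that follows both read the child
   count of a node out of its backpointer. The bit layout assumed here matches
   what refit_bottom_up uses: parent index in bits [31:6], child count in bits
   [5:3], and the "children refitted so far" counter in bits [2:0].
*/
GRL_INLINE void BackPointer_decode_sketch( uint backPointer,
                                           uint* parentIndex,
                                           uint* numChildren,
                                           uint* numRefitted )
{
    *parentIndex = backPointer >> 6;          /* 0x03FFFFFF marks the root (no parent)   */
    *numChildren = (backPointer >> 3) & 0x7;  /* same field the refit kernels mask out   */
    *numRefitted = backPointer & 0x7;         /* reset to 0 once a node has been refit   */
}

/* The subgroup variant below reads the same fields, but fetches one child AABB per lane: */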
+GRL_INLINE uint SUBGROUP_refit_bottom( + uniform global struct BVHBase* bvh, + uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uniform global struct AABB3f* instance_leaf_aabbs, + uniform global struct QBVHNodeN* curNode, + uniform uint backPointer, + varying struct AABB* childrenAABB, + varying uchar* childrenMask, + varying ushort lane, + global uchar* is_procedural_instance + ) +{ + uniform uint numChildren = 0; + bool enable_procedural_instance = (is_procedural_instance != 0); + + /* we start refit at leaf nodes, this case is for quad nodes */ + if (curNode->type == BVH_QUAD_NODE) + { + /* iterate over all quads of the quad node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + + uniform global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode); + + struct QuadLeaf Q; + if (lane < numChildren) + { + get_updated_quad(&quads[lane], geomDesc, &Q); + + *childrenAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad + + quadCopyVertices(&Q, &quads[lane]); + *childrenMask = 0xff; + } + // FIXME: support leaves with more than one quad + } + + /* we start refit at leaf nodes, this case is for procedural nodes */ + else if (curNode->type == BVH_PROCEDURAL_NODE) + { + uniform global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode); + + + + /* iterate over all children of the procedural node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + + varying uint incr = (lane < numChildren) ? InternalNode_GetChildBlockIncr((struct InternalNode*)curNode, lane) : 0; + incr = sub_group_scan_exclusive_add(incr); + + if( lane < numChildren ) + { + /* extract geomID and primID from leaf */ + varying uint start_prim = InternalNode_GetChildStartPrim((struct InternalNode*)curNode, lane ); + varying global struct ProceduralLeaf* my_leaf = leaf + incr; + const uint geomID = ProceduralLeaf_geomIndex(my_leaf); + const uint primID = ProceduralLeaf_primIndex(my_leaf, start_prim); + + /* read bounds from geometry descriptor */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); + childrenAABB->lower.x = aabb.MinX; + childrenAABB->lower.y = aabb.MinY; + childrenAABB->lower.z = aabb.MinZ; + childrenAABB->upper.x = aabb.MaxX; + childrenAABB->upper.y = aabb.MaxY; + childrenAABB->upper.z = aabb.MaxZ; + *childrenMask = 0xff; + } + } + + /* we start refit at leaf nodes, this case is for instance nodes */ + else if ( !enable_procedural_instance && curNode->type == BVH_INSTANCE_NODE) + { + uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode); + uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh); + + /* iterate over all children of the instance node and get their bounds and masks */ + numChildren = (backPointer >> 3) & 0x7; + if( lane < numChildren ) + { + uint leafindex = (instancesLeaves + lane) - leafBase; + childrenAABB->lower.xyz = AABB3f_load_lower(&instance_leaf_aabbs[leafindex]); + childrenAABB->upper.xyz = AABB3f_load_upper(&instance_leaf_aabbs[leafindex]); + *childrenMask = HwInstanceLeaf_GetInstanceMask(&leafBase[leafindex]); + } + } + else if (enable_procedural_instance && curNode->type == BVH_INTERNAL_NODE) + { + // Handle procedural-instance leaves + // TODO: Generalize this! 
Should re-write the kernel to work with arbitrary mixed-mode leaves + + numChildren = (backPointer >> 3) & 0x7; + uint childType = BVH_INTERNAL_NODE; + if ( lane < numChildren ) + { + childType = InternalNode_GetChildType( (struct InternalNode*)curNode, lane ); + if (childType != BVH_INTERNAL_NODE) + { + uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer( curNode ); + uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh ); + uint leafindex = (instancesLeaves + lane) - leafBase; + childrenAABB->lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] ); + childrenAABB->upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] ); + *childrenMask = HwInstanceLeaf_GetInstanceMask( &leafBase[leafindex] ); + + // see if the child has flipped from procedural to non-procedural and update the child type field as needed + uint instanceIndex = HwInstanceLeaf_GetInstanceIndex( &leafBase[leafindex] ); + uint newChildType = is_procedural_instance[instanceIndex] ? BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE; + if (newChildType != childType) + { + InternalNode_SetChildType( (struct InternalNode*)curNode, lane, newChildType ); + } + } + } + + + // don't ascend the tree for a true internal node + if (sub_group_all(childType == BVH_INTERNAL_NODE)) + numChildren = 0; + } + + return numChildren; +} + +#define SG_REFIT_WG_SIZE 8 + +void DO_Refit_per_one_startpoint_sg( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, + global struct AABB3f* instance_leaf_aabbs, + global uchar* is_procedural_instance ) +{ + /* get pointer to inner nodes and back pointers */ + global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + /* get the inner node that we will consider as a bottom startpoint */ + const uint numInnerNodes = BVHBase_numNodes(bvh); + const uint innerNodeIdx = get_sub_group_global_id(); + + varying ushort lane = get_sub_group_local_id(); + + if (innerNodeIdx >= numInnerNodes) return; + + varying struct AABB childrenAABB; // one child AABB per lane + AABB_init(&childrenAABB); + + varying uchar childrenMask = 0; // one child mask per lane + + global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx]; + uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + uint numChildren = SUBGROUP_refit_bottom( + bvh, + geosArray, + instance_leaf_aabbs, + curNode, + backPointer, + &childrenAABB, + &childrenMask, + lane, + is_procedural_instance + ); + + + if (numChildren != 0) + { + /* update bounds of node */ + struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&childrenAABB); + reduce_bounds = AABB_sub_group_shuffle(&reduce_bounds, 0); + subgroup_QBVHNodeN_setBounds(curNode, reduce_bounds, childrenAABB, numChildren, lane); + + /* update mask of node */ + uchar mask = sub_group_reduce_or_N6(childrenMask); + curNode->instMask = mask; + + /* Leave this fence for now for all threads, if WG size is increased (tried 128) and fence is done + only by the first thread (similar to morton phase1) the machine hangs. 
*/ + mem_fence_gpu_invalidate(); + + /* refit upper parts of the BVH */ + /* TODO: this will not gonna work for mixed nodes */ + SUBGROUP_refit_bottom_up(curNode, bvh, reduce_bounds, numChildren, lane, 0); + } +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl b/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl new file mode 100644 index 00000000000..0a4bd3466af --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl @@ -0,0 +1,1917 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "common.h" +#include "instance.h" + +#define DBG(x) + +#define ENABLE_CHECKS 0 + +#define ENABLE_32BINS_IN_BREADTH_FIRST_PHASE 1 + +/* todo: */ +/* - new cross WG code path for first splits */ +/* - optimize find best child loop sequence */ +/* - subgroup_setQBVHNodeN needs work on 6 slots in parallel */ + +#define DIVIDE_BY_6 1 + +inline uint getNumPrims(struct BuildRecord *buildRecord) +{ + return buildRecord->end - buildRecord->start; +} + +inline void printBuildRecord(struct BuildRecord *record) +{ + printf("centroidBounds\n"); + AABB_print(&record->centroidBounds); + printf("start %d end %d size %d depth %d \n", record->start, record->end, record->end - record->start, getBuildRecursionDepth(record)); +} + +inline void printBinInfo2(struct BinInfo2 *record) +{ + printf("boundsX[%d]\n", BINS * 2); + for (uint b = 0; b < BINS * 2; b++) + { + AABB3f_print(&record->boundsX[b]); + printf("counts.x = %d\n", record->counts[b].x); + } + printf("boundsY[%d]\n", BINS * 2); + for (uint b = 0; b < BINS * 2; b++) + { + AABB3f_print(&record->boundsY[b]); + printf("counts.y = %d\n", record->counts[b].y); + } + printf("boundsZ[%d]\n", BINS * 2); + for (uint b = 0; b < BINS * 2; b++) + { + AABB3f_print(&record->boundsZ[b]); + printf("counts.z = %d\n", record->counts[b].z); + } +} + +inline void initBinMapping(struct BinMapping *binMapping, struct AABB *centBounds, const uint bins) +{ + const float4 eps = 1E-34f; + const float4 diag = max(eps, centBounds->upper - centBounds->lower); + const float4 scale = (float4)(0.99f * (float)bins) / diag; + binMapping->scale = select((float4)(0.0f), scale, diag > eps); + binMapping->ofs = centBounds->lower; +} + +inline void atomicExtendLocalBuildRecord(local struct BuildRecord *buildRecord, global struct AABB *primref) +{ + const float4 centroid2 = primref->lower + primref->upper; + AABB_local_atomic_merge(&buildRecord->centroidBounds, centroid2, centroid2); +} + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + +inline void initBinInfo(struct BinInfo *binInfo) +{ + for (uint i = 0; i < BINS; i++) + { + AABB3f_init(&binInfo->boundsX[i]); + AABB3f_init(&binInfo->boundsY[i]); + AABB3f_init(&binInfo->boundsZ[i]); + binInfo->counts[i] = (uint3)(0); + } +} + +inline void subgroup_initBinInfo(struct BinInfo *binInfo) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + for (uint i = subgroupLocalID; i < BINS; i += subgroup_size) + { + AABB3f_init(&binInfo->boundsX[i]); + AABB3f_init(&binInfo->boundsY[i]); + AABB3f_init(&binInfo->boundsZ[i]); + binInfo->counts[i] = (uint3)(0); + } +} + +inline void parallel_initBinInfo(struct BinInfo *binInfo) +{ + const uint localID = 
get_local_id(0); + if (localID < BINS) + { + AABB3f_init(&binInfo->boundsX[localID]); + AABB3f_init(&binInfo->boundsY[localID]); + AABB3f_init(&binInfo->boundsZ[localID]); + binInfo->counts[localID] = (uint3)(0); + } +} + +inline void atomicUpdateLocalBinInfo(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref) +{ + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); + AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper); + AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper); + AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper); + atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); + atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1); + atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); +} + +inline void atomicUpdateLocalBinInfo_nocheck(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref) +{ + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); + AABB3f_atomic_merge_local_nocheck(&binInfo->boundsX[i.x], lower, upper); + AABB3f_atomic_merge_local_nocheck(&binInfo->boundsY[i.y], lower, upper); + AABB3f_atomic_merge_local_nocheck(&binInfo->boundsZ[i.z], lower, upper); + atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); + atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1); + atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); +} + +inline void updateBins(struct BinMapping *binMapping, struct BinInfo *binInfo, global struct AABB *primref) +{ + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); + AABB3f_extendlu(&binInfo->boundsX[i.x], lower.xyz, upper.xyz); + AABB3f_extendlu(&binInfo->boundsY[i.y], lower.xyz, upper.xyz); + AABB3f_extendlu(&binInfo->boundsZ[i.z], lower.xyz, upper.xyz); + binInfo->counts[i.x].x++; + binInfo->counts[i.y].y++; + binInfo->counts[i.z].z++; +} + +// ===================================================================================================================== +// ===================================================================================================================== +// ===================================================================================================================== + +inline void parallel_initBinInfo2(struct BinInfo2 *binInfo, const uint bins) +{ + const uint localID = get_local_id(0); + if (localID < bins) + { + AABB3f_init(&binInfo->boundsX[localID]); + AABB3f_init(&binInfo->boundsY[localID]); + AABB3f_init(&binInfo->boundsZ[localID]); + binInfo->counts[localID] = (uint3)(0); + } +} + +inline void atomicUpdateLocalBinInfo2(struct BinMapping *binMapping, local struct BinInfo2 *binInfo, global struct AABB *primref) +{ + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); + AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper); + AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper); + AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper); + atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); + atomic_add((local uint 
*)&binInfo->counts[i.y] + 1, 1); + atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); +} + +inline void atomicUpdateGlobalFromLocalBinInfo2(global struct BinInfo2 *dest, local struct BinInfo2 *source, const uint bins) +{ + const uint localID = get_local_id(0); + if (localID < bins) + { + AABB3f_atomic_merge_global_local(&dest->boundsX[localID], &source->boundsX[localID]); + AABB3f_atomic_merge_global_local(&dest->boundsY[localID], &source->boundsY[localID]); + AABB3f_atomic_merge_global_local(&dest->boundsZ[localID], &source->boundsZ[localID]); + atomic_add((global uint *)&dest->counts[localID] + 0, source->counts[localID].x); + atomic_add((global uint *)&dest->counts[localID] + 1, source->counts[localID].y); + atomic_add((global uint *)&dest->counts[localID] + 2, source->counts[localID].z); + } +} + +inline uint subgroup_getMaxAreaChild(struct AABB *childrenAABB, const uint numChildren) +{ + const uint subgroupLocalID = get_sub_group_local_id(); +#if 0 + /*! find best child to split */ + const float area = (subgroupLocalID < numChildren) & (as_uint(childrenAABB[subgroupLocalID].upper.w) > cfg_minLeafSize) ? childrenAABB[subgroupLocalID].lower.w : -(float)INFINITY; + const float maxArea = sub_group_reduce_max(area); + const uint mask = intel_sub_group_ballot(area == maxArea); + const uint bestChild = maxArea != -(float)INFINITY ? ctz(mask) : -1; +#else + float bestArea = -(float)INFINITY; + int bestChild = -1; + for (int i = 0; i < numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (as_uint(childrenAABB[i].upper.w) <= cfg_minLeafSize) + continue; + + /* find child with largest surface area */ + if (childrenAABB[i].lower.w > bestArea) + { + bestChild = i; + bestArea = childrenAABB[i].lower.w; + } + } +#endif + return bestChild; +} + +inline bool AABB_verifyBounds(struct BuildRecord *buildRecord, struct AABB *geometryBounds, struct AABB *primref) +{ + const float4 centroid2 = primref->lower + primref->upper; + + if (centroid2.x < buildRecord->centroidBounds.lower.x) + return false; + if (centroid2.y < buildRecord->centroidBounds.lower.y) + return false; + if (centroid2.z < buildRecord->centroidBounds.lower.z) + return false; + + if (centroid2.x > buildRecord->centroidBounds.upper.x) + return false; + if (centroid2.y > buildRecord->centroidBounds.upper.y) + return false; + if (centroid2.z > buildRecord->centroidBounds.upper.z) + return false; + + if (primref->lower.x < geometryBounds->lower.x) + return false; + if (primref->lower.y < geometryBounds->lower.y) + return false; + if (primref->lower.z < geometryBounds->lower.z) + return false; + + if (primref->upper.x > geometryBounds->upper.x) + return false; + if (primref->upper.y > geometryBounds->upper.y) + return false; + if (primref->upper.z > geometryBounds->upper.z) + return false; + + return true; +} + +/* initialize primref index array */ +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +create_primref_index(global struct Globals *globals, + global struct AABB *primref, + global unsigned int *primref_index) +{ + const uint local_size = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + const uint localID = get_local_id(0); + + const uint startID = (taskID + 0) * globals->numPrimitives / numTasks; + const uint endID = (taskID + 1) * globals->numPrimitives / numTasks; + for (uint primID = startID + localID; primID < endID; primID += local_size) + primref_index[primID] = primID; +} + +// 
========================================================================================================== +// ========================================================================================================== +// ========================================================================================================== + +inline float left_to_right_area16(struct AABB3f *low) +{ + struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low); + return halfArea_AABB3f(&low_prefix); +} + +inline uint left_to_right_counts16(uint low) +{ + return sub_group_scan_exclusive_add(low); +} + +inline float right_to_left_area16(struct AABB3f *low) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + struct AABB3f low_reverse = AABB3f_sub_group_shuffle(low, ID); + struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse); + const float low_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID); + return low_area; +} + +inline uint right_to_left_counts16(uint low) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + const uint low_reverse = sub_group_broadcast(low, ID); + const uint low_prefix = sub_group_scan_inclusive_add(low_reverse); + return sub_group_broadcast(low_prefix, ID); +} + +inline float2 left_to_right_area32(struct AABB3f *low, struct AABB3f *high) +{ + struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low); + struct AABB3f low_reduce = AABB3f_sub_group_reduce(low); + struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max(high); + AABB3f_extend(&high_prefix, &low_reduce); + const float low_area = halfArea_AABB3f(&low_prefix); + const float high_area = halfArea_AABB3f(&high_prefix); + return (float2)(low_area, high_area); +} + +inline uint2 left_to_right_counts32(uint low, uint high) +{ + const uint low_prefix = sub_group_scan_exclusive_add(low); + const uint low_reduce = sub_group_reduce_add(low); + const uint high_prefix = sub_group_scan_exclusive_add(high); + return (uint2)(low_prefix, low_reduce + high_prefix); +} + +inline float2 right_to_left_area32(struct AABB3f *low, struct AABB3f *high) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + struct AABB3f low_reverse = AABB3f_sub_group_shuffle(high, ID); + struct AABB3f high_reverse = AABB3f_sub_group_shuffle(low, ID); + struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse); + struct AABB3f low_reduce = AABB3f_sub_group_reduce(&low_reverse); + struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max(&high_reverse); + AABB3f_extend(&high_prefix, &low_reduce); + const float low_area = sub_group_broadcast(halfArea_AABB3f(&high_prefix), ID); + const float high_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID); + return (float2)(low_area, high_area); +} + +inline uint2 right_to_left_counts32(uint low, uint high) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + const uint low_reverse = sub_group_broadcast(high, ID); + const uint high_reverse = sub_group_broadcast(low, ID); + const uint low_prefix = sub_group_scan_inclusive_add(low_reverse); + const uint 
low_reduce = sub_group_reduce_add(low_reverse); + const uint high_prefix = sub_group_scan_inclusive_add(high_reverse) + low_reduce; + return (uint2)(sub_group_broadcast(high_prefix, ID), sub_group_broadcast(low_prefix, ID)); +} + +inline ulong getBestSplit(float3 sah, uint ID, const float4 scale, const ulong defaultSplit) +{ + ulong splitX = (((ulong)as_uint(sah.x)) << 32) | ((uint)ID << 2) | 0; + ulong splitY = (((ulong)as_uint(sah.y)) << 32) | ((uint)ID << 2) | 1; + ulong splitZ = (((ulong)as_uint(sah.z)) << 32) | ((uint)ID << 2) | 2; + /* ignore zero sized dimensions */ + splitX = select(splitX, defaultSplit, (ulong)(scale.x == 0)); + splitY = select(splitY, defaultSplit, (ulong)(scale.y == 0)); + splitZ = select(splitZ, defaultSplit, (ulong)(scale.z == 0)); + ulong bestSplit = min(min(splitX, splitY), splitZ); + bestSplit = sub_group_reduce_min(bestSplit); + return bestSplit; +} + +inline uint fastDivideBy6_uint(uint v) +{ +#if 1 + const ulong u = (ulong)v >> 1; + return (uint)((u * 0x55555556ul) >> 32); +#else + return v / 6; +#endif +} + +inline uint3 fastDivideBy6_uint3(uint3 v) +{ + return (uint3)(fastDivideBy6_uint(v.x), fastDivideBy6_uint(v.y), fastDivideBy6_uint(v.z)); +} + +inline struct Split reduceBinsAndComputeBestSplit16(struct BinInfo *binInfo, const float4 scale, uint startID, uint endID) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + struct AABB3f boundsX = binInfo->boundsX[subgroupLocalID]; + + const float lr_areaX = left_to_right_area16(&boundsX); + const float rl_areaX = right_to_left_area16(&boundsX); + + struct AABB3f boundsY = binInfo->boundsY[subgroupLocalID]; + + const float lr_areaY = left_to_right_area16(&boundsY); + const float rl_areaY = right_to_left_area16(&boundsY); + + struct AABB3f boundsZ = binInfo->boundsZ[subgroupLocalID]; + + const float lr_areaZ = left_to_right_area16(&boundsZ); + const float rl_areaZ = right_to_left_area16(&boundsZ); + + const uint3 counts = binInfo->counts[subgroupLocalID]; + + const uint lr_countsX = left_to_right_counts16(counts.x); + const uint rl_countsX = right_to_left_counts16(counts.x); + const uint lr_countsY = left_to_right_counts16(counts.y); + const uint rl_countsY = right_to_left_counts16(counts.y); + const uint lr_countsZ = left_to_right_counts16(counts.z); + const uint rl_countsZ = right_to_left_counts16(counts.z); + + const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ); + const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ); + +#if DIVIDE_BY_6 == 0 + const uint blocks_shift = SAH_LOG_BLOCK_SHIFT; + uint3 blocks_add = (uint3)((1 << blocks_shift) - 1); + const uint3 lr_count = ((uint3)(lr_countsX, lr_countsY, lr_countsZ) + blocks_add) >> blocks_shift; + const uint3 rl_count = ((uint3)(rl_countsX, rl_countsY, rl_countsZ) + blocks_add) >> blocks_shift; +#else + const uint3 lr_count = fastDivideBy6_uint3((uint3)(lr_countsX, lr_countsY, lr_countsZ) + BVH_NODE_N6 - 1); + const uint3 rl_count = fastDivideBy6_uint3((uint3)(rl_countsX, rl_countsY, rl_countsZ) + BVH_NODE_N6 - 1); +#endif + float3 sah = fma(lr_area, convert_float3(lr_count), rl_area * convert_float3(rl_count)); + + /* first bin is invalid */ + + sah.x = select((float)(INFINITY), sah.x, subgroupLocalID != 0); + sah.y = select((float)(INFINITY), sah.y, subgroupLocalID != 0); + sah.z = select((float)(INFINITY), sah.z, subgroupLocalID != 0); + + const uint mid = (startID + endID) / 2; + const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0; 
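    /*
       A note on the reduction that follows: getBestSplit() packs each lane's
       candidate into one 64-bit key -- the SAH cost bit-cast to uint in bits
       [63:32], the bin position in bits [31:2] of the low word, and the
       dimension in bits [1:0] -- so a single sub_group_reduce_min() picks the
       cheapest split and breaks ties by position/dimension. Reinterpreting the
       float cost as an unsigned integer preserves its ordering because all
       costs here are non-negative IEEE floats. For example (values assumed,
       not measured), a key of (((ulong)as_uint(4.0f)) << 32) | (5u << 2) | 1
       decodes below to sah = 4.0f, pos = 5, dim = 1 (the Y axis).
    */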
+ + const ulong bestSplit = getBestSplit(sah, subgroupLocalID, scale, defaultSplit); + + struct Split split; + split.sah = as_float((uint)(bestSplit >> 32)); + split.dim = (uint)bestSplit & 3; + split.pos = (uint)bestSplit >> 2; + + return split; +} + +inline struct Split reduceBinsAndComputeBestSplit32(struct BinInfo2 *binInfo, const float4 scale, uint startID, uint endID) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + struct AABB3f boundsX_low = binInfo->boundsX[subgroupLocalID]; + struct AABB3f boundsX_high = binInfo->boundsX[subgroupLocalID + subgroup_size]; + + const float2 lr_areaX = left_to_right_area32(&boundsX_low, &boundsX_high); + const float2 rl_areaX = right_to_left_area32(&boundsX_low, &boundsX_high); + + struct AABB3f boundsY_low = binInfo->boundsY[subgroupLocalID]; + struct AABB3f boundsY_high = binInfo->boundsY[subgroupLocalID + subgroup_size]; + + const float2 lr_areaY = left_to_right_area32(&boundsY_low, &boundsY_high); + const float2 rl_areaY = right_to_left_area32(&boundsY_low, &boundsY_high); + + struct AABB3f boundsZ_low = binInfo->boundsZ[subgroupLocalID]; + struct AABB3f boundsZ_high = binInfo->boundsZ[subgroupLocalID + subgroup_size]; + + const float2 lr_areaZ = left_to_right_area32(&boundsZ_low, &boundsZ_high); + const float2 rl_areaZ = right_to_left_area32(&boundsZ_low, &boundsZ_high); + + const uint3 counts_low = binInfo->counts[subgroupLocalID]; + const uint3 counts_high = binInfo->counts[subgroupLocalID + subgroup_size]; + + const uint2 lr_countsX = left_to_right_counts32(counts_low.x, counts_high.x); + const uint2 rl_countsX = right_to_left_counts32(counts_low.x, counts_high.x); + const uint2 lr_countsY = left_to_right_counts32(counts_low.y, counts_high.y); + const uint2 rl_countsY = right_to_left_counts32(counts_low.y, counts_high.y); + const uint2 lr_countsZ = left_to_right_counts32(counts_low.z, counts_high.z); + const uint2 rl_countsZ = right_to_left_counts32(counts_low.z, counts_high.z); + + const uint blocks_shift = SAH_LOG_BLOCK_SHIFT; + uint3 blocks_add = (uint3)((1 << blocks_shift) - 1); + + /* low part: bins 0..15 */ + const float3 lr_area_low = (float3)(lr_areaX.x, lr_areaY.x, lr_areaZ.x); + const float3 rl_area_low = (float3)(rl_areaX.x, rl_areaY.x, rl_areaZ.x); + +#if DIVIDE_BY_6 == 0 + const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x) + blocks_add) >> blocks_shift; + const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x) + blocks_add) >> blocks_shift; + +#else + //const uint3 lr_count_low = ((uint3)(lr_countsX.x,lr_countsY.x,lr_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6; + //const uint3 rl_count_low = ((uint3)(rl_countsX.x,rl_countsY.x,rl_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6; + + /* skip blocks for breadth-first phase */ + const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x)); + const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x)); + +#endif + + float3 sah_low = fma(lr_area_low, convert_float3(lr_count_low), rl_area_low * convert_float3(rl_count_low)); + + /* first bin is invalid */ + // sah_low.x = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.x; + // sah_low.y = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.y; + // sah_low.z = (subgroupLocalID == 0) ? 
(float)(INFINITY) : sah_low.z; + + sah_low.x = select((float)(INFINITY), sah_low.x, subgroupLocalID != 0); + sah_low.y = select((float)(INFINITY), sah_low.y, subgroupLocalID != 0); + sah_low.z = select((float)(INFINITY), sah_low.z, subgroupLocalID != 0); + + /* high part: bins 16..31 */ + + const float3 lr_area_high = (float3)(lr_areaX.y, lr_areaY.y, lr_areaZ.y); + const float3 rl_area_high = (float3)(rl_areaX.y, rl_areaY.y, rl_areaZ.y); +#if DIVIDE_BY_6 == 0 + const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y) + blocks_add) >> blocks_shift; + const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y) + blocks_add) >> blocks_shift; +#else + //const uint3 lr_count_high = ((uint3)(lr_countsX.y,lr_countsY.y,lr_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6; + //const uint3 rl_count_high = ((uint3)(rl_countsX.y,rl_countsY.y,rl_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6; + + /* skip blocks for breadth-first phase */ + const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y)); + const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y)); + +#endif + const float3 sah_high = fma(lr_area_high, convert_float3(lr_count_high), rl_area_high * convert_float3(rl_count_high)); + + const uint mid = (startID + endID) / 2; + const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0; + + const ulong bestSplit_low = getBestSplit(sah_low, subgroupLocalID, scale, defaultSplit); + const ulong bestSplit_high = getBestSplit(sah_high, subgroupLocalID + subgroup_size, scale, defaultSplit); + const ulong bestSplit = min(bestSplit_low, bestSplit_high); + + struct Split split; + split.sah = as_float((uint)(bestSplit >> 32)); + split.dim = (uint)bestSplit & 3; + split.pos = (uint)bestSplit >> 2; + + return split; +} + +// ===================================================================== + +inline float leafSAH(float geometryArea, uint prims, uint block_shift) +{ + return geometryArea * convert_float((prims + (1 << block_shift) - 1) >> block_shift); +} + +inline bool is_left(struct BinMapping *binMapping, struct Split *split, struct AABB *primref) +{ + const uint dim = split->dim; + const float lower = primref->lower[dim]; + const float upper = primref->upper[dim]; + const float c = lower + upper; + const uint pos = convert_uint_rtz((c - binMapping->ofs[dim]) * binMapping->scale[dim]); + return pos < split->pos; +} + +inline void serial_find_split(global struct AABB *primref, + struct BinMapping *binMapping, + struct BuildRecord *buildRecord, + local struct Split *split, + local struct BinInfo *binInfo, + global uint *primref_index0, + global uint *primref_index1) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint startID = buildRecord->start; + const uint endID = buildRecord->end; + + subgroup_initBinInfo(binInfo); + + for (uint t = startID + subgroupLocalID; t < endID; t += subgroup_size) + { + const uint index = primref_index0[t]; + primref_index1[t] = index; + atomicUpdateLocalBinInfo_nocheck(binMapping, binInfo, &primref[index]); + } +} + +inline void serial_partition_index(global struct AABB *primref, + struct BinMapping *binMapping, + struct BuildRecord *buildRecord, + struct Split *inSplit, + struct BuildRecord *outLeft, + struct BuildRecord *outRight, + struct AABB *outGeometryBoundsLeft, + struct AABB *outGeometryBoundsRight, + global uint *primref_index0, + global uint *primref_index1) +{ + const uint localID = 
get_local_id(0); + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroupID = get_sub_group_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint begin = buildRecord->start; + const uint end = buildRecord->end; + struct Split split = *inSplit; + + struct BuildRecord left; + struct BuildRecord right; + initBuildRecord(&left, begin, end); + initBuildRecord(&right, begin, end); + + struct AABB leftAABB; + struct AABB rightAABB; + AABB_init(&leftAABB); + AABB_init(&rightAABB); + + global uint *l = primref_index0 + begin; + global uint *r = primref_index0 + end; + + /* no valid split, just split in the middle */ + if (split.sah == (float)(INFINITY)) + { + for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint count = sub_group_reduce_add(1); + extendBuildRecord(&left, &primref[index]); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + l[subgroupLocalID] = index; + l += count; + } + + for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint count = sub_group_reduce_add(1); + extendBuildRecord(&right, &primref[index]); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + r -= count; + r[subgroupLocalID] = index; + } + } + else + { + for (uint i = begin + subgroupLocalID; i < end; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; + const uint isRight = 1 - isLeft; + const uint countLeft = sub_group_reduce_add(isLeft); + const uint countRight = sub_group_reduce_add(isRight); + const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); + const uint prefixRight = sub_group_scan_exclusive_add(isRight); + + r -= countRight; + + if (isLeft) + { + extendBuildRecord(&left, &primref[index]); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + l[prefixLeft] = index; + } + else + { + extendBuildRecord(&right, &primref[index]); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + r[prefixRight] = index; + } + l += countLeft; + } + } + + left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); + right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); + leftAABB = AABB_sub_group_reduce(&leftAABB); + rightAABB = AABB_sub_group_reduce(&rightAABB); + + if (subgroupLocalID == 0) + { + uint pos = l - primref_index0; // single first thread needs to compute "pos" + left.end = pos; + right.start = pos; + + leftAABB.lower.w = AABB_halfArea(&leftAABB); + rightAABB.lower.w = AABB_halfArea(&rightAABB); + + leftAABB.upper.w = as_float(getNumPrimsBuildRecord(&left)); + rightAABB.upper.w = as_float(getNumPrimsBuildRecord(&right)); + + *outLeft = left; + *outRight = right; + *outGeometryBoundsLeft = leftAABB; + *outGeometryBoundsRight = rightAABB; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + +#if ENABLE_CHECKS == 1 + if (subgroupLocalID == 0) + { + if (AABB_verify(outLeft)) + { + printf("outLeft:\n"); + printBuildRecord(outLeft); + } + if (AABB_verify(outRight)) + { + printf("outRight:\n"); + printBuildRecord(outRight); + } + if (AABB_verify(outGeometryBoundsLeft)) + { + printf("outGeometryBoundsLeft:\n"); + AABB_print(outGeometryBoundsLeft); + } + if (AABB_verify(outGeometryBoundsRight)) + { + printf("outGeometryBoundsRight:\n"); + AABB_print(outGeometryBoundsRight); + } + + for (uint i = outLeft->start; i < outLeft->end; i++) + { + 
const uint index = primref_index0[i]; + if (split.sah != (float)(INFINITY) && !is_left(binMapping, inSplit, &primref[index])) + printf("check left %d \n", i); + if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index])) + printf("check prim ref bounds left %d \n", i); + } + for (uint i = outRight->start; i < outRight->end; i++) + { + const uint index = primref_index0[i]; + if (split.sah != (float)(INFINITY) && is_left(binMapping, inSplit, &primref[index])) + printf("check right %d \n", i); + if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index])) + printf("check prim ref bounds right %d \n", i); + } + } +#endif +} + +inline uint subgroup_createLeaf_index(global struct BlockAllocator *allocator, + const uint start, + const uint end, + global struct AABB *primref, + uint primID, + global char *bvh_mem, + unsigned leafSize) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint items = end - start; + +#if ENABLE_CHECKS == 1 + if (items > BVH_LEAF_N_MAX) + printf("error items %d \n", items); +#endif + + // JDB TODO: Why was this code commented out?? + //uint offset = (subgroupLocalID == 0) ? alloc_leaf_mem(globals,sizeof(struct Quad)*items) : 0; + //offset = sub_group_broadcast(offset,0); + + //uint offset = globals->leaf_mem_allocator_start + start * leafSize; + uint offset = allocator->start + start * leafSize; + return offset; +} + +inline uint get_qnode_index_for_backptr(void *qnode_base, void *qnode) +{ + size_t offset = ((size_t)qnode - (size_t)qnode_base) / sizeof(struct QBVHNodeN); + uint offset_u = (uint)offset; +#if ENABLE_CHECKS + if ((size_t)((offset_u << 6) >> 6) != offset) + { + printf("get_qnode_index_for_backptr - index out of reach"); + } +#endif + return offset_u; +} + +struct SerialBuildRecurseTemplateConst +{ + unsigned leafSize; + unsigned leafType; + bool allocateBackpointers; +}; + +// ==================================================================================== +// ==================================================================================== +// ==================================================================================== +// ==================================================================================== +// ==================================================================================== + +inline void parallel_find_split(global struct AABB *primref, + local struct BuildRecord *buildRecord, + local struct Split *bestSplit, + local struct BinInfo *binInfo, + global uint *primref_index0, + global uint *primref_index1) +{ + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + const uint subgroupID = get_sub_group_id(); + + const uint startID = buildRecord->start; + const uint endID = buildRecord->end; + + struct BinMapping binMapping; + initBinMapping(&binMapping, &buildRecord->centroidBounds, BINS); + + /* init bininfo */ + parallel_initBinInfo(binInfo); + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + for (uint t = startID + localID; t < endID; t += local_size) + { + const uint index = primref_index0[t]; + primref_index1[t] = index; + atomicUpdateLocalBinInfo(&binMapping, binInfo, &primref[index]); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + /* find best dimension */ + + if (subgroupID == 0) + { + *bestSplit = reduceBinsAndComputeBestSplit16(binInfo, binMapping.scale, startID, endID); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); +} + +inline void parallel_find_split32(local uint *local_sync, + 
global struct AABB *primref, + local struct BuildRecord *buildRecord, + local struct Split *bestSplit, + local struct BinInfo2 *binInfo2, + global uint *primref_index0, + global uint *primref_index1) +{ + + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + const uint subgroupID = get_sub_group_id(); + const uint numSubGroups = get_num_sub_groups(); + const uint subgroupLocalID = get_sub_group_local_id(); + + const uint startID = buildRecord->start; + const uint endID = buildRecord->end; + + struct BinMapping binMapping; + initBinMapping(&binMapping, &buildRecord->centroidBounds, 2 * BINS); + + /* init bininfo */ + parallel_initBinInfo2(binInfo2, 2 * BINS); + + if (localID == 0) + *local_sync = 0; + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + for (uint t = startID + localID; t < endID; t += local_size) + { + const uint index = primref_index0[t]; + primref_index1[t] = index; + atomicUpdateLocalBinInfo2(&binMapping, binInfo2, &primref[index]); + } + + /* find best split position using the last subgroup */ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + uint syncID = subgroupLocalID == 0 ? generic_atomic_add(local_sync, 1) : 0; + syncID = sub_group_broadcast(syncID, 0); + + if (syncID + 1 == numSubGroups) + { + *bestSplit = reduceBinsAndComputeBestSplit32(binInfo2, binMapping.scale, startID, endID); + DBG(if (localID == 0) printSplit(bestSplit)); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); +} + +inline void parallel_partition_index(local uint *local_sync, + global struct AABB *primref, + struct BinMapping *binMapping, + const uint begin, + const uint end, + struct Split *inSplit, + local struct BuildRecord *outLeft, + local struct BuildRecord *outRight, + local struct AABB *outGeometryBoundsLeft, + local struct AABB *outGeometryBoundsRight, + global uint *primref_index0, + global uint *primref_index1, + uint *atomicCountLeft, + uint *atomicCountRight) +{ + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + const uint subgroupID = get_sub_group_id(); + const uint numSubGroups = get_num_sub_groups(); + const uint subgroup_size = get_sub_group_size(); + const uint subgroupLocalID = get_sub_group_local_id(); + + const uint size = end - begin; + struct Split split = *inSplit; + + /* init bin bounds */ + if (localID == 0) + { + initBuildRecord(outLeft, begin, end); + initBuildRecord(outRight, begin, end); + AABB_init(outGeometryBoundsLeft); + AABB_init(outGeometryBoundsRight); + *atomicCountLeft = 0; + *atomicCountRight = 0; + *local_sync = 0; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); // remove ? 
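    /*
       What follows is the cooperative partition step. If no valid split was
       found (split.sah == INFINITY), the first subgroup alone copies the two
       halves of a middle split. Otherwise every subgroup takes a slice of
       [begin,end), classifies each primitive with is_left(), and compacts the
       results with subgroup prefix sums, while atomicCountLeft /
       atomicCountRight hand out contiguous chunks of the output range: left
       primitives are written forward from 'begin', right primitives backward
       from 'end', so the final split position is begin + atomicCountLeft.
       Rough example with an assumed classification of L,R,L,L,R,R,L,R on the
       first eight lanes: those lanes contribute 4 to countLeft, the exclusive
       prefix sum gives the left-going lanes consecutive slots 0..3 inside the
       chunk this subgroup reserved on the left, and the right-going lanes get
       consecutive slots inside the chunk reserved at the shrinking right end.
    */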
+ + struct BuildRecord left; + struct BuildRecord right; + initBuildRecord(&left, begin, end); + initBuildRecord(&right, begin, end); + + struct AABB leftAABB; + struct AABB rightAABB; + AABB_init(&leftAABB); + AABB_init(&rightAABB); + + if (split.sah == (float)(INFINITY)) + { + if (subgroupID == 0) + { + for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size) + { + const uint index = primref_index1[i]; + extendBuildRecord(&left, &primref[index]); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + primref_index0[i] = index; + } + + for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size) + { + const uint index = primref_index1[i]; + extendBuildRecord(&right, &primref[index]); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + primref_index0[i] = index; + } + + left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); + right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); + leftAABB = AABB_sub_group_reduce(&leftAABB); + rightAABB = AABB_sub_group_reduce(&rightAABB); + + if (localID == 0) + { + outLeft->centroidBounds = left.centroidBounds; + outRight->centroidBounds = right.centroidBounds; + + *outGeometryBoundsLeft = leftAABB; + *outGeometryBoundsRight = rightAABB; + + outLeft->end = split.pos; + outRight->start = split.pos; + + outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft); + outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight); + outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft)); + outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight)); + } + } + } + else + { + + const int startID = begin + ((subgroupID + 0) * size / numSubGroups); + const int endID = begin + ((subgroupID + 1) * size / numSubGroups); + + for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; + const uint isRight = 1 - isLeft; + const uint countLeft = sub_group_reduce_add(isLeft); + const uint countRight = sub_group_reduce_add(isRight); + const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); + const uint prefixRight = sub_group_scan_exclusive_add(isRight); + + uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0; + offsetLeft = sub_group_broadcast(offsetLeft, 0); + uint offsetRight = subgroupLocalID == 0 ? 
generic_atomic_add(atomicCountRight, countRight) : 0; + offsetRight = sub_group_broadcast(offsetRight, 0); + + if (isLeft) + { + extendBuildRecord(&left, &primref[index]); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + primref_index0[begin + offsetLeft + prefixLeft] = index; + } + else + { + extendBuildRecord(&right, &primref[index]); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + primref_index0[end - (offsetRight + countRight) + prefixRight] = index; + } + } + left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); + right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); + leftAABB = AABB_sub_group_reduce(&leftAABB); + rightAABB = AABB_sub_group_reduce(&rightAABB); + + AABB_local_atomic_merge(&outLeft->centroidBounds, left.centroidBounds.lower, left.centroidBounds.upper); + AABB_local_atomic_merge(&outRight->centroidBounds, right.centroidBounds.lower, right.centroidBounds.upper); + + AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper); + AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (subgroupLocalID == 0) + { + const uint sync = atomic_add(local_sync, 1); + if (sync + 1 == numSubGroups) + { + uint pos = begin + *atomicCountLeft; // single thread of last subgroup needs to compute "pos" + outLeft->end = pos; + outRight->start = pos; + + outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft); + outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight); + outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft)); + outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight)); + } + } + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + +#if ENABLE_CHECKS == 1 + if (localID == 0) + { + if (outLeft->end <= begin) + printf("pos begin error\n"); + if (outLeft->end > end) + printf("pos end error\n"); + + for (uint i = outLeft->start; i < outLeft->end; i++) + { + const uint index = primref_index0[i]; + //printf("left %d -> %d \n",i,index); + if (!is_left(binMapping, inSplit, &primref[index])) + printf("check left %d \n", i); + if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index])) + printf("check prim ref bounds left %d \n", i); + } + for (uint i = outRight->start; i < outRight->end; i++) + { + const uint index = primref_index0[i]; + //printf("right %d -> %d \n",i,index); + if (is_left(binMapping, inSplit, &primref[index])) + printf("check right %d \n", i); + if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index])) + printf("check prim ref bounds right %d \n", i); + } + } +#endif +} + + +#define ENABLE_LOOP_BREADTH_FIRST 0 +#if ENABLE_LOOP_BREADTH_FIRST +// TBD It might be that layout of this impact perf. 
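/*
   A minimal sketch (hypothetical helper, mirroring the loop in
   parallel_build_breadth_first_loopT below) of how this disabled breadth-first
   path distributes work: a workgroup first drains build records it produced
   itself, kept on a small SLM stack, and only then pulls a fresh record from
   the global counter, terminating once the global queue is exhausted.
*/
inline bool breadth_first_pick_record_sketch( global struct Globals* globals,
                                              local uint* stack,
                                              local uint* stackSize,
                                              uint* recordID )
{
    if (*stackSize == 0)
    {
        /* no locally produced records left -> grab a new one globally */
        *recordID = generic_atomic_add( &globals->counter, 1 );
        return *recordID < globals->numBuildRecords; /* false means: exit the loop */
    }
    *stackSize -= 1;
    *recordID = stack[*stackSize];
    return true;
}

/* The per-workgroup state for that loop lives in the SLM block below
   (presumably the layout the TBD note above refers to). */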
+struct BreadthFirstLoopLocals +{ + struct BuildRecord local_current; +#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 + struct BinInfo binInfo; +#else + struct BinInfo2 binInfo; +#endif + struct Split split; + struct BuildRecord children[BVH_NODE_N + 1]; + struct AABB childrenAABB[BVH_NODE_N + 1]; + uint atomicCountLeft; + uint atomicCountRight; + uint local_sync; + uint recordID; + uint buildRecordIDs[BUILDRECORD_STACK_SIZE]; + uint numBuildRecordIDs; + bool exit; +}; + + +inline void parallel_build_breadth_first_loopT(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + uint subtreeThreshold, + local struct BreadthFirstLoopLocals *L, + struct BreadthFirstTemplateConst T) +{ + const uint global_size = get_global_size(0); + const uint local_size = get_local_size(0); + const uint localID = get_local_id(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + const uint subgroupID = get_sub_group_id(); + const uint subgroupLocalID = get_sub_group_local_id(); + + /* double buffered primref index array */ + global uint *primref_index0 = primref_index; + global uint *primref_index1 = primref_index + globals->numPrimitives; + + global struct BuildRecord *records = getBuildRecords(bvh_mem, globals); + +#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 + const uint bins = BINS; +#else + const uint bins = 2 * BINS; +#endif + + if (localID == 0) + { + L->numBuildRecordIDs = 0; + L->exit = false; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + while (1) + { + if (localID == 0) + { + if (L->numBuildRecordIDs == 0) + { + L->recordID = generic_atomic_add(&globals->counter, 1); + if (L->recordID >= globals->numBuildRecords) + L->exit = true; + } + else + { + L->numBuildRecordIDs--; + L->recordID = L->buildRecordIDs[L->numBuildRecordIDs]; + } + L->local_current = records[L->recordID]; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + /* no more buildrecords available ? */ + + if (L->exit) + break; + + local struct BuildRecord *current = &L->local_current; + const uint items = getNumPrims(current); + const uint depth = getBuildRecursionDepth(current); + + global unsigned int *num_records_output = &globals->numBuildRecords_extended; + + struct QBVHNodeN *qnode = (struct QBVHNodeN *)current->current; + + /* ignore small buildrecords */ + if (items < max(subtreeThreshold, cfg_minLeafSize)) + { + // do nothing + } + else + { + /*! find best split */ +#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 + parallel_find_split(primref, current, &L->split, &L->binInfo, primref_index0, primref_index1); +#else + parallel_find_split32(&L->local_sync, primref, current, &L->split, &L->binInfo, primref_index0, primref_index1); +#endif + uint numChildren = 2; + + /*! find best split */ + struct BinMapping binMapping; + initBinMapping(&binMapping, ¤t->centroidBounds, bins); + + parallel_partition_index(&L->local_sync, primref, &binMapping, current->start, current->end, &L->split, &L->children[0], &L->children[1], &L->childrenAABB[0], &L->childrenAABB[1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight); + + while (numChildren < BVH_NODE_N6) + { + /*! 
find best child to split */ + const uint bestChild = subgroup_getMaxAreaChild(L->childrenAABB, numChildren); + if (bestChild == -1) + break; + + /* perform best found split */ + local struct BuildRecord *brecord = &L->children[bestChild]; + local struct BuildRecord *lrecord = &L->children[numChildren + 0]; + local struct BuildRecord *rrecord = &L->children[numChildren + 1]; + +#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 + parallel_find_split(primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1); +#else + parallel_find_split32(&L->local_sync, primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1); +#endif + + initBinMapping(&binMapping, &brecord->centroidBounds, bins); + + parallel_partition_index(&L->local_sync, primref, &binMapping, brecord->start, brecord->end, &L->split, lrecord, rrecord, &L->childrenAABB[numChildren + 0], &L->childrenAABB[numChildren + 1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight); + + *brecord = *rrecord; + L->childrenAABB[bestChild] = L->childrenAABB[numChildren + 1]; + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + numChildren++; + } + + //sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (localID <= 16 && subgroupID == 0) + { + global struct BVHBase *bvh_base = (global struct BVHBase *)bvh_mem; + global struct QBVHNodeN *nodes_start = BVHBase_nodeData(bvh_base); + global uint *back_pointers = BVHBase_backPointers(bvh_base); + uint qnode_index = 0; + if (T.allocateBackpointers) + { + /* index of internal node, the domain of backpointers map*/ + qnode_index = get_qnode_index_for_backptr(nodes_start, qnode); + // the backpointer is already set, but we need to add/encode the num of children + // todo don't like the need of data read (we should just add), maybe should pass grandpa pointer in record..., or use atomic... + back_pointers[qnode_index] += (numChildren << 3); + } + + /* sort children based on rnage size */ + const uint numPrimsIDs = select((uint)0, (as_uint(L->childrenAABB[subgroupLocalID].upper.w) << 3) | subgroupLocalID, subgroupLocalID < numChildren); + //const uint IDs = sortBVHChildrenIDs(numPrimsIDs) & (BVH_NODE_N-1); + const uint IDs = numPrimsIDs & 7; + const uint pushIDs = convertToPushIndices8(IDs); + + /* alloc #numChildren nodes at once */ + const uint node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren); + + /* update single relative node pointer and type */ + const int offset = encodeOffset(bvh_mem, (global void *)qnode, node_offset) >> 6; + const uint type = BVH_INTERNAL_NODE; + + /* set parent pointer in child build records */ + if (subgroupLocalID < numChildren) + { + setBuildRecursionDepth(&L->children[subgroupLocalID], depth + 1); + global uchar *child_data_ptr = (global uchar *)bvh_mem + node_offset + pushIDs * sizeof(struct QBVHNodeN); + L->children[subgroupLocalID].current = child_data_ptr; + if (T.allocateBackpointers) + { + uint child_index = get_qnode_index_for_backptr(nodes_start, child_data_ptr); + back_pointers[child_index] = qnode_index << 6; + } + } + + /* write out qbvh node */ + subgroup_setQBVHNodeN(offset, type, &L->childrenAABB[IDs], numChildren, qnode); + + /* write out child buildrecords to memory */ + + uint global_records_offset = (subgroupLocalID == 0) ? 
atomic_add(num_records_output, numChildren - 1) : 0; + global_records_offset = sub_group_broadcast(global_records_offset, 0); + + if (localID == 0) + { + records[L->recordID] = L->children[0]; + L->buildRecordIDs[L->numBuildRecordIDs++] = L->recordID; + for (uint i = 1; i < numChildren; i++) + { + const uint ID = globals->numBuildRecords + global_records_offset + i - 1; + records[ID] = L->children[i]; + L->buildRecordIDs[L->numBuildRecordIDs++] = ID; + } + } + } + } + work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + } + + /* last active HW thread ? */ + if (localID == 0) + { + const uint sync = atomic_add(&globals->sync, 1); + if (sync + 1 == numTasks) + { + globals->sync = 0; + /* set final number of buildrecords */ + globals->numBuildRecords += globals->numBuildRecords_extended; + globals->numBuildRecords_extended = 0; + globals->counter = 0; + } + } +} + +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_breadth_first_loop(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + uint subtreeThreshold) +{ + local struct BreadthFirstLoopLocals L; + static const struct BreadthFirstTemplateConst T = { + false // bool allocateBackpointers; + }; + + parallel_build_breadth_first_loopT(globals, + primref, + primref_index, + bvh_mem, + subtreeThreshold, + &L, + T); +} + +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_breadth_first_loop_backpointers(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + uint subtreeThreshold) +{ + local struct BreadthFirstLoopLocals L; + static const struct BreadthFirstTemplateConst T = { + true // bool allocateBackpointers; + }; + + parallel_build_breadth_first_loopT(globals, + primref, + primref_index, + bvh_mem, + subtreeThreshold, + &L, + T); +} +// =================================================== +// =============== experimental code ================= +// =================================================== +#endif + +#define ENABLE_GLOBAL_SPLIT 0 +#if ENABLE_GLOBAL_SPLIT +inline void parallel_partition_segment_index(local uint *local_sync, + global struct AABB *primref, + struct BinMapping *binMapping, + const uint begin, + const uint end, + const uint global_begin, + const uint global_end, + struct Split *inSplit, + local struct AABB *outLeft, + local struct AABB *outRight, + local struct AABB *outGeometryBoundsLeft, + local struct AABB *outGeometryBoundsRight, + global uint *primref_index0, + global uint *primref_index1, + uint *atomicCountLeft, + uint *atomicCountRight) +{ + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + const uint subgroupID = get_sub_group_id(); + const uint numSubGroups = get_num_sub_groups(); + const uint subgroup_size = get_sub_group_size(); + const uint subgroupLocalID = get_sub_group_local_id(); + + const uint size = end - begin; + struct Split split = *inSplit; + + /* init bin bounds */ + if (localID == 0) + { + AABB_init(outLeft); + AABB_init(outRight); + AABB_init(outGeometryBoundsLeft); + AABB_init(outGeometryBoundsRight); + *local_sync = 0; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + struct AABB left; + struct AABB right; + AABB_init(&left); + AABB_init(&right); + + struct AABB leftAABB; + struct AABB rightAABB; + AABB_init(&leftAABB); + 
AABB_init(&rightAABB); + + const int startID = begin + ((subgroupID + 0) * size / numSubGroups); + const int endID = begin + ((subgroupID + 1) * size / numSubGroups); + + for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; + const uint isRight = 1 - isLeft; + const uint countLeft = sub_group_reduce_add(isLeft); + const uint countRight = sub_group_reduce_add(isRight); + const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); + const uint prefixRight = sub_group_scan_exclusive_add(isRight); + + uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0; + offsetLeft = sub_group_broadcast(offsetLeft, 0); + uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0; + offsetRight = sub_group_broadcast(offsetRight, 0); + + if (isLeft) + { + AABB_extend_point(&left, AABB_centroid2(&primref[index])); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + primref_index0[global_begin + offsetLeft + prefixLeft] = index; + } + else + { + AABB_extend_point(&right, AABB_centroid2(&primref[index])); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + primref_index0[global_end - (offsetRight + countRight) + prefixRight] = index; + } + } + left = AABB_sub_group_reduce(&left); + right = AABB_sub_group_reduce(&right); + leftAABB = AABB_sub_group_reduce(&leftAABB); + rightAABB = AABB_sub_group_reduce(&rightAABB); + + AABB_local_atomic_merge(outLeft, left.lower, left.upper); + AABB_local_atomic_merge(outRight, right.lower, right.upper); + + AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper); + AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper); + + work_group_barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(BINS * 2, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel global_init_split_iteration(global struct Globals *globals, + global struct GlobalBuildRecord *global_record, + global char *bvh_mem, + const uint subTreeThreshold) +{ + const uint localID = get_local_id(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + global struct BuildRecord *records = getBuildRecords(bvh_mem, globals); + + /* for each build record with size > subTreeThreshold initialize a global build record */ + + const uint startID = (taskID + 0) * globals->numBuildRecords / numTasks; + const uint endID = (taskID + 1) * globals->numBuildRecords / numTasks; + + for (uint i = startID; i < endID; i++) + { + global struct BuildRecord *buildRecord = &records[i]; + DBG(if (localID == 0) printf("i %d subTreeThreshold %d size %d \n", i, subTreeThreshold, buildRecord->end - buildRecord->start)); + + if ((buildRecord->end - buildRecord->start) > subTreeThreshold) + { + uint ID = localID == 0 ? 
generic_atomic_add(&globals->numGlobalBuildRecords, 1) : 0; + + ID = work_group_broadcast(ID, 0); + global struct BinInfo2 *binInfo = &global_record[ID].binInfo; + global struct BinMapping *binMapping = &global_record[ID].binMapping; + initBinMapping(binMapping, &buildRecord->centroidBounds, 2 * BINS); + parallel_initBinInfo2(binInfo, 2 * BINS); + if (localID == 0) + { + global_record[ID].range.start = buildRecord->start; + global_record[ID].range.end = buildRecord->end; + global_record[ID].atomicCountLeft = 0; + global_record[ID].atomicCountRight = 0; + global_record[ID].buildRecordID = i; + AABB_init(&global_record[ID].leftCentroid); + AABB_init(&global_record[ID].rightCentroid); + AABB_init(&global_record[ID].leftGeometry); + AABB_init(&global_record[ID].rightGeometry); + } + } + } + DBG( + work_group_barrier(CLK_LOCAL_MEM_FENCE); + if (localID == 0) + printf("globals->numGlobalBuildRecords %d \n", globals->numGlobalBuildRecords);); +} + +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel global_bin_iteration(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + global struct GlobalBuildRecord *global_record) +{ + const uint localID = get_local_id(0); + const uint blockSize = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; + + /* early out */ + if (numGlobalBuildRecords == 0) + return; + + /* double buffered primref index array */ + global uint *primref_index0 = primref_index; + global uint *primref_index1 = primref_index + globals->numPrimitives; + + uint numBlocks = 0; + + /* get total number of blocks, size of block == WG size */ + for (uint i = 0; i < numGlobalBuildRecords; i++) + numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize; + + const uint startBlockID = (taskID + 0) * numBlocks / numTasks; + const uint endBlockID = (taskID + 1) * numBlocks / numTasks; + uint numBlockIDs = endBlockID - startBlockID; + + uint splitRecordID = 0; + uint offset_start = 0; + uint offset_end = 0; + uint cur_blocks = 0; + + for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++) + { + const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; + const uint blocks = (sizeRecord + blockSize - 1) / blockSize; + if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks) + { + const uint preBlocks = startBlockID - blockCounter; + cur_blocks = min(numBlockIDs, blocks - preBlocks); + offset_start = preBlocks * blockSize; + offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord); + break; + } + blockCounter += blocks; + } + + if (localID == 0) + DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); + + local struct BinInfo2 local_binInfo; + parallel_initBinInfo2(&local_binInfo, 2 * BINS); + struct BinMapping binMapping = global_record[splitRecordID].binMapping; + + while (1) + { + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + const uint startID = global_record[splitRecordID].range.start + offset_start; + const uint endID = global_record[splitRecordID].range.start + offset_end; + + if (localID == 0) + DBG(printf("taskID %d startID %d endID %d \n", taskID, 
startID, endID)); + + for (uint i = startID + localID; i < endID; i += blockSize) + { + const uint index = primref_index0[i]; + primref_index1[i] = index; + atomicUpdateLocalBinInfo2(&binMapping, &local_binInfo, &primref[index]); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); //FIXME: remove, do local sync + atomicUpdateGlobalFromLocalBinInfo2(&global_record[splitRecordID].binInfo, &local_binInfo, 2 * BINS); + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + numBlockIDs -= cur_blocks; + if (numBlockIDs == 0) + break; + + splitRecordID++; + parallel_initBinInfo2(&local_binInfo, 2 * BINS); + binMapping = global_record[splitRecordID].binMapping; + + const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; + const uint blocks = (sizeRecord + blockSize - 1) / blockSize; + cur_blocks = min(numBlockIDs, blocks); + offset_start = 0; + offset_end = min(cur_blocks * blockSize, sizeRecord); + + if (localID == 0) + DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); + } +} + +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +global_compute_best_split_iteration(global struct Globals *globals, + global char *bvh_mem, + global struct GlobalBuildRecord *global_record) +{ + const uint localID = get_local_id(0); + const uint blockSize = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; + + /* early out */ + if (numGlobalBuildRecords == 0) + return; + + const uint startRecordID = (taskID + 0) * numGlobalBuildRecords / numTasks; + const uint endRecordID = (taskID + 1) * numGlobalBuildRecords / numTasks; + for (uint i = startRecordID; i < endRecordID; i++) + { + struct Split split = reduceBinsAndComputeBestSplit32(&global_record[i].binInfo, + global_record[i].binMapping.scale, + global_record[i].range.start, + global_record[i].range.end); + if (localID == 0) + { + global_record[i].split = split; + global_record[i].atomicCountLeft = 0; + global_record[i].atomicCountRight = 0; + DBG(printSplit(&global_record[i].split)); + } + } +} + +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +global_partition_iteration(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + global struct GlobalBuildRecord *global_record) +{ + + const uint localID = get_local_id(0); + const uint blockSize = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; + + /* early out */ + if (numGlobalBuildRecords == 0) + return; + + /* double buffered primref index array */ + global uint *primref_index0 = primref_index; + global uint *primref_index1 = primref_index + globals->numPrimitives; + + uint numBlocks = 0; + + /* get total number of blocks, size of block == WG size */ + for (uint i = 0; i < numGlobalBuildRecords; i++) + numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize; + + const uint startBlockID = (taskID + 0) * numBlocks / numTasks; + const uint endBlockID = (taskID + 1) * numBlocks / numTasks; + uint numBlockIDs = endBlockID - startBlockID; + + 
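    /* Static load balancing, using the same blocking scheme as global_bin_iteration:
     * the concatenated ranges of all global build records are cut into blocks of
     * blockSize primrefs, and each workgroup handles the contiguous block range
     * [startBlockID, endBlockID).  The search loop below locates the record that
     * contains the workgroup's first block and the offset inside that record.
     * For example, with blockSize = 512 and record sizes { 1300, 800 } the
     * per-record block counts are { 3, 2 }; a workgroup whose startBlockID is 3
     * therefore starts in record 1 at offset_start = 0. */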
uint splitRecordID = 0; + uint offset_start = 0; + uint offset_end = 0; + uint cur_blocks = 0; + + for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++) + { + const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; + const uint blocks = (sizeRecord + blockSize - 1) / blockSize; + if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks) + { + const uint preBlocks = startBlockID - blockCounter; + cur_blocks = min(numBlockIDs, blocks - preBlocks); + offset_start = preBlocks * blockSize; + offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord); + break; + } + blockCounter += blocks; + } + + if (localID == 0) + DBG(printf("partition taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); + + local struct AABB centroidAABB[2]; + local struct AABB geometryAABB[2]; + local uint local_sync; + + while (1) + { + + const uint startID = global_record[splitRecordID].range.start + offset_start; + const uint endID = global_record[splitRecordID].range.start + offset_end; + + struct BinMapping binMapping = global_record[splitRecordID].binMapping; + struct Split split = global_record[splitRecordID].split; + + const uint global_start = global_record[splitRecordID].range.start; + const uint global_end = global_record[splitRecordID].range.end; + + if (localID == 0) + DBG(printf("partition taskID %d startID %d endID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, startID, endID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); + + parallel_partition_segment_index(&local_sync, primref, &binMapping, startID, endID, global_start, global_end, &split, ¢roidAABB[0], ¢roidAABB[1], &geometryAABB[0], &geometryAABB[1], primref_index0, primref_index1, &global_record[splitRecordID].atomicCountLeft, &global_record[splitRecordID].atomicCountRight); + + /* update global structures */ + if (localID == 0) + { + AABB_global_atomic_merge(&global_record[splitRecordID].leftCentroid, ¢roidAABB[0]); + AABB_global_atomic_merge(&global_record[splitRecordID].rightCentroid, ¢roidAABB[1]); + AABB_global_atomic_merge(&global_record[splitRecordID].leftGeometry, &geometryAABB[0]); + AABB_global_atomic_merge(&global_record[splitRecordID].rightGeometry, &geometryAABB[1]); + } + + numBlockIDs -= cur_blocks; + if (numBlockIDs == 0) + break; + + splitRecordID++; + + const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; + const uint blocks = (sizeRecord + blockSize - 1) / blockSize; + cur_blocks = min(numBlockIDs, blocks); + offset_start = 0; + offset_end = min(cur_blocks * blockSize, sizeRecord); + } +} + +inline void printBinaryNode(struct AABB *aabb) +{ + printf("lower %f upper %f lower.w %d upper.w %d \n", aabb->lower, aabb->upper, as_uint(aabb->lower.w), as_uint(aabb->upper.w)); +} + +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel global_finalize_iteration(global struct Globals *globals, + global struct GlobalBuildRecord *global_record, + global char *bvh_mem, + global struct AABB *binary_nodes) +{ + const uint localID = get_local_id(0); + const uint localSize = get_local_size(0); + const uint groupID = get_group_id(0); + const uint numGroups = get_num_groups(0); + + global struct 
BuildRecord *records = getBuildRecords(bvh_mem, globals); + + for (uint i = localID; i < globals->numGlobalBuildRecords; i += localSize) + { + const uint buildRecordID = global_record[i].buildRecordID; + const uint binaryNodeID = as_uint(records[buildRecordID].centroidBounds.lower.w); + /* left child buildrecord */ + const uint leftID = buildRecordID; + records[leftID].start = global_record[i].range.start; + records[leftID].end = global_record[i].range.start + global_record[i].atomicCountLeft; + records[leftID].centroidBounds = global_record[i].leftCentroid; + /* right child buildrecord */ + const uint rightID = generic_atomic_add(&globals->numBuildRecords, 1); + records[rightID].start = global_record[i].range.start + global_record[i].atomicCountLeft; + records[rightID].end = global_record[i].range.end; + records[rightID].centroidBounds = global_record[i].rightCentroid; + /* two binary nodes */ + const uint binaryChildID = generic_atomic_add(&globals->numGlobalBinaryNodes, 2); + binary_nodes[binaryNodeID].lower.w = as_float(binaryChildID + 0); + binary_nodes[binaryNodeID].upper.w = as_float(binaryChildID + 1); + binary_nodes[binaryChildID + 0] = global_record[i].leftGeometry; + binary_nodes[binaryChildID + 1] = global_record[i].rightGeometry; + binary_nodes[binaryChildID + 0].lower.w = as_float(leftID); + binary_nodes[binaryChildID + 0].upper.w = as_float(-1); + binary_nodes[binaryChildID + 1].lower.w = as_float(rightID); + binary_nodes[binaryChildID + 1].upper.w = as_float(-1); + records[leftID].centroidBounds.lower.w = as_float(binaryChildID + 0); + records[rightID].centroidBounds.lower.w = as_float(binaryChildID + 1); + } + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (localID == 0) + { + const uint sync = atomic_add(&globals->sync, 1); + if (sync + 1 == numGroups) + { + globals->sync = 0; + DBG(printf("globals->numBuildRecords %d \n", globals->numBuildRecords)); + DBG( + for (uint i = 0; i < globals->numBuildRecords; i++) { + printf("i %d \n", i); + printBuildRecord(&records[i]); + } printf("Binary Tree \n"); + for (uint i = 0; i < globals->numGlobalBinaryNodes; i++) { + printf("i %d \n", i); + printBinaryNode(&binary_nodes[i]); + } + + ); + globals->numGlobalBuildRecords = 0; + } + } +} + +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel global_build_top_level(global struct Globals *globals, + global struct GlobalBuildRecord *global_record, + global char *bvh_mem, + global struct AABB *binary_nodes) +{ +#define MAX_TOP_LEVEL_STACK_DEPTH 32 + struct AABB stack[MAX_TOP_LEVEL_STACK_DEPTH]; + global uchar *stackParentPtrs[MAX_TOP_LEVEL_STACK_DEPTH]; + struct AABB childrenAABB[BVH_NODE_N6]; + float childrenHalfArea[BVH_NODE_N6]; + + /* build records */ + global struct BuildRecord *record = getBuildRecords(bvh_mem, globals); + + struct BVHBase *base = (struct BVHBase *)bvh_mem; + struct QBVHNodeN *qnode_root = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset); + + uint stack_index = 1; + stack[0] = binary_nodes[0]; + stackParentPtrs[0] = (global uchar *)qnode_root; + + while (stack_index != 0) + { + stack_index--; + + childrenAABB[0] = stack[stack_index]; + struct QBVHNodeN *qnode = (struct QBVHNodeN *)stackParentPtrs[stack_index]; + childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]); + + /* buildrecord leaf => set parent pointer and continue*/ + DBG( + printf("stack_index %d \n", stack_index); + printf("as_uint(childrenAABB[0].upper.w) %d \n", as_uint(childrenAABB[0].upper.w));); + + if (as_uint(childrenAABB[0].upper.w) == -1) + { + const uint buildRecordID = 
as_uint(childrenAABB[0].lower.w); + DBG( + printf("leaf buildRecordID %d \n", buildRecordID); + printBuildRecord(&record[buildRecordID]);) + + record[buildRecordID].current = (global uchar *)qnode; + continue; + } + + childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]); + + uint numChildren = 1; + while (numChildren < BVH_NODE_N6) + { + // FIXME + + /*! find best child to split */ + float bestArea = -(float)INFINITY; + int bestChild = -1; + for (int i = 0; i < numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (as_uint(childrenAABB[i].upper.w) == -1) + continue; + + /* find child with largest surface area */ + if (childrenHalfArea[i] > bestArea) + { + bestChild = i; + bestArea = childrenAABB[i].lower.w; + } + } + if (bestChild == -1) + break; + const uint leftID = as_uint(childrenAABB[bestChild].lower.w); + const uint rightID = as_uint(childrenAABB[bestChild].upper.w); + childrenAABB[bestChild] = binary_nodes[leftID]; + childrenAABB[numChildren] = binary_nodes[rightID]; + childrenHalfArea[bestChild] = AABB_halfArea(&childrenAABB[bestChild]); + childrenHalfArea[numChildren] = AABB_halfArea(&childrenAABB[numChildren]); + numChildren++; + } + + const uint child_node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren); + + /* update single relative node pointer */ + const int offset = encodeOffset(bvh_mem, (global void *)qnode, child_node_offset) >> 6; + const uint type = BVH_INTERNAL_NODE; + + setQBVHNodeN(offset, type, childrenAABB, numChildren, qnode); + + DBG( + printQBVHNodeN(qnode); + printf("numChildren %d \n", numChildren); + for (uint i = 0; i < numChildren; i++) + AABB_print(&childrenAABB[i]);); + + /* update parent pointer of build records of all children */ + for (uint ID = 0; ID < numChildren; ID++) + { + stack[stack_index] = childrenAABB[ID]; + stackParentPtrs[stack_index] = (global uchar *)bvh_mem + child_node_offset + ID * sizeof(struct QBVHNodeN); + stack_index++; + } + } +} + +#endif diff --git a/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h new file mode 100644 index 00000000000..b8cf7288f6a --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h @@ -0,0 +1,1507 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "bvh_build_refit.h" +#include "libs/lsc_intrinsics.h" + + +#define REFIT_DEBUG_CHECKS 0 +#define REFIT_VERBOSE_LOG 0 + +#define NUM_STARTPOINTS_IN_SLM (1024) + +GRL_INLINE void storeAABBToL1(struct AABB aabb, struct AABB* ptr) +{ + uint8 val = (uint8)( + as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w), + as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w)); + + store_uint8_L1WB_L3WB((__global uint8*) ptr, 0, val); +} + +GRL_INLINE void storeAABBToL3(struct AABB aabb, struct AABB* ptr) +{ + uint8 val = (uint8)( + as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w), + as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w)); + + store_uint8_L1UC_L3WB((__global uint8*) ptr, 0, val); +} + +typedef struct Treelet_by_single_group_locals +{ + uint startpoints[NUM_STARTPOINTS_IN_SLM]; +} Treelet_by_single_group_locals; + +typedef struct SquashedInputGroupDesc { + qword bvh; + qword scratch; + uint groupInTree; + uint totalNumGroups; //valid only for 0th element in array, otherwise its trash padding +} SquashedInputGroupDesc; 
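// The two store helpers above differ only in cache policy: storeAABBToL1 writes the
// box back through L1 (L1WB_L3WB), which suits boxes that the same workgroup re-reads
// during its bottom-up walk, while storeAABBToL3 bypasses L1 (L1UC_L3WB) so a box
// handed over to another workgroup, such as a treelet root, is picked up from L3.
// A minimal usage sketch; storeReducedBox and its crossGroup flag are hypothetical
// and not part of this header:
GRL_INLINE void storeReducedBox(struct AABB box, struct AABB* slot, bool crossGroup)
{
    if (crossGroup)
        storeAABBToL3(box, slot);   // another workgroup consumes this box via L3
    else
        storeAABBToL1(box, slot);   // this workgroup reads the box again later
}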
+ +// +// +// update primitives +// +// + +typedef struct SquashedInput { + global struct BVHBase* pBvh; + global void* pInput; + global struct AABB* bbox_scratch; +} SquashedInput; + + + +// updates one quad leaf and gets BBOX contatining it +GRL_INLINE void refit_bottom_child_quad( + global struct QuadLeaf* quad, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + struct AABB* childAABB) +{ + struct QuadLeaf Q; + get_updated_quad(quad, geomDesc, &Q); + quadCopyVertices(&Q, quad); + *childAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad +} + +// procedurals will have to go old path at first +#if 0 +// updates one procedural leaf and gets BBOX contatining it +GRL_INLINE void refit_bottom_child_procedural( + global struct ProceduralLeaf** pleaf, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + struct AABB* childAABB) +{ + global struct ProceduralLeaf* leaf = *pleaf; + /* extract geomID and primID from leaf */ + const uint startPrim = QBVHNodeN_startPrim(curNode, child_idx); + const uint geomID = ProceduralLeaf_geomIndex(leaf); + const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf! + + /* read bounds from geometry descriptor */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); + childAABB->lower.x = aabb.MinX; + childAABB->lower.y = aabb.MinY; + childAABB->lower.z = aabb.MinZ; + childAABB->upper.x = aabb.MaxX; + childAABB->upper.y = aabb.MaxY; + childAABB->upper.z = aabb.MaxZ; + + /* advance leaf pointer to next child */ + *pleaf = leaf + QBVHNodeN_blockIncr(curNode, child_idx); +} + + +GRL_INLINE void update_procedural_leafs( + global struct BVHBase* bvh, + global void* input, + global struct AABB* bbox_scratch, + uint id, + uint num_done_by_one_thread) +{ + uint numLeaves = BVHBase_GetNumQuads(bvh); + uint leafsIndexOffset = bvh->proceduralDataStart - BVH_ROOT_NODE_OFFSET / 64; + global ProceduralLeaf* leafs = (global QuadLeaf*)BVHBase_GetProceduralLeaves(bvh); + uint start_leaf = id * num_done_by_one_thread; + uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves); + + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; + + for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++) + { + struct AABB theAABB; + refit_bottom_child_procedural(leafs + leaf_id, geosArray, &theAABB); + theAABB.lower.w = as_float(0xABBADEFF); + theAABB.upper.w = 0x00; + storeAABBToL1(theAABB, &bbox[leafsIndexOffset + leaf_id]); + } +} +#endif + +GRL_INLINE void update_quads( + global struct BVHBase* bvh, + global void* input, + global struct AABB* bbox_scratch, + uint id, + uint num_done_by_one_thread) +{ + uint numLeaves = BVHBase_GetNumQuads(bvh); + uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64; + global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh); + uint start_leaf = id * num_done_by_one_thread; + uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves); + + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; + + for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++) + { + struct AABB theAABB; + refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB); + theAABB.lower.w = as_float(0xABBADEFF); + theAABB.upper.w = 0x00; + storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////// +// +// core 
bottom-up update functions +// +// + +GRL_INLINE void quantise_bounds( + struct AABB* input_aabb, float3 len, float3 mant, float3 org, int3 exp, + uchar3* lower_uchar, + uchar3* upper_uchar) +{ + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + struct AABB child_aabb = conservativeAABB(input_aabb); // conservative ??? + + float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); + lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); + float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); + upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); + + *lower_uchar = convert_uchar3_rtn(lower); + *upper_uchar = convert_uchar3_rtp(upper); +} + +typedef struct Qbounds_as_DW { + uint32_t xLL; uint32_t xLU; uint32_t xUU; + uint32_t yLL; uint32_t yLU; uint32_t yUU; + uint32_t zLL; uint32_t zLU; uint32_t zUU; +} Qbounds_as_DW; + +GRL_INLINE void encodeQuantisedDataAsDW( + uchar3 lower_uchar, + uchar3 upper_uchar, + uint idx, + Qbounds_as_DW* qbounds) +{ + uint shift_init = idx * 8; + if (idx >= 4) { + uint shift = (shift_init - 32); + qbounds->xLU |= ((uint)lower_uchar.x) << shift; + qbounds->yLU |= ((uint)lower_uchar.y) << shift; + qbounds->zLU |= ((uint)lower_uchar.z) << shift; + } + else { + qbounds->xLL |= ((uint)lower_uchar.x) << shift_init; + qbounds->yLL |= ((uint)lower_uchar.y) << shift_init; + qbounds->zLL |= ((uint)lower_uchar.z) << shift_init; + } + + if (idx < 2) { + uint shift = (shift_init + 16); + qbounds->xLU |= ((uint)upper_uchar.x) << shift; + qbounds->yLU |= ((uint)upper_uchar.y) << shift; + qbounds->zLU |= ((uint)upper_uchar.z) << shift; + } + else { + uint shift = (shift_init - 16); + + qbounds->xUU |= ((uint)upper_uchar.x) << shift; + qbounds->yUU |= ((uint)upper_uchar.y) << shift; + qbounds->zUU |= ((uint)upper_uchar.z) << shift; + } +} + +GRL_INLINE void encodeChildBounds(uchar3 lower_uchar, uchar3 upper_uchar, uint ch, struct InternalNode* qnode) +{ + qnode->lower_x[ch] = lower_uchar.x; qnode->upper_x[ch] = upper_uchar.x; + qnode->lower_y[ch] = lower_uchar.y; qnode->upper_y[ch] = upper_uchar.y; + qnode->lower_z[ch] = lower_uchar.z; qnode->upper_z[ch] = upper_uchar.z; +} + + +GRL_INLINE GRL_OVERLOADABLE void InternalNode_setBounds_skip_prev(struct InternalNode* qbvh_node, uint prevChildIdx, struct AABB* prev_input_aabb, struct AABB* input_aabb, uint childrenIndex, const uint numChildren, struct AABB* aabb_reduced) +{ + + int3 exp; + const float up = 1.0f + ulp; + struct AABB conservative_aabb = conservativeAABB(aabb_reduced); + const float3 len = AABB_size(&conservative_aabb).xyz * up; + const float3 mant = frexp_vec3(len, &exp); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); + + qbvh_node->lower[0] = org.x; qbvh_node->lower[1] = org.y; qbvh_node->lower[2] = org.z; + + qbvh_node->exp_x = exp.x; qbvh_node->exp_y = exp.y; qbvh_node->exp_z = exp.z; + + Qbounds_as_DW qbounds = { 0x0 }; + + + { + uchar3 lower_uchar, upper_uchar; + quantise_bounds(prev_input_aabb, len, mant, org, exp, &lower_uchar, &upper_uchar); + + //encode invalid children. 
its enough to set 0x80 as lower_x bytes + uint shift = numChildren * 8; + uint shift2 = min(shift, 31u); + qbounds.xLL = (0x80808080u << shift2); + uint shift3 = max(shift, 32u) - 32; + qbounds.xLU = (ushort)(((ushort)0x8080) << (ushort)shift3); + + encodeQuantisedDataAsDW(lower_uchar, upper_uchar, prevChildIdx, &qbounds); + //encodeChildBounds(lower_uchar, upper_uchar, prevChildIdx, qbvh_node); + } + + uint ch = prevChildIdx == 0; + while (ch < numChildren) { + uchar3 lower_uchar, upper_uchar; + quantise_bounds(input_aabb + ch, len, mant, org, exp, &lower_uchar, &upper_uchar); + encodeQuantisedDataAsDW(lower_uchar, upper_uchar, ch, &qbounds); + //encodeChildBounds(lower_uchar, upper_uchar, ch, qbvh_node); + ch += 1 + (prevChildIdx == (ch + 1)); + } + Qbounds_as_DW* qbounds_dst = (Qbounds_as_DW*)(&qbvh_node->lower_x[0]); + *qbounds_dst = qbounds; + return; +} + +GRL_INLINE struct AABB refitReduce2Boxes(struct AABB A, struct AABB B) +{ + AABB_extend(&A, &B); + // to make it work for TLAS node masks change to this: + // A.lower.w = as_float(as_uint(A.lower.w) | as_uint(B.lower.w)); + A.lower.w = as_float(0xABBADE00u); + return A; +} + +GRL_INLINE void refitReduceNodePrev( + uint prevIdx, + uint leadChildIdx, + uint numChildren, + struct AABB* globalBox, + struct AABB* reduceBox, + uint depth, + uint NodeIndex) +{ + uint8_t childIgnored = (prevIdx - leadChildIdx); + +# if REFIT_DEBUG_CHECKS + bool err = false; + if ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u) + { + printf("refitReduceNode6 (loc_id %d): prev (used as child %d) not updated! NodeIndex %d, child nodeIdx %d at depth %d\n", + get_local_id(0), + childIgnored, + NodeIndex, + prevIdx, + depth); + err = true; + } + + if ((as_uint(globalBox[NodeIndex].lower.w) & 0xFFFFFF00) == 0xABBADE00u) + { + printf("refitReduceNode6 (loc_id %d): dst node already updated. NodeIndex %d depth %d\n", + get_local_id(0), + NodeIndex, + depth); + } + + bool fail = false; + for (uint k = 0; (k < numChildren) && !err; ++k) { + if (k != childIgnored) { + if ((as_uint(globalBox[leadChildIdx + k].lower.w) & 0xFFFFFF00) != 0xABBADE00u) { + printf("refitReduceNode6 (loc_id %d): child %d not updated! use prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n", + get_local_id(0), + k, + prevIdx - leadChildIdx, + NodeIndex, + leadChildIdx + k, + depth); + fail = true; + } + } + } + err |= fail; +# endif + + // for each child 3 bits contains load index + const uint32_t indicesEncoded = + (1 << 0) + + (2 << 3) + + (3 << 6) + + (4 << 9) + + (5 << 12) + + (0 << 15) + + (1 << 18) + + (2 << 21) + + (3 << 24) + + (4 << 27); + // 1,2,3,4,5 + + + uint32_t indicesEncodedShifted = indicesEncoded >> (childIgnored * 3); + + struct AABB* childAABB = globalBox + leadChildIdx; + struct AABB temp = childAABB[indicesEncodedShifted & 7]; + indicesEncodedShifted >>= 3; + struct AABB* nextChild = childAABB + (indicesEncodedShifted & 7); + struct AABB backlog = temp; + + for (uint child = 2; child < numChildren; child++) + { + temp = *nextChild; + *reduceBox = refitReduce2Boxes(*reduceBox, backlog); + indicesEncodedShifted >>= 3; + nextChild = childAABB + (indicesEncodedShifted & 7); + backlog = temp; + } + + *reduceBox = refitReduce2Boxes(*reduceBox, backlog); + +#if REFIT_DEBUG_CHECKS + for (uint k = 0; (k < numChildren) && !err; ++k) { + if (k != childIgnored) { + if (!AABB_subset(&globalBox[leadChildIdx + k], reduceBox)) { + printf("refitReduceNode6 (loc_id %d): child AABB %d/%d reduction went wrong! 
skipped prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n", + get_local_id(0), + k, numChildren, + prevIdx - leadChildIdx, + NodeIndex, + leadChildIdx + k, + depth); + + err = true; + } + } + } + if (!err && ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)) { + printf("refitReduceNode6: havent set the 0xABBADEXXu marker in result node %d at depth %d!\n", + NodeIndex, + depth); + } +#endif +} + + +GRL_INLINE uint hash_local_id() +{ + return get_sub_group_local_id() * get_num_sub_groups() + get_sub_group_id(); +} + +//=============================================================== +// +// Core update function +// +//=============================================================== +GRL_INLINE bool refit_treelet_by_single_group( + global struct AABB* bbox, + local Treelet_by_single_group_locals* loc, + uniform global BVHBase* pBvh, + uniform RefitTreelet trltDsc, + bool encodeQnodes, + bool isTipTreelet) +{ + BackPointers* backpointers = BVHBase_GetBackPointers(pBvh); + InternalNode* internalNodes = BVHBase_GetInternalNodes(pBvh); + uint local_id = get_local_id(0); + StartPoint* startPoints = BVHBase_GetRefitStartPoints(pBvh) + trltDsc.startpoint_offset; + + // special case for single path treelets, TODO rewrite it as subgroups based + if (trltDsc.numStartpoints == 1) { + if (local_id == 0) { + RefitTreeletTrivial desc = *((RefitTreeletTrivial*)& trltDsc); + uint innerNodeIdx = desc.theOnlyNodeIndex; + uint numChildren = desc.numChildrenOfTheNode; + uint childIndex = desc.childrenOffsetOfTheNode; + uint maxDepth = desc.maxDepth; + + uint prevIdx = childIndex; + struct AABB myBox = bbox[childIndex]; + struct AABB prevAABB; + uint backpointer = maxDepth > 0 ? *InnerNode_GetBackPointer(backpointers, innerNodeIdx) : 0; + InternalNode* curNode = internalNodes + innerNodeIdx; + uint currDepth = 0; + + while (1) + { + prevAABB = myBox; + if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); } + + if (!encodeQnodes) { myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); } + + if (++currDepth > maxDepth) { break; } + + if (encodeQnodes) { + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } +#if !REFIT_DEBUG_CHECKS + else +#endif + { storeAABBToL1(myBox, &bbox[innerNodeIdx]); } + + prevIdx = innerNodeIdx; + innerNodeIdx = BackPointer_GetParentIndex(backpointer); + backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + numChildren = BackPointer_GetNumChildren(backpointer); + curNode = internalNodes + innerNodeIdx; + childIndex = innerNodeIdx + curNode->childOffset; + } + + if (isTipTreelet) { + AABB3f reduced3f = AABB3fFromAABB(myBox); + pBvh->Meta.bounds = reduced3f; + } + else { + storeAABBToL3(myBox, &bbox[innerNodeIdx]); + } + + if (encodeQnodes || isTipTreelet) { + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } + +#if REFIT_VERBOSE_LOG + printf("single node treelet: storing node idx %d \n", innerNodeIdx); +#endif + } + + return local_id == 0; + } + + local uint* loc_startpoints = loc->startpoints; + + +#if REFIT_DEBUG_CHECKS + if ((trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM)) { + if(local_id == 0) printf("out of SLM space, trltDsc.depthSub_NUM_STARTPOINTS_IN_SLM > 0\n"); + return local_id == 0; + } +#endif + + uint SLMedStartpointsOffset = trltDsc.numStartpoints - trltDsc.numNonTrivialStartpoints; + + /*===================================================================== + first phase where we update startpoints nodes only + ----------------------------------------------------------------------*/ + for (uint startpoint_i = local_id; startpoint_i < trltDsc.numStartpoints; startpoint_i += get_local_size(0)) { + uint startpoint = (uint)intel_sub_group_block_read_ui((global uint*)(startPoints + startpoint_i)); + uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint); + uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + if (startpoint_i >= SLMedStartpointsOffset) { + uint idx = startpoint_i - SLMedStartpointsOffset; + loc_startpoints[idx] = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint); + } + + uint numChildren = BackPointer_GetNumChildren(backpointer); + InternalNode* curNode = internalNodes + innerNodeIdx; + uint childIndex = innerNodeIdx + curNode->childOffset; + + uint prevIdx = childIndex; + struct AABB myBox = bbox[childIndex]; + struct AABB prevAABB = myBox; + +# if REFIT_DEBUG_CHECKS + if (numChildren == 0) { + printf("this node has no chidren!\n", 0); + AABB_init(&myBox); + } +# endif + + if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); } + myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); + +#if REFIT_VERBOSE_LOG + printf("init phase: at depth 0 storing node idx %d \n", innerNodeIdx); +#endif + storeAABBToL1(myBox, &bbox[innerNodeIdx]); + + if (encodeQnodes) { + InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } + } + + uniform uint CurrPeeledDepth = 1; + uniform uint numStartpoints = trltDsc.numNonTrivialStartpoints; + uint nextFloorStartpoint = hash_local_id(); + + uint depthOnionEnd = trltDsc.depthLess64; + if (get_local_size(0) == 128) { depthOnionEnd = trltDsc.depthLess128; } + if (get_local_size(0) == 256) { depthOnionEnd = trltDsc.depthLess256; } + + /*===================================================================== + second phase, we update horizontally untill + we reach number of active path below grou size + ----------------------------------------------------------------------*/ + while (CurrPeeledDepth < depthOnionEnd) { + mem_fence_workgroup_default(); + + work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group); + uint start = nextFloorStartpoint; + nextFloorStartpoint = numStartpoints; + + for (uint startpoint_i = start; startpoint_i < numStartpoints; startpoint_i += get_local_size(0)) { + uint startpoint = loc_startpoints[startpoint_i]; + uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint); + uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + + if (StartPoint_GetDepth(startpoint) > CurrPeeledDepth) { + StartPoint newSP = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint); + loc_startpoints[startpoint_i] = newSP; + nextFloorStartpoint = min(nextFloorStartpoint, startpoint_i); + } + + InternalNode* curNode = internalNodes + innerNodeIdx; + uint childIndex = innerNodeIdx + curNode->childOffset; + uint numChildren = BackPointer_GetNumChildren(backpointer); + + uint prevIdx = childIndex; + struct AABB myBox = bbox[childIndex]; + struct AABB prevAABB = myBox; + refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx); + + myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); + +#if REFIT_VERBOSE_LOG + printf("onion: startpoint %d at depth %d storing node idx %d \n", startpoint_i, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx); +#endif + storeAABBToL1(myBox, &bbox[innerNodeIdx]); + if (encodeQnodes) { + InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } + } + CurrPeeledDepth++; + } + + uint startpoint_idx = nextFloorStartpoint; + bool active = startpoint_idx < numStartpoints; + + work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group); + StartPoint startpoint = loc_startpoints[startpoint_idx]; + + struct AABB myBox; + uint prevIdx = 0; + uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint); + + /*===================================================================== + last phase, each thread just continues path to its end + + only thread that computes the longest path leaves prematurely + (thats why while condition isn't <=) the code for finalizing root of treelet + is special and hendled afterwards + + TODO: with proper assigning of paths to lanes we should reach only three + active lanes per physical thread quite soon for this subgroups could be used + ----------------------------------------------------------------------*/ + bool prevActive = active; + while (CurrPeeledDepth < trltDsc.maxDepth) { + uint backpointer; + uint childIndex; + InternalNode* curNode = internalNodes + innerNodeIdx; + if (active) { + childIndex = innerNodeIdx + curNode->childOffset; + backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + } else if(prevActive){ + mem_fence_workgroup_default(); + } + + prevActive = active; + + work_group_barrier(0, memory_scope_work_group); + //printf("Start node %d at depth %d, innerNodeIdx %d dying! \n", StartPoint_GetNodeIdx(startpoint), CurrPeeledDepth, innerNodeIdx); + if (active) { + +#if REFIT_DEBUG_CHECKS + if (CurrPeeledDepth > StartPoint_GetDepth(startpoint)) + { + printf("uppath: startpoint %d at depth %d shouldn't be active!\n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth); + } +#endif + if (prevIdx == 0) { + myBox = bbox[childIndex]; + prevIdx = childIndex; + } + uint numChildren = BackPointer_GetNumChildren(backpointer); + + struct AABB prevAABB = myBox; + refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx); + myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); +#if REFIT_VERBOSE_LOG + printf("uppath: startpoint %d at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx); +#endif + active = CurrPeeledDepth < StartPoint_GetDepth(startpoint); + + if (encodeQnodes) { +#if !REFIT_DEBUG_CHECKS + if (!active) +#endif + { storeAABBToL1(myBox, &bbox[innerNodeIdx]); } + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } else { + storeAABBToL1(myBox, &bbox[innerNodeIdx]); + } + + prevIdx = innerNodeIdx; + innerNodeIdx = BackPointer_GetParentIndex(backpointer); + } + + CurrPeeledDepth++; + } + + { + uint backpointer; + uint childIndex; + InternalNode* curNode = internalNodes + innerNodeIdx; + if (active) { + childIndex = innerNodeIdx + curNode->childOffset; + backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + } else if(prevActive) { + mem_fence_workgroup_default(); + } + + work_group_barrier(0, memory_scope_work_group); + + /*===================================================================== + final step, is special processing of root, + its different, since its box is transfered cross group (written to L3) + or is root of whole tree and hence fill global box in bvh MD + TODO: this should be done in SG as only one thread is active + ----------------------------------------------------------------------*/ + if (active) { + if (prevIdx == 0) { + myBox = bbox[childIndex]; + prevIdx = childIndex; + } + uint numChildren = BackPointer_GetNumChildren(backpointer); + struct AABB prevAABB = myBox; + refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx); + myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4)); + +#if REFIT_VERBOSE_LOG + printf("root: startpoint %d at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx/*,WeReInSIMD*/); +#endif + if (isTipTreelet) { + AABB3f reduced3f = AABB3fFromAABB(myBox); + pBvh->Meta.bounds = reduced3f; + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } else { + storeAABBToL3(myBox, &bbox[innerNodeIdx]); + if (encodeQnodes) { + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } + } + } + } + + return active; +} + + +////////////////////////////////////////////////////////////////////////////////////// +// +// Internal nodes enocding as a separate dispatch +// +// + +// encode qnodes as a separate pass +GRL_INLINE void post_refit_encode_qnode_tree_per_group( + global struct AABB* bbox_scratch, + global struct BVHBase* bvh) +{ + uint numInnerNodes = BVHBase_GetNumInternalNodes(bvh); + InternalNode* internalNodes = BVHBase_GetInternalNodes(bvh); + + for (uint nodeIdx = get_local_id(0) + 1 /*+1 because node 0 is already updated*/; nodeIdx < numInnerNodes; nodeIdx += get_local_size(0)) + { + struct AABB reduced = bbox_scratch[nodeIdx]; +# if REFIT_DEBUG_CHECKS + if ((as_uint(reduced.lower.w) & 0xFFFFFF00) != 0xABBADE00u) { + printf("qnode enc group: NodeIndex %d not updated! 
\n", nodeIdx); + return; + } + for (uint k = 0; k < (as_uint(reduced.upper.w) & 7); ++k) { + uint childIdx = (as_uint(reduced.upper.w) >> 4) + k; + if ((as_uint(bbox_scratch[childIdx].lower.w) & 0xFFFFFF00) != 0xABBADE00u) { + printf("qnode enc group: child not updated! NodeIndex %d, child nodeIdx %d \n", nodeIdx, childIdx); + return; + } + } +# endif + struct InternalNode* qbvh_node = internalNodes + nodeIdx; + uint childIndex = as_uint(reduced.upper.w) >> 4; + uint numChildren = as_uint(reduced.upper.w) & 7; + struct AABB* children = bbox_scratch + childIndex; + //InternalNode_setBounds(internalNodes + nodeIdx, bbox_scratch + (as_uint(reduced.upper.w) >> 4), as_uint(reduced.upper.w) & 7, &reduced); + InternalNode_setBounds_skip_prev(qbvh_node, 0, children, children, childIndex, numChildren, &reduced); + } +} + +////////////////////////////////////////////////////////////////////////////////////// +// +// Construction of treelets and paths +// +// + +// this is tiny bit tricky, when bottom-up thread haven't yet closed treelet this is number of startpoints that are under the node +// when thread closed treelets it the data is starts to be treelet ID +typedef uint TreeletNodeData; + +typedef struct TreeletsOpenNodeInfo { + // bool isTreeletRoot; // : 1 + short maxDepth; // : 14 + uint numStartpoints;// : 16 +} TreeletsOpenNodeInfo; + +typedef struct TreeletsClosedNodeInfo { + // bool isTreeletRoot; // : 1 + uint treeletId; // : 31 (when treelet is closed) +} TreeletsClosedNodeInfo; + +GRL_INLINE TreeletNodeData ClearTreeletRoot(TreeletNodeData D) +{ + return D & ((1u << 31u) - 1u); +} + +GRL_INLINE uint isTreeletRoot(TreeletNodeData E) +{ + return E >> 31; +} + +GRL_INLINE uint getNumStartpoints(TreeletNodeData E) +{ + return E & ((1 << 16) - 1); +} + +GRL_INLINE uint getMaxDepth(TreeletNodeData E) +{ + return (E >> 16) & ((1 << 14) - 1); +} + +// single startpoint treelet +GRL_INLINE uint isTrivialTreeletRoot(TreeletNodeData E) +{ + return (E >> 31) && (getMaxDepth(E) == 0); +} + +GRL_INLINE TreeletNodeData SetTipStartpoint(TreeletNodeData D) +{ + return ClearTreeletRoot(D) | (1 << 30); +} + +GRL_INLINE TreeletNodeData SetTreeletRoot(TreeletNodeData D) +{ + return D | (1 << 31); +} + +GRL_INLINE TreeletsOpenNodeInfo DecodeOpenInfo(TreeletNodeData E) +{ + TreeletsOpenNodeInfo I; + I.maxDepth = getMaxDepth(E); + I.numStartpoints = getNumStartpoints(E); + return I; +} + +GRL_INLINE TreeletNodeData EncodeOpenInfo(TreeletsOpenNodeInfo I, bool isRoot) +{ + TreeletNodeData D = isRoot ? (1 << 31) : 0; + D |= (I.maxDepth & ((1 << 14) - 1)) << 16; + D |= I.numStartpoints & ((1 << 16) - 1); + return D; +} + +GRL_INLINE TreeletsClosedNodeInfo DecodeClosedInfo(TreeletNodeData E) +{ + TreeletsClosedNodeInfo I; + I.treeletId = E & ((1u << 31u) - 1u); + return I; +} + +GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(TreeletsClosedNodeInfo I) +{ + TreeletNodeData D = (1u << 31u); // closed is always a root! + D |= I.treeletId & ((1u << 31u) - 1u); + return D; +} + +GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(uint treeletId) +{ + TreeletNodeData D = (1 << 31); // closed is always a root! 
+ D |= treeletId & ((1u << 31u) - 1u); + return D; +} + +GRL_INLINE void chk_close_Treelet( + RefitTreelet* TreeletDescsArr, + TreeletNodeData* nodeTreeletDataArr, + uint* StartPointBuffer, + uint* currStartpoint, + TreeletNodeData nodeData, + TreeletsOpenNodeInfo* nodeOpenInfo, + uint nodeIdx, + uint* treeletDescIdx) +{ + if (isTreeletRoot(nodeData)) + { + TreeletNodeData encoded = 0; + if (nodeOpenInfo->numStartpoints == 1) + { + encoded = ClearTreeletRoot(SetTipStartpoint(nodeData)); + } + else + { + RefitTreelet RTdesc; + RTdesc.startpoint_offset = *currStartpoint; + *currStartpoint += nodeOpenInfo->numStartpoints; + RTdesc.numStartpoints = nodeOpenInfo->numStartpoints; + RTdesc.maxDepth = nodeOpenInfo->maxDepth; + TreeletDescsArr[*treeletDescIdx] = RTdesc; + encoded = EncodeClosedInfo(*treeletDescIdx); + *treeletDescIdx = *treeletDescIdx + 1; + TreeletsOpenNodeInfo infoDefault = { 0, 0 }; + *nodeOpenInfo = infoDefault; + } + + nodeTreeletDataArr[nodeIdx] = encoded; + } + // printf("close_Treelet %d, nodeOpenInfo.numStartpoints %d, RTdesc.maxDepth %d, RTdesc.startpoint_offset %d\n", treeletDescIdx, nodeOpenInfo.numStartpoints, RTdesc.maxDepth, RTdesc.startpoint_offset); +} + + +// TreeletNodeData* treelets holds per node property, after running this some of them are marked as treelet root +GRL_INLINE void treelet_bottom_up_mark_treelets( + global struct BVHBase* bvh, + global InternalNode* internalNodes, + global StartPoint* scratch_startpoints, + uint curNodeIndex, + BackPointers* backPointers, + global TreeletNodeData* treelets, + uint refitTreeletsDataStart, + uint* startpointAlloc) +{ + TreeletsOpenNodeInfo currInfo; + currInfo.maxDepth = 0; + currInfo.numStartpoints = 1; + + global RefitTreelet* treeletDescs = (global RefitTreelet*) (((global char*)bvh) + (refitTreeletsDataStart * 64)); + + treelets[curNodeIndex] = EncodeOpenInfo(currInfo, true); + + /* the start node got already processed, thus go to its parent node */ + uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex); + curNodeIndex = parentPointer >> 6; + + bool isInTip = false; + while (curNodeIndex != 0x03FFFFFF) + { + uint numChildrenTotal = 0; + // numChildrenTotal and parentPointer gets updated... + // atomic trickery, on backpointers, only the last one thread enters up + { + /* increment refit counter that counts refitted children of current node */ + global uint* pCurrentBackpointer = (global uint*)InnerNode_GetBackPointer(backPointers, curNodeIndex); + mem_fence_gpu_invalidate(); + parentPointer = 1 + atomic_inc_global(pCurrentBackpointer); + + /* if all children got refitted, then continue */ + const uint numChildrenRefitted = (parentPointer >> 0) & 0x7; + numChildrenTotal = (parentPointer >> 3) & 0x7; + + if (numChildrenRefitted != numChildrenTotal) + return; + + /* reset refit counter for next refit */ + *pCurrentBackpointer = (parentPointer & 0xfffffff8); + } + + /* get children treelets */ + global struct InternalNode* node = internalNodes + curNodeIndex; + uint childrenIndices = curNodeIndex + node->childOffset; + global TreeletNodeData* childrenTreelets = treelets + childrenIndices; + + // yeah, it is possible we are pulling trash here, but we wont use it. 
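        // (any child slots past numChildrenTotal are overwritten with zero just below,
        //  so the over-read values never influence the startpoint/depth reduction)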
+ // this is for the sake of one non control flow spoiled data pull + TreeletNodeData dataCh0 = childrenTreelets[0]; TreeletNodeData dataCh1 = childrenTreelets[1]; + TreeletNodeData dataCh2 = childrenTreelets[2]; TreeletNodeData dataCh3 = childrenTreelets[3]; + TreeletNodeData dataCh4 = childrenTreelets[4]; TreeletNodeData dataCh5 = childrenTreelets[5]; + + // zero out the potential trash + if (numChildrenTotal < 3) dataCh2 = 0; + if (numChildrenTotal < 4) dataCh3 = 0; + if (numChildrenTotal < 5) dataCh4 = 0; + if (numChildrenTotal < 6) dataCh5 = 0; + + TreeletsOpenNodeInfo infoCh0 = DecodeOpenInfo(dataCh0); + TreeletsOpenNodeInfo infoCh1 = DecodeOpenInfo(dataCh1); + TreeletsOpenNodeInfo infoCh2 = DecodeOpenInfo(dataCh2); + TreeletsOpenNodeInfo infoCh3 = DecodeOpenInfo(dataCh3); + TreeletsOpenNodeInfo infoCh4 = DecodeOpenInfo(dataCh4); + TreeletsOpenNodeInfo infoCh5 = DecodeOpenInfo(dataCh5); + + uint numChildrenBeingRoots = isTreeletRoot(dataCh0) + isTreeletRoot(dataCh1) + isTreeletRoot(dataCh2) + isTreeletRoot(dataCh3) + isTreeletRoot(dataCh4) + isTreeletRoot(dataCh5); + // see if we should merge the trees, if not then we should move to tip. + currInfo.numStartpoints = infoCh0.numStartpoints + infoCh1.numStartpoints + infoCh2.numStartpoints + infoCh3.numStartpoints + infoCh4.numStartpoints + infoCh5.numStartpoints; + + bool isTipStartpoint = false; + if (!isInTip) + { + // TODO: threshold could be a dynamic parameter based on the number of actual inner nodes + bool mergeTreelets = ((currInfo.numStartpoints > 0) && (currInfo.numStartpoints < TREELET_NUM_STARTPOINTS)); + bool allChildrenRootsCurrently = numChildrenTotal == numChildrenBeingRoots; + if (mergeTreelets && allChildrenRootsCurrently) + { + childrenTreelets[0] = ClearTreeletRoot(dataCh0); + childrenTreelets[1] = ClearTreeletRoot(dataCh1); // -1 will be recognised then as this is not a treelet root. 
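+                // Slots 0 and 1 are cleared unconditionally (an inner node is
+                // assumed to have at least two children); the remaining slots
+                // only if present.  With every child's root flag cleared, the
+                // current node stays an open treelet root and absorbs the
+                // children's startpoints.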
+ if (numChildrenTotal > 2) childrenTreelets[2] = ClearTreeletRoot(dataCh2); + if (numChildrenTotal > 3) childrenTreelets[3] = ClearTreeletRoot(dataCh3); + if (numChildrenTotal > 4) childrenTreelets[4] = ClearTreeletRoot(dataCh4); + if (numChildrenTotal > 5) childrenTreelets[5] = ClearTreeletRoot(dataCh5); + } + else + { + isInTip = true; + isTipStartpoint = allChildrenRootsCurrently; + } + } + + // close any roots underneath + if (isInTip && numChildrenBeingRoots) + { + uint trivialRoots = isTrivialTreeletRoot(dataCh0) + isTrivialTreeletRoot(dataCh1) + isTrivialTreeletRoot(dataCh2) + + isTrivialTreeletRoot(dataCh3) + isTrivialTreeletRoot(dataCh4) + isTrivialTreeletRoot(dataCh5); + + uint treeletId = 0; + uint bottomStartpointSpace = 0; + + uint startpointsFromTiptree = trivialRoots; + + if (trivialRoots) isTipStartpoint = false; + + if (numChildrenBeingRoots > trivialRoots) + { + startpointsFromTiptree += // startpoint ONLY from tiptree + (1 - isTreeletRoot(dataCh0)) * infoCh0.numStartpoints + + (1 - isTreeletRoot(dataCh1)) * infoCh1.numStartpoints + + (1 - isTreeletRoot(dataCh2)) * infoCh2.numStartpoints + + (1 - isTreeletRoot(dataCh3)) * infoCh3.numStartpoints + + (1 - isTreeletRoot(dataCh4)) * infoCh4.numStartpoints + + (1 - isTreeletRoot(dataCh5)) * infoCh5.numStartpoints; + + treeletId = atomic_add_global((global uint*)BVHBase_GetRefitTreeletCntPtr(bvh), numChildrenBeingRoots - trivialRoots); + bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints - startpointsFromTiptree); + } + + currInfo.numStartpoints = startpointsFromTiptree; + + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh0, &infoCh0, childrenIndices + 0, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh1, &infoCh1, childrenIndices + 1, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh2, &infoCh2, childrenIndices + 2, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh3, &infoCh3, childrenIndices + 3, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh4, &infoCh4, childrenIndices + 4, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh5, &infoCh5, childrenIndices + 5, &treeletId); + } + + if (isTipStartpoint) + { + currInfo.maxDepth = 0; + currInfo.numStartpoints = 1; + } + else + { + // reduce max depth and number of startpoint underneath + currInfo.maxDepth = max(max(max(infoCh0.maxDepth, infoCh1.maxDepth), + max(infoCh2.maxDepth, infoCh3.maxDepth)), + max(infoCh4.maxDepth, infoCh5.maxDepth)) + 1; + } + + treelets[curNodeIndex] = EncodeOpenInfo( + currInfo, + !isInTip /*mark marged treelet as an new root iff we are in bottom we */); + + /* make parent node the current node */ + curNodeIndex = parentPointer >> 6; + } + + uint treeletId = *BVHBase_GetRefitTreeletCntPtr(bvh); + + uint bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints); + + treelets[0] = EncodeClosedInfo(treeletId); + RefitTreelet tipTreeletDesc; + tipTreeletDesc.startpoint_offset = bottomStartpointSpace; + tipTreeletDesc.numStartpoints = currInfo.numStartpoints; + tipTreeletDesc.maxDepth = currInfo.maxDepth; + + treeletDescs[treeletId] = tipTreeletDesc; + + uint realNumberOfTreelets = treeletId + 1; + // intentionally we set less by 1, 
because this number is used in num groups for dispatch which is number of bottom treelets + // so substract 1. Except single treelet tree which is should stay 1. + uint numStartingTreelets = (treeletId == 0) ? 1 : treeletId; + + *BVHBase_GetRefitTreeletCntPtr(bvh) = numStartingTreelets; + + uint treeletDescSpaceIn64B = (realNumberOfTreelets * sizeof(RefitTreelet) + 63) >> 6; + uint startpointSpaceIn64B = ((bottomStartpointSpace + currInfo.numStartpoints) * sizeof(StartPoint) + 63) >> 6; + bvh->refitStartPointDataStart = refitTreeletsDataStart + treeletDescSpaceIn64B; + bvh->BVHDataEnd = refitTreeletsDataStart +treeletDescSpaceIn64B + startpointSpaceIn64B; + *startpointAlloc = 0; +} + + +GRL_INLINE void find_refit_treelets( + global struct BVHBase* bvh, + global TreeletNodeData* treelets, + global uint* scratchStartpoints, + global uint* startpointAlloc) +{ + /* get pointer to inner nodes and back pointers */ + uniform global InternalNode* inner_nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh); + + /* construct range of nodes that each work group will process */ + uniform const uint numInnerNodes = BVHBase_numNodes(bvh); + + varying ushort lane = get_sub_group_local_id(); + varying uint global_id = get_local_id(0) + get_group_id(0) * get_local_size(0); + + uint numBackpointers = BVHBase_GetNumInternalNodes(bvh); + + // align to 64B and divide + uint treeletOffsetIn64B = ((numBackpointers * sizeof(uint)) + 63) >> 6; + + uint refitTreeletsDataStart = bvh->backPointerDataStart + treeletOffsetIn64B; + if (global_id == 0) + { + bvh->refitTreeletsDataStart = refitTreeletsDataStart; + } + + global struct InternalNode* curNode = &inner_nodes[global_id]; + + varying ushort has_startpoint = 0; + if (global_id < numInnerNodes) { + if ((curNode->nodeType != BVH_INTERNAL_NODE)) + { + has_startpoint = 1; + } + } + + if (has_startpoint == 0) + return; + + treelet_bottom_up_mark_treelets( + bvh, + inner_nodes, + scratchStartpoints, + global_id, + BVHBase_GetBackPointers(bvh), + treelets, + refitTreeletsDataStart, + startpointAlloc); +} + +GRL_INLINE void assign_refit_startpoints_to_treelets( + global struct BVHBase* bvh, + global TreeletNodeData* treelets, + global uint* scratchStartpoints) +{ + /* get pointer to inner nodes and back pointers */ + uniform global struct InternalNode* inner_nodes = (global struct InternalNode*) BVHBase_GetInternalNodes(bvh); + + /* construct range of nodes that each work group will process */ + uniform const uint numInnerNodes = BVHBase_numNodes(bvh); + + varying ushort lane = get_sub_group_local_id(); + varying uint starPointNode = get_local_id(0) + get_group_id(0) * get_local_size(0); + varying uint curNodeIndex = starPointNode; + global struct InternalNode* curNode = &inner_nodes[curNodeIndex]; + + varying ushort is_startpoint = 0; + + if (curNodeIndex < numInnerNodes) + { + if ((curNode->nodeType != BVH_INTERNAL_NODE)) + { + is_startpoint = 1; + } + } + + if (is_startpoint == 0) + { + return; + } + + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh); + uint numTreelets = *BVHBase_GetRefitTreeletCntPtr(bvh); + if (numTreelets > 1) numTreelets++; + + uint myDepthWhenDead = 0; + uint startpointsBeforeMe = 0; + bool dead = false; + + uint prevNodeIndex = 0x03FFFFFF; + + while (curNodeIndex != 0x03FFFFFF) + { + TreeletNodeData nodeData = treelets[curNodeIndex]; + + uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex); + uint numChildren = 
BackPointer_GetNumChildren(parentPointer); + + // this is counterpart of atomic based entrance decision. + // the alive path is the longest, if two are equal take the one that came through child with smaller index. + if (prevNodeIndex != 0x03FFFFFF) + { + uint leadChildOfCur = curNodeIndex + inner_nodes[curNodeIndex].childOffset; + uint childEnd = numChildren + leadChildOfCur; + + uint longestPath = 0; + uint longestPathChildIdx = leadChildOfCur; + + for (uint child = leadChildOfCur; child < childEnd; child++) + { + TreeletNodeData childData = treelets[child]; + if (!isTreeletRoot(childData)) + { + TreeletsOpenNodeInfo childinfo = DecodeOpenInfo(childData); + if (longestPath <= childinfo.maxDepth) { + longestPathChildIdx = child; + longestPath = childinfo.maxDepth + 1; + } + + if (child < prevNodeIndex) + { + // also count how many startpoints are there before me (used to place startpoint in proper slot) + startpointsBeforeMe += childinfo.numStartpoints; + } + } + } + + if (!dead && prevNodeIndex != longestPathChildIdx) + { + dead = true; + //printf("starPointNode %d dies in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead); + } + + if (!dead) // this "if" is not an "else" to abouve as we might be dead before and comming through the same child index + { + myDepthWhenDead = longestPath; + // it is a startpoint + //printf("starPointNode %d in node %d lives up, its myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead); + } + + if (starPointNode == (uint)-1) { + // we just entered upper treelet as treelet if we are alive, we can be a new startpoint in new treelet + if (dead) + { + //printf("starPointNode %d disappears in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead); + // and we are dead, so we are not a startpoint of tip, + // so we must disappear to not be added as a startpoint. + return; + } + else + { + // it is a startpoint + //printf("starPointNode %d in node %d becoming its new startpoint\n", starPointNode, curNodeIndex); + starPointNode = curNodeIndex; + } + } + } + + if (isTreeletRoot(nodeData)) + { + TreeletsClosedNodeInfo info = DecodeClosedInfo(nodeData); + RefitTreelet treeletDesc = treeletDescs[info.treeletId]; + uint startpointSlot = treeletDesc.startpoint_offset + startpointsBeforeMe; + scratchStartpoints[startpointSlot] = (starPointNode << 6) + (myDepthWhenDead & ((1 << 6) - 1)); + + //printf("Adding to treeletID %d at root %d startpoint %d StartNodeIdx %d, depth %d\n", info.treeletId, curNodeIndex, startpointSlot, starPointNode, myDepthWhenDead); + + if (dead) return; + myDepthWhenDead = 0; + startpointsBeforeMe = 0; + starPointNode = (uint)-1; + } + + /* make parent node the current node */ + prevNodeIndex = curNodeIndex; + curNodeIndex = BackPointer_GetParentIndex(parentPointer); + //if(!dead) + //printf("starPointNode %d move from node %d to %d\n", starPointNode, prevNodeIndex, curNodeIndex); + } +} + +const uint FINALIZE_TREELETS_SLM_DEPTHS_SPACE = 32; + +GRL_INLINE void finalize_treelets_in_groups( + global struct BVHBase* bvh, + global uint* scratchStartpoints, + local uint* depths) +{ + uint numTreeletsExecuted = *BVHBase_GetRefitTreeletCntPtr(bvh); + + uint local_id = get_local_id(0); + + uint numTreelets = (numTreeletsExecuted > 1) ? 
numTreeletsExecuted + 1 : numTreeletsExecuted; + + RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh); + + for (uint treeletId = get_group_id(0); treeletId < numTreelets; treeletId += numTreeletsExecuted) + { + if (treeletId == numTreeletsExecuted && treeletId != 0) { work_group_barrier(CLK_LOCAL_MEM_FENCE); } + + RefitTreelet treeletDesc = treeletDescs[treeletId]; + StartPoint* srcStartpoints = scratchStartpoints + treeletDesc.startpoint_offset; + if (treeletDesc.numStartpoints <= 1) + { + // for smaller latency we store 1 element treelets as RefitTreeletTrivial, + // this happens most of the time for tip treelet + if (local_id == 0) + { + RefitTreeletTrivial tr = { 0, treeletDesc.numStartpoints, 0, treeletDesc.maxDepth, 0 }; + if (treeletDesc.numStartpoints == 1) + { + StartPoint sp = srcStartpoints[0]; + + tr.theOnlyNodeIndex = StartPoint_GetNodeIdx(sp); + uint backpointer = *InnerNode_GetBackPointer(BVHBase_GetBackPointers(bvh), tr.theOnlyNodeIndex); + tr.numChildrenOfTheNode = BackPointer_GetNumChildren(backpointer); + tr.childrenOffsetOfTheNode = BVHBase_GetInternalNodes(bvh)[tr.theOnlyNodeIndex].childOffset + tr.theOnlyNodeIndex; + } + RefitTreeletTrivial* trivial = (RefitTreeletTrivial*)(treeletDescs + treeletId); + *trivial = tr; +#if REFIT_VERBOSE_LOG + printf("treelet trivial %d {\n theOnlyNodeIndex = %d;\n numStartpoints = %d;\n childrenOffsetOfTheNode = %d;\n maxDepth =%d;\n numChildrenOfTheNode = %d;\n}\n", + treeletId, + tr.theOnlyNodeIndex, + tr.numStartpoints, + tr.childrenOffsetOfTheNode, + tr.maxDepth, + tr.numChildrenOfTheNode); +#endif + } + } + else + { +#define SKIP_PATHS_SORTING 0 +#if SKIP_PATHS_SORTING + StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset; + for (uint startpointID = local_id; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0)) + { + dstStartpoints[startpointID] = srcStartpoints[startpointID]; + } +#else + //if (local_id == 0) { printf("treelet %d, numStartpoints = %d\n", treeletId, numStartpoints); } + + if (local_id <= treeletDesc.maxDepth) { + depths[local_id] = 0; + // printf("initializing slm treelet %d, depths[%d] = 0\n", treeletId, local_id); + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + uint loopSize = ((treeletDesc.numStartpoints + (get_sub_group_size() - 1)) / get_sub_group_size()) * get_sub_group_size(); + + // collect histogram of how many paths of given length we have + + // keep count of depth 0 + uint val = 0; + + // optimize: we will load Startpoint only once to + uint S_c[8]; + // optimize: keep accumulated numbers in registers to limit number of atomic ops + uint D_c[8] = { 0 }; + + uint cached_threshold = 8 * get_local_size(0); + cached_threshold = min(cached_threshold, treeletDesc.numStartpoints); + + uint loop_turn = 0; + uint sgid = get_sub_group_local_id(); + + for (uint startpointID = local_id+ cached_threshold; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0)) + { + uint dstSlot = StartPoint_GetDepth(srcStartpoints[startpointID]); + atomic_inc((volatile local uint*) (depths + dstSlot)); + } + + uint HistogramSG = 0; + if (treeletDesc.maxDepth < 8) + { + for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) + { + StartPoint S = srcStartpoints[startpointID]; + S_c[loop_turn++] = S; + uint dstSlot = StartPoint_GetDepth(S); + D_c[dstSlot]++; + } + + for (uint d = 0; d <= treeletDesc.maxDepth; d++) + { + val = sub_group_reduce_add(D_c[d]); + if (sgid == d) + { + 
HistogramSG = val; + } + } + if (sgid <= treeletDesc.maxDepth && HistogramSG != 0) + { + atomic_add((volatile local uint*) (depths + sgid), HistogramSG); + } + } + else + { + for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) + { + StartPoint S = srcStartpoints[startpointID]; + S_c[loop_turn++] = S; + uint dstSlot = StartPoint_GetDepth(S); + atomic_inc((volatile local uint*) (depths + dstSlot)); + } + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + +#if REFIT_VERBOSE_LOG + if (local_id == 0) + { + for (uint d = 0; d <= treeletDesc.maxDepth; d++) + { + printf("treelet %d depths[%d] = %d\n", treeletId, d, depths[d]); + } + } +#endif + + if (treeletDesc.maxDepth < get_sub_group_size()) + { + if (get_sub_group_id() == 0) + { + + uint cntOfDepth = 0; + if (sgid <= treeletDesc.maxDepth) { + cntOfDepth = depths[sgid]; + } + uint pref_sum = sub_group_scan_exclusive_add(cntOfDepth); + depths[sgid] = pref_sum; + + uint numLeft = treeletDesc.numStartpoints - (pref_sum); + uint depthLess64 = (numLeft < 64 ) ? (uint)sgid : (uint)treeletDesc.maxDepth; + uint depthLess128 = (numLeft < 128) ? (uint)sgid : (uint)treeletDesc.maxDepth; + uint depthLess256 = (numLeft < 256) ? (uint)sgid : (uint)treeletDesc.maxDepth; + + // filling data for thread 0 who will save this to mem + treeletDesc.depthLess64 = sub_group_reduce_min(depthLess64); + treeletDesc.depthLess128 = sub_group_reduce_min(depthLess128); + treeletDesc.depthLess256 = sub_group_reduce_min(depthLess256); + treeletDesc.numNonTrivialStartpoints = treeletDesc.numStartpoints - cntOfDepth; + + if (sgid == 0) { + treeletDescs[treeletId] = treeletDesc; +#if REFIT_VERBOSE_LOG + printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; \n maxDepth = %d;\n depthLess64 = %d;\n depthLess128 = %d;\n depthLess256 = %d;\n}\n", + treeletId, + treeletDesc.startpoint_offset, + treeletDesc.numStartpoints, + treeletDesc.numNonTrivialStartpoints, + treeletDesc.maxDepth, + treeletDesc.depthLess64, + treeletDesc.depthLess128, + treeletDesc.depthLess256); +#endif + } + } + } + else if (local_id <= treeletDesc.maxDepth) { + uint thisdepthcount = depths[local_id]; + treeletDesc.depthLess64 = 0; + treeletDesc.depthLess128 = 0; + treeletDesc.depthLess256 = 0; + uint numLeft = treeletDesc.numStartpoints; + uint pref_sum = 0; + + for (uint d = 0; d < local_id; d++) + { + uint depthCnt = depths[d]; + if (numLeft > 64) { treeletDesc.depthLess64 = d + 1; } + if (numLeft > 128) { treeletDesc.depthLess128 = d + 1; } + if (numLeft > 256) { treeletDesc.depthLess256 = d + 1; } + pref_sum += depthCnt; + numLeft -= depthCnt; + if (d == 0) { treeletDesc.numNonTrivialStartpoints = numLeft; } + } + + if (local_id == treeletDesc.maxDepth) + { + treeletDescs[treeletId] = treeletDesc; +#if REFIT_VERBOSE_LOG + printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; maxDepth = %d;\n depthLess64 = %d; depthLess128 = %d; depthLess256 = %d;\n}\n", + treeletId, + treeletDesc.startpoint_offset, + treeletDesc.numStartpoints, + treeletDesc.numNonTrivialStartpoints, + treeletDesc.maxDepth, + treeletDesc.depthLess64, + treeletDesc.depthLess128, + treeletDesc.depthLess256); +#endif + } + } + + StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset; + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + loop_turn = 0; + if (treeletDesc.maxDepth < 8) + { + uint prefixSG = 0; + + // make prefixSG keep interval for paths with sglid 
depth that is separated out for sg. + if (sgid <= treeletDesc.maxDepth && HistogramSG != 0) + { + prefixSG = atomic_add((volatile local uint*) (depths + sgid), HistogramSG); + } + + // from now on all sgs run independently + + // make D_c keep offset interval that is separated out for given lane + for (uint d = 0; d <= treeletDesc.maxDepth; d++) + { + uint thisDPrefixSg = sub_group_broadcast(prefixSG, d); + uint thisLaneCount = D_c[d]; + uint laneOffset = sub_group_scan_exclusive_add(thisLaneCount); + D_c[d] = laneOffset + thisDPrefixSg; + } + + for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) + { + StartPoint S = S_c[loop_turn++]; + uint d = StartPoint_GetDepth(S); + uint dstSlot = D_c[d]++; + dstStartpoints[dstSlot] = S; + } + } + else + { + for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) + { + StartPoint S = S_c[loop_turn++]; + uint d = StartPoint_GetDepth(S); + uint dstSlot = atomic_inc((volatile local uint*) (depths + d)); + dstStartpoints[dstSlot] = S; + } + } + + for (uint srcStartpointID = local_id+ cached_threshold; srcStartpointID < treeletDesc.numStartpoints; srcStartpointID += get_local_size(0)) + { + StartPoint S = srcStartpoints[srcStartpointID]; + uint d = StartPoint_GetDepth(srcStartpoints[srcStartpointID]); + uint dstSlot = atomic_inc((volatile local uint*) (depths+ d)); + dstStartpoints[dstSlot] = S; + } +#endif //skip sorting + } + } +} diff --git a/src/intel/vulkan/grl/gpu/bvh_copy.cl b/src/intel/vulkan/grl/gpu/bvh_copy.cl new file mode 100644 index 00000000000..6e76f195095 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_copy.cl @@ -0,0 +1,763 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "d3d12.h" +#include "common.h" +#include "mem_utils.h" +#include "misc_shared.h" + +#define offsetof(TYPE, ELEMENT) ((size_t)&(((TYPE *)0)->ELEMENT)) + +GRL_INLINE +uint GroupCountForCopySize(uint size) +{ + return (size >> 8) + 4; +} + +GRL_INLINE +uint GroupCountForCopy(BVHBase* base) +{ + return GroupCountForCopySize(base->Meta.allocationSize); +} + +GRL_INLINE void copyInstanceDescs(InstanceDesc* instances, D3D12_RAYTRACING_INSTANCE_DESC* descs, uint64_t numInstances) +{ + for (uint64_t instanceIndex = get_local_id(0); instanceIndex < numInstances; instanceIndex += get_local_size(0)) + { + for (uint row = 0; row < 3; row++) + { + for (uint column = 0; column < 4; column++) + { + D3D12_set_transform(&descs[instanceIndex], row, column, InstanceDesc_get_transform(&instances[instanceIndex], row, column)); + } + } + D3D12_set_instanceID(&descs[instanceIndex], InstanceDesc_get_instanceID(&instances[instanceIndex])); + D3D12_set_InstanceMask(&descs[instanceIndex], InstanceDesc_get_InstanceMask(&instances[instanceIndex])); + D3D12_set_InstanceContributionToHitGroupIndex(&descs[instanceIndex], InstanceDesc_get_InstanceContributionToHitGroupIndex(&instances[instanceIndex])); + D3D12_set_InstanceFlags(&descs[instanceIndex], InstanceDesc_get_InstanceFlags(&instances[instanceIndex])); + D3D12_set_AccelerationStructure(&descs[instanceIndex], InstanceDesc_get_AccelerationStructure(&instances[instanceIndex])); + } +} + +GRL_INLINE void createGeoDescs(GeoMetaData* geoMetaData, D3D12_RAYTRACING_GEOMETRY_DESC* descs, uint64_t numGeos, const uint64_t dataBufferStart) +{ + if (get_local_id(0) == 0) + { + uint64_t previousGeoDataBufferEnd = dataBufferStart; + for (uint64_t geoIndex = 0; geoIndex < 
numGeos; geoIndex += 1) + { + D3D12_set_Type(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Type)); + D3D12_set_Flags(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Flags)); + if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES) + { + // Every triangle is stored separately + uint64_t vertexBufferSize = 9 * sizeof(float) * geoMetaData[geoIndex].PrimitiveCount; + D3D12_set_triangles_Transform(&descs[geoIndex], 0); + D3D12_set_triangles_IndexFormat(&descs[geoIndex], INDEX_FORMAT_NONE); + D3D12_set_triangles_VertexFormat(&descs[geoIndex], VERTEX_FORMAT_R32G32B32_FLOAT); + D3D12_set_triangles_IndexCount(&descs[geoIndex], 0); + D3D12_set_triangles_VertexCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount * 3); + D3D12_set_triangles_IndexBuffer(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd); + D3D12_set_triangles_VertexBuffer_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd); + D3D12_set_triangles_VertexBuffer_StrideInBytes(&descs[geoIndex], 3 * sizeof(float)); + previousGeoDataBufferEnd += vertexBufferSize; + } + else + { + D3D12_set_procedurals_AABBCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount); + D3D12_set_procedurals_AABBs_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd); + D3D12_set_procedurals_AABBs_StrideInBytes(&descs[geoIndex], sizeof(D3D12_RAYTRACING_AABB)); + previousGeoDataBufferEnd += sizeof(D3D12_RAYTRACING_AABB) * geoMetaData[geoIndex].PrimitiveCount; + } + } + } +} + +GRL_INLINE void copyIndiciesAndVerticies(D3D12_RAYTRACING_GEOMETRY_DESC* desc, QuadLeaf* quad) +{ + float* vertices = (float*)D3D12_get_triangles_VertexBuffer_StartAddress(desc); + uint64_t firstTriangleIndex = quad->primIndex0; + uint64_t numTriangles = QuadLeaf_IsSingleTriangle(quad) ? 
1 : 2; + + vertices[firstTriangleIndex * 9] = quad->v[0][0]; + vertices[firstTriangleIndex * 9 + 1] = quad->v[0][1]; + vertices[firstTriangleIndex * 9 + 2] = quad->v[0][2]; + + vertices[firstTriangleIndex * 9 + 3] = quad->v[1][0]; + vertices[firstTriangleIndex * 9 + 4] = quad->v[1][1]; + vertices[firstTriangleIndex * 9 + 5] = quad->v[1][2]; + + vertices[firstTriangleIndex * 9 + 6] = quad->v[2][0]; + vertices[firstTriangleIndex * 9 + 7] = quad->v[2][1]; + vertices[firstTriangleIndex * 9 + 8] = quad->v[2][2]; + + if (numTriangles == 2) + { + uint64_t secondTriangleIndex = firstTriangleIndex + QuadLeaf_GetPrimIndexDelta(quad); + uint32_t packed_indices = QuadLeaf_GetSecondTriangleIndices(quad); + for( size_t i=0; i<3; i++ ) + { + uint32_t idx = packed_indices & 3 ; packed_indices >>= 2; + for( size_t j=0; j<3; j++ ) + vertices[secondTriangleIndex * 9 + i * 3 + j] = quad->v[idx][j]; + } + } +} + +GRL_INLINE +void storeProceduralDesc( + struct AABB procAABB, + uint32_t primId, + D3D12_RAYTRACING_GEOMETRY_DESC* geoDesc) +{ + D3D12_RAYTRACING_AABB* proceduralDescs = (D3D12_RAYTRACING_AABB*)D3D12_get_procedurals_AABBs_StartAddress(geoDesc); + D3D12_set_raytracing_aabb(&proceduralDescs[primId], &procAABB); +} + +GRL_INLINE +void copyDataFromLProcedurals( + BVHBase* base, + D3D12_RAYTRACING_GEOMETRY_DESC* descs) +{ + unsigned numProcedurals = BVHBase_GetNumProcedurals(base); + InternalNode* innerNodes = BVHBase_GetInternalNodes(base); + unsigned numInnerNodes = BVHBase_GetNumInternalNodes(base); + + if (BVHBase_GetNumProcedurals(base) > 0) //< there's no point entering here if there are no procedurals + { + + // iterate on all inner nodes to identify those with procedural children, we have to take aabbs from them + for (uint32_t nodeI = get_local_id(0); nodeI < numInnerNodes; nodeI += get_local_size(0)) + { + InternalNode* innerNode = innerNodes + nodeI; + + if (innerNode->nodeType == NODE_TYPE_PROCEDURAL) + { + float* origin = innerNode->lower; + + global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer((struct QBVHNodeN*)innerNode); + + for (uint k = 0; k < 6; k++) + { + if (InternalNode_IsChildValid(innerNode, k)) + { + struct AABB3f qbounds = { + (float)(innerNode->lower_x[k]), (float)(innerNode->lower_y[k]), (float)(innerNode->lower_z[k]), + (float)(innerNode->upper_x[k]), (float)(innerNode->upper_y[k]), (float)(innerNode->upper_z[k]) }; + + struct AABB dequantizedAABB; + + dequantizedAABB.lower[0] = origin[0] + bitShiftLdexp(qbounds.lower[0], innerNode->exp_x - 8); + dequantizedAABB.lower[1] = origin[1] + bitShiftLdexp(qbounds.lower[1], innerNode->exp_y - 8); + dequantizedAABB.lower[2] = origin[2] + bitShiftLdexp(qbounds.lower[2], innerNode->exp_z - 8); + dequantizedAABB.upper[0] = origin[0] + bitShiftLdexp(qbounds.upper[0], innerNode->exp_x - 8); + dequantizedAABB.upper[1] = origin[1] + bitShiftLdexp(qbounds.upper[1], innerNode->exp_y - 8); + dequantizedAABB.upper[2] = origin[2] + bitShiftLdexp(qbounds.upper[2], innerNode->exp_z - 8); + + dequantizedAABB = conservativeAABB(&dequantizedAABB); + /* extract geomID and primID from leaf */ + const uint startPrim = QBVHNodeN_startPrim((struct QBVHNodeN*) innerNode, k); + const uint geomID = ProceduralLeaf_geomIndex(leaf); + const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf! 
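+                        // The dequantized box (origin + q * 2^(exp - 8), assuming
+                        // bitShiftLdexp(v, e) acts like ldexp) is emitted as a
+                        // D3D12_RAYTRACING_AABB at slot 'primID' of this geometry's
+                        // output buffer via storeProceduralDesc() below.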
+ + storeProceduralDesc(dequantizedAABB, primID, descs + geomID); + } + /* advance leaf pointer to next child */ + leaf += QBVHNodeN_blockIncr((struct QBVHNodeN*)innerNode, k); + } + + } + else if (innerNode->nodeType == NODE_TYPE_MIXED) { ERROR(); } + else {/* do nothing for other internal node types, they can't have procedural child (directly)*/; } + } + } +} + +GRL_INLINE +void copyDataFromQuadLeaves(BVHBase* base, + D3D12_RAYTRACING_GEOMETRY_DESC* descs) +{ + QuadLeaf* quads = BVHBase_GetQuadLeaves(base); + uint64_t numQuads = BVHBase_GetNumQuads(base); + for (uint64_t quadIdx = get_local_id(0); quadIdx < numQuads; quadIdx += get_local_size(0)) + { + uint64_t descIdx = PrimLeaf_GetGeoIndex(&quads[quadIdx].leafDesc); + copyIndiciesAndVerticies(&descs[descIdx], &quads[quadIdx]); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel clone_indirect(global char* dest, + global char* src) +{ + BVHBase* base = (BVHBase*)src; + uint64_t bvhSize = base->Meta.allocationSize; + + uint numGroups = GroupCountForCopy(base); + CopyMemory(dest, src, bvhSize, numGroups); +} + +GRL_INLINE void compactT(global char* dest, global char* src, uint64_t compactedSize, uint skipCopy, uint groupCnt) +{ + global BVHBase* baseSrc = (global BVHBase*)src; + global BVHBase* baseDest = (global BVHBase*)dest; + + uint32_t offset = sizeof(BVHBase); + uint32_t numNodes = BVHBase_GetNumInternalNodes(baseSrc); + uint32_t nodeSize = numNodes * sizeof(InternalNode); + offset += nodeSize; + + int quadChildFix = baseSrc->quadLeafStart; + int procChildFix = baseSrc->proceduralDataStart; + int instChildFix = baseSrc->instanceLeafStart; + + // serialization already copies part of bvh base so skip this part + CopyMemory(dest + skipCopy, src + skipCopy, sizeof(BVHBase) - skipCopy, groupCnt); + baseDest->Meta.allocationSize = compactedSize; + + if (baseSrc->Meta.instanceCount) + { + const uint32_t instLeafsSize = BVHBase_GetNumHWInstanceLeaves(baseSrc) * sizeof(HwInstanceLeaf); + CopyMemory(dest + offset, (global char*)BVHBase_GetHWInstanceLeaves(baseSrc), instLeafsSize, groupCnt); + const uint instanceLeafStart = (uint)(offset / 64); + baseDest->instanceLeafStart = instanceLeafStart; + instChildFix -= instanceLeafStart; + offset += instLeafsSize; + baseDest->instanceLeafEnd = (uint)(offset / 64); + } + if (baseSrc->Meta.geoCount) + { + const uint quadLeafsSize = BVHBase_GetNumQuads(baseSrc) * sizeof(QuadLeaf); + if (quadLeafsSize) + { + CopyMemory(dest + offset, (global char*)BVHBase_GetQuadLeaves(baseSrc), quadLeafsSize, groupCnt); + const uint quadLeafStart = (uint)(offset / 64); + baseDest->quadLeafStart = quadLeafStart; + quadChildFix -= quadLeafStart; + offset += quadLeafsSize; + baseDest->quadLeafCur = (uint)(offset / 64); + } + + const uint procLeafsSize = BVHBase_GetNumProcedurals(baseSrc) * sizeof(ProceduralLeaf); + if (procLeafsSize) + { + CopyMemory(dest + offset, (global char*)BVHBase_GetProceduralLeaves(baseSrc), procLeafsSize, groupCnt); + const uint proceduralDataStart = (uint)(offset / 64); + baseDest->proceduralDataStart = proceduralDataStart; + procChildFix -= proceduralDataStart; + offset += procLeafsSize; + baseDest->proceduralDataCur = (uint)(offset / 64); + } + } + // copy nodes with fixed child offsets + global uint* nodeDest = (global uint*)(dest + sizeof(BVHBase)); + global InternalNode* nodeSrc = (global InternalNode*)BVHBase_GetInternalNodes(baseSrc); + // used in mixed case + 
char* instanceLeavesBegin = (char*)BVHBase_GetHWInstanceLeaves(baseSrc); + char* instanceLeavesEnd = (char*)BVHBase_GetHWInstanceLeaves_End(baseSrc); + uint localId = get_sub_group_local_id(); + for (uint i = get_group_id(0); i < numNodes; i += groupCnt) + { + uint nodePart = CacheLineSubgroupRead((const global char*)&nodeSrc[i]); + char nodeType = as_char4(sub_group_broadcast(nodePart, offsetof(InternalNode, nodeType) / 4))[0]; + if (localId * 4 == offsetof(InternalNode, childOffset)) + { + int childOffset = as_int(nodePart); + if (nodeType == NODE_TYPE_MIXED) + { + char* childPtr = (char*)&nodeSrc[i] + 64 * childOffset; + if (childPtr > instanceLeavesBegin && childPtr < instanceLeavesEnd) + nodePart = as_int(childOffset - instChildFix); + } + else if (nodeType == NODE_TYPE_INSTANCE) + nodePart = as_int(childOffset - instChildFix); + else if (nodeType == NODE_TYPE_QUAD) + nodePart = as_int(childOffset - quadChildFix); + else if (nodeType == NODE_TYPE_PROCEDURAL) + nodePart = as_int(childOffset - procChildFix); + } + nodeDest[i * 16 + localId] = nodePart; + } + + if (baseSrc->Meta.instanceCount) + { + const uint32_t instanceDescSize = baseSrc->Meta.instanceCount * sizeof(InstanceDesc); + CopyMemory(dest + offset, src + baseSrc->Meta.instanceDescsStart, instanceDescSize, groupCnt); + baseDest->Meta.instanceDescsStart = offset; + offset += instanceDescSize; + } + if (baseSrc->Meta.geoCount) + { + const uint32_t geoMetaSize = baseSrc->Meta.geoCount * sizeof(GeoMetaData); + CopyMemory(dest + offset, src + baseSrc->Meta.geoDescsStart, geoMetaSize, groupCnt); + baseDest->Meta.geoDescsStart = offset; + offset += (geoMetaSize + 63) & ~63; // align to 64 + } + + uint backPointerDataStart = offset / 64; + uint refitTreeletsDataStart = backPointerDataStart; + uint refitStartPointDataStart = backPointerDataStart; + uint dataEnd = backPointerDataStart; + uint fatLeafTableStart = dataEnd; + uint fatLeafCount = baseSrc->fatLeafCount; + uint innerTableStart = dataEnd; + uint innerCount = baseSrc->innerCount; + + uint quadLeftoversCountNewAtomicUpdate = baseSrc->quadLeftoversCountNewAtomicUpdate; + uint quadTableSizeNewAtomicUpdate = baseSrc->quadTableSizeNewAtomicUpdate; + uint quadIndicesDataStart = dataEnd; + + if (BVHBase_HasBackPointers(baseSrc)) + { +#if 0 // + const uint oldbackpontersDataStart = baseSrc->backPointerDataStart; + const uint shift = oldbackpontersDataStart - backPointerDataStart; + const uint refitStructsSize = ((BVHBase_GetRefitStructsDataSize(baseSrc)) + 63) & ~63; + + CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), refitStructsSize, groupCnt); + + refitTreeletsDataStart = baseSrc->refitTreeletsDataStart - shift; + refitStartPointDataStart = baseSrc->refitStartPointDataStart - shift; + dataEnd = baseSrc->BVHDataEnd - shift; +#else // compacting version + const uint backpointersSize = ((numNodes*sizeof(uint)) + 63) & ~63; + CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), backpointersSize, groupCnt); + offset += backpointersSize; + + refitTreeletsDataStart = offset / 64; + refitStartPointDataStart = offset / 64; + + // TODO: remove treelets from .... everywhere + const uint treeletExecutedCnt = *BVHBase_GetRefitTreeletCntPtr(baseSrc); + + if (treeletExecutedCnt) + { + const uint treeletCnt = treeletExecutedCnt > 1 ? 
treeletExecutedCnt + 1 : 1; + + refitTreeletsDataStart = offset / 64; + const uint treeletsSize = ((treeletCnt * sizeof(RefitTreelet)) + 63) & ~63; + RefitTreelet* destTreelets = (RefitTreelet*)(dest + offset); + RefitTreelet* srcTreelets = BVHBase_GetRefitTreeletDescs(baseSrc); + + uint numThreads = groupCnt * get_local_size(0); + uint globalID = (get_group_id(0) * get_local_size(0)) + get_local_id(0); + + for (uint i = globalID; i < treeletCnt; i += numThreads) + { + RefitTreelet dsc = srcTreelets[i]; + RefitTreeletTrivial* trivial_dsc = (RefitTreeletTrivial*)&dsc; + if (trivial_dsc->numStartpoints == 1 && trivial_dsc->childrenOffsetOfTheNode > numNodes) { + trivial_dsc->childrenOffsetOfTheNode -= quadChildFix; + } + destTreelets[i] = dsc; + } + + offset += treeletsSize; + + refitStartPointDataStart = offset / 64; + const uint startPointsSize = (BVHBase_GetRefitStartPointsSize(baseSrc) + 63) & ~63; + CopyMemory(dest + offset, (global char*)BVHBase_GetRefitStartPoints(baseSrc), startPointsSize, groupCnt); + offset += startPointsSize; + dataEnd = offset / 64; + } + + uint fatleafEntriesSize = ((fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63); + fatLeafTableStart = offset / 64; + if (fatleafEntriesSize) { + CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), fatleafEntriesSize, groupCnt); + } + offset += fatleafEntriesSize; + + // New atomic update + if(baseSrc->quadIndicesDataStart > baseSrc->backPointerDataStart) + { + uint numQuads = BVHBase_GetNumQuads(baseSrc); + uint quadTableMainBufferSize = (numQuads + 255) & ~255; + uint quadLeftoversSize = (quadLeftoversCountNewAtomicUpdate + 255) & ~255; + uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63); + if (quadTableEntriesSize) { + CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), quadTableEntriesSize, groupCnt); + } + offset += quadTableEntriesSize; + + uint quadIndicesDataSize = ((numQuads * sizeof(QuadDataIndices) + 63) & ~63); + quadIndicesDataStart = offset / 64; + if (quadIndicesDataSize) { + CopyMemory(dest + offset, (global char*)BVHBase_GetQuadDataIndicesTable(baseSrc), quadIndicesDataSize, groupCnt); + } + offset += quadIndicesDataSize; + } + + uint innerEntriesSize = ((innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63); + innerTableStart = offset / 64; + if (innerEntriesSize) { + CopyMemory(dest + offset, (global char*)BVHBase_GetInnerNodeTable(baseSrc), innerEntriesSize, groupCnt); + } + offset += innerEntriesSize; + + dataEnd = offset / 64; +#endif + } + + baseDest->backPointerDataStart = backPointerDataStart; + baseDest->refitTreeletsDataStart = refitTreeletsDataStart; + baseDest->refitStartPointDataStart = refitStartPointDataStart; + baseDest->fatLeafTableStart = fatLeafTableStart ; + baseDest->fatLeafCount = fatLeafCount; + baseDest->innerTableStart = innerTableStart; + baseDest->innerCount = innerCount; + + baseDest->quadLeftoversCountNewAtomicUpdate = quadLeftoversCountNewAtomicUpdate; + baseDest->quadTableSizeNewAtomicUpdate = quadTableSizeNewAtomicUpdate; + baseDest->quadIndicesDataStart = quadIndicesDataStart; + baseDest->BVHDataEnd = dataEnd; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel compact(global char* dest, + global char* src, + uint groupCnt) +{ + uint64_t compactedSize = compute_compacted_size((BVHBase*)src); + compactT(dest, src, compactedSize, 0, groupCnt); +} + +// 
set serialization header along all lanes, each lane will get one dword of header plus 64bit reminding data +GRL_INLINE +unsigned prepare_header( + uint64_t headerSize, + uint64_t instancePtrSize, + uint64_t numInstances, + uint64_t bvhSize, + uint8_t* driverID, + uint64_t reminder) +{ + + unsigned loc_id = get_sub_group_local_id(); + + uint64_t SerializedSizeInBytesIncludingHeader = headerSize + instancePtrSize * numInstances + bvhSize; + uint64_t DeserializedSizeInBytes = bvhSize; + uint64_t InstanceHandleCount = numInstances; + + char bvh_magic_str[] = BVH_MAGIC_MACRO; + uint* bvh_magic_uint = (uint*)bvh_magic_str; + + unsigned headerTempLanePiece; + if (loc_id < 4) { headerTempLanePiece = *((unsigned*)&driverID[4*loc_id]); } + else if (loc_id == 4) { headerTempLanePiece = bvh_magic_uint[0]; } + else if (loc_id == 5) { headerTempLanePiece = bvh_magic_uint[1]; } + else if (loc_id == 6) { headerTempLanePiece = bvh_magic_uint[2]; } + else if (loc_id == 7) { headerTempLanePiece = bvh_magic_uint[3]; } + else if (loc_id == 8) { headerTempLanePiece = (uint)SerializedSizeInBytesIncludingHeader; } + else if (loc_id == 9) { headerTempLanePiece = (uint)(SerializedSizeInBytesIncludingHeader >> 32ul); } + else if (loc_id == 10) { headerTempLanePiece = (uint)DeserializedSizeInBytes; } + else if (loc_id == 11) { headerTempLanePiece = (uint)(DeserializedSizeInBytes >> 32ul); } + else if (loc_id == 12) { headerTempLanePiece = (uint)InstanceHandleCount; } + else if (loc_id == 13) { headerTempLanePiece = (uint)(InstanceHandleCount >> 32ul); } + else if (loc_id == 14) { headerTempLanePiece = (uint)reminder; } + else if (loc_id == 15) { headerTempLanePiece = (uint)(reminder >> 32ul); } + + return headerTempLanePiece; +} + + + + +GRL_INLINE +void serializeT( + global byte_align64B* dest, + global byte_align64B* src, + global uint8_t* driverID, + uint groups_count) +{ + SerializationHeader* header = (SerializationHeader*)dest; + BVHBase* base = (BVHBase*)src; + + const uint headerSize = sizeof(SerializationHeader); + const uint numInstances = base->Meta.instanceCount; + const uint instancePtrSize = sizeof(gpuva_t); + const uint compactedSize = compute_compacted_size(base); + uint local_id = get_sub_group_local_id(); + + // this is not 64byte aligned :( + const uint offsetToBvh = headerSize + instancePtrSize * numInstances; + + global InstanceDesc* src_instances = 0; + + if (numInstances) { + src_instances = (global InstanceDesc*)((uint64_t)base + base->Meta.instanceDescsStart); + } + + // effectively this part should end up as one 64B aligned 64B write + if (get_group_id(0) == groups_count - 1) + { + Block64B headerPlus; + + // we patch the missing piece with instance or bhv beginning (TRICK A and B) + // we assume header is 56B. + global uint64_t* srcPiece = (numInstances != 0) ? 
&src_instances[0].AccelerationStructureGPUVA : (global uint64_t*)src; + + unsigned headerTemp; + + headerTemp = prepare_header( + headerSize, + instancePtrSize, + numInstances, + compactedSize, + driverID, + *srcPiece); + + CacheLineSubgroupWrite((global byte_align64B*)dest, headerTemp); + } + + if (numInstances > 0) + { + uint instancesOffset = headerSize; + uint aligned_instance_ptrs_offset = ((instancesOffset + 63) >> 6) << 6; + uint unaligned_prefixing_instance_cnt = (aligned_instance_ptrs_offset - instancesOffset) >> 3; + unaligned_prefixing_instance_cnt = min(unaligned_prefixing_instance_cnt, numInstances); + + global uint64_t* dst_instances = (global uint64_t*)(dest + instancesOffset); + + // we've copied first instance onto a header, (see TRICK A) + // now we have only instances start at aligned memory + uint numAlignedInstances = numInstances - unaligned_prefixing_instance_cnt; + dst_instances += unaligned_prefixing_instance_cnt; + src_instances += unaligned_prefixing_instance_cnt; + + if (numAlignedInstances) + { + // each 8 instances form a cacheline + uint numCachelines = numAlignedInstances >> 3; //qwords -> 64Bs + // qwords besides multiple of 8; + uint startReminder = numAlignedInstances & ~((1 << 3) - 1); + uint numreminder = numAlignedInstances & ((1 << 3) - 1); + + uint task_id = get_group_id(0); + + while (task_id < numCachelines) + { + uint src_id = task_id * 8 + (local_id >> 1); + uint* src_uncorected = (uint*)& src_instances[src_id].AccelerationStructureGPUVA; + uint* src = ((local_id & 1) != 0) ? src_uncorected + 1 : src_uncorected; + uint data = *src; + + global char* dst = (global byte_align64B*)(dst_instances + (8 * task_id)); + CacheLineSubgroupWrite(dst, data); + task_id += groups_count; + } + + if (task_id == numCachelines && local_id < 8 && numreminder > 0) + { + // this should write full cacheline + + uint index = startReminder + local_id; + // data will be taken from instances for lanes (local_id < numreminder) + // copy srcbvh beginning as uint64_t for remaining lanes (TRICK B) + global uint64_t* srcData = (local_id < numreminder) ? 
+ &src_instances[index].AccelerationStructureGPUVA : + ((global uint64_t*)src) + (local_id - numreminder); + dst_instances[index] = *srcData; + } + } + } + + // the parts above copied unaligned dst beginning of bvh (see TRICK B) + uint32_t unalignedPartCopiedElsewhere = (64u - (offsetToBvh & (64u - 1u)))&(64u - 1u); + + compactT(dest + offsetToBvh, src, compactedSize, unalignedPartCopiedElsewhere, groups_count); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel serialize_indirect( + global char* dest, + global char* src, + global uint8_t* driverID) +{ + BVHBase* base = (BVHBase*)src; + uint groups_count = GroupCountForCopy(base); + serializeT(dest, src, driverID, groups_count); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel serialize_for_input_dump_indirect( + global struct OutputBatchPtrs* batchPtrs, + global dword* dstOffset, + global char* src, + global uint8_t* driverID) +{ + BVHBase* base = (BVHBase*)src; + uint groups_count = GroupCountForCopy(base); + global char* dest = (global char*)(batchPtrs->dataStart + *dstOffset); + dest += (sizeof(OutputData) + 127) & ~127; + serializeT(dest, src, driverID, groups_count); +} + +GRL_INLINE +void deserializeT( + global char* dest, + global char* src, + unsigned groupCnt) +{ + SerializationHeader* header = (SerializationHeader*)src; + + const uint64_t headerSize = sizeof(struct SerializationHeader); + const uint64_t instancePtrSize = sizeof(gpuva_t); + const uint64_t numInstances = header->InstanceHandleCount; + const uint64_t offsetToBvh = headerSize + instancePtrSize * numInstances; + const uint64_t bvhSize = header->DeserializedSizeInBytes; + + if (numInstances) + { + const bool instances_mixed_with_inner_nodes = false; + if (instances_mixed_with_inner_nodes) + { + // not implemented ! + // copy each node with 64byte granularity if node is instance, patch it mid-copy + } + else + { + BVHBase* srcBvhBase = (BVHBase*)(src + offsetToBvh); + + // numHWInstances can be bigger (because of rebraiding) or smaller (because of inactive instances) than + // numInstances (count of pointers and descriptors). 
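+            // instanceLeafStart/End are counted in 64-byte blocks, so the byte
+            // offset is start << 6 and, with each HwInstanceLeaf occupying two
+            // blocks (128 B), the leaf count is (end - start) >> 1.  E.g.
+            // start = 10, end = 14 gives byte offset 640 and 2 HW leaves.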
+ uint offsetToHwInstances = srcBvhBase->instanceLeafStart << 6; + uint numHwInstances = (srcBvhBase->instanceLeafEnd - srcBvhBase->instanceLeafStart) >> 1; + + // + // instances are in separate memory intervals + // copy all the other data simple way + // + uint nodesEnd = srcBvhBase->Meta.instanceDescsStart; + // copy before instance leafs + CopyMemory(dest, (global char*)(src + offsetToBvh), offsetToHwInstances, groupCnt); + + uint offsetPostInstances = srcBvhBase->instanceLeafEnd << 6; + uint instanceDescStart = srcBvhBase->Meta.instanceDescsStart; + uint sizePostInstances = instanceDescStart - offsetPostInstances; + // copy after instance leafs before instance desc + CopyMemory(dest + offsetPostInstances, (global char*)(src + offsetToBvh + offsetPostInstances), sizePostInstances, groupCnt); + + uint instanceDescEnd = instanceDescStart + numInstances * sizeof(InstanceDesc); + uint sizePostInstanceDescs = bvhSize - instanceDescEnd; + // copy after instance desc + CopyMemory(dest + instanceDescEnd, (global char*)(src + offsetToBvh + instanceDescEnd), sizePostInstanceDescs, groupCnt); + + global gpuva_t* newInstancePtrs = (global gpuva_t*)(src + headerSize); + global InstanceDesc* dstDesc = (global InstanceDesc*)(dest + instanceDescStart); + global InstanceDesc* srcDesc = (global InstanceDesc*)(src + offsetToBvh + instanceDescStart); + + // copy and patch instance descriptors + for (uint64_t instanceIndex = get_group_id(0); instanceIndex < numInstances; instanceIndex += groupCnt) + { + InstanceDesc desc = srcDesc[instanceIndex]; + uint64_t newInstancePtr = newInstancePtrs[instanceIndex]; + desc.AccelerationStructureGPUVA = newInstancePtr; // patch it with new ptr; + + dstDesc[instanceIndex] = desc; + } + + // copy and patch hw instance leafs + global HwInstanceLeaf* dstInstleafs = (global HwInstanceLeaf*)(dest + offsetToHwInstances); + global HwInstanceLeaf* srcInstleafs = (global HwInstanceLeaf*)(src + offsetToBvh + offsetToHwInstances); + + for (uint hwLeafIndex = get_group_id(0); hwLeafIndex < numHwInstances; hwLeafIndex += groupCnt) + { + // pull the instance from srcBVH + HwInstanceLeaf tmpInstleaf = srcInstleafs[hwLeafIndex]; + + uint swInstanceIndex = HwInstanceLeaf_GetInstanceIndex(&tmpInstleaf); + uint64_t childBvhPtr = (uint64_t)newInstancePtrs[swInstanceIndex]; + uint64_t originalBvhPtr = (uint64_t)HwInstanceLeaf_GetBVH(&tmpInstleaf); + + HwInstanceLeaf_SetBVH(&tmpInstleaf, childBvhPtr); + uint64_t startNode = HwInstanceLeaf_GetStartNode(&tmpInstleaf); + + if (startNode != 0) { + uint64_t rootNodeOffset = startNode - originalBvhPtr; + HwInstanceLeaf_SetStartNode(&tmpInstleaf, childBvhPtr + rootNodeOffset); + } + + dstInstleafs[hwLeafIndex] = tmpInstleaf; + } + } + } + else + { + CopyMemory(dest, (global char*)(src + offsetToBvh), bvhSize, groupCnt); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel deserialize_indirect( + global char* dest, + global char* src) +{ + SerializationHeader* header = (SerializationHeader*)src; + const uint64_t bvhSize = header->DeserializedSizeInBytes; + unsigned groupCnt = GroupCountForCopySize(bvhSize); + deserializeT(dest, src, groupCnt); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel dxr_decode(global char* dest, + global char* src) +{ + + DecodeHeader* header = (DecodeHeader*)dest; + BVHBase* base = (BVHBase*)src; + + uint32_t numGeos = 
base->Meta.geoCount; + uint32_t numInstances = base->Meta.instanceCount; + + if (numInstances > 0) + { + header->Type = TOP_LEVEL; + header->NumDesc = numInstances; + + D3D12_RAYTRACING_INSTANCE_DESC* instanceDesc = (D3D12_RAYTRACING_INSTANCE_DESC*)(dest + sizeof(DecodeHeader)); + copyInstanceDescs((InstanceDesc*)((uint64_t)base + (uint64_t)base->Meta.instanceDescsStart), + instanceDesc, + numInstances); + } + else if (numGeos > 0) + { + header->Type = BOTTOM_LEVEL; + header->NumDesc = numGeos; + + D3D12_RAYTRACING_GEOMETRY_DESC* geomDescs = (D3D12_RAYTRACING_GEOMETRY_DESC*)(dest + sizeof(DecodeHeader)); + uint64_t data = (uint64_t)geomDescs + sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) * numGeos; + createGeoDescs((GeoMetaData*)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart), + geomDescs, + numGeos, + data); + + work_group_barrier(CLK_GLOBAL_MEM_FENCE); + + copyDataFromQuadLeaves(base, + geomDescs); + + copyDataFromLProcedurals(base, + geomDescs); + } + else + { + header->Type = BOTTOM_LEVEL; + header->NumDesc = 0; + } +} diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.cl b/src/intel/vulkan/grl/gpu/bvh_debug.cl new file mode 100644 index 00000000000..bce75fec3ff --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_debug.cl @@ -0,0 +1,208 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// @file bvh_debug.cl +// +// @brief routines to do basic integrity checks +// +// Notes: +// + +#include "GRLGen12.h" +#include "intrinsics.h" +#include "libs/lsc_intrinsics.h" +#include "GRLGen12IntegrityChecks.h" +#include "api_interface.h" + +#define ERROR_PRINTF 0 +GRL_INLINE bool commit_err( + global uint* some_null, + global BVHBase* bvh, + global ERROR_INFO* err_info_slot, + ERROR_INFO err) +{ + if (err.type != error_t_no_error) { + uint expected = error_t_no_error; + atomic_compare_exchange_global(&err_info_slot->type, &expected, err.type); + if (expected == error_t_no_error) + { + err_info_slot->offset_in_BVH = err.offset_in_BVH; + err_info_slot->when = err.when; + err_info_slot->reserved = 0xAAACCAAA; + mem_fence_evict_to_memory(); +#if ERROR_PRINTF + printf("bvh = 0x%llX, err.type = %X, err.offset_in_BVH = %d\n", bvh, err.type, err.offset_in_BVH); +#else + // This is to trigger PF. Note we have to write directly to memory. + // If write would stay in L3 it won't give a PF untill this will get evicted to mem. 
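+        // The store below bypasses L1/L3 so the fault is raised right away;
+        // the written value encodes the error type in its low bits
+        // (e.g. err.type == 3 stores 0x0EEE0003 to the null pointer).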
+ store_uint_L1UC_L3UC(some_null, 0, 0x0EEE0000 + err.type); +#endif + return true; + } + } + return false; +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel check_tree_topology( + global uint* some_null, + global BVHBase* bvh, + global ERROR_INFO* err, + uint phase) +{ + uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + + if (err->type != error_t_no_error) return; + + uint dummy1, dummy2, dummy3; + ERROR_INFO reterr = check_tree_topology_helper(bvh, globalID, &dummy1, &dummy2, &dummy3, false); + if (reterr.type == error_t_no_error) + { + reterr = check_backpointers(bvh, globalID); + } + if (reterr.type == error_t_no_error) + { + reterr = validate_atomic_update_structs(bvh, globalID); + } + reterr.when = phase; + commit_err(some_null, bvh, err, reterr); +} + +GRL_INLINE bool IsValid48bPtr(qword ptr) +{ + qword CANONIZED_BITS = 0xFFFFul << 48ul; + qword canonized_part = ptr & CANONIZED_BITS; + bool isIt = ptr != 0 && ( + canonized_part == 0 || canonized_part == CANONIZED_BITS); + return isIt; +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel check_geos_before_quad_update( + global BVHBase* bvh, //dest bvh + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global uint* some_null, + global ERROR_INFO* err, + uint phase, + uint numGeos, + uint numThreads) +{ + uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + + if (err->type != error_t_no_error) return; + + // first check sanity of geos + ERROR_INFO geo_insanity_error = { error_t_input_geo_insane, 0 }; + + for (uint ID = globalID; ID < numGeos; ID += numThreads * get_sub_group_size()) + { + bool IsSane = IsValid48bPtr((qword)(qword)geomDesc); + + if (IsSane) { + GRL_RAYTRACING_GEOMETRY_DESC geo = geomDesc[globalID]; + IsSane = geo.Type < NUM_GEOMETRY_TYPES; + if (IsSane) { + if (geo.Type == GEOMETRY_TYPE_TRIANGLES) { + if (geo.Desc.Triangles.IndexFormat >= INDEX_FORMAT_END) { + IsSane = false; + } + else + { + if (geo.Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE && geo.Desc.Triangles.IndexCount > 2) + { + IsSane = (geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END) && + IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) && + IsValid48bPtr((qword)geo.Desc.Triangles.pIndexBuffer); + } + else if (geo.Desc.Triangles.VertexCount > 2) + { + IsSane = + geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END&& + IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) != 0; + } + } + } + } + } + + geo_insanity_error.offset_in_BVH = ID; + geo_insanity_error.when = phase; + if (!IsSane) { + commit_err(some_null, bvh, err, geo_insanity_error); + } + return; + } +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel check_geos_vs_quads( + global BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global uint* some_null, + global ERROR_INFO* err, + uint phase, + uint numGeos, + uint numThreads) +{ + uint numQuads = BVHBase_GetNumQuads(bvh); + + QuadLeaf* quads = BVHBase_GetQuadLeaves(bvh); + + uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + uint qoffset = bvh->quadLeafStart; + + if (err->type != error_t_no_error) return; + + ERROR_INFO theErr = { error_t_no_error, 0 }; + + for (uint ID = globalID; ID < numQuads; ID += numThreads * get_sub_group_size()) + { + ERROR_INFO quadErr = { error_t_quad_leaf_broken, qoffset + ID, phase }; + + QuadLeaf quad = quads[ID]; + + uint geoIdx = PrimLeaf_GetGeoIndex(&quad.leafDesc); + + if (geoIdx > numGeos) { commit_err(some_null, bvh, err, quadErr); return; } + + uint numPrimsInGeo = 
geomDesc[geoIdx].Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE ? + geomDesc[geoIdx].Desc.Triangles.IndexCount / 3 : + geomDesc[geoIdx].Desc.Triangles.VertexCount / 3; + + if(quad.primIndex0 >= numPrimsInGeo) { + commit_err(some_null, bvh, err, quadErr); + return; + } + + if(!QuadLeaf_IsSingleTriangle(&quad) && + (quad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&quad) >= numPrimsInGeo)) + { + commit_err(some_null, bvh, err, quadErr); + return; + } + } +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel check_instances_linked_bvhs( + global uint* some_null, + global BVHBase* bvh, + global ERROR_INFO* err, + uint phase) +{ + if (err->type != error_t_no_error) return; + + uint instanceLeafStart = bvh->instanceLeafStart; + uint instanceLeafEnd = bvh->instanceLeafEnd; + uint numInstances = (instanceLeafEnd - instanceLeafStart) / 2; + + uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + + ERROR_INFO reterr = check_instances_linked_bvhs_helper(bvh, globalID, /*touchBlas*/true); + reterr.when = phase; + commit_err(some_null, bvh, err, reterr); +} diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.grl b/src/intel/vulkan/grl/gpu/bvh_debug.grl new file mode 100644 index 00000000000..28008ab09ce --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_debug.grl @@ -0,0 +1,107 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module bvh_on_gpu_checks; + +kernel_module debug_kernels ("bvh_debug.cl") +{ + links lsc_intrinsics; + kernel opencl_check_tree_topology < kernelFunction="check_tree_topology">; + kernel opencl_check_instances_linked_bvhs < kernelFunction="check_instances_linked_bvhs">; + kernel opencl_check_geos_before_quad_update < kernelFunction="check_geos_before_quad_update">; + kernel opencl_check_geos_vs_quads < kernelFunction="check_geos_vs_quads">; +} + + +metakernel debug_checks_prepare_const_regs() +{ + define cRoundingSIMD REG4; + define cInit0 REG5; + define cShiftForSIMD REG3; + cRoundingSIMD = (16-1); + cShiftForSIMD = 4; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; +} + +metakernel debug_checks_bvh_topology( + qword some_null_ptr, + qword bvh, + qword bvh_inner_nodes_end, + qword error_struct, + dword when, + dword bvh_inner_nodes_start_value ) +{ + define cRoundingSIMD REG4; + define cShiftForSIMD REG3; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG0 = bvh_inner_nodes_start_value; + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + cRoundingSIMD; + REG2 = REG2 >> cShiftForSIMD; + + DISPATCHDIM_X = REG2.lo; + + dispatch_indirect opencl_check_tree_topology args( + some_null_ptr, + bvh, + error_struct, + when); +} + +metakernel debug_check_instances_linked_bvhs( + qword some_null_ptr, + qword bvh, + qword error_struct, + dword numHWThreads, + dword when) +{ + dispatch opencl_check_instances_linked_bvhs(numHWThreads,1,1) args( + some_null_ptr, + bvh, + error_struct, + when); +} + +metakernel debug_check_geos_before_quad_update( + qword bvh, + qword geos, + qword some_null_ptr, + qword error_struct, + dword when, + dword numGeos, + dword numHWThreads ) +{ + dispatch opencl_check_geos_before_quad_update(numHWThreads,1,1) args( + bvh, + geos, + some_null_ptr, + error_struct, + when, + numGeos, + numHWThreads ); +} + +metakernel debug_check_geos_vs_quads( + qword bvh, + qword geos, + qword some_null_ptr, + qword error_struct, + dword when, + dword numGeos, + dword numHWThreads ) +{ + dispatch opencl_check_geos_vs_quads(numHWThreads,1,1) args( + bvh, + geos, + some_null_ptr, + error_struct, + when, + numGeos, + 
numHWThreads ); +} diff --git a/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl b/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl new file mode 100644 index 00000000000..4fa222b53eb --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl @@ -0,0 +1,97 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "d3d12.h" +#include "common.h" + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel compacted_size(global char *bvh_mem, + global char *postbuild_info) +{ + BVHBase *base = (BVHBase *)bvh_mem; + PostbuildInfoCompactedSize *postbuildInfoCompacted = (PostbuildInfoCompactedSize *)postbuild_info; + + postbuildInfoCompacted->CompactedSizeInBytes = compute_compacted_size(base); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel current_size(global char *bvh_mem, + global char *postbuild_info) +{ + + BVHBase *base = (BVHBase *)bvh_mem; + PostbuildInfoCurrentSize *postbuildInfoCurrent = (PostbuildInfoCurrentSize *)postbuild_info; + + postbuildInfoCurrent->CurrentSizeInBytes = base->Meta.allocationSize; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel serialized_size(global char *bvh_mem, + global char *postbuild_info) +{ + + BVHBase *base = (BVHBase *)bvh_mem; + PostbuildInfoSerializationDesc *postbuildInfoSerialization = (PostbuildInfoSerializationDesc *)postbuild_info; + + uint64_t headerSize = sizeof(SerializationHeader); + uint64_t numInstances = base->Meta.instanceCount; + + postbuildInfoSerialization->SerializedSizeInBytes = sizeof(SerializationHeader) + + numInstances * sizeof(gpuva_t) + + compute_compacted_size(base); + //base->Meta.allocationSize; + postbuildInfoSerialization->NumBottomLevelAccelerationStructurePointers = numInstances; +} + +void countTrianglesAndProcedurals(GeoMetaData *geoMetaData, + uint64_t numGeos, + uint64_t *numTriangles, + uint64_t *numProcedurals) +{ + uint64_t numTrianglesLoc = 0; + uint64_t numProceduralsLoc = 0; + + for (uint64_t geoIndex = get_local_id(0); geoIndex < numGeos; geoIndex += get_local_size(0)) + { + if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES) + { + *numTriangles += geoMetaData[geoIndex].PrimitiveCount; + } + else + { + *numProcedurals += geoMetaData[geoIndex].PrimitiveCount; + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel decoded_size(global char *bvh_mem, + global char *postbuild_info) +{ + BVHBase *base = (BVHBase *)bvh_mem; + PostbuildInfoToolsVisualizationDesc *postbuildInfoDecoded = (PostbuildInfoToolsVisualizationDesc *)postbuild_info; + + uint64_t numTriangles = 0; + uint64_t numProcedurals = 0; + countTrianglesAndProcedurals((GeoMetaData *)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart), + base->Meta.geoCount, + &numTriangles, + &numProcedurals); + uint64_t numInstances = base->Meta.instanceCount; + uint64_t numDescs = base->Meta.geoCount; + uint64_t headerSize = sizeof(DecodeHeader); + uint64_t descsSize = numDescs * sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) + + numInstances * sizeof(D3D12_RAYTRACING_INSTANCE_DESC); + + // Each triangle is stored separately - 3 vertices (9 floats) per triangle + uint64_t triangleDataSize = 9 * sizeof(float); + uint64_t proceduralDataSize = sizeof(D3D12_RAYTRACING_AABB); + uint64_t geoDataSize = numTriangles * triangleDataSize + numProcedurals * proceduralDataSize; + + postbuildInfoDecoded->DecodedSizeInBytes = 
headerSize + descsSize + geoDataSize; +} diff --git a/src/intel/vulkan/grl/gpu/bvh_rebraid.cl b/src/intel/vulkan/grl/gpu/bvh_rebraid.cl new file mode 100644 index 00000000000..ab0f891acee --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_rebraid.cl @@ -0,0 +1,1683 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "AABB.h" +#include "GRLGen12.h" +#include "api_interface.h" +#include "common.h" +#include "qbvh6.h" + +#define MAX_SPLITS_PER_INSTANCE 64 +#define NUM_REBRAID_BINS 32 + +#define NUM_CHILDREN 6 +#define MAX_NODE_OFFSET 65535 // can't open nodes whose offsets exceed this + +// OCL/DPC++ *SHOULD* have a uniform keyword... but they dont... so I'm making my own +#define uniform +#define varying + +#define SGPRINT_UNIFORM(fmt,val) {sub_group_barrier(CLK_LOCAL_MEM_FENCE); if( get_sub_group_local_id() == 0 ) { printf(fmt,val); }} + +#define SGPRINT_6x(prefix,fmt,type,val) {\ + type v0 = sub_group_broadcast( val, 0 );\ + type v1 = sub_group_broadcast( val, 1 );\ + type v2 = sub_group_broadcast( val, 2 );\ + type v3 = sub_group_broadcast( val, 3 );\ + type v4 = sub_group_broadcast( val, 4 );\ + type v5 = sub_group_broadcast( val, 5 );\ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ + if( get_sub_group_local_id() == 0 ) { \ + printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \ + v0,v1,v2,v3,v4,v5);}} + + +#define SGPRINT_16x(prefix,fmt,type,val) {\ + type v0 = sub_group_broadcast( val, 0 );\ + type v1 = sub_group_broadcast( val, 1 );\ + type v2 = sub_group_broadcast( val, 2 );\ + type v3 = sub_group_broadcast( val, 3 );\ + type v4 = sub_group_broadcast( val, 4 );\ + type v5 = sub_group_broadcast( val, 5 );\ + type v6 = sub_group_broadcast( val, 6 );\ + type v7 = sub_group_broadcast( val, 7 );\ + type v8 = sub_group_broadcast( val, 8 );\ + type v9 = sub_group_broadcast( val, 9 );\ + type v10 = sub_group_broadcast( val, 10 );\ + type v11 = sub_group_broadcast( val, 11 );\ + type v12 = sub_group_broadcast( val, 12 );\ + type v13 = sub_group_broadcast( val, 13 );\ + type v14 = sub_group_broadcast( val, 14 );\ + type v15 = sub_group_broadcast( val, 15 );\ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ + if( get_sub_group_local_id() == 0 ) { \ + printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \ + fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \ + v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}} + +#if 1 +#define GRL_ATOMIC_INC(addr) atomic_add(addr, 1); +#else +#define GRL_ATOMIC_INC(addr) atomic_inc(addr); +#endif + +#if 0 +#define LOOP_TRIPWIRE_INIT uint _loop_trip=0; + +#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name) \ + _loop_trip++;\ + if ( _loop_trip > max_iterations )\ + {\ + printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!!\n" );\ + printf( name"\n");\ + break;\ + } +#else + +#define LOOP_TRIPWIRE_INIT +#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name) + +#endif + + + +typedef struct SGHeap +{ + uint32_t key_value; + bool lane_mask; +} SGHeap; + +GRL_INLINE void SGHeap_init(uniform SGHeap *h) +{ + h->lane_mask = false; + h->key_value = 0xbaadf00d; +} + +GRL_INLINE bool SGHeap_full(uniform SGHeap *h) +{ + return sub_group_all(h->lane_mask); +} +GRL_INLINE bool SGHeap_empty(uniform SGHeap *h) +{ + return sub_group_all(!h->lane_mask); +} + +GRL_INLINE bool SGHeap_get_lane_mask(uniform SGHeap *h) +{ + return h->lane_mask; +} +GRL_INLINE uint16_t SGHeap_get_lane_values(uniform SGHeap *h) +{ + return (h->key_value & 0xffff); +} + +GRL_INLINE ushort isolate_lowest_bit( ushort m ) +{ + return m & ~(m - 1); +} + + +// lane i receives the index 
of the ith set bit in mask. +GRL_INLINE ushort subgroup_bit_rank( uniform ushort mask ) +{ + varying ushort lane = get_sub_group_local_id(); + ushort idx = 16; + for ( uint i = 0; i < NUM_CHILDREN; i++ ) + { + ushort lo = isolate_lowest_bit( mask ); + mask = mask ^ lo; + idx = (lane == i) ? lo : idx; + } + + return ctz( idx ); +} + +// push a set of elements spread across a subgroup. Return mask of elements that were not pushed +GRL_INLINE uint16_t SGHeap_vectorized_push(uniform SGHeap *h, varying uint16_t key, varying uint16_t value, uniform ushort push_mask) +{ + +#if 0 // an attempt to make this algorithm branchless + varying uint key_value = (((uint)key) << 16) | ((uint)value); + uniform ushort free_mask = intel_sub_group_ballot( !h->lane_mask ); + + varying ushort free_slot_idx = subgroup_bit_prefix_exclusive( free_mask ); // for each heap slot, what is its position in a compacted list of free slots (prefix sum) + varying ushort push_idx = subgroup_bit_prefix_exclusive( push_mask ); // for each lane, what is its position in a compacted list of pushing lanes (prefix sum) + + uniform ushort num_pushes = min( popcount( free_mask ), popcount( push_mask ) ); + + varying ushort push_index = subgroup_bit_rank( push_mask ); // lane i gets the index of the i'th set bit in push_mask + + varying uint shuffled = intel_sub_group_shuffle( key_value, intel_sub_group_shuffle( push_index, free_slot_idx ) ); + varying bool pushed = false; + if ( !h->lane_mask && free_slot_idx < num_pushes ) + { + h->lane_mask = true; + h->key_value = shuffled; + pushed = true; + } + + return push_mask & intel_sub_group_ballot( push_idx >= num_pushes ); +#else + + varying uint lane = get_sub_group_local_id(); + + varying uint key_value = (((uint)key) << 16) | ((uint)value); + uniform ushort free_mask = intel_sub_group_ballot(!h->lane_mask); + + // TODO_OPT: Look for some clever way to remove this loop + while (free_mask && push_mask) + { + // insert first active child into first available lane + uniform uint child_id = ctz(push_mask); + uniform uint victim_lane = ctz(free_mask); + uniform uint kv = sub_group_broadcast( key_value, child_id ); + if (victim_lane == lane) + { + h->lane_mask = true; + h->key_value = kv; + } + push_mask ^= (1 << child_id); + free_mask ^= (1 << victim_lane); + } + + return push_mask; + +#endif +} + +// push an item onto a heap that is full except for one slot +GRL_INLINE void SGHeap_push_and_fill(uniform SGHeap *h, uniform uint16_t key, uniform uint16_t value) +{ + uniform uint32_t key_value = (((uint)key) << 16) | value; + if (!h->lane_mask) + { + h->lane_mask = true; + h->key_value = key_value; // only one lane will be active at this point + } +} + +// pop the min item from a full heap +GRL_INLINE void SGHeap_full_pop_min(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out) +{ + varying uint lane = get_sub_group_local_id(); + uniform uint kv = sub_group_reduce_min(h->key_value); + if (h->key_value == kv) + h->lane_mask = false; + + *key_out = (kv >> 16); + *value_out = (kv & 0xffff); +} + +// pop the max item from a heap +GRL_INLINE void SGHeap_pop_max(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out) +{ + uniform uint lane = get_sub_group_local_id(); + uniform uint kv = sub_group_reduce_max(h->lane_mask ? 
h->key_value : 0); + if (h->key_value == kv) + h->lane_mask = false; + + *key_out = (kv >> 16); + *value_out = (kv & 0xffff); +} + +GRL_INLINE void SGHeap_printf( SGHeap* heap ) +{ + uint key = heap->key_value >> 16; + uint value = heap->key_value & 0xffff; + + if ( get_sub_group_local_id() == 0) + printf( "HEAP: \n" ); + SGPRINT_16x( " mask: ", "%6u ", bool, heap->lane_mask ); + SGPRINT_16x( " key : ", "0x%04x ", uint, key ); + SGPRINT_16x( " val : ", "0x%04x ", uint, value ); + +} + +GRL_INLINE float transformed_aabb_halfArea(float3 lower, float3 upper, const float *Transform) +{ + // Compute transformed extent per 'transform_aabb'. Various terms cancel + float3 Extent = upper - lower; + float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]); + float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]); + float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]); + + return (ex * ey) + (ey * ez) + (ex * ez); +} + +GRL_INLINE uint16_t quantize_area(float relative_area) +{ + // clamp relative area at 0.25 (1/4 of root area) + // and apply a non-linear distribution because most things in real scenes are small + relative_area = pow(min(1.0f, relative_area * 4.0f), 0.125f); + return convert_ushort_rtn( relative_area * 65535.0f ); +} + +GRL_INLINE varying uint16_t SUBGROUP_get_child_areas(uniform InternalNode *n, + uniform const float *Transform, + uniform float relative_area_scale) +{ + varying uint16_t area; + varying uint16_t lane = get_sub_group_local_id(); + varying int exp_x = n->exp_x; + varying int exp_y = n->exp_y; + varying int exp_z = n->exp_z; + + { + // decode the AABB positions. Lower in the bottom 6 lanes, upper in the top + uniform uint8_t *px = &n->lower_x[0]; + uniform uint8_t *py = &n->lower_y[0]; + uniform uint8_t *pz = &n->lower_z[0]; + + varying float fx = convert_float(px[lane]); + varying float fy = convert_float(py[lane]); + varying float fz = convert_float(pz[lane]); + fx = n->lower[0] + bitShiftLdexp(fx, exp_x - 8); + fy = n->lower[1] + bitShiftLdexp(fy, exp_y - 8); + fz = n->lower[2] + bitShiftLdexp(fz, exp_z - 8); + + // transform the AABBs to world space + varying float3 lower = (float3)(fx, fy, fz); + varying float3 upper = intel_sub_group_shuffle(lower, lane + 6); + + { + + // TODO_OPT: This is only utilizing 6 lanes. + // We might be able to do better by vectorizing the calculation differently + float a1 = transformed_aabb_halfArea( lower, upper, Transform ); + float a2 = a1 * relative_area_scale; + area = quantize_area( a2 ); + } + } + + return area; +} + + + +GRL_INLINE ushort get_child_area( + InternalNode* n, + ushort child, + const float* Transform, + float relative_area_scale ) +{ + uint16_t area; + uint16_t lane = get_sub_group_local_id(); + int exp_x = n->exp_x; + int exp_y = n->exp_y; + int exp_z = n->exp_z; + + // decode the AABB positions. 
Lower in the bottom 6 lanes, upper in the top + uint8_t* px = &n->lower_x[0]; + uint8_t* py = &n->lower_y[0]; + uint8_t* pz = &n->lower_z[0]; + + float3 lower, upper; + lower.x = convert_float( n->lower_x[child] ); + lower.y = convert_float( n->lower_y[child] ); + lower.z = convert_float( n->lower_z[child] ); + upper.x = convert_float( n->upper_x[child] ); + upper.y = convert_float( n->upper_y[child] ); + upper.z = convert_float( n->upper_z[child] ); + + lower.x = bitShiftLdexp( lower.x, exp_x - 8 ); // NOTE: the node's 'lower' field cancels out, so don't add it + lower.y = bitShiftLdexp( lower.y, exp_y - 8 ); // see transform_aabb_halfArea + lower.z = bitShiftLdexp( lower.z, exp_z - 8 ); + upper.x = bitShiftLdexp( upper.x, exp_x - 8 ); + upper.y = bitShiftLdexp( upper.y, exp_y - 8 ); + upper.z = bitShiftLdexp( upper.z, exp_z - 8 ); + + float a1 = transformed_aabb_halfArea( lower, upper, Transform ); + float a2 = a1 * relative_area_scale; + area = quantize_area( a2 ); + + return area; +} + + +GRL_INLINE varying int SUBGROUP_get_child_offsets(uniform InternalNode *n) +{ + varying uint lane = get_sub_group_local_id(); + varying uint child = (lane < NUM_CHILDREN) ? lane : 0; + + varying uint block_incr = InternalNode_GetChildBlockIncr( n, child ); + + //varying uint prefix = sub_group_scan_exclusive_add( block_incr ); + varying uint prefix; + if ( NUM_CHILDREN == 6 ) + { + prefix = block_incr + intel_sub_group_shuffle_up( 0u, block_incr, 1u ); + prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 2 ); + prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 4 ); + prefix = prefix - block_incr; + } + + return n->childOffset + prefix; +} + + +// compute the maximum number of leaf nodes that will be produced given 'num_splits' node openings +GRL_INLINE uint get_num_nodes(uint num_splits, uint max_children) +{ + // each split consumes one node and replaces it with N nodes + // there is initially one node + // number of nodes is thus: N*s + 1 - s ==> (N-1)*s + 1 + return (max_children - 1) * num_splits + 1; +} + +// compute the number of node openings that can be performed given a fixed extra node budget +GRL_INLINE uint get_num_splits(uint num_nodes, uint max_children) +{ + // inverse of get_num_nodes: x = (n-1)s + 1 + // s = (x-1)/(n-1) + if (num_nodes == 0) + return 0; + + return (num_nodes - 1) / (max_children - 1); +} + +GRL_INLINE uint get_rebraid_bin_index(uint16_t quantized_area, uint NUM_BINS) +{ + // arrange bins in descending order by size + float relative_area = quantized_area * (1.0f/65535.0f); + relative_area = 1.0f - relative_area; // arrange bins largest to smallest + size_t bin = round(relative_area * (NUM_BINS - 1)); + return bin; +} + +GRL_INLINE global InternalNode *get_node(global BVHBase *base, int incr) +{ + global char *ptr = (((global char *)base) + BVH_ROOT_NODE_OFFSET); // NOTE: Assuming this will be hoisted out of inner loops + + return (global InternalNode *)(ptr + incr * 64); +} + +GRL_INLINE bool is_aabb_valid(float3 lower, float3 upper) +{ + return all(isfinite(lower)) && + all(isfinite(upper)) && + all(lower <= upper); +} + +GRL_INLINE bool is_node_openable(InternalNode *n) +{ + // TODO_OPT: Optimize me by fetching dwords instead of looping over bytes + // TODO: OPT: Pre-compute openability and pack into the pad byte next to the nodeType field?? 
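+    // A node counts as openable only if it is itself an internal node and every valid child is also an internal node, so opening it never exposes leaf children directly.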
+ bool openable = n->nodeType == NODE_TYPE_INTERNAL; + if ( openable ) + { + for ( uint i = 0; i < NUM_CHILDREN; i++ ) + { + bool valid = InternalNode_IsChildValid( n, i ); + uint childType = InternalNode_GetChildType( n, i ); + openable = openable & (!valid || (childType == NODE_TYPE_INTERNAL)); + } + } + + return openable; +} + + +GRL_INLINE bool SUBGROUP_can_open_root( + uniform global BVHBase *bvh_base, + uniform const struct GRL_RAYTRACING_INSTANCE_DESC* instance + ) +{ + if (bvh_base == 0 || GRL_get_InstanceMask(instance) == 0) + return false; + + // TODO_OPT: SG-vectorize this AABB test + uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); + uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); + if (!is_aabb_valid(root_lower, root_upper)) + return false; + + uniform global InternalNode *node = get_node(bvh_base, 0); + if ( node->nodeType != NODE_TYPE_INTERNAL ) + return false; + + varying bool openable = true; + varying uint lane = get_sub_group_local_id(); + if (lane < NUM_CHILDREN) + { + varying uint childType = InternalNode_GetChildType(node, lane); + varying bool valid = InternalNode_IsChildValid(node, lane); + openable = childType == NODE_TYPE_INTERNAL || !valid; + } + + return sub_group_all(openable); +} + + + +GRL_INLINE +varying uint2 +SUBGROUP_count_instance_splits(uniform global struct AABB3f *geometry_bounds, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance) +{ + uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure; + if (!SUBGROUP_can_open_root(bvh_base, instance)) + return (uint2)(0, 0); + + uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds); + uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); + uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); + + uniform uint16_t quantized_area = quantize_area(transformed_aabb_halfArea(root_lower, root_upper, instance->Transform) * relative_area_scale); + uniform uint16_t node_offs = 0; + + uniform SGHeap heap; + uniform uint num_splits = 0; + + SGHeap_init(&heap); + varying uint sg_split_counts_hi = 0; // cross-subgroup bin counters + varying uint sg_split_counts_lo = 0; + + uniform global InternalNode* node_array = get_node( bvh_base, 0 ); + + LOOP_TRIPWIRE_INIT; + + while (1) + { + uniform global InternalNode* node = node_array + node_offs; + + // count this split + uniform uint bin = get_rebraid_bin_index(quantized_area, NUM_REBRAID_BINS); + varying uint lane = get_sub_group_local_id(); + + sg_split_counts_hi += ((lane + 16) == bin) ? 1 : 0; + sg_split_counts_lo += (lane == bin) ? 
1 : 0; + + // open this node and push all of its openable children to heap + varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node); + varying bool sg_openable = 0; + if (lane < NUM_CHILDREN & sg_offs <= MAX_NODE_OFFSET ) + if (InternalNode_IsChildValid(node, lane)) + sg_openable = is_node_openable( node_array + sg_offs); + + uniform uint openable_children = intel_sub_group_ballot(sg_openable); + + if ( openable_children ) + { + varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale ); + + if ( !SGHeap_full( &heap ) ) + { + openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children ); + } + + while ( openable_children ) + { + // pop min element + uniform uint16_t min_area; + uniform uint16_t min_offs; + SGHeap_full_pop_min( &heap, &min_area, &min_offs ); + + // eliminate all children smaller than heap minimum + openable_children &= intel_sub_group_ballot( sg_area > min_area ); + + if ( openable_children ) + { + // if any children survived, + // kick out heap minimum and replace with first child.. otherwise we will re-push the minimum + uniform uint child_id = ctz( openable_children ); + openable_children ^= (1 << child_id); + min_area = sub_group_broadcast( sg_area, child_id ); + min_offs = sub_group_broadcast( sg_offs, child_id ); + } + + // re-insert onto heap + SGHeap_push_and_fill( &heap, min_area, min_offs ); + + // repeat until all children are accounted for. It is possible + // for multiple children to fit in the heap, because heap minimum is now changed and we need to recompute it + } + } + + num_splits++; + if (num_splits == MAX_SPLITS_PER_INSTANCE) + break; + + if (SGHeap_empty(&heap)) + break; + + // get next node from heap + SGHeap_pop_max(&heap, &quantized_area, &node_offs); + + LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_count_splits" ); + + } + + return (uint2)(sg_split_counts_lo, sg_split_counts_hi); +} + +typedef struct RebraidBuffers +{ + global uint *bin_split_counts; // [num_bins] + global uint *bin_instance_counts; // [num_bins] + global uint *instance_bin_counts; // num_intances * num_bins +} RebraidBuffers; + +GRL_INLINE RebraidBuffers cast_rebraid_buffers(global uint *scratch, uint instanceID) +{ + RebraidBuffers b; + b.bin_split_counts = scratch; + b.bin_instance_counts = scratch + NUM_REBRAID_BINS; + b.instance_bin_counts = scratch + (2 + instanceID) * NUM_REBRAID_BINS; + return b; +} + +/////////////////////////////////////////////////////////////////////////////////////////// +// Compute AABB +// Dispatch one work item per instance +/////////////////////////////////////////////////////////////////////////////////////////// + +GRL_INLINE void rebraid_compute_AABB( + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance) +{ + // don't open null rtas + global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure; + + struct AABB new_primref; + if (bvh_base != 0) + { + float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); + float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); + const float *Transform = instance->Transform; + + if (is_aabb_valid(root_lower, root_upper)) + { + new_primref = AABBfromAABB3f(transform_aabb(root_lower, root_upper, Transform)); + } + else + { + // degenerate instance which might be updated to be non-degenerate + // use AABB position to guide BVH construction + // + new_primref.lower.x = Transform[3]; + new_primref.lower.y = Transform[7]; + new_primref.lower.z = Transform[11]; + 
new_primref.upper = new_primref.lower; + } + } + else + { + AABB_init(&new_primref); + } + + struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref); + + if (get_sub_group_local_id() == 0) + { + AABB3f_atomic_merge_global_lu(&bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz ); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_computeAABB_DXR_instances( + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances) +{ + const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); + rebraid_compute_AABB(bvh, instances + instanceID); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_computeAABB_DXR_instances_indirect( + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, + global struct IndirectBuildRangeInfo const * const indirect_data) +{ + const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); + instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instances) + indirect_data->primitiveOffset); + rebraid_compute_AABB(bvh, instances + instanceID); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_computeAABB_DXR_instances_pointers( + global struct BVHBase* bvh, + global void *instances_in) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); + rebraid_compute_AABB(bvh, instances[instanceID]); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_computeAABB_DXR_instances_pointers_indirect( + global struct BVHBase* bvh, + global void *instances_in, + global struct IndirectBuildRangeInfo const * const indirect_data) +{ + instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset; + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); + rebraid_compute_AABB(bvh, instances[instanceID]); +} + +/////////////////////////////////////////////////////////////////////////////////////////// +// Init scratch: Dispatch one work group +/////////////////////////////////////////////////////////////////////////////////////////// + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel rebraid_init_scratch(global uint *scratch) +{ + scratch[get_local_id(0) + get_group_id(0)*get_local_size(0)] = 0; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel rebraid_chase_instance_pointers(global struct GRL_RAYTRACING_INSTANCE_DESC *instances_out, + global void *instance_buff) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances_in = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instance_buff; + + instances_out[get_local_id(0)] = *instances_in[get_local_id(0)]; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel rebraid_chase_instance_pointers_indirect( + global struct 
GRL_RAYTRACING_INSTANCE_DESC* instances_out, + global void* instance_buff, + global struct IndirectBuildRangeInfo const* const indirect_data) +{ + instance_buff = ((global char*)instance_buff) + indirect_data->primitiveOffset; + global const struct GRL_RAYTRACING_INSTANCE_DESC** + instances_in = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instance_buff; + + instances_out[get_local_id(0)] = *instances_in[get_local_id(0)]; +} + +/////////////////////////////////////////////////////////////////////////////////////////// +// Count splits +/////////////////////////////////////////////////////////////////////////////////////////// + +GRL_INLINE void DEBUG_SUBGROUP_print_split_counts( uniform uint instanceID, varying uint split_counts_lo, varying uint split_counts_hi ) +{ + uniform uint vals[32] = { + sub_group_broadcast( split_counts_lo, 0 ), sub_group_broadcast( split_counts_lo, 1 ), + sub_group_broadcast( split_counts_lo, 2 ), sub_group_broadcast( split_counts_lo, 3 ), + sub_group_broadcast( split_counts_lo, 4 ), sub_group_broadcast( split_counts_lo, 5 ), + sub_group_broadcast( split_counts_lo, 6 ), sub_group_broadcast( split_counts_lo, 7 ), + sub_group_broadcast( split_counts_lo, 8 ), sub_group_broadcast( split_counts_lo, 9 ), + sub_group_broadcast( split_counts_lo, 10 ), sub_group_broadcast( split_counts_lo, 11 ), + sub_group_broadcast( split_counts_lo, 12 ), sub_group_broadcast( split_counts_lo, 13 ), + sub_group_broadcast( split_counts_lo, 14 ), sub_group_broadcast( split_counts_lo, 15 ), + + sub_group_broadcast( split_counts_hi, 0 ), sub_group_broadcast( split_counts_hi, 1 ), + sub_group_broadcast( split_counts_hi, 2 ), sub_group_broadcast( split_counts_hi, 3 ), + sub_group_broadcast( split_counts_hi, 4 ), sub_group_broadcast( split_counts_hi, 5 ), + sub_group_broadcast( split_counts_hi, 6 ), sub_group_broadcast( split_counts_hi, 7 ), + sub_group_broadcast( split_counts_hi, 8 ), sub_group_broadcast( split_counts_hi, 9 ), + sub_group_broadcast( split_counts_hi, 10 ), sub_group_broadcast( split_counts_hi, 11 ), + sub_group_broadcast( split_counts_hi, 12 ), sub_group_broadcast( split_counts_hi, 13 ), + sub_group_broadcast( split_counts_hi, 14 ), sub_group_broadcast( split_counts_hi, 15 ) + }; + + if ( get_sub_group_local_id() == 0 ) + { + printf( + "Instance: %4u " + "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u " + "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u \n" + , + instanceID, + vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7], + vals[8], vals[9], vals[10], vals[11], vals[12], vals[13], vals[14], vals[15], + vals[16], vals[17], vals[18], vals[19], vals[20], vals[21], vals[22], vals[23], + vals[24], vals[25], vals[26], vals[27], vals[28], vals[29], vals[30], vals[31] + ); + } +} + +GRL_INLINE void do_rebraid_count_splits_SG( + uniform global struct BVHBase* bvh, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, + uniform global uint *rebraid_scratch) +{ + uniform const uint instanceID = get_sub_group_global_id(); + uniform RebraidBuffers buffers = cast_rebraid_buffers(rebraid_scratch,instanceID); + + varying uint lane = get_sub_group_local_id(); + varying uint2 splits = SUBGROUP_count_instance_splits(&bvh->Meta.bounds, instances + instanceID); + varying uint split_counts_lo = splits.x; + varying uint split_counts_hi = splits.y; + + // write this instance's per-bin counts + global uint* counts = buffers.instance_bin_counts; + intel_sub_group_block_write2( counts, splits ); + + // update the per-bin 
split and instance counters + if (split_counts_lo > 0) + { + atomic_add(&buffers.bin_split_counts[lane], split_counts_lo); + GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane]); + } + if (split_counts_hi > 0) + { + atomic_add(&buffers.bin_split_counts[lane + 16], split_counts_hi); + GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane + 16]); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_count_splits_SG( + uniform global struct BVHBase* bvh, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, + uniform global uint *rebraid_scratch) +{ + do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_count_splits_SG_indirect( + uniform global struct BVHBase* bvh, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, + uniform global uint *rebraid_scratch, + global struct IndirectBuildRangeInfo const * const indirect_data) +{ + instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instances) + indirect_data->primitiveOffset); + do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch); +} + + +#define HEAP_SIZE 16 +#define COUNT_SPLITS_WG_SIZE 16 + +struct SLMHeapNode +{ + short offs; + ushort area; +}; + +struct SLMHeap +{ + struct SLMHeapNode nodes[HEAP_SIZE]; + ushort size; + ushort min_key; +}; + +GRL_INLINE bool SLMHeapNode_Greater( struct SLMHeapNode a, struct SLMHeapNode b ) +{ + return a.area > b.area; +} + +GRL_INLINE ushort SLMHeapNode_UnpackKey( struct SLMHeapNode a ) +{ + return a.area; +} + +GRL_INLINE void SLMHeapNode_Unpack( struct SLMHeapNode a, ushort* area_out, short* offs_out ) +{ + *area_out = a.area; + *offs_out = a.offs; +} + +GRL_INLINE struct SLMHeapNode SLMHeapNode_Pack( ushort area, short offs ) +{ + struct SLMHeapNode n; + n.offs = offs; + n.area = area; + return n; +} + + +GRL_INLINE void SLMHeap_Init( struct SLMHeap* heap ) +{ + heap->size = 0; + heap->min_key = 0xffff; +} + +GRL_INLINE bool SLMHeap_empty( struct SLMHeap* heap ) +{ + return heap->size == 0; +} + +GRL_INLINE bool SLMHeap_full( struct SLMHeap* heap ) +{ + return heap->size == HEAP_SIZE; +} + + +GRL_INLINE void SLMHeap_push( struct SLMHeap* heap, ushort area, short offs ) +{ + ushort insert_pos; + if ( SLMHeap_full( heap ) ) + { + ushort current_min_key = heap->min_key; + if ( area <= current_min_key ) + return; // don't push stuff that's smaller than the current minimum + + // search for the minimum element + // The heap is laid out in level order, so it is sufficient to search only the last half + ushort last_leaf = HEAP_SIZE - 1; + ushort first_leaf = (last_leaf / 2) + 1; + + // as we search, keep track of what the new min-key will be so we can cull future pushes + ushort new_min_key = area; + ushort min_pos = 0; + + do + { + ushort idx = first_leaf++; + + ushort current_key = SLMHeapNode_UnpackKey( heap->nodes[idx] ); + bool found_min_pos = (min_pos == 0) && (current_key == current_min_key); + + if ( found_min_pos ) + min_pos = idx; + else + new_min_key = min( current_key, new_min_key ); + + } while ( first_leaf != last_leaf ); + + heap->min_key = new_min_key; + insert_pos = min_pos; + } + else + { + insert_pos = heap->size++; + heap->min_key = min( area, heap->min_key ); + } + + heap->nodes[insert_pos] = SLMHeapNode_Pack( area, offs ); + + // heap-up + while ( insert_pos 
) + { + ushort parent = insert_pos / 2; + + struct SLMHeapNode parent_node = heap->nodes[parent]; + struct SLMHeapNode current_node = heap->nodes[insert_pos]; + if ( SLMHeapNode_Greater( parent_node, current_node ) ) + break; + + heap->nodes[insert_pos] = parent_node; + heap->nodes[parent] = current_node; + insert_pos = parent; + } + +} + +bool SLMHeap_pop_max( struct SLMHeap* heap, ushort* area_out, short* offs_out ) +{ + if ( SLMHeap_empty( heap ) ) + return false; + + SLMHeapNode_Unpack( heap->nodes[0], area_out, offs_out ); + + // heap down + ushort size = heap->size; + ushort idx = 0; + do + { + ushort left = 2 * idx + 1; + ushort right = 2 * idx + 2; + if ( left >= size ) + break; + + if ( right >= size ) + { + heap->nodes[idx] = heap->nodes[left]; + break; + } + + struct SLMHeapNode left_node = heap->nodes[left]; + struct SLMHeapNode right_node = heap->nodes[right]; + bool go_left = SLMHeapNode_Greater( left_node, right_node ); + heap->nodes[idx] = go_left ? left_node : right_node; + idx = go_left ? left : right; + + } while ( 1 ); + + heap->size = size - 1; + return true; +} + +void SLMHeap_Print( struct SLMHeap* heap ) +{ + printf( " size=%u min=%u {", heap->size, heap->min_key ); + for ( uint i = 0; i < heap->size; i++ ) + printf( "%04x:%04x", heap->nodes[i].area, heap->nodes[i].offs ); +} + + +GRL_INLINE bool can_open_root( + global struct BVHBase* bvh_base, + const struct GRL_RAYTRACING_INSTANCE_DESC* instance + ) +{ + float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds ); + float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds ); + if ( !is_aabb_valid( root_lower, root_upper ) || GRL_get_InstanceMask(instance) == 0 ) + return false; + + global InternalNode* node = get_node( bvh_base, 0 ); + if ( node->nodeType != NODE_TYPE_INTERNAL ) + return false; + + return is_node_openable( node ); +} + + +GRL_INLINE void count_instance_splits( + global struct AABB3f* geometry_bounds, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, + local ushort* bin_split_counts, + local struct SLMHeap* heap +) +{ + global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure; + + SLMHeap_Init( heap ); + + float relative_area_scale = 1.0f / AABB3f_halfArea( geometry_bounds ); + float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds ); + float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds ); + + ushort quantized_area = quantize_area( transformed_aabb_halfArea( root_lower, root_upper, instance->Transform ) * relative_area_scale ); + short node_offs = 0; + ushort num_splits = 0; + + global InternalNode* node_array = get_node( bvh_base, 0 ); + + while ( 1 ) + { + global InternalNode* node = node_array + node_offs; + + // count this split + uint bin = get_rebraid_bin_index( quantized_area, NUM_REBRAID_BINS ); + bin_split_counts[bin]++; + + // open this node and push children to heap + + // TODO_OPT: Restructure this control flow to prevent differnet lanes from skipping different loop iterations and diverging + // TODO_OPT: Precompute openability masks in BLAS nodes at build time... 
one bit for self and N bits for each child + int offs = node->childOffset; + for ( ushort i = 0; i < NUM_CHILDREN; i++ ) + { + if ( InternalNode_IsChildValid( node, i ) ) + { + if ( offs >= SHRT_MIN && offs <= SHRT_MAX ) + { + if ( is_node_openable( node_array + offs ) ) + { + ushort area = get_child_area( node, i, instance->Transform, relative_area_scale ); + SLMHeap_push( heap, area, (short)offs ); + } + } + } + offs += InternalNode_GetChildBlockIncr( node, i ); + } + + num_splits++; + if ( num_splits == MAX_SPLITS_PER_INSTANCE ) + break; + + if ( !SLMHeap_pop_max( heap, &quantized_area, &node_offs ) ) + break; + } + +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( COUNT_SPLITS_WG_SIZE, 1, 1 )) ) +void kernel +rebraid_count_splits( + global struct BVHBase* bvh_base, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + global uint* rebraid_scratch, + uint num_instances + ) +{ + local struct SLMHeap heap[COUNT_SPLITS_WG_SIZE]; + local ushort split_counts[COUNT_SPLITS_WG_SIZE][NUM_REBRAID_BINS]; + + // initialize stuff + // TODO_OPT: transpose this and subgroup-vectorize it so that + // block-writes can be used + for ( uint i = 0; i < NUM_REBRAID_BINS; i++ ) + split_counts[get_local_id( 0 )][i] = 0; + + + // count splits for this thread's instance + uniform uint base_instance = get_group_id( 0 ) * get_local_size( 0 ); + uint instanceID = base_instance + get_local_id( 0 ); + + if ( instanceID < num_instances ) + { + global BVHBase* bvh_base = (global BVHBase*)instances[instanceID].AccelerationStructure; + if ( can_open_root( bvh_base, &instances[instanceID] ) ) + { + count_instance_splits( &bvh_base->Meta.bounds, + &instances[instanceID], + &split_counts[get_local_id( 0 )][0], + &heap[get_local_id(0)] ); + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID ); + + + // reduce bins + for ( uint bin = get_local_id( 0 ); bin < NUM_REBRAID_BINS; bin += get_local_size( 0 ) ) + { + // TODO_OPT: There's probably a better way to arrange this computation + uint bin_split_count = 0; + uint bin_instance_count = 0; + for ( uint i = 0; i < COUNT_SPLITS_WG_SIZE; i++ ) + { + uint s = split_counts[i][bin]; + bin_split_count += s; + bin_instance_count += (s > 0) ? 
1 : 0; + } + + if ( bin_split_count > 0 ) + { + atomic_add( &buffers.bin_split_counts[bin], bin_split_count ); + atomic_add( &buffers.bin_instance_counts[bin], bin_instance_count ); + } + } + + // write out bin counts for each instance + for ( uniform uint i = get_sub_group_id(); i < COUNT_SPLITS_WG_SIZE; i += get_num_sub_groups() ) + { + uniform uint iid = base_instance + i; + if ( iid > num_instances ) + break; + + global uint* instance_bin_counts = cast_rebraid_buffers( rebraid_scratch, iid ).instance_bin_counts; + + for ( uniform ushort j = 0; j < NUM_REBRAID_BINS; j += get_sub_group_size() ) + { + uint count = split_counts[i][j + get_sub_group_local_id() ]; + intel_sub_group_block_write( instance_bin_counts + j, count ); + } + } + +} + + + + +/////////////////////////////////////////////////////////////////////////////////////////// +// Build PrimRefs +/////////////////////////////////////////////////////////////////////////////////////////// + +GRL_INLINE uint get_instance_split_count(RebraidBuffers buffers, uint instanceID, uint available_splits) +{ + global uint* instance_desired_split_count = buffers.instance_bin_counts; + global uint *bin_split_counts = buffers.bin_split_counts; + global uint *bin_instance_counts = buffers.bin_instance_counts; + + uint total_splits = 0; + uint remaining_available_splits = available_splits; + uint max_bin = 0; + uint desired_splits_this_bin = 0; + uint instance_splits = 0; + + do + { + // stop when we reach a level where we can't satisfy the demand + desired_splits_this_bin = instance_desired_split_count[max_bin]; + uint total_bin_splits = bin_split_counts[max_bin]; + + if (total_bin_splits > remaining_available_splits) + break; + + // we have enough budget to give all instances everything they want at this level, so do it + remaining_available_splits -= total_bin_splits; + instance_splits += desired_splits_this_bin; + desired_splits_this_bin = 0; + max_bin++; + + } while (max_bin < NUM_REBRAID_BINS); + + if (max_bin < NUM_REBRAID_BINS) + { + // we have more split demand than we have splits available. The current bin is the last one that gets any splits + // distribute the leftovers as evenly as possible to instances that want them + if (desired_splits_this_bin > 0) + { + // this instance wants splits. how many does it want? + uint desired_total = instance_splits + desired_splits_this_bin; + + // distribute to all instances as many as possible + uint count = bin_instance_counts[max_bin]; + uint whole = remaining_available_splits / count; + remaining_available_splits -= whole * count; + + // distribute remainder to lower numbered instances + size_t partial = (instanceID < remaining_available_splits) ? 1 : 0; + + // give the instance its share. 
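+            // every splitting instance in this bin receives 'whole' extra splits; instances whose ID falls below the leftover count receive one more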
+ instance_splits += whole + partial; + instance_splits = min(instance_splits, desired_total); // don't give it more than it needs + } + } + + return instance_splits; +} + +GRL_INLINE void build_unopened_primref( + struct AABB3f* centroid_bounds, + global __const BVHBase *bvh_base, + global volatile uint *primref_counter, + global struct AABB *primref_buffer, + global __const float *Transform, + uint instanceID, + float matOverhead, + ushort instanceMask) +{ + float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); + float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); + + struct AABB primRef; + AABB_init( &primRef ); + + uint bvhoffset = (uint)BVH_ROOT_NODE_OFFSET; + if (is_aabb_valid(root_lower, root_upper) && instanceMask != 0) + { + primRef = AABBfromAABB3f(compute_xfm_bbox(Transform, BVHBase_GetRootNode(bvh_base), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &bvh_base->Meta.bounds, matOverhead)); + } + else + { + primRef.lower.x = Transform[3]; + primRef.lower.y = Transform[7]; + primRef.lower.z = Transform[11]; + primRef.upper.xyz = primRef.lower.xyz; + + instanceMask = 0; + bvhoffset = NO_NODE_OFFSET; + } + + primRef.lower.w = as_float(instanceID | (instanceMask << 24)); + primRef.upper.w = as_float(bvhoffset); + + float3 centroid = primRef.lower.xyz + primRef.upper.xyz; + centroid_bounds->lower[0] = centroid.x; + centroid_bounds->upper[0] = centroid.x; + centroid_bounds->lower[1] = centroid.y; + centroid_bounds->upper[1] = centroid.y; + centroid_bounds->lower[2] = centroid.z; + centroid_bounds->upper[2] = centroid.z; + + uint place = GRL_ATOMIC_INC(primref_counter); + primref_buffer[place] = primRef; +} + +GRL_INLINE void build_opened_primrefs( + varying bool lane_mask, + varying uint offset, + varying InternalNode* node, + varying struct AABB3f* centroid_bounds, + uniform global BVHBase *bvh_base, + uniform volatile global uint *primref_counter, + uniform global struct AABB *primref_buffer, + uniform uint instanceID, + uniform const float *Transform, + uniform float matOverhead, + varying ushort instanceMask) +{ + // TODO_OPT: This function is often called with <= 6 active lanes + // If lanes are sparse, consider jumping to a sub-group vectorized variant... 
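+    // Each lane with its mask bit set atomically claims a primref slot, transforms the opened node's box to world space,
+    // packs (instanceID | instanceMask << 24) into lower.w and the node's byte offset from the start of the BVH into upper.w,
+    // and extends this lane's running centroid bounds.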
+ + if (lane_mask) + { + varying uint place = GRL_ATOMIC_INC(primref_counter); + + struct AABB box = AABBfromAABB3f(compute_xfm_bbox(Transform, node, XFM_BOX_NOT_REFINED_CLIPPED, &bvh_base->Meta.bounds, matOverhead)); + + box.lower.w = as_float(instanceID | (instanceMask << 24)); + box.upper.w = as_float(offset * 64 + (uint)BVH_ROOT_NODE_OFFSET); + primref_buffer[place] = box; + + AABB3f_extend_point( centroid_bounds, box.lower.xyz + box.upper.xyz ); + } +} + + +GRL_INLINE void SUBGROUP_open_nodes( + uniform global struct AABB3f *geometry_bounds, + uniform uint split_limit, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance, + uniform uint instanceID, + uniform volatile global uint *primref_counter, + uniform global struct AABB *primref_buffer, + varying struct AABB3f* centroid_bounds, + float transformOverhead) +{ + uniform SGHeap heap; + SGHeap_init(&heap); + + uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds); + uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure; + + uniform uint16_t node_offs = 0; + varying uint lane = get_sub_group_local_id(); + + uniform InternalNode* node_array = get_node( bvh_base, 0 ); + + LOOP_TRIPWIRE_INIT; + + while ( 1 ) + { + uniform InternalNode *node = node_array + node_offs; + + varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node); + varying bool sg_valid = false; + varying bool sg_openable = false; + if (lane < NUM_CHILDREN) + { + sg_valid = InternalNode_IsChildValid(node, lane); + if (sg_valid && (sg_offs <= MAX_NODE_OFFSET)) + { + sg_openable = is_node_openable( node_array + sg_offs); + } + } + + uniform uint16_t valid_children = intel_sub_group_ballot(sg_valid); + uniform uint16_t openable_children = intel_sub_group_ballot(sg_openable); + uniform uint16_t unopenable_children = valid_children & (~openable_children); + + if ( openable_children ) + { + varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale ); + + // try to push all openable children to the heap + if ( !SGHeap_full( &heap ) ) + { + openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children ); + } + + // we have more openable children than will fit in the heap + // process these one by one. + // TODO: Try re-writing with sub_group_any() and see if compiler does a better job + while ( openable_children ) + { + // pop min element + uniform uint16_t min_area; + uniform uint16_t min_offs; + SGHeap_full_pop_min( &heap, &min_area, &min_offs ); + + // eliminate all children smaller than heap minimum. 
+ // mark eliminated children as unopenable + varying uint culled_children = openable_children & intel_sub_group_ballot( sg_area <= min_area ); + unopenable_children ^= culled_children; + openable_children &= ~culled_children; + + if ( openable_children ) + { + // if any children survived the purge + // find the first such child and swap its offset for the one from the heap + // + uniform uint child_id = ctz( openable_children ); + uniform uint16_t old_min_offs = min_offs; + min_area = sub_group_broadcast( sg_area, child_id ); + min_offs = sub_group_broadcast( sg_offs, child_id ); + + if ( lane == child_id ) + sg_offs = old_min_offs; + + openable_children ^= (1 << child_id); + unopenable_children ^= (1 << child_id); + } + + SGHeap_push_and_fill( &heap, min_area, min_offs ); + + } + } + + if (unopenable_children) + { + varying bool sg_create_primref = ((1 << lane) & unopenable_children); + build_opened_primrefs(sg_create_primref, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance)); + } + + --split_limit; + if (split_limit == 0) + { + // split limit exceeded + // create primrefs for all remaining openable nodes in heap + varying bool sg_mask = SGHeap_get_lane_mask(&heap); + sg_offs = SGHeap_get_lane_values(&heap); + build_opened_primrefs(sg_mask, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance)); + + break; + } + + + // NOTE: the heap should never be empty. If it is, the instance was given too many splits. + + // get next node from heap + uint16_t quantized_area; + SGHeap_pop_max(&heap, &quantized_area, &node_offs); + + LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_build_primrefs" ); + + } +} + + +#define OPEN_QUEUE_SIZE 256 +#define OPEN_QUEUE_NUM_SGS 16 + +typedef struct OpenQueueEntry +{ + uint instanceID; + ushort num_splits; +} OpenQueueEntry; + +typedef struct OpenQueue +{ + uint num_produced; + uint num_consumed; + OpenQueueEntry Q[OPEN_QUEUE_SIZE]; +} OpenQueue; + +uniform uint SUBGROUP_GetNextQueueEntry( local OpenQueue* queue ) +{ + uint next = 0; + if ( get_sub_group_local_id() == 0 ) + next = GRL_ATOMIC_INC( &queue->num_consumed ); + return sub_group_broadcast( next, 0 ); +} + + +GRL_INLINE void do_rebraid_build_primrefs( + local struct AABB3f* SLM_CentroidBounds, + local OpenQueue* SLM_Q, + global struct Globals* globals, + global struct BVHBase* base, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, + global uint* rebraid_scratch, + global struct AABB* primref_buffer, + uint extra_primref_count, + uint num_instances) +{ + varying uint instanceID = get_sub_group_size() * get_sub_group_global_id() + get_sub_group_local_id(); + + uniform volatile global uint* primref_counter = &globals->numPrimitives; + uniform RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID ); + uniform uint available_splits = get_num_splits( extra_primref_count, NUM_CHILDREN ); + + + + varying struct AABB3f centroidBounds; + AABB3f_init( ¢roidBounds ); + + if ( get_local_id( 0 ) == 0 ) + { + SLM_Q->num_produced = 0; + SLM_Q->num_consumed = 0; + AABB3f_init( SLM_CentroidBounds ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // assign splits to unopened instances. 
Build primrefs for unsplit instances in vectorized form + varying uint num_splits = 0; + if ( instanceID < num_instances ) + { + num_splits = get_instance_split_count( buffers, instanceID, available_splits ); + if ( num_splits == 0 ) + { + varying global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID; + varying global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure; + if ( bvh_base != 0 ) + { + build_unopened_primref( ¢roidBounds, bvh_base, primref_counter, primref_buffer, instance->Transform, instanceID, 0.0f, GRL_get_InstanceMask(instance)); + } + } + else + { + // defer opened instances + uint place = GRL_ATOMIC_INC( &SLM_Q->num_produced ); + SLM_Q->Q[place].instanceID = instanceID; + SLM_Q->Q[place].num_splits = (ushort)num_splits; + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // if there were opened instances, process them, one per subgroup + uniform uint num_produced = SLM_Q->num_produced; + uniform uint next = SUBGROUP_GetNextQueueEntry( SLM_Q ); + + while ( next < num_produced ) + { + uniform uint instanceID = SLM_Q->Q[next].instanceID; + uniform uint num_splits = SLM_Q->Q[next].num_splits; + + uniform global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID; + + float transformOverhead = +#if FINE_TRANSFORM_NODE_BOX + transformation_bbox_surf_overhead(instance->Transform); +#else + 0.0f; +#endif + + SUBGROUP_open_nodes( + &base->Meta.bounds, + num_splits, + instance, + instanceID, + primref_counter, + primref_buffer, + ¢roidBounds, + transformOverhead); + + next = SUBGROUP_GetNextQueueEntry( SLM_Q ); + } + + // reduce the centroid bounds AABB + struct AABB3f reduced = AABB3f_sub_group_reduce( ¢roidBounds ); + if ( get_sub_group_local_id() == 0 ) + AABB3f_atomic_merge_localBB_nocheck( SLM_CentroidBounds, &reduced ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + if( get_local_id(0) == 0 ) + { + atomic_min( (global float*) (&globals->centroidBounds.lower) + 0, SLM_CentroidBounds->lower[0] ); + atomic_min( (global float*) (&globals->centroidBounds.lower) + 1, SLM_CentroidBounds->lower[1] ); + atomic_min( (global float*) (&globals->centroidBounds.lower) + 2, SLM_CentroidBounds->lower[2] ); + atomic_max( (global float*) (&globals->centroidBounds.upper) + 0, SLM_CentroidBounds->upper[0] ); + atomic_max( (global float*) (&globals->centroidBounds.upper) + 1, SLM_CentroidBounds->upper[1] ); + atomic_max( (global float*) (&globals->centroidBounds.upper) + 2, SLM_CentroidBounds->upper[2] ); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +void kernel rebraid_build_primrefs( + global struct Globals* globals, + global struct BVHBase* base, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, + global uint* rebraid_scratch, + global struct AABB* primref_buffer, + uint extra_primref_count, + uint num_instances) +{ + local struct AABB3f SLM_CentroidBounds; + local OpenQueue SLM_Q; + do_rebraid_build_primrefs( + &SLM_CentroidBounds, + &SLM_Q, + globals, + base, + instance_buffer, + rebraid_scratch, + primref_buffer, + extra_primref_count, + num_instances); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +void kernel rebraid_build_primrefs_indirect( + global struct Globals* globals, + global struct BVHBase* base, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, + 
global uint* rebraid_scratch, + global struct AABB* primref_buffer, + global struct IndirectBuildRangeInfo const * const indirect_data, + uint extra_primref_count ) +{ + local struct AABB3f SLM_CentroidBounds; + local OpenQueue SLM_Q; + + instance_buffer = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instance_buffer) + indirect_data->primitiveOffset); + + do_rebraid_build_primrefs( + &SLM_CentroidBounds, + &SLM_Q, + globals, + base, + instance_buffer, + rebraid_scratch, + primref_buffer, + extra_primref_count, + indirect_data->primitiveCount); +} + + +/////////////////////////////////////////////////////////////////////////////////////////// +// Misc +/////////////////////////////////////////////////////////////////////////////////////////// + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +ISA_TEST(global InternalNode *n, global uint *out, global float *xform, float scale) +{ + + out[get_sub_group_local_id()] = InternalNode_IsChildValid(n, get_sub_group_local_id()); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 1, 1, 1 )) ) void kernel +DEBUG_PRINT( + global struct Globals* globals, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, + global uint* rebraid_scratch, + global struct AABB* primref_buffer, + dword num_extra, + dword input_instances ) +{ +#if 0 + // validate primrefs + if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) + { + uint refs = globals->numPrimitives; + for ( uint i = 0; i < refs; i++ ) + { + if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) || + any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) || + any( isnan(primref_buffer[i].lower.xyz) ) || + any( isnan(primref_buffer[i].upper.xyz) ) ) + { + struct AABB box = primref_buffer[i]; + printf( "BAD BOX: %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ), + box.lower.x, box.lower.y, box.lower.z, + box.upper.x, box.upper.y, box.upper.z, + as_uint( box.lower.w ) ); + } + + const uint instIndex = PRIMREF_instanceID(&primref_buffer[i]); // TODO: Refactor me. 
We should not be using struct AABB for primRefs + const uint rootByteOffset = as_uint( primref_buffer[i].upper.w ); // It should be struct PrimRef + if ( instIndex >= input_instances ) + printf( "BAD INSTANCE INDEX: %u", i ); + else + { + global struct BVHBase* blas = (global struct BVHBase*)instance_buffer[instIndex].AccelerationStructure; + if ( blas ) + { + struct InternalNode* start = BVHBase_GetInternalNodes( blas ); + struct InternalNode* end = BVHBase_GetInternalNodesEnd( blas ); + + InternalNode* entryPoint = (struct InternalNode*)((char*)instance_buffer[instIndex].AccelerationStructure + rootByteOffset); + if ( entryPoint < start || entryPoint >= end ) + printf( "BAD ENTRYPOINT: %u\n", i ); + if ( (rootByteOffset & 63) != 0 ) + printf( "MISALIGNED ENTRYPOInt: %u\n", i ); + + } + } + } + } +#endif +#if 0 + if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) + printf( "REBRAIDED: %u\n", globals->numPrimitives ); + + // print instance bin information + if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) + { + printf( "REBRAIDED: %u\n", globals->numPrimitives ); + for( uint i=0; i<231; i++ ) + { + RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch,i ); + printf( " ID:%4u ", i ); + for ( uint j = 0; j < NUM_REBRAID_BINS; j++ ) + { + global uint* count = buffers.instance_bin_counts; + printf( " %2u ", count[j] ); + } + printf( "\n" ); + } + } +#endif +#if 0 + if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) + { + printf( "Instances: %u\n", globals->numPrimitives ); + + for ( uint i = 0; i < globals->numPrimitives; i++ ) + { + if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) || + any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) ) + { + struct AABB box = primref_buffer[i]; + printf( " %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ), + box.lower.x, box.lower.y, box.lower.z, + box.upper.x, box.upper.y, box.upper.z, + as_uint( box.lower.w ) ); + } + + } + } +#endif +} + diff --git a/src/intel/vulkan/grl/gpu/common.h b/src/intel/vulkan/grl/gpu/common.h new file mode 100644 index 00000000000..5fa0e117ae4 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/common.h @@ -0,0 +1,429 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "shared.h" +#include "intrinsics.h" +#include "AABB.h" +#include "AABB3f.h" +#include "qbvh6.h" + +/* ====== BVH_BUILDER config ====== */ + +__constant const float cfg_intCost = 4.0f; +__constant const float cfg_travCost = 1.0f; +__constant const uint cfg_minLeafSize = BVH_LEAF_N_MIN; +__constant const uint cfg_maxLeafSize = BVH_LEAF_N_MAX; +__constant const uint cfg_maxDepth = BUILDRECORD_STACK_SIZE; + +#define ENABLE_CONVERSION_CHECKS 0 + +#ifdef ENABLE_BIG_REG_ANNOTATION +#define GRL_ANNOTATE_BIG_REG_REQ __attribute__((annotate("num-thread-per-eu 4"))) +#else +#define GRL_ANNOTATE_BIG_REG_REQ +#endif + +#ifdef ENABLE_IGC_DO_NOT_SPILL +#define GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((annotate("igc-do-not-spill"))) +#else +#define GRL_ANNOTATE_IGC_DO_NOT_SPILL +#endif + +#define ERROR() + +/* =================================================================================================================================================== */ +/* =================================================================================================================================================== */ +/* 
=================================================================================================================================================== */ +/* =================================================================================================================================================== */ + +GRL_INLINE unsigned int getNumLeafPrims(unsigned int offset) +{ + return (offset & 0x7) - 3; +} + +GRL_INLINE unsigned int getLeafOffset(unsigned int offset) +{ + return offset & (~0x7); +} + +GRL_INLINE float4 triangleNormal(const float4 v0, const float4 v1, const float4 v2) +{ + const float4 a = v1 - v0; + const float4 b = v2 - v0; + return cross(a, b); +} + +GRL_INLINE float areaTriangle(const float4 v0, const float4 v1, const float4 v2) +{ + const float4 normal = triangleNormal(v0, v1, v2); + return length((float3)(normal.x, normal.y, normal.z)) * 0.5f; +} + +GRL_INLINE float det2(const float2 a, const float2 b) +{ + return a.x * b.y - a.y * b.x; +} + +GRL_INLINE float areaProjectedTriangle(const float4 v0, const float4 v1, const float4 v2) +{ + const float xy = 0.5f * fabs(det2(v1.xy - v0.xy, v2.xy - v0.xy)); + const float yz = 0.5f * fabs(det2(v1.yz - v0.yz, v2.yz - v0.yz)); + const float zx = 0.5f * fabs(det2(v1.zx - v0.zx, v2.zx - v0.zx)); + return xy + yz + zx; +} + +typedef struct Block64B { + char data[64]; +} Block64B __attribute__((aligned(64))); + +typedef char byte_align64B __attribute__((aligned(64))); + +/* ====================================================================== */ +/* ============================== GLOBALS =============================== */ +/* ====================================================================== */ + +GRL_INLINE bool Globals_OnFinish(global struct Globals *globals) +{ + /* last active HW thread ? 
*/ + if (get_local_id(0) == 0) + { + const uint sync = atomic_add(&globals->sync, 1); + if (sync + 1 == get_num_groups(0)) + { + globals->sync = 0; + return true; + } + } + return false; +} + +GRL_INLINE uint BlockAllocator_BytesUsed(struct BlockAllocator *p) +{ + return p->cur - p->start; +}; + +GRL_INLINE uint BlockAllocator_Alloc(__global struct BlockAllocator *p, const uint size) +{ + return atomic_add(&p->cur, size); +} + +GRL_INLINE uint BlockAllocator_Alloc_Single(__global struct BlockAllocator *p, const uint size) +{ + uint offset = 0; + if (get_sub_group_local_id() == 0) + offset = atomic_add(&p->cur, size); + return sub_group_broadcast(offset, 0); +} + +// node allocation returns an offset from beginning of BVH to allocated node +// in multiples of 64B +GRL_INLINE uint allocate_inner_nodes(global struct BVHBase* base, uint num_nodes ) +{ + return atomic_add_global( &base->nodeDataCur, num_nodes ); +} +GRL_INLINE uint allocate_procedural_leaves(global struct BVHBase* base, uint num_nodes) +{ + return atomic_add_global(&base->proceduralDataCur, num_nodes); +} + +GRL_INLINE uint allocate_quad_leaves(global struct BVHBase* base, uint num_nodes) +{ + return atomic_add_global(&base->quadLeafCur, num_nodes); +} + +#if 0 +GRL_INLINE uint alloc_node_mem(global struct Globals *globals, const uint size) +{ + const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ + return BlockAllocator_Alloc(&globals->node_mem_allocator, aligned_size); +} + +GRL_INLINE uint alloc_single_node_mem(global struct Globals *globals, const uint size) +{ + const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ + return BlockAllocator_Alloc_Single(&globals->node_mem_allocator, aligned_size); +} + +GRL_INLINE uint alloc_quad_leaf_mem(global struct Globals *globals, const uint size) +{ + const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ + return BlockAllocator_Alloc(&globals->quad_mem_allocator, aligned_size); +} + +GRL_INLINE uint alloc_procedural_leaf_mem(global struct Globals *globals, const uint size) +{ + const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ + return BlockAllocator_Alloc(&globals->procedural_mem_allocator, aligned_size); +} +#endif + +GRL_INLINE global struct BuildRecord *getBuildRecords(char *bvh_mem, struct Globals *globals) +{ + return (global struct BuildRecord *)(bvh_mem + globals->build_record_start); +} + +/* ======================================================================= */ +/* ============================== TRIANGLE =============================== */ +/* ======================================================================= */ + +/*GRL_INLINE void printTriangle(struct Triangle *t) +{ + printf("vtx[0] %d vtx[1] %d vtx[2] %d primID %d geomID %d \n",t->vtx[0],t->vtx[1],t->vtx[2],t->primID,t->geomID); + }*/ + +/* ==================================================================== */ +/* ============================== SPLIT =============================== */ +/* ==================================================================== */ + +GRL_INLINE void printSplit(struct Split *split) +{ + printf("split sah %f dim %d pos %d \n", split->sah, split->dim, split->pos); +} + +/* ========================================================================== */ +/* ============================== BUILDRECORD =============================== */ +/* ========================================================================== */ + +GRL_INLINE void initBuildRecord(struct 
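Globals_OnFinish() above implements the usual "last work-group finishes the job" termination pattern: every work-group atomically increments globals->sync, and only work-item 0 of the group that observes the final count resets the counter and receives true. A minimal usage sketch (illustrative only; out_flag is a hypothetical output and not part of this patch):

GRL_INLINE void example_finish_epilogue( global struct Globals* globals,
                                         global uint* out_flag )
{
    // true exactly once per dispatch, on work-item 0 of the last group to arrive
    if ( Globals_OnFinish( globals ) )
        *out_flag = 1;
}
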
BuildRecord *buildRecord, uint start, uint end) +{ + AABB_init(&buildRecord->centroidBounds); + buildRecord->start = start; + buildRecord->end = end; +} + +GRL_INLINE void extendBuildRecord(struct BuildRecord *buildRecord, struct AABB *primref) +{ + AABB_extend_point(&buildRecord->centroidBounds, AABB_centroid2(primref)); +} + +GRL_INLINE uint getBuildRecursionDepth(struct BuildRecord *buildRecord) +{ + return as_uint(buildRecord->centroidBounds.upper.w); +} + +GRL_INLINE void setBuildRecursionDepth(struct BuildRecord *buildRecord, uint depth) +{ + buildRecord->centroidBounds.upper.w = as_float(depth); +} + +GRL_INLINE uint getNumPrimsBuildRecord(struct BuildRecord *buildRecord) +{ + return buildRecord->end - buildRecord->start; +} + +/* ========================================================================== */ +/* =================== BinaryMortonCodeHierarchy ============================= */ +/* ========================================================================== */ + +GRL_INLINE void BinaryMortonCodeHierarchy_init(struct BinaryMortonCodeHierarchy *record, uint start, uint end) +{ + record->range.start = start; + record->range.end = end; + record->leftChild = -1; + record->rightChild = -1; +// record->flag = 0; +} + +GRL_INLINE uint BinaryMortonCodeHierarchy_getNumPrimitives(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID) +{ + /* leaf case */ + if (nodeID & (uint)(1 << 31)) + return 1; + + /* inner node case*/ + else + return nodes[nodeID].range.end - nodes[nodeID].range.start + 1; +} + +GRL_INLINE struct BinaryMortonCodeHierarchy BinaryMortonCodeHierarchy_getEntry(global struct BinaryMortonCodeHierarchy* nodes, uint nodeID) +{ + struct BinaryMortonCodeHierarchy entry; + + if (nodeID & (uint)(1 << 31)) { + /* leaf case */ + uint rangeStart = nodeID ^ (uint)(1 << 31); + BinaryMortonCodeHierarchy_init(&entry, rangeStart, rangeStart); + } + else { + /* inner node case*/ + entry = nodes[nodeID]; + } + + return entry; +} + +GRL_INLINE uint BinaryMortonCodeHierarchy_getRangeStart(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID) +{ + /* leaf case */ + if (nodeID & (uint)(1 << 31)) + return nodeID ^ (uint)(1 << 31); + + /* inner node case*/ + else + return nodes[nodeID].range.start; +} + +/* ==================================================================== */ +/* ============================== RANGE =============================== */ +/* ==================================================================== */ + +GRL_INLINE void printRange(struct Range *range) +{ + printf("start %d end %d \n", range->start, range->end); +} + +GRL_INLINE bool equalRange(struct Range *range0, struct Range *range1) +{ + if (range0->start == range1->start && + range0->end == range1->end) + return true; + return false; +} + +GRL_INLINE uint getSizeRange(struct Range *range) +{ + return range->end - range->start; +} + +/* ==================================================================== */ +/* ========================= ProceduralLeaf =========================== */ +/* ==================================================================== */ + +#if 0 +struct ProceduralLeaf +{ + uint shaderIndex_geomMask; + uint geomIndex_flags; + uint N_last; + uint primIndex[13]; +}; +#endif + +GRL_INLINE uint ProceduralLeaf_geomIndex(global struct ProceduralLeaf *This) +{ + return This->leafDesc.geomIndex_flags & 0x1FFFFFFF; +} + +GRL_INLINE uint ProceduralLeaf_primIndex(global struct ProceduralLeaf *This, uint i) +{ + //assert(i < N); + return This->_primIndex[i]; +} + +/* 
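The BinaryMortonCodeHierarchy helpers above encode leaves directly in the node ID: if bit 31 is set, the remaining 31 bits hold the start of a single-primitive range, otherwise the ID indexes an inner node. A small encode/decode sketch of that convention (illustrative only, not part of this patch):

GRL_INLINE uint MortonHierarchy_encodeLeafID( uint rangeStart )
{
    // leaf IDs carry the range start in the low 31 bits, with bit 31 set
    return rangeStart | (uint)(1 << 31);
}

GRL_INLINE bool MortonHierarchy_isLeafID( uint nodeID )
{
    return ( nodeID & (uint)(1 << 31) ) != 0;
}

GRL_INLINE uint MortonHierarchy_leafRangeStart( uint nodeID )
{
    // mirrors the leaf case of BinaryMortonCodeHierarchy_getRangeStart()
    return nodeID ^ (uint)(1 << 31);
}
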
==================================================================== */ +/* =========================== TrianglePair =========================== */ +/* ==================================================================== */ + +struct TrianglePair +{ + uint4 a; // indices of the 4 verts to store in the quad + uint3 lb; // index of the second triangle's verts in 'a' +}; + +GRL_INLINE struct TrianglePair TrianglePair_Constructor(uint3 tri0, uint primID0, uint3 tri1, uint primID1) +{ + struct TrianglePair q; + q.a.x = tri0.x; + q.a.y = tri0.y; + q.a.z = tri0.z; + q.a.w = tri0.z; + + uint3 b; + b.x = tri1.x; + b.y = tri1.y; + b.z = tri1.z; + + q.lb = (uint3)(3); + + q.lb.x = (b.x == q.a.x) ? 0 : q.lb.x; + q.lb.y = (b.y == q.a.x) ? 0 : q.lb.y; + q.lb.z = (b.z == q.a.x) ? 0 : q.lb.z; + + q.lb.x = (b.x == q.a.y) ? 1 : q.lb.x; + q.lb.y = (b.y == q.a.y) ? 1 : q.lb.y; + q.lb.z = (b.z == q.a.y) ? 1 : q.lb.z; + + q.lb.x = (b.x == q.a.z) ? 2 : q.lb.x; + q.lb.y = (b.y == q.a.z) ? 2 : q.lb.y; + q.lb.z = (b.z == q.a.z) ? 2 : q.lb.z; + + q.lb.x = (primID0 != primID1) ? q.lb.x : 0; + q.lb.y = (primID0 != primID1) ? q.lb.y : 0; + q.lb.z = (primID0 != primID1) ? q.lb.z : 0; + + q.a.w = (q.lb.x == 3) ? b.x : q.a.w; + q.a.w = (q.lb.y == 3) ? b.y : q.a.w; + q.a.w = (q.lb.z == 3) ? b.z : q.a.w; + + return q; +} + +GRL_INLINE float InstanceDesc_get_transform(const InstanceDesc *d, const uint32_t row, const uint32_t column) +{ + return d->Transform[row][column]; +} + +GRL_INLINE uint32_t InstanceDesc_get_instanceID(const InstanceDesc *d) +{ + return d->InstanceIDAndMask & (0x00FFFFFF); +} + +GRL_INLINE uint32_t InstanceDesc_get_InstanceMask(const InstanceDesc *d) +{ + return d->InstanceIDAndMask >> 24; +} + +GRL_INLINE uint32_t InstanceDesc_get_InstanceContributionToHitGroupIndex(const InstanceDesc *d) +{ + return d->InstanceContributionToHitGroupIndexAndFlags & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t InstanceDesc_get_InstanceFlags(const InstanceDesc *d) +{ + return d->InstanceContributionToHitGroupIndexAndFlags >> 24; +} + +GRL_INLINE gpuva_t InstanceDesc_get_AccelerationStructure(const InstanceDesc *d) +{ + return d->AccelerationStructureGPUVA; +} + +GRL_INLINE void InstanceDesc_set_transform(InstanceDesc *d, const uint32_t row, const uint32_t column, float value) +{ + d->Transform[row][column] = value; +} + +GRL_INLINE void InstanceDesc_set_instanceID(InstanceDesc *d, const uint32_t id) +{ + d->InstanceIDAndMask &= 255 << 24; + d->InstanceIDAndMask |= id & ((1 << 24) - 1); +} + +GRL_INLINE void InstanceDesc_set_InstanceMask(InstanceDesc *d, const uint32_t mask) +{ + d->InstanceIDAndMask &= ((1 << 24) - 1); + d->InstanceIDAndMask |= mask << 24; +} + +GRL_INLINE void InstanceDesc_set_InstanceContributionToHitGroupIndex(InstanceDesc *d, const uint32_t contribution) +{ + d->InstanceContributionToHitGroupIndexAndFlags &= 255 << 24; + d->InstanceContributionToHitGroupIndexAndFlags |= contribution & ((1 << 24) - 1); +} + +GRL_INLINE void InstanceDesc_set_InstanceFlags(InstanceDesc *d, const uint32_t flags) +{ + d->InstanceContributionToHitGroupIndexAndFlags &= ((1 << 24) - 1); + d->InstanceContributionToHitGroupIndexAndFlags |= flags << 24; +} + +GRL_INLINE void InstanceDesc_set_AccelerationStructure(InstanceDesc *d, gpuva_t address) +{ + d->AccelerationStructureGPUVA = address; +} diff --git a/src/intel/vulkan/grl/gpu/copy.grl b/src/intel/vulkan/grl/gpu/copy.grl new file mode 100644 index 00000000000..1bb500a4ea0 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/copy.grl @@ -0,0 +1,129 @@ +// +// Copyright (C) 2009-2021 
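TrianglePair_Constructor() above builds a quad by keeping tri0's three vertices, mapping each vertex of tri1 onto a matching slot of 'a', and, for the usual case of two triangles sharing an edge, placing tri1's one unshared vertex in a.w (slot 3). A worked example (illustrative only; the vertex indices are made up and TrianglePair_example is not part of this patch):

GRL_INLINE struct TrianglePair TrianglePair_example( void )
{
    // two triangles sharing the edge (11,12)
    const uint3 tri0 = (uint3)(10, 11, 12);
    const uint3 tri1 = (uint3)(12, 11, 20);

    struct TrianglePair p = TrianglePair_Constructor( tri0, 0, tri1, 1 );

    // expected result:
    //   p.a  == (10, 11, 12, 20)  -- the quad's four unique vertex indices
    //   p.lb == ( 2,  1,  3)      -- tri1 = (a[2], a[1], a[3]); slot 3 marks
    //                                the vertex used only by the second triangle
    return p;
}
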
Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module copy; // In copy we assume output data structure to be DXR compatible + +kernel clone_indirect < source="bvh_copy.cl", kernelFunction="clone_indirect" > +kernel compact < source="bvh_copy.cl", kernelFunction="compact" > +kernel serialize_indirect < source="bvh_copy.cl", kernelFunction="serialize_indirect" > +kernel serialize_for_input_dump_indirect < source="bvh_copy.cl", kernelFunction="serialize_for_input_dump_indirect" > +kernel deserialize_indirect < source="bvh_copy.cl", kernelFunction="deserialize_indirect" > +kernel dxr_decode < source="bvh_copy.cl", kernelFunction="dxr_decode" > + +metakernel clone_indirect( + qword dest, + qword src, + qword srcBVHsizedwordAddr) +{ +// this has to be compatible with in kernel GroupCountForCopy(...) + define byteSize REG0; + define numGroupsRqd REG1; + define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255; + define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8; + define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4; + byteSize = load_dword(srcBVHsizedwordAddr); + numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; + numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; + + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect clone_indirect args( + dest, + src); +} + +metakernel compact( + qword dest, + qword src) +{ + dispatch compact(32,1,1) args( + dest, + src, + 32); +} + +metakernel serialize_indirect( + qword dest, + qword src, + qword driverID, + qword srcBVHsizedwordAddr) +{ + define byteSize REG0; + define numGroupsRqd REG1; + define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255; + define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8; + define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4; + byteSize = load_dword(srcBVHsizedwordAddr); + numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; + numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect serialize_indirect args( + dest, + src, + driverID); +} + +metakernel serialize_for_input_dump_indirect( + qword batchPtrs, + qword dstOffset, + qword src, + qword driverID, + qword srcBVHsizedwordAddr) +{ + define byteSize REG0; + define numGroupsRqd REG1; + define BYTE_PER_GROUP_CHUNK_SHIFT REG2; BYTE_PER_GROUP_CHUNK_SHIFT = 8; + define REMINDER_NUM_GROUPS REG3; REMINDER_NUM_GROUPS = 4; + byteSize = load_dword(srcBVHsizedwordAddr); + numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; + numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect serialize_for_input_dump_indirect args( + batchPtrs, + dstOffset, + src, + driverID); +} + +metakernel deserialize_indirect( + qword dest, + qword src, + qword srcBVHsizedwordAddr) +{ + define byteSize REG0; + define numGroupsRqd REG1; + define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255; + define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8; + define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4; + byteSize = load_dword(srcBVHsizedwordAddr); + numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; + numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect deserialize_indirect args( + dest, + src); +} + +metakernel dxr_decode( 
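Each *_indirect metakernel above sizes its dispatch the same way: it loads the source BVH size in bytes, takes one work-group per 256-byte chunk (BYTE_PER_GROUP_CHUNK_SHIFT = 8), and adds REMINDER_NUM_GROUPS = 4 groups of slack for the tail. The same arithmetic written out as a plain helper (illustrative only, not part of this patch; the in-kernel GroupCountForCopy() referenced in the comment is the authoritative counterpart):

GRL_INLINE uint copy_group_count_sketch( uint byteSize )
{
    const uint BYTE_PER_GROUP_CHUNK_SHIFT = 8;  // 256 bytes handled per work-group
    const uint REMINDER_NUM_GROUPS        = 4;  // fixed slack for the remainder
    return ( byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT ) + REMINDER_NUM_GROUPS;
}
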
+ qword dest, + qword src) +{ + dispatch dxr_decode(1,1,1) args( + dest, + src); +} diff --git a/src/intel/vulkan/grl/gpu/d3d12.h b/src/intel/vulkan/grl/gpu/d3d12.h new file mode 100644 index 00000000000..32a7654eac5 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/d3d12.h @@ -0,0 +1,525 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once +#include "GRLStructs.h" +#include "shared.h" + +typedef global void *D3D12_GPU_VIRTUAL_ADDRESS; +typedef void *ID3D12StateObjectPrototype; + +enum DXGI_FORMAT +{ + DXGI_FORMAT_UNKNOWN, + DXGI_FORMAT_R32G32B32A32_TYPELESS, + DXGI_FORMAT_R32G32B32A32_FLOAT, + DXGI_FORMAT_R32G32B32A32_UINT, + DXGI_FORMAT_R32G32B32A32_SINT, + DXGI_FORMAT_R32G32B32_TYPELESS, + DXGI_FORMAT_R32G32B32_FLOAT, + DXGI_FORMAT_R32G32B32_UINT, + DXGI_FORMAT_R32G32B32_SINT, + DXGI_FORMAT_R16G16B16A16_TYPELESS, + DXGI_FORMAT_R16G16B16A16_FLOAT, + DXGI_FORMAT_R16G16B16A16_UNORM, + DXGI_FORMAT_R16G16B16A16_UINT, + DXGI_FORMAT_R16G16B16A16_SNORM, + DXGI_FORMAT_R16G16B16A16_SINT, + DXGI_FORMAT_R32G32_TYPELESS, + DXGI_FORMAT_R32G32_FLOAT, + DXGI_FORMAT_R32G32_UINT, + DXGI_FORMAT_R32G32_SINT, + DXGI_FORMAT_R32G8X24_TYPELESS, + DXGI_FORMAT_D32_FLOAT_S8X24_UINT, + DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS, + DXGI_FORMAT_X32_TYPELESS_G8X24_UINT, + DXGI_FORMAT_R10G10B10A2_TYPELESS, + DXGI_FORMAT_R10G10B10A2_UNORM, + DXGI_FORMAT_R10G10B10A2_UINT, + DXGI_FORMAT_R11G11B10_FLOAT, + DXGI_FORMAT_R8G8B8A8_TYPELESS, + DXGI_FORMAT_R8G8B8A8_UNORM, + DXGI_FORMAT_R8G8B8A8_UNORM_SRGB, + DXGI_FORMAT_R8G8B8A8_UINT, + DXGI_FORMAT_R8G8B8A8_SNORM, + DXGI_FORMAT_R8G8B8A8_SINT, + DXGI_FORMAT_R16G16_TYPELESS, + DXGI_FORMAT_R16G16_FLOAT, + DXGI_FORMAT_R16G16_UNORM, + DXGI_FORMAT_R16G16_UINT, + DXGI_FORMAT_R16G16_SNORM, + DXGI_FORMAT_R16G16_SINT, + DXGI_FORMAT_R32_TYPELESS, + DXGI_FORMAT_D32_FLOAT, + DXGI_FORMAT_R32_FLOAT, + DXGI_FORMAT_R32_UINT, + DXGI_FORMAT_R32_SINT, + DXGI_FORMAT_R24G8_TYPELESS, + DXGI_FORMAT_D24_UNORM_S8_UINT, + DXGI_FORMAT_R24_UNORM_X8_TYPELESS, + DXGI_FORMAT_X24_TYPELESS_G8_UINT, + DXGI_FORMAT_R8G8_TYPELESS, + DXGI_FORMAT_R8G8_UNORM, + DXGI_FORMAT_R8G8_UINT, + DXGI_FORMAT_R8G8_SNORM, + DXGI_FORMAT_R8G8_SINT, + DXGI_FORMAT_R16_TYPELESS, + DXGI_FORMAT_R16_FLOAT, + DXGI_FORMAT_D16_UNORM, + DXGI_FORMAT_R16_UNORM, + DXGI_FORMAT_R16_UINT, + DXGI_FORMAT_R16_SNORM, + DXGI_FORMAT_R16_SINT, + DXGI_FORMAT_R8_TYPELESS, + DXGI_FORMAT_R8_UNORM, + DXGI_FORMAT_R8_UINT, + DXGI_FORMAT_R8_SNORM, + DXGI_FORMAT_R8_SINT, + DXGI_FORMAT_A8_UNORM, + DXGI_FORMAT_R1_UNORM, + DXGI_FORMAT_R9G9B9E5_SHAREDEXP, + DXGI_FORMAT_R8G8_B8G8_UNORM, + DXGI_FORMAT_G8R8_G8B8_UNORM, + DXGI_FORMAT_BC1_TYPELESS, + DXGI_FORMAT_BC1_UNORM, + DXGI_FORMAT_BC1_UNORM_SRGB, + DXGI_FORMAT_BC2_TYPELESS, + DXGI_FORMAT_BC2_UNORM, + DXGI_FORMAT_BC2_UNORM_SRGB, + DXGI_FORMAT_BC3_TYPELESS, + DXGI_FORMAT_BC3_UNORM, + DXGI_FORMAT_BC3_UNORM_SRGB, + DXGI_FORMAT_BC4_TYPELESS, + DXGI_FORMAT_BC4_UNORM, + DXGI_FORMAT_BC4_SNORM, + DXGI_FORMAT_BC5_TYPELESS, + DXGI_FORMAT_BC5_UNORM, + DXGI_FORMAT_BC5_SNORM, + DXGI_FORMAT_B5G6R5_UNORM, + DXGI_FORMAT_B5G5R5A1_UNORM, + DXGI_FORMAT_B8G8R8A8_UNORM, + DXGI_FORMAT_B8G8R8X8_UNORM, + DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM, + DXGI_FORMAT_B8G8R8A8_TYPELESS, + DXGI_FORMAT_B8G8R8A8_UNORM_SRGB, + DXGI_FORMAT_B8G8R8X8_TYPELESS, + DXGI_FORMAT_B8G8R8X8_UNORM_SRGB, + DXGI_FORMAT_BC6H_TYPELESS, + DXGI_FORMAT_BC6H_UF16, + DXGI_FORMAT_BC6H_SF16, + DXGI_FORMAT_BC7_TYPELESS, + DXGI_FORMAT_BC7_UNORM, + DXGI_FORMAT_BC7_UNORM_SRGB, + DXGI_FORMAT_AYUV, + DXGI_FORMAT_Y410, + 
DXGI_FORMAT_Y416, + DXGI_FORMAT_NV12, + DXGI_FORMAT_P010, + DXGI_FORMAT_P016, + DXGI_FORMAT_420_OPAQUE, + DXGI_FORMAT_YUY2, + DXGI_FORMAT_Y210, + DXGI_FORMAT_Y216, + DXGI_FORMAT_NV11, + DXGI_FORMAT_AI44, + DXGI_FORMAT_IA44, + DXGI_FORMAT_P8, + DXGI_FORMAT_A8P8, + DXGI_FORMAT_B4G4R4A4_UNORM, + DXGI_FORMAT_P208, + DXGI_FORMAT_V208, + DXGI_FORMAT_V408, + DXGI_FORMAT_FORCE_UINT +}; + +typedef enum D3D12_RAYTRACING_GEOMETRY_FLAGS +{ + D3D12_RAYTRACING_GEOMETRY_FLAG_NONE = 0, + D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE = 0x1, + D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2 +} D3D12_RAYTRACING_GEOMETRY_FLAGS; + +typedef enum D3D12_RAYTRACING_GEOMETRY_TYPE +{ + D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES = 0, + D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS = (D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES + 1) +} D3D12_RAYTRACING_GEOMETRY_TYPE; + +typedef enum D3D12_RAYTRACING_INSTANCE_FLAGS +{ + D3D12_RAYTRACING_INSTANCE_FLAG_NONE = 0, + D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1, + D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2, + D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_OPAQUE = 0x4, + D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8 +} D3D12_RAYTRACING_INSTANCE_FLAGS; + +typedef struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE +{ + D3D12_GPU_VIRTUAL_ADDRESS StartAddress; + unsigned long StrideInBytes; +} D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE; + +typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE +{ + D3D12_GPU_VIRTUAL_ADDRESS StartAddress; + unsigned long SizeInBytes; +} D3D12_GPU_VIRTUAL_ADDRESSRANGE; + +typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE +{ + D3D12_GPU_VIRTUAL_ADDRESS StartAddress; + unsigned long SizeInBytes; + unsigned long StrideInBytes; +} D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE; + +typedef struct D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC +{ + D3D12_GPU_VIRTUAL_ADDRESS Transform; + enum DXGI_FORMAT IndexFormat; + enum DXGI_FORMAT VertexFormat; + unsigned int IndexCount; + unsigned int VertexCount; + D3D12_GPU_VIRTUAL_ADDRESS IndexBuffer; + struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE VertexBuffer; +} D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC; + +typedef struct D3D12_RAYTRACING_AABB +{ + float MinX; + float MinY; + float MinZ; + float MaxX; + float MaxY; + float MaxZ; +} D3D12_RAYTRACING_AABB; + +GRL_INLINE void D3D12_set_raytracing_aabb(D3D12_RAYTRACING_AABB* dest, struct AABB* source) +{ + dest->MinX = source->lower.x; + dest->MinY = source->lower.y; + dest->MinZ = source->lower.z; + dest->MaxX = source->upper.x; + dest->MaxY = source->upper.y; + dest->MaxZ = source->upper.z; +} + +typedef struct D3D12_RAYTRACING_GEOMETRY_AABBS_DESC +{ + unsigned long AABBCount; + D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE AABBs; +} D3D12_RAYTRACING_GEOMETRY_AABBS_DESC; + +typedef struct D3D12_RAYTRACING_GEOMETRY_DESC +{ + D3D12_RAYTRACING_GEOMETRY_TYPE Type; + D3D12_RAYTRACING_GEOMETRY_FLAGS Flags; + //unsigned int ShaderIndex : 24; // extension + //unsigned int Mask : 8; // extension + //unsigned int ShaderIndex_Mask; // extension + union { + D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC Triangles; + D3D12_RAYTRACING_GEOMETRY_AABBS_DESC AABBs; + }; +} D3D12_RAYTRACING_GEOMETRY_DESC; + +GRL_INLINE void D3D12_set_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_TYPE type) +{ + geomDesc->Type = type; +} + +GRL_INLINE D3D12_RAYTRACING_GEOMETRY_TYPE D3D12_get_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Type; +} + +GRL_INLINE void D3D12_set_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, 
D3D12_RAYTRACING_GEOMETRY_FLAGS flags) +{ + geomDesc->Flags = flags; +} + +GRL_INLINE D3D12_RAYTRACING_GEOMETRY_FLAGS D3D12_get_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Flags; +} + +GRL_INLINE void D3D12_set_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS transform) +{ + geomDesc->Triangles.Transform = transform; +} + +GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.Transform; +} + +GRL_INLINE void D3D12_set_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, IndexFormat format) +{ + switch (format) + { + case INDEX_FORMAT_NONE: + geomDesc->Triangles.IndexFormat = DXGI_FORMAT_UNKNOWN; + break; + case INDEX_FORMAT_R16_UINT: + geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R16_UINT; + break; + case INDEX_FORMAT_R32_UINT: + geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R32_UINT; + break; + } +} + +GRL_INLINE IndexFormat D3D12_get_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + switch (geomDesc->Triangles.IndexFormat) + { + case DXGI_FORMAT_R16_UINT: + return INDEX_FORMAT_R16_UINT; + case DXGI_FORMAT_R32_UINT: + return INDEX_FORMAT_R32_UINT; + case DXGI_FORMAT_UNKNOWN: + default: + return INDEX_FORMAT_NONE; + } +} + +GRL_INLINE void D3D12_set_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, VertexFormat format) +{ + switch (format) + { + case VERTEX_FORMAT_R32G32_FLOAT: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32_FLOAT; + break; + case VERTEX_FORMAT_R32G32B32_FLOAT: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32B32_FLOAT; + break; + case VERTEX_FORMAT_R16G16_FLOAT: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_FLOAT; + break; + case VERTEX_FORMAT_R16G16B16A16_FLOAT: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + case VERTEX_FORMAT_R16G16_SNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_SNORM; + break; + case VERTEX_FORMAT_R16G16B16A16_SNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_SNORM; + break; + case VERTEX_FORMAT_R16G16B16A16_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_UNORM; + break; + case VERTEX_FORMAT_R16G16_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_UNORM; + break; + case VERTEX_FORMAT_R10G10B10A2_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R10G10B10A2_UNORM; + break; + case VERTEX_FORMAT_R8G8B8A8_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_UNORM; + break; + case VERTEX_FORMAT_R8G8_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_UNORM; + break; + case VERTEX_FORMAT_R8G8B8A8_SNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_SNORM; + break; + case VERTEX_FORMAT_R8G8_SNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_SNORM; + break; + } +} + +GRL_INLINE VertexFormat D3D12_get_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + switch(geomDesc->Triangles.VertexFormat) + { + case DXGI_FORMAT_R32G32_FLOAT: + return VERTEX_FORMAT_R32G32_FLOAT; + case DXGI_FORMAT_R32G32B32_FLOAT: + return VERTEX_FORMAT_R32G32B32_FLOAT; + case DXGI_FORMAT_R16G16_FLOAT: + return VERTEX_FORMAT_R16G16_FLOAT; + case DXGI_FORMAT_R16G16B16A16_FLOAT: + return VERTEX_FORMAT_R16G16B16A16_FLOAT; + case DXGI_FORMAT_R16G16_SNORM: + return VERTEX_FORMAT_R16G16_SNORM; + case DXGI_FORMAT_R16G16B16A16_SNORM: + return VERTEX_FORMAT_R16G16B16A16_SNORM; + case DXGI_FORMAT_R16G16B16A16_UNORM: + 
return VERTEX_FORMAT_R16G16B16A16_UNORM; + case DXGI_FORMAT_R16G16_UNORM: + return VERTEX_FORMAT_R16G16_UNORM; + case DXGI_FORMAT_R10G10B10A2_UNORM: + return VERTEX_FORMAT_R10G10B10A2_UNORM; + case DXGI_FORMAT_R8G8B8A8_UNORM: + return VERTEX_FORMAT_R8G8B8A8_UNORM; + case DXGI_FORMAT_R8G8_UNORM: + return VERTEX_FORMAT_R8G8_UNORM; + case DXGI_FORMAT_R8G8B8A8_SNORM: + return VERTEX_FORMAT_R8G8B8A8_SNORM; + case DXGI_FORMAT_R8G8_SNORM: + return VERTEX_FORMAT_R8G8_SNORM; + default: + return VERTEX_FORMAT_R32G32_FLOAT; + } +} + +GRL_INLINE void D3D12_set_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count) +{ + geomDesc->Triangles.IndexCount = count; +} + +GRL_INLINE unsigned int D3D12_get_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.IndexCount; +} + +GRL_INLINE void D3D12_set_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count) +{ + geomDesc->Triangles.VertexCount = count; +} + +GRL_INLINE unsigned int D3D12_get_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.VertexCount; +} + +GRL_INLINE void D3D12_set_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS buffer) +{ + geomDesc->Triangles.IndexBuffer = buffer; +} + +GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.IndexBuffer; +} + +GRL_INLINE void D3D12_set_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address) +{ + geomDesc->Triangles.VertexBuffer.StartAddress = address; +} + +GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.VertexBuffer.StartAddress; +} + +GRL_INLINE void D3D12_set_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride) +{ + geomDesc->Triangles.VertexBuffer.StrideInBytes = stride; +} + +GRL_INLINE unsigned long D3D12_get_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.VertexBuffer.StrideInBytes; +} + +GRL_INLINE void D3D12_set_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long count) +{ + geomDesc->AABBs.AABBCount = count; +} + +GRL_INLINE unsigned long D3D12_get_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->AABBs.AABBCount; +} + +GRL_INLINE void D3D12_set_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address) +{ + geomDesc->AABBs.AABBs.StartAddress = address; +} + +GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->AABBs.AABBs.StartAddress; +} + +GRL_INLINE void D3D12_set_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride) +{ + geomDesc->AABBs.AABBs.StrideInBytes = stride; +} + +GRL_INLINE unsigned long D3D12_get_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->AABBs.AABBs.StrideInBytes; +} + +typedef struct D3D12_RAYTRACING_INSTANCE_DESC +{ + float Transform[12]; + // unsigned int InstanceID : 24; + // unsigned int InstanceMask : 8; + uint32_t DW0; + // unsigned int InstanceContributionToHitGroupIndex : 24; + // unsigned int Flags : 8; + uint32_t DW1; + global char 
*AccelerationStructure; +} D3D12_RAYTRACING_INSTANCE_DESC; + +GRL_INLINE float D3D12_get_transform(const D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column) +{ + return d->Transform[row * 4 + column]; +} + +GRL_INLINE uint32_t D3D12_get_instanceID(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return d->DW0 & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t D3D12_get_InstanceMask(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return d->DW0 >> 24; +} + +GRL_INLINE uint32_t D3D12_get_InstanceContributionToHitGroupIndex(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return d->DW1 & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t D3D12_get_InstanceFlags(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return d->DW1 >> 24; +} + +GRL_INLINE gpuva_t D3D12_get_AccelerationStructure(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return (gpuva_t)d->AccelerationStructure; +} + +GRL_INLINE void D3D12_set_transform(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column, float value) +{ + d->Transform[row * 4 + column] = value; +} + +GRL_INLINE void D3D12_set_instanceID(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t id) +{ + d->DW0 &= 255 << 24; + d->DW0 |= id & ((1 << 24) - 1); +} + +GRL_INLINE void D3D12_set_InstanceMask(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t mask) +{ + d->DW0 &= ((1 << 24) - 1); + d->DW0 |= mask << 24; +} + +GRL_INLINE void D3D12_set_InstanceContributionToHitGroupIndex(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t contribution) +{ + d->DW1 &= 255 << 24; + d->DW1 |= contribution & ((1 << 24) - 1); +} + +GRL_INLINE void D3D12_set_InstanceFlags(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t flags) +{ + d->DW1 &= ((1 << 24) - 1); + d->DW1 |= flags << 24; +} + +GRL_INLINE void D3D12_set_AccelerationStructure(D3D12_RAYTRACING_INSTANCE_DESC *d, gpuva_t address) +{ + d->AccelerationStructure = (global char*)address; +} diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl new file mode 100644 index 00000000000..d37adbbbb2b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl @@ -0,0 +1,59 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" + +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel CopyGeom( + global struct Geo *src, + global struct Geo *dst, + global float4 *vec, + global ushort *indices, + dword step) +{ + src = src + get_group_id(0); + dst = dst + get_group_id(0); + dst->Flags = src->Flags; + dst->Type = src->Type; + if (src->Type == GEOMETRY_TYPE_PROCEDURAL) + { + dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride; + dst->Desc.Procedural.AABBCount = src->Desc.Procedural.AABBCount; + dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride; + } + else + { + dst->Desc.Triangles.pTransformBuffer = src->Desc.Triangles.pTransformBuffer; + if (step == 0) + return; + dst->Desc.Triangles.IndexCount = src->Desc.Triangles.IndexCount; + if (step == 1) + return; + dst->Desc.Triangles.VertexCount = src->Desc.Triangles.VertexCount; + if (step == 2) + return; + dst->Desc.Triangles.IndexFormat = src->Desc.Triangles.IndexFormat; + if (step == 3) + return; + dst->Desc.Triangles.pIndexBuffer = src->Desc.Triangles.pIndexBuffer; + if (step == 4) + return; + dst->Desc.Triangles.pVertexBuffer = src->Desc.Triangles.pVertexBuffer; + if (step == 5) + return; + dst->Desc.Triangles.VertexBufferByteStride = 
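The accessors above pack two 24-bit/8-bit pairs: DW0 holds InstanceID in the low 24 bits and InstanceMask in the high 8 bits, and DW1 holds InstanceContributionToHitGroupIndex and Flags in the same layout. A small round-trip sketch (illustrative only, not part of this patch):

GRL_INLINE uint32_t D3D12_pack_id_and_mask_example( uint32_t id, uint32_t mask )
{
    D3D12_RAYTRACING_INSTANCE_DESC d;
    d.DW0 = 0;
    D3D12_set_instanceID( &d, id );      // writes the low 24 bits
    D3D12_set_InstanceMask( &d, mask );  // writes the high 8 bits
    // afterwards: D3D12_get_instanceID(&d)   == (id & 0xFFFFFF)
    //             D3D12_get_InstanceMask(&d) == (mask & 0xFF)
    return d.DW0;
}
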
src->Desc.Triangles.VertexBufferByteStride; + + dst->Desc.Triangles.VertexFormat = src->Desc.Triangles.VertexFormat; + + for (uint t = 0; t * 3 < dst->Desc.Triangles.IndexCount; t++) + { + uint3 tri = GRL_load_triangle(src, t); + vec[t * 3] = GRL_load_vertex(src, tri[0]); + vec[t * 3 + 1] = GRL_load_vertex(src, tri[1]); + vec[t * 3 + 2] = GRL_load_vertex(src, tri[2]); + } + } +} diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl new file mode 100644 index 00000000000..3779439c54b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl @@ -0,0 +1,27 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module api_interface_verify; + +kernel copy_geom < source="grl_api_interface_verify.cl", kernelFunction="CopyGeom" > + +metakernel ifc0_copy( + qword src, + qword dst, + qword vec, + qword srcIndices, + dword numGroups, + dword step) +{ + dispatch copy_geom(numGroups,1,1) args( + src, + dst, + vec, + srcIndices, + step + ); +} diff --git a/src/intel/vulkan/grl/gpu/input_dump.cl b/src/intel/vulkan/grl/gpu/input_dump.cl new file mode 100644 index 00000000000..f668f053f1f --- /dev/null +++ b/src/intel/vulkan/grl/gpu/input_dump.cl @@ -0,0 +1,723 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" +#include "d3d12.h" +#include "mem_utils.h" +#include "misc_shared.h" + +/// Align value to 128 +/// +/// @param value vale to align +/// @return aligned value +GRL_INLINE ulong AlignTo128(ulong value) { return ((value + 127) / 128) * 128; } + +GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs) { + return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch))); +} + +/// Finds max used byte in vertex buffer +/// +/// @param indexBuffPtr pointer to index buffer +/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers +/// @param IndexCount number of indices in index buffer +/// @param IndexFormat index format +/// @param VertexCount number of vertices in vertex buffer +/// @param VertexBufferByteStride vertex buffer byte stride +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel find_max_used_byte_in_buff( + global void* indexBuffPtr, + global uint* vertexBufferUsedByteEnd, + dword IndexCount, + dword IndexFormat, + dword VertexCount, + qword VertexBufferByteStride) +{ + local uint sgMax[16]; + uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0); + + if (IndexFormat != INDEX_FORMAT_NONE) + { + uint endByte = 0; + if (glob_id < IndexCount) + { + if (IndexFormat == INDEX_FORMAT_R16_UINT) + { + global ushort* indexBuffPtrShort = (global ushort*) indexBuffPtr; + endByte = indexBuffPtrShort[glob_id]; + } + else + { + global uint* indexBuffPtrUint = (global uint*) indexBuffPtr; + endByte = indexBuffPtrUint[glob_id]; + } + } + + endByte = sub_group_reduce_max(endByte); + + if (get_sub_group_local_id() == 0) { sgMax[get_sub_group_id()] = endByte; } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_sub_group_id() == 0) + { + endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]); + if (get_sub_group_local_id() == 0) + { + endByte = min(endByte, VertexCount); + if (endByte < VertexCount && IndexCount != 0) + ++endByte; + endByte *= (dword)VertexBufferByteStride; + atomic_max(vertexBufferUsedByteEnd, endByte); + } + } + } + else if 
(glob_id == 0) + { + uint endByte = VertexCount * VertexBufferByteStride; + atomic_max(vertexBufferUsedByteEnd, endByte); + } +} + +/// Allocates buffer for vertices +/// +/// @param batchPtrs batch pointers struct +/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers +/// @param vertexBufferOffset pointer to offsets to vertex buffers +/// @param numVertexBuffers number of vertex buffers +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel allocate_linear_offsets_for_vertex_buffers( + global InputBatchPtrs* batchPtrs, + global uint* vertexBufferUsedByteEnd, + global uint* vertexBufferOffset, + dword numVertexBuffers) +{ + uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id(); + + if (glob_id < numVertexBuffers) + { + uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]); + uint position = atomic_add_global( &batchPtrs->vertexBuffersSize, numBytes); + vertexBufferOffset[glob_id] = position; + } +} + +/// Sets the dst data space for input dump of this batch +/// +/// @param inputDumpMainBuffer pointer to main dump buffer +/// @param batchPtrs batch pointers struct +/// @param nonVertexSize size of non vertex data +/// @param batchIdPtr pointer to batch id +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel allocate_data_space_for_inputs( + global DebugBufferHeader* inputDumpMainBuffer, + global InputBatchPtrs* batchPtrs, + uint nonVertexSize, + global qword* batchIdPtr) +{ + if (get_sub_group_local_id() == 0) + { + uint vertexBufferSize = batchPtrs->vertexBuffersSize; + uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize; + + if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2)) + { + inputDumpMainBuffer->overflow = 1; + batchPtrs->dumpDst = 0; + batchPtrs->globalDumpBuffer = 0; + batchPtrs->nonVertexDataStart = 0; + batchPtrs->totalSize = 0; + return; + } + + dword prevHead = inputDumpMainBuffer->gpuHead; + dword newHead; + bool circled; + + do + { + circled = false; + newHead = prevHead + sizeOfThisBatch; + dword bufferBegin = prevHead; + if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize) + { + circled = true; + newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch; + bufferBegin = inputDumpMainBuffer->headStart; + } + dword bufferEnd = newHead + sizeof(InputBatch); + + uint tail; + uint tail2 = 7; + bool wait; + do + { + wait = true; + tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0); + + // dead code, workaround so IGC won't move tail load out of loop + if (tail > inputDumpMainBuffer->totalSize) + { + store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2); + tail2 = tail; + } + + if( prevHead >= tail ) + { + //colision example: + // ----------T=======H------------ + // -------B=====E----------------- + // + if((bufferEnd < tail) || (bufferBegin >= prevHead)) + { + wait = false; + } + } + else + { + //colision example: + // ==========H-------T============ + // B==============E--------------- + // caution: we will never have H circled completely so that H == T + if((bufferEnd < tail) && (bufferBegin >= prevHead)) + { + wait = false; + } + } + } while (wait); + } while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead)); + + if (circled) + { + global InputBatch* endBufferOp = (global 
InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead); + endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER; + prevHead = inputDumpMainBuffer->headStart; + } + + global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead; + batchPtrs->dumpDst = (qword)thisBatchDump; + batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer; + batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize); + batchPtrs->totalSize = sizeOfThisBatch; + + global InputBatch* batchOp = (global InputBatch*) thisBatchDump; + batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH; + batchOp->header.opHeader.endOfData = sizeOfThisBatch; + batchOp->vertexBufferDataSize = vertexBufferSize; + batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize; + batchOp->batchId = *batchIdPtr; + } +} + +/// Sets the dst data space for output dump of this batch +/// +/// @param outputDumpMainBuffer pointer to main dump buffer +/// @param batchPtrs batch pointers struct +/// @param batchIdPtr pointer to batch id +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel allocate_data_space_for_outputs( + global DebugBufferHeader* outputDumpMainBuffer, + global OutputBatchPtrs* batchPtrs, + global qword* batchIdPtr) +{ + if (get_sub_group_local_id() == 0) + { + uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize; + + if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2)) + { + outputDumpMainBuffer->overflow = 1; + batchPtrs->dumpDst = 0; + batchPtrs->dataStart = 0; + batchPtrs->totalSize = 0; + return; + } + + dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead)); + dword newHead; + bool circled; + + do + { + //mem_fence_gpu_invalidate(); + //prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead)); + circled = false; + newHead = prevHead + sizeOfThisBatch; + dword bufferBegin = prevHead; + if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize) + { + circled = true; + newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch; + bufferBegin = outputDumpMainBuffer->headStart; + } + dword bufferEnd = newHead + sizeof(OutputBatch); + + uint tail; + uint tail2 = 7; + bool wait; + do + { + wait = true; + tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0); + + // dead code, workaround so IGC won't move tail load out of loop + if (tail > outputDumpMainBuffer->totalSize) + { + store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2); + tail2 = tail; + } + + if( prevHead >= tail ) + { + //colision example: + // ----------T=======H------------ + // -------B=====E----------------- + // + if((bufferEnd < tail) || (bufferBegin >= prevHead)) + { + wait = false; + } + } + else + { + //colision example: + // ==========H-------T============ + // B==============E--------------- + // caution: we will never have H circled completely so that H == T + if((bufferEnd < tail) && (bufferBegin >= prevHead)) + { + wait = false; + } + } + } while (wait); + } while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead)); + + if (circled) + { + global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead); + endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER; + prevHead = outputDumpMainBuffer->headStart; + } + + 
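allocate_data_space_for_inputs() above (and the near-identical allocate_data_space_for_outputs()) reserves a region [bufferBegin, bufferEnd) of the circular dump buffer by advancing gpuHead with a compare-exchange, but only after spinning until that region can no longer overlap unread data between the consumer's tail and the current head -- the two collision diagrams in the comments. The predicate being waited on boils down to the sketch below (illustrative only, not part of this patch; prevHead is the head value the compare-exchange will try to advance):

GRL_INLINE bool dump_region_is_free( uint tail, uint prevHead,
                                     uint bufferBegin, uint bufferEnd )
{
    if ( prevHead >= tail )
    {
        // unread data occupies [tail, prevHead): the new region must end
        // before the tail or begin at/after the old head
        return ( bufferEnd < tail ) || ( bufferBegin >= prevHead );
    }
    else
    {
        // unread data wraps past the end of the buffer; only [prevHead, tail)
        // is free, so both checks must hold
        return ( bufferEnd < tail ) && ( bufferBegin >= prevHead );
    }
}
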
global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead; + batchPtrs->dumpDst = (qword)thisBatchDump; + batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch))); + batchPtrs->totalSize = sizeOfThisBatch; + + global OutputBatch* batchOp = (global OutputBatch*) thisBatchDump; + batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH; + batchOp->header.opHeader.endOfData = sizeOfThisBatch; + batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch)); + batchOp->batchId = *batchIdPtr; + } +} + +/// Calculates sum of output sizes +/// +/// @param pbi pointer to post build infos +/// @param destOffset offset in dest buffer +/// @param numOutputs number of outputs +/// @param batchPtrs batch pointers struct +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel calc_outputs_data_size( + global PostbuildInfoSerializationDesc* pbi, + global dword* destOffsets, + qword numOutputs, + global OutputBatchPtrs* batchPtrs) +{ + uint offset = 0; + for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH) + { + uint size = 0; + if (i < numOutputs) + { + size = AlignTo128(pbi[i].SerializedSizeInBytes); + size += AlignTo128(sizeof(OutputData)); + destOffsets[i] = offset + sub_group_scan_exclusive_add(size); + } + offset += sub_group_reduce_add(size); + } + if (get_sub_group_local_id() == 0) + batchPtrs->dataSize = offset; +} + +/// Adds output data operation to batch +/// +/// @param batchPtrs batch pointers struct +/// @param destOffset offset in dest buffer +/// @param src pointer to source bvh +/// @param pbi pointer to post build info +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel write_output_data_op( + global OutputBatchPtrs* batchPtrs, + global dword* destOffset, + qword src, + global PostbuildInfoSerializationDesc* pbi) +{ + if (batchPtrs->dataStart == 0) + return; + + global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset); + out->header.operationType = OUTPUT_DUMP_OP_DATA; + out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes); + out->srcBvhPtr = src; +} + +/// Writes indices and transform or procedurals data +/// +/// @param batchPtrs batch pointers struct +/// @param srcDesc description of source geometry +/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer +/// @param dstDescOffset offset to dest geo desc +/// @param dstDataOffset offset to dest geo data +/// @param numThreads number of threads +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel write_geo_data( + global InputBatchPtrs* batchPtrs, + global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc, + global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers, + global uint* pVertexBufferSize, + qword dstDescOffset, + qword dstDataOffset, + dword numThreads) +{ + if (batchPtrs->dumpDst == 0) return; + + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + + GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc; + + global char* dstDataPtr = (global char*)( + batchPtrs->nonVertexDataStart + dstDataOffset); + + global char* srcDataPtr; + global char* dstTransform; + uint bytesToCopy = 0; + + if (geoDescToStore.Type == 
GEOMETRY_TYPE_TRIANGLES) + { + uint sizeOfMatrix = 0; + + if (geoDescToStore.Desc.Triangles.pTransformBuffer) + { + sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float)); + if (glob_id < 12) + { + global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer; + global float* matrixDst = (global float*)dstDataPtr; + matrixDst[glob_id] = matrixSrc[glob_id]; + if (glob_id == 0) + { + geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer; + } + } + } + + dstDataPtr += sizeOfMatrix; + srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer; + + bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount); + + if (bytesToCopy && (glob_id == 0)) + { + qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers); + // for this we remember offset relative to global debug buffer + geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer; + geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer; + geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride; + } + else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE && geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0) + { + if (geoDescToStore.Desc.Triangles.pVertexBuffer) + { + qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers); + // for this we remember offset relative to global debug buffer + geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer; + } + } + else if (glob_id == 0) + { + geoDescToStore.Desc.Triangles.IndexCount = 0; + geoDescToStore.Desc.Triangles.VertexCount = 0; + geoDescToStore.Desc.Triangles.pVertexBuffer = 0; + geoDescToStore.Desc.Triangles.pIndexBuffer = 0; + } + } + else + { + srcDataPtr = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA; + bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount); + if (glob_id == 0) + { + geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer; + } + } + + if (bytesToCopy) + { + CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads); + } + + if (glob_id == 0) + { + global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr = (global GRL_RAYTRACING_GEOMETRY_DESC*)( + batchPtrs->nonVertexDataStart + dstDescOffset); + *dstDescPtr = geoDescToStore; + } +} + +/// Adds build operation to batch +/// +/// @param batchPtrs batch pointers struct +/// @param buildOpOffset offset in dst buffer +/// @param srcBvh address of src bvh (in case of update) +/// @param dstBvhAddr address of dest bvh buffer +/// @param offsetToEnd offset to end of this operation +/// @param flags build flags +/// @param numGeometries number of geometries in build +/// @param numInstances number of instances in build +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel write_input_build_op( + global InputBatchPtrs* batchPtrs, + qword buildOpOffset, + qword srcBvh, + qword dstBvhAddr, + dword offsetToEnd, + dword flags, + dword numGeometries, + dword numInstances, + dword instArrayOfPtrs) +{ + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + if (batchPtrs->dumpDst == 0 || 
glob_id != 0) return; + + global InputBuild* buildOp = (global InputBuild*)( + batchPtrs->nonVertexDataStart + buildOpOffset); + buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD; + buildOp->header.endOfData = offsetToEnd; + buildOp->dstBvhPtr = dstBvhAddr; + buildOp->srcBvhPtr = srcBvh; + buildOp->flags = flags; + buildOp->numGeos = numGeometries; + buildOp->numInstances = numInstances; + buildOp->instArrayOfPtrs = instArrayOfPtrs; +} + +/// Copies instance description +/// +/// @param batchPtrs batch pointers struct +/// @param instanceDescArr inst desc source +/// @param offset ptr to offset in dst buffer +/// @param numInstances number of instances to copy +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +copy_instance_descriptors_array( + global InputBatchPtrs* batchPtrs, + global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr, + qword offset, + dword numInstances) +{ + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + if (batchPtrs->dumpDst == 0) return; + + global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC* )( + batchPtrs->nonVertexDataStart + offset); + + if (glob_id < numInstances) + { + dst[glob_id] = instanceDescArr[glob_id]; + } +} + +/// Copies instance description, array of pointers version +/// +/// @param batchPtrs batch pointers struct +/// @param pInstanceDescPtrsArr inst desc source +/// @param offset ptr to offset in dst buffer +/// @param numInstances number of instances to copy +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +copy_instance_descriptors_array_of_ptrs( + global InputBatchPtrs* batchPtrs, + global qword* pInstanceDescPtrsArr, + qword offset, + dword numInstances) +{ + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + if (batchPtrs->dumpDst == 0) return; + + // save gpuva of instance descs for debug + global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset); + + global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)( + batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset); + global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr = (global GRL_RAYTRACING_INSTANCE_DESC **)pInstanceDescPtrsArr; + + if (glob_id < numInstances) + { + gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id]; + dst[glob_id] = *(instanceDescPtrsArr[glob_id]); + } +} + +/// Adds copy operation to batch +/// +/// @param batchPtrs batch pointers struct +/// @param offset ptr to offset in dst buffer +/// @param src copy source pointer +/// @param dst copy destination pointer +/// @param copyOpType copy type +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel insert_copy_op( + global InputBatchPtrs* batchPtrs, + qword offset, + global void* src, + global void* dst, + uint copyOpType) +{ + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + if (batchPtrs->dumpDst == 0 || glob_id != 0) return; + + global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset); + + copyOp->header.operationType = copyOpType; + copyOp->header.endOfData = AlignTo128(sizeof(InputCopy)); + copyOp->srcBvhPtr = (qword)src; + copyOp->dstBvhPtr = (qword)dst; +} + +/// Copies vertex 
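copy_instance_descriptors_array_of_ptrs() above dumps the array-of-pointers input in two parts: a 128-byte-aligned table of the original instance-descriptor GPU addresses, immediately followed by the dereferenced descriptors themselves. A small layout sketch (illustrative only, not part of this patch; the out_* pointers are hypothetical):

GRL_INLINE void instance_dump_layout_example( qword nonVertexDataStart,
                                              qword offset,
                                              dword numInstances,
                                              qword* out_gpuvaTable,
                                              qword* out_inlinedDescs )
{
    // where the saved GPU addresses of the instance descriptors begin
    *out_gpuvaTable = nonVertexDataStart + offset;
    // the copied GRL_RAYTRACING_INSTANCE_DESC structs follow the aligned table
    *out_inlinedDescs = nonVertexDataStart + offset
                      + AlignTo128( numInstances * sizeof(qword) );
}
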
buffer +/// +/// @param batchPtrs batch pointers struct +/// @param src input buffer +/// @param offset ptr to offset in dst buffer +/// @param size ptr to number of bytes to copy +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_vertex_data( + global InputBatchPtrs* batchPtrs, + global const char* src, + global const uint* offset, + global const uint* size) +{ + if (batchPtrs->dumpDst == 0) return; + + global char *dst = (global char *)(GetVertexBuffersStart(batchPtrs) + *offset); + uint numGroups = (*size >> 6) + 1; + CopyMemory(dst, src, *size, numGroups); +} + +/// Generate unique batch id +/// +/// @param batchIds array of unique batch ids +/// @param index index of batch id to generate +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel generate_unique_batch_id(global unsigned long *batchIds, unsigned int index) { + global unsigned int *counterPtrs = (global unsigned int *)batchIds; + atomic_add(&counterPtrs[index * 2 + 1], 1); + batchIds[index] |= (unsigned long)index; +} + +/// Sets batch as ready to read and moves cpuHead forward, inputs case +/// +/// @param batchPtrs batch pointers struct +/// @param dumpMainBuffer pointer to main dump buffer +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel finish_batch_dump_inputs( + global InputBatchPtrs* batchPtrs, + global DebugBufferHeader* dumpMainBuffer) +{ + if (batchPtrs->dumpDst == 0) + return; + + global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst; + + dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer); + + dword seven = 7; + while (true) + { + dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0); + if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop + { + store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven); + currentHead = seven; + } + + if (currentHead == myDstOffset) + { + mem_fence_evict_to_memory(); + dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData; + break; + } + else if (myDstOffset == dumpMainBuffer->headStart) + { + global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead); + if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER) + { + mem_fence_evict_to_memory(); + dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData; + break; + } + } + } +} + +/// Sets batch as ready to read and moves cpuHead forward, outputs case +/// +/// @param batchPtrs batch pointers struct +/// @param dumpMainBuffer pointer to main dump buffer +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel finish_batch_dump_outputs( + global OutputBatchPtrs* batchPtrs, + global DebugBufferHeader* dumpMainBuffer) +{ + if (batchPtrs->dumpDst == 0) + return; + + global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst; + + dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer); + + dword seven = 7; + while (true) + { + dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0); + if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop + { + store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven); + currentHead = seven; + } + + if (currentHead == myDstOffset) + { + mem_fence_evict_to_memory(); + dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData; + 
break; + } + else if (myDstOffset == dumpMainBuffer->headStart) + { + global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead); + if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER) + { + mem_fence_evict_to_memory(); + dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData; + break; + } + } + } +} diff --git a/src/intel/vulkan/grl/gpu/input_dump.grl b/src/intel/vulkan/grl/gpu/input_dump.grl new file mode 100644 index 00000000000..7cc6e60a95d --- /dev/null +++ b/src/intel/vulkan/grl/gpu/input_dump.grl @@ -0,0 +1,252 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module input_dump; + +kernel_module input_dumper("input_dump.cl") +{ + links lsc_intrinsics; + + kernel opencl_kernel_find_max_used_byte_in_buff < kernelFunction="find_max_used_byte_in_buff" >; + kernel opencl_kernel_allocate_linear_offsets_for_vertex_buffers < kernelFunction="allocate_linear_offsets_for_vertex_buffers" >; + kernel opencl_kernel_allocate_data_space_for_inputs < kernelFunction="allocate_data_space_for_inputs" >; + kernel opencl_kernel_allocate_data_space_for_outputs < kernelFunction="allocate_data_space_for_outputs" >; + kernel opencl_kernel_calc_outputs_data_size < kernelFunction="calc_outputs_data_size" >; + kernel opencl_kernel_write_output_data_op < kernelFunction="write_output_data_op" >; + kernel opencl_kernel_write_geo_data < kernelFunction="write_geo_data" >; + kernel opencl_kernel_write_input_build_op < kernelFunction="write_input_build_op" >; + kernel opencl_kernel_copy_instance_descriptors_array < kernelFunction="copy_instance_descriptors_array" >; + kernel opencl_kernel_copy_instance_descriptors_array_of_ptrs < kernelFunction="copy_instance_descriptors_array_of_ptrs" >; + kernel opencl_kernel_insert_copy_op < kernelFunction="insert_copy_op" >; + kernel opencl_kernel_copy_vertex_data < kernelFunction="copy_vertex_data" >; + kernel opencl_kernel_generate_unique_batch_id < kernelFunction="generate_unique_batch_id" >; + kernel opencl_kernel_finish_batch_dump_inputs < kernelFunction="finish_batch_dump_inputs" >; + kernel opencl_kernel_finish_batch_dump_outputs < kernelFunction="finish_batch_dump_outputs" >; +} + + +metakernel find_max_used_byte_in_buff( + qword indexBuffPtr, + qword vertexBufferUsedByteEnd, + dword IndexCount, + dword IndexFormat, + dword VertexCount, + qword VertexBufferByteStride, + dword numPhysThreads) +{ + dispatch opencl_kernel_find_max_used_byte_in_buff(numPhysThreads, 1, 1) args( + indexBuffPtr, + vertexBufferUsedByteEnd, + IndexCount, + IndexFormat, + VertexCount, + VertexBufferByteStride); +} + +metakernel allocate_linear_offsets_for_vertex_buffers( + qword batchPtrs, + qword m_VertexBufferUsedByteEnd, + qword m_VertexBufferOffset, + dword numVertexBuffers, + dword numPhysThreads) +{ + dispatch opencl_kernel_allocate_linear_offsets_for_vertex_buffers(numPhysThreads, 1, 1) args( + batchPtrs, + m_VertexBufferUsedByteEnd, + m_VertexBufferOffset, + numVertexBuffers); +} + +metakernel allocate_data_space_for_inputs( + qword inputDumpMainBuffer, + qword batchPtrs, + dword nonVertexSize, + qword batchIdPtr) +{ + dispatch opencl_kernel_allocate_data_space_for_inputs(1, 1, 1) args( + inputDumpMainBuffer, + batchPtrs, + nonVertexSize, + batchIdPtr); +} + +metakernel allocate_data_space_for_outputs( + qword inputDumpMainBuffer, + qword batchPtrs, + qword batchIdPtr) +{ + dispatch opencl_kernel_allocate_data_space_for_outputs(1, 1, 1) args( 
+ inputDumpMainBuffer, + batchPtrs, + batchIdPtr); +} + +metakernel calc_outputs_data_size( + qword pbi, + qword destOffsets, + qword numOutputs, + qword batchPtrs) +{ + dispatch opencl_kernel_calc_outputs_data_size(1, 1, 1) args( + pbi, + destOffsets, + numOutputs, + batchPtrs); +} + +metakernel write_output_data_op( + qword batchPtrs, + qword destOffset, + qword src, + qword pbi) +{ + dispatch opencl_kernel_write_output_data_op(1, 1, 1) args( + batchPtrs, + destOffset, + src, + pbi); +} + +metakernel write_geo_data( + qword batchPtrs, + qword srcDesc, + qword pVertexBufferOffsetInLinearisedUniqueVertexBuffers, + qword pVertexBufferSize, + qword dstDescOffset, + qword dstDataOffset, + dword numThreads) +{ + dispatch opencl_kernel_write_geo_data(numThreads, 1, 1) args( + batchPtrs, + srcDesc, + pVertexBufferOffsetInLinearisedUniqueVertexBuffers, + pVertexBufferSize, + dstDescOffset, + dstDataOffset, + numThreads); +} + +metakernel write_input_build_op( + qword batchPtrs, + qword buildOpOffset, + qword srcBvh, + qword dstBvhAddr, + dword offsetToEnd, + dword flags, + dword numGeometries, + dword numInstances, + dword instArrayOfPtrs) + +{ + dispatch opencl_kernel_write_input_build_op(1, 1, 1) args( + batchPtrs, + buildOpOffset, + srcBvh, + dstBvhAddr, + offsetToEnd, + flags, + numGeometries, + numInstances, + instArrayOfPtrs); +} + +metakernel copy_instance_descriptors_array( + qword batchPtrs, + qword instanceDescArr, + qword offset, + dword numInstances, + dword numPhysThreads) +{ + dispatch opencl_kernel_copy_instance_descriptors_array(numPhysThreads, 1, 1) args( + batchPtrs, + instanceDescArr, + offset, + numInstances); +} + +metakernel copy_instance_descriptors_array_of_ptrs( + qword batchPtrs, + qword instanceDescArrPtrs, + qword offset, + dword numInstances, + dword numPhysThreads) +{ + dispatch opencl_kernel_copy_instance_descriptors_array_of_ptrs(numPhysThreads, 1, 1) args( + batchPtrs, + instanceDescArrPtrs, + offset, + numInstances); +} + +metakernel insert_copy_op( + qword batchPtrs, + qword offset, + qword src, + qword dst, + dword type) +{ + dispatch opencl_kernel_insert_copy_op(1, 1, 1) args( + batchPtrs, + offset, + src, + dst, + type); +} + +metakernel copy_vertex_data( + qword desc, + qword src, + qword offset, + qword size) +{ + define byteSize REG0; + define numGroupsRqd REG1; + define shift REG2; + define minimum REG3; + + shift = 6; + minimum = 1; + byteSize = load_dword(size); + numGroupsRqd = byteSize >> shift; + numGroupsRqd = numGroupsRqd + minimum; + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_vertex_data args( + desc, + src, + offset, + size); +} + +metakernel generate_unique_batch_id( + qword batchIds, + dword batchIndex) +{ + dispatch opencl_kernel_generate_unique_batch_id(1, 1, 1) args( + batchIds, + batchIndex); +} + +metakernel finish_batch_dump_inputs( + qword batchPtrs, + qword dumpMainBuffer) +{ + dispatch opencl_kernel_finish_batch_dump_inputs(1, 1, 1) args( + batchPtrs, + dumpMainBuffer); +} + +metakernel finish_batch_dump_outputs( + qword batchPtrs, + qword dumpMainBuffer) +{ + dispatch opencl_kernel_finish_batch_dump_outputs(1, 1, 1) args( + batchPtrs, + dumpMainBuffer); +} diff --git a/src/intel/vulkan/grl/gpu/instance.h b/src/intel/vulkan/grl/gpu/instance.h new file mode 100644 index 00000000000..e463a01dc90 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/instance.h @@ -0,0 +1,183 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + 
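+// Helpers for encoding a hardware instance leaf (HwInstanceLeaf) from a
+// GRL_RAYTRACING_INSTANCE_DESC. Part0 packs the fields consumed during traversal
+// (hit-group/shader index, geometry mask, instance flags, root node pointer,
+// world-to-object rotation), while part1 packs the BLAS pointer, the instance
+// ID/index and the object-to-world rotation.
+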
+#pragma once + +#include "shared.h" +#include "affinespace.h" +#include "api_interface.h" +#include "qbvh6.h" +#include "libs/lsc_intrinsics.h" + +GRL_INLINE uint32_t HwInstanceLeafPart1_getInstanceIndex(struct HwInstanceLeaf *I) +{ + return I->part1.instanceIndex; +} + +GRL_INLINE void encodeDW0_HwInstanceLeafPart0( + uint32_t shaderIndex, + uint32_t geomMask, + uint4 *dst) +{ + (*dst).x = (shaderIndex & ((1 << 24) - 1)) | + (geomMask << 24); +} + +GRL_INLINE void encodeDW1_HwInstanceLeafPart0( + uint32_t instanceContributionToHitGroupIndex, + uint32_t notProcedural, + uint32_t geomFlags, + uint4* dst) +{ + (*dst).y = (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) | + ((notProcedural & 1) << (24 + 5)) | + ((geomFlags & 3) << (24 + 5 + 1)); +} + +GRL_INLINE void encodeDW2DW3_HwInstanceLeafPart0( + uint64_t rootNodePtr, + uint32_t instFlags, + uint4* dst) +{ + uint64_t flags = instFlags; + uint DW2 = (uint)rootNodePtr; + uint DW3 = ((uint)(rootNodePtr >> 32ul) & 0xffff); + DW3 |= flags << 16ull; + (*dst).z = DW2; + (*dst).w = DW3; +} + +GRL_INLINE void HwInstanceLeafPart0_setDW0(struct HwInstanceLeaf *I, + uint32_t shaderIndex, + uint32_t geomMask) +{ + I->part0.DW0 = + (shaderIndex & ((1 << 24) - 1)) | + (geomMask << 24); +} + +GRL_INLINE void HwInstanceLeafPart0_setDW1(struct HwInstanceLeaf *I, + uint32_t instanceContributionToHitGroupIndex, + uint32_t notProcedural, + uint32_t geomFlags) +{ + I->part0.DW1 = + (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) | + ((notProcedural & 1) << (24 + 5)) | + ((geomFlags & 3) << (24 + 5 + 1)); +} + +GRL_INLINE void HwInstanceLeafPart1_setDW0DW1(struct HwInstanceLeaf *I, + global char *pBvhPtr) +{ + I->part1.DW0_DW1 = ((uint64_t)pBvhPtr) & (((uint64_t)1 << 48) - 1); +} + +GRL_INLINE void HwInstanceLeafPart0_setDW2DW3(struct HwInstanceLeaf *I, + uint64_t rootNodePtr, + uint32_t instFlags) +{ + uint64_t flags = instFlags; + flags = flags << 48ull; + uint64_t ptr = rootNodePtr & 0x0000ffffffffffff; + I->part0.DW2_DW3 = ptr + flags; +} + +GRL_INLINE void HwInstanceLeaf_Constructor(global struct HwInstanceLeaf* leaf, + global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc, + uint instanceIndex, + uint rootNodeByteOffset, + uint instanceMask) +{ + global uint4* InstanceLeaf_4DWparts = (global uint4*) (leaf); + + struct AffineSpace3f obj2world = AffineSpace3f_load_row_major(instDesc->Transform); + + qword accStructPtr = (qword)instDesc->AccelerationStructure; + uint4 p1_DW0_3 = (uint4)( + (uint)accStructPtr, + (uint)(accStructPtr >> (uint64_t)32), + GRL_get_instanceID(instDesc), + instanceIndex); + + struct AffineSpace3f world2obj = AffineSpace3f_invert(obj2world); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 0 /*part1 + 0DW*/, p1_DW0_3); + + uint4 p1_DW4_7 = (uint4)( + as_uint(obj2world.l.vx.x), + as_uint(obj2world.l.vx.y), + as_uint(obj2world.l.vx.z), + as_uint(obj2world.l.vy.x)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 1 /*part1 + 4DW*/, p1_DW4_7); + + uint4 p1_DW8_11 = (uint4)( + as_uint(obj2world.l.vy.y), + as_uint(obj2world.l.vy.z), + as_uint(obj2world.l.vz.x), + as_uint(obj2world.l.vz.y)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 2 /*part1 + 8DW*/, p1_DW8_11); + + + uint4 p1_DW12_15 = (uint4)( + as_uint(obj2world.l.vz.z), + as_uint(world2obj.p.x), + as_uint(world2obj.p.y), + as_uint(world2obj.p.z)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 3 /*part1 + 12DW*/, p1_DW12_15); + + + uint hit_group_index = GRL_get_InstanceContributionToHitGroupIndex(instDesc); + global struct BVHBase* bvh = 
(global struct BVHBase*)instDesc->AccelerationStructure; + + uint4 p0_DW0_3; + + encodeDW0_HwInstanceLeafPart0( + hit_group_index, + instanceMask, + &p0_DW0_3); + + encodeDW1_HwInstanceLeafPart0( + hit_group_index, // for HW instance leaf, this field is used to offset the hit-group index + 1, // disable opaque culling.. Necessary for SW instancing.. don't-care for HW instancing + 0, + &p0_DW0_3); + + encodeDW2DW3_HwInstanceLeafPart0( + rootNodeByteOffset == NO_NODE_OFFSET ? 0 : ((uint64_t)bvh) + rootNodeByteOffset, // offset NO_NODE_OFFSET is for degenerated instance, put null as root pointer + GRL_get_InstanceFlags(instDesc), + &p0_DW0_3); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 0 /*part0 + 0DW*/, p0_DW0_3); + + uint4 p0_DW4_7 = (uint4)( + as_uint(world2obj.l.vx.x), + as_uint(world2obj.l.vx.y), + as_uint(world2obj.l.vx.z), + as_uint(world2obj.l.vy.x)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 1 /*part0 + 4DW*/, p0_DW4_7); + + uint4 p0_DW8_11 = (uint4)( + as_uint(world2obj.l.vy.y), + as_uint(world2obj.l.vy.z), + as_uint(world2obj.l.vz.x), + as_uint(world2obj.l.vz.y)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 2 /*part0 + 8DW*/, p0_DW8_11); + + uint4 p0_DW12_15 = (uint4)( + as_uint(world2obj.l.vz.z), + as_uint(obj2world.p.x), + as_uint(obj2world.p.y), + as_uint(obj2world.p.z)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 3 /*part0 + 12DW*/, p0_DW12_15); +} diff --git a/src/intel/vulkan/grl/gpu/intrinsics.h b/src/intel/vulkan/grl/gpu/intrinsics.h new file mode 100644 index 00000000000..0dff3147d8a --- /dev/null +++ b/src/intel/vulkan/grl/gpu/intrinsics.h @@ -0,0 +1,581 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +// TODO: AABB_work_group_reduce is super slow, remove !!! + +#pragma cl_intel_subgroups : enable +#pragma cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + + +uint intel_sub_group_ballot(bool valid); + +// atom_min +float __attribute__((overloadable)) atom_min(volatile __global float *p, float val); +float __attribute__((overloadable)) atom_min(volatile __local float *p, float val); +float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val); +float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val); +// atom_max +float __attribute__((overloadable)) atom_max(volatile __global float *p, float val); +float __attribute__((overloadable)) atom_max(volatile __local float *p, float val); +float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val); +float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val); +// atom_cmpxchg +float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val); +float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val); +float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val); +float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val); + + + +inline uint subgroup_single_atomic_add(global uint *p, uint val) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const int v = subgroupLocalID == 0 ? 
atomic_add(p, val) : 0; + return sub_group_broadcast(v, 0); +} + +inline float halfarea(const float3 d) +{ + return fma(d.x, (d.y + d.z), d.y * d.z); +} + +inline float area(const float3 d) +{ + return halfarea(d) * 2.0f; +} + +inline uint maxDim(const float3 a) +{ + const float3 b = fabs(a); + const bool b_x_y = b.x > b.y; + const float cur_max = b_x_y ? b.x : b.y; + const uint cur_idx = b_x_y ? 0 : 1; + const bool b_x_y_z = b.z > cur_max; + return b_x_y_z ? 2 : cur_idx; +} + +inline uint3 sortByMaxDim(const float3 a) +{ + const uint kz = maxDim(a); + const uint _kx = (kz + 1) % 3; + const uint _ky = (_kx + 1) % 3; + const bool kz_pos = a[kz] >= 0.0f; + const uint kx = kz_pos ? _ky : _kx; + const uint ky = kz_pos ? _kx : _ky; + return (uint3)(kx, ky, kz); +} + +inline uint4 sort4_ascending(const uint4 dist) +{ + const uint a0 = dist.s0; + const uint a1 = dist.s1; + const uint a2 = dist.s2; + const uint a3 = dist.s3; + const uint b0 = min(a0, a2); + const uint b1 = min(a1, a3); + const uint b2 = max(a0, a2); + const uint b3 = max(a1, a3); + const uint c0 = min(b0, b1); + const uint c1 = max(b0, b1); + const uint c2 = min(b2, b3); + const uint c3 = max(b2, b3); + const uint d0 = c0; + const uint d1 = min(c1, c2); + const uint d2 = max(c1, c2); + const uint d3 = c3; + return (uint4)(d0, d1, d2, d3); +} + +__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6}; +__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4}; +__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6}; +__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0}; +__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5}; +__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6}; +__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6}; + +__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1}; +__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1}; +__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1}; + +__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1}; + +inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask) +{ + const uint a1 = intel_sub_group_shuffle(a0, shuffleMask); + const uint a_min = min(a0, a1); + const uint a_max = max(a0, a1); + return select(a_max, a_min, selectMask); +} + +inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask) +{ + const uint a1 = intel_sub_group_shuffle(a0, shuffleMask); + const uint a_min = min(a0, a1); + const uint a_max = max(a0, a1); + return select(a_min, a_max, selectMask); +} + +inline uint sort8_descending(const uint aa) +{ + const unsigned int slotID = get_sub_group_local_id() % 8; + const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]); + const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]); + const uint dd = compare_exchange_descending(cc, shuffleC[slotID], selAA[slotID]); + const uint ee = compare_exchange_descending(dd, shuffleD[slotID], selF0[slotID]); + const uint ff = compare_exchange_descending(ee, shuffleE[slotID], selCC[slotID]); + const uint gg = compare_exchange_descending(ff, shuffleF[slotID], selAA[slotID]); + return gg; +} + +inline uint sort8_ascending(const uint aa) +{ + const unsigned int slotID = get_sub_group_local_id() % 8; + const uint bb = compare_exchange_ascending(aa, shuffleA[slotID], selAA[slotID]); + const uint cc = compare_exchange_ascending(bb, shuffleB[slotID], selCC[slotID]); + const uint dd = compare_exchange_ascending(cc, shuffleC[slotID], 
selAA[slotID]); + const uint ee = compare_exchange_ascending(dd, shuffleD[slotID], selF0[slotID]); + const uint ff = compare_exchange_ascending(ee, shuffleE[slotID], selCC[slotID]); + const uint gg = compare_exchange_ascending(ff, shuffleF[slotID], selAA[slotID]); + return gg; +} + +inline uint sort4_descending(const uint aa) +{ + const unsigned int slotID = get_sub_group_local_id() % 8; + const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]); + const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]); + const uint dd = compare_exchange_descending(cc, shuffleG[slotID], selGG[slotID]); + return dd; +} + +inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask) +{ + const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask); + const ulong a_min = min(a0, a1); + const ulong a_max = max(a0, a1); + return select(a_max, a_min, (ulong)selectMask); +} + +inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask) +{ + const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask); + const ulong a_min = min(a0, a1); + const ulong a_max = max(a0, a1); + return select(a_min, a_max, (ulong)selectMask); +} + +inline ulong sort8_ascending_ulong(const ulong aa) +{ + const unsigned int slotID = get_sub_group_local_id() % 8; + const ulong bb = compare_exchange_ascending_ulong(aa, shuffleA[slotID], selAA[slotID]); + const ulong cc = compare_exchange_ascending_ulong(bb, shuffleB[slotID], selCC[slotID]); + const ulong dd = compare_exchange_ascending_ulong(cc, shuffleC[slotID], selAA[slotID]); + const ulong ee = compare_exchange_ascending_ulong(dd, shuffleD[slotID], selF0[slotID]); + const ulong ff = compare_exchange_ascending_ulong(ee, shuffleE[slotID], selCC[slotID]); + const ulong gg = compare_exchange_ascending_ulong(ff, shuffleF[slotID], selAA[slotID]); + return gg; +} + +inline uint bitInterleave3D(const uint4 in) +{ + uint x = in.x, y = in.y, z = in.z; + x = (x | (x << 16)) & 0x030000FF; + x = (x | (x << 8)) & 0x0300F00F; + x = (x | (x << 4)) & 0x030C30C3; + x = (x | (x << 2)) & 0x09249249; + + y = (y | (y << 16)) & 0x030000FF; + y = (y | (y << 8)) & 0x0300F00F; + y = (y | (y << 4)) & 0x030C30C3; + y = (y | (y << 2)) & 0x09249249; + + z = (z | (z << 16)) & 0x030000FF; + z = (z | (z << 8)) & 0x0300F00F; + z = (z | (z << 4)) & 0x030C30C3; + z = (z | (z << 2)) & 0x09249249; + + return x | (y << 1) | (z << 2); +} + +inline uint bitInterleave4D(const uint4 in) +{ + uint x = in.x, y = in.y, z = in.z, w = in.w; + + x = x & 0x000000ff; + x = (x ^ (x << 16)) & 0x00c0003f; + x = (x ^ (x << 8)) & 0x00c03807; + x = (x ^ (x << 4)) & 0x08530853; + x = (x ^ (x << 2)) & 0x09090909; + x = (x ^ (x << 1)) & 0x11111111; + + y = y & 0x000000ff; + y = (y ^ (y << 16)) & 0x00c0003f; + y = (y ^ (y << 8)) & 0x00c03807; + y = (y ^ (y << 4)) & 0x08530853; + y = (y ^ (y << 2)) & 0x09090909; + y = (y ^ (y << 1)) & 0x11111111; + + z = z & 0x000000ff; + z = (z ^ (z << 16)) & 0x00c0003f; + z = (z ^ (z << 8)) & 0x00c03807; + z = (z ^ (z << 4)) & 0x08530853; + z = (z ^ (z << 2)) & 0x09090909; + z = (z ^ (z << 1)) & 0x11111111; + + w = w & 0x000000ff; + w = (w ^ (w << 16)) & 0x00c0003f; + w = (w ^ (w << 8)) & 0x00c03807; + w = (w ^ (w << 4)) & 0x08530853; + w = (w ^ (w << 2)) & 0x09090909; + w = (w ^ (w << 1)) & 0x11111111; + + return (x | (y << 1) | (z << 2) | (w << 3)); +} + +inline ulong ulong_bitInterleave4D(const uint4 in) +{ + ulong x = in.x, y = in.y, z = in.z, w = in.w; + + x = x & 
0x0000ffff; + x = (x ^ (x << 32)) & 0x0000f800000007ff; + x = (x ^ (x << 16)) & 0x0000f80007c0003f; + x = (x ^ (x << 8)) & 0x00c0380700c03807; + x = (x ^ (x << 4)) & 0x0843084308430843; + x = (x ^ (x << 2)) & 0x0909090909090909; + x = (x ^ (x << 1)) & 0x1111111111111111; + + y = y & 0x0000ffff; + y = (y ^ (y << 32)) & 0x0000f800000007ff; + y = (y ^ (y << 16)) & 0x0000f80007c0003f; + y = (y ^ (y << 8)) & 0x00c0380700c03807; + y = (y ^ (y << 4)) & 0x0843084308430843; + y = (y ^ (y << 2)) & 0x0909090909090909; + y = (y ^ (y << 1)) & 0x1111111111111111; + + z = z & 0x0000ffff; + z = (z ^ (z << 32)) & 0x0000f800000007ff; + z = (z ^ (z << 16)) & 0x0000f80007c0003f; + z = (z ^ (z << 8)) & 0x00c0380700c03807; + z = (z ^ (z << 4)) & 0x0843084308430843; + z = (z ^ (z << 2)) & 0x0909090909090909; + z = (z ^ (z << 1)) & 0x1111111111111111; + + w = w & 0x0000ffff; + w = (w ^ (w << 32)) & 0x0000f800000007ff; + w = (w ^ (w << 16)) & 0x0000f80007c0003f; + w = (w ^ (w << 8)) & 0x00c0380700c03807; + w = (w ^ (w << 4)) & 0x0843084308430843; + w = (w ^ (w << 2)) & 0x0909090909090909; + w = (w ^ (w << 1)) & 0x1111111111111111; + + return (x | (y << 1) | (z << 2) | (w << 3)); +} + +inline uint bitCompact(uint x) +{ + x &= 0x09249249; + x = (x ^ (x >> 2)) & 0x030c30c3; + x = (x ^ (x >> 4)) & 0x0300f00f; + x = (x ^ (x >> 8)) & 0xff0000ff; + x = (x ^ (x >> 16)) & 0x000003ff; + return x; +} + +// inverse of bitInterleave3D: recovers the three 10-bit coordinates +inline uint3 bitCompact3D(const uint in) +{ + const uint x = bitCompact(in >> 0); + const uint y = bitCompact(in >> 1); + const uint z = bitCompact(in >> 2); + return (uint3)(x, y, z); +} + +inline uint convertToPushIndices8(uint ID) +{ + const unsigned int slotID = get_sub_group_local_id(); + uint index = 0; + for (uint i = 0; i < 8; i++) + { + const uint mask = intel_sub_group_ballot(ID == i); + const uint new_index = ctz(mask); + index = i == slotID ? new_index : index; + } + return index; +} + +inline uint convertToPushIndices16(uint ID) +{ + const unsigned int slotID = get_sub_group_local_id(); + uint index = 0; + for (uint i = 0; i < 16; i++) + { + const uint mask = intel_sub_group_ballot(ID == i); + const uint new_index = ctz(mask); + index = i == slotID ?
new_index : index; + } + return index; +} + +#define FLOAT_EXPONENT_MASK (0x7F800000) // used to be EXPONENT_MASK +#define FLOAT_MANTISSA_MASK (0x007FFFFF) // used to be MANTISSA_MASK +#define FLOAT_NEG_ONE_EXP_MASK (0x3F000000) +#define FLOAT_BIAS (127) +#define FLOAT_MANTISSA_BITS (23) + +inline float3 frexp_vec3(float3 len, int3* exp) +{ + float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK)); + mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f))); + mant = copysign(mant, len); + *exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1)); + return mant; +} + + +#ifndef uniform +#define uniform +#endif + +#ifndef varying +#define varying +#endif + +uint get_sub_group_global_id() +{ + return get_sub_group_id() + get_num_sub_groups() * get_group_id( 0 ); +} + +// each lane contains the number of 1 bits below the corresponding position in 'mask' +uint subgroup_bit_prefix_exclusive(uniform uint mask) +{ + varying ushort lane = get_sub_group_local_id(); + varying uint lane_mask = (1 << lane) - 1; + varying uint m = mask & lane_mask; + return popcount(m); +} + +uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx ) +{ + varying uint lane_mask = (1 << lane_idx) - 1; + varying uint m = mask & lane_mask; + return popcount(m); +} + + +uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx) +{ + return (uint3)(sub_group_broadcast(v.x,idx), + sub_group_broadcast(v.y,idx), + sub_group_broadcast(v.z,idx)); +} + +float3 sub_group_broadcast_float3(float3 v, uniform ushort idx) +{ + return (float3)(sub_group_broadcast(v.x, idx), + sub_group_broadcast(v.y, idx), + sub_group_broadcast(v.z, idx)); +} + +float3 sub_group_reduce_min_float3(float3 v) +{ + return (float3)(sub_group_reduce_min(v.x), + sub_group_reduce_min(v.y), + sub_group_reduce_min(v.z) ); +} +float3 sub_group_reduce_max_float3(float3 v) +{ + return (float3)(sub_group_reduce_max(v.x), + sub_group_reduce_max(v.y), + sub_group_reduce_max(v.z)); +} + +float3 sub_group_shuffle_float3(float3 v, uniform ushort idx) +{ + return (float3)(intel_sub_group_shuffle(v.x, idx), + intel_sub_group_shuffle(v.y, idx), + intel_sub_group_shuffle(v.z, idx)); +} +uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx) +{ + return (uint3)( intel_sub_group_shuffle(v.x, idx), + intel_sub_group_shuffle(v.y, idx), + intel_sub_group_shuffle(v.z, idx)); +} + + +inline uchar sub_group_reduce_or_N6(uchar val) +{ + val = val | intel_sub_group_shuffle_down(val, val, 4); + val = val | intel_sub_group_shuffle_down(val, val, 2); + val = val | intel_sub_group_shuffle_down(val, val, 1); + return sub_group_broadcast(val, 0); +} + +inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val) +{ + uint SIMD8_id = get_sub_group_local_id() / 8; + val = val | intel_sub_group_shuffle_down(val, val, 4); + val = val | intel_sub_group_shuffle_down(val, val, 2); + val = val | intel_sub_group_shuffle_down(val, val, 1); + + return intel_sub_group_shuffle(val, SIMD8_id * 8); +} + + +inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p ) +{ + return atomic_fetch_add_explicit( (volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group ); +} + +inline __attribute__((overloadable)) int atomic_inc_local(local int* p) +{ + return atomic_fetch_add_explicit( (volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group); +} + +inline __attribute__((overloadable)) uint 
atomic_dec_local(local uint* p) +{ + return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group); +} + +inline __attribute__((overloadable)) int atomic_dec_local(local int* p) +{ + return atomic_fetch_sub_explicit((volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group); +} + +inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n) +{ + return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n ) +{ + return atomic_fetch_sub_explicit( (volatile local atomic_int*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_add_local( local uint* p, uint n ) +{ + return atomic_fetch_add_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_xor_local(local uint* p, uint n) +{ + return atomic_fetch_xor_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_or_local(local uint* p, uint n) +{ + return atomic_fetch_or_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_min_local(local uint* p, uint n) +{ + return atomic_fetch_min_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_max_local(local uint* p, uint n) +{ + return atomic_fetch_max_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + + + + +inline uint atomic_inc_global( global uint* p ) +{ + return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device); +} + +inline uint atomic_dec_global(global uint* p) +{ + return atomic_fetch_sub_explicit( (volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device); +} + +inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired) +{ + return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*) p, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device); +} + +inline uint atomic_add_global( global uint* p, uint n ) +{ + return atomic_fetch_add_explicit( (volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device); +} + +inline uint atomic_sub_global(global uint* p, uint n) +{ + return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device); +} + +inline uint atomic_or_global(global uint* p, uint n) +{ + return atomic_fetch_or_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device); +} + + +inline uint atomic_inc_global_acquire(global uint* p) +{ + return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_acquire, memory_scope_device); +} + + +inline uint atomic_inc_global_release(global uint* p) +{ + return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device); +} +inline uint atomic_dec_global_release(global uint* p) +{ + return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device); +} + +inline uint generic_atomic_add(uint* p, uint val) +{ + if (to_global(p) != NULL) + return atomic_add_global(to_global(p), val); + if 
(to_local(p) != NULL) + return atomic_add_local(to_local(p), val); + return 0; +} + +inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n ) +{ + n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) ); + n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) ); + n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) ); + return sub_group_broadcast( n, 0 ); +} + +inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n ) +{ + n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) ); + n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) ); + n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) ); + return sub_group_broadcast( n, 0 ); +} + +inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n) +{ + n = max(n, intel_sub_group_shuffle_down(n, n, 4)); + n = max(n, intel_sub_group_shuffle_down(n, n, 2)); + n = max(n, intel_sub_group_shuffle_down(n, n, 1)); + return intel_sub_group_shuffle(n, (get_sub_group_local_id() / 8) * 8);//sub_group_broadcast(n, 0); +} + +inline uint generic_atomic_inc(uint* p) +{ + if (to_global(p) != NULL) + return atomic_inc_global(to_global(p)); + if (to_local(p) != NULL) + return atomic_inc(to_local(p)); + return 0; +} + + +// Built-in GRL function which, if called in a kernel body, will force the kernel +// to be compiled to the minimum SIMD width supported by the platform +void GRL_UseMinimumSIMDWidth(); \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/libs/libraries.grl b/src/intel/vulkan/grl/gpu/libs/libraries.grl new file mode 100644 index 00000000000..1d6c0d2c6c5 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/libs/libraries.grl @@ -0,0 +1,13 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +library lsc_intrinsics +{ + default "lsc_intrinsics.cl" ; + fallback "lsc_intrinsics_fallback.cl"; +} + diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl new file mode 100644 index 00000000000..03a76ba36f1 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl @@ -0,0 +1,1033 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// LSC Cache options +// Load message caching control +enum LSC_LDCC { + LSC_LDCC_DEFAULT, + LSC_LDCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached + LSC_LDCC_L1UC_L3C, // Override to L1 uncached and L3 cached + LSC_LDCC_L1C_L3UC, // Override to L1 cached and L3 uncached + LSC_LDCC_L1C_L3C, // Override to L1 cached and L3 cached + LSC_LDCC_L1S_L3UC, // Override to L1 streaming load and L3 uncached + LSC_LDCC_L1S_L3C, // Override to L1 streaming load and L3 cached + LSC_LDCC_L1IAR_L3C, // Override to L1 invalidate-after-read, and L3 cached +}; + +// Store message caching control (also used for atomics) +enum LSC_STCC { + LSC_STCC_DEFAULT, + LSC_STCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached + LSC_STCC_L1UC_L3WB, // Override to L1 uncached and L3 written back + LSC_STCC_L1WT_L3UC, // Override to L1 written through and L3 uncached + LSC_STCC_L1WT_L3WB, // Override to L1 written through and L3 written back + LSC_STCC_L1S_L3UC, // Override to L1 streaming and L3 uncached + LSC_STCC_L1S_L3WB, // Override to L1 streaming and L3 written back + LSC_STCC_L1WB_L3WB, // Override to L1 written through and L3 written back +}; + +// LSC Loads + +// Global address space +uint __builtin_IB_lsc_load_global_uchar_to_uint (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32 
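+// (The D8U32/D16U32 forms load a byte or a word and zero-extend it into a 32-bit
+// value; the cacheOpt argument selects the L1/L3 behaviour from LSC_LDCC above.)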
+uint __builtin_IB_lsc_load_global_ushort_to_uint(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32 +uint __builtin_IB_lsc_load_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1 +uint2 __builtin_IB_lsc_load_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2 +uint3 __builtin_IB_lsc_load_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3 +uint4 __builtin_IB_lsc_load_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4 +uint8 __builtin_IB_lsc_load_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8 +ulong __builtin_IB_lsc_load_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1 +ulong2 __builtin_IB_lsc_load_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2 +ulong3 __builtin_IB_lsc_load_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3 +ulong4 __builtin_IB_lsc_load_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4 +ulong8 __builtin_IB_lsc_load_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8 + +// Local address space +uint __builtin_IB_lsc_load_local_uchar_to_uint( const __local uchar *base, int immElemOff); //D8U32 +uint __builtin_IB_lsc_load_local_ushort_to_uint(const __local ushort *base, int immElemOff); //D16U32 +uint __builtin_IB_lsc_load_local_uint (const __local uint *base, int immElemOff); //D32V1 +uint2 __builtin_IB_lsc_load_local_uint2 (const __local uint2 *base, int immElemOff); //D32V2 +uint3 __builtin_IB_lsc_load_local_uint3 (const __local uint3 *base, int immElemOff); //D32V3 +uint4 __builtin_IB_lsc_load_local_uint4 (const __local uint4 *base, int immElemOff); //D32V4 +uint8 __builtin_IB_lsc_load_local_uint8 (const __local uint8 *base, int immElemOff); //D32V8 +ulong __builtin_IB_lsc_load_local_ulong (const __local ulong *base, int immElemOff); //D64V1 +ulong2 __builtin_IB_lsc_load_local_ulong2(const __local ulong2 *base, int immElemOff); //D64V2 +ulong3 __builtin_IB_lsc_load_local_ulong3(const __local ulong3 *base, int immElemOff); //D64V3 +ulong4 __builtin_IB_lsc_load_local_ulong4(const __local ulong4 *base, int immElemOff); //D64V4 +ulong8 __builtin_IB_lsc_load_local_ulong8(const __local ulong8 *base, int immElemOff); //D64V8 + +// LSC Stores + +// Global address space +void __builtin_IB_lsc_store_global_uchar_from_uint (__global uchar *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D8U32 +void __builtin_IB_lsc_store_global_ushort_from_uint(__global ushort *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D16U32 +void __builtin_IB_lsc_store_global_uint (__global uint *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D32V1 +void __builtin_IB_lsc_store_global_uint2 (__global uint2 *base, int immElemOff, uint2 val, enum LSC_STCC cacheOpt); //D32V2 +void __builtin_IB_lsc_store_global_uint3 (__global uint3 *base, int immElemOff, uint3 val, enum LSC_STCC cacheOpt); //D32V3 +void __builtin_IB_lsc_store_global_uint4 (__global uint4 *base, int immElemOff, uint4 val, enum LSC_STCC cacheOpt); //D32V4 +void __builtin_IB_lsc_store_global_uint8 (__global uint8 *base, int immElemOff, uint8 val, enum LSC_STCC cacheOpt); //D32V8 +void __builtin_IB_lsc_store_global_ulong (__global ulong *base, int immElemOff, ulong val, enum LSC_STCC 
cacheOpt); //D64V1 +void __builtin_IB_lsc_store_global_ulong2(__global ulong2 *base, int immElemOff, ulong2 val, enum LSC_STCC cacheOpt); //D64V2 +void __builtin_IB_lsc_store_global_ulong3(__global ulong3 *base, int immElemOff, ulong3 val, enum LSC_STCC cacheOpt); //D64V3 +void __builtin_IB_lsc_store_global_ulong4(__global ulong4 *base, int immElemOff, ulong4 val, enum LSC_STCC cacheOpt); //D64V4 +void __builtin_IB_lsc_store_global_ulong8(__global ulong8 *base, int immElemOff, ulong8 val, enum LSC_STCC cacheOpt); //D64V8 + +// Local address space +void __builtin_IB_lsc_store_local_uchar_from_uint (__local uchar *base, int immElemOff, uint val); //D8U32 +void __builtin_IB_lsc_store_local_ushort_from_uint(__local ushort *base, int immElemOff, uint val); //D16U32 +void __builtin_IB_lsc_store_local_uint (__local uint *base, int immElemOff, uint val); //D32V1 +void __builtin_IB_lsc_store_local_uint2 (__local uint2 *base, int immElemOff, uint2 val); //D32V2 +void __builtin_IB_lsc_store_local_uint3 (__local uint3 *base, int immElemOff, uint3 val); //D32V3 +void __builtin_IB_lsc_store_local_uint4 (__local uint4 *base, int immElemOff, uint4 val); //D32V4 +void __builtin_IB_lsc_store_local_uint8 (__local uint8 *base, int immElemOff, uint8 val); //D32V8 +void __builtin_IB_lsc_store_local_ulong (__local ulong *base, int immElemOff, ulong val); //D64V1 +void __builtin_IB_lsc_store_local_ulong2(__local ulong2 *base, int immElemOff, ulong2 val); //D64V2 +void __builtin_IB_lsc_store_local_ulong3(__local ulong3 *base, int immElemOff, ulong3 val); //D64V3 +void __builtin_IB_lsc_store_local_ulong4(__local ulong4 *base, int immElemOff, ulong4 val); //D64V4 +void __builtin_IB_lsc_store_local_ulong8(__local ulong8 *base, int immElemOff, ulong8 val); //D64V8 + +// LSC prefetching + +// LSC Pre-Fetch Load functions with CacheControls +// Global address space +void __builtin_IB_lsc_prefetch_global_uchar (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32 +void __builtin_IB_lsc_prefetch_global_ushort(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32 +void __builtin_IB_lsc_prefetch_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1 +void __builtin_IB_lsc_prefetch_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2 +void __builtin_IB_lsc_prefetch_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3 +void __builtin_IB_lsc_prefetch_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4 +void __builtin_IB_lsc_prefetch_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8 +void __builtin_IB_lsc_prefetch_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1 +void __builtin_IB_lsc_prefetch_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2 +void __builtin_IB_lsc_prefetch_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3 +void __builtin_IB_lsc_prefetch_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4 +void __builtin_IB_lsc_prefetch_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8 + +// LSC Fence support + +// FS - Fence Scope +enum LSC_FS { + LSC_FS_THREAD_GROUP, + LSC_FS_LOCAL, + LSC_FS_TILE, + LSC_FS_GPU, + LSC_FS_GPUs, + LSC_FS_SYSTEM_RELEASE, + LSC_FS_SYSTEM_ACQUIRE +}; + +// FT 
- Fence Type +enum LSC_FT { + LSC_FT_DEFAULT, + LSC_FT_EVICT, + LSC_FT_INVALIDATE, + LSC_FT_DISCARD, + LSC_FT_CLEAN, + LSC_FT_L3 +}; + +// LSC Fence functions +void __builtin_IB_lsc_fence_global_untyped(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGM +void __builtin_IB_lsc_fence_global_untyped_cross_tile(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGML +void __builtin_IB_lsc_fence_global_typed(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - TGM +void __builtin_IB_lsc_fence_local(); // Mem Port - SLM + +// Exported functions + +// LSC Loads +// uchar +uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3C); +} + +uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3C); +} + +uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ushort +uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3C); +} + +uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3C); +} + +uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint +uint load_uint_L1UC_L3UC(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint load_uint_L1UC_L3C(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint load_uint_L1C_L3UC(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint load_uint_L1C_L3C(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3C); +} + +uint load_uint_L1S_L3UC(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint load_uint_L1S_L3C(global uint* it, int offset) 
+{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3C); +} + +uint load_uint_L1IAR_L3C(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint2 +uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint2 load_uint2_L1UC_L3C(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint2 load_uint2_L1C_L3UC(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint2 load_uint2_L1C_L3C(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3C); +} + +uint2 load_uint2_L1S_L3UC(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint2 load_uint2_L1S_L3C(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3C); +} + +uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint3 +uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint3 load_uint3_L1UC_L3C(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint3 load_uint3_L1C_L3UC(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint3 load_uint3_L1C_L3C(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3C); +} + +uint3 load_uint3_L1S_L3UC(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint3 load_uint3_L1S_L3C(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3C); +} + +uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint4 +uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint4 load_uint4_L1UC_L3C(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint4 load_uint4_L1C_L3UC(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint4 load_uint4_L1C_L3C(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3C); +} + +uint4 load_uint4_L1S_L3UC(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint4 load_uint4_L1S_L3C(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3C); +} + +uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint8 +uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint8 load_uint8_L1UC_L3C(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint8 load_uint8_L1C_L3UC(global uint8* it, int 
offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint8 load_uint8_L1C_L3C(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3C); +} + +uint8 load_uint8_L1S_L3UC(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint8 load_uint8_L1S_L3C(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3C); +} + +uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong +ulong load_ulong_L1UC_L3UC(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong load_ulong_L1UC_L3C(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong load_ulong_L1C_L3UC(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong load_ulong_L1C_L3C(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong load_ulong_L1S_L3UC(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong load_ulong_L1S_L3C(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3C); +} + +ulong load_ulong_L1IAR_L3C(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong2 +ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3C); +} + +ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong3 +ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3C); +} 
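+
+// Callers pick the wrapper that matches the caching behaviour they need; e.g. the
+// batch-dump kernels poll the DebugBufferHeader cpuHead field with
+// load_uint_L1UC_L3C() so each loop iteration re-reads memory rather than a
+// stale L1 line.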
+ +ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong4 +ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3C); +} + +ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong8 +ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3C); +} + +ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// LSC Stores +// uchar +void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ushort +void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value) +{ + 
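+    // D16U32 store: writes the low 16 bits of 'value' at element offset 'offset',
+    // with both L1 and L3 uncached.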
__builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint +void store_uint_L1UC_L3UC(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint_L1UC_L3WB(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint_L1WT_L3UC(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint_L1WT_L3WB(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint_L1S_L3UC(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint_L1S_L3WB(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint_L1WB_L3WB(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint2 +void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint3 +void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint3_L1UC_L3WB(global uint3* 
it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint4 +void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint8 +void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong +void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, 
value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong2 +void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong3 +void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong4 +void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, 
LSC_STCC_L1WT_L3WB); +} + +void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong8 +void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// LSC Fence support +void mem_fence_gpu_default() +{ + __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_DEFAULT); +} + +void mem_fence_workgroup_default() +{ + __builtin_IB_lsc_fence_global_untyped(LSC_FS_THREAD_GROUP, LSC_FT_DEFAULT); +} + +void mem_fence_gpu_invalidate() +{ + // NOTE: 'FS_TILE' is used here to avoid DG2 HW bug where L3 is needlessly flushed on a 'GPU' scope fence + __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_INVALIDATE); +} + +void mem_fence_gpu_evict() +{ + __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_EVICT); +} + +void mem_fence_evict_to_memory() +{ + __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_EVICT); + __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_L3); +} diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h new file mode 100644 index 00000000000..a12dac00e77 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h @@ -0,0 +1,207 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// LSC Loads +uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset); +uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset); +uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset); +uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset); +uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset); +uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset); +uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset); + +uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset); +uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset); +uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset); +uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset); +uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset); +uint 
load_ushort_to_uint_L1S_L3C(global ushort* it, int offset); +uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset); + +uint load_uint_L1UC_L3UC(global uint* it, int offset); +uint load_uint_L1UC_L3C(global uint* it, int offset); +uint load_uint_L1C_L3UC(global uint* it, int offset); +uint load_uint_L1C_L3C(global uint* it, int offset); +uint load_uint_L1S_L3UC(global uint* it, int offset); +uint load_uint_L1S_L3C(global uint* it, int offset); +uint load_uint_L1IAR_L3C(global uint* it, int offset); + +uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset); +uint2 load_uint2_L1UC_L3C(global uint2* it, int offset); +uint2 load_uint2_L1C_L3UC(global uint2* it, int offset); +uint2 load_uint2_L1C_L3C(global uint2* it, int offset); +uint2 load_uint2_L1S_L3UC(global uint2* it, int offset); +uint2 load_uint2_L1S_L3C(global uint2* it, int offset); +uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset); + +uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset); +uint3 load_uint3_L1UC_L3C(global uint3* it, int offset); +uint3 load_uint3_L1C_L3UC(global uint3* it, int offset); +uint3 load_uint3_L1C_L3C(global uint3* it, int offset); +uint3 load_uint3_L1S_L3UC(global uint3* it, int offset); +uint3 load_uint3_L1S_L3C(global uint3* it, int offset); +uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset); + +uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset); +uint4 load_uint4_L1UC_L3C(global uint4* it, int offset); +uint4 load_uint4_L1C_L3UC(global uint4* it, int offset); +uint4 load_uint4_L1C_L3C(global uint4* it, int offset); +uint4 load_uint4_L1S_L3UC(global uint4* it, int offset); +uint4 load_uint4_L1S_L3C(global uint4* it, int offset); +uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset); + +uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset); +uint8 load_uint8_L1UC_L3C(global uint8* it, int offset); +uint8 load_uint8_L1C_L3UC(global uint8* it, int offset); +uint8 load_uint8_L1C_L3C(global uint8* it, int offset); +uint8 load_uint8_L1S_L3UC(global uint8* it, int offset); +uint8 load_uint8_L1S_L3C(global uint8* it, int offset); +uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset); + +ulong load_ulong_L1UC_L3UC(global ulong* it, int offset); +ulong load_ulong_L1UC_L3C(global ulong* it, int offset); +ulong load_ulong_L1C_L3UC(global ulong* it, int offset); +ulong load_ulong_L1C_L3C(global ulong* it, int offset); +ulong load_ulong_L1S_L3UC(global ulong* it, int offset); +ulong load_ulong_L1S_L3C(global ulong* it, int offset); +ulong load_ulong_L1IAR_L3C(global ulong* it, int offset); + +ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset); +ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset); +ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset); +ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset); +ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset); +ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset); +ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset); + +ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset); +ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset); +ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset); +ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset); +ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset); +ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset); +ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset); + +ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset); +ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset); +ulong4 
load_ulong4_L1C_L3UC(global ulong4* it, int offset); +ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset); +ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset); +ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset); +ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset); + +ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset); +ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset); +ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset); +ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset); +ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset); +ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset); +ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset); + +// LSC Stores +void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value); + +void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value); + +void store_uint_L1UC_L3UC(global uint* it, int offset, uint value); +void store_uint_L1UC_L3WB(global uint* it, int offset, uint value); +void store_uint_L1WT_L3UC(global uint* it, int offset, uint value); +void store_uint_L1WT_L3WB(global uint* it, int offset, uint value); +void store_uint_L1S_L3UC(global uint* it, int offset, uint value); +void store_uint_L1S_L3WB(global uint* it, int offset, uint value); +void store_uint_L1WB_L3WB(global uint* it, int offset, uint value); + +void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value); +void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value); +void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value); +void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value); +void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value); +void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value); +void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value); + +void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value); +void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value); +void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value); +void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value); +void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value); +void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value); +void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value); + +void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value); +void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value); +void 
store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value); +void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value); +void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value); +void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value); +void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value); + +void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value); +void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value); +void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value); +void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value); +void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value); +void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value); +void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value); + +void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value); +void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value); +void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value); +void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value); +void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value); +void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value); +void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value); + +void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value); + +void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value); + +void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value); + +void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value); + +// LSC Fence support +void mem_fence_gpu_default(); +void mem_fence_workgroup_default(); 
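+
+// Illustrative usage sketch (hypothetical producer/consumer kernels, not part of this header;
+// the fence semantics are inferred from the function names, so treat this as a sketch rather
+// than a guaranteed protocol). Only functions declared in this header are used:
+//
+//   void kernel producer(global uint* payload, global uint* flag)
+//   {
+//       store_uint_L1WB_L3WB(payload, 0, 42u);  // cached write of the payload
+//       mem_fence_gpu_evict();                  // push dirty lines out before publishing
+//       store_uint_L1UC_L3UC(flag, 0, 1u);      // uncached publish of the ready flag
+//   }
+//
+//   void kernel consumer(global uint* payload, global uint* flag)
+//   {
+//       if (load_uint_L1UC_L3UC(flag, 0) == 1u)
+//       {
+//           mem_fence_gpu_invalidate();         // drop possibly stale cached copies
+//           uint v = load_uint_L1C_L3C(payload, 0);
+//           (void)v;                            // use the payload here
+//       }
+//   }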
+void mem_fence_gpu_invalidate(); +void mem_fence_gpu_evict(); +void mem_fence_evict_to_memory(); diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl new file mode 100644 index 00000000000..2217618c7c5 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl @@ -0,0 +1,898 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// LSC Loads +// uchar +uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +// ushort +uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +// uint +uint load_uint_L1UC_L3UC(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1UC_L3C(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1C_L3UC(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1C_L3C(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1S_L3UC(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1S_L3C(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1IAR_L3C(global uint* it, int offset) +{ + return it[offset]; +} + +// uint2 +uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1UC_L3C(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1C_L3UC(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1C_L3C(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1S_L3UC(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1S_L3C(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset) +{ + return it[offset]; +} + +// uint3 +uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1UC_L3C(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1C_L3UC(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1C_L3C(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1S_L3UC(global uint3* it, int offset) +{ + 
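+    // In this fallback library the cache-control suffixes are ignored: every variant is a
+    // plain global-memory access, so all _L1*_L3* flavors of a given load or store behave
+    // identically and the suffix is only a performance hint on hardware paths that honor it.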
return it[offset]; +} + +uint3 load_uint3_L1S_L3C(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset) +{ + return it[offset]; +} + +// uint4 +uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1UC_L3C(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1C_L3UC(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1C_L3C(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1S_L3UC(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1S_L3C(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset) +{ + return it[offset]; +} + +// uint8 +uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1UC_L3C(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1C_L3UC(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1C_L3C(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1S_L3UC(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1S_L3C(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset) +{ + return it[offset]; +} + +// ulong +ulong load_ulong_L1UC_L3UC(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1UC_L3C(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1C_L3UC(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1C_L3C(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1S_L3UC(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1S_L3C(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1IAR_L3C(global ulong* it, int offset) +{ + return it[offset]; +} + +// ulong2 +ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset) +{ + return it[offset]; +} + +// ulong3 +ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset) +{ + return it[offset]; +} + +// ulong4 +ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset) +{ + 
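+    // Note that 'offset' is an element index, not a byte offset: in this fallback it indexes
+    // it[offset], i.e. it is scaled by sizeof(ulong4) == 32 bytes.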
return it[offset]; +} + +ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset) +{ + return it[offset]; +} + +// ulong8 +ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset) +{ + return it[offset]; +} + +// LSC Stores +// uchar +void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +// ushort +void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +// uint +void store_uint_L1UC_L3UC(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1UC_L3WB(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1WT_L3UC(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1WT_L3WB(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1S_L3UC(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1S_L3WB(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1WB_L3WB(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +// uint2 +void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void 
store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +// uint3 +void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +// uint4 +void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +// uint8 +void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +// ulong +void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +// ulong2 +void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value) +{ + 
it[offset] = value; +} + +void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +// ulong3 +void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +// ulong4 +void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +// ulong8 +void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +// LSC Fence support +void mem_fence_gpu_default() +{ + write_mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +void mem_fence_workgroup_default() +{ + write_mem_fence( CLK_GLOBAL_MEM_FENCE ); +} + +void mem_fence_gpu_invalidate() +{ + read_mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +void mem_fence_gpu_evict() +{ + read_mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +void mem_fence_evict_to_memory() +{ + mem_fence(CLK_GLOBAL_MEM_FENCE); +} diff --git a/src/intel/vulkan/grl/gpu/mem_utils.h b/src/intel/vulkan/grl/gpu/mem_utils.h new file mode 100644 index 00000000000..b57a25279fd --- /dev/null +++ b/src/intel/vulkan/grl/gpu/mem_utils.h @@ -0,0 +1,161 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// 
SPDX-License-Identifier: MIT
+//
+//
+
+#include "shared.h"
+
+/// Write cache line to global memory
+/// Assumes subgroup_size is 16
+///
+/// @param dst 64-byte aligned output pointer
+/// @param val value to write
+GRL_INLINE void CacheLineSubgroupWrite(global char* dst, uint val)
+{
+    global uint* addrAligned = (global uint*)(global uint16*)dst;
+    intel_sub_group_block_write(addrAligned, val);
+}
+
+/// Read cache line from global memory
+/// Assumes subgroup_size is 16
+///
+/// @param src 64-byte aligned input pointer
+/// @return uint read from memory
+GRL_INLINE uint CacheLineSubgroupRead(const global char* src)
+{
+    const global uint* addrAligned = (const global uint*)(global uint16*)src;
+    return intel_sub_group_block_read(addrAligned);
+}
+
+/// Copy cache line
+/// Assumes subgroup_size is 16
+///
+/// @param dst 64-byte aligned output pointer
+/// @param src input pointer
+GRL_INLINE void CopyCacheLine(global char* dst, const global char* src)
+{
+    global const uint* usrc = (global const uint*) (src);
+
+    uint data = intel_sub_group_block_read(usrc);
+    CacheLineSubgroupWrite(dst, data);
+}
+
+/// Fast memory copy
+///
+/// @param dst output pointer
+/// @param src input pointer
+/// @param size number of bytes to copy
+/// @param numGroups number of groups that execute this function
+GRL_INLINE void CopyMemory(global char* dst, const global char* src, uint size, uint numGroups)
+{
+    const uint CACHELINE_SIZE = 64;
+
+    uint globalID = get_local_size(0) * get_group_id(0) + get_local_id(0);
+
+    // This part copies the bulk of the buffer, starting from dst aligned up to the next
+    // cache line boundary, one DWORD per work item per iteration.
+    // It also copies the trailing remainder bytes that do not form a full DWORD.
+    {
+        uint alignAdd = ((uint)(uint64_t)dst) & (CACHELINE_SIZE - 1);
+        alignAdd = (CACHELINE_SIZE - alignAdd) & (CACHELINE_SIZE - 1);
+
+        if (size > alignAdd)
+        {
+            uint alignedBytesCount = size - alignAdd;
+            uint alignedDWsCount = alignedBytesCount >> 2;
+            global uint* dstAlignedPart = (global uint*)(dst + alignAdd);
+            global uint* srcAlignedPart = (global uint*)(src + alignAdd);
+
+            for (uint id = globalID; id < alignedDWsCount; id += get_local_size(0) * numGroups)
+            {
+                dstAlignedPart[id] = srcAlignedPart[id];
+            }
+
+            if (globalID < alignedBytesCount - (alignedDWsCount << 2))
+            {
+                global uint8_t* dstByteRem = (global uint8_t*)(dstAlignedPart + alignedDWsCount);
+                global uint8_t* srcByteRem = (global uint8_t*)(srcAlignedPart + alignedDWsCount);
+                dstByteRem[globalID] = srcByteRem[globalID];
+            }
+        }
+    }
+
+    // Copy the head of dst, i.e. the bytes below the first cache-line-aligned address
+    // (first up to DWORD alignment, then whole DWORDs).
+    {
+        uint misalignmentBytesSize = (4 - (((uint)dst) & /*bytes in DW*/3)) & 3;
+        if (misalignmentBytesSize)
+        {
+            if (globalID < misalignmentBytesSize)
+            {
+                dst[globalID] = src[globalID];
+            }
+            dst += misalignmentBytesSize;
+            src += misalignmentBytesSize;
+        }
+
+        uint misalignmentDWSize = (CACHELINE_SIZE - (((uint)dst) & (CACHELINE_SIZE - 1))) & (CACHELINE_SIZE - 1);
+        if (misalignmentDWSize)
+        {
+            if (globalID < (misalignmentDWSize >> 2))
+            {
+                ((global uint*)dst)[globalID] = ((global uint*)src)[globalID];
+            }
+        }
+    }
+}
+
+#define CACHELINE_SIZE 64
+#define CACHELINE_PER_BLOCK 4
+#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
+
+GRL_INLINE
+global const char *getInstanceDataToCopy(global const char *array, global const uint64_t *arrayOfPtrs, const uint byteOffset)
+{
+    if (array != NULL)
+    {
+        return array + byteOffset;
+    }
+    else
+    {
+        return (global char *)arrayOfPtrs[byteOffset >> 6];
+    }
+}
+
+// assumed:
+// dst is always 64-byte aligned
+// size is always a multiple of 64 bytes
(size of InstanceDesc is always 64 bytes) +GRL_INLINE +void copyInstances(global char *dst, global const char *array, global const uint64_t *arrayOfPtrs, const uint64_t size, const uint numGroups) +{ + uint taskId = get_group_id(0); + + uint blockedSize = (size) & (~(BLOCK_SIZE - 1)); + + uint cachelinedTailOffset = blockedSize; + uint cachelinedTailSize = (size - cachelinedTailOffset) & (~(CACHELINE_SIZE - 1)); + + uint tailCacheLines = cachelinedTailSize >> 6; // divide by CACHELINE_SIZE + uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups - 1))); + if (reversedTaskId < tailCacheLines) + { + uint byteOffset = cachelinedTailOffset + (reversedTaskId * CACHELINE_SIZE); + global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset); + CopyCacheLine(dst + byteOffset, src); + } + + uint numBlocks = blockedSize >> 8; + while (taskId < numBlocks) + { + uint byteOffset = (taskId * BLOCK_SIZE); + + for (uint cl = 0; cl < CACHELINE_PER_BLOCK; cl++) + { + global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset); + CopyCacheLine(dst + byteOffset, src); + byteOffset += CACHELINE_SIZE; + } + + taskId += numGroups; + } +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/misc.cl b/src/intel/vulkan/grl/gpu/misc.cl new file mode 100644 index 00000000000..d32c8267b73 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/misc.cl @@ -0,0 +1,367 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" +#include "instance.h" +#include "misc_shared.h" +#include "mem_utils.h" + +#define DBG(x) +#define ENABLE_CHECKS 0 + +#define CACHELINE_SIZE 64 +#define CACHELINE_PER_BLOCK 4 +#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK; + +GRL_INLINE +uint32_t getGeomDescPrimitiveCountAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index) +{ + return (uint32_t)GRL_get_primitive_count(&geomDesc[index]); +} + +GRL_INLINE +uint32_t getGeomDescTypeAndFlagsAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index) +{ + return (uint32_t)GRL_get_Type(&geomDesc[index]) | + (((uint32_t)GRL_get_Flags(&geomDesc[index])) << 16); +} + +GRL_INLINE +uint64_t getGeomDescAsUint64t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index) +{ + return (uint64_t)getGeomDescPrimitiveCountAsUint32t(geomDesc, index) | + (((uint64_t)getGeomDescTypeAndFlagsAsUint32t(geomDesc, index)) << 32); +} + +// assummed: +// dst is always 64 bytes alligned +GRL_INLINE +void copyGeoMetaData(global char* dst, global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t size, uint numGroups) +{ + uint taskId = get_group_id(0); + uint localId = get_sub_group_local_id(); + + uint cachelinedSize = (size) & (~(CACHELINE_SIZE-1)); + + uint reminderOffset = cachelinedSize; + uint reminderQWSize = (size - reminderOffset) >> 3; + + uint tailCacheLines = cachelinedSize >> 6; // divide by CACHELINE_SIZE + uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups-1))); + if (reversedTaskId == tailCacheLines && localId < reminderQWSize) + { + uint reminderOffsetQW = reminderOffset >> 3; + global uint64_t* dstQW = (global uint64_t*)(dst); + dstQW[localId + reminderOffsetQW] = getGeomDescAsUint64t(geomDesc, localId + reminderOffsetQW); + } + + uint numCacheLines = cachelinedSize >> 6; + while (taskId < numCacheLines) + { + uint byteOffset = taskId * CACHELINE_SIZE; + uint geoIdFromOffset = (byteOffset >> 3) + (localId >> 1); + + uint32_t data = 0; + if (localId & 1) + { + 
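+            // Lane layout (SIMD16): lanes 2k and 2k+1 together produce the 8-byte meta-data
+            // record for one geometry; the odd lane supplies the high dword (type | flags << 16)
+            // and the even lane the low dword (primitive count), so the block write below emits
+            // eight packed records per 64-byte cache line.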
data = getGeomDescTypeAndFlagsAsUint32t(geomDesc, geoIdFromOffset); + } + else + { + data = getGeomDescPrimitiveCountAsUint32t(geomDesc, geoIdFromOffset); + } + CacheLineSubgroupWrite(dst + byteOffset, data); + + taskId += numGroups; + } +} + +GRL_INLINE +uint groupCountForInstancesCopySize(uint size) +{ + return (size >> 8) + 3; +} + +GRL_INLINE +uint groupCountForGeoMetaDataCopySize(uint size) +{ + return (size >> 6) + 1; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instances(global char* dest, global char* instancesArray, uint64_t size) +{ + // global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instances_indirect(global char* dest, global char* instancesArray, global const struct IndirectBuildRangeInfo* const indirect_data) +{ + uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); + instancesArray += indirect_data->primitiveOffset; + uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (tid == 0) + { + struct BVHBase* bvh = (struct BVHBase*)dest; + bvh->Meta.instanceCount = indirect_data->primitiveCount; + } + copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instance_ptrs(global char* dest, global uint64_t* arrayOfPtrs, uint64_t size) +{ + //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instance_ptrs_indirect(global char* dest, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data) +{ + uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); + arrayOfPtrs += indirect_data->primitiveOffset; + uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (tid == 0) + { + struct BVHBase* bvh = (struct BVHBase*)dest; + bvh->Meta.instanceCount = indirect_data->primitiveCount; + } + copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instances_base_ptr(global BVHBase* bvh, global char* instancesArray, uint64_t size) +{ + global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instances_base_ptr_indirect(global BVHBase* bvh, global char* instancesArray, global struct IndirectBuildRangeInfo const * const indirect_data) +{ 
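+    // Sizing sketch: size = primitiveCount * sizeof(InstanceDesc) = primitiveCount * 64 bytes,
+    // so groupCountForInstancesCopySize(size) = (size >> 8) + 3 matches the (count >> 2) + 3
+    // group count computed by the companion metakernel. E.g. 1000 instances -> 64000 bytes ->
+    // (64000 >> 8) + 3 = 253 groups.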
+ global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); + instancesArray += indirect_data->primitiveOffset; + copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instance_ptrs_base_ptr(global BVHBase* bvh, global uint64_t* arrayOfPtrs, uint64_t size) +{ + global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instance_ptrs_base_ptr_indirect(global BVHBase* bvh, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data) +{ + global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); + arrayOfPtrs += indirect_data->primitiveOffset; + copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_geo_meta_data(global char* dest, global char* src, uint64_t size) +{ + //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.geoDescsStart); + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc = (global GRL_RAYTRACING_GEOMETRY_DESC *)((unsigned long)src); + copyGeoMetaData(dest, geomDesc, size, groupCountForGeoMetaDataCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( ( reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 ) ) ) +__attribute__( ( intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH ) ) ) +void kernel copy_geo_descs_indirect_build(global char* dest, global char* src, global struct IndirectBuildRangeInfo const * const indirect_data, uint numGeometries) +{ + uint32_t gid = get_local_id(0) + get_group_id(0) * get_local_size(0); + if (gid < numGeometries) { + global GRL_RAYTRACING_GEOMETRY_DESC* dstDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(dest); + global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(src); + + GRL_RAYTRACING_GEOMETRY_DESC geo = srcDesc[gid]; + + uint primitiveCount = indirect_data[gid].primitiveCount; + uint primitiveOffset = indirect_data[gid].primitiveOffset; + uint firstVertex = indirect_data[gid].firstVertex; + uint transformOffset = indirect_data[gid].transformOffset; + + if (srcDesc[gid].Type == GEOMETRY_TYPE_TRIANGLES) + { + if (geo.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE) + { + geo.Desc.Triangles.VertexCount = primitiveCount * 3; + geo.Desc.Triangles.pVertexBuffer += primitiveOffset + + firstVertex * geo.Desc.Triangles.VertexBufferByteStride; + } + else + { + geo.Desc.Triangles.IndexCount = primitiveCount * 3; + geo.Desc.Triangles.pIndexBuffer += primitiveOffset; + geo.Desc.Triangles.pVertexBuffer += firstVertex * geo.Desc.Triangles.VertexBufferByteStride; + } + if (geo.Desc.Triangles.pTransformBuffer) { + geo.Desc.Triangles.pTransformBuffer += transformOffset; + } + } else { + // GEOMETRY_TYPE_PROCEDURAL + geo.Desc.Procedural.AABBCount = primitiveCount; + 
geo.Desc.Procedural.pAABBs_GPUVA += primitiveOffset; + } + + dstDesc[gid] = geo; + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel batched_init_globals(global struct BatchedInitGlobalsData *data) +{ + uint groupID = get_group_id(0); + + struct BatchedInitGlobalsData entry = data[groupID]; + + global struct Globals* globals = (global struct Globals*)entry.p_build_globals; + global char *bvh_mem = (global char*)entry.p_bvh_buffer; + uint numPrimitives = entry.numPrimitives; + uint numGeometries = entry.numGeometries; + uint numInstances = entry.numInstances; + uint instance_descs_start = entry.instance_descs_start; + uint geo_meta_data_start = entry.geo_meta_data_start; + uint node_data_start = entry.node_data_start; + uint quad_data_start = entry.leaf_data_start; + uint instance_data_start = entry.leaf_data_start; + uint procedural_data_start = entry.procedural_data_start; + uint back_pointer_start = entry.back_pointer_start; + uint build_record_start = entry.leaf_data_start; + uint totalBytes = entry.sizeTotal; + uint leafPrimType = entry.leafType; + uint leafSize = entry.leafSize; + + uint root_node_offset = node_data_start; + struct BVHBase *base = (struct BVHBase *)bvh_mem; + + base->Meta.instanceCount = numInstances; + base->Meta.geoCount = numGeometries; + base->Meta.instanceDescsStart = instance_descs_start; + base->Meta.geoDescsStart = geo_meta_data_start; + base->Meta.allocationSize = totalBytes; + // This doesnt work correctly + //ERROR_INFO initErr = { 0, 0, 0, 0xAAABBAAA }; + //base->Meta.errors = initErr; + base->Meta.errors.type = 0; + base->Meta.errors.offset_in_BVH = 0; //in 64B units + base->Meta.errors.when = 0; + base->Meta.errors.reserved = 0xAAABBAAA; + + base->nodeDataCur = node_data_start / 64; + base->quadLeafStart = quad_data_start / 64; + base->quadLeafCur = quad_data_start / 64; + base->instanceLeafStart = instance_data_start / 64; + base->instanceLeafEnd = instance_data_start / 64; + base->proceduralDataStart = procedural_data_start / 64; + base->proceduralDataCur = procedural_data_start / 64; + base->backPointerDataStart = back_pointer_start / 64; + base->refitTreeletsDataStart = totalBytes / 64; + base->refitStartPointDataStart = totalBytes / 64; + base->BVHDataEnd = totalBytes / 64; + base->refitTreeletCnt = 0; + base->refitTreeletCnt2 = 0; + base->rootNodeOffset = root_node_offset; + + base->fatLeafCount = 0; + base->fatLeafTableStart = entry.fatleaf_table_start / 64; + base->innerCount = 0; + base->innerTableStart = entry.innernode_table_start / 64; + base->quadLeftoversCountNewAtomicUpdate = 0; + base->quadTableSizeNewAtomicUpdate = 0; + base->quadIndicesDataStart = entry.quad_indices_data_start / 64; + + if (back_pointer_start != totalBytes) + { + BackPointers* back_pointers = BVHBase_GetBackPointers(base); + uint root_node_idx = root_node_offset - node_data_start; + global uint *root_node_backpointer = (global uint *)InnerNode_GetBackPointer(back_pointers,root_node_idx); + *root_node_backpointer = ((uint)-1) << 6; + } + + AABB3f_init(&base->Meta.bounds); + AABB_init(&globals->centroidBounds); + + globals->build_record_start = build_record_start; + + globals->numBuildRecords = 0; + globals->numBuildRecords_extended = 0; + globals->numPrimitives = numPrimitives; + globals->numSplittedPrimitives = 0; + globals->sync = 0; + globals->probThreshold = 0.0f; + globals->leafPrimType = leafPrimType; + globals->leafSize = leafSize; +} + + + +// This is temporary WA for mock in DXR +GRL_ANNOTATE_IGC_DO_NOT_SPILL 
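+// copy_mock below is a plain byte copy in a grid-stride loop: each work item starts at its
+// global id and strides by the total number of launched work items, so any dispatch size
+// yields a correct (if unoptimized) copy.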
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel copy_mock(global char *dest, + global char *src, + uint32_t size) +{ + uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0); + uint32_t globalSize = get_num_groups(0) * get_local_size(0); + for (uint32_t i = globalId; i < size; i += globalSize) + { + dest[i] = src[i]; + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(32, 1, 1))) +void kernel mem_set(global char *dest, + dword byte, + dword size) +{ + uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0); + if (globalId < size) + { + dest[globalId] = (char)byte; + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(32, 1, 1))) +void kernel mem_set_size_ptr(global char *dest, + dword byte, + global qword* sizePtr) +{ + uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0); + if (globalId < *sizePtr) + { + dest[globalId] = (char)byte; + } +} diff --git a/src/intel/vulkan/grl/gpu/misc.grl b/src/intel/vulkan/grl/gpu/misc.grl new file mode 100644 index 00000000000..cb98534afb4 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/misc.grl @@ -0,0 +1,278 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module misc; + +kernel_module misc("misc.cl") +{ + kernel opencl_kernel_batched_init_globals < kernelFunction="batched_init_globals" >; + kernel opencl_kernel_copy_instances < kernelFunction="copy_instances" >; + kernel opencl_kernel_copy_instances_indirect < kernelFunction="copy_instances_indirect" >; + kernel opencl_kernel_copy_instance_ptrs < kernelFunction="copy_instance_ptrs" >; + kernel opencl_kernel_copy_instance_ptrs_indirect < kernelFunction="copy_instance_ptrs_indirect" >; + kernel opencl_kernel_copy_instances_base_ptr < kernelFunction="copy_instances_base_ptr" >; + kernel opencl_kernel_copy_instances_base_ptr_indirect < kernelFunction="copy_instances_base_ptr_indirect" >; + kernel opencl_kernel_copy_instance_ptrs_base_ptr < kernelFunction="copy_instance_ptrs_base_ptr" >; + kernel opencl_kernel_copy_instance_ptrs_base_ptr_indirect < kernelFunction="copy_instance_ptrs_base_ptr_indirect" >; + kernel opencl_kernel_copy_geo_meta_data < kernelFunction="copy_geo_meta_data" >; + kernel opencl_kernel_copy_geo_descs_indirect_build < source="misc.cl", kernelFunction="copy_geo_descs_indirect_build" >; + kernel opencl_kernel_copy_mock < kernelFunction="copy_mock" >; + kernel opencl_kernel_memset < kernelFunction="mem_set" >; + kernel opencl_kernel_memset_size_ptr < kernelFunction="mem_set_size_ptr" >; +} + +import struct MKBuilderState "structs.grl"; +import struct MKSizeEstimate "structs.grl"; + + +metakernel batched_init_globals( + qword p_data, + dword numWgs) +{ + dispatch opencl_kernel_batched_init_globals(numWgs,1,1) args(p_data); +} + +metakernel copy_instances( + qword bvh_buffer, + qword instanceDescsBuffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_instances (numThreads, 1, 1) args( + bvh_buffer, + instanceDescsBuffer, + totalSizeToCopy); +} + +metakernel +copy_instances_indirect( qword bvh_buffer, qword instanceDescsBuffer, qword indirectBuildRangeInfo ) +{ + + define num_groups REG0; + define C_2 REG2; + define C_3 REG3; + + C_2 = 2; + C_3 = 3; + + // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions + // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 + num_groups = load_dword( indirectBuildRangeInfo ); + num_groups = 
num_groups >> C_2; + num_groups = num_groups + C_3; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_instances_indirect args( + bvh_buffer, + instanceDescsBuffer, + indirectBuildRangeInfo); +} + +metakernel copy_instance_ptrs( + qword bvh_buffer, + qword instanceDescPtrsBuffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_instance_ptrs (numThreads, 1, 1) args( + bvh_buffer, + instanceDescPtrsBuffer, + totalSizeToCopy); +} + +metakernel copy_instance_ptrs_indirect( + qword bvh_buffer, + qword instanceDescPtrsBuffer, + qword indirectBuildRangeInfo) +{ + define num_groups REG0; + define C_2 REG2; + define C_3 REG3; + + C_2 = 2; + C_3 = 3; + + // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions + // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 + num_groups = load_dword( indirectBuildRangeInfo ); + num_groups = num_groups >> C_2; + num_groups = num_groups + C_3; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_instance_ptrs_indirect args( + bvh_buffer, + instanceDescPtrsBuffer, + indirectBuildRangeInfo); +} + +metakernel copy_instances_base_ptr( + qword bvh_buffer, + qword instanceDescsBuffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_instances_base_ptr (numThreads, 1, 1) args( + bvh_buffer, + instanceDescsBuffer, + totalSizeToCopy); +} + +metakernel copy_instances_base_ptr_indirect( + qword bvh_buffer, + qword instanceDescsBuffer, + qword indirectBuildRangeInfo) +{ + define num_groups REG0; + define C_2 REG2; + define C_3 REG3; + + C_2 = 2; + C_3 = 3; + + // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions + // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 + num_groups = load_dword( indirectBuildRangeInfo ); + num_groups = num_groups >> C_2; + num_groups = num_groups + C_3; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_instances_base_ptr_indirect args( + bvh_buffer, + instanceDescsBuffer, + indirectBuildRangeInfo); +} + +metakernel copy_instance_ptrs_base_ptr( + qword bvh_buffer, + qword instanceDescPtrsBuffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_instance_ptrs_base_ptr (numThreads, 1, 1) args( + bvh_buffer, + instanceDescPtrsBuffer, + totalSizeToCopy); +} + +metakernel copy_instance_ptrs_base_ptr_indirect( + qword bvh_buffer, + qword instanceDescPtrsBuffer, + qword indirectBuildRangeInfo) +{ + define num_groups REG0; + define C_2 REG2; + define C_3 REG3; + + C_2 = 2; + C_3 = 3; + + // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions + // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 + num_groups = load_dword( indirectBuildRangeInfo ); + num_groups = num_groups >> C_2; + num_groups = num_groups + C_3; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_instance_ptrs_base_ptr_indirect args( + bvh_buffer, + instanceDescPtrsBuffer, + indirectBuildRangeInfo); +} + +metakernel copy_geo_descs( + qword private_dest, + qword transient_src, + qword indirectBuildRangeInfo, + dword numGeometries) +{ + + define num_groups (numGeometries + 16 - 1) / 16; + dispatch opencl_kernel_copy_geo_descs_indirect_build(num_groups, 1, 1) args( + private_dest, + transient_src, + 
indirectBuildRangeInfo, + numGeometries); +} + +metakernel copy_geo_meta_data( + qword bvh_buffer, + qword geomdesc_buffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_geo_meta_data (numThreads, 1, 1) args( + bvh_buffer, + geomdesc_buffer, + totalSizeToCopy); +} + + +const COPY_MOCK_GROUP_SIZE = 16; + +metakernel copy_mock( + qword dest, + qword src, + dword size) +{ + define num_groups (size + COPY_MOCK_GROUP_SIZE - 1) / COPY_MOCK_GROUP_SIZE; + dispatch opencl_kernel_copy_mock(num_groups, 1, 1) args( + dest, + src, + size); +} + +metakernel memset( + qword dest, + dword byte, + dword size) +{ + define num_groups (size + 32 - 1) / 32; + dispatch opencl_kernel_memset(num_groups, 1, 1) args( + dest, + byte, + size); +} + +metakernel memset_size_ptr( + qword dest, + dword byte, + qword sizePtr) +{ + define byteSize REG0; + define C_32 REG1; C_32 = 32; + define C_1 REG2; C_1 = 1; + define C_4 REG3; C_4 = 4; + define numGroupsRqd REG4; + + byteSize = load_dword(sizePtr); + + numGroupsRqd = byteSize + C_32; + numGroupsRqd = numGroupsRqd - C_1; + numGroupsRqd = numGroupsRqd >> C_4; + numGroupsRqd = numGroupsRqd >> C_1; + + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_memset_size_ptr args( + dest, + byte, + sizePtr); +} diff --git a/src/intel/vulkan/grl/gpu/misc_legacy.cl b/src/intel/vulkan/grl/gpu/misc_legacy.cl new file mode 100644 index 00000000000..a464e89537c --- /dev/null +++ b/src/intel/vulkan/grl/gpu/misc_legacy.cl @@ -0,0 +1,386 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "input_client_structs.h" +#include "common.h" +#include "instance.h" + +#define DBG(x) +#define ENABLE_CHECKS 0 + +/* + + This kernel implements a exclusive scan addition operation. The + implementation currently only uses one DSS. + + */ +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_scan_exclusive_add(global uint *input, + global uint *output, + const uint N) +{ + const uint j = get_local_id(0); + const uint J = get_local_size(0); + const uint BLOCKSIZE = (N + J - 1) / J; + const uint start = min((j + 0) * BLOCKSIZE, N); + const uint end = min((j + 1) * BLOCKSIZE, N); + + uint base = 0; + for (uint i = start; i < end; i++) + base += input[i]; + + base = work_group_scan_exclusive_add(base); + + uint accu = 0; + for (uint i = start; i < end; i++) + { + output[i] = base + accu; + accu += input[i]; + } +} + +/* + + This kernel implements a exclusive scan addition operation that can use the entire GPU. 
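+
+   For example, an exclusive scan over the input [3, 1, 4, 1, 5] produces
+   [0, 3, 4, 8, 9]: output[i] is the sum of all inputs before position i.
+   phase0 reduces each work group's slice of the input into prefix_sums[groupID];
+   phase1 then sums the prefix_sums entries of all preceding groups to form a
+   global base and adds it while performing the scan over its own slice.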
+ + */ +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_scan_exclusive_add_phase0(global uint *input, + global uint *output, + global uint *prefix_sums, + const uint N) +{ + const uint local_size = get_local_size(0); + const uint numTasks = get_num_groups(0); + const uint groupID = get_group_id(0); + const uint localID = get_local_id(0); + const uint global_startID = (groupID + 0) * N / numTasks; + const uint global_endID = (groupID + 1) * N / numTasks; + + uint base = 0; + for (uint i = global_startID + localID; i < global_endID; i += local_size) + base += input[i]; + + base = work_group_reduce_add(base); + + if (localID == 0) + { + prefix_sums[groupID] = base; + printf("%d -> %d \n", groupID, base); + } +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_scan_exclusive_add_phase1(global uint *input, + global uint *output, + global uint *prefix_sums, + const uint N) +{ + const uint local_size = get_local_size(0); + const uint numTasks = get_num_groups(0); + const uint groupID = get_group_id(0); + const uint localID = get_local_id(0); + const uint global_startID = (groupID + 0) * N / numTasks; + const uint global_endID = (groupID + 1) * N / numTasks; + const uint local_range = global_endID - global_startID; + + uint global_base = 0; + for (uint i = 0; i < groupID; i++) + global_base += prefix_sums[i]; + + const uint j = get_local_id(0); + const uint J = get_local_size(0); + const uint BLOCKSIZE = (local_range + J - 1) / J; + const uint startID = (j + 0) * local_range / J + global_startID; + const uint endID = (j + 1) * local_range / J + global_startID; + + uint base = 0; + for (uint i = startID; i < endID; i++) + base += input[i]; + + base = work_group_scan_exclusive_add(base); + + uint accu = 0; + for (uint i = startID; i < endID; i++) + { + output[i] = global_base + base + accu; + accu += input[i]; + } +} + +/* ========================================================================= */ +/* ============================== STATISTICS =============================== */ +/* ========================================================================= */ + +/* ====== STATS config ====== */ + +#define ENABLE_STAT_CHECKS 1 +#define DBG_STATS(x) + +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +printBVHStatistics(global struct Globals *globals, + global char *bvh_mem, + global struct StatStackEntry *global_stack0, + global struct StatStackEntry *global_stack1, + const uint presplit) +{ + const uint globalID = get_global_id(0); + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + + struct BVHBase *base = (struct BVHBase *)bvh_mem; + const uint root = base->rootNodeOffset; + + local uint stack_items[2]; + local uint iterations; + + struct AABB root_aabb = getAABB_QBVHNodeN((global struct QBVHNodeN *)(bvh_mem + root)); + root_aabb = conservativeAABB(&root_aabb); + const float root_area = AABB_halfArea(&root_aabb); + + global struct QBVHNodeN *root_node = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset); + + if (root_node->type != BVH_INTERNAL_NODE) + { + const uint numChildren = getNumChildren_QBVHNodeN(root_node); + const uint current = root; + for (uint i = 0; i < numChildren; i++) + { + struct AABB aabb = extractAABB_QBVHNodeN(root_node, i); + const float area = AABB_halfArea(&aabb); + + global_stack0[i].node = current + root_node->offset * 64 + i * 
sizeof(struct Quad); + global_stack0[i].type = root_node->type; + global_stack0[i].area = area; + global_stack0[i].aabb = aabb; + global_stack0[i].depth = 0; + } + stack_items[0] = numChildren; + stack_items[1] = 0; + } + else + { + global_stack0[0].node = root; + global_stack0[0].type = root_node->type; + global_stack0[0].area = root_area; + global_stack0[0].aabb = root_aabb; + global_stack0[0].depth = 1; + stack_items[0] = 1; + stack_items[1] = 0; + } + + const uint maxInnerNodeOffset = globals->node_mem_allocator.cur; + const uint maxLeafNodeOffset = globals->quad_mem_allocator.cur; + + DBG_STATS(if (localID == 0) printf("diff %d \n", (globals->node_mem_allocator_cur - globals->node_mem_allocator_start) / 64)); + + iterations = 0; + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + float sah_nodes = 0.0f; + float sah_leaves = 0.0f; + uint leaves = 0; + uint inner_nodes = 0; + uint max_depth = 0; + uint leaf_items = 0; + uint inner_nodes_valid_children = 0; + + while (1) + { + work_group_barrier(CLK_GLOBAL_MEM_FENCE); + const uint buffer_index = (iterations % 2) == 0 ? 0 : 1; + global struct StatStackEntry *input_global_stack = buffer_index == 0 ? global_stack0 : global_stack1; + global struct StatStackEntry *output_global_stack = buffer_index == 0 ? global_stack1 : global_stack0; + + const uint local_stack_items = stack_items[buffer_index]; + stack_items[1 - buffer_index] = 0; + + DBG_STATS(if (globalID == 0) printf("iterations %d local_stack_items %d \n", iterations, local_stack_items)); + + if (local_stack_items == 0) + break; + //if (iterations == 5) break; + + work_group_barrier(CLK_GLOBAL_MEM_FENCE); + + if (globalID == 0) + iterations++; + + for (uint sindex = localID; sindex < local_stack_items; sindex += local_size) + { + + uint current = input_global_stack[sindex].node; + uint type = input_global_stack[sindex].type; + float current_area = input_global_stack[sindex].area; + struct AABB current_aabb = input_global_stack[sindex].aabb; + uint current_depth = input_global_stack[sindex].depth; + + //printf("localID %d sindex %d current %d type %d local_stack_items %d \n",localID,sindex,current,type,local_stack_items); + + max_depth = max(max_depth, current_depth); + + if (type == BVH_QUAD_NODE) + { + unsigned int prims = 1; //getNumLeafPrims(current); + if (prims > BVH_LEAF_N_MAX) + printf("too many items in leaf %d \n", prims); + unsigned int prims_offset = current; //getLeafOffset(current); + //printf("prims_offset %d \n",prims_offset); + + leaf_items += prims; + sah_leaves += current_area; + leaves++; +#if ENABLE_STAT_CHECKS == 1 + struct AABB leafAABB; + AABB_init(&leafAABB); + + global struct Quad *quads = (global struct Quad *)(bvh_mem + prims_offset); + //printf("prims_offset %d \n",prims_offset); + + for (uint i = 0; i < prims; i++) + { + struct AABB quadAABB = getAABB_Quad(&quads[i]); + AABB_extend(&leafAABB, &quadAABB); + } + + if (!presplit && !AABB_subset(&leafAABB, ¤t_aabb)) + { + printf("leaf error: current %d depth %d \n", current, current_depth); + AABB_print(¤t_aabb); + printf("leaf bounds: \n"); + AABB_print(&leafAABB); + } +#endif + } + else if (type == BVH_INTERNAL_NODE) + { + inner_nodes++; + sah_nodes += current_area; + global struct QBVHNodeN *nodeN = (global struct QBVHNodeN *)(bvh_mem + current); + + uint children = 0; + for (uint i = 0; i < BVH_NODE_N6; i++) + { + if (nodeN->qbounds.lower_x[i] > nodeN->qbounds.upper_x[i]) + break; + children++; + } + //printf("children %d \n",children); + +#if ENABLE_STAT_CHECKS == 1 + if (children > BVH_NODE_N6 || children == 
0) + { + printf("#children not in valid range: %d offset %d localID %d \n", children, current, localID); + printQBVHNodeN(nodeN); + } + + if (nodeN->offset > globals->totalAllocatedMem || (int)nodeN->offset < 0) + { + printf("offset error %d \n", nodeN->offset); + } +#endif + + uint children_offset = atomic_add(&stack_items[1 - buffer_index], children); + + for (uint i = 0; i < children; i++) + { + inner_nodes_valid_children++; + + struct AABB aabb = extractAABB_QBVHNodeN(nodeN, i); + const float area = AABB_halfArea(&aabb); + + aabb = conservativeAABB(&aabb); + +#if 0 // ENABLE_STAT_CHECKS == 1 // FIXME: not clear whether parent child property still holds !!!! + + // if (aabb.lower.x == (float)(INFINITY)) + // { + // printf("aabb inf error %d current %d nodeN %d \n",i, current, children); + // break; + // } + + + if (!presplit && !AABB_subset(&aabb,¤t_aabb)) + { + printf("Parent: current %d depth %d children %d \n",current, current_depth, children); + AABB_print(¤t_aabb); + printf("Child %d: \n",i); + AABB_print(&aabb); + } +#endif + + uint dest_index = children_offset + i; + if (nodeN->type == BVH_QUAD_NODE) + { + output_global_stack[dest_index].node = current + nodeN->offset * 64 + i * sizeof(struct Quad); + if (output_global_stack[dest_index].node >= maxLeafNodeOffset) + { + printf("stack leaf offset error %d %d current %d %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64); + } + } + else if (nodeN->type == BVH_INTERNAL_NODE) + { + output_global_stack[dest_index].node = (current + nodeN->offset * 64 + i * sizeof(struct QBVHNodeN)); + if (output_global_stack[dest_index].node >= maxInnerNodeOffset) + { + printf("stack inner node offset error %d %d current %d %d maxInnerNodeOffset %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64, maxInnerNodeOffset); + } + } + + output_global_stack[dest_index].type = nodeN->type; + output_global_stack[dest_index].area = area; + output_global_stack[dest_index].aabb = aabb; + output_global_stack[dest_index].depth = current_depth + 1; + //printf("global_stack[dest_index].node %d global_stack[dest_index].type %d \n",global_stack[dest_index].node,global_stack[dest_index].type); + } + } + } + } + + sah_nodes = work_group_reduce_add(sah_nodes); + sah_leaves = work_group_reduce_add(sah_leaves); + leaves = work_group_reduce_add(leaves); + inner_nodes = work_group_reduce_add(inner_nodes); + max_depth = work_group_reduce_max(max_depth); + leaf_items = work_group_reduce_add(leaf_items); + inner_nodes_valid_children = work_group_reduce_add(inner_nodes_valid_children); + + if (globalID == 0) + { + /* + sah_nodes *= 1.0f / root_area; + sah_leaves *= 1.0f / root_area; + float sah = sah_nodes + sah_leaves; + + const uint globalLeafMemAllocatorOffset = globals->quad_mem_allocator.start; + const uint totalAllocatedMem = globals->totalAllocatedMem; + + printf("BVH_NODE_N6 %d BVH_LEAF_N_MIN %d BVH_LEAF_N_MAX %d \n",BVH_NODE_N6,BVH_LEAF_N_MIN,BVH_LEAF_N_MAX); + float node_util = 100.0f * (float)inner_nodes_valid_children / (inner_nodes * BVH_NODE_N6); + float leaf_util = 100.0f * (float)leaf_items / (leaves); + printf("allocators: node %d -> %d ; leaf %d -> %d \n",globals->node_mem_allocator_cur,globals->node_mem_allocator_start,globals->leaf_mem_allocator_cur,globals->leaf_mem_allocator_start); + printf("inner nodes %d leaves %d sah %f sah_node %f sah_leaves %f max_depth %d leaf_items %d node util %f leaf util %f (%f) 
\n",inner_nodes,leaves,sah,sah_nodes,sah_leaves,max_depth,leaf_items,node_util,leaf_util,(float)leaf_items / leaves); + uint node_mem = globals->node_mem_allocator_cur; + uint max_node_mem = globalLeafMemAllocatorOffset; + float node_mem_ratio = 100.0f * (float)node_mem / max_node_mem; + + uint leaf_mem = globals->leaf_mem_allocator.cur - globalLeafMemAllocatorOffset; + uint max_leaf_mem = totalAllocatedMem - globalLeafMemAllocatorOffset; + float leaf_mem_ratio = 100.0f * (float)leaf_mem / max_leaf_mem; + + uint total_mem = node_mem + leaf_mem; + float total_mem_ratio = 100.0f * (float)total_mem / totalAllocatedMem; + + printf("used node memory %d (%f) / used leaf memory %d (%f) / total memory used %d (%f) / total memory allocated %d \n",node_mem, node_mem_ratio, leaf_mem, leaf_mem_ratio, total_mem, total_mem_ratio, totalAllocatedMem); + */ + } +} diff --git a/src/intel/vulkan/grl/gpu/misc_shared.h b/src/intel/vulkan/grl/gpu/misc_shared.h new file mode 100644 index 00000000000..218f2fa4291 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/misc_shared.h @@ -0,0 +1,196 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file contains structure definitions shared by GRL OCL kernels and host code +// + +#pragma once + +#include "GRLGen12.h" + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(MISC) + +struct BatchedInitGlobalsData +{ + qword p_build_globals; + qword p_bvh_buffer; + dword numPrimitives; + dword numGeometries; + dword numInstances; + dword instance_descs_start; + dword geo_meta_data_start; + dword node_data_start; + dword leaf_data_start; + dword procedural_data_start; + dword back_pointer_start; + dword sizeTotal; + dword leafType; + dword leafSize; + dword fatleaf_table_start; + dword innernode_table_start; + dword quad_indices_data_start; +}; + +/// Header of debug buffer +/// +/// Header is placed at the begining of debug buffer. +/// After header there is circullar buffer space +typedef struct DebugBufferHeader +{ + /// Offset to begin of buffer (after header) + dword headStart; + /// Offset to free memory in buffer (used by gpu) + dword gpuHead; + /// Offset to end of data in buffer that is ready to read (read on cpu, set on gpu, might be behind gpuHeader) + dword cpuHead; + /// Flag for buffer overflow + dword overflow; + /// Total size of buffer + dword totalSize; + /// Padding needed because otherwise GPU overrides tail with cacheline flush + dword pad[11]; + /// Offset to begin of data in buffer + dword tail; +} DebugBufferHeader; + +enum InputDumpOperationType +{ + INPUT_DUMP_OP_NOP, + INPUT_DUMP_OP_BATCH, + INPUT_DUMP_OP_BUILD, + INPUT_DUMP_OP_UPDATE, + INPUT_DUMP_OP_CLONE, + INPUT_DUMP_OP_COMPACT, + INPUT_DUMP_OP_SERIALIZE, + INPUT_DUMP_OP_DESERIALIZE, + INPUT_DUMP_OP_END_BUFFER +}; + +// each operation starts with the same header structure and looks like this + +// some defined struct { <-----------------start +// OpHeader +// .... struct type specific data +// } +// ... 
auxilary data of variable len +// <-------------------------------------- end - indicated by endOfData +typedef struct OpHeader +{ + dword operationType; + dword endOfData; // offset to end of this primitive +} OpHeader; + +// header for batch operations +typedef struct BatchOpHeader +{ + OpHeader opHeader; +} BatchOpHeader; + +// interpretation for operationType INPUT_DUMP_OP_BATCH +typedef struct InputBatch +{ + BatchOpHeader header; + qword batchId; + dword vertexBufferDataSize; + dword firstContainedOpOffset; + + // layout of batch is as below, each line is 128B aligned: + + // + // InputBatch <-------------------------------- start + // optional: batchVertexData + // InputBuildDesc/InputCopy <------------------ start + firstContainedOpOffset + // optional: extra data of above token + // InputBuildDesc/InputCopy + // optional: extra data of above token + // ... + // InputBuildDesc/InputCopy + // optional: extra data of above token + // <-------------------------------------------- end = start + endOfData +} InputBatch; + +// for operationType: +// INPUT_DUMP_OP_BUILD, +// INPUT_DUMP_OP_UPDATE, +// followed by auxilary data of variable len +typedef struct InputBuild +{ + OpHeader header; + qword srcBvhPtr; + qword dstBvhPtr; + dword flags; + dword numGeos; + dword numInstances; + dword instArrayOfPtrs; +} InputBuild; + +// for operationType: +// INPUT_DUMP_OP_CLONE, +// INPUT_DUMP_OP_COMPACT, +// INPUT_DUMP_OP_SERIALIZE, +// +// Not for INPUT_DUMP_OP_DESERIALIZE! +typedef struct InputCopy +{ + OpHeader header; + qword srcBvhPtr; + qword dstBvhPtr; +} InputCopy; + +// for INPUT_DUMP_OP_DESERIALIZE +// decode for debug tools follows this format +typedef struct InputDeserialize +{ + OpHeader header; + qword dstBvhPtr; +} InputDeserialize; + +typedef struct InputBatchPtrs +{ + qword dumpDst; + qword globalDumpBuffer; + qword nonVertexDataStart; + dword vertexBuffersSize; + dword totalSize; +} InputBatchPtrs; + +enum OutputDumpOperationType +{ + OUTPUT_DUMP_OP_NOP, + OUTPUT_DUMP_OP_BATCH, + OUTPUT_DUMP_OP_DATA, + OUTPUT_DUMP_OP_END_BUFFER +}; + +// interpretation for operationType OUTPUT_DUMP_OP_BATCH +typedef struct OutputBatch { + BatchOpHeader header; + qword batchId; + dword firstContainedOpOffset; +} OutputBatch; + +// interpretation for operationType OUTPUT_DUMP_OP_DATA +typedef struct OutputData +{ + OpHeader header; + qword srcBvhPtr; +} OutputData; + +typedef struct OutputBatchPtrs +{ + qword dumpDst; + qword dataStart; + dword dataSize; + dword totalSize; +} OutputBatchPtrs; + +GRL_NAMESPACE_END(MISC) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/morton/morton_common.h b/src/intel/vulkan/grl/gpu/morton/morton_common.h new file mode 100644 index 00000000000..2beb7a1aff3 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/morton_common.h @@ -0,0 +1,245 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "common.h" + +#define MORTON_DEBUG_CHECKS 0 +#define MORTON_VERBOSE_LOG 0 + +GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift ) +{ +#if 0 // turn off, because current hierarchy build requires full sort + // Difference between max iterations needed for LSB sorting and + // number of iterations needed for LSB sorting without primIDs + // This indicates how many of first iterations would be skipped in LSB + return 8 - (8 - (shift >> 3)); +#else + return 0; +#endif +} + +typedef struct BuildRecordLocalMortonFlattener +{ + unsigned int leftChild; // global + unsigned int 
rightChild; // global + unsigned int rangeStart; // global + unsigned int local_parent_index__numItems; +} BuildRecordLocalMortonFlattener; + +// TODO: Currently sizeof UPerNodeData is 32, AABB struct allocates more data than needed and can be reduced +typedef union UPerNodeData { + float4 four_DWs; + BuildRecordLocalMortonFlattener buildRecord; + MortonFlattenedBoxlessNode boxlessNode; + struct AABB box; +} UPerNodeData; + +GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn) +{ + return bn.childOffset_type >> 6; +} + +GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn) +{ + return bn.childOffset_type & ((1<<6) -1); +} + +GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane) +{ + short lane_used = index % get_sub_group_size(); + short shift = (index / get_sub_group_size()) * get_sub_group_size(); + if (lane_used == lane) { + *arr |= (val << shift); + } +} + +GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane) +{ + short r = 0; + short lane_used = index % get_sub_group_size(); + short shift = (index / get_sub_group_size()) * get_sub_group_size(); + r = arr >> shift; + r = sub_group_broadcast(r, lane_used); + return r; +} + +GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst) +{ + if (lane < count) + { + dst[lane]=(ushort)(arr & 0xFFFF); + short hi_idx = lane + get_sub_group_size(); + if (hi_idx < count) { + dst[hi_idx] = (ushort)(arr >> 16); + } + } +} + + +GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane) +{ + if (lane < count) + { + *arr = src[lane]; + short hi_idx = lane + get_sub_group_size(); + if (hi_idx < count) { + *arr |= ((uint)(src[hi_idx])) << 16u; + } + } +} + +GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane) +{ + short lane_used = index % get_sub_group_size(); + short shift = (index / get_sub_group_size()) * get_sub_group_size(); + if (lane_used == lane) { + uint rem_val = (*arr) & (0xFFFF0000 >> shift); //calculate the ramaining other half in the uint + *arr = (val << shift) | rem_val; + } +} + +GRL_INLINE void SUBGROUP_refit_bottom_up_local( + uniform struct QBVHNodeN* globalNodeData, + uniform struct BackPointers* backPointers, + uniform uint treeletRootGlobalIndex, + uniform uint globalBaseForInternalNodes, + varying ushort lane, + uniform local union UPerNodeData* local_nodes, + varying uint sg_bu_startpoints, + uniform uint sg_bu_startpoints_cnt) +{ + if(sg_bu_startpoints_cnt == 0) + return; + + const uint head_lane = 0; + uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane); + + uniform uint prev_loc_index = 0; + uniform struct AABB child_aabb; // this carries reduced aabb between loop turns + + uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer; + + while (curNodeIndex != 0) + { + uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode); + uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode); + varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane; + + uint numChildren = BackPointer_GetNumChildren(backpointer); + if (child_loc_idx != prev_loc_index && + lane < numChildren) + { + child_aabb = local_nodes[child_loc_idx].box; + } + else if (lane >= numChildren) { + AABB_init(&child_aabb); + child_aabb.lower.w = as_float(0u); + } + + // TODO: perNode data could hold 7 dwords per node 
instead of 8 as long as we keep it in SLM + struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb); + reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 ); + + uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w)); + reduced_bounds.lower.w = as_float((uint)instMask); + uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0); + local uint* pbox = (local uint*)(local_nodes+ curNodeIndex); + if (lane < 8) + { + pbox[lane] = reduce_bounds_lane; + } + + uint global_node_idx = globalBaseForInternalNodes + curNodeIndex; + /* get bounds of all children from child nodes directly */ + struct QBVHNodeN* qnode = globalNodeData + global_node_idx; + subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false); + child_aabb = reduced_bounds; + uint parentIndex = BackPointer_GetParentIndex(backpointer); + + write_mem_fence(CLK_LOCAL_MEM_FENCE); + + if (lane == 0) + { + backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer)); + uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex; + uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3); + + /* set global back pointer */ + *InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer; + +#if MORTON_VERBOSE_LOG + printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n", + global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx); +#endif + } + + backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane); + prev_loc_index = curNodeIndex; + curNodeIndex = parentIndex; + + /* if all children got refitted, then continue */ + uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7; + uniform uint numChildrenTotal = (backpointer >> 3) & 0x7; + if (numChildrenRefitted != numChildrenTotal) + { + if(sg_bu_startpoints_cnt) + { + curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane); + backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer; + } + else + return; + } + } + + // process root of the treelet + { + +#if MORTON_DEBUG_CHECKS + if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n"); +#endif + + uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode); + varying uint child_loc_idx = lead_child_loc_offset + 0 + lane; + uint numChildren = BackPointer_GetNumChildren(backpointer); + + if (child_loc_idx != prev_loc_index && + lane < numChildren) + { + child_aabb = local_nodes[child_loc_idx].box; + } + else if (lane >= numChildren) { + AABB_init(&child_aabb); + child_aabb.lower.w = as_float(0u); + } + + // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM + uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w)); + uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode); + uint global_node_idx = treeletRootGlobalIndex; + uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset; + + /* get bounds of all children from child nodes directly */ + struct QBVHNodeN* qnode = globalNodeData + global_node_idx; + + subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, 
instMask, qnode, false); + + /* reset refit counter for next refit */ + if (lane == 0) + { + /* set global back pointer */ + *InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u); + + // TODO: Move AABBs to separate buffer, but for now communicate bottom-tip boxes through qnodes + +#if MORTON_VERBOSE_LOG + printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n", + curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt); +#endif + } + } +} diff --git a/src/intel/vulkan/grl/gpu/morton/phase0.cl b/src/intel/vulkan/grl/gpu/morton/phase0.cl new file mode 100644 index 00000000000..2fa91c214e1 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/phase0.cl @@ -0,0 +1,400 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "libs/lsc_intrinsics.h" +#include "morton/morton_common.h" + +GRL_INLINE void SUBGROUP_create_node_phase0( + uniform global struct Globals* globals, + uniform global struct BinaryMortonCodeHierarchy* bnodes, + uniform global char* bvh_mem, + uniform global uint *global_refit_startpoints, + uniform uint rID, + uniform local uint* local_numRecords, + uniform local uint* local_QNodeOffset, + uniform global struct BuildRecordMorton* records, + uniform struct BuildRecordMorton current, + uniform local uint* local_startpoints_num) +{ + uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + uniform const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; + uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + + varying ushort lane = get_sub_group_local_id(); + + /* initialize child array */ + uniform uint numChildren = 2; + varying struct BuildRecordMorton sg_children; + sg_children.items = 0; + sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild; + + if ( lane < numChildren ) + sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID ); + + /* fill QBVH6 node with up to 6 children */ + while ( numChildren < BVH_NODE_N6 ) + { + varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize; + if ( sub_group_all( sg_is_leaf ) ) + break; + + uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items ); + uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) ); + uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild ); + + varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild; + + if ( lane == numChildren || lane == bestChild ) + { + sg_children.nodeID = nodeID; + sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID ); + } + + numChildren++; + } + + const uint current_index = current.current_index; + struct QBVHNodeN* qnode = nodeData + current_index; + SUBGROUP_QBVHNodeN_setChildIncr1( qnode ); + + uniform uint global_offset; + uniform uint child_node_offset; + + // Check if all children will be roots for the local subgtrees in phase1. If so we keep the node ids to be later + // used in global refit after phase1 + varying uchar is_children_root = (lane < numChildren) ? 
(sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0; + uniform uchar children_roots_num = sub_group_reduce_add(is_children_root); + + if ( lane == 0 ) + { + child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren); + + /* create node, but to not set bounds yet as these get calculated during refit */ + QBVH6Node_set_type( qnode, BVH_INTERNAL_NODE ); + QBVH6Node_set_offset( qnode, (global struct QBVHNodeN*)(bvh_mem + child_node_offset) ); + /* set back pointers */ + uint backpointer = (current.parent_index << 6) | (numChildren << 3); + + global_offset = atomic_add_local( local_numRecords, numChildren - 1 ); + +#if MORTON_VERBOSE_LOG + printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n", + rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren); +#endif + + if(children_roots_num == numChildren) + { + uint startpoints_offset = atomic_inc_local( local_startpoints_num ); + global_refit_startpoints[startpoints_offset] = current_index; + } + else + { + backpointer += children_roots_num; + } + + *InnerNode_GetBackPointer(backPointers, current_index) = backpointer; + } + + child_node_offset = sub_group_broadcast( child_node_offset, 0 ); + global_offset = sub_group_broadcast( global_offset, 0 ); + + uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset); + + sg_children.current_index = childNodes - nodeData + lane; + sg_children.parent_index = current_index; + + if ( lane < numChildren ) + { + uint write_position = (lane == 0) ? rID : global_offset + lane - 1; + records[write_position] = sg_children; + } +} + + +GRL_INLINE void SUBGROUP_create_node_phase0_local_sync( + uniform global struct Globals* globals, + uniform global struct BinaryMortonCodeHierarchy* bnodes, + uniform global char* bvh_mem, + uniform uint rID, + uniform local uint* local_numRecords, + uniform local uint* local_QNodeOffset, + uniform global struct BuildRecordMorton* records, + uniform struct BuildRecordMorton current, + uniform local uint* local_p0_total, + uniform global struct MortonFlattenedBoxlessNode *boxless_nodes, + uniform uint nodeDataStart) +{ + uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + uniform const uint rootNodeOffset = bvh->rootNodeOffset; + uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + + varying ushort lane = get_sub_group_local_id(); + + /* initialize child array */ + uniform uint numChildren = 2; + varying struct BuildRecordMorton sg_children; + sg_children.items = 0; + sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild; + + if ( lane < numChildren ) + sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID ); + + /* fill QBVH6 node with up to 6 children */ + while ( numChildren < BVH_NODE_N6 ) + { + varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize; + if ( sub_group_all( sg_is_leaf ) ) + break; + + uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items ); + uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) ); + uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild ); + + varying uint nodeID = (lane == bestChild) ? 
bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild; + + if ( lane == numChildren || lane == bestChild ) + { + sg_children.nodeID = nodeID; + sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID ); + } + + numChildren++; + } + + const uint current_index = current.current_index; + uniform uint global_offset; + uniform uint child_node_offset; + + // Check if all children will be roots for the local subgtrees in phase1. If so we keep the node ids to be later + // used in global refit after phase1 + varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0; + uniform uchar rootMask = sub_group_reduce_or_N6(is_children_root << lane); + uniform uchar children_roots_num = sub_group_reduce_add(is_children_root); + + if ( lane == 0 ) + { + child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren); + + /* Do not create qnodes here */ + uint backpointer = (current.parent_index << 6) | (numChildren << 3); + + global_offset = atomic_add_local( local_numRecords, numChildren - 1 ); + +#if MORTON_VERBOSE_LOG + printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, nodeDataStart: %d\n", + rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren, nodeDataStart); +#endif + + MortonFlattenedBoxlessNode flattened_node; + + if(children_roots_num != numChildren) + backpointer += children_roots_num; + + flattened_node.binary_hierarchy_index = (current_index << 6) | rootMask; + + uint loc_id = atomic_inc_local( local_p0_total ); + + flattened_node.childOffset_type = ((((child_node_offset - nodeDataStart * 64) / 64) - current_index) << 6) | BVH_INTERNAL_NODE; + flattened_node.backPointer = backpointer; + + //TODO: change this writes to L1WB or streaming + boxless_nodes[loc_id] = flattened_node; + + *InnerNode_GetBackPointer(backPointers, current_index) = backpointer; + } + + child_node_offset = sub_group_broadcast( child_node_offset, 0 ); + global_offset = sub_group_broadcast( global_offset, 0 ); + + uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset); + + sg_children.current_index = childNodes - nodeData + lane; + sg_children.parent_index = current_index; + + if ( lane < numChildren ) + { + uint write_position = (lane == 0) ? rID : global_offset + lane - 1; + records[write_position] = sg_children; + } +} + +/* + + In this phase a single large work group performs the construction of + the top of the BVH and creates a build record array. + + Two varians of this kernel: + 1. Refit with global synchronization - Used for big bvh, where number of allocated nodes will not fit + in SLM in phase2. Phase0 creates qnodes in bvh, and provides startpoints for bottom up phase + that is executed after phase1. This refit uses global synchronizations and mem_fence_gpu_invalidate + that is not effective. + 2. Refit with local synchronization - Flattened boxless nodes are passed via global memory, along with + number of created nodes. Phase0 does not create qnodes in bvh, it is done in phase2 during refit. + In phase2, flattened boxless nodes are moved to SLM, along with bounding boxes from phase1. + Refit is performed only with local synchronization. 
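+
+   For reference, the node records exchanged between the phases are bit-packed
+   dwords (see morton_common.h and the phase0 kernels below):
+
+     backPointer      = (parentIndex << 6) | (numChildren << 3) | numChildrenRefitted
+     childOffset_type = (childOffset << 6) | nodeType
+
+   e.g. MortonFlattenedBoxlessNode_GetType() masks with ((1 << 6) - 1),
+   MortonFlattenedBoxlessNode_GetChildOffset() shifts right by 6, and the refit
+   code reads the refitted-children counter from the low 3 bits of backPointer.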
+ +*/ + +__attribute__((reqd_work_group_size(512, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +parallel_build_phase0(global struct Globals *globals, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem, + global uint *global_refit_startpoints) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); + + /* a queue of build records in global memory */ + global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); + local uint local_numRecords; + local uint local_QNodeOffset; + local uint local_startpoints_num; + + /* initialize first build record */ + if (get_local_id(0) == 0) + { + /* allocate root node */ + uint root_node_offset = 64*bvh->nodeDataCur; + global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset); + + //assert(root_node_offset == 0); + records[0].nodeID = globals->binary_hierarchy_root; + records[0].items = globals->numPrimitives; + records[0].current_index = rootNode - nodeData; + records[0].parent_index = -1; + + local_numRecords = 1; + local_QNodeOffset = root_node_offset + 64; + local_startpoints_num = 0; + + mem_fence_workgroup_default(); + } + + uint num_records = 1; + + /* terminate when all subtrees are under size threshold */ + while(true) + { + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + /* all work items in the work group pick a subtree to build */ + for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() ) + { + /* small subtrees will get built in next phase */ + if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives + continue; + + /* create QBVH node */ + SUBGROUP_create_node_phase0(globals, bnodes, bvh_mem, global_refit_startpoints, ID, &local_numRecords, &local_QNodeOffset, + records, records[ID], &local_startpoints_num); + } + + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + mem_fence_workgroup_default(); + uint old_num_records = num_records; + num_records = local_numRecords; + if( old_num_records == num_records ) + break; + + } + + /* remember number of build records for next phase */ + if (get_local_id( 0 ) == 0) + { + globals->numBuildRecords = local_numRecords; + globals->p0_created_num = local_startpoints_num; + bvh->nodeDataCur = local_QNodeOffset / 64; + +#if MORTON_VERBOSE_LOG + printf("PHASE_0: allocated %d nodes. 
globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num); +#endif + } +} + +__attribute__((reqd_work_group_size(512, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +parallel_build_phase0_local_sync(global struct Globals *globals, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem, + global struct MortonFlattenedBoxlessNode *boxless_nodes) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); + uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64; + + /* a queue of build records in global memory */ + global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); + local uint local_numRecords; + local uint local_QNodeOffset; + local uint local_p0_total; + + /* initialize first build record */ + if (get_local_id(0) == 0) + { + /* allocate root node */ + uint root_node_offset = 64*bvh->nodeDataCur; + global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset); + + //assert(root_node_offset == 0); + records[0].nodeID = globals->binary_hierarchy_root; + records[0].items = globals->numPrimitives; + records[0].current_index = rootNode - nodeData; + records[0].parent_index = -1; + + local_numRecords = 1; + local_QNodeOffset = root_node_offset + 64; + local_p0_total = 0; + + mem_fence_workgroup_default(); + } + + uint num_records = 1; + + /* terminate when all subtrees are under size threshold */ + while(true) + { + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + /* all work items in the work group pick a subtree to build */ + for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() ) + { + /* small subtrees will get built in next phase */ + if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives + continue; + + /* create QBVH node */ + SUBGROUP_create_node_phase0_local_sync(globals, bnodes, bvh_mem, ID, &local_numRecords, &local_QNodeOffset, records, + records[ID], &local_p0_total, boxless_nodes, nodeDataStart); + } + + mem_fence_workgroup_default(); + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + uint old_num_records = num_records; + num_records = local_numRecords; + if( old_num_records == num_records ) + break; + + } + + /* remember number of build records for next phase */ + if (get_local_id( 0 ) == 0) + { + globals->numBuildRecords = local_numRecords; + bvh->nodeDataCur = local_QNodeOffset / 64; + + globals->p0_allocated_num = BVHBase_numNodes(bvh); + globals->p0_created_num = local_p0_total; + +#if MORTON_VERBOSE_LOG + printf("PHASE_0_LOCAL_SYNC: allocated %d nodes. 
globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->global_refit_startpoints); +#endif + } +} diff --git a/src/intel/vulkan/grl/gpu/morton/phase1.cl b/src/intel/vulkan/grl/gpu/morton/phase1.cl new file mode 100644 index 00000000000..6a1dd2aa44b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/phase1.cl @@ -0,0 +1,785 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "libs/lsc_intrinsics.h" +#include "morton/morton_common.h" + +// caution rec.local_parent_index__numItems needs to have high 16bits filled afterwards; +BuildRecordLocalMortonFlattener TranslateToLocalRecord(struct BinaryMortonCodeHierarchy srcRec) +{ + BuildRecordLocalMortonFlattener rec; + rec.leftChild = srcRec.leftChild; + rec.rightChild = srcRec.rightChild; + rec.rangeStart = srcRec.range.start; + rec.local_parent_index__numItems = (srcRec.range.end - srcRec.range.start) + 1; + return rec; +} + +GRL_INLINE BuildRecordLocalMortonFlattener MortonFlattenedBoxlessNode_reinterpret_as_BR(MortonFlattenedBoxlessNode boxless) +{ + BuildRecordLocalMortonFlattener rec; + rec.leftChild = boxless.binary_hierarchy_index; + rec.rightChild = boxless.childOffset_type; + rec.rangeStart = boxless.backPointer; + rec.local_parent_index__numItems = 0; + return rec; +} + +GRL_INLINE void SUBGROUP_create_boxless_node_phase1( + uniform global struct Globals* globals, + uniform global struct BinaryMortonCodeHierarchy* bnodes, + uniform global char* bvh_mem, + uniform BuildRecordLocalMortonFlattener currentRecord, + uniform uint currQnodeLocalId, //local index for flattened qnoode, don't mix this with nodeIndex that is in morton build record + uniform local uint* local_numRecords, + uniform uint tictoc, + uniform uint* sg_bu_startpoint_arr, + uniform uint* sg_bu_startpoint_cnt, + uniform uint parentOfRoot, + uniform bool processRoot, + uniform UPerNodeData* nodeData) +{ + varying ushort lane = get_sub_group_local_id(); + + /* initialize child array */ + uniform uint numChildren = 2; + varying struct BuildRecordLocalMortonFlattener sg_children; + sg_children.local_parent_index__numItems = 0; + + uint binary_hierarchy_child_idx = (lane == 0) ? currentRecord.leftChild : currentRecord.rightChild; + if (lane >= numChildren) binary_hierarchy_child_idx = 1 << 31; + + sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, binary_hierarchy_child_idx)); + + /* fill QBVH6 node with up to 6 children */ + while (numChildren < BVH_NODE_N6) + { + // we dont have to do "local_parent_index__numItems & 0xFFFF" because local_parent_index part is 0 here at this point + uint childNumItems = sg_children.local_parent_index__numItems; + varying bool sg_is_leaf = childNumItems <= cfg_minLeafSize; + if (sub_group_all(sg_is_leaf)) { break; } + + uniform uint bestItems = sub_group_reduce_max_N6(childNumItems); + uniform ushort bestChild = ctz(intel_sub_group_ballot(childNumItems == bestItems)); + varying uint leftOfBest = sg_children.leftChild; // val important only for (lane == bestChild), not valid for other lanes + uniform uint rightOfBest = sub_group_broadcast(sg_children.rightChild, bestChild); + + varying uint nodeID = (lane == bestChild) ? 
leftOfBest : rightOfBest; + + if (lane == numChildren || lane == bestChild) + { + sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, nodeID)); + } + + numChildren++; + } + + uniform uint global_offset; + uniform uint child_node_index; + + bool isFatleafChild = (sg_children.local_parent_index__numItems <= cfg_minLeafSize) && (lane < numChildren); + uint numFatleafChildren = popcount(intel_sub_group_ballot(isFatleafChild)); + + if (lane <= numChildren) { + uint writeIDX = 0; + + if (lane == numChildren) + { + /* create nodes in local structure, to be used later in the bottom up to create nodes in actual bvh */ + MortonFlattenedBoxlessNode flattened_node; + uint parentIDX; + + if (processRoot) + { + *local_numRecords = numChildren + 1; + child_node_index = 1; + writeIDX = 0; + flattened_node.binary_hierarchy_index = 0xFFFFFFFF; + flattened_node.childOffset_type = (1 << 6) | BVH_INTERNAL_NODE; + parentIDX = parentOfRoot; + } + else + { + uint shift = (16 * tictoc); + uint mask = 0xFFFF; + uint atomicAddVal = numChildren << shift; + child_node_index = atomic_add_local(local_numRecords, atomicAddVal); + sub_group_barrier(0); + writeIDX = currQnodeLocalId; + parentIDX = currentRecord.local_parent_index__numItems >> 16; + flattened_node.binary_hierarchy_index = 0xFFFFFFFF; + sub_group_barrier(0); + child_node_index = (child_node_index >> 16) + (child_node_index & mask); + flattened_node.childOffset_type = ((child_node_index - currQnodeLocalId) << 6) | BVH_INTERNAL_NODE; + } + +#if MORTON_VERBOSE_LOG + printf("wg %d: SUBGROUP_create_boxless_node_phase1: writeIDX %d, child_node_index %d - %d\n", get_group_id(0), writeIDX, child_node_index, child_node_index + numChildren); +#endif + flattened_node.backPointer = (parentIDX << 6) | (numChildren << 3) | numFatleafChildren; + sg_children = MortonFlattenedBoxlessNode_reinterpret_as_BR(flattened_node); + } + + child_node_index = sub_group_broadcast(child_node_index, numChildren); + + if (lane != numChildren) + { + writeIDX = child_node_index + lane; + sg_children.local_parent_index__numItems |= currQnodeLocalId << 16; + } + + nodeData[writeIDX].buildRecord = sg_children; + } + + if (numFatleafChildren == numChildren) { + uint arridx = *sg_bu_startpoint_cnt; + // GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane) + set_2xSG_arr_first_write(arridx, sg_bu_startpoint_arr, (ushort)currQnodeLocalId, lane); + *sg_bu_startpoint_cnt = arridx + 1; + } +} + +// TODO_OPT: Consider having phase 0 bucket the build records by number of primitives, and dispatch different variants +// of this kernel with different WG sizes. 
There are many records produced that generate only 1 or 2 subtrees, so 8 SGs is +// probably often wasted +GRL_INLINE void phase1_process_fatleaf( + uint globalBaseForInternalNodes, // for root node this is indexOfRoot + uint globalParent , // for root this should be parentOfRoot + bool isInstancePrimLeafType, // + uint leafPrimType, // + uint leafStride, // + global struct QBVHNodeN* nodeData, // per group + uint nodeDataStart, // + struct AABB* primref, // + BackPointers* backPointers, // + global struct MortonCodePrimitive* mc,// + uint nodesToLeafsGap, // + local union UPerNodeData* perNodeData,// + bool processRoot, // + short localNodeId, // + BuildRecordLocalMortonFlattener fatleafRecord, // per node + uint primID ) // +{ + uint lane = get_sub_group_local_id(); + uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF); + uniform uint mcID = fatleafRecord.rangeStart; + uint pseudolane = lane < numChildren ? lane : 0; + varying struct AABB sg_bounds = primref[primID]; + + uint local_parent_idx = (fatleafRecord.local_parent_index__numItems >> 16); + uint globalNodeId = globalBaseForInternalNodes + localNodeId; + uniform global struct QBVHNodeN* qnode = nodeData + globalNodeId; + + uint children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId; + + { + /* For all primitives in a fat leaf we store a back + * pointer. This way we can modify the fat leaf node at leaf construction time. */ + uint back_pointer = globalNodeId + nodeDataStart; + /* Store back pointer and primID inside morton code array to + * be later used by leaf creation. */ + mc[mcID + pseudolane].index_code = ((ulong)back_pointer) << 32 | (ulong)primID; + } + + struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&sg_bounds); + reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 ); + + uint8_t instMask; + if (isInstancePrimLeafType) + { + instMask = lane < numChildren ? PRIMREF_instanceMask(&sg_bounds) : 0; + subgroup_setInstanceQBVHNodeN(children_offset, &sg_bounds, numChildren, qnode, instMask); + instMask = sub_group_reduce_or_N6(instMask); + } + else + { + instMask = 0xFF; + subgroup_setQBVHNodeN_setFields_reduced_bounds(children_offset, leafPrimType, &sg_bounds, numChildren, instMask, qnode, false, reduce_bounds); + } + + reduce_bounds.lower.w = as_float((uint)instMask); + uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduce_bounds, 0); + local uint* boxUint = (local uint*)(perNodeData + localNodeId); + if (get_sub_group_size() == 8 || lane < 8) + { + boxUint[lane] = reduce_bounds_lane; + uint globalParentIdx; + if (processRoot) { + // for root, treeletRootGlobalIndex is index of rootsParent in global space + globalParentIdx = globalParent; + } + else { + // for non root, raw_parent_idx is in local space + globalParentIdx = (local_parent_idx > 0) ? 
(globalBaseForInternalNodes + local_parent_idx) : globalParent; + } + if (lane == 0) { + *InnerNode_GetBackPointer(backPointers, globalNodeId) = (globalParentIdx << 6) | (numChildren << 3); + } + } +} + +GRL_INLINE void perform_phase1(global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem, + local union UPerNodeData* perNodeData, + local uint* local_records_head, + local uint* local_globalOffsetForNodes, + BuildRecordLocalMortonFlattener rootRecord, + uint treeletRootGlobalIndex, + uint parentOfRootIndex, + const uint leafPrimType, + bool isInstancePrimLeafType) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + varying ushort lane = get_sub_group_local_id(); + + // array that will keep 2x8 shorts indices + varying uint sg_fatleaf_array = 0x0; + uniform uint8_t sg_fatleaf_cnt = 0; + /* terminate when all subtrees are leaves */ + + uint subgroupId = get_sub_group_id(); + uint ID = subgroupId; + + uint sg_bu_startpoints = 0; + uniform uint sg_bu_startpoints_cnt = 0; + const uint shift_mask = globals->shift_mask; + + const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64; + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + global struct QBVHNodeN* nodeData = BVHBase_nodeData(bvh); + + uint* pLeafStart = (!isInstancePrimLeafType) ? &bvh->quadLeafStart : &bvh->instanceLeafStart; + uint leafStart = *pLeafStart; + uint leafStride = (!isInstancePrimLeafType) ? 1 : (sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode)); + uint nodesToLeafsGap = leafStart - nodeDataStart; + + if (ID == 0) + { + BuildRecordLocalMortonFlattener current = rootRecord; + + if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6) + { + *local_records_head = 1; +#if MORTON_DEBUG_CHECKS + if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n"); +#endif + BuildRecordLocalMortonFlattener fatleafRecord = current; + uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF); + uint pseudolane = lane < numChildren ? 
lane : 0; + uniform const uint mcID = fatleafRecord.rangeStart; + varying uint primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask); + + phase1_process_fatleaf( + treeletRootGlobalIndex, parentOfRootIndex, isInstancePrimLeafType, leafPrimType, leafStride, + nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData, + true, 0, fatleafRecord, primID); + } + else + { +#if MORTON_VERBOSE_LOG + if (get_local_id(0) == 0) { printf("wg %d perform_phase1: starting collapsing subtree with root at node %d \n", get_group_id(0), rootIndex); } +#endif + //printf("local_records_head = %d\n", *local_records_head); + SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, 0, &sg_bu_startpoints, &sg_bu_startpoints_cnt, parentOfRootIndex, true, perNodeData); + *local_globalOffsetForNodes = treeletRootGlobalIndex; + } + + ID += get_num_sub_groups(); + } + + uniform uint priv_records_tail = 1; + + /* wait for all work items to have updated local_records array */ + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + uniform uint priv_records_head = *local_records_head & 0xFFFF; + treeletRootGlobalIndex = *local_globalOffsetForNodes; // propagated from subgroup 1 + uniform uint priv_records_tail_prev = priv_records_tail; + uniform uint other_records_head = priv_records_head; + + uint ticToc = 1; + + if (priv_records_head == priv_records_tail) + { + return; + } + else + { + do + { + for (; ID < priv_records_head; ID += get_num_sub_groups()) + { + BuildRecordLocalMortonFlattener current = (perNodeData[ID].buildRecord); + + if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6) + { + set_2xSG_arr_first_write(sg_fatleaf_cnt++, &sg_fatleaf_array, ID, lane); +#if MORTON_VERBOSE_LOG + if (lane == 0)printf("wg %d, sg %d, perform_phase1: node ID %d is fatleaf \n", get_group_id(0), get_sub_group_id(), ID); +#endif +#if MORTON_DEBUG_CHECKS + if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n"); +#endif + } + else + { + SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, ticToc, &sg_bu_startpoints, &sg_bu_startpoints_cnt, 0, 0, perNodeData); + } + } + + priv_records_tail = priv_records_head; + /* wait for all work items to have updated local_records array */ + work_group_barrier(CLK_LOCAL_MEM_FENCE); + { + uint records_as_in_mem = *local_records_head; + priv_records_head = (records_as_in_mem >> (16 * ticToc)) & 0xFFFF; + uint other_records_head_temp = priv_records_head; + priv_records_head += other_records_head; + other_records_head = other_records_head_temp; + ticToc = ticToc ^ 1; +#if MORTON_VERBOSE_LOG + if(get_local_id(0) == 0)printf("wg %d, perform_phase1: priv_records_tail %d, priv_records_head %d, records_as_in_mem %x\n", get_group_id(0), get_sub_group_id(), priv_records_tail, priv_records_head, records_as_in_mem); +#endif + } + } while (priv_records_tail != priv_records_head); // get out of the loop if the tail reached the head + } + + bool atomicNodeAllocation = treeletRootGlobalIndex > 0; + bool atomicNodeAllocationProduce = (get_sub_group_id() + lane == 0) && atomicNodeAllocation; + uint singleTreeletBumpBVHnodeCnt = (!atomicNodeAllocation && (get_sub_group_id() + lane == 0)) ? 
nodeDataStart + priv_records_tail : 0; + + uniform uint globalBaseForInternalNodes = 0; + + // we distinguish multi treelet from single treelets here by looking on our treeletRootGlobalIndex + // if treelets root is whole tree root (treeletRootGlobalIndex==0) then we are the only treelet so + // there's no need to synchronize multiple treelets nodes allocations with atomics. + if (atomicNodeAllocationProduce) + { + *local_globalOffsetForNodes = allocate_inner_nodes(bvh, priv_records_tail - 1); + } + + // because, root is allocated elsewhere, and first node placed in global mem is node with local index 1 + // mapping local to global: + // local space global space + // [0] - treelet root [treeletRootGlobalIndex] + // ... possibly very long distance ... + // [1] - first non root [globalBaseForInternalNodes + 1] - this index is returned by atomic allocator above + // [2] - first [globalBaseForInternalNodes + 2] + // ... + // [numToAllocate] - last node [globalBaseForInternalNodes + 3] + if (atomicNodeAllocation) + { + work_group_barrier(CLK_LOCAL_MEM_FENCE); + globalBaseForInternalNodes = *local_globalOffsetForNodes -(nodeDataStart+1); + } + +#if MORTON_VERBOSE_LOG + if (get_local_id(0) == 0) { printf("wg %d perform_phase1: globalBaseForInternalNodes %d, num local nodes %d\n", get_group_id(0), globalBaseForInternalNodes, priv_records_tail - 1); } +#endif + + if (sg_fatleaf_cnt) + { + short localNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane); + //if (localNodeId >= MORTON_BUILDER_SUBTREE_THRESHOLD * 2) continue; + //if(local_startpoints_cnt > 1) return; + BuildRecordLocalMortonFlattener fatleafRecord = perNodeData[localNodeId].buildRecord; + + varying uint primID; + { + uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF); + uint pseudolane = lane < numChildren ? lane : 0; + uniform const uint mcID = fatleafRecord.rangeStart; + primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask); + } + + // process fatleafs, and store their boxes to SLM + // also put startpoints for bottom up + //uint fatleaf_cnt = *local_startpoints_cnt; + while (sg_fatleaf_cnt-- > 1) + { + short nextLocalNodeId = get_from_2xSG_arr(sg_fatleaf_cnt-1, sg_fatleaf_array, lane); + BuildRecordLocalMortonFlattener nextfatleafRecord = perNodeData[nextLocalNodeId].buildRecord; + varying uint nextPrimId; + + { + uint numChildren = (nextfatleafRecord.local_parent_index__numItems & 0xFFFF); + uint pseudolane = lane < numChildren ? lane : 0; + uniform const uint mcID = nextfatleafRecord.rangeStart; + nextPrimId = (uint)(mc[mcID + pseudolane].index_code & shift_mask); + } + + phase1_process_fatleaf( + globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride, + nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData, + false, localNodeId, fatleafRecord, primID); + + fatleafRecord = nextfatleafRecord; + localNodeId = nextLocalNodeId; + primID = nextPrimId; + } + + phase1_process_fatleaf( + globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride, + nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData, + false, localNodeId, fatleafRecord, primID); + } + +#if 0 + // put collected bottom-up startpoints to wg shared array to later distribute the work evenly accross the groups. 
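+    // In this disabled path, each subgroup's lane 0 would reserve a contiguous slot range with
+    // atomic_add_local and broadcast the base offset to its other lanes before unpacking its
+    // collected startpoints. For example, with 10 collected startpoints and 4 subgroups, the
+    // redistribution in the second disabled block below gives base chunks of 10 / 4 = 2 plus a
+    // remainder of 2, so subgroups 0..3 read 3, 3, 2 and 2 startpoints at offsets 0, 3, 6 and 8.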
+ { + ushort myStartpointWriteSite = 0; + + if (lane == 0) + { + myStartpointWriteSite = atomic_add_local((local uint*)local_startpoints_cnt, (ushort)sg_bu_startpoints_cnt); + } + myStartpointWriteSite = sub_group_broadcast(myStartpointWriteSite, 0); + + unpack_from_2xSG_arr(sg_bu_startpoints_cnt, sg_bu_startpoints, lane, local_startpoints_arr + myStartpointWriteSite); + } +#endif + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + // distribute bottom-up startpoints +#if 0 + { + short sp_count_to_divide = (*local_startpoints_cnt); + + //calculate the chunk for each sg. + sg_bu_startpoints_cnt = sp_count_to_divide / get_num_sub_groups(); + uint sg_bu_startpoints_cnt_reminder = sp_count_to_divide % get_num_sub_groups(); + + uint myReadSite = get_sub_group_id() * sg_bu_startpoints_cnt; + if (get_sub_group_id() < sg_bu_startpoints_cnt_reminder) { + //from the reminder elements if sg idx is < sg_bu_startpoints_cnt_reminder then sg gets one extra idx + // and all sgs before it also have one extra + myReadSite += get_sub_group_id(); + sg_bu_startpoints_cnt++; + } + else + { + // all reminder elements are consummed by previous sgs + myReadSite += sg_bu_startpoints_cnt_reminder; + } + + pack_from_2xSG_arr(local_startpoints_arr + myReadSite, sg_bu_startpoints_cnt, &sg_bu_startpoints, lane); + } +#endif + + SUBGROUP_refit_bottom_up_local(nodeData, backPointers, treeletRootGlobalIndex, globalBaseForInternalNodes, lane, perNodeData, sg_bu_startpoints, sg_bu_startpoints_cnt); + + if (singleTreeletBumpBVHnodeCnt) + { + bvh->nodeDataCur = singleTreeletBumpBVHnodeCnt; + } +} + +GRL_INLINE void update_empty_blas(global struct BVHBase* bvh, uint leafPrimType) +{ + if (get_sub_group_id() == 0 ) + { + global struct QBVHNodeN* qnode = BVHBase_nodeData(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + //set required fields to mark that blas is empty + uint k = (get_sub_group_local_id() < BVH_NODE_N6) ? get_sub_group_local_id() : 0; + qnode->type = leafPrimType; + qnode->instMask = 0; + qnode->qbounds.lower_x[k] = 0x80; + qnode->qbounds.upper_x[k] = 0; + + *InnerNode_GetBackPointer(backPointers, 0) = (((uint)-1) << 6); + } +} + +/* + + POSTSORT PHASE1: + Two kernels here, selected by MORTON_BUILDER_SUBTREE_THRESHOLD. + 1. parallel_build_phase1_Indirect_SG - record[0] is set to the subtree tip + 2. 
parallel_build_phase1_Indirect_global_root - record[0] is set to the bvh root (no phase2 needed afterwards) + +*/ + +__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_phase1_Indirect_SG( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + const uint leafPrimType = globals->leafPrimType; + + //special case for empty blas + if(globals->numPrimitives == 0) + { + bvh->nodeDataCur = BVH_ROOT_NODE_OFFSET / 64 + 1; + update_empty_blas(bvh, leafPrimType); + return; + } + + local union UPerNodeData perNodeData[(MORTON_BUILDER_SUBTREE_THRESHOLD * 2) -1]; + local uint local_records_head; + // Two separate SLM variables for local_globalOffsetForNodes to remove one of the barriers + local uint local_globalOffsetForNodes, local_globalOffsetForNodes2; + + uint rootIndex = 0; + uint parentOfRoot = 0; + BuildRecordLocalMortonFlattener rootBuildRecord; + + /* add start build record to local stack */ + if (get_sub_group_id() == 0 ) + { + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64 * bvh->quadLeafStart); + uint recordID = get_group_id(0); + struct BuildRecordMorton mortonGlobalRecord = records[recordID]; + + rootBuildRecord = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, mortonGlobalRecord.nodeID)); + + parentOfRoot = mortonGlobalRecord.parent_index; + rootIndex = mortonGlobalRecord.current_index; + +#if MORTON_VERBOSE_LOG + printf("P1_STARTPOINTS: current_index: %d, buildRecord.numItems: %d, buildRecord.binary_hierarchy_index: %d, buildRecord.local_parent_index: %d\n", + local_globalOffsetForNodes, buildRecord.numItems, buildRecord.binary_hierarchy_index, buildRecord.local_parent_index); +#endif + } + + if (leafPrimType == NODE_TYPE_INSTANCE) + { + perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, + &local_records_head, &local_globalOffsetForNodes, + rootBuildRecord, rootIndex, parentOfRoot, NODE_TYPE_INSTANCE, true); + } + else + { + perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, + &local_records_head, &local_globalOffsetForNodes, + rootBuildRecord, rootIndex, parentOfRoot, leafPrimType, false); + } + +} + +__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_phase1_Indirect_global_root( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + const uint leafPrimType = globals->leafPrimType; + const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64; + + bvh->nodeDataCur = nodeDataStart + 1; + + //special case for empty blas + if(globals->numPrimitives == 0) + { + update_empty_blas(bvh, leafPrimType); + return; + } + + local union UPerNodeData perNodeData[MORTON_BUILDER_SUBTREE_THRESHOLD * 2 - 1]; + local uint local_records_head; + local uint local_globalOffsetForNodes; + + BuildRecordLocalMortonFlattener rootBuildRecord; + + if (get_sub_group_id() == 0 ) + { + struct BinaryMortonCodeHierarchy binaryNode = BinaryMortonCodeHierarchy_getEntry(bnodes, globals->binary_hierarchy_root); + + rootBuildRecord = TranslateToLocalRecord(binaryNode); + + local_globalOffsetForNodes = 0; + } 
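+    // The duplicated calls below mirror parallel_build_phase1_Indirect_SG; presumably the branch
+    // exists so that leafPrimType / isInstancePrimLeafType are compile-time constants inside
+    // perform_phase1 (e.g. the instance path uses a leaf stride of
+    // sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode) instead of 1).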
+ + if (leafPrimType == NODE_TYPE_INSTANCE) + { + perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, + &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, NODE_TYPE_INSTANCE, true); + } + else + { + perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, + &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, leafPrimType, false); + + } +} + +#if 0 +GRL_INLINE void +DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem, + uint startID, uint endID, + local uint* local_numRecords, + local uint* local_numRecordsOld, + local struct BuildRecordMorton* local_records +) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64); + + /* iterate over all subtrees this workgroup should build */ + for ( uint recordID = startID; recordID < endID; recordID++ ) + { + /* add start build record to local stack */ + if ( get_local_id( 0 ) == 0 ) + { + local_records[0] = records[recordID]; + *local_numRecords = 1; + *local_numRecordsOld = 0; + } + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + /* terminate when all subtrees are leaves */ + while ( *local_numRecords != *local_numRecordsOld ) + { + /* remember the old number of build records to detect later + * whether we are done */ + if ( get_local_id( 0 ) == 0 ) + { + *local_numRecordsOld = *local_numRecords; + } + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + /* all work items in the sub group pick a subtree to build */ + for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) ) + { + /* ignore small subtrees */ + if ( local_records[ID].items <= BVH_NODE_N6 ) + continue; + + /* create QBVH node */ + create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] ); + } + + /* wait for all work items to have updated local_records array */ + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + } + + const uint shift_mask = globals->shift_mask; + const uint leafPrimType = globals->leafPrimType; + const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; + BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + + /* create all fat leaf nodes and initiate refit */ + for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) ) + { + struct BuildRecordMorton current = local_records[ID]; + const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID ); + + global struct QBVHNodeN* qnode = nodeData + current.current_index; + + /* get bounds of all children of the fat leaf node */ + struct AABB bounds[BVH_NODE_N6]; + for ( uint i = 0; i < current.items; i++ ) + { + /* get primID and bounds of primitive */ + const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask); + bounds[i] = primref[primID]; + + /* For all primitives in a fat leaf we store a back + * pointer. This way we can modify the fat leaf node at leaf construction time. */ + const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem; + + /* Store back pointer and primID inside morton code array to + * be later used by leaf creation. 
*/ + mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID; + } + + /* update fat leaf node */ + QBVHNodeN_setType( qnode, leafPrimType ); + global void* offset; + if ( leafPrimType != BVH_INSTANCE_NODE ) + { + offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad ); + QBVHNodeN_setChildIncr1( qnode ); + } + else + { + offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf ); + QBVHNodeN_setChildIncr2( qnode ); + } + QBVH6Node_set_offset( qnode, offset ); + QBVHNodeN_setBounds( qnode, bounds, current.items ); + + /* set back pointers for fat leaf nodes */ + *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3); + + /* bottom up refit */ + refit_bottom_up( qnode, bvh, bounds, current.items ); + } + } +} + +/* + + This phase takes the build records calculated in phase0 as input and + finished the BVH construction for all these subtrees. + +*/ +__attribute__((reqd_work_group_size(8, 1, 1))) +old_parallel_build_phase1(global struct Globals *globals, + global struct MortonCodePrimitive *mc, + global struct AABB *primref, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); + + /* a queue of build records */ + local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; + local uint local_numRecords; + local uint local_numRecordsOld; + + /* construct range of build records that each sub group will process */ + const uint numRecords = globals->numBuildRecords; + const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0); + + DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); + +} + +__attribute__( (reqd_work_group_size( 8, 1, 1 )) ) +old_parallel_build_phase1_Indirect( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem ) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart); + + /* a queue of build records */ + local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; + local uint local_numRecords; + local uint local_numRecordsOld; + + /* construct range of build records that each sub group will process */ + const uint numRecords = globals->numBuildRecords; + uint startID = get_group_id( 0 ); + uint endID = startID + 1; + + DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); + +} +#endif diff --git a/src/intel/vulkan/grl/gpu/morton/phase2.cl b/src/intel/vulkan/grl/gpu/morton/phase2.cl new file mode 100644 index 00000000000..e82d22aaacf --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/phase2.cl @@ -0,0 +1,314 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "bvh_build_refit.h" +#include "libs/lsc_intrinsics.h" +#include "morton/morton_common.h" + +/* + + POSTSORT PHASE2: + Two kernels here, selected by 
MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD whish is set to very big value. + 1. parallel_build_phase2_refit - performs refit using global synchronization and mem_fence_gpu_invalidate. + This kernel should be used only for very big bvh, it is faster than non-SLM fallback + in parallel_build_phase2_refit_local. + 2. parallel_build_phase2_refit_local - should be used for most of the cases, we usually fit into SLM with the number of + nodes allocated in phase0, but there is also non-SLM fallback there, as the + decision on which kernel to run is based on the nodes estimates on the host + side. + +*/ + + +GRL_INLINE void refit_bottom_up_global_sync( + global char* bvh_mem, + global uint* global_refit_startpoints, + uniform uint nodeId, + uniform ushort lane) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + + BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + + // Get the node idx that was put here in phase1 + const uint innerNodeIdx = global_refit_startpoints[nodeId]; + + // Get the qnode and backpointer + uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx; + uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + + varying struct AABB childrenAABB; // one child AABB per lane + AABB_init(&childrenAABB); + + uniform uint numChildren = (backPointer >> 3) & 0x7; + if(numChildren == 0) return; + + global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode ); + varying ushort child_idx = (lane < numChildren) ? lane : 0; + childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx ); + +#if MORTON_VERBOSE_LOG + if(lane == 0) + printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx); +#endif + + struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB ); + reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 ); + + subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane); + + uint children_mask = qnode_child[child_idx].instMask; + qnode->instMask = sub_group_reduce_or_N6(children_mask); + + SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 ); +} + +__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel +parallel_build_phase2_refit( global char* bvh_mem, + global uint* global_refit_startpoints ) +{ + refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0)); +} + + +GRL_INLINE void SUBGROUP_refit_bottom_up_global( + uniform global struct QBVHNodeN* globalNodeData, + uniform struct BackPointers* backPointers, + varying ushort lane, + varying uint curNodeIndex) +{ + uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex); + + const uint head_lane = 0; + uniform struct AABB child_aabb; // this carries reduced aabb between loop turns + + while (curNodeIndex != 0) + { + global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex; + global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode ); + uint numChildren = BackPointer_GetNumChildren(backpointer); + + varying ushort child_idx = (lane < numChildren) ? 
lane : 0; + child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx ); + + struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb); + reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane); + + /* get bounds of all children from child nodes directly */ + subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane); + + uchar childrenMask = qnode_child[child_idx].instMask; + qnode->instMask = sub_group_reduce_or_N6(childrenMask); + + uint parentIndex = BackPointer_GetParentIndex(backpointer); + + mem_fence_gpu_invalidate(); + + if (lane == 0) + { + backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex)); + + uint globalBackpointer = (parentIndex << 6) | (numChildren << 3); + + /* set global back pointer */ + *InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer; + +#if MORTON_VERBOSE_LOG + printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n", + curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x); +#endif + } + + backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane); + curNodeIndex = parentIndex; + + /* if all children got refitted, then continue */ + uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7; + uniform uint numChildrenTotal = (backpointer >> 3) & 0x7; + + if (numChildrenRefitted != numChildrenTotal) + return; + } + + // process root of the treelet + { + +#if MORTON_DEBUG_CHECKS + if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n"); +#endif + + global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData ); + uint numChildren = BackPointer_GetNumChildren(backpointer); + + varying ushort child_idx = (lane < numChildren) ? 
lane : 0; + child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx ); + + struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb); + reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane); + + /* get bounds of all children from child nodes directly */ + subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane); + + uchar childrenMask = qnode_child[child_idx].instMask; + globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask); + + /* reset refit counter for next refit */ + if (lane == 0) + { + /* set global back pointer */ + *InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u); + +#if MORTON_VERBOSE_LOG + printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n", + curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt); +#endif + } + } +} + + +// TODO: Check why 512 wg size has worse performance than 256 +__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_phase2_refit_local( global struct Globals* globals, + global char* bvh_mem, + global struct MortonFlattenedBoxlessNode *boxless_nodes) +{ + // Number of nodes created in P0, to be refitted in this stage + uint p0_created_num = globals->p0_created_num; + + // Return immediately if host executed this kernel but there is nothing to do + if(p0_created_num == 0) + return; + + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + varying ushort lane = get_sub_group_local_id(); + + // Hardcode SLM to max here as we do not know upfront how much mem will be needed + local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */ + + // Number of allocated nodes in phase0 (p0_created_num + children) + uint p0_allocated_num = globals->p0_allocated_num; + + // array that will keep 2x8 shorts indices + varying uint sg_fatleaf_array = 0x0; + uniform uint8_t sg_bu_startpoints_cnt = 0; + + // Determine if we can fit into SLM with all the nodes allocated in phase0, + // There are two paths here: + // 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse bottom up local, + // which does refit nad creates qnodes in bvh + // 2. If not fit into SLM, first create qnodes in bvh, and perform bottom up refit with global atomics synchronization. + // It is not performant to do so, keep it as a guardrail here. On the host side we do fallback + // to the old refit separated path, with wg_size 8 with better EU reuse. 
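+    // The unpacking used in both paths below implies the phase0 packing
+    //     binary_hierarchy_index = (node_id << 6) | children_root_mask,
+    // with one flag bit per child in the low 6 bits. As a made-up example,
+    // (37 << 6) | 0x5 = 2373 decodes to node 37 whose children 0 and 2 are phase1 subtree roots.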
+ if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM) + { + for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() ) + { + MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID]; + uint current_id = boxless_node.binary_hierarchy_index >> 6; + + // Put the mask for the children that are subtree roots in the binary_hierarchy_index that is unused + uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F); + + if(lane == 0) + perNodeData[current_id].boxlessNode = boxless_node; + + // When no children are subtree roots, we are done and skip to the next iteration + if(children_root_mask == 0x0) + { + continue; + } + // When all children are subtree roots, put them to sg_fatleaf_array + else if(children_root_mask == 0x3F) + { + set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane); + } + + uniform global struct QBVHNodeN* qnode = nodeData + current_id; + + uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7; + uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node); + varying ushort child_idx = (lane < numChildren) ? lane : 0; + + varying struct AABB childrenAABB; // one child AABB per lane + AABB_init(&childrenAABB); + + uint lead_child_global_id = current_id + lead_child_offset; + + uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id; + childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx ); + + // Get only AABBs of children that are p1 subtree roots + bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx); + if(lane_active) + { + uint child_global_id = lead_child_global_id + child_idx; + perNodeData[child_global_id].box = childrenAABB; + perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask); + } + +#if MORTON_VERBOSE_LOG + if(lane == 0) + printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset); +#endif + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt); + } + else + { + for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() ) + { + MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID]; + uint current_id = boxless_node.binary_hierarchy_index >> 6; + + // Put the mask for the children that are subtree roots in the binary_hierarchy_index that is unused + uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F); + uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7; + + uniform global struct QBVHNodeN* qnode = nodeData + current_id; + uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node); + uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node); + + SUBGROUP_QBVHNodeN_setChildIncr1( qnode ); + if(lane == 0) + { + QBVH6Node_set_type( qnode, nodeType ); + qnode->offset = lead_child_offset; + } + + // When no children are subtree roots, we are done and skip to the next iteration + if(children_root_mask == 0x0) + { + continue; + } + // When all children are subtree roots, put them to sg_fatleaf_array + else if(children_root_mask == 0x3F) + { + set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane); + } + +#if MORTON_VERBOSE_LOG + if(lane == 
0) + printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, child_idx: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, boxless_node.backPointer >> 6, reduce_bounds.lower.x, qnode->offset); +#endif + } + + while (sg_bu_startpoints_cnt > 0) + { + uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane); + + SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex); + } + } +} diff --git a/src/intel/vulkan/grl/gpu/morton/post_sort.cl b/src/intel/vulkan/grl/gpu/morton/post_sort.cl new file mode 100644 index 00000000000..c13762438a3 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/post_sort.cl @@ -0,0 +1,521 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "libs/lsc_intrinsics.h" +#include "morton/morton_common.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////////// +/* + + This kernel constructs a binary hierarchy in bottom up fashion from + the morton codes. + +*/ +//////////////////////////////////////////////////////////////////////////////////////////////////////// + +int Delta(global struct MortonCodePrimitive* mc, const uint64_t key0, const uint i1 ) +{ + const uint64_t key1 = mc[i1].index_code; + return clz(key0 ^ key1); +} + +int sign( int d ) +{ + return (d > 0) ? 1 : -1; +} + +__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) ) +void kernel build_bottom_up_indirect( global struct Globals* globals, + global struct BinaryMortonCodeHierarchy* bnodes, + global struct MortonCodePrimitive* mc ) +{ + /* construct range of primitives that each work group will process */ + const uint numPrimitives = globals->numPrimitives; + + uint i = get_group_id( 0 ) * get_local_size(0) + get_local_id( 0 ); + + if (i == 0) + { + globals->binary_hierarchy_root = 0; + if (numPrimitives == 1) + { + // special kludge for 1-prim tree. Make sure the one leaf node is initialized + bnodes[i].range.start = 0; + bnodes[i].range.end = 0; + bnodes[i].leftChild = -1; + bnodes[i].rightChild = -1; + } + + // store pointer to the binary hierarchy in the globals struct. + // This will be used + globals->binary_hierarchy_buffer = (gpuva_t) bnodes; + } + + uint num_inner_nodes = numPrimitives-1; + if ( i < num_inner_nodes ) + { + // + // direction is 1 if this morton code is the node's first key, -1 if it's the last + // By construction every internal node is either the start or the end of a given key range + // direction should be towards the neighbor with the most bits in common + + uint64_t ki = mc[i].index_code; + + int direction, delta_min; + uint lmax; + if( i == 0 ) + { + direction = 1; + delta_min = -1; + lmax = numPrimitives; + } + else + { + direction = sign( Delta( mc, ki, i + 1 ) - Delta( mc, ki, i - 1 ) ); + delta_min = Delta( mc, ki, i - direction ); + + // find upper bound for length of this node's key range + lmax = 8; + while ( (i+lmax*direction) < numPrimitives && Delta( mc, ki, i+lmax*direction ) > delta_min) + lmax = lmax * 2; + } + + // clamp max length so that the binary searches are fully in-bounds + uint maxLen = (direction>0) ? 
(numPrimitives - i) : (i+1); + lmax = min(lmax, maxLen); + + // find end of range using binary search + uint length = 0; + uint end = lmax-1; + while (length != end) + { + uint mid = length + ((end-length)/2) + ((end-length)%2); + bool bigger = Delta( mc, ki, i+mid*direction) > delta_min; + length = bigger ? mid : length; + end = bigger ? end : mid-1; + } + uint j = i + length*direction ; + + // find split position using binary search + uint split = 0; + end = length-1; + int delta_node = Delta(mc, ki, j); + while (split != end) + { + uint mid = split + ((end-split)/2) + ((end-split)%2); + bool bigger = Delta( mc, ki, i+mid*direction) > delta_node; + split = bigger ? mid : split; + end = bigger ? end : mid-1; + } + split = i + split*direction + min(direction,0); + + uint left = split; + uint right = split+1; + + // mark leaves + if( min(i,j) == split ) + left = left | (1<<31); + if( max(i,j) == split+1 ) + right = right | (1<<31); + + bnodes[i].range.start = min(i,j); + bnodes[i].range.end = max(i,j); + bnodes[i].leftChild = left; + bnodes[i].rightChild = right; + } +} + + + + + +#if 0 +__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) ) +void kernel build_bottom_up_indirect( global struct Globals* globals, + global struct BinaryMortonCodeHierarchy* bnodes, + global struct MortonCodePrimitive* mc ) +{ + /* construct range of primitives that each work group will process */ + const uint numPrimitives = globals->numPrimitives; + + // RangeFactor determines the distance between adjacent nodeIds in work group. + // The aim of the nodes distribution within work group, for rangeFactor > 1 + // is to be sure that half of the work groups will entirelly be dropped off + // at the bottom layer of the graph. This way the EUs can be reused faster. + // The factor needs to be smaller than MAX_HW_SIMD_WIDTH + const uint rangeFactor = 2; + + const uint numGroups = ((numPrimitives + MAX_HW_SIMD_WIDTH - 1) / MAX_HW_SIMD_WIDTH); + const uint globalId = get_group_id( 0 ) * MAX_HW_SIMD_WIDTH + get_local_id( 0 ); + const uint numPrimitivesAlignedToWGSize = MAX_HW_SIMD_WIDTH * numGroups; + const uint groupsRange = numPrimitivesAlignedToWGSize / rangeFactor; + + /* iterate over all primitives the work group should process */ + const uint i = (globalId * rangeFactor) % numPrimitivesAlignedToWGSize + globalId / groupsRange; + + if ( i < numPrimitives ) + { + uint node = i | ((uint)1 << 31); + uint start = i; + uint end = i; + + /* bottom up */ + while ( true ) + { + /* goto parent node and link parent node to current node */ + node = updateParent( bnodes, mc, node, start, end, numPrimitives - 1 ); + + /* do not continue if we reached this node the first time */ + if ( node == -1 ) + break; + + mem_fence_gpu_invalidate(); + + /* update range */ + start = bnodes[node].range.start; + end = bnodes[node].range.end; + + /* stop when we reached the root node */ + if ( start == 0 && end == numPrimitives - 1 ) + { + globals->binary_hierarchy_root = node; + break; + } + } + } +} + +#endif + +/* + + This function builds one QBVH6 node by opening the provided binary + BVH nodes until the QBVH node is full. 
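+
+ As a sketch of the opening order (with made-up item counts): starting from the two binary
+ children {9, 5}, the largest splittable child is opened at each step, e.g.
+ {9, 5} -> {4, 5, 5} -> {4, 2, 5, 3} -> ..., until either BVH_NODE_N6 children exist or every
+ remaining child holds at most cfg_minLeafSize primitives and cannot be split further.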
+ + */ + +GRL_INLINE void create_node(global struct Globals *globals, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem, + uint rID, + local uint *local_numRecords, + local uint *local_QNodeOffset, + struct BuildRecordMorton *records, + struct BuildRecordMorton *current) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; + global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); + BackPointers *backPointers = BVHBase_GetBackPointers(bvh); + + /* initialize child array */ + uint numChildren = 2; + struct BuildRecordMorton children[BVH_NODE_N6]; + children[0].nodeID = bnodes[current->nodeID].leftChild; + children[0].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[0].nodeID); + children[1].nodeID = bnodes[current->nodeID].rightChild; + children[1].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[1].nodeID); + + /* fill QBVH6 node with up to 6 children */ + while (numChildren < BVH_NODE_N6) + { + /*! find best child to split */ + uint bestItems = 0; + int bestChild = -1; + for (int i = 0; i < numChildren; i++) + { + const uint items = children[i].items; + + /* ignore leaves as they cannot get split */ + if (items <= cfg_minLeafSize) + continue; + + /* find child with largest number of items */ + if (items > bestItems) + { + bestItems = items; + bestChild = i; + } + } + if (bestChild == -1) + break; + + /* perform best found split */ + const uint bestNodeID = children[bestChild].nodeID; + struct BuildRecordMorton *lrecord = &children[bestChild]; + struct BuildRecordMorton *rrecord = &children[numChildren]; + lrecord->nodeID = bnodes[bestNodeID].leftChild; + lrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, lrecord->nodeID); + rrecord->nodeID = bnodes[bestNodeID].rightChild; + rrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, rrecord->nodeID); + numChildren++; + } + + /* allocate memory for all children */ + const uint child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren); + global struct QBVHNodeN *childNodes = (global struct QBVHNodeN *)(bvh_mem + child_node_offset); + + /* create node, but to not set bounds yet as these get calculated during refit */ + const uint current_index = current->current_index; + struct QBVHNodeN *qnode = nodeData + current_index; + QBVH6Node_set_type(qnode, BVH_INTERNAL_NODE); + QBVHNodeN_setChildIncr1(qnode); + QBVH6Node_set_offset(qnode, childNodes); + + /* set back pointers */ + *InnerNode_GetBackPointer(backPointers, current_index) = (current->parent_index << 6) | (numChildren << 3); + + /* update parent pointer of build records of all children */ + for (uint ID = 0; ID < numChildren; ID++) + { + children[ID].current_index = childNodes - nodeData + ID; + children[ID].parent_index = current_index; + } + + /* write out child build records */ + const uint global_offset = atomic_add_local(local_numRecords, numChildren - 1); + records[rID] = children[0]; + + for (uint i = 1; i < numChildren; i++) + records[global_offset + i - 1] = children[i]; + + mem_fence_workgroup_default(); + +} + +#if 0 +/* This function calculates the similarity between two morton + * codes. It essentially counts how many bits of the morton codes are + * equal starting at the top. The more bits are equal, the similar the + * codes, and the closer the primitives are located spatially. 
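+
+ For example, for the 64-bit keys 0x0123456789ABCDEF and 0x0123456789ABCDE0 the XOR is 0xF and
+ clz of that is 60, i.e. the two codes agree in their top 60 bits, so the corresponding
+ primitives are treated as spatially very close.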
*/ + +GRL_INLINE uint64_t delta(global struct MortonCodePrimitive *mc, + const uint id) +{ + const uint64_t key0 = mc[id + 0].index_code; + const uint64_t key1 = mc[id + 1].index_code; + return clz(key0 ^ key1); +} + + + +/* This function checks for a range [left,right] of morton codes, if + * it is spatially closer to the left or to the right nodes. */ + +GRL_INLINE bool merge_to_right(global struct MortonCodePrimitive *mc, + const uint left, + const uint right, + const uint last) +{ + /* merge to right if we are at the left end of the array */ + if (left == 0) + return true; + + /* merge to left if we are at the right end of the array */ + if (right == last) + return false; + + /* otherwise merge to the side where the morton code sequence has + * the largest number of equal bits from the top */ + return delta(mc, right) > delta(mc, left - 1); +} + +GRL_INLINE uint updateParent(global struct BinaryMortonCodeHierarchy *bnodes, + global struct MortonCodePrimitive *mc, + const uint nodeID, + const uint left, + const uint right, + const uint last) +{ + uint parent; + + /* check if we should merge this node to the left or right */ + if (merge_to_right(mc, left, right, last)) + { + parent = right; + bnodes[parent].leftChild = nodeID; + bnodes[parent].range.start = left; + } + else + { + parent = left - 1; + bnodes[parent].rightChild = nodeID; + bnodes[parent].range.end = right; + } + + mem_fence_gpu_default(); + + /* stop ascending the tree if we reached this node the first time */ + const bool first = atomic_inc_global((global uint *)&bnodes[parent].flag) == 0; + return first ? -1 : parent; +} + +GRL_INLINE void +DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem, + uint startID, uint endID, + local uint* local_numRecords, + local uint* local_numRecordsOld, + local struct BuildRecordMorton* local_records +) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64); + + /* iterate over all subtrees this workgroup should build */ + for ( uint recordID = startID; recordID < endID; recordID++ ) + { + /* add start build record to local stack */ + if ( get_local_id( 0 ) == 0 ) + { + local_records[0] = records[recordID]; + *local_numRecords = 1; + *local_numRecordsOld = 0; + } + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + /* terminate when all subtrees are leaves */ + while ( *local_numRecords != *local_numRecordsOld ) + { + /* remember the old number of build records to detect later + * whether we are done */ + if ( get_local_id( 0 ) == 0 ) + { + *local_numRecordsOld = *local_numRecords; + } + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + /* all work items in the sub group pick a subtree to build */ + for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) ) + { + /* ignore small subtrees */ + if ( local_records[ID].items <= BVH_NODE_N6 ) + continue; + + /* create QBVH node */ + create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] ); + } + + /* wait for all work items to have updated local_records array */ + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + } + + const uint shift_mask = globals->shift_mask; + const uint leafPrimType = globals->leafPrimType; + const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; + BackPointers* backPointers = 
BVHBase_GetBackPointers( bvh ); + global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + + /* create all fat leaf nodes and initiate refit */ + for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) ) + { + struct BuildRecordMorton current = local_records[ID]; + const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID ); + + global struct QBVHNodeN* qnode = nodeData + current.current_index; + + /* get bounds of all children of the fat leaf node */ + struct AABB bounds[BVH_NODE_N6]; + for ( uint i = 0; i < current.items; i++ ) + { + /* get primID and bounds of primitive */ + const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask); + bounds[i] = primref[primID]; + + /* For all primitives in a fat leaf we store a back + * pointer. This way we can modify the fat leaf node at leaf construction time. */ + const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem; + + /* Store back pointer and primID inside morton code array to + * be later used by leaf creation. */ + mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID; + } + + /* update fat leaf node */ + QBVHNodeN_setType( qnode, leafPrimType ); + global void* offset; + if ( leafPrimType != BVH_INSTANCE_NODE ) + { + offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad ); + QBVHNodeN_setChildIncr1( qnode ); + } + else + { + offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf ); + QBVHNodeN_setChildIncr2( qnode ); + } + QBVH6Node_set_offset( qnode, offset ); + QBVHNodeN_setBounds( qnode, bounds, current.items ); + + /* set back pointers for fat leaf nodes */ + *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3); + + /* bottom up refit */ + refit_bottom_up( qnode, bvh, bounds, current.items ); + } + } +} + +/* + + This phase takes the build records calculated in phase0 as input and + finished the BVH construction for all these subtrees. 
+ +*/ +__attribute__((reqd_work_group_size(8, 1, 1))) +old_parallel_build_phase1(global struct Globals *globals, + global struct MortonCodePrimitive *mc, + global struct AABB *primref, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); + + /* a queue of build records */ + local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; + local uint local_numRecords; + local uint local_numRecordsOld; + + /* construct range of build records that each sub group will process */ + const uint numRecords = globals->numBuildRecords; + const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0); + + DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); + +} + +__attribute__( (reqd_work_group_size( 8, 1, 1 )) ) +old_parallel_build_phase1_Indirect( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem ) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart); + + /* a queue of build records */ + local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; + local uint local_numRecords; + local uint local_numRecordsOld; + + /* construct range of build records that each sub group will process */ + const uint numRecords = globals->numBuildRecords; + uint startID = get_group_id( 0 ); + uint endID = startID + 1; + + DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); + +} +#endif diff --git a/src/intel/vulkan/grl/gpu/morton/pre_sort.cl b/src/intel/vulkan/grl/gpu/morton/pre_sort.cl new file mode 100644 index 00000000000..099f926e194 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/pre_sort.cl @@ -0,0 +1,117 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "morton/morton_common.h" + +GRL_INLINE uint get_morton_shift( uint numPrimitives ) +{ + return 32 - clz( numPrimitives ); +} + +GRL_INLINE uint get_morton_shift_mask( uint numPrimitives ) +{ + uint shift = get_morton_shift( numPrimitives ); + uint mask =(uint)(((ulong)1 << shift)); + return mask - 1; // separated due to problems in DX +} + +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel init( global struct Globals *globals ) +{ + /* variable shift for putting morton code + index to 64 bit */ + const uint shift = 32 - clz(globals->numPrimitives); + globals->shift = shift; + globals->shift_mask = (uint)(((ulong)1 << shift)); + globals->shift_mask -= 1; // separated due to problems in DX + globals->binary_hierarchy_root = 0; + globals->morton_sort_in_flight = 0; + globals->sort_iterations = get_morton_sort_lsb_req_iterations(shift); +} + +/* + + This kernel create a morton code array containing a morton code and + index into the primref array. + + The code uses the maximal number of bits for the morton code, such + that the morton code and index can still both get stored in 64 bits. 
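+
+ For example, with numPrimitives = 1000 the shift is 32 - clz(1000) = 10, so the primitive
+ index occupies the low 10 bits of index_code; the 4D grid computed below then gets
+ (64 - 10) / 4 = 13 bits per coordinate (8192 cells), and the interleaved 52-bit code is
+ stored in the bits above the index.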
+ + The algorithm first maps the centroids of the primitives and their + bounding box diagonal into a 4D grid, and then interleaves all 4 + grid coordinates to construct the to morton code. + + */ + +__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) void kernel +create_morton_codes_indirect( global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global struct MortonCodePrimitive* morton_codes, + global struct MortonCodePrimitive* morton_codes_tmp, + uint use_new_morton_sort) +{ + /* construct range of morton codes each work group should create */ + const uint numPrimitives = globals->numPrimitives; + const uint startID = get_group_id( 0 ) * get_local_size( 0 ); + const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives); + + /* get lower and upper bounds of geometry and length of scene diagonal */ + const float3 lower = globals->centroidBounds.lower.xyz; + const float3 upper = globals->centroidBounds.upper.xyz; + const float diag = length( AABB3f_size( &bvh->Meta.bounds ).xyz ); + + /* calculates the 4D grid */ + const uint shift = get_morton_shift( numPrimitives ); + const uint grid_size = 1 << (64 - shift) / 4; + const float4 grid_base = (float4)(lower, 0.0f); + const float4 grid_extend = (float4)(upper - lower, diag); + const float4 grid_scale = select( (grid_size * 0.99f) / grid_extend, 0.0f, grid_extend == 0.0f ); // FIXME: 0.99f!!!!! + + const uint req_iterations = get_morton_sort_lsb_req_iterations(shift); + + /* each work group iterates over its range of morton codes to create */ + uint primID = startID + get_local_id( 0 ); + if( primID < endID ) + { + /* calculate position inside 4D grid */ + float4 centroid2 = AABB_centroid2( &primref[primID] ); + centroid2.w = length( AABB_size( &primref[primID] ).xyz ); + const uint4 gridpos = convert_uint4_rtz( (centroid2 - grid_base) * grid_scale ); + + /* calculate and store morton code */ + const ulong code = ulong_bitInterleave4D( gridpos ); + const ulong index_code = ((ulong)code << shift) | (ulong)primID; + + // It is required for morton code to be in morton_codes buffer after LSB sort finishes. + // If there would be odd iteration number needed for sorting, it is needed + // to skip some iterations of sorting. For odd number of iteration start with morton_codes_tmp buffer + if(req_iterations & 1 && !use_new_morton_sort) + morton_codes_tmp[primID].index_code = index_code; + else + morton_codes[primID].index_code = index_code; + } +} + +/* + + Initialization of the binary morton code hierarchy. 
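+
+ Note: the NewMorton_pre_sort metakernel currently leaves this dispatch commented out
+ ("new bottom-up kernel does not need this"), since build_bottom_up_indirect fills in
+ range, leftChild and rightChild of every internal node on its own.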
+ + */ + +__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) void kernel init_bottom_up_indirect( global struct Globals* globals, + global struct BinaryMortonCodeHierarchy* bnodes ) +{ + /* construct range each work group will process */ + const uint numPrimitives = globals->numPrimitives; + const uint startID = get_group_id( 0 ) * get_local_size(0); + const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives); + + /* each workgroup iterates over its range to initialize the binary BVH */ + uint i = startID + get_local_id( 0 ); + if( i < endID ) + BinaryMortonCodeHierarchy_init( &bnodes[i], 0, numPrimitives - 1 ); +} diff --git a/src/intel/vulkan/grl/gpu/morton_builder.grl b/src/intel/vulkan/grl/gpu/morton_builder.grl new file mode 100644 index 00000000000..f221fd39fed --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_builder.grl @@ -0,0 +1,335 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module morton_builder; + +kernel_module morton_kernels ("morton/pre_sort.cl") +{ + kernel opencl_build_kernel_init < kernelFunction="init" >; + kernel opencl_build_morton_kernel_create_morton_codes_indirect < kernelFunction="create_morton_codes_indirect" >; + kernel opencl_build_morton_kernel_init_bottom_up_indirect < kernelFunction="init_bottom_up_indirect" >; +} + +kernel_module morton_kernels ("morton/post_sort.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_morton_kernel_build_bottom_up_indirect < kernelFunction="build_bottom_up_indirect" >; +} + +kernel_module morton_kernels ("morton/phase0.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_morton_kernel_parallel_build_phase0 < kernelFunction="parallel_build_phase0" >; + kernel opencl_build_morton_kernel_parallel_build_phase0_local_sync < kernelFunction="parallel_build_phase0_local_sync" >; +} + +kernel_module morton_kernels ("morton/phase1.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_morton_kernel_parallel_build_phase1_Indirect < kernelFunction="parallel_build_phase1_Indirect_SG" >; + kernel opencl_build_morton_kernel_parallel_build_phase1_root < kernelFunction="parallel_build_phase1_Indirect_global_root" >; +} + +kernel_module morton_kernels ("morton/phase2.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_morton_kernel_parallel_build_phase2_refit < kernelFunction="parallel_build_phase2_refit" >; + kernel opencl_build_morton_kernel_parallel_build_phase2_refit_local < kernelFunction="parallel_build_phase2_refit_local" >; +} + +import struct MKBuilderState "structs.grl"; + +/* +metakernel begin( + MKBuilderState state, + qword morton_code_buffer, + dword primLeafType, + dword numHwThreads) +{ + dispatch opencl_build_kernel_init(1, 1, 1) args( + state.build_globals + ); + + control(wait_idle); + + + dispatch opencl_build_morton_kernel_create_morton_codes(numHwThreads, 1, 1) args( + state.build_globals, + state.bvh_buffer, + state.build_primref_buffer, + morton_code_buffer); + + control(wait_idle); + +} + +metakernel build_bottom_up( + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer, + dword numHwThreads) +{ + dispatch opencl_build_morton_kernel_init_bottom_up(numHwThreads, 1, 1) args( + state.build_globals, + buildrecords_bottom_up); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_build_bottom_up(numHwThreads, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + morton_code_buffer); + + control(wait_idle); + +} + + +metakernel parallel_build( + MKBuilderState state, + qword 
buildrecords_bottom_up, + qword morton_code_buffer, + dword numHwThreads) +{ + dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + state.bvh_buffer); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_parallel_build_phase1(numHwThreads, 1, 1) args( + state.build_globals, + morton_code_buffer, + state.build_primref_buffer, + buildrecords_bottom_up, + state.bvh_buffer); + + control(wait_idle); + +} + +*/ + +metakernel NewMorton_pre_sort( + qword num_primrefs_counter, + MKBuilderState state, + qword morton_code_buffer, + qword morton_code_buffer_tmp, + qword buildrecords_bottom_up, + dword use_new_morton_sort) +{ + + + { + REG1 = 15; + REG2 = 4; + REG0 = load_dword( num_primrefs_counter ); + + REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals + REG1 = ~REG1; + REG0 = REG0 & REG1; + REG0 = REG0 >> REG2; + } + + dispatch opencl_build_kernel_init(1, 1, 1) args( state.build_globals ); + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + /* + // new bottom-up kernel does not need this + dispatch_indirect opencl_build_morton_kernel_init_bottom_up_indirect args( + state.build_globals, + buildrecords_bottom_up); + */ + dispatch_indirect opencl_build_morton_kernel_create_morton_codes_indirect args( + state.build_globals, + state.bvh_buffer, + state.build_primref_buffer, + morton_code_buffer, + morton_code_buffer_tmp, + use_new_morton_sort); + + +} + + + +metakernel NewMorton_post_sort( + qword num_primrefs_counter, + qword num_buildrecords_counter, + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer ) +{ + + { + REG1 = 15; + REG2 = 4; + REG0 = load_dword( num_primrefs_counter ); + + REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals + REG1 = ~REG1; + REG0 = REG0 & REG1; + REG0 = REG0 >> REG2; + } + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args( + state.build_globals, + buildrecords_bottom_up, + morton_code_buffer); + + + /* + dispatch opencl_build_morton_kernel_build_bottom_up(16, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + morton_code_buffer); + */ + + control(wait_idle); + + dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + state.bvh_buffer); + + control(wait_idle); + + DISPATCHDIM_X = load_dword( num_buildrecords_counter ); + + dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args( + state.build_globals, + morton_code_buffer, + state.build_primref_buffer, + buildrecords_bottom_up, + state.bvh_buffer); + + control(wait_idle); + +} + +metakernel NewMorton_bottom_up( + qword num_primrefs_counter, + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer ) +{ + + { + REG1 = 15; + REG2 = 4; + REG0 = load_dword( num_primrefs_counter ); + + REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals + REG1 = ~REG1; + REG0 = REG0 & REG1; + REG0 = REG0 >> REG2; + } + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args( + state.build_globals, + buildrecords_bottom_up, + morton_code_buffer); +} + + +metakernel NewMorton_phase0( + MKBuilderState state, + qword buildrecords_bottom_up, + 
qword morton_p0_refit_startpoints) +{ + + dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + state.bvh_buffer, + morton_p0_refit_startpoints); +} + +metakernel NewMorton_phase0_local_sync( + MKBuilderState state, + qword buildrecords_bottom_up, + qword p0_boxless_nodes) +{ + + dispatch opencl_build_morton_kernel_parallel_build_phase0_local_sync(1, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + state.bvh_buffer, + p0_boxless_nodes); +} + + +metakernel NewMorton_phase1( + qword num_buildrecords_counter, + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer) +{ + + DISPATCHDIM_X = load_dword( num_buildrecords_counter ); + + dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args( + state.build_globals, + morton_code_buffer, + state.build_primref_buffer, + buildrecords_bottom_up, + state.bvh_buffer); +} + +metakernel NewMorton_phase1_root( + qword num_buildrecords_counter, + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer) +{ + dispatch opencl_build_morton_kernel_parallel_build_phase1_root(1, 1, 1) args( + state.build_globals, + morton_code_buffer, + state.build_primref_buffer, + buildrecords_bottom_up, + state.bvh_buffer); +} + +metakernel NewMorton_phase2( + qword num_leaves_counter, + MKBuilderState state, + qword bottom_node_ids ) +{ + + DISPATCHDIM_X = load_dword( num_leaves_counter ); + + dispatch_indirect opencl_build_morton_kernel_parallel_build_phase2_refit args( + state.bvh_buffer, + bottom_node_ids); +} + +metakernel NewMorton_phase2_local( + MKBuilderState state, + qword p0_boxless_nodes) +{ + + dispatch opencl_build_morton_kernel_parallel_build_phase2_refit_local(1, 1, 1) args( + state.build_globals, + state.bvh_buffer, + p0_boxless_nodes); +} diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl new file mode 100644 index 00000000000..075d44a51ba --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl @@ -0,0 +1,9 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// just inlines the kernels that are there in the header +#include "morton_msb_radix_bitonic_sort.h" \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h new file mode 100644 index 00000000000..4fb6c21b014 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h @@ -0,0 +1,924 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "common.h" +#include "morton_msb_radix_bitonic_sort_shared.h" + +#include "libs/lsc_intrinsics.h" + +/////////////////////////////////////////////////////////////////////////////// +// +// Configuration switches +// +/////////////////////////////////////////////////////////////////////////////// + +#define DEBUG 0 +#define MERGE_BLS_WITHIN_SG 0 + +/////////////////////////////////////////////////////////////////////////////// + + +#if DEBUG +#define DEBUG_CODE(A) A +#else +#define DEBUG_CODE(A) +#endif + +#define BOTTOM_LEVEL_SORT_WG_SIZE 512 + +// this kernel is only used to put into metakernel for debug to print that the code reached that place +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel debug_print_kernel(uint variable) +{ + 
if(get_local_id(0) == 0) + printf("I'm here! %d\n", variable); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel check_bls_sort(global struct Globals* globals, global ulong* input) +{ + uint prims_num = globals->numPrimitives; + + printf("in check_bls_sort kernel. Values count:: %d\n", prims_num); + + ulong left = input[0]; + ulong right; + for (int i = 0; i < prims_num - 1; i++) + { + right = input[i + 1]; + printf("sorted val: %llu\n", left); + if (left > right) + { + printf("element %d is bigger than %d: %llu > %llu\n", i, i+1, left, right); + } + left = right; + } +} + +inline uint wg_scan_inclusive_add_opt(local uint* tmp, uint val, uint SG_SIZE, uint WG_SIZE) +{ + const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE; + const uint sg_local_id = get_local_id(0) % SG_SIZE; + const uint NUM_HW_THREADS_IN_WG = WG_SIZE / SG_SIZE; + + uint acc = sub_group_scan_inclusive_add(val); + if (NUM_HW_THREADS_IN_WG == 1) + { + return acc; + } + tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc, SG_SIZE - 1); + barrier(CLK_LOCAL_MEM_FENCE); + + uint loaded_val = sg_local_id < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0; + uint wgs_acc = sub_group_scan_exclusive_add(loaded_val); + uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id); + // for > 256 workitems in SIMD16 we won't fit in 16 workitems per subgroup, so we need additional iteration + // same for > 64 workitems and more in SIMD8 + uint num_iterations = (NUM_HW_THREADS_IN_WG + SG_SIZE - 1) / SG_SIZE; + for (int i = 1; i < num_iterations; i++) + { + // need to add tmp[] because of "exclusive" scan, so last element misses it + uint prev_max_sum = sub_group_broadcast(wgs_acc, SG_SIZE - 1) + tmp[(i * SG_SIZE) - 1]; + loaded_val = (sg_local_id + i * SG_SIZE) < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0; + wgs_acc = sub_group_scan_exclusive_add(loaded_val); + wgs_acc += prev_max_sum; + uint new_acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id % SG_SIZE); + if (hw_thread_in_wg_id >= i * SG_SIZE) + acc_for_this_hw_thread = new_acc_for_this_hw_thread; + } + return acc + acc_for_this_hw_thread; +} + +struct MSBDispatchArgs +{ + global struct MSBRadixContext* context; + uint num_of_wgs; // this is the number of workgroups that was dispatched for this context + ulong* wg_key_start; // this is where keys to process start for current workgroup + ulong* wg_key_end; + uint shift_bit; +}; + + + + +struct MSBDispatchArgs get_msb_dispatch_args(global struct VContextScheduler* scheduler) +{ + global struct MSBDispatchQueue* queue = &scheduler->msb_queue; + + uint group = get_group_id(0); + struct MSBDispatchRecord record; + + // TODO_OPT: Load this entire prefix array into SLM instead of searching.. 
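+    // Each MSB context owns one dispatch record, and record i asked for
+    // records[i].wgs_to_dispatch work groups, so the flat group id is mapped onto a
+    // (context, group-within-record) pair by walking the running total of requested
+    // work groups below. Illustrative example (hypothetical numbers): with
+    // wgs_to_dispatch = {3, 5, 2}, groups 0..2 land in context 0, groups 3..7 in
+    // context 1 (local ids 0..4), and groups 8..9 in context 2.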
+ // Or use sub-group ops + uint i = 0; + while (i < queue->num_records) + { + uint n = queue->records[i].wgs_to_dispatch; + + if (group < n) + { + record = queue->records[i]; + break; + } + + group -= n; + i++; + } + + uint context_id = i; + global struct MSBRadixContext* context = &scheduler->contexts[context_id]; + + // moving to ulongs to avoid uint overflow + ulong group_id_in_dispatch = group; + ulong start_offset = context->start_offset; + ulong num_keys = context->num_keys; + ulong wgs_to_dispatch = record.wgs_to_dispatch; + + struct MSBDispatchArgs args; + args.context = context; + args.num_of_wgs = record.wgs_to_dispatch; + args.wg_key_start = context->keys_in + start_offset + (group_id_in_dispatch * num_keys / wgs_to_dispatch); + args.wg_key_end = context->keys_in + start_offset + ((group_id_in_dispatch+1) * num_keys / wgs_to_dispatch); + args.shift_bit = MSB_SHIFT_BYTE_START_OFFSET - context->iteration * MSB_BITS_PER_ITERATION; + return args; +} + + + + +void BLSDispatchQueue_push(global struct BLSDispatchQueue* queue, struct BLSDispatchRecord* record) +{ + uint new_idx = atomic_inc_global(&queue->num_records); + queue->records[new_idx] = *record; + DEBUG_CODE(printf("adding bls of size: %d\n", record->count)); +} + + + + +void DO_CountSort(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output) +{ + uint tid = get_local_id(0); + + global ulong* in = ((global ulong*)(dispatchRecord.keys_in)) + dispatchRecord.start_offset; + + ulong a = tid < dispatchRecord.count ? in[tid] : ULONG_MAX; + + SLM_shared[tid] = a; + + uint counter = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + ulong curr = SLM_shared[get_sub_group_local_id()]; + + for (uint i = 16; i < dispatchRecord.count; i += 16) + { + ulong next = SLM_shared[i + get_sub_group_local_id()]; + + for (uint j = 0; j < 16; j++) + { + // some older drivers have bug when shuffling ulong so we process by shuffling 2x uint + uint2 curr_as_uint2 = as_uint2(curr); + uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j)); + ulong c = as_ulong(sg_curr_as_uint2); + if (c < a) + counter++; + } + + curr = next; + } + + + // last iter + for (uint j = 0; j < 16; j++) + { + // some older drivers have bug when shuffling ulong so we process by shuffling 2x uint + uint2 curr_as_uint2 = as_uint2(curr); + uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j)); + ulong c = as_ulong(sg_curr_as_uint2); + if (c < a) + counter++; + } + + // save elements to its sorted positions + if (tid < dispatchRecord.count) + output[dispatchRecord.start_offset + counter] = a; +} + +void DO_Bitonic(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output) +{ + uint lid = get_local_id(0); + uint elements_to_sort = BOTTOM_LEVEL_SORT_THRESHOLD; + while ((elements_to_sort >> 1) >= dispatchRecord.count && elements_to_sort >> 1 >= BOTTOM_LEVEL_SORT_WG_SIZE) + { + elements_to_sort >>= 1; + } + + for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++) + { + uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE; + + if (tid >= dispatchRecord.count) + SLM_shared[tid] = ULONG_MAX; + else + SLM_shared[tid] = ((global ulong*)(dispatchRecord.keys_in))[dispatchRecord.start_offset + tid]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint k_iterations = elements_to_sort; + while(k_iterations >> 1 >= dispatchRecord.count && k_iterations != 0) + { + k_iterations >>= 1; + } + + for (unsigned int k = 2; k <= 
k_iterations; k *= 2) + { + for (unsigned int j = k / 2; j > 0; j /= 2) + { + // this loop is needed when we can't create big enough workgroup so we need to process multiple times + for (uint i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++) + { + uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE; + unsigned int ixj = tid ^ j; + if (ixj > tid) + { + if ((tid & k) == 0) + { + if (SLM_shared[tid] > SLM_shared[ixj]) + { + ulong tmp = SLM_shared[tid]; + SLM_shared[tid] = SLM_shared[ixj]; + SLM_shared[ixj] = tmp; + } + } + else + { + if (SLM_shared[tid] < SLM_shared[ixj]) + { + ulong tmp = SLM_shared[tid]; + SLM_shared[tid] = SLM_shared[ixj]; + SLM_shared[ixj] = tmp; + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++) + { + uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE; + + if (tid < dispatchRecord.count) + output[dispatchRecord.start_offset + tid] = SLM_shared[tid]; + } +} + + + + +void DO_Create_Separate_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input) +{ + uint lid = get_local_id(0); + + uint start = context->start[lid]; + uint count = context->count[lid]; + uint start_offset = context->start_offset + start; + + struct BLSDispatchRecord record; + record.start_offset = start_offset; + record.count = count; + record.keys_in = context->keys_out; + + if (count == 0) // we don't have elements so don't do anything + { + } + else if (count == 1) // single element so just write it out + { + input[start_offset] = ((global ulong*)record.keys_in)[start_offset]; + } + else if (count <= BOTTOM_LEVEL_SORT_THRESHOLD) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } +} + + + + +// We try to merge small BLS into larger one within the sub_group +void DO_Create_SG_Merged_BLS_Work_Parallel(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input) +{ + uint lid = get_local_id(0); + uint sid = get_sub_group_local_id(); + + uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 
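+    // create_msb_work flags bins whose element count still exceeds
+    // BOTTOM_LEVEL_SORT_THRESHOLD: such a bin cannot be finished by a single
+    // bottom-level-sort work group and has to go through another MSB radix pass on
+    // the next byte. Smaller bins become BLS records instead (single-element bins
+    // are simply written straight to the output).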
1 : 0; + + uint start = context->start[lid]; + uint count = context->count[lid]; + uint ctx_start_offset = context->start_offset; + + if (sid == 0 || create_msb_work) // these SIMD lanes are the begining of merged BLS + { + struct BLSDispatchRecord record; + if (create_msb_work) + { + record.start_offset = ctx_start_offset + start + count; + record.count = 0; + } + else // SIMD lane 0 case + { + record.start_offset = ctx_start_offset + start; + record.count = count; + } + + record.keys_in = context->keys_out; + + uint loop_idx = 1; + while (sid + loop_idx < 16) // loop over subgroup + { + uint _create_msb_work = intel_sub_group_shuffle_down(create_msb_work, 0u, loop_idx); + uint _count = intel_sub_group_shuffle_down(count, 0u, loop_idx); + uint _start = intel_sub_group_shuffle_down(start, 0u, loop_idx); + + if (_create_msb_work) // found out next MSB work, so range of merges ends + break; + + // need to push record since nothing more will fit + if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD) + { + if (record.count == 1) + { + input[record.start_offset] = record.keys_in[record.start_offset]; + } + else if (record.count > 1) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } + record.start_offset = ctx_start_offset + _start; + record.count = _count; + } + else + { + record.count += _count; + } + loop_idx++; + } + // if we have any elements left, then schedule them + if (record.count == 1) // only one element, so just write it out + { + input[record.start_offset] = record.keys_in[record.start_offset]; + } + else if (record.count > 1) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } + } +} + + + + +// We try to merge small BLS into larger one within the sub_group +void DO_Create_SG_Merged_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input) +{ + uint lid = get_local_id(0); + uint sid = get_sub_group_local_id(); + + uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 
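+    // Merging strategy: a new BLS record is started at lane 0 and after every bin
+    // that needs MSB work; consecutive small bins are then accumulated into the open
+    // record until an oversized bin breaks the chain or adding the next bin would
+    // exceed BOTTOM_LEVEL_SORT_MERGING_THRESHOLD, at which point the record is
+    // pushed and a new one begins. The variant above walks the lanes with
+    // shuffle_down; this one lets lane 0 do the walk via broadcasts.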
1 : 0; + + uint start = context->start[lid]; + uint count = context->count[lid]; + uint ctx_start_offset = context->start_offset; + + if (sid == 0) + { + struct BLSDispatchRecord record; + record.start_offset = ctx_start_offset + start; + record.count = 0; + record.keys_in = context->keys_out; + + for (int i = 0; i < 16; i++) + { + uint _create_msb_work = sub_group_broadcast(create_msb_work, i); + uint _count = sub_group_broadcast(count, i); + uint _start = sub_group_broadcast(start, i); + if (_create_msb_work) + { + if (record.count == 1) // only one element, so just write it out + { + input[record.start_offset] = record.keys_in[record.start_offset]; + } + else if (record.count > 1) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } + record.start_offset = ctx_start_offset + _start + _count; + record.count = 0; + continue; + } + // need to push record since nothing more will fit + if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + record.start_offset = ctx_start_offset + _start; + record.count = _count; + } + else + { + record.count += _count; + } + } + // if we have any elements left, then schedule them + if (record.count == 1) // only one element, so just write it out + { + input[record.start_offset] = record.keys_in[record.start_offset]; + } + else if (record.count > 1) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } + } +} + + + + +void DO_Create_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input, local uint* slm_for_wg_scan, uint sg_size, uint wg_size) +{ + uint lid = get_local_id(0); + + uint iteration = context->iteration + 1; + uint start = context->start[lid]; + uint count = context->count[lid]; + uint start_offset = context->start_offset + start; + + uint create_msb_work = count > BOTTOM_LEVEL_SORT_THRESHOLD ? 
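+    // Bins that are still too large for a bottom-level sort become new entries on
+    // the global MSB stack. The work-group scan below compacts the flags: the last
+    // lane reserves the total number of new entries with a single atomic, and each
+    // flagged lane stores its entry at base + inclusive_scan - 1.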
1 : 0; + +#if MERGE_BLS_WITHIN_SG + DO_Create_SG_Merged_BLS_Work_Parallel(scheduler, context, input); +#else + DO_Create_Separate_BLS_Work(scheduler, context, input); +#endif + + uint new_entry_id = wg_scan_inclusive_add_opt(slm_for_wg_scan, create_msb_work, sg_size, wg_size);//work_group_scan_inclusive_add(create_msb_work); + uint stack_begin_entry; + // last workitem in wg contains number of all new entries + if (lid == (MSB_RADIX_NUM_BINS - 1)) + { + stack_begin_entry = atomic_add_global(&scheduler->msb_stack.num_entries, new_entry_id); + } + stack_begin_entry = work_group_broadcast(stack_begin_entry, (MSB_RADIX_NUM_BINS - 1)); + new_entry_id += stack_begin_entry -1; + + + if (create_msb_work) + { + scheduler->msb_stack.entries[new_entry_id].start_offset = start_offset; + scheduler->msb_stack.entries[new_entry_id].count = count; + scheduler->msb_stack.entries[new_entry_id].iteration = iteration; + } + + if (lid == 0) { + DEBUG_CODE(printf("num of new bls: %d\n", scheduler->next_bls_queue->num_records)); + } +} + + +struct BatchedBLSDispatchEntry +{ + ///////////////////////////////////////////////////////////// + // State data used for communication with command streamer + // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl' + ///////////////////////////////////////////////////////////// + qword p_data_buffer; + qword num_elements; // number of elements in p_data_buffer +}; + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_batched_BLS_dispatch(global struct BatchedBLSDispatchEntry* bls_dispatches) +{ + uint dispatch_id = get_group_id(0); + uint lid = get_local_id(0); + + local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD]; + + struct BatchedBLSDispatchEntry dispatchArgs = bls_dispatches[dispatch_id]; + struct BLSDispatchRecord dispatchRecord; + dispatchRecord.start_offset = 0; + dispatchRecord.count = dispatchArgs.num_elements; + dispatchRecord.keys_in = (ulong*)dispatchArgs.p_data_buffer; + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", dispatchRecord.count)); + + if(dispatchRecord.count > 1) + DO_Bitonic(dispatchRecord, SLM_shared, (global ulong*)dispatchRecord.keys_in); +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_bottom_level_single_wg(global struct Globals* globals, global ulong* input, global ulong* output) +{ + uint lid = get_local_id(0); + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", globals->numPrimitives)); + + local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD]; + + struct BLSDispatchRecord dispatchRecord; + dispatchRecord.start_offset = 0; + dispatchRecord.count = globals->numPrimitives; + dispatchRecord.keys_in = (ulong*)input; + + //TODO: count or bitonic here? 
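+    // Two bottom-level sorts are available. DO_Bitonic is the classic
+    // compare-exchange network (O(log^2 n) rounds with a work-group barrier after
+    // each) and is what the batched BLS dispatch above uses. DO_CountSort ranks each
+    // key by counting the keys smaller than it while streaming SLM through sub-group
+    // broadcasts, so its inner loop needs no barriers, but it relies on the keys
+    // being unique (the low bits of the code carry the primitive index).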
+ //DO_Bitonic(dispatchRecord, SLM_shared, output); + DO_CountSort(dispatchRecord, SLM_shared, output); +} + + + + +// This kernel initializes first context to start up the whole execution +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_msb_begin( + global struct Globals* globals, + global struct VContextScheduler* scheduler, + global ulong* buf0, + global ulong* buf1) +{ + uint lid = get_local_id(0); + uint gid = get_group_id(0); + + DEBUG_CODE(if (lid == 0)printf("running sort_morton_codes_msb_begin\n")); + + scheduler->contexts[gid].count[lid] = 0; + + if (gid == 0 && lid == 0) + { + global struct MSBRadixContext* context = &scheduler->contexts[lid]; + const uint num_prims = globals->numPrimitives; + + scheduler->bls_queue0.num_records = 0; + scheduler->bls_queue1.num_records = 0; + + scheduler->curr_bls_queue = &scheduler->bls_queue1; + scheduler->next_bls_queue = &scheduler->bls_queue0; + + context->start_offset = 0; + context->num_wgs_in_flight = 0; + context->num_keys = num_prims; + context->iteration = 0; + context->keys_in = buf0; + context->keys_out = buf1; + + uint msb_wgs_to_dispatch = (num_prims + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD; + scheduler->msb_queue.records[0].wgs_to_dispatch = msb_wgs_to_dispatch; + + scheduler->num_wgs_msb = msb_wgs_to_dispatch; + scheduler->num_wgs_bls = 0; + scheduler->msb_stack.num_entries = 0; + scheduler->msb_queue.num_records = 1; + } +} + + + + +__attribute__((reqd_work_group_size(MSB_RADIX_NUM_VCONTEXTS, 1, 1))) +kernel void +scheduler(global struct VContextScheduler* scheduler, global ulong* buf0, global ulong* buf1) +{ + uint lid = get_local_id(0); + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler\n")); + + uint context_idx = lid; + + const uint num_of_stack_entries = scheduler->msb_stack.num_entries; + + uint msb_wgs_to_dispatch = 0; + if (lid < num_of_stack_entries) + { + struct MSBStackEntry entry = scheduler->msb_stack.entries[(num_of_stack_entries-1) - lid]; + global struct MSBRadixContext* context = &scheduler->contexts[lid]; + context->start_offset = entry.start_offset; + context->num_wgs_in_flight = 0; + context->num_keys = entry.count; + context->iteration = entry.iteration; + context->keys_in = entry.iteration % 2 == 0 ? buf0 : buf1; + context->keys_out = entry.iteration % 2 == 0 ? 
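+            // buf0/buf1 are used in ping-pong fashion: even MSB iterations read buf0
+            // and scatter into buf1, odd iterations go the other way. The bottom-level
+            // stage always writes its range straight into the buffer it is given as
+            // output (see BLSDispatchRecord), so a range needs no extra copy no matter
+            // how many MSB levels it has been through.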
buf1 : buf0; + + msb_wgs_to_dispatch = (entry.count + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD; + scheduler->msb_queue.records[lid].wgs_to_dispatch = msb_wgs_to_dispatch; + } + + msb_wgs_to_dispatch = work_group_reduce_add(msb_wgs_to_dispatch);// TODO: if compiler implementation is slow, then consider to manually write it + + if (lid == 0) + { + // swap queue for next iteration + struct BLSDispatchQueue* tmp = scheduler->curr_bls_queue; + scheduler->curr_bls_queue = scheduler->next_bls_queue; + scheduler->next_bls_queue = tmp; + + scheduler->next_bls_queue->num_records = 0; + + scheduler->num_wgs_bls = scheduler->curr_bls_queue->num_records; + scheduler->num_wgs_msb = msb_wgs_to_dispatch; + + if (num_of_stack_entries < MSB_RADIX_NUM_VCONTEXTS) + { + scheduler->msb_queue.num_records = num_of_stack_entries; + scheduler->msb_stack.num_entries = 0; + } + else + { + scheduler->msb_queue.num_records = MSB_RADIX_NUM_VCONTEXTS; + scheduler->msb_stack.num_entries -= MSB_RADIX_NUM_VCONTEXTS; + } + } + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler finished, to spawn %d MSB wgs in %d contexts and %d BLS wgs, MSB records on stack %d\n", + scheduler->num_wgs_msb, scheduler->msb_queue.num_records, scheduler->num_wgs_bls, scheduler->msb_stack.num_entries)); +} + + + + +// this is the lowest sub-task, which should end return sorted codes +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_bottom_level( global struct VContextScheduler* scheduler, global ulong* output) +{ + uint lid = get_local_id(0); + + DEBUG_CODE(if (get_group_id(0) == 0 && lid == 0) printf("running sort_morton_codes_bottom_level\n")); + + local struct BLSDispatchRecord l_dispatchRecord; + if (lid == 0) + { + uint record_idx = get_group_id(0); + l_dispatchRecord = scheduler->curr_bls_queue->records[record_idx]; + //l_dispatchRecord = BLSDispatchQueue_pop((global struct BLSDispatchQueue*)scheduler->curr_bls_queue); + atomic_dec_global(&scheduler->num_wgs_bls); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + struct BLSDispatchRecord dispatchRecord = l_dispatchRecord; + + local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD]; + + // right now use only bitonic sort + // TODO: maybe implement something else + if (1) + { + //DO_Bitonic(dispatchRecord, SLM_shared, output); + DO_CountSort(dispatchRecord, SLM_shared, output); + } +} + + + + +#define MSB_COUNT_WG_SIZE MSB_RADIX_NUM_BINS +#define MSB_COUNT_SG_SIZE 16 + +// count how many elements per buckets we have +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MSB_COUNT_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MSB_COUNT_SG_SIZE))) +void kernel sort_morton_codes_msb_count_items( global struct VContextScheduler* scheduler) +{ + uint lid = get_local_id(0); + uint lsz = MSB_RADIX_NUM_BINS; + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_count_items\n")); + + local uint bucket_count[MSB_RADIX_NUM_BINS]; + local uint finish_count; + bucket_count[lid] = 0; + if (lid == 0) + { + finish_count = 0; + } + + struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler); + + global struct MSBRadixContext* context = dispatchArgs.context; + + global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid; + global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end; + uint shift_bit = dispatchArgs.shift_bit; + uchar shift_byte = shift_bit / 8; // so we count 
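+    // the byte index of the current radix digit: iteration 0 looks at bits 56..63,
+    // i.e. byte 7 of the little-endian 64-bit key, iteration 1 at byte 6, and so on,
+    // so each digit can be read with a single uchar load in the loop below.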
how many uchars to shift + barrier(CLK_LOCAL_MEM_FENCE); + + global uchar* ks = (global uchar*)key_start; + ks += shift_byte; + global uchar* ke = (global uchar*)key_end; + ke += shift_byte; + + // double buffering on value loading + if (ks < ke) + { + uchar bucket_id = *ks; + ks += lsz * sizeof(ulong); + + for (global uchar* k = ks; k < ke; k += lsz * sizeof(ulong)) + { + uchar next_bucket_id = *k; + atomic_inc_local(&bucket_count[bucket_id]); + bucket_id = next_bucket_id; + } + + atomic_inc_local(&bucket_count[bucket_id]); + + } + + barrier(CLK_LOCAL_MEM_FENCE); + + //update global counters for context + uint count = bucket_count[lid]; + if (count > 0) + atomic_add_global(&context->count[lid], bucket_count[lid]); + + mem_fence_gpu_invalidate(); + work_group_barrier(0); + + bool final_wg = true; + // count WGs which have reached the end + if (dispatchArgs.num_of_wgs > 1) + { + if (lid == 0) + finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1; + + barrier(CLK_LOCAL_MEM_FENCE); + + final_wg = finish_count == dispatchArgs.num_of_wgs; + } + + local uint partial_dispatches[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE]; + // if this is last wg for current dispatch, update context + if (final_wg) + { + // code below does work_group_scan_exclusive_add(context->count[lid]); + { + uint lane_val = context->count[lid]; + uint sg_result = sub_group_scan_inclusive_add(lane_val); + + partial_dispatches[get_sub_group_id()] = sub_group_broadcast(sg_result, MSB_COUNT_SG_SIZE - 1); + barrier(CLK_LOCAL_MEM_FENCE); + + uint slm_result = sub_group_scan_exclusive_add(partial_dispatches[get_sub_group_local_id()]); + slm_result = sub_group_broadcast(slm_result, get_sub_group_id()); + uint result = slm_result + sg_result - lane_val; + context->start[lid] = result;//work_group_scan_exclusive_add(context->count[lid]); + } + + context->count[lid] = 0; + if(lid == 0) + context->num_wgs_in_flight = 0; + } +} + + + + +// sort elements into appropriate buckets +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_msb_bin_items( + global struct VContextScheduler* scheduler, global ulong* input) +{ + uint lid = get_local_id(0); + uint lsz = get_local_size(0); + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_bin_items\n")); + + local uint finish_count; + if (lid == 0) + { + finish_count = 0; + } + + struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler); + global struct MSBRadixContext* context = dispatchArgs.context; + + global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid; + global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end; + uint shift_bit = dispatchArgs.shift_bit; + + barrier(CLK_LOCAL_MEM_FENCE); + + global ulong* sorted_keys = (global ulong*)context->keys_out + context->start_offset; + +#if MSB_RADIX_NUM_BINS == MSB_WG_SORT_ELEMENTS_THRESHOLD // special case meaning that we process exactly 1 element per workitem + // here we'll do local counting, then move to global + + local uint slm_counters[MSB_RADIX_NUM_BINS]; + slm_counters[lid] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint place_in_slm_bucket; + uint bucket_id; + ulong val; + + bool active_lane = key_start < key_end; + + if (active_lane) + { + val = *key_start; + + bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1); + place_in_slm_bucket = atomic_inc_local(&slm_counters[bucket_id]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // override slm_counters 
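+    // Each non-empty bin issues one global atomic for the whole work group: the old
+    // value returned by atomic_add_global is this group's base offset within the bin,
+    // and the per-item SLM rank recorded above is added on top, so the final position
+    // is start[bin] + group_base + local_rank with one atomic per bin rather than one
+    // per key.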
with global counters - we don't need to override counters with 0 elements since we won't use them anyway + if (slm_counters[lid]) + slm_counters[lid] = atomic_add_global(&context->count[lid], slm_counters[lid]); + + barrier(CLK_LOCAL_MEM_FENCE); + + uint id_in_bucket = slm_counters[bucket_id] + place_in_slm_bucket;//atomic_inc_global(&context->count[bucket_id]); + + if (active_lane) + sorted_keys[context->start[bucket_id] + id_in_bucket] = val; +#else + // double buffering on value loading + if (key_start < key_end) + { + ulong val = *key_start; + key_start += lsz; + + for (global ulong* k = key_start; k < key_end; k += lsz) + { + ulong next_val = *k; + uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1); + uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]); + + //printf("dec: %llu, val: %llX bucket_id: %X", *k, *k, bucket_id); + sorted_keys[context->start[bucket_id] + id_in_bucket] = val; + + val = next_val; + } + + uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1); + uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]); + + sorted_keys[context->start[bucket_id] + id_in_bucket] = val; + } +#endif + + // make sure all groups's "counters" and "starts" are visible to final workgroup + mem_fence_gpu_invalidate(); + work_group_barrier(0); + + bool final_wg = true; + // count WGs which have reached the end + if (dispatchArgs.num_of_wgs > 1) + { + if (lid == 0) + finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1; + + barrier(CLK_LOCAL_MEM_FENCE); + + final_wg = finish_count == dispatchArgs.num_of_wgs; + } + + local uint slm_for_wg_funcs[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE]; + // if this is last wg for current dispatch, then prepare sub-tasks + if (final_wg) + { + DO_Create_Work(scheduler, context, input, slm_for_wg_funcs, 16, MSB_RADIX_NUM_BINS); + + // clear context's counters for future execution + context->count[lid] = 0; + } + +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h new file mode 100644 index 00000000000..c2ab0d4a2c9 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h @@ -0,0 +1,135 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file contains structure definitions shared by GRL OCL kernels and host code +// + +#pragma once + +#include "GRLGen12.h" + +// NOTE: +// MSB(Most significant byte) - here I refer to it as a part of sorting that does MSB Radix sort, which can spawn additional work +// BLS(Bottom level sort) - here I refer to it as a last part of sorting a particular range(currently Bitonic), which cannot spawn additional work +// + +#define MSB_RADIX_NUM_BINS 256 +#define MSB_BITS_PER_ITERATION 8 // how many bits are sorted per iteration +#define MSB_SHIFT_BYTE_START_OFFSET 56 // start offset for byte shifting, first iteration will start from here + +#define MSB_RADIX_NUM_VCONTEXTS 8 // NOTE: mkulikow: maybe expand/shrink? 
More means more MSB processed in parallel but more memory used + +#define MSB_STACK_ENTRIES_NUM (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS * 7) // first level doesn't get spawned, so 7 iterations must fit here, +// since at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS we need 7 of these + +#define MSB_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS) // one per context + +#define BLS_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS) // each context can spawn MSB_RADIX_NUM_BINS, +// so at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS + +#define MSB_WG_SORT_ELEMENTS_THRESHOLD 256 // This tells us how many elements at max we can process in a single workgroup. + // If a single MSB entry needs more, then it will spawn more WGs + // after updating this also needs to update msb_radix_bitonic_sort.grl's computation of initial workgroups num + +#define BOTTOM_LEVEL_SORT_THRESHOLD 512 // TODO: is 4096 best value? ON skl gives best performance +// Right now we use 256 workitems in simd16 which give us 16 hw threads, assuming 2KB per thread, we have 32KB SLM to play with. +// Since we use ulong(8bytes) we can store 4096 elements +// This also tells us that if number of elements to sort is less than this, we don't need to allocate scheduler +// Need to keep in sync with the GRL const BOTTOM_LEVEL_SORT_THRESHOLD + +#define BOTTOM_LEVEL_SORT_MERGING_THRESHOLD 512 // This is the amount till which we'll merge small BLS'es produced by MSB into a single bigger BLS + +GRL_NAMESPACE_BEGIN(GRL) + + + + +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(MORTON_MSB_RADIX_BITONIC_SORT) + +struct MSBStackEntry +{ + uint start_offset; + uint count; + uint iteration; +}; + +struct MSBStack +{ + dword num_entries; + struct MSBStackEntry entries[MSB_STACK_ENTRIES_NUM]; +}; + +struct MSBRadixContext +{ + uint start[MSB_RADIX_NUM_BINS]; + uint count[MSB_RADIX_NUM_BINS]; + uint num_wgs_in_flight; // this is used to identify which msb wg is last + uint num_keys; // number of keys to process + uint iteration; + ulong* keys_in; + ulong* keys_out; + + uint start_offset; //offset from the beginning of the buffer +}; + +struct MSBDispatchRecord +{ + uint wgs_to_dispatch; // amount of workgroups to dispatch for this current record +}; + +struct MSBDispatchQueue +{ + dword num_records; + struct MSBDispatchRecord records[MSB_RADIX_NUM_VCONTEXTS]; // each context have its own record +}; + +// BLS(Bottom Level Sort) - last stage of sorting which will not spawn any new tasks +struct BLSDispatchRecord +{ + uint start_offset; // offset from the beginning of the buffer + uint count; + ulong* keys_in; // we don't need keys_out since we will write always to the same output buffer +}; + +struct BLSDispatchQueue +{ + dword num_records; + struct BLSDispatchRecord records[BLS_DISPATCH_QUEUE_NUM_RECORDS]; +}; + +struct VContextScheduler +{ + ///////////////////////////////////////////////////////////// + // State data used for communication with command streamer + // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl' + ///////////////////////////////////////////////////////////// + + dword num_wgs_msb; // number of MSB workgroups being processed by current iteration + dword num_wgs_bls; // number of BLS workgroups being processed by current iteration + + dword scheduler_postsync; + dword _pad1; + + ///////////////////////////////////////////////////////////// + + struct MSBDispatchQueue msb_queue; + struct BLSDispatchQueue 
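+    // Two BLS queues are kept so records produced while binning one MSB level do not
+    // mix with the ones being consumed: the scheduler swaps curr_bls_queue and
+    // next_bls_queue every round and clears the new "next" queue, so bottom-level
+    // work groups drain one queue while the next level's MSB kernels push into the
+    // other.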
bls_queue0; + struct BLSDispatchQueue bls_queue1; + + struct BLSDispatchQueue* curr_bls_queue; + struct BLSDispatchQueue* next_bls_queue; + + struct MSBStack msb_stack; + + struct MSBRadixContext contexts[MSB_RADIX_NUM_VCONTEXTS]; +}; + +GRL_NAMESPACE_END(MORTON_MSB_RADIX_BITONIC_SORT) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.cl b/src/intel/vulkan/grl/gpu/morton_radix_sort.cl new file mode 100644 index 00000000000..e123b2f46d3 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_radix_sort.cl @@ -0,0 +1,9 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// just inlines the kernels that are there in the header +#include "morton_radix_sort.h" diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.h b/src/intel/vulkan/grl/gpu/morton_radix_sort.h new file mode 100644 index 00000000000..d58ec829883 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_radix_sort.h @@ -0,0 +1,855 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "common.h" +#include "libs/lsc_intrinsics.h" + +/* ============================================================================= */ +/* ============================== LSB RADIX SORT =============================== */ +/* ============================================================================= */ + +#define RADIX_BINS 256 +#define SCATTER_WG_SIZE 512 +#define MORTON_LSB_SORT_NO_SHIFT_THRESHOLD 0xFFFFFFFF // turn off, because current hierarchy build requires full sort + +uint2 get_thread_range( uint numItems, uint numGroups, uint taskID ) +{ + uint items_per_group = (numItems / numGroups); + uint remainder = numItems - (items_per_group * numGroups); + uint startID = taskID * items_per_group + min(taskID, remainder); + uint endID = startID + items_per_group + ((taskID < remainder) ? 
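+    // Groups with taskID < remainder take one extra item so the split stays balanced.
+    // Illustrative example: numItems = 10, numGroups = 4 yields the ranges
+    // [0,3), [3,6), [6,8) and [8,10).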
1 : 0); + + return (uint2)(startID,endID); +} + +GRL_INLINE void sort_morton_codes_bin_items_taskID_func(global struct Globals* globals, + global uint* global_histogram, + global uchar* input, + local uint* histogram, + uint iteration, + uint numGroups, + uint numItems, + bool shift_primID, + uint taskID, + uint startID, + uint endID) +{ + const uint shift = globals->shift; + + for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0)) + histogram[i] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (shift_primID) + { + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + // Read input as ulong to make bitshift, so the bits representing primID are not being + // taken into account during sorting, which would result in smaller sort loops for + // cases where morton shift are bigger than 8 bits + ulong* ptr_ul = (ulong*)&input[8 * i]; + ulong code = *ptr_ul; + uchar* ptr = (uchar*)&code; + code >>= shift; + + uchar bin = ptr[iteration]; + atomic_inc_local(&histogram[bin]); + } + } + else + { + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + uchar bin = input[8 * i + iteration]; + atomic_inc_local(&histogram[bin]); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0)) + global_histogram[RADIX_BINS * taskID + i] = histogram[i]; +} + +GRL_INLINE void sort_morton_codes_bin_items_func(global struct Globals* globals, + global uint* global_histogram, + global uint* wg_flags, + global uchar* input, + local uint* histogram, + uint iteration, + uint numGroups, + uint numItems, + bool shift_primID, + bool update_wg_flags) +{ + if (shift_primID) + { + // This check is present in other LSB sort functions as well, its purpose is + // to skip first n iterations where n is the difference between max iterations + // and actually needed iterations to sort without primIDs + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + // iteration needs to be adjusted to reflect the skipped cycles + iteration -= req_iterations; + } + + const uint taskID = get_group_id(0); + + if (taskID == 0 && update_wg_flags) + { + for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0)) + wg_flags[i] = 0; + } + + uint2 ids = get_thread_range(numItems, numGroups, taskID); + uint startID = ids.x; + uint endID = ids.y; + + sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, shift_primID, + taskID, startID, endID); +} + +__attribute__((reqd_work_group_size(512, 1, 1))) +void kernel +sort_morton_codes_bin_items( + global struct Globals* globals, + global uint* global_histogram, + global uint* wg_flags, + global uchar* input, + uint iteration, + uint numGroups, + uint update_wg_flags +) +{ + local uint histogram[RADIX_BINS]; + const uint numItems = globals->numPrimitives; + if(numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, false, update_wg_flags); + else + sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, true, update_wg_flags); +} + + +GRL_INLINE void sort_morton_codes_reduce_bins_func(global struct Globals* globals, + global uint* global_histogram, + local uint* partials, + uint numTasks, + uint iteration, + bool shift_primID) +{ + const uint localID = get_local_id(0); + + if (shift_primID) 
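+    // When the keys are pre-shifted so the primID bits are ignored (shift_primID),
+    // fewer radix passes are needed and the first sort_iterations dispatches are
+    // skipped; this kernel bails out for them just like the binning and scatter
+    // kernels, making the whole iteration a no-op. With
+    // MORTON_LSB_SORT_NO_SHIFT_THRESHOLD set to 0xFFFFFFFF this path is currently
+    // disabled.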
+ { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + } + + uint t = 0; + for (uint j = 0; j < numTasks; j++) + { + const uint count = load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + localID], 0); + store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + localID], 0, t); + t += count; + } + + // each lane now contains the number of elements in the corresponding bin + // prefix sum this for use in the subsequent scattering pass. + uint global_count = t; + + partials[get_sub_group_id()] = sub_group_reduce_add(global_count); + + barrier(CLK_LOCAL_MEM_FENCE); + + uint lane = get_sub_group_local_id(); + uint p = partials[lane]; + p = (lane < get_sub_group_id()) ? p : 0; + + global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count); + + store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numTasks + localID], 0, global_count); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +void kernel +sort_morton_codes_reduce_bins(global struct Globals* globals, + uint numTasks, + global uint* global_histogram, + uint iteration) +{ + local uint partials[RADIX_BINS]; + const uint numItems = globals->numPrimitives; + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, false); + else + sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, true); +} + + +#if 1 +GRL_INLINE void sort_morton_codes_scatter_items_func( + global struct Globals* globals, + global uint* global_histogram, + global ulong* input, + global ulong* output, + local uint* local_offset, + local uint* flags, + uint iteration, + uint numGroups, + uint numItems, + bool shift_primID, + bool update_morton_sort_in_flight) +{ + const uint gID = get_local_id(0) + get_group_id(0) * get_local_size(0); + + const uint global_shift = globals->shift; + const uint localID = get_local_id(0); + const uint taskID = get_group_id(0); + + if (gID == 0 && update_morton_sort_in_flight) + globals->morton_sort_in_flight = 0; + + uint2 ids = get_thread_range(numItems, numGroups, taskID); + uint startID = ids.x; + uint endID = ids.y; + + if (shift_primID) + { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + iteration -= req_iterations; + } + + const uint shift = 8 * iteration; + + // load the global bin counts, and add each bin's global prefix + // to the local prefix + { + uint global_prefix = 0, local_prefix = 0; + if (localID < RADIX_BINS) + { + local_prefix = global_histogram[RADIX_BINS * taskID + localID]; + global_prefix = global_histogram[RADIX_BINS * numGroups + localID]; + local_offset[localID] = global_prefix + local_prefix; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + + // move elements in WG-sized chunks. 
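+    // Stability comes from per-bin bit vectors instead of atomics on the output
+    // index: every item sets its own bit in its bin's SCATTER_WG_SIZE-wide mask, its
+    // rank within the chunk is the popcount of the lower bits of that mask, and the
+    // output slot is local_offset[bin] + rank; the item with the highest rank then
+    // bumps local_offset[bin] by the bin's count for the next chunk.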
The elements need to be moved sequentially (can't use atomics) + // because relative order has to be preserved for LSB radix sort to work + + // For each bin, a bit vector indicating which elements are in the bin + for (uint block_base = startID; block_base < endID; block_base += get_local_size(0)) + { + // initialize bit vectors + for (uint i = 4 * localID; i < RADIX_BINS * SCATTER_WG_SIZE / 32; i += 4 * get_local_size(0)) + { + flags[i + 0] = 0; + flags[i + 1] = 0; + flags[i + 2] = 0; + flags[i + 3] = 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // read sort key, determine which bin it goes into, scatter into the bit vector + // and pre-load the local offset + uint ID = localID + block_base; + ulong key = 0; + uint bin_offset = 0; + uint bin = 0; + uint bin_word = localID / 32; + uint bin_bit = 1 << (localID % 32); + + if (ID < endID) + { + key = input[ID]; + + if (shift_primID) + bin = ((key >> global_shift) >> shift) & (RADIX_BINS - 1); + else + bin = (key >> shift) & (RADIX_BINS - 1); + + atomic_add_local(&flags[(SCATTER_WG_SIZE / 32) * bin + bin_word], bin_bit); + bin_offset = local_offset[bin]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (ID < endID) + { + // each key reads the bit-vectors for its bin, + // - Computes local prefix sum to determine its output location + // - Computes number of items added to its bin (last thread adjusts bin position) + uint prefix = 0; + uint count = 0; + for (uint i = 0; i < (SCATTER_WG_SIZE / 32); i++) + { + uint bits = flags[(SCATTER_WG_SIZE / 32) * bin + i]; + uint bc = popcount(bits); + uint pc = popcount(bits & (bin_bit - 1)); + prefix += (i < bin_word) ? bc : 0; + prefix += (i == bin_word) ? pc : 0; + + count += bc; + } + + // store the key in its proper place.. + output[prefix + bin_offset] = key; + + // last item for each bin adjusts local offset for next outer loop iteration + if (prefix == count - 1) + local_offset[bin] += count; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + } + + /* uint local_offset[RADIX_BINS]; */ + /* uint offset_global = 0; */ + /* for (int i=0;i> shift) & (RADIX_BINS-1); */ + /* const uint offset = local_offset[bin]; */ + /* output[offset] = input[ID]; */ + /* local_offset[bin]++; */ + /* } */ +} + +#else + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +sort_morton_codes_scatter_items( + global struct Globals* globals, + uint shift, + global uint* global_histogram, + global char* input0, + global char* input1, + unsigned int input0_offset, + unsigned int input1_offset, + uint iteration) +{ + const uint numItems = globals->numPrimitives; + const uint local_size = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + const uint localID = get_local_id(0); + const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint startID = (taskID + 0) * numItems / numTasks; + const uint endID = (taskID + 1) * numItems / numTasks; + + global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset); + global ulong* output = (global ulong*)((iteration % 2) == 0 ? 
input1 + input1_offset : input0 + input0_offset); + + local uint local_offset[RADIX_BINS]; + uint off = 0; + for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) + { + const uint count = global_histogram[RADIX_BINS * numTasks + i]; + const uint offset_task = global_histogram[RADIX_BINS * taskID + i]; + const uint sum = sub_group_reduce_add(count); + const uint prefix_sum = sub_group_scan_exclusive_add(count); + local_offset[i] = off + offset_task + prefix_sum; + off += sum; + } + + for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) + { + const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1); + const uint offset = atomic_add_local(&local_offset[bin], 1); + output[offset] = input[ID]; + } + + /* uint local_offset[RADIX_BINS]; */ + /* uint offset_global = 0; */ + /* for (int i=0;i> shift) & (RADIX_BINS-1); */ + /* const uint offset = local_offset[bin]; */ + /* output[offset] = input[ID]; */ + /* local_offset[bin]++; */ + /* } */ +} +#endif + +#if 1 +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(SCATTER_WG_SIZE, 1, 1))) +void kernel +sort_morton_codes_scatter_items( + global struct Globals *globals, + global uint *global_histogram, + global ulong *input, + global ulong *output, + uint iteration, + uint numGroups, + uint update_morton_sort_in_flight) +{ + local uint local_offset[RADIX_BINS]; + local uint flags[RADIX_BINS*SCATTER_WG_SIZE/32]; + const uint numItems = globals->numPrimitives; + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset, + flags, iteration, numGroups, numItems, false, update_morton_sort_in_flight); + else + sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset, + flags, iteration, numGroups, numItems, true, update_morton_sort_in_flight); +} + +#else + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +sort_morton_codes_scatter_items( + global struct Globals *globals, + uint shift, + global uint *global_histogram, + global char *input0, + global char *input1, + unsigned int input0_offset, + unsigned int input1_offset, + uint iteration) +{ + const uint numItems = globals->numPrimitives; + const uint local_size = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + const uint localID = get_local_id(0); + const uint globalID = get_local_id(0) + get_group_id(0)*get_local_size(0); + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint startID = (taskID + 0) * numItems / numTasks; + const uint endID = (taskID + 1) * numItems / numTasks; + + global ulong *input = (global ulong *)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset); + global ulong *output = (global ulong *)((iteration % 2) == 0 ? 
input1 + input1_offset : input0 + input0_offset); + + local uint local_offset[RADIX_BINS]; + uint off = 0; + for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) + { + const uint count = global_histogram[RADIX_BINS * numTasks + i]; + const uint offset_task = global_histogram[RADIX_BINS * taskID + i]; + const uint sum = sub_group_reduce_add(count); + const uint prefix_sum = sub_group_scan_exclusive_add(count); + local_offset[i] = off + offset_task + prefix_sum; + off += sum; + } + + for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) + { + const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1); + const uint offset = atomic_add_local(&local_offset[bin], 1); + output[offset] = input[ID]; + } + + /* uint local_offset[RADIX_BINS]; */ + /* uint offset_global = 0; */ + /* for (int i=0;i> shift) & (RADIX_BINS-1); */ + /* const uint offset = local_offset[bin]; */ + /* output[offset] = input[ID]; */ + /* local_offset[bin]++; */ + /* } */ +} +#endif + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(512, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel +sort_morton_codes_merged( + global struct Globals* globals, + global uint* global_histogram, + global uchar* input, + uint iteration, + uint numGroups +) +{ + const uint numItems = globals->numPrimitives; + const uint taskID = get_group_id(0); + const uint loc_id = get_local_id(0); + const uint lane = get_sub_group_local_id(); + + uint2 ids = get_thread_range(numItems, numGroups, taskID); + uint startID = ids.x; + uint endID = ids.y; + + local uint histogram[RADIX_BINS]; + local uint hist_tmp[RADIX_BINS]; + + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + { + sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, false, + taskID, startID, endID); + } + else + { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + iteration -= req_iterations; + + sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, true, + taskID, startID, endID); + } + + uint last_group = 0; + if (loc_id == 0) + last_group = atomic_inc_global(&globals->morton_sort_in_flight); + + write_mem_fence(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + + last_group = work_group_broadcast(last_group, 0); + + bool isLastGroup = (loc_id < RADIX_BINS) && (last_group == numGroups - 1); + + uint global_count = 0; + + if (isLastGroup) + { + for (uint j = 0; j < numGroups; j++) + { + const uint count = (j == taskID) ? histogram[loc_id] : load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + loc_id], 0); + store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + loc_id], 0, global_count); + global_count += count; + } + + hist_tmp[get_sub_group_id()] = (get_sub_group_id() < MAX_HW_SIMD_WIDTH) ? sub_group_reduce_add(global_count) : 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (isLastGroup) + { + uint p = hist_tmp[lane]; + p = (lane < get_sub_group_id()) ? 
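+            // keep only the totals of the sub-groups that precede this one, so the
+            // reduce below yields the exclusive prefix over sub-group totals; adding
+            // the in-sub-group exclusive scan then gives each bin its global base
+            // offset, mirroring sort_morton_codes_reduce_bins above.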
p : 0; + + global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count); + + store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numGroups + loc_id], 0, global_count); + } +} + +#if 0 +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +sort_morton_codes_bin_items( + global struct Globals* globals, + uint shift, + global uint* global_histogram, + global char* input0, + global char* input1, + unsigned int input0_offset, + unsigned int input1_offset, + uint iteration) +{ + const uint numItems = globals->numPrimitives; + const uint local_size = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + const uint localID = get_local_id(0); + const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint startID = (taskID + 0) * numItems / numTasks; + const uint endID = (taskID + 1) * numItems / numTasks; + + global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset); + +#if 1 + local uint histogram[RADIX_BINS]; + for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) + histogram[i] = 0; + + for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) + { + const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1); + atomic_add(&histogram[bin], 1); + } + + for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) + global_histogram[RADIX_BINS * taskID + i] = histogram[i]; + +#else + uint histogram[RADIX_BINS]; + for (int i = 0; i < RADIX_BINS; i++) + histogram[i] = 0; + + for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) + { + const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1); + histogram[bin]++; + } + + for (uint i = 0; i < RADIX_BINS; i++) + { + const uint reduced_counter = sub_group_reduce_add(histogram[i]); + global_histogram[RADIX_BINS * taskID + i] = reduced_counter; + } +#endif +} + +#endif + +#define WG_SIZE_WIDE 256 +#define SG_SIZE_SCAN 16 + +// Fast implementation of work_group_scan_exclusive using SLM for WG size 256 and SG size 16 +GRL_INLINE uint work_group_scan_exclusive_add_opt(local uint* tmp, uint val) +{ + const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE_SCAN; + const uint sg_local_id = get_local_id(0) % SG_SIZE_SCAN; + const uint NUM_HW_THREADS_IN_WG = WG_SIZE_WIDE / SG_SIZE_SCAN; + + uint acc = sub_group_scan_exclusive_add(val); + uint acc2 = acc + val; + + tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc2, SG_SIZE_SCAN - 1); + barrier(CLK_LOCAL_MEM_FENCE); + uint loaded_val = tmp[sg_local_id]; + uint wgs_acc = sub_group_scan_exclusive_add(loaded_val); + uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id); + return acc + acc_for_this_hw_thread; +} + +// Wide reduce algorithm is divided into 2 kernels: +// 1. First, partial exclusive add scans are made within each work group using SLM. +// Then, The last work group for each histogram bin perform exclusive add scan along the bins using separate histgram_partials buffer. +// Last work group is determined using global atomics on wg_flags buffer. +// 2. Second kernel globally adds the values from histgram_partials to the histogram buffer where partial sums are. 
+// Then, last work group performs one more work_group scan and add so the histogram buffer values are adjusted with the global ones. +GRL_INLINE void sort_morton_codes_reduce_bins_wide_partial_sum_func( + global struct Globals* globals, + global uint* global_histogram, + global uint* global_histogram_partials, + global uint* wg_flags, + local uint* exclusive_scan_tmp, + uint numTasks, + uint numGroups, + uint iteration, + bool shift_primID) +{ + if (shift_primID) + { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + iteration -= req_iterations; + } + + const uint groupID = get_group_id(0) % RADIX_BINS; + const uint scanGroupID = get_group_id(0) / RADIX_BINS; + uint localID = get_local_id(0); + uint globalID = localID + (scanGroupID * WG_SIZE_WIDE); + const uint lastGroup = (numGroups / WG_SIZE_WIDE); + const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1; + + uint temp = 0; + uint last_count = 0; + if (globalID < numTasks) + { + temp = global_histogram[RADIX_BINS * globalID + groupID]; + + // Store the last value of the work group, it is either last element of histogram or last item in work group + if (globalID == endID) + last_count = temp; + } + + uint val = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp); + + if (globalID <= numTasks) + { + global_histogram[RADIX_BINS * globalID + groupID] = val; + + // Store the block sum value to separate buffer + if (globalID == endID) + global_histogram_partials[scanGroupID * WG_SIZE_WIDE + groupID] = val + last_count; + } + + // Make sure that global_histogram_partials is updated in all work groups + write_mem_fence(CLK_GLOBAL_MEM_FENCE); + barrier(0); + + // Now, wait for the last group for each histogram bin, so we know that + // all work groups already updated the global_histogram_partials buffer + uint last_group = 0; + if (localID == 0) + last_group = atomic_inc_global(&wg_flags[groupID]); + + last_group = work_group_broadcast(last_group, 0); + bool isLastGroup = (last_group == lastGroup - 1); + + // Each of the last groups computes the scan exclusive add for each partial sum we have + if (isLastGroup) + { + uint temp1 = 0; + if (localID < lastGroup) + temp1 = global_histogram_partials[localID * WG_SIZE_WIDE + groupID]; + + uint val2 = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp1); + + if (localID < lastGroup) + global_histogram_partials[localID * WG_SIZE_WIDE + groupID] = val2; + } +} + +GRL_INLINE void sort_morton_codes_reduce_bins_wide_add_reduce_func( + global struct Globals* globals, + global uint* global_histogram, + global uint* global_histogram_partials, + local uint* partials, + uint numTasks, + uint numGroups, + uint iteration, + bool shift_primID) +{ + if (shift_primID) + { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + iteration -= req_iterations; + } + + const uint groupID = get_group_id(0) % RADIX_BINS; + const uint scanGroupID = get_group_id(0) / RADIX_BINS; + const uint lastGroup = (numGroups / WG_SIZE_WIDE); + uint localID = get_local_id(0); + uint globalID = localID + (scanGroupID * WG_SIZE_WIDE); + const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1; + + // Add the global sums to the partials, skip the firsy scanGroupID as the first add + // value is 0 in case of exclusive add scans + if (scanGroupID > 0 && globalID <= numTasks) + { + uint add_val = global_histogram_partials[scanGroupID * RADIX_BINS + groupID]; + 
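+        // Illustrative example (hypothetical sizes): with numTasks = 1024 task
+        // histograms and WG_SIZE_WIDE = 256, each bin is covered by four scan blocks.
+        // The partial-sum kernel turned every block into block-local exclusive offsets
+        // and scanned the four block totals in global_histogram_partials; here every
+        // task in blocks 1..3 adds its block's base so the offsets become global
+        // across all 1024 tasks, before the elected last work group builds the
+        // cross-bin prefix.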
atomic_add_global(&global_histogram[globalID * RADIX_BINS + groupID], add_val); + } + + // Wait for the last group + uint last_group = 0; + if (localID == 0) + last_group = atomic_inc_global(&globals->morton_sort_in_flight); + + last_group = work_group_broadcast(last_group, 0); + bool isLastGroup = (last_group == numGroups - 1); + + // Do the exclusive scan within all bins with global data now + if (isLastGroup) + { + mem_fence_gpu_invalidate(); + + uint global_count = global_histogram[numTasks * RADIX_BINS + localID]; + + partials[get_sub_group_id()] = sub_group_reduce_add(global_count); + + barrier(CLK_LOCAL_MEM_FENCE); + + uint lane = get_sub_group_local_id(); + uint p = partials[lane]; + p = (lane < get_sub_group_id()) ? p : 0; + + global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count); + + store_uint_L1WB_L3WB(&global_histogram[numTasks * RADIX_BINS + localID], 0, global_count); + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN))) +void kernel +sort_morton_codes_reduce_bins_wide_partial_sum( + global struct Globals* globals, + uint numTasks, + uint numGroups, + global uint* global_histogram, + global uint* global_histogram_partials, + global uint* wg_flags, + uint iteration) +{ + local uint exclusive_scan_tmp[WG_SIZE_WIDE / SG_SIZE_SCAN]; + + const uint numItems = globals->numPrimitives; + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, false); + else + sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, true); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN))) +void kernel +sort_morton_codes_reduce_bins_wide_add_reduce( + global struct Globals* globals, + uint numTasks, + uint numGroups, + global uint* global_histogram, + global uint* global_histogram_partials, + uint iteration) +{ + local uint partials[RADIX_BINS]; + + const uint numItems = globals->numPrimitives; + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, false); + else + sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, true); +} diff --git a/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl b/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl new file mode 100644 index 00000000000..dee315adcda --- /dev/null +++ b/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl @@ -0,0 +1,297 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module msb_radix_bitonic_sort; + +kernel_module msb_radix_sort ("morton_msb_radix_bitonic_sort.cl") +{ + links lsc_intrinsics; + + kernel opencl_debug_print < kernelFunction="debug_print_kernel">; + kernel opencl_check_bls < kernelFunction="check_bls_sort">; + + kernel opencl_bottom_level_sort_single_wg < kernelFunction="sort_morton_codes_bottom_level_single_wg">; + + kernel opencl_build_morton_kernel_sort_msb_init < kernelFunction="sort_morton_codes_msb_begin">; + + kernel 
opencl_build_morton_kernel_sort_msb_scheduler < kernelFunction="scheduler">; + + kernel opencl_build_morton_kernel_sort_bottom_level < kernelFunction="sort_morton_codes_bottom_level">; + + kernel opencl_build_morton_kernel_sort_msb_count_items < kernelFunction="sort_morton_codes_msb_count_items">; + kernel opencl_build_morton_kernel_sort_msb_bin_items < kernelFunction="sort_morton_codes_msb_bin_items">; + + kernel opencl_build_morton_kernel_sort_batched_bls_dispatch < kernelFunction="sort_morton_codes_batched_BLS_dispatch">; +} + + +const MSB_RADIX_NUM_VCONTEXTS = 8; +const BOTTOM_LEVEL_SORT_THRESHOLD = 512; + +struct MSBRadixScheduler +{ + dword num_wgs_msb; + dword num_wgs_bls; + + dword scheduler_postsync; + dword _pad1; +}; + +struct MSBRadixArgs +{ + qword p_scheduler; + qword p_num_primitives; +}; + + + + +struct BatchedBLSDispatchEntry +{ + qword p_data_buffer; + qword num_elements; // number of elements in p_data_buffer +}; + + + + +metakernel add_bls_dispatch_init(qword p_storage) +{ + define REG_numWgs REG14; + define REG_p_storage REG15; + + REG_numWgs = 0; + REG_p_storage = p_storage; +} + + + + +// basically this code does: +// bls_args_for_dispatches[dispatchID] = { bls_new_pointer, numPrimitives }; +// dispatchId++; +// +metakernel add_bls_dispatch( + qword p_data, + qword p_num_primitives +) +{ + define C_1 REG0; + define C_8 REG1; + + define C_MIN_PRIMREFS REG2; + + define REG_p_data REG3; + define REG_num_prims REG4; + define REG_no_dispatch REG5; + + define REG_numWgs REG14; + define REG_p_storage REG15; + + C_MIN_PRIMREFS = 2; + + REG_num_prims = 0; + REG_num_prims.lo = load_dword(p_num_primitives); + + REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS; + + goto l_finish if(REG_no_dispatch.lo); + + C_1 = 1; + C_8 = 8; + + // pseudocode: BatchedBLSDispatchEntry.p_data_buffer = p_data + REG_p_data = p_data; + store_qword( REG_p_storage, REG_p_data ); // store the data pointer + + REG_p_storage = REG_p_storage + C_8; // point to next member in BatchedBLSDispatchEntry struct + + // pseudocode: BatchedBLSDispatchEntry.num_elements = *p_num_primitives + store_qword( REG_p_storage, REG_num_prims ); + + REG_p_storage = REG_p_storage + C_8; // point to next BatchedBLSDispatchEntry instance + + REG_numWgs = REG_numWgs + C_1; + +l_finish: + +} + + + + +metakernel batched_bls_dispatch( + qword private_mem +) +{ + define REG_numWgs REG14; + + DISPATCHDIM_X = REG_numWgs; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_build_morton_kernel_sort_batched_bls_dispatch args(private_mem); +} + + + + +metakernel sort_bottom_level( + qword build_globals, + qword input, + qword p_num_primitives) +{ + define REG_num_prims REG0; + define C_MIN_PRIMREFS REG1; + define REG_no_dispatch REG2; + + REG_num_prims = load_dword( p_num_primitives ); + + C_MIN_PRIMREFS = 2; + + REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS; + + goto l_finish if(REG_no_dispatch.lo); + + dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input); + +l_finish: + +} + + + + +metakernel sort( + qword build_globals, + qword input, + qword tmp, + MSBRadixArgs sort_args) +{ + define REG_num_prims REG0; + { + define C_MIN_PRIMREFS REG1; + define C_MAX_PRIMREFS REG2; + define REG_no_dispatch REG3; + define REG_dispatch_single_wg REG4; + + REG_num_prims = load_dword( sort_args.p_num_primitives ); + C_MIN_PRIMREFS = 2; + C_MAX_PRIMREFS = BOTTOM_LEVEL_SORT_THRESHOLD; + + REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS; + REG_dispatch_single_wg = REG_num_prims < C_MAX_PRIMREFS; + + 
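        // pseudocode for the dispatch selection below:
        //   if (num_prims < 2)                                 -> nothing to sort, finish
        //   else if (num_prims < BOTTOM_LEVEL_SORT_THRESHOLD)  -> single-WG bottom-level sort
        //   else                                               -> full MSB radix sort pipeline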
goto l_sort_finish if(REG_no_dispatch.lo); + goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo); + goto l_full_sort; + } + +l_dispatch_single_wg: + + { + dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input); + goto l_sort_finish; + } + +l_full_sort: + + define p_scheduler sort_args.p_scheduler; + define p_scheduler_postsync (sort_args.p_scheduler + offsetof(MSBRadixScheduler.scheduler_postsync) ); + define p_num_wgs_bls (sort_args.p_scheduler + offsetof(MSBRadixScheduler.num_wgs_bls) ); + + define REG_scheduler_postsync REG3; + REG_scheduler_postsync = p_scheduler_postsync; + + define C_0 REG4; + define C_8 REG5; + define C_255 REG6; + C_0 = 0; + C_8 = 8; + C_255 = 255; + + store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore + + REG_num_prims = REG_num_prims + C_255; + REG_num_prims = REG_num_prims >> C_8; + + DISPATCHDIM_X = REG_num_prims.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + control( cs_store_fence ); // commit the semaphore write + + // initialize the whole execution + dispatch opencl_build_morton_kernel_sort_msb_init (MSB_RADIX_NUM_VCONTEXTS, 1, 1) args(build_globals, sort_args.p_scheduler, input, tmp) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on count_items kernel + semaphore_wait while( *p_scheduler_postsync != 1 ); + + dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler) + postsync store_dword( p_scheduler_postsync, 2 ); + + // wait on count_items kernel + semaphore_wait while( *p_scheduler_postsync != 2 ); + + dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input) + postsync store_dword( p_scheduler_postsync, 0 ); + + define C_MASK_HI REG4; + C_MASK_HI = 0x00000000ffffffff; + + l_build_loop: + { + semaphore_wait while( *p_scheduler_postsync != 0 ); + { + dispatch opencl_build_morton_kernel_sort_msb_scheduler(1,1,1) args( sort_args.p_scheduler, input, tmp ) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on scheduler kernel + semaphore_wait while( *p_scheduler_postsync != 1 ); + } + + // load and process the scheduler results + define REG_wg_counts REG0; + define REG_num_msb_wgs REG0.lo; + define REG_num_bls_wgs REG0.hi; + define REG_p_scheduler REG1; + define REG_no_msb_wgs REG2; + { + REG_p_scheduler = p_scheduler; + REG_wg_counts = load_qword( REG_p_scheduler ); + + REG_no_msb_wgs = REG_wg_counts & C_MASK_HI; + REG_no_msb_wgs = REG_no_msb_wgs == 0; + } + + // dispatch new bls WGs + DISPATCHDIM_X = REG_num_bls_wgs; + dispatch_indirect opencl_build_morton_kernel_sort_bottom_level args( p_scheduler, input ); + + // jump out if there are no msb WGs + goto l_sort_finish if (REG_no_msb_wgs); + + DISPATCHDIM_X = REG_num_msb_wgs; + dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler) + postsync store_dword( p_scheduler_postsync, 2 ); + + // wait on count_items kernel + semaphore_wait while( *p_scheduler_postsync != 2 ); + + dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input) + postsync store_dword( p_scheduler_postsync, 0 ); + + // wait till all BLS finished launching + semaphore_wait while( *p_num_wgs_bls != 0 ); + + goto l_build_loop; + } + +l_sort_finish: + +} diff --git a/src/intel/vulkan/grl/gpu/new_sah_builder.grl b/src/intel/vulkan/grl/gpu/new_sah_builder.grl new file mode 100644 index 00000000000..d0a9694acc2 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/new_sah_builder.grl @@ -0,0 +1,665 @@ +// +// 
Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module new_sah_builder; + +kernel_module bfs_kernels ("bvh_build_BFS.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial < kernelFunction="BFS_pass1_initial" > ; + kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed < kernelFunction="BFS_pass1_indexed" > ; + kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial < kernelFunction="BFS_pass2_initial" > ; + kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed < kernelFunction="BFS_pass2_indexed" > ; + + kernel opencl_build_kernel_BinnedSAH_DFS < kernelFunction="DFS" >; + // kernel opencl_build_kernel_BinnedSAH_BuildQNodes < kernelFunction="build_qnodes" >; + kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff < kernelFunction="build_qnodes_pc_kickoff" >; + kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify < kernelFunction="build_qnodes_pc_amplify" >; + kernel opencl_build_kernel_BinnedSAH_begin < kernelFunction = "begin" >; + kernel opencl_build_kernel_BinnedSAH_scheduler < kernelFunction = "scheduler" >; + + kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch < kernelFunction="BFS_pass1_initial_batchable" >; + kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch < kernelFunction="BFS_pass1_indexed_batchable" >; + kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch < kernelFunction="BFS_pass2_initial_batchable" >; + kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch < kernelFunction="BFS_pass2_indexed_batchable" >; + + kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >; + kernel opencl_build_kernel_BinnedSAH_begin_batched < kernelFunction="begin_batchable" >; + + kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched < kernelFunction="build_qnodes_init_scheduler_batched" >; + kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched < kernelFunction="build_qnodes_begin_batchable" >; + kernel opencl_build_kernel_BinnedSAH_qnode_scheduler < kernelFunction="build_qnodes_scheduler" >; + kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch < kernelFunction="build_qnodes_pc_amplify_batched" >; + + kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >; + +} + +kernel opencl_build_kernel_DFS_single_wg < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" > +kernel opencl_build_kernel_DFS_trivial < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial" > +kernel opencl_build_kernel_DFS_single_wg_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" > +kernel opencl_build_kernel_DFS_trivial_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable" > + +kernel single_pass_binsah < source="bvh_build_DFS.cl", kernelFunction="DFS" > + + +const DFS_MIN_PRIMREFS = 6; +const DFS_MAX_PRIMREFS = 256; +const BFS_WG_SIZE_SHIFT = 9; + + + +struct Scheduler +{ + dword num_bfs_wgs; + dword num_dfs_wgs; + + dword scheduler_postsync; + dword _pad1; + + dword num_trivial_builds; + dword num_single_builds; + + dword batched_build_wg_count; + dword batched_build_loop_mask; + +}; + + +struct SAHBuildArgs +{ + qword p_num_primitives; + qword p_qnode_child_buffer; + qword p_scheduler; + qword p_sah_globals; + qword p_globals; + qword p_primref_buffer; + qword p_primref_index_buffers; + qword p_bvh_base; + qword p_bvh2; + qword p_root_buffer_counters; + dword sah_build_flags; + 
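    // leaf sizing parameters; forwarded to opencl_build_kernel_BinnedSAH_begin by the
    // new_sah_build metakernel below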
dword leaf_size; + dword leaf_type; + dword max_internal_nodes; +}; + + +metakernel single_pass_binsah( + qword build_globals, + qword bvh_buffer, + qword build_primref_buffer, + qword build_primref_index_buffers, + dword alloc_backpointers ) +{ + + dispatch single_pass_binsah(1, 1, 1) args( + build_globals, + bvh_buffer, + build_primref_buffer, + build_primref_index_buffers, + alloc_backpointers + ); + +} + + + +metakernel new_sah_build( SAHBuildArgs build_args ) +{ + define REG_num_prims REG0; + + { + define C_MIN_PRIMREFS REG1; + define C_MAX_PRIMREFS REG2; + define REG_dispatch_trivial REG3; + define REG_dispatch_single_wg REG4; + + REG_num_prims = load_dword( build_args.p_num_primitives ); + C_MIN_PRIMREFS = DFS_MIN_PRIMREFS; + C_MAX_PRIMREFS = DFS_MAX_PRIMREFS; + + REG_dispatch_trivial = REG_num_prims <= C_MIN_PRIMREFS; + REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS; + + goto l_dispatch_trivial if(REG_dispatch_trivial.lo); + goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo); + goto l_full_build; + } + +l_dispatch_trivial: + { + dispatch opencl_build_kernel_DFS_trivial (1,1,1) + args( build_args.p_globals, + build_args.p_bvh_base, + build_args.p_primref_buffer, + build_args.p_primref_index_buffers, + build_args.sah_build_flags + ); + + control( wait_idle ); + goto l_done; + } + +l_dispatch_single_wg: + { + dispatch opencl_build_kernel_DFS_single_wg (1,1,1) + args( build_args.p_globals, + build_args.p_bvh_base, + build_args.p_primref_buffer, + build_args.p_primref_index_buffers, + build_args.sah_build_flags + ); + + control( wait_idle ); + goto l_done; + } + + +l_full_build: + + + { + define p_scheduler build_args.p_scheduler; + define p_num_dfs_wgs build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs); + define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) ); + define C_0 REG1; + define C_8 REG2; + C_8 = 8; + C_0 = 0; + + + // + // Init pass + // + store_dword( p_scheduler_postsync, C_0.lo ); + + // compute number of BFS WGs from prim-count + // NOTE: This code uses a hardcoded WG size of 512 for BFS + // If the BFS wg size ever changes, it needs to be touched + // This is necessary because DG2 shifter only supports POW2 shifts + { + define REG_scheduler_postsync REG3; + define C_511 REG4; + define C_1 REG5; + + REG_scheduler_postsync = p_scheduler_postsync; + C_511 = 511; + C_1 = 1; + + store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore + + REG_num_prims = REG_num_prims + C_511; + REG_num_prims = REG_num_prims >> C_8; + REG_num_prims = REG_num_prims >> C_1; + + DISPATCHDIM_X = REG_num_prims.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + control( cs_store_fence ); // commit the semaphore write + + // launch scheduler init kernel + dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1) + args( + build_args.p_scheduler, + build_args.leaf_size, + build_args.leaf_type, + build_args.p_primref_index_buffers, + build_args.p_primref_buffer, + build_args.p_bvh2, + build_args.p_bvh_base, + build_args.p_globals, + build_args.p_sah_globals, + build_args.p_qnode_child_buffer, + build_args.sah_build_flags + ) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on init kernel + semaphore_wait while( *p_scheduler_postsync != 1 ); + + // launch BFS1 pass1 + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial + args( build_args.p_scheduler, + build_args.p_sah_globals) + postsync store_dword( p_scheduler_postsync, 0 ); + + // wait on BFS pass1 + semaphore_wait while( *p_scheduler_postsync != 
0 ); + + // launch BFS pass2 + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial + args( build_args.p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 1 ); + } + + // after BFS pass 2 we drop into a scheduling loop + + l_build_loop: + { + semaphore_wait while( *p_scheduler_postsync != 1 ); + + { + dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1) + args( build_args.p_scheduler, build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 0 ); + + // wait on the scheduler + semaphore_wait while( *p_scheduler_postsync != 0 ); + } + + // load and process the scheduler results + define REG_wg_counts REG0; + define REG_num_bfs_wgs REG0.lo; + define REG_num_dfs_wgs REG0.hi; + define REG_loop_break REG1; + define REG_p_scheduler REG2; + { + REG_p_scheduler = p_scheduler; + REG_wg_counts = load_qword( REG_p_scheduler ); + + define C_MASK_LO REG3 ; + C_MASK_LO = 0xffffffff; + + REG_loop_break = REG_wg_counts & C_MASK_LO; + REG_loop_break = REG_loop_break == 0; + } + + // dispatch new DFS WGs + DISPATCHDIM_X = REG_num_dfs_wgs; + dispatch_indirect opencl_build_kernel_BinnedSAH_DFS + args( p_scheduler, + build_args.p_sah_globals ); + + // jump out if there are no bfs WGs + goto l_build_qnodes if (REG_loop_break); + + // dispatch new BFS1 WGs + DISPATCHDIM_X = REG_num_bfs_wgs; + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed + args( p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 2 ); + + semaphore_wait while( *p_scheduler_postsync != 2 ); + + // dispatch new BFS2 WGs + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed + args( p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 1 ); + + //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore + + // wait until all upcoming DFS WGs have finished launching + // so that the scheduler can refill the launch array + // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely) + semaphore_wait while( *p_num_dfs_wgs != 0 ); + + + goto l_build_loop; + } + } + +l_build_qnodes: + + control( wait_idle ); + + // P/C qnode build + + dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1) + args( build_args.p_sah_globals, + build_args.p_qnode_child_buffer, + build_args.sah_build_flags ); + + { + define p_pc_counters ( build_args.p_root_buffer_counters ); + + define REG_addr REG0; + define REG_produced REG1; + define REG_consumed REG2; + define REG_have_work REG3; + define REG_wg_count REG4; + define C_8 REG5; + define C_16 REG6; + define C_1 REG7; + C_1 = 1; + C_8 = 8; + C_16 = 16; + REG_addr = build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address + + REG_consumed = 0; + + l_qnode_loop: + + control( wait_idle ); // wait for previous pass + + // load counters and compute number of wgs to respawn + REG_produced = load_qword( REG_addr ); REG_addr = REG_addr + C_8; + REG_wg_count = REG_produced - REG_consumed; + REG_have_work = REG_wg_count > 0; + + goto l_done if not(REG_have_work.lo); + + // save REG_consumed as a starting position in p_qnode_child_buffer + store_qword(REG_addr, REG_consumed); REG_addr = REG_addr + C_8; + + // save REG_produced as ending position in p_qnode_child_buffer + store_qword(REG_addr, REG_produced); REG_addr = 
REG_addr - C_16; + + REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration + + // calculate amount of workgroups to schedule + REG_wg_count = REG_wg_count + C_1; + REG_wg_count = REG_wg_count >> C_1; + + DISPATCHDIM_X = REG_wg_count.lo; + + control( cs_store_fence ); // commit the stores + + dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify + args( build_args.p_sah_globals, + build_args.p_qnode_child_buffer, + build_args.sah_build_flags); + + goto l_qnode_loop; + } + +l_done: +} + + + + + + + + + +struct SAHBuildArgsBatchable +{ + qword p_globals_ptrs; + qword p_scheduler; + qword p_buffers_info; + qword p_sah_globals; + + dword num_max_qnode_global_root_buffer_entries; + dword num_builds; + +}; + + +metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args ) +{ + define p_scheduler build_args.p_scheduler; + define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) ); + define p_num_dfs_wgs (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs)); + + // initialize scheduler semaphore + REG0.lo = 0; + store_dword( p_scheduler_postsync, REG0.lo ); + + + // dispatch categorization pass + dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1) + args( + build_args.p_scheduler, + build_args.p_globals_ptrs, + build_args.p_buffers_info, + build_args.p_sah_globals, + build_args.num_builds + ) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on the categorization pass + semaphore_wait while( *p_scheduler_postsync != 1 ); + + + // dispatch the trivial and single-WG passes + { + REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) ); + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + // dispatch trivial builds + + dispatch_indirect opencl_build_kernel_DFS_trivial_batch + args( build_args.p_sah_globals ); + + control( wait_idle ); + + // dispatch single-wg builds + + DISPATCHDIM_X = REG0.hi; + dispatch_indirect opencl_build_kernel_DFS_single_wg_batch + args( build_args.p_sah_globals, build_args.p_scheduler ); + } + + // compute the number of builds not covered by the trivial passes + // skip the builder loop if all builds are satisfied by trivial passes + { + REG1 = REG0.lo; + REG2 = REG0.hi; + REG3 = build_args.num_builds; + REG5 = REG2 + REG1; + REG5 = REG3 - REG5; + REG4 = REG5 == 0 ; + + goto l_done if (REG4.lo); + } + + // REG5 (number of non-trivial builds) will be used to launch build_qnodes kernel after the build loop + define REG_num_nontrivial REG5; + +l_build_outer_loop: + { + + // configure the scheduler to initiate a new block of builds + + dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1) + args( build_args.p_scheduler, build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 0 ); + + // wait on init kernel + semaphore_wait while( *p_scheduler_postsync != 0 ); + + + // read results produced by scheduler init kernel + // lo == BFS wg count. 
hi == all ones if we need to loop again + // + REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count); + REG4 = load_qword( REG0 ); + + // launch BFS1 pass1 + DISPATCHDIM_X = REG4.lo; + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch + args( build_args.p_scheduler, + build_args.p_sah_globals) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on BFS pass1 + semaphore_wait while( *p_scheduler_postsync != 1 ); + + // launch BFS pass2 + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch + args( build_args.p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 0 ); + + l_build_loop: + { + semaphore_wait while( *p_scheduler_postsync != 0 ); + + { + dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1) + args( build_args.p_scheduler, build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on the scheduler + semaphore_wait while( *p_scheduler_postsync != 1 ); + } + + // load and process the scheduler results + define REG_wg_counts REG0; + define REG_num_bfs_wgs REG0.lo; + define REG_num_dfs_wgs REG0.hi; + define REG_loop_break REG1; + define REG_p_scheduler REG2; + { + REG_p_scheduler = p_scheduler; + REG_wg_counts = load_qword( REG_p_scheduler ); + + define C_MASK_LO REG3 ; + C_MASK_LO = 0xffffffff; + + REG_loop_break = REG_wg_counts & C_MASK_LO; + REG_loop_break = REG_loop_break == 0; + } + + // dispatch new DFS WGs + DISPATCHDIM_X = REG_num_dfs_wgs; + dispatch_indirect opencl_build_kernel_BinnedSAH_DFS + args( p_scheduler, + build_args.p_sah_globals ); + + // jump out if there are no bfs WGs + goto l_continue_outer_loop if (REG_loop_break); + + // dispatch new BFS1 WGs + DISPATCHDIM_X = REG_num_bfs_wgs; + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch + args( p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 2 ); + + semaphore_wait while( *p_scheduler_postsync != 2 ); + + // dispatch new BFS2 WGs + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch + args( p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 0 ); + + //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore + + // wait until all upcoming DFS WGs have finished launching + // so that the scheduler can refill the launch array + // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely) + semaphore_wait while( *p_num_dfs_wgs != 0 ); + + goto l_build_loop; + } + + + l_continue_outer_loop: + + + goto l_build_outer_loop if(REG4.hi); + + } + +//////// +// +// Qnode build phase +// +//////// + + // Wait for all outstanding DFS dispatches to complete, then build the QNodes + control( wait_idle ); + + define REG_wg_counts REG1; + define REG_p_scheduler REG2; + define REG_have_work REG3; + define REG_GRB_NUM_MAX_ENTRIES REG4; + + // init scheduler for qnode phase + dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1) + args( build_args.p_scheduler, + build_args.num_builds, + build_args.num_max_qnode_global_root_buffer_entries); + + REG_p_scheduler = p_scheduler; + + control( wait_idle ); + + REG_wg_counts = load_qword( REG_p_scheduler ); + + DISPATCHDIM_X = REG_wg_counts.lo; + + // configure the scheduler to initiate a new block of builds + dispatch_indirect 
opencl_build_kernel_BinnedSAH_qnode_begin_batched + args( build_args.p_scheduler, + build_args.p_sah_globals); + + // read results produced by init scheduler kernel + // lo == num of builds processed. hi == num of maximum global root buffer entries + // + REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count); + REG5 = load_qword( REG0 ); + + REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi; + REG_GRB_NUM_MAX_ENTRIES.hi = 0; + +l_qnode_loop: + { + control( wait_idle ); // wait for previous pass + + dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1) args( build_args.p_scheduler ); + + control( wait_idle ); + + REG_wg_counts = load_qword( REG_p_scheduler ); + REG_have_work = REG_wg_counts > 0; + + goto l_done if not(REG_have_work.lo); + + DISPATCHDIM_X = REG_wg_counts.lo; + + dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch + args( build_args.p_sah_globals, + build_args.p_scheduler ); + + control( wait_idle ); + + REG_wg_counts = load_qword( REG_p_scheduler ); // reload values + REG_wg_counts.lo = REG_wg_counts.hi; + REG_wg_counts.hi = 0; + + REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES; + + goto l_qnode_loop if not(REG_have_work.lo); + + DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled + + dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched + args( build_args.p_sah_globals, + build_args.p_scheduler ); + + goto l_qnode_loop; + } + +//////// +// +// Old implementation - TODO: maybe add switch between two implementations? +// +//////// + // Wait for all outstanding DFS dispatches to complete, then build the QNodes + //DISPATCHDIM_X = REG5.lo; + + //dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes + // args( build_args.p_sah_globals, build_args.p_scheduler ); + + +l_done: + + control( wait_idle ); + +} diff --git a/src/intel/vulkan/grl/gpu/postbuild_info.grl b/src/intel/vulkan/grl/gpu/postbuild_info.grl new file mode 100644 index 00000000000..3039e533a9b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/postbuild_info.grl @@ -0,0 +1,49 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module postbuild_info; // In postbuild we assume output data structure to be DXR compatible + +kernel compacted_size < source="bvh_postbuild_info.cl", kernelFunction="compacted_size" > +kernel current_size < source="bvh_postbuild_info.cl", kernelFunction="current_size" > +kernel serialized_size < source="bvh_postbuild_info.cl", kernelFunction="serialized_size" > +kernel decoded_size < source="bvh_postbuild_info.cl", kernelFunction="decoded_size" > + +metakernel compacted_size( + qword bvh, + qword postbuildInfo) +{ + dispatch compacted_size(1,1,1) args( + bvh, + postbuildInfo); +} + +metakernel current_size( + qword bvh, + qword postbuildInfo) +{ + dispatch current_size(1,1,1) args( + bvh, + postbuildInfo); +} + +metakernel serialized_size( + qword bvh, + qword postbuildInfo) +{ + dispatch serialized_size(1,1,1) args( + bvh, + postbuildInfo); +} + +metakernel decoded_size( + qword bvh, + qword postbuildInfo) +{ + dispatch decoded_size(1,1,1) args( + bvh, + postbuildInfo); +} diff --git a/src/intel/vulkan/grl/gpu/presplit.grl b/src/intel/vulkan/grl/gpu/presplit.grl new file mode 100644 index 00000000000..d0f6e53fbb1 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/presplit.grl @@ -0,0 +1,62 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module presplit; + +kernel_module presplit_kernels 
("bvh_build_presplit.cl") +{ + links lsc_intrinsics; + + kernel opencl_kernel_compute_num_presplits < kernelFunction="compute_num_presplits" >; + kernel opencl_kernel_priority_sum < kernelFunction="priority_sum" >; + kernel opencl_kernel_perform_presplits < kernelFunction="perform_presplits" >; +} + +import struct MKBuilderState "structs.grl"; +import struct MKSizeEstimate "structs.grl"; + + +metakernel compute_num_presplits( + MKBuilderState state, + qword presplit_buffer, + dword numHwThreads ) +{ + dispatch opencl_kernel_compute_num_presplits ( numHwThreads, 1, 1 ) args( + state.build_globals, + state.bvh_buffer, + state.build_primref_buffer, + presplit_buffer, + state.geomDesc_buffer ); +} + + +metakernel priority_sum( + MKBuilderState state, + MKSizeEstimate estimate, + qword presplit_buffer ) +{ + dispatch opencl_kernel_priority_sum ( 1, 1, 1 ) args( + state.build_globals, + presplit_buffer, + estimate.numPrimitivesToSplit / 2 ); +} + +metakernel perform_presplits( + MKBuilderState state, + MKSizeEstimate estimate, + qword presplit_buffer, + dword numHwThreads ) +{ + dispatch opencl_kernel_perform_presplits ( numHwThreads, 1, 1 ) args( + state.build_globals, + state.bvh_buffer, + state.build_primref_buffer, + presplit_buffer, + state.bvh_buffer, + state.geomDesc_buffer, + estimate.numPrimitivesToSplit / 2 ); +} diff --git a/src/intel/vulkan/grl/gpu/qbvh6.h b/src/intel/vulkan/grl/gpu/qbvh6.h new file mode 100644 index 00000000000..22260d07f41 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/qbvh6.h @@ -0,0 +1,933 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "GRLGen12.h" + +#include "shared.h" +#include "quad.h" + +/* ====== GENERAL BVH config ====== */ + +#define BVH_NODE_N6 6 +#define BVH_NODE_N 8 +#define BVH_NODE_N_LOG 3 + +#define SAH_LOG_BLOCK_SHIFT 2 +#define BVH_LEAF_N_MIN BVH_NODE_N6 +#define BVH_LEAF_N_MAX BVH_NODE_N6 + +#define BVH_NODE_DEFAULT_MASK 0xff +#define BVH_NODE_DEGENERATED_MASK 0x00 + +/* ====== QUANTIZATION config ====== */ + +#define QUANT_BITS 8 +#define QUANT_MIN 0 +#define QUANT_MAX 255 +#define QUANT_MAX_MANT (255.0f / 256.0f) + +#define NO_NODE_OFFSET 0 + +/* ======================================================================= */ +/* ============================== BVH BASE =============================== */ +/* ======================================================================= */ + +GRL_INLINE void setBVHBaseBounds(struct BVHBase *base, struct AABB *aabb) +{ + base->Meta.bounds.lower[0] = aabb->lower.x; + base->Meta.bounds.lower[1] = aabb->lower.y; + base->Meta.bounds.lower[2] = aabb->lower.z; + + base->Meta.bounds.upper[0] = aabb->upper.x; + base->Meta.bounds.upper[1] = aabb->upper.y; + base->Meta.bounds.upper[2] = aabb->upper.z; +} + +GRL_INLINE global struct QBVHNodeN *BVHBase_nodeData(struct BVHBase *bvh) +{ + return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET); +} + +GRL_INLINE global struct QBVHNodeN *BVHBase_rootNode(struct BVHBase *bvh) +{ + return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET); +} + +GRL_INLINE global struct Quad *BVHBase_quadLeaves(struct BVHBase *bvh) +{ + return (global struct Quad *)((void *)bvh + 64 * (ulong)bvh->quadLeafStart); +} + +GRL_INLINE uint64_t BVHBase_numNodes(struct BVHBase *bvh) +{ + return bvh->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64; +} + +GRL_INLINE uint64_t BVHBase_numQuads(struct BVHBase *bvh) +{ + return bvh->quadLeafCur - bvh->quadLeafStart; +} + +GRL_INLINE uint64_t 
BVHBase_numProcedurals(struct BVHBase *bvh) +{ + return bvh->proceduralDataCur - bvh->proceduralDataStart; +} + +GRL_INLINE uint64_t BVHBase_numInstances(struct BVHBase *bvh) +{ + return bvh->instanceLeafEnd - bvh->instanceLeafStart; +} + +/* =================================================================== */ +/* ============================== QBVH =============================== */ +/* =================================================================== */ + +__constant const float ulp = FLT_EPSILON; + +GRL_INLINE struct AABB conservativeAABB(struct AABB *aabb) +{ + struct AABB box; + const float4 v4 = max(fabs(aabb->lower), fabs(aabb->upper)); + const float v = ulp * max(v4.x, max(v4.y, v4.z)); + box.lower = aabb->lower - (float4)v; + box.upper = aabb->upper + (float4)v; + return box; +} + +GRL_INLINE struct AABB3f conservativeAABB3f(struct AABB3f* aabb3d) +{ + struct AABB aabb4d = AABBfromAABB3f(*aabb3d); + struct AABB box = conservativeAABB(&aabb4d); + return AABB3fFromAABB(box); +} + +struct QBVH_AABB +{ + uchar lower_x[BVH_NODE_N6]; + uchar upper_x[BVH_NODE_N6]; + uchar lower_y[BVH_NODE_N6]; + uchar upper_y[BVH_NODE_N6]; + uchar lower_z[BVH_NODE_N6]; + uchar upper_z[BVH_NODE_N6]; +}; + +struct QBVHNodeN +{ + float lower[3]; + int offset; + // 16 bytes + uchar type; + uchar pad; + // 18 bytes + char exp[3]; + uchar instMask; + // 22 bytes + uchar childData[6]; + // 28 bytes + struct QBVH_AABB qbounds; // + 36 bytes + // 64 bytes +}; + +GRL_INLINE uint QBVHNodeN_blockIncr(struct QBVHNodeN *This, uint childID) +{ + return This->childData[childID] & 0x3; +} + +GRL_INLINE uint QBVHNodeN_startPrim(struct QBVHNodeN *This, uint childID) +{ + return (This->childData[childID] >> 2) & 0xF; +} + +GRL_INLINE void initQBVHNodeN(struct QBVHNodeN *qnode) +{ + uint *ptr = (uint *)qnode; + for (uint i = 0; i < 16; i++) + ptr[i] = 0; +} + +GRL_INLINE struct AABB extractAABB_QBVHNodeN(struct QBVHNodeN *qnode, uint i) +{ + struct AABB aabb; + const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f); + const int4 lower_i = (int4)(qnode->qbounds.lower_x[i], qnode->qbounds.lower_y[i], qnode->qbounds.lower_z[i], 0); + const int4 upper_i = (int4)(qnode->qbounds.upper_x[i], qnode->qbounds.upper_y[i], qnode->qbounds.upper_z[i], 0); + const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f); + aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8); + aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8); + return aabb; +} + +GRL_INLINE struct AABB getAABB_QBVHNodeN(struct QBVHNodeN *qnode) +{ + struct AABB aabb; +#if 0 + AABB_init(&aabb); + for (uint i = 0; i < BVH_NODE_N6; i++) + { + struct AABB v = extractAABB_QBVHNodeN(qnode, i); + AABB_extend(&aabb, &v); + } +#else + uint lower_x = qnode->qbounds.lower_x[0]; + uint lower_y = qnode->qbounds.lower_y[0]; + uint lower_z = qnode->qbounds.lower_z[0]; + + uint upper_x = qnode->qbounds.upper_x[0]; + uint upper_y = qnode->qbounds.upper_y[0]; + uint upper_z = qnode->qbounds.upper_z[0]; + + for (uint i = 1; i < BVH_NODE_N6; i++) + { + uint lx = qnode->qbounds.lower_x[i]; + uint ly = qnode->qbounds.lower_y[i]; + uint lz = qnode->qbounds.lower_z[i]; + + uint ux = qnode->qbounds.upper_x[i]; + uint uy = qnode->qbounds.upper_y[i]; + uint uz = qnode->qbounds.upper_z[i]; + + bool valid = lx <= ux; + if (valid) + { + lower_x = min(lower_x, lx); + lower_y = min(lower_y, ly); + lower_z = min(lower_z, lz); + + upper_x = max(upper_x, ux); + upper_y = max(upper_y, uy); + upper_z = 
max(upper_z, uz); + } + } + + const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f); + const int4 lower_i = (int4)(lower_x, lower_y, lower_z, 0); + const int4 upper_i = (int4)(upper_x, upper_y, upper_z, 0); + const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f); + aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8); + aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8); +#endif + return aabb; +} + +GRL_INLINE struct AABB3f InternalNode_getAABB3f(struct InternalNode* node) +{ + return AABB3fFromAABB(getAABB_QBVHNodeN((struct QBVHNodeN*)node)); +} + +GRL_INLINE uint getNumChildren_QBVHNodeN(struct QBVHNodeN *qnode) +{ + uint children = 0; + for (uint i = 0; i < BVH_NODE_N6; i++) + { + uint lx = qnode->qbounds.lower_x[i]; + uint ux = qnode->qbounds.upper_x[i]; + bool valid = lx <= ux; + if (valid) + children++; + } + return children; +} + +GRL_INLINE long extractQBVHNodeN_offset(struct QBVHNodeN *qnode) +{ + return ((long)qnode->offset) << 6; +} + +GRL_INLINE void *QBVHNodeN_childrenPointer(struct QBVHNodeN *qnode) +{ + const int offset = qnode->offset; + return (void *)(qnode + offset); +} + +GRL_INLINE void subgroup_setQBVHNodeN_setFields_reduced_bounds(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, struct AABB reduced_aabb) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint k = subgroupLocalID; + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + struct AABB aabb = reduced_aabb; // needs to execute with full subgroup width + aabb = AABB_sub_group_broadcast(&aabb, 0); + + if (subgroupLocalID < BVH_NODE_N6) + { + struct AABB conservative_aabb = conservativeAABB(&aabb); + const float3 len = AABB_size(&conservative_aabb).xyz * up; + int3 exp; + const float3 mant = frexp_vec3(len, &exp); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); + + qbvh_node->offset = offset; + qbvh_node->type = type; + + qbvh_node->lower[0] = org.x; + qbvh_node->lower[1] = org.y; + qbvh_node->lower[2] = org.z; + + qbvh_node->exp[0] = exp.x; + qbvh_node->exp[1] = exp.y; + qbvh_node->exp[2] = exp.z; + + qbvh_node->instMask = mask; + + uchar3 lower_uchar = (uchar3)(0x80); + uchar3 upper_uchar = (uchar3)(0); + + if (subgroupLocalID < numChildren) + { + struct AABB child_aabb = conservativeAABB(input_aabb); + + float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); + lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); + float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); + upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); + + lower_uchar = convert_uchar3_rtn(lower); + upper_uchar = convert_uchar3_rtp(upper); + + if (degenerated) + { + lower_uchar = upper_uchar = 0; + } + } + + qbvh_node->qbounds.lower_x[k] = lower_uchar.x; + qbvh_node->qbounds.lower_y[k] = lower_uchar.y; + qbvh_node->qbounds.lower_z[k] = lower_uchar.z; + qbvh_node->qbounds.upper_x[k] = upper_uchar.x; + qbvh_node->qbounds.upper_y[k] = upper_uchar.y; + qbvh_node->qbounds.upper_z[k] = upper_uchar.z; + + qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 
2 : 1; + +#if ENABLE_CONVERSION_CHECKS == 1 + + if (!(exp.x >= -128 && exp.x <= 127)) + printf("exp_x error \n"); + if (!(exp.y >= -128 && exp.y <= 127)) + printf("exp_y error \n"); + if (!(exp.z >= -128 && exp.z <= 127)) + printf("exp_z error \n"); + + struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k); + if (!AABB_subset(&child_aabb, &child_qaabb)) + { + uint3 lower_i = convert_uint3(lower_uchar); + uint3 upper_i = convert_uint3(upper_uchar); + + printf("\n ERROR %d\n", k); + printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i); + printf("%i uncompressed \n", k); + AABB_print(&child_aabb); + printf("%i compressed \n", k); + AABB_print(&child_qaabb); + + printf("%i uncompressed (as int) \n", k); + AABB_printasInt(&child_aabb); + printf("%i compressed (as int) \n", k); + AABB_printasInt(&child_qaabb); + + int4 e0 = child_aabb.lower < child_qaabb.lower; + int4 e1 = child_aabb.upper > child_qaabb.upper; + printf("e0 %d e1 %d \n", e0, e1); + } +#endif + } +} + +GRL_INLINE void subgroup_setQBVHNodeN_setFields(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated) +{ + struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); + subgroup_setQBVHNodeN_setFields_reduced_bounds(offset, type, input_aabb, numChildren, mask, qbvh_node, degenerated, aabb); +} + +GRL_INLINE void subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, bool active_lane) +{ + const uint lane = get_sub_group_local_id() % 8; + const uint node_in_sg = get_sub_group_local_id() / 8; + const uint k = lane; + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); // needs to execute with full subgroup width + aabb = AABB_sub_group_shuffle(&aabb, node_in_sg * 8); + + if (lane < BVH_NODE_N6 && active_lane) + { + struct AABB conservative_aabb = conservativeAABB(&aabb); + const float3 len = AABB_size(&conservative_aabb).xyz * up; + int3 exp; + const float3 mant = frexp_vec3(len, &exp); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > (float3)QUANT_MAX_MANT ? 
(int3)1 : (int3)0); + + qbvh_node->offset = offset; + qbvh_node->type = type; + + qbvh_node->lower[0] = org.x; + qbvh_node->lower[1] = org.y; + qbvh_node->lower[2] = org.z; + + qbvh_node->exp[0] = exp.x; + qbvh_node->exp[1] = exp.y; + qbvh_node->exp[2] = exp.z; + + qbvh_node->instMask = mask; + + uchar3 lower_uchar = (uchar3)(0x80); + uchar3 upper_uchar = (uchar3)(0); + + if (lane < numChildren) + { + struct AABB child_aabb = conservativeAABB(input_aabb); + + float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); + lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); + float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); + upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); + + lower_uchar = convert_uchar3_rtn(lower); + upper_uchar = convert_uchar3_rtp(upper); + + if (degenerated) + { + lower_uchar = upper_uchar = 0; + } + } + + qbvh_node->qbounds.lower_x[k] = lower_uchar.x; + qbvh_node->qbounds.lower_y[k] = lower_uchar.y; + qbvh_node->qbounds.lower_z[k] = lower_uchar.z; + qbvh_node->qbounds.upper_x[k] = upper_uchar.x; + qbvh_node->qbounds.upper_y[k] = upper_uchar.y; + qbvh_node->qbounds.upper_z[k] = upper_uchar.z; + + qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1; + +#if ENABLE_CONVERSION_CHECKS == 1 + + if (!(exp.x >= -128 && exp.x <= 127)) + printf("exp_x error \n"); + if (!(exp.y >= -128 && exp.y <= 127)) + printf("exp_y error \n"); + if (!(exp.z >= -128 && exp.z <= 127)) + printf("exp_z error \n"); + + struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k); + if (!AABB_subset(&child_aabb, &child_qaabb)) + { + uint3 lower_i = convert_uint3(lower_uchar); + uint3 upper_i = convert_uint3(upper_uchar); + + printf("\n ERROR %d\n", k); + printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i); + printf("%i uncompressed \n", k); + AABB_print(&child_aabb); + printf("%i compressed \n", k); + AABB_print(&child_qaabb); + + printf("%i uncompressed (as int) \n", k); + AABB_printasInt(&child_aabb); + printf("%i compressed (as int) \n", k); + AABB_printasInt(&child_qaabb); + + int4 e0 = child_aabb.lower < child_qaabb.lower; + int4 e1 = child_aabb.upper > child_qaabb.upper; + printf("e0 %d e1 %d \n", e0, e1); + } +#endif + } +} + +GRL_INLINE void subgroup_setInstanceQBVHNodeN(const int offset, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node, const uint instMask) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + + // for degenerated (or inactive) instance ignore this box in exp, origin calculation and make its box be a point in the node origin. + // if it becomes non_degenerated on update, tree topology will be equivalent to what it would be if we would account this degenerated node here. 
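    // The node mask becomes the OR of all child instance masks; a child contributes
    // its box to the common quantization frame only if it is non-degenerated, or if
    // every child is degenerated (in which case all boxes collapse to the origin).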
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK); + + struct AABB aabb; + AABB_init(&aabb); + + // if every child is degenerated (or inactive) instance, we need to init aabb with origin point + uchar commonMask = sub_group_reduce_or_N6(instMask); + if (subgroupLocalID < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK)) + aabb = *input_aabb; + + subgroup_setQBVHNodeN_setFields(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated); +} + + +// return true if is degenerated +GRL_INLINE bool subgroup_setInstanceBox_2xSIMD8_in_SIMD16(struct AABB* input_aabb, const uint numChildren, uchar* mask, const uint instMask, bool active_lane) +{ + const uint lane = get_sub_group_local_id() % 8; + + // for degenerated (or inactive) instance ignore this box in exp, origin calculation and make its box be a point in the node origin. + // if it becomes non_degenerated on update, tree topology will be equivalent to what it would be if we would account this degenerated node here. + bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK); + + // if every child is degenerated (or inactive) instance, we need to init aabb with origin point + uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask); + if (active_lane) + *mask = commonMask; + + if (active_lane && (degenerated && commonMask != BVH_NODE_DEGENERATED_MASK)) + AABB_init(input_aabb); + + return active_lane ? degenerated : false; +} + +GRL_INLINE void subgroup_setInstanceQBVHNodeN_x2(const int offset, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, const uint instMask, bool active_lane) +{ + const uint lane = get_sub_group_local_id() % 8; + + // for degenerated (or inactive) instance ignore this box in exp, origin calculation and make its box be a point in the node origin. + // if it becomes non_degenerated on update, tree topology will be equivalent to what it would be if we would account this degenerated node here. 
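    // _x2 variant: one SIMD16 subgroup emits two QBVH6 nodes at once; lanes 0-7
    // handle the first node and lanes 8-15 the second, hence lane = local_id % 8.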
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK); + + struct AABB aabb; + AABB_init(&aabb); + + // if every child is degenerated (or inactive) instance, we need to init aabb with origin point + uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask); + if (lane < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK)) + aabb = *input_aabb; + + subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated, active_lane); +} + + +GRL_INLINE void subgroup_setQBVHNodeN(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, uint mask) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + + struct AABB aabb; + AABB_init(&aabb); + + if (subgroupLocalID < numChildren) + aabb = *input_aabb; + + subgroup_setQBVHNodeN_setFields(offset, type, &aabb, numChildren, mask, qbvh_node, false); +} + + +GRL_INLINE void subgroup_setQBVHNodeN_x2(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, bool active_lane) +{ + const uint lane = get_sub_group_local_id() % 8; + + struct AABB aabb; + AABB_init(&aabb); + + if (lane < numChildren) + aabb = *input_aabb; + + subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, type, &aabb, numChildren, BVH_NODE_DEFAULT_MASK, qbvh_node, false, active_lane); +} + + +GRL_INLINE void subgroup_QBVHNodeN_setBounds( uniform struct QBVHNodeN* qbvh_node, + uniform struct AABB reduced_bounds, + varying struct AABB input_aabb, + uniform uint numChildren, + varying ushort lane ) +{ + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + int3 exp; + + struct AABB conservative_aabb = conservativeAABB( &reduced_bounds); + const float3 len = AABB_size( &conservative_aabb ).xyz * up; + const float3 mant = frexp_vec3( len, &exp ); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > ( float3 )QUANT_MAX_MANT ? (int3)1 : (int3)0); + + qbvh_node->lower[0] = org.x; + qbvh_node->lower[1] = org.y; + qbvh_node->lower[2] = org.z; + + qbvh_node->exp[0] = exp.x; + qbvh_node->exp[1] = exp.y; + qbvh_node->exp[2] = exp.z; + + qbvh_node->instMask = 0xff; + + uchar3 lower_uchar = 0x80; + uchar3 upper_uchar = 0; + + if ( lane < BVH_NODE_N6 ) + { + ushort k = lane; + if( lane < numChildren ) + { + struct AABB child_aabb = conservativeAABB( &input_aabb ); // conservative ??? 
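            // Quantize against the node frame: child bounds are taken relative to org
            // and scaled by 2^(8 - exp) per axis, then floored (lower) / ceiled (upper)
            // into [QUANT_MIN, QUANT_MAX], so the dequantized 8-bit box always encloses
            // the original child box. Illustrative numbers: a node extent of 3.0 gives
            // frexp mant 0.75, exp 2, so one grid step is 2^(2-8) = 1/64 and the grid
            // spans 255/64 ~ 3.98 units.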
+ + float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) ); + lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) ); + float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) ); + upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) ); + + lower_uchar = convert_uchar3_rtn( lower ); + upper_uchar = convert_uchar3_rtp( upper ); + } + + qbvh_node->qbounds.lower_x[k] = lower_uchar.x; + qbvh_node->qbounds.lower_y[k] = lower_uchar.y; + qbvh_node->qbounds.lower_z[k] = lower_uchar.z; + qbvh_node->qbounds.upper_x[k] = upper_uchar.x; + qbvh_node->qbounds.upper_y[k] = upper_uchar.y; + qbvh_node->qbounds.upper_z[k] = upper_uchar.z; + } + +} + +GRL_INLINE void QBVHNodeN_setBounds(struct QBVHNodeN *qbvh_node, struct AABB *input_aabb, const uint numChildren) +{ + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + int3 exp; + struct AABB aabb; + AABB_init(&aabb); + for (uint i = 0; i < numChildren; i++) + AABB_extend(&aabb, &input_aabb[i]); + + struct AABB conservative_aabb = conservativeAABB(&aabb); + const float3 len = AABB_size(&conservative_aabb).xyz * up; + const float3 mant = frexp_vec3(len, &exp); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); + + qbvh_node->lower[0] = org.x; + qbvh_node->lower[1] = org.y; + qbvh_node->lower[2] = org.z; + + qbvh_node->exp[0] = exp.x; + qbvh_node->exp[1] = exp.y; + qbvh_node->exp[2] = exp.z; + + qbvh_node->instMask = 0xff; + + for (uint k = 0; k < numChildren; k++) + { + struct AABB child_aabb = conservativeAABB(&input_aabb[k]); // conservative ??? + + float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); + lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); + float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); + upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); + + uchar3 lower_uchar = convert_uchar3_rtn(lower); + uchar3 upper_uchar = convert_uchar3_rtp(upper); + + qbvh_node->qbounds.lower_x[k] = lower_uchar.x; + qbvh_node->qbounds.lower_y[k] = lower_uchar.y; + qbvh_node->qbounds.lower_z[k] = lower_uchar.z; + qbvh_node->qbounds.upper_x[k] = upper_uchar.x; + qbvh_node->qbounds.upper_y[k] = upper_uchar.y; + qbvh_node->qbounds.upper_z[k] = upper_uchar.z; + +#if ENABLE_CONVERSION_CHECKS == 1 + if (!(exp.x >= -128 && exp.x <= 127)) + printf("exp_x error \n"); + if (!(exp.y >= -128 && exp.y <= 127)) + printf("exp_y error \n"); + if (!(exp.z >= -128 && exp.z <= 127)) + printf("exp_z error \n"); + + struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k); + if (!AABB_subset(&child_aabb, &child_qaabb)) + { + uint3 lower_i = convert_uint3(lower_uchar); + uint3 upper_i = convert_uint3(upper_uchar); + + printf("\n ERROR %d\n", k); + printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i); + printf("%i uncompressed \n", k); + AABB_print(&child_aabb); + printf("%i compressed \n", k); + AABB_print(&child_qaabb); + + printf("%i uncompressed (as int) \n", k); + AABB_printasInt(&child_aabb); + printf("%i compressed (as int) \n", k); + AABB_printasInt(&child_qaabb); + + int4 e0 = child_aabb.lower < child_qaabb.lower; + int4 e1 = child_aabb.upper > child_qaabb.upper; + printf("e0 %d e1 %d \n", e0, e1); + } +#endif + } + for (uint k = numChildren; k < BVH_NODE_N6; k++) + { + qbvh_node->qbounds.lower_x[k] = 0x80; + qbvh_node->qbounds.lower_y[k] = 0x80; + qbvh_node->qbounds.lower_z[k] = 0x80; + 
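        // lower was just set to 0x80 and upper is set to 0 below, so lower > upper,
        // which marks the slot as unused; getNumChildren_QBVHNodeN and getAABB_QBVHNodeN
        // only accept a child when lower_x <= upper_x.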
qbvh_node->qbounds.upper_x[k] = 0; + qbvh_node->qbounds.upper_y[k] = 0; + qbvh_node->qbounds.upper_z[k] = 0; + } +} + +GRL_INLINE void QBVHNodeN_setChildren(struct QBVHNodeN *qbvh_node, const int offset, const uint numChildren) +{ + qbvh_node->offset = offset; + for (uint k = 0; k < BVH_NODE_N6; k++) + qbvh_node->childData[k] = 1; +} + +GRL_INLINE void QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node) +{ + for (uint k = 0; k < BVH_NODE_N6; k++) + qbvh_node->childData[k] = 1; +} + +GRL_INLINE void SUBGROUP_QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node) +{ + if( get_sub_group_local_id() < BVH_NODE_N6 ) + qbvh_node->childData[get_sub_group_local_id()] = 1; +} + + +GRL_INLINE void QBVHNodeN_setChildIncr2(struct QBVHNodeN *qbvh_node) +{ + for (uint k = 0; k < BVH_NODE_N6; k++) + qbvh_node->childData[k] = 2; +} + +GRL_INLINE void QBVHNodeN_setType(struct QBVHNodeN *qbvh_node, const uint type) +{ + qbvh_node->type = type; +} + +GRL_INLINE void setQBVHNodeN(const int offset, const uint type, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node) +{ + QBVHNodeN_setType(qbvh_node, type); + QBVHNodeN_setChildren(qbvh_node, offset, numChildren); + QBVHNodeN_setBounds(qbvh_node, input_aabb, numChildren); +} + +GRL_INLINE void printQBVHNodeN(struct QBVHNodeN *qnode) +{ + printf(" offset %d type %d \n", qnode->offset, (int)qnode->type); + printf(" lower %f %f %f \n", qnode->lower[0], qnode->lower[1], qnode->lower[2]); + printf(" exp %d %d %d \n", (int)qnode->exp[0], (int)qnode->exp[1], (int)qnode->exp[2]); + printf(" instMask %d \n", qnode->instMask); + + struct AABB aabb0 = extractAABB_QBVHNodeN(qnode, 0); + struct AABB aabb1 = extractAABB_QBVHNodeN(qnode, 1); + struct AABB aabb2 = extractAABB_QBVHNodeN(qnode, 2); + struct AABB aabb3 = extractAABB_QBVHNodeN(qnode, 3); + struct AABB aabb4 = extractAABB_QBVHNodeN(qnode, 4); + struct AABB aabb5 = extractAABB_QBVHNodeN(qnode, 5); + + printf(" lower_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_x[0], qnode->qbounds.lower_x[1], qnode->qbounds.lower_x[2], qnode->qbounds.lower_x[3], qnode->qbounds.lower_x[4], qnode->qbounds.lower_x[5], aabb0.lower.x, aabb1.lower.x, aabb2.lower.x, aabb3.lower.x, aabb4.lower.x, aabb5.lower.x); + printf(" upper_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_x[0], qnode->qbounds.upper_x[1], qnode->qbounds.upper_x[2], qnode->qbounds.upper_x[3], qnode->qbounds.upper_x[4], qnode->qbounds.upper_x[5], aabb0.upper.x, aabb1.upper.x, aabb2.upper.x, aabb3.upper.x, aabb4.upper.x, aabb5.upper.x); + + printf(" lower_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_y[0], qnode->qbounds.lower_y[1], qnode->qbounds.lower_y[2], qnode->qbounds.lower_y[3], qnode->qbounds.lower_y[4], qnode->qbounds.lower_y[5], aabb0.lower.y, aabb1.lower.y, aabb2.lower.y, aabb3.lower.y, aabb4.lower.y, aabb5.lower.y); + printf(" upper_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_y[0], qnode->qbounds.upper_y[1], qnode->qbounds.upper_y[2], qnode->qbounds.upper_y[3], qnode->qbounds.upper_y[4], qnode->qbounds.upper_y[5], aabb0.upper.y, aabb1.upper.y, aabb2.upper.y, aabb3.upper.y, aabb4.upper.y, aabb5.upper.y); + + printf(" lower_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_z[0], qnode->qbounds.lower_z[1], qnode->qbounds.lower_z[2], qnode->qbounds.lower_z[3], qnode->qbounds.lower_z[4], qnode->qbounds.lower_z[5], aabb0.lower.z, aabb1.lower.z, aabb2.lower.z, aabb3.lower.z, aabb4.lower.z, aabb5.lower.z); + printf(" upper_z %d %d %d %d %d %d %f %f %f %f %f %f\n", 
qnode->qbounds.upper_z[0], qnode->qbounds.upper_z[1], qnode->qbounds.upper_z[2], qnode->qbounds.upper_z[3], qnode->qbounds.upper_z[4], qnode->qbounds.upper_z[5], aabb0.upper.z, aabb1.upper.z, aabb2.upper.z, aabb3.upper.z, aabb4.upper.z, aabb5.upper.z); +} + +GRL_INLINE int encodeOffset(global char *bvh_mem, global void *parent, int global_child_offset) +{ + long global_parent_offset = (long)parent - (long)bvh_mem; + global_parent_offset = global_parent_offset & (~(64 - 1)); // FIXME: (sw) this should not be necessary? + int relative_offset = global_child_offset - global_parent_offset; // FIXME: this limits BVH size to 4GB + //if ((int)relative_offset <= 0) printf("relative offset <= 0 %d global_child_offset %d global_parent_offset %d \n", relative_offset,global_child_offset,global_parent_offset); + return relative_offset; +} + +GRL_INLINE void QBVH6Node_set_offset(struct QBVHNodeN *qnode, void *children) +{ + int ofs = (struct QBVHNodeN *)children - qnode; + qnode->offset = ofs; +} + +GRL_INLINE void QBVH6Node_set_type(struct QBVHNodeN *qnode, uint type) +{ + qnode->type = type; +} + +GRL_INLINE uint sortBVHChildrenIDs(uint input) +{ +#if BVH_NODE_N == 8 + return sort8_descending(input); +#else + return sort4_descending(input); +#endif +} + +enum XFM_BOX_OPTION { + XFM_BOX_NO_CLIP = 0, + XFM_BOX_NOT_REFINED_CLIPPED = 1, //<upper); + AABB3f_trim_upper(&child_bounds1, clipBox->upper); + AABB3f_trim_upper(&child_bounds2, clipBox->upper); + AABB3f_trim_upper(&child_bounds3, clipBox->upper); + AABB3f_trim_upper(&child_bounds4, clipBox->upper); + AABB3f_trim_upper(&child_bounds5, clipBox->upper); + } + + child_bounds0 = transform_aabb(child_bounds0, xfm); + child_bounds1 = transform_aabb(child_bounds1, xfm); + child_bounds2 = transform_aabb(child_bounds2, xfm); + child_bounds3 = transform_aabb(child_bounds3, xfm); + child_bounds4 = transform_aabb(child_bounds4, xfm); + child_bounds5 = transform_aabb(child_bounds5, xfm); + + AABB3f_extend(&child_bounds0, &child_bounds1); + AABB3f_extend(&child_bounds2, &child_bounds3); + AABB3f_extend(&child_bounds4, &child_bounds5); + AABB3f_extend(&child_bounds0, &child_bounds2); + AABB3f_extend(&child_bounds0, &child_bounds4); + + return child_bounds0; + } +#endif + +#if DEB_PRINTFS + printf("0"); +#endif + + struct AABB3f child_bounds; + + if (clipOpt != XFM_BOX_NOT_REFINED_TAKE_CLIPBOX) + { + // XFM_BOX_NOT_REFINED_CLIPPED || XFM_BOX_NO_CLIP + child_bounds = InternalNode_getAABB3f(pnode); + if (clipOpt != XFM_BOX_NO_CLIP) + { + AABB3f_intersect(&child_bounds, *clipBox); + } + } + else + { + //XFM_BOX_NOT_REFINED_TAKE_CLIPBOX + child_bounds = *clipBox; + } + + child_bounds = transform_aabb(child_bounds, xfm); + //child_bounds = conservativeAABB3f(&child_bounds); + return child_bounds; +} + +GRL_INLINE AABB3f GRL_OVERLOADABLE compute_xfm_bbox(struct AffineSpace3f xfm, InternalNode* pnode, bool clip, AABB3f* clipBox, float matOverhead) +{ + float transform[12]; + load_row_major_from_AffineSpace3f(xfm, transform); + return compute_xfm_bbox(transform, pnode, clip, clipBox, matOverhead); +} + +GRL_INLINE uint64_t compute_refit_structs_compacted_size(BVHBase* base) +{ + uint dataSize = 0; + + if (BVHBase_HasBackPointers(base)) + { + const uint fatleafEntrySize = (base->fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63; + const uint innerEntrySize = (base->innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63; + + // New atomic update + if(base->quadIndicesDataStart > base->backPointerDataStart) + { + uint numQuads = BVHBase_GetNumQuads(base); + + const uint 
quadTableMainBufferSize = (numQuads + 255) & ~255; + const uint quadLeftoversSize = (base->quadLeftoversCountNewAtomicUpdate + 255) & ~255; + const uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63); + + const uint quadIndicesDataSize = (numQuads * sizeof(QuadDataIndices) + 63) & ~63; + + dataSize += quadTableEntriesSize + quadIndicesDataSize; + } + + dataSize += + ((BVHBase_GetNumInternalNodes(base) * sizeof(uint) + 63) & ~63) + + fatleafEntrySize + innerEntrySize; + } + + return (uint64_t)dataSize; +} + +GRL_INLINE uint64_t compute_compacted_size(BVHBase* base) +{ + uint64_t size = sizeof(BVHBase); + size += BVHBase_GetNumHWInstanceLeaves(base) * sizeof(HwInstanceLeaf); + size += BVHBase_GetNumProcedurals(base) * sizeof(ProceduralLeaf); + size += BVHBase_GetNumQuads(base) * sizeof(QuadLeaf); + size += compute_refit_structs_compacted_size(base); + size += BVHBase_GetNumInternalNodes(base) * sizeof(InternalNode); + size += sizeof(InstanceDesc) * base->Meta.instanceCount; + size += (sizeof(GeoMetaData) * base->Meta.geoCount + 63) & ~63; // align to 64 + size = (size + 63) & ~63; + + return size; +} diff --git a/src/intel/vulkan/grl/gpu/quad.h b/src/intel/vulkan/grl/gpu/quad.h new file mode 100644 index 00000000000..cc1b7d470f8 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/quad.h @@ -0,0 +1,127 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "shared.h" +#include "intrinsics.h" +#include "AABB.h" +#include "AABB3f.h" + +// JDB TODO: Use corresponding GRL structures!!! + +struct Quad +{ + unsigned int shaderIndex; // note: also mask + unsigned int geomIndex; // note: also geom flags in upper 2 bits + unsigned int primIndex0; + unsigned int primIndex1Delta; + float v[4][3]; +}; + +GRL_INLINE unsigned int Quad_getGeomIndex(global struct Quad *quad) +{ + return quad->geomIndex; +} + +GRL_INLINE unsigned int Quad_getPrimIndex0(global struct Quad *quad) +{ + return quad->primIndex0; +} + +GRL_INLINE unsigned int Quad_getPrimIndex1(global struct Quad *quad) +{ + return quad->primIndex0 + (quad->primIndex1Delta & 0xFFFF); +} + +GRL_INLINE float3 load_float3(float *p) +{ + return (float3)(p[0], p[1], p[2]); +} + +GRL_INLINE float3 load_perm_float3(float *p, const uint3 perm) +{ + return (float3)(p[perm.x], p[perm.y], p[perm.z]); +} + +GRL_INLINE float2 load_perm_float2(float *p, const uint2 perm) +{ + return (float2)(p[perm.x], p[perm.y]); +} + +GRL_INLINE float load_perm_float(float *p, const uint perm) +{ + return p[perm]; +} + +GRL_INLINE struct AABB getAABB_Quad(struct Quad *q) +{ + struct AABB aabb; + const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3]))); + const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3]))); + aabb.lower = (float4)(lower, 0.0f); + aabb.upper = (float4)(upper, 0.0f); + return aabb; +} + +GRL_INLINE void Quad_ExtendAABB(struct Quad* q, struct AABB* box) +{ + struct AABB aabb; + const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3]))); + const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3]))); + aabb.lower = (float4)(lower, 0.0f); + aabb.upper = (float4)(upper, 0.0f); + AABB_extend(box, &aabb); +} + +GRL_INLINE float4 getCentroid2_Quad(struct Quad *q) +{ + struct AABB aabb = 
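/* note: the "2" suffix means the result is lower+upper, i.e. twice the centroid */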
getAABB_Quad(q); + return aabb.lower + aabb.upper; +} + +GRL_INLINE void setQuad(struct Quad *quad, const float4 v0, const float4 v1, const float4 v2, const float4 v3, + const uchar j0, const uchar j1, const uchar j2, + const uint geomID, const uint primID0, const uint primID1, const uint geomMask, const uint geomFlags ) +{ + quad->v[0][0] = v0.x; + quad->v[0][1] = v0.y; + quad->v[0][2] = v0.z; + quad->v[1][0] = v1.x; + quad->v[1][1] = v1.y; + quad->v[1][2] = v1.z; + quad->v[2][0] = v2.x; + quad->v[2][1] = v2.y; + quad->v[2][2] = v2.z; + quad->v[3][0] = v3.x; + quad->v[3][1] = v3.y; + quad->v[3][2] = v3.z; + + quad->shaderIndex = (geomMask << 24) | geomID; + quad->geomIndex = geomID | (geomFlags << 30); + quad->primIndex0 = primID0; + const uint delta = primID1 - primID0; + const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4)); + quad->primIndex1Delta = delta | (j << 16) | (1 << 22); // single prim in leaf + +} + +GRL_INLINE void setQuadVertices(struct Quad *quad, const float3 v0, const float3 v1, const float3 v2, const float3 v3) +{ + quad->v[0][0] = v0.x; + quad->v[0][1] = v0.y; + quad->v[0][2] = v0.z; + quad->v[1][0] = v1.x; + quad->v[1][1] = v1.y; + quad->v[1][2] = v1.z; + quad->v[2][0] = v2.x; + quad->v[2][1] = v2.y; + quad->v[2][2] = v2.z; + quad->v[3][0] = v3.x; + quad->v[3][1] = v3.y; + quad->v[3][2] = v3.z; +} diff --git a/src/intel/vulkan/grl/gpu/radix_sort.grl b/src/intel/vulkan/grl/gpu/radix_sort.grl new file mode 100644 index 00000000000..df932057a10 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/radix_sort.grl @@ -0,0 +1,163 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module radix_sort; + +kernel_module radix_kernels ("morton_radix_sort.cl") +{ + links lsc_intrinsics; + kernel opencl_build_morton_kernel_sort_bin_items < kernelFunction="sort_morton_codes_bin_items">; + kernel opencl_build_morton_kernel_sort_reduce_bins < kernelFunction="sort_morton_codes_reduce_bins">; + kernel opencl_build_morton_kernel_sort_scatter_items < kernelFunction="sort_morton_codes_scatter_items">; + + kernel opencl_build_morton_codes_sort_merged < kernelFunction="sort_morton_codes_merged">; + + kernel opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum < kernelFunction="sort_morton_codes_reduce_bins_wide_partial_sum">; + kernel opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce < kernelFunction="sort_morton_codes_reduce_bins_wide_add_reduce">; +} + +metakernel sort( + qword build_globals, + dword shift, + qword global_histogram, + qword input0, + qword input1, + dword input0_offset, + dword input1_offset, + dword iteration, + dword threads) +{ + dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args( + build_globals, + shift, + global_histogram, + input0, + input1, + input0_offset, + input1_offset, + iteration); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args( + threads, + global_histogram); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_sort_scatter_items (threads, 1, 1) args( + build_globals, + shift, + global_histogram, + input0, + input1, + input0_offset, + input1_offset, + iteration); + + control(wait_idle); + +} + +metakernel sort_bin_items( + qword build_globals, + qword global_histogram, + qword wg_flags, + qword input0, + dword iteration, + dword threads, + dword update_wg_flags + ) +{ + dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args( + build_globals, + global_histogram, + wg_flags, + input0, + iteration, + 
threads, + update_wg_flags + ); +} + +metakernel sort_reduce_bins( + qword build_globals, + qword global_histogram, + dword threads, + dword iteration) +{ + dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args( + build_globals, + threads, + global_histogram, + iteration); +} + +metakernel sort_scatter_items( + qword build_globals, + qword global_histogram, + qword input0, + qword input1, + dword iteration, + dword threads, + dword update_morton_sort_in_flight ) +{ + dispatch opencl_build_morton_kernel_sort_scatter_items( threads, 1, 1 ) args( + build_globals, + global_histogram, + input0, + input1, + iteration, + threads, + update_morton_sort_in_flight + ); +} + +metakernel sort_bin_items_merged( + qword build_globals, + qword global_histogram, + qword input0, + dword iteration, + dword threads) +{ + dispatch opencl_build_morton_codes_sort_merged (threads, 1, 1) args( + build_globals, + global_histogram, + input0, + iteration, + threads + ); +} + +metakernel sort_reduce_bins_wide( + qword build_globals, + qword global_histogram, + qword global_histogram_tmp, + qword wg_flags, + dword threads, + dword threads_groups, + dword iteration) +{ + dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum(threads_groups, 1, 1) args( + build_globals, + threads, + threads_groups, + global_histogram, + global_histogram_tmp, + wg_flags, + iteration); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce(threads_groups, 1, 1) args( + build_globals, + threads, + threads_groups, + global_histogram, + global_histogram_tmp, + iteration); +} diff --git a/src/intel/vulkan/grl/gpu/rebraid.grl b/src/intel/vulkan/grl/gpu/rebraid.grl new file mode 100644 index 00000000000..5aa809637a3 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/rebraid.grl @@ -0,0 +1,167 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module rebraid; + +kernel init_scratch < source="bvh_rebraid.cl", kernelFunction="rebraid_init_scratch" > +kernel chase_instance_ptrs < source="bvh_rebraid.cl", kernelFunction="rebraid_chase_instance_pointers" > +kernel calc_aabb < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances" > +kernel calc_aabb_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_indirect" > +kernel calc_aabb_ptr < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers" > +kernel calc_aabb_ptr_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers_indirect" > +kernel count_splits < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits" > +kernel count_splits_SG < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG" > +kernel count_splits_SG_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG_indirect" > +kernel build_primrefs < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs" > +kernel build_primrefs_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs_indirect" > + +//kernel ISA_TEST < source="bvh_rebraid.cl", kernelFunction="ISA_TEST" > +//kernel DEBUG_PRINT < source="bvh_rebraid.cl", kernelFunction="DEBUG_PRINT" > + + +const PRIMREF_GROUP_SIZE = 256; + +const COUNT_SPLITS_GROUP_SIZE = 16; + +struct MKRebraidArgs +{ + qword bvh_buffer; + qword primref_buffer; + qword global_buffer; + qword instances_buffer; + qword rebraid_scratch; + qword flat_instances_buffer; + dword num_instances; + dword num_extra_primrefs; 
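+    // note: num_extra_primrefs is passed through unchanged to the build_primrefs kernels below; it presumably reserves additional primref slots beyond the per-instance ones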
+}; + +metakernel rebraid( + MKRebraidArgs Args + ) +{ + dispatch init_scratch(1,1,1) args( Args.rebraid_scratch ); + dispatch calc_aabb(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer ); + control( wait_idle ); + + //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE); + //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.num_instances ); + + dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch ); + control( wait_idle ); + + define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE); + + dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances ); + control( wait_idle ); + + //dispatch DEBUG_PRINT(1,1,1) args( Args.global_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances ); +} + +metakernel rebraid_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo) +{ + + dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch); + + define num_groups REG0; + num_groups = load_dword(indirectBuildRangeInfo); + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect calc_aabb_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo); + control(wait_idle); + + dispatch_indirect count_splits_SG_indirect + args(Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo); + + define groupsize_1 REG1; // groupsize - 1 + define C_8 REG2; + + groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1 + C_8 = 8; // log_2(PRIMREF_GROUP_SIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE; + DISPATCHDIM_X = num_groups.lo; + + control(wait_idle); + + dispatch_indirect build_primrefs_indirect args( + Args.global_buffer, + Args.bvh_buffer, + Args.instances_buffer, + Args.rebraid_scratch, + Args.primref_buffer, + indirectBuildRangeInfo, + Args.num_extra_primrefs); + control(wait_idle); +} + +metakernel rebraid_ptrs( + MKRebraidArgs Args + ) +{ + dispatch init_scratch(1,1,1) args( Args.rebraid_scratch ); + dispatch chase_instance_ptrs( Args.num_instances, 1, 1) args( Args.instances_buffer, Args.flat_instances_buffer ); + dispatch calc_aabb_ptr(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer ); + control( wait_idle ); + + //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE); + //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch ); + + dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch ); + control( wait_idle ); + + define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE); + + + dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances ); + control( wait_idle ); + +} + +metakernel rebraid_ptrs_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo) +{ + dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch); + + define num_groups REG0; + num_groups = 
load_dword(indirectBuildRangeInfo); + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect chase_instance_ptrs + args(Args.instances_buffer, Args.flat_instances_buffer, indirectBuildRangeInfo); + dispatch_indirect calc_aabb_ptr_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo); + control(wait_idle); + + dispatch_indirect count_splits_SG_indirect + args(Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo); + + define groupsize_1 REG1; // groupsize - 1 + define C_8 REG2; + + groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1 + C_8 = 8; // log_2(PRIMREF_GROUP_SIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE; + DISPATCHDIM_X = num_groups.lo; + + control(wait_idle); + + dispatch_indirect build_primrefs_indirect args( + Args.global_buffer, + Args.bvh_buffer, + Args.flat_instances_buffer, + Args.rebraid_scratch, + Args.primref_buffer, + Args.num_extra_primrefs, + indirectBuildRangeInfo, + Args.num_instances); + control(wait_idle); +} diff --git a/src/intel/vulkan/grl/gpu/shared.h b/src/intel/vulkan/grl/gpu/shared.h new file mode 100644 index 00000000000..0d42d98a1d4 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/shared.h @@ -0,0 +1,182 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "GRLGen12.h" +#pragma once + +#define sizeof_Quad 64 +#define sizeof_Procedural 64 +#define sizeof_PrimRef 32 +#define sizeof_PresplitItem 8 +#define sizeof_HwInstanceLeaf 128 +#define MORTON_BUILDER_SUBTREE_THRESHOLD 256 +#define MORTON_BUILDER_P2_ELEMENTS_IN_SLM 16 * 1024 / 32 +// Temporarily disable localized phase2 due to issues in ELG presi +// This implementation would be replaced with bottom_up + bounding box approach without the need for phase2 refit +#define MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD /*100000*/ 0 + +#define BVH_QUAD_NODE 4 +#define BVH_INSTANCE_NODE 1 +#define BVH_INTERNAL_NODE 0 +#define BVH_PROCEDURAL_NODE 3 +#define BUILDRECORD_STACK_SIZE 48 +#define BINS 16 + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(GPUBVHBuilder) + +struct AABB +{ + float4 lower; + float4 upper; +}; + +typedef struct BlockAllocator +{ + unsigned int start; + unsigned int cur; +} BlockAllocator; + +struct Globals +{ + struct AABB centroidBounds; + + unsigned int build_record_start; + unsigned int numPrimitives; + unsigned int leafPrimType; + unsigned int leafSize; + + unsigned int numSplittedPrimitives; + unsigned int numBuildRecords; + + // spatial split sate + unsigned int numOriginalPrimitives; + float presplitPrioritySum; + float probThreshold; + + // binned-sah bfs state + unsigned int counter; + unsigned int numBuildRecords_extended; + + // sync variable used for global-sync on work groups + unsigned int sync; + + + /* morton code builder state */ + unsigned int shift; // used by adaptive mc-builder + unsigned int shift_mask; // used by adaptive mc-builder + unsigned int binary_hierarchy_root; + unsigned int p0_allocated_num; + unsigned int p0_created_num; + unsigned int morton_sort_in_flight; + unsigned int sort_iterations; + + gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. 
Stashed here as a debug aid +}; + +struct Range +{ + unsigned int start, end; +}; + +struct Triangle +{ + unsigned int vtx[3]; + //unsigned int primID; + //unsigned int geomID; +}; + +struct MortonCodePrimitive +{ + uint64_t index_code; // 64bit code + index combo +}; + +struct BuildRecord +{ + struct AABB centroidBounds; + unsigned int start, end; + __global void *current; +}; + +struct BinaryMortonCodeHierarchy +{ + struct Range range; + unsigned int leftChild; + unsigned int rightChild; + // unsigned int flag; +}; + +typedef struct MortonFlattenedBoxlessNode { + uint binary_hierarchy_index; // only needed when type != BVH_INTERNAL_NODE + uint childOffset_type; // childOffset : 26, type : 6 + uint backPointer; // same usage as in bvh +} MortonFlattenedBoxlessNode; + +struct StatStackEntry +{ + struct AABB aabb; + unsigned int node; + unsigned int type; + unsigned int depth; + float area; +}; + +struct BuildRecordMorton +{ + unsigned int nodeID; + unsigned int items; + unsigned int current_index; + unsigned int parent_index; +}; + +struct Split +{ + float sah; + int dim; + int pos; +}; + +struct BinMapping +{ + float4 ofs, scale; +}; + +struct BinInfo +{ + struct AABB3f boundsX[BINS]; + struct AABB3f boundsY[BINS]; + struct AABB3f boundsZ[BINS]; + uint3 counts[BINS]; +}; + +struct BinInfo2 +{ + struct AABB3f boundsX[BINS * 2]; + struct AABB3f boundsY[BINS * 2]; + struct AABB3f boundsZ[BINS * 2]; + uint3 counts[BINS * 2]; +}; + +struct GlobalBuildRecord +{ + struct BinInfo2 binInfo; + struct BinMapping binMapping; + struct Split split; + struct Range range; + struct AABB leftCentroid; + struct AABB rightCentroid; + struct AABB leftGeometry; + struct AABB rightGeometry; + unsigned int atomicCountLeft; + unsigned int atomicCountRight; + unsigned int buildRecordID; +}; + +GRL_NAMESPACE_END(GPUBVHBuilder) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/structs.grl b/src/intel/vulkan/grl/gpu/structs.grl new file mode 100644 index 00000000000..f15b1d2346b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/structs.grl @@ -0,0 +1,38 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module structs; + +struct MKBuilderState { + qword geomDesc_buffer; + qword build_primref_buffer; + qword build_globals; + qword bvh_buffer; + dword leaf_type; + dword leaf_size; +}; + +struct MKSizeEstimate { + dword numTriangles; + dword numProcedurals; + dword numPrimitives; + dword numMeshes; + dword numBuildPrimitives; + dword numPrimitivesToSplit; + dword instance_descs_start; + dword geo_meta_data_start; + dword node_data_start; + dword leaf_data_start; + dword procedural_data_start; + dword back_pointer_start; + dword sizeTotal; + dword updateScratchSizeTotal; + dword fatleaf_table_start; + dword innernode_table_start; + dword max_fatleaves; + dword quad_indices_data_start; +}; diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.cl b/src/intel/vulkan/grl/gpu/traversal_shader.cl new file mode 100644 index 00000000000..ee5d2afcc75 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/traversal_shader.cl @@ -0,0 +1,277 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "instance.h" +#include "api_interface.h" + +#include "bvh_build_primref.h" +#include "bvh_build_refit.h" + +/* + Create primrefs from array of instance descriptors. 
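+    One work-item handles one instance descriptor; when pIsProcedural[i] is nonzero, a pointer to the caller-supplied AABB at pAABBs + i * aabb_stride is forwarded to primrefs_from_instances for that instance.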
+ */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +TS_primrefs_from_instances( + global struct Globals* globals, + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + uint numInstances, + global struct AABB* primrefs, + global uchar* pAABBs, + global uchar* pIsProcedural, + dword aabb_stride, + uint allowUpdate + ) +{ + const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < numInstances) + { + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; + + global struct GRL_RAYTRACING_AABB* procedural_bb = 0; + if ( pIsProcedural[instanceIndex] ) + { + procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); + } + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + procedural_bb, + allowUpdate); + } +} + +/* + Create primrefs from array of instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel +TS_primrefs_from_instances_indirect( + global struct Globals* globals, + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + uint numInstances, + global struct AABB* primrefs, + global uchar* pAABBs, + global uchar* pIsProcedural, + dword aabb_stride, + uint allowUpdate, + global struct IndirectBuildRangeInfo* indirect_data + ) +{ + const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < indirect_data->primitiveCount) + { + instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instances) + indirect_data->primitiveOffset); + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; + + global struct GRL_RAYTRACING_AABB* procedural_bb = 0; + if ( pIsProcedural[instanceIndex] ) + { + procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); + } + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + procedural_bb, + allowUpdate); + } +} + +/* + Create primrefs from array of pointers to instance descriptors. 
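+    Same as the variants above, except that instances_in holds an array of pointers to GRL_RAYTRACING_INSTANCE_DESC (one pointer per instance) rather than a flat array of descriptors.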
+ */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +TS_primrefs_from_instances_pointers(global struct Globals* globals, + global struct BVHBase* bvh, + global void* instances_in, + uint numInstances, + global struct AABB* primrefs, + global uchar* pAABBs, + global uchar* pIsProcedural, + dword aabb_stride, + uint allowUpdate + ) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC** instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in; + + const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < numInstances) + { + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; + + global struct GRL_RAYTRACING_AABB* procedural_bb = 0; + if (pIsProcedural[instanceIndex]) + { + procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); + } + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + procedural_bb, + allowUpdate); + } +} + +/* + Create primrefs from array of pointers to instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel +TS_primrefs_from_instances_pointers_indirect(global struct Globals* globals, + global struct BVHBase* bvh, + global void* instances_in, + global struct AABB* primrefs, + global uchar* pAABBs, + global uchar* pIsProcedural, + dword aabb_stride, + uint allowUpdate, + global struct IndirectBuildRangeInfo* indirect_data + ) +{ + const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < indirect_data->primitiveCount) + { + instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset; + global const struct GRL_RAYTRACING_INSTANCE_DESC** instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in; + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; + + global struct GRL_RAYTRACING_AABB* procedural_bb = 0; + if (pIsProcedural[instanceIndex]) + { + procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); + } + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + procedural_bb, + allowUpdate); + } +} + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +TS_update_instance_leaves(global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch, + global uchar* aabbs, + global uchar* is_procedural, + dword aabb_stride +) +{ + uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh); + uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); + if (id >= num_leaves) + return; + + struct HwInstanceLeaf* leaves = BVHBase_GetHWInstanceLeaves(bvh); + uint idx = HwInstanceLeaf_GetInstanceIndex(&leaves[id]); + + global GRL_RAYTRACING_AABB* procedural_box = 0; + if (is_procedural[idx]) + { + procedural_box = (global GRL_RAYTRACING_AABB*)(aabbs + (aabb_stride * idx)); + } + + DO_update_instance_leaves( + bvh, + dxrInstancesArray, + dxrInstancesPtr, + instance_aabb_scratch, + id, + procedural_box); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +TS_fixup_leaves( global struct BVHBase* bvh, + global uchar* primref_index, + global PrimRef* primrefs, + uint 
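/* stride, in bytes, between consecutive primref_index entries */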
stride ) + +{ + uint num_inners = BVHBase_GetNumInternalNodes(bvh); + uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); + + // assign 8 lanes to each inner node, 6 of which will do useful work + uint node_id = id / 8; + uint child_id = id % 8; + + bool node_valid = (node_id < num_inners); + + if (node_valid ) + { + global InternalNode* nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh); + global InternalNode* my_node = nodes + node_id; + + if (my_node->nodeType == BVH_INSTANCE_NODE) + { + bool child_valid = (child_id < 6) && InternalNode_IsChildValid(my_node, child_id); + if (child_valid) + { + global HwInstanceLeaf* leaves = (global HwInstanceLeaf*)InternalNode_GetChildren(my_node); + uint leafIndex = (leaves - BVHBase_GetHWInstanceLeaves(bvh)) + child_id; + + const uint primrefID = *(uint*)(primref_index + leafIndex * stride); + + uint type = PRIMREF_isProceduralInstance(&primrefs[primrefID]) ? + BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE; + + InternalNode_SetChildType(my_node, child_id, type); + } + + if (child_id == 0) + my_node->nodeType = BVH_INTERNAL_NODE; + } + } +} + + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel +TS_Refit_per_one_startpoint_sg( + global struct BVHBase* bvh, + global struct AABB3f* instance_leaf_aabbs, + global uchar* procedural_instance_enable_buffer ) +{ + DO_Refit_per_one_startpoint_sg(bvh, (global GRL_RAYTRACING_GEOMETRY_DESC*) bvh, instance_leaf_aabbs, procedural_instance_enable_buffer ); + +} diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.grl b/src/intel/vulkan/grl/gpu/traversal_shader.grl new file mode 100644 index 00000000000..3820996c348 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/traversal_shader.grl @@ -0,0 +1,244 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module traversal_shader; + +kernel_module morton_kernels ("traversal_shader.cl") +{ + links lsc_intrinsics; + + kernel TS_primrefs_from_instances < kernelFunction = "TS_primrefs_from_instances" >; + kernel TS_primrefs_from_instances_indirect < kernelFunction = "TS_primrefs_from_instances_indirect" >; + kernel TS_primrefs_from_instances_ptrs < kernelFunction = "TS_primrefs_from_instances_pointers" >; + kernel TS_primrefs_from_instances_ptrs_indirect < kernelFunction = "TS_primrefs_from_instances_pointers_indirect" >; + kernel TS_update_instance_leaves < kernelFunction = "TS_update_instance_leaves" >; + kernel TS_Refit_per_one_startpoint_sg < kernelFunction = "TS_Refit_per_one_startpoint_sg" >; + kernel TS_fixup_leaves < kernelFunction = "TS_fixup_leaves" >; +} + +struct MKTSBuildArgs +{ + qword build_globals; + qword bvh_buffer; + qword instance_descs; + qword build_primref_buffer; + qword aabb_buffer; + qword is_procedural_buffer; + qword leaf_creation_index_buffer; + dword aabb_stride; + dword num_instances; + dword leaf_creation_index_stride; +}; + +const BUILD_PRIMREFS_GROUPSIZE = 16; + + +metakernel TS_build_primrefs( MKTSBuildArgs build_state, dword allowUpdate ) +{ + define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE); + dispatch TS_primrefs_from_instances(num_groups, 1, 1) args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.instance_descs, + build_state.num_instances, + build_state.build_primref_buffer, + build_state.aabb_buffer, + build_state.is_procedural_buffer, + build_state.aabb_stride, + allowUpdate + ); + +} + +metakernel 
TS_build_primrefs_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1 + C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect TS_primrefs_from_instances_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.instance_descs, + build_state.build_primref_buffer, + build_state.aabb_buffer, + build_state.is_procedural_buffer, + build_state.aabb_stride, + allowUpdate, + indirectBuildRangeInfo + ); + +} + +metakernel TS_build_primrefs_array_of_pointers( MKTSBuildArgs build_state, dword allowUpdate ) +{ + define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE); + dispatch TS_primrefs_from_instances_ptrs(num_groups, 1, 1) args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.instance_descs, + build_state.num_instances, + build_state.build_primref_buffer, + build_state.aabb_buffer, + build_state.is_procedural_buffer, + build_state.aabb_stride, + allowUpdate + ); +} + +metakernel +TS_build_primrefs_array_of_pointers_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1 + C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect TS_primrefs_from_instances_ptrs_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.instance_descs, + build_state.build_primref_buffer, + build_state.aabb_buffer, + build_state.is_procedural_buffer, + build_state.aabb_stride, + allowUpdate, + indirectBuildRangeInfo + ); +} + + + + +const UPDATE_INSTANCE_LEAVES_GROUPSIZE = 16; + +struct MKTSUpdateArgs +{ + qword bvh_buffer; + qword instance_descs; + qword instance_descs_ptrs; + qword aabb_buffer; + qword is_procedural_buffer; + qword refit_scratch; + dword aabb_stride; + dword num_instances; +}; + +metakernel TS_update_instance_leaves( MKTSUpdateArgs update_state ) +{ + define num_groups((update_state.num_instances + UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1) / UPDATE_INSTANCE_LEAVES_GROUPSIZE); + dispatch TS_update_instance_leaves(num_groups, 1, 1) args( + update_state.bvh_buffer, + update_state.instance_descs, + update_state.instance_descs_ptrs, + update_state.refit_scratch, + update_state.aabb_buffer, + update_state.is_procedural_buffer, + update_state.aabb_stride + ); +} + +metakernel TS_update_instance_leaves_indirect( MKTSUpdateArgs update_state, qword indirectBuildRangeInfo ) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1 + C_4 = 4; // log_2(UPDATE_INSTANCE_LEAVES_GROUPSIZE) + + num_groups = 
num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / UPDATE_INSTANCE_LEAVES_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + // need to add indirect offset? + dispatch_indirect TS_update_instance_leaves args( + update_state.bvh_buffer, + update_state.instance_descs, + update_state.instance_descs_ptrs, + update_state.refit_scratch, + update_state.aabb_buffer, + update_state.is_procedural_buffer, + update_state.aabb_stride + ); +} + +metakernel TS_refit(MKTSUpdateArgs update_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end ) +{ + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect TS_Refit_per_one_startpoint_sg + args( + update_state.bvh_buffer, + update_state.refit_scratch, + update_state.is_procedural_buffer + ); +} + + +const FIXUP_LEAVES_NODES_PER_GROUP = 2; + +metakernel TS_fixup_leaves(MKTSBuildArgs build_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end ) +{ + define ONE REG3; + + ONE = 1; + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + ONE; + REG2 = REG2 >> ONE; + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect TS_fixup_leaves + args( + build_state.bvh_buffer, + build_state.leaf_creation_index_buffer, + build_state.build_primref_buffer, + build_state.leaf_creation_index_stride + ); + +} diff --git a/src/intel/vulkan/grl/grl_cl_kernel_gen.py b/src/intel/vulkan/grl/grl_cl_kernel_gen.py new file mode 100644 index 00000000000..18b3a41a420 --- /dev/null +++ b/src/intel/vulkan/grl/grl_cl_kernel_gen.py @@ -0,0 +1,212 @@ +COPYRIGHT = """\ +/* + * Copyright 2021 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +""" + +import argparse +import os + +from grl_parser import parse_grl_file +from mako.template import Template + +TEMPLATE_H = Template(COPYRIGHT + """ +/* This file generated from ${filename}, don't edit directly. 
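+   It enumerates the GRL OpenCL kernels and declares the genX() accessors that map an enum grl_cl_kernel id to its SHA-1 and its brw_kernel data.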
*/ + +#ifndef GRL_CL_KERNEL_H +#define GRL_CL_KERNEL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "genxml/gen_macros.h" +#include "compiler/brw_kernel.h" + +enum grl_cl_kernel { +% for k in kernels: + GRL_CL_KERNEL_${k.upper()}, +% endfor +}; + +const char *genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id); + +void genX(grl_get_cl_kernel)(struct brw_kernel *kernel, enum grl_cl_kernel id); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* INTEL_GRL_H */ +""", output_encoding='utf-8') + +TEMPLATE_C = Template(COPYRIGHT + """ +/* This file generated from ${filename}, don't edit directly. */ + +#include "grl_cl_kernel.h" + +% for k in kernels: +#include "${prefix}_${k}.h" +% endfor + +const char * +genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id) +{ + switch (id) { +% for k in kernels: + case GRL_CL_KERNEL_${k.upper()}: return ${prefix}_${k}_sha1; +% endfor + default: + unreachable("Invalid GRL kernel enum"); + } +}; + +void +${prefix}_grl_get_cl_kernel(struct brw_kernel *kernel, enum grl_cl_kernel id) +{ + switch (id) { +% for k in kernels: + case GRL_CL_KERNEL_${k.upper()}: + *kernel = ${prefix}_${k}; + break; +% endfor + default: + unreachable("Invalid GRL kernel enum"); + } +} +""", output_encoding='utf-8') + +def get_libraries_files(kernel_module): + lib_files = [] + for item in kernel_module[3]: + if item[0] != 'library': + continue + default_file = None + fallback_file = None + path_directory = None + for props in item[2]: + if props[0] == 'fallback': + fallback_file = props[1] + elif props[0] == 'default': + default_file = props[1] + elif props[0] == 'path': + path_directory = props[1] + assert path_directory + assert default_file or fallback_file + if fallback_file: + lib_files.append(os.path.join(path_directory, fallback_file)) + else: + lib_files.append(os.path.join(path_directory, default_file)) + return lib_files + +def add_kernels(kernels, cl_file, entrypoint, libs): + assert cl_file.endswith('.cl') + for lib_file in libs: + assert lib_file.endswith('.cl') + kernels.append((cl_file, entrypoint, ','.join(libs))) + +def get_kernels(grl_nodes): + kernels = [] + for item in grl_nodes: + assert isinstance(item, tuple) + if item[0] == 'kernel': + ann = item[2] + add_kernels(kernels, ann['source'], ann['kernelFunction'], []) + elif item[0] == 'kernel-module': + cl_file = item[2] + libfiles = get_libraries_files(item) + for kernel_def in item[3]: + if kernel_def[0] == 'kernel': + ann = kernel_def[2] + add_kernels(kernels, cl_file, ann['kernelFunction'], libfiles) + return kernels + +def parse_libraries(filenames): + libraries = {} + for fname in filenames: + lib_package = parse_grl_file(fname, []) + for lib in lib_package: + assert lib[0] == 'library' + # Add the directory of the library so that CL files can be found. 
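+            # (lib is the parsed ('library', <name>, <properties>) tuple; the added 'path'
+            # property is what get_libraries_files() uses to resolve relative .cl filenames)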
+ lib[2].append(('path', os.path.dirname(fname))) + libraries[lib[1]] = lib + return libraries + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--out-c', help='Output C file') + parser.add_argument('--out-h', help='Output H file') + parser.add_argument('--ls-kernels', action='store_const', const=True, + help='List all openCL kernels') + parser.add_argument('--prefix', help='Prefix') + parser.add_argument('--library', dest='libraries', action='append', + default=[], help='Libraries to include') + parser.add_argument('files', type=str, nargs='*', help='GRL files') + args = parser.parse_args() + + libraries = parse_libraries(args.libraries) + + kernels = [] + for fname in args.files: + kernels += get_kernels(parse_grl_file(fname, libraries)) + + # Make the list of kernels unique and sorted + kernels = sorted(list(set(kernels))) + + if args.ls_kernels: + for cl_file, entrypoint, libs in kernels: + if not os.path.isabs(cl_file): + cl_file = os.path.join(os.path.dirname(fname), cl_file) + print('{}:{}:{}'.format(cl_file, entrypoint, libs)) + + kernel_c_names = [] + for cl_file, entrypoint, libs in kernels: + cl_file = os.path.splitext(cl_file)[0] + cl_file_name = cl_file.replace('/', '_') + kernel_c_names.append('_'.join([cl_file_name, entrypoint])) + + try: + if args.out_h: + with open(args.out_h, 'wb') as f: + f.write(TEMPLATE_H.render(kernels=kernel_c_names, + filename=os.path.basename(__file__))) + + if args.out_c: + with open(args.out_c, 'wb') as f: + f.write(TEMPLATE_C.render(kernels=kernel_c_names, + prefix=args.prefix, + filename=os.path.basename(__file__))) + except Exception: + # In the event there's an error, this imports some helpers from mako + # to print a useful stack trace and prints it, then exits with + # status 1, if python is run with debug; otherwise it just raises + # the exception + if __debug__: + import sys + from mako import exceptions + sys.stderr.write(exceptions.text_error_template().render() + '\n') + sys.exit(1) + raise + +if __name__ == '__main__': + main() diff --git a/src/intel/vulkan/grl/include/AABB3f.h b/src/intel/vulkan/grl/include/AABB3f.h new file mode 100644 index 00000000000..a3412332c77 --- /dev/null +++ b/src/intel/vulkan/grl/include/AABB3f.h @@ -0,0 +1,459 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "GRLRTASCommon.h" + +#include "affinespace.h" + +#ifndef __OPENCL_VERSION__ +# include "stdio.h" //for printf +#endif + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) + +GRL_INLINE void AABB3f_init(struct AABB3f *aabb) +{ + aabb->lower[0] = (float)(INFINITY); + aabb->lower[1] = (float)(INFINITY); + aabb->lower[2] = (float)(INFINITY); + + aabb->upper[0] = -(float)(INFINITY); + aabb->upper[1] = -(float)(INFINITY); + aabb->upper[2] = -(float)(INFINITY); +} + +GRL_INLINE float3 AABB3f_load_lower( const struct AABB3f* aabb ) +{ + float3 v = { aabb->lower[0], aabb->lower[1], aabb->lower[2] }; + return v; +} +GRL_INLINE float3 AABB3f_load_upper( const struct AABB3f* aabb ) +{ + float3 v = { aabb->upper[0], aabb->upper[1], aabb->upper[2] }; + return v; +} + +GRL_INLINE void AABB3f_extend(struct AABB3f *aabb, const struct AABB3f *v) +{ + aabb->lower[0] = fmin(aabb->lower[0], v->lower[0]); + aabb->lower[1] = fmin(aabb->lower[1], v->lower[1]); + aabb->lower[2] = fmin(aabb->lower[2], v->lower[2]); + aabb->upper[0] = fmax(aabb->upper[0], v->upper[0]); + aabb->upper[1] = fmax(aabb->upper[1], v->upper[1]); + aabb->upper[2] = fmax(aabb->upper[2], v->upper[2]); 
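+    // note: an empty box from AABB3f_init() (+inf lower, -inf upper) is the identity element for this min/max merge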
+} + +GRL_INLINE void AABB3f_intersect(struct AABB3f* aabb, struct AABB3f inters) +{ + aabb->upper[0] = fmin(inters.upper[0],aabb->upper[0]); + aabb->upper[1] = fmin(inters.upper[1],aabb->upper[1]); + aabb->upper[2] = fmin(inters.upper[2],aabb->upper[2]); + aabb->lower[0] = fmax(inters.lower[0],aabb->lower[0]); + aabb->lower[1] = fmax(inters.lower[1],aabb->lower[1]); + aabb->lower[2] = fmax(inters.lower[2],aabb->lower[2]); +} + +GRL_INLINE void AABB3f_trim_upper(struct AABB3f* aabb, const float* upper) +{ + aabb->upper[0] = fmin(upper[0], aabb->upper[0]); + aabb->upper[1] = fmin(upper[1], aabb->upper[1]); + aabb->upper[2] = fmin(upper[2], aabb->upper[2]); +} + +GRL_INLINE void AABB3f_set( struct AABB3f* aabb, float3 lower, float3 upper ) +{ + aabb->lower[0] = lower.x ; + aabb->lower[1] = lower.y ; + aabb->lower[2] = lower.z ; + aabb->upper[0] = upper.x ; + aabb->upper[1] = upper.y ; + aabb->upper[2] = upper.z ; +} + +inline void AABB3f_extend_point(struct AABB3f *aabb, const float3 p) +{ + aabb->lower[0] = fmin(aabb->lower[0], p.x); + aabb->lower[1] = fmin(aabb->lower[1], p.y); + aabb->lower[2] = fmin(aabb->lower[2], p.z); + aabb->upper[0] = fmax(aabb->upper[0], p.x); + aabb->upper[1] = fmax(aabb->upper[1], p.y); + aabb->upper[2] = fmax(aabb->upper[2], p.z); +} + +GRL_INLINE void AABB3f_extendlu(struct AABB3f *aabb, const float3 lower, const float3 upper) +{ + aabb->lower[0] = fmin(aabb->lower[0], lower.x); + aabb->lower[1] = fmin(aabb->lower[1], lower.y); + aabb->lower[2] = fmin(aabb->lower[2], lower.z); + aabb->upper[0] = fmax(aabb->upper[0], upper.x); + aabb->upper[1] = fmax(aabb->upper[1], upper.y); + aabb->upper[2] = fmax(aabb->upper[2], upper.z); +} + +GRL_INLINE float3 AABB3f_size(struct AABB3f* aabb) +{ + return AABB3f_load_upper(aabb) - AABB3f_load_lower(aabb); +} + +GRL_INLINE float AABB3f_halfArea(struct AABB3f *aabb) +{ + const float3 d = AABB3f_load_upper( aabb ) - AABB3f_load_lower( aabb ); + return d.x* (d.y + d.z) + (d.y * d.z); +} + +GRL_INLINE float halfArea_AABB3f(struct AABB3f *aabb) // TODO: Remove me +{ + const float3 d = { aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2] }; + return fma(d.x, (d.y + d.z), d.y * d.z); +} + +GRL_INLINE void AABB3f_set_lower(struct AABB3f* aabb, float3 lower) +{ + aabb->lower[0] = lower.x; + aabb->lower[1] = lower.y; + aabb->lower[2] = lower.z; +} + +GRL_INLINE void AABB3f_set_upper(struct AABB3f* aabb, float3 upper) +{ + aabb->upper[0] = upper.x; + aabb->upper[1] = upper.y; + aabb->upper[2] = upper.z; +} + +GRL_INLINE float3 conservativeExtent(float3 extent) +{ + const float v = FLT_EPSILON * fmax(extent.x, fmax(extent.y, extent.z)); + float3 v3 = { v,v,v }; + extent = extent + v3; + return extent; +} + +inline struct AABB3f GRL_OVERLOADABLE transform_aabb(float3 lower, float3 upper, const float* Transform) +{ +#if 1 + // We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area + // New AABB is center +- Extent. 
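+    // Concretely, with c = (lower + upper)/2, e = (upper - lower)/2 and row i of the transform (m_i0 m_i1 m_i2 | t_i):
+    //     c'_i = m_i0*c.x + m_i1*c.y + m_i2*c.z + t_i
+    //     e'_i = |m_i0|*e.x + |m_i1|*e.y + |m_i2|*e.z
+    // giving the transformed box [c' - e', c' + e']; the code below additionally inflates e slightly via conservativeExtent() for robustness.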
+ // + // For derivation see: + // https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/ + // + + float3 Center = (upper + lower) * 0.5f; + float3 Extent = (conservativeExtent(upper) - lower) * 0.5f; + + float cx = Center.x * Transform[0] + Center.y * Transform[1] + Center.z * Transform[2] + Transform[3]; + float cy = Center.x * Transform[4] + Center.y * Transform[5] + Center.z * Transform[6] + Transform[7]; + float cz = Center.x * Transform[8] + Center.y * Transform[9] + Center.z * Transform[10] + Transform[11]; + float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]); + float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]); + float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]); + + Center.x = cx; Center.y = cy; Center.z = cz; + Extent.x = ex; Extent.y = ey; Extent.z = ez; + + struct AABB3f box; + AABB3f_set_lower(&box, Center - Extent); + AABB3f_set_upper(&box, Center + Extent); + return box; +#else + struct AffineSpace3f xfm = AffineSpace3f_load_row_major(Transform); + + float3 plll = { lower.x, lower.y, lower.z }; + float3 pllu = { lower.x, lower.y, upper.z }; + float3 plul = { lower.x, upper.y, lower.z }; + float3 pluu = { lower.x, upper.y, upper.z }; + float3 pull = { upper.x, lower.y, lower.z }; + float3 pulu = { upper.x, lower.y, upper.z }; + float3 puul = { upper.x, upper.y, lower.z }; + float3 puuu = { upper.x, upper.y, upper.z }; + plll = xfmPoint(xfm, plll) ; + pllu = xfmPoint(xfm, pllu) ; + plul = xfmPoint(xfm, plul) ; + pluu = xfmPoint(xfm, pluu) ; + pull = xfmPoint(xfm, pull) ; + pulu = xfmPoint(xfm, pulu) ; + puul = xfmPoint(xfm, puul) ; + puuu = xfmPoint(xfm, puuu) ; + + float3 p1_min = fmin(plll, pull); + float3 p2_min = fmin(pllu, pulu); + float3 p3_min = fmin(plul, puul); + float3 p4_min = fmin(pluu, puuu); + float3 p1_max = fmax(plll, pull); + float3 p2_max = fmax(pllu, pulu); + float3 p3_max = fmax(plul, puul); + float3 p4_max = fmax(pluu, puuu); + p1_min = fmin(p1_min, p3_min); + p2_min = fmin(p2_min, p4_min); + p1_max = fmax(p1_max, p3_max); + p2_max = fmax(p2_max, p4_max); + p1_min = fmin(p1_min, p2_min); + p1_max = fmax(p1_max, p2_max); + + AABB3f out = { + {p1_min.x,p1_min.y,p1_min.z}, + {p1_max.x,p1_max.y,p1_max.z} + }; + return out; +#endif +} + +GRL_INLINE struct AABB3f GRL_OVERLOADABLE transform_aabb(struct AABB3f box, const float* Transform) +{ + float3 lower = { box.lower[0], box.lower[1], box.lower[2] }; + float3 upper = { box.upper[0], box.upper[1], box.upper[2] }; + return transform_aabb(lower, upper, Transform); +} + +GRL_INLINE struct AABB3f AABB3f_transform(struct AffineSpace3f xfm, struct AABB3f in) +{ + struct AABB3f out; + float rmTransform[12]; + load_row_major_from_AffineSpace3f(xfm, rmTransform); + out = transform_aabb(in, rmTransform); + + return out; +} + +GRL_INLINE bool AABB3f_isIn(struct AABB3f bigger, float3 contained) +{ + bool iscontained = + contained.x >= bigger.lower[0] && + contained.y >= bigger.lower[1] && + contained.z >= bigger.lower[2] && + contained.x <= bigger.upper[0] && + contained.y <= bigger.upper[1] && + contained.z <= bigger.upper[2]; + + return iscontained; +} + +GRL_INLINE bool AABB3f_isSubset(struct AABB3f bigger, struct AABB3f contained) +{ + bool iscontained = + contained.lower[0] >= bigger.lower[0] && + contained.lower[1] >= bigger.lower[1] && + contained.lower[2] >= bigger.lower[2] && + contained.upper[0] <= bigger.upper[0] && + contained.upper[1] <= 
bigger.upper[1] && + contained.upper[2] <= bigger.upper[2]; + + return iscontained; +} + +GRL_INLINE bool AABB3f_is_degenerate(struct AABB3f* box ) +{ + return box->lower[0] > box->upper[0] || + box->lower[1] > box->upper[1] || + box->lower[2] > box->upper[2]; +} + +GRL_INLINE void AABB3f_print(struct AABB3f *aabb) +{ + printf("AABB {\n"); + printf(" lower = %f, %f, %f\n", aabb->lower[0], aabb->lower[1], aabb->lower[2]); + printf(" upper = %f, %f, %f\n", aabb->upper[0], aabb->upper[1], aabb->upper[2]); + printf("}\n"); +} + + + +#ifdef __OPENCL_VERSION__ +GRL_INLINE struct AABB3f AABB3f_sub_group_shuffle(struct AABB3f *aabb, const uint slotID) +{ + struct AABB3f bounds; + bounds.lower[0] = intel_sub_group_shuffle(aabb->lower[0], slotID); + bounds.lower[1] = intel_sub_group_shuffle(aabb->lower[1], slotID); + bounds.lower[2] = intel_sub_group_shuffle(aabb->lower[2], slotID); + bounds.upper[0] = intel_sub_group_shuffle(aabb->upper[0], slotID); + bounds.upper[1] = intel_sub_group_shuffle(aabb->upper[1], slotID); + bounds.upper[2] = intel_sub_group_shuffle(aabb->upper[2], slotID); + return bounds; +} + +GRL_INLINE struct AABB3f AABB3f_sub_group_reduce(struct AABB3f *aabb) +{ + struct AABB3f bounds; + bounds.lower[0] = sub_group_reduce_min(aabb->lower[0]); + bounds.lower[1] = sub_group_reduce_min(aabb->lower[1]); + bounds.lower[2] = sub_group_reduce_min(aabb->lower[2]); + bounds.upper[0] = sub_group_reduce_max(aabb->upper[0]); + bounds.upper[1] = sub_group_reduce_max(aabb->upper[1]); + bounds.upper[2] = sub_group_reduce_max(aabb->upper[2]); + return bounds; +} + +GRL_INLINE struct AABB3f AABB3f_sub_group_scan_exclusive_min_max(struct AABB3f *aabb) +{ + struct AABB3f bounds; + bounds.lower[0] = sub_group_scan_exclusive_min(aabb->lower[0]); + bounds.lower[1] = sub_group_scan_exclusive_min(aabb->lower[1]); + bounds.lower[2] = sub_group_scan_exclusive_min(aabb->lower[2]); + bounds.upper[0] = sub_group_scan_exclusive_max(aabb->upper[0]); + bounds.upper[1] = sub_group_scan_exclusive_max(aabb->upper[1]); + bounds.upper[2] = sub_group_scan_exclusive_max(aabb->upper[2]); + return bounds; +} + +GRL_INLINE struct AABB3f AABB3f_sub_group_scan_inclusive_min_max(struct AABB3f *aabb) +{ + struct AABB3f bounds; + bounds.lower[0] = sub_group_scan_inclusive_min(aabb->lower[0]); + bounds.lower[1] = sub_group_scan_inclusive_min(aabb->lower[1]); + bounds.lower[2] = sub_group_scan_inclusive_min(aabb->lower[2]); + bounds.upper[0] = sub_group_scan_inclusive_max(aabb->upper[0]); + bounds.upper[1] = sub_group_scan_inclusive_max(aabb->upper[1]); + bounds.upper[2] = sub_group_scan_inclusive_max(aabb->upper[2]); + return bounds; +} + +GRL_INLINE void AABB3f_atomic_merge_local_nocheck(local struct AABB3f *aabb, const float4 lower, const float4 upper) +{ + atomic_min((local float *)&aabb->lower + 0, lower.x); + atomic_min((local float *)&aabb->lower + 1, lower.y); + atomic_min((local float *)&aabb->lower + 2, lower.z); + atomic_max((local float *)&aabb->upper + 0, upper.x); + atomic_max((local float *)&aabb->upper + 1, upper.y); + atomic_max((local float *)&aabb->upper + 2, upper.z); +} + + +GRL_INLINE void AABB3f_atomic_merge_global_lu( global struct AABB3f* aabb, const float3 lower, const float3 upper ) +{ + atomic_min( (global float*) & aabb->lower + 0, lower.x ); + atomic_min( (global float*) & aabb->lower + 1, lower.y ); + atomic_min( (global float*) & aabb->lower + 2, lower.z ); + atomic_max( (global float*) & aabb->upper + 0, upper.x ); + atomic_max( (global float*) & aabb->upper + 1, upper.y ); + atomic_max( (global 
float*) & aabb->upper + 2, upper.z ); +} + +GRL_INLINE void AABB3f_atomic_merge_local_lu( local struct AABB3f* aabb, const float3 lower, const float3 upper ) +{ + atomic_min( (local float*) & aabb->lower + 0, lower.x ); + atomic_min( (local float*) & aabb->lower + 1, lower.y ); + atomic_min( (local float*) & aabb->lower + 2, lower.z ); + atomic_max( (local float*) & aabb->upper + 0, upper.x ); + atomic_max( (local float*) & aabb->upper + 1, upper.y ); + atomic_max( (local float*) & aabb->upper + 2, upper.z ); +} + +GRL_INLINE void Uniform_AABB3f_atomic_merge_local_sub_group_lu(uniform local struct AABB3f* aabb, const float3 lower, const float3 upper) +{ + float lx = sub_group_reduce_min(lower.x); + float ly = sub_group_reduce_min(lower.y); + float lz = sub_group_reduce_min(lower.z); + + float ux = sub_group_reduce_max(upper.x); + float uy = sub_group_reduce_max(upper.y); + float uz = sub_group_reduce_max(upper.z); + + if (get_sub_group_local_id() == 0) + { + atomic_min((local float*) & aabb->lower + 0, lx); + atomic_min((local float*) & aabb->lower + 1, ly); + atomic_min((local float*) & aabb->lower + 2, lz); + atomic_max((local float*) & aabb->upper + 0, ux); + atomic_max((local float*) & aabb->upper + 1, uy); + atomic_max((local float*) & aabb->upper + 2, uz); + } +} + +GRL_INLINE void AABB3f_atomic_merge_global_sub_group_lu(uniform global struct AABB3f* aabb, const float3 lower, const float3 upper) +{ + uint lane = get_sub_group_local_id(); + float l[3]; + l[0] = sub_group_reduce_min(lower.x); + l[1] = sub_group_reduce_min(lower.y); + l[2] = sub_group_reduce_min(lower.z); + float u[3]; + u[0] = sub_group_reduce_max(upper.x); + u[1] = sub_group_reduce_max(upper.y); + u[2] = sub_group_reduce_max(upper.z); + + if (lane < 3) + { + atomic_min((global float*)&aabb->lower + lane, l[lane]); + atomic_max((global float*)&aabb->upper + lane, u[lane]); + } +} + +GRL_INLINE void AABB3f_atomic_merge_global( global struct AABB3f* aabb, struct AABB3f* other ) +{ + float3 lower = AABB3f_load_lower( other ); + float3 upper = AABB3f_load_upper( other ); + atomic_min( (global float*) & aabb->lower + 0, lower.x ); + atomic_min( (global float*) & aabb->lower + 1, lower.y ); + atomic_min( (global float*) & aabb->lower + 2, lower.z ); + atomic_max( (global float*) & aabb->upper + 0, upper.x ); + atomic_max( (global float*) & aabb->upper + 1, upper.y ); + atomic_max( (global float*) & aabb->upper + 2, upper.z ); +} + +GRL_INLINE void AABB3f_atomic_merge_localBB_nocheck( local struct AABB3f* aabb, struct AABB3f* bb ) +{ + atomic_min( (local float*) & aabb->lower + 0, bb->lower[0] ); + atomic_min( (local float*) & aabb->lower + 1, bb->lower[1] ); + atomic_min( (local float*) & aabb->lower + 2, bb->lower[2] ); + atomic_max( (local float*) & aabb->upper + 0, bb->upper[0] ); + atomic_max( (local float*) & aabb->upper + 1, bb->upper[1] ); + atomic_max( (local float*) & aabb->upper + 2, bb->upper[2] ); +} + +GRL_INLINE void AABB3f_atomic_merge_local(local struct AABB3f *aabb, const float4 lower, const float4 upper) +{ + if (lower.x < aabb->lower[0]) + atomic_min((local float *)&aabb->lower + 0, lower.x); + if (lower.y < aabb->lower[1]) + atomic_min((local float *)&aabb->lower + 1, lower.y); + if (lower.z < aabb->lower[2]) + atomic_min((local float *)&aabb->lower + 2, lower.z); + if (upper.x > aabb->upper[0]) + atomic_max((local float *)&aabb->upper + 0, upper.x); + if (upper.y > aabb->upper[1]) + atomic_max((local float *)&aabb->upper + 1, upper.y); + if (upper.z > aabb->upper[2]) + atomic_max((local float 
*)&aabb->upper + 2, upper.z); +} + +GRL_INLINE void AABB3f_atomic_merge_global_local(global struct AABB3f *dest, local struct AABB3f *source) +{ + float3 l = AABB3f_load_lower(source); + float3 u = AABB3f_load_upper(source); + atomic_min((global float *)&dest->lower + 0, l.x ); + atomic_min((global float *)&dest->lower + 1, l.y ); + atomic_min((global float *)&dest->lower + 2, l.z ); + atomic_max((global float *)&dest->upper + 0, u.x ); + atomic_max((global float *)&dest->upper + 1, u.y ); + atomic_max((global float *)&dest->upper + 2, u.z ); +} + + +struct AABB3f AABB3f_construct( float3 min, float3 max ) +{ + struct AABB3f bb; + bb.lower[0] = min.x; bb.lower[1] = min.y; bb.lower[2] = min.z; + bb.upper[0] = max.x; bb.upper[1] = max.y; bb.upper[2] = max.z; + return bb; +} + +struct AABB3f AABB3f_select( struct AABB3f left, struct AABB3f right, int3 cond ) +{ + float3 l = select( AABB3f_load_lower(&left), AABB3f_load_lower(&right), cond ); + float3 u = select( AABB3f_load_upper(&left), AABB3f_load_upper(&right), cond ); + return AABB3f_construct( l, u ); +} + +#endif + +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) + diff --git a/src/intel/vulkan/grl/include/GRLGen12.h b/src/intel/vulkan/grl/include/GRLGen12.h new file mode 100644 index 00000000000..20849599e91 --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLGen12.h @@ -0,0 +1,691 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file is to contain structure definitions related to the Gen12 QBVH6 acceleration structures +// +// + +//******************************************************************************************** +// WARNING!!!!! +// This file is shared by OpenCL and C++ source code and must be compatible. +// There should only be C structure definitions and trivial GRL_INLINE functions here +// +//******************************************************************************************** + +#pragma once + +#include "GRLRTASCommon.h" +#include "GRLUtilities.h" + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(GEN12) + + enum_uint8(NodeType) + { + NODE_TYPE_MIXED = 0x0, // identifies a mixed internal node where each child can have a different type + NODE_TYPE_INTERNAL = 0x0, // internal BVH node with 6 children + NODE_TYPE_INSTANCE = 0x1, // instance leaf + NODE_TYPE_PROCEDURAL = 0x3, // procedural leaf + NODE_TYPE_QUAD = 0x4, // quad leaf + NODE_TYPE_INVALID = 0x7 // indicates invalid node + }; + + + typedef enum PrimLeafType + { + TYPE_NONE = 0, + + TYPE_QUAD = 0, + + /* For a node type of NODE_TYPE_PROCEDURAL we support enabling + * and disabling the opaque/non_opaque culling. */ + + TYPE_OPACITY_CULLING_ENABLED = 0, + TYPE_OPACITY_CULLING_DISABLED = 1 + } PrimLeafType; + + #define BVH_MAGIC_MACRO "GEN12_RTAS_005" // If serialization-breaking or algorithm-breaking changes are made, increment the digits at the end + static const char BVH_MAGIC[16] = BVH_MAGIC_MACRO; + + typedef struct BVHBase + { + // TODO: Implement the "copy-first-node" trick... 
duplicate root node here + + uint64_t rootNodeOffset; + + uint32_t reserved; + + uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64 + uint32_t quadLeafStart; + uint32_t quadLeafCur; + uint32_t proceduralDataStart; + uint32_t proceduralDataCur; + uint32_t instanceLeafStart; + uint32_t instanceLeafEnd; + uint32_t backPointerDataStart; // + uint32_t refitTreeletsDataStart; // refit structs + uint32_t refitStartPointDataStart; // + uint32_t BVHDataEnd; + + // number of bottom treelets + // if 1, then the bottom treelet is also tip treelet + uint32_t refitTreeletCnt; + uint32_t refitTreeletCnt2; // always 0, used for atomic updates + // data layout: + // @backPointerDataStart + // 'backpointer' - a dword per inner node. + // The bits are used as follows: + // 2:0 --> Used as a refit counter during BVH refitting. MBZ + // 5:3 --> Number of children + // 31:6 --> Index of the parent node in the internal node array + // The root node has a parent index of all ones + // @refitTreeletsDataStart + // RefitTreelet[], the last treelet is for top treelet all previous are for bottom + // @refitStartPointDataStart + // for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space + // @backPointerDataEnd + + uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves" + uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children) + uint32_t fatLeafTableStart; + uint32_t innerTableStart; + + uint32_t quadLeftoversCountNewAtomicUpdate; // number of quad leftovers for new atomic update + uint32_t quadTableSizeNewAtomicUpdate; // size of quad Table including leftovers, padded to 256 + uint32_t quadIndicesDataStart; + + uint32_t _pad[9]; + + struct RTASMetaData Meta; + + } BVHBase; + + GRL_INLINE struct GeoMetaData* BVHBase_GetGeoMetaData(BVHBase* base) + { + return (struct GeoMetaData*)(((char*)base) + base->Meta.geoDescsStart); + } + +#ifdef __OPENCL_VERSION__ +#define BVH_ROOT_NODE_OFFSET sizeof(BVHBase) +#else +#define BVH_ROOT_NODE_OFFSET sizeof(GRL::RTAS::GEN12::BVHBase) +#endif + +GRL_STATIC_ASSERT( sizeof(BVHBase) == BVH_ROOT_NODE_OFFSET, "Wrong size!"); +GRL_STATIC_ASSERT( (sizeof(BVHBase) % 64) == 0 , "Misaligned size!"); + + typedef struct BackPointers { + } BackPointers; + + // threshold for size of bottom treelets, note usually treelets will be 2-3x smaller than that number + // means that no bottom treelet has more paths than this number + #define TREELET_NUM_STARTPOINTS 1536 + + // threshold under which only one treelet will be created + #define SINGLE_TREELET_THRESHOLD 3072 + + typedef struct LeafTableEntry { + + uint backpointer; + uint inner_node_index; + uint leaf_index; + } LeafTableEntry; + + typedef struct InnerNodeTableEntry { + + uint node_index_and_numchildren; // numchildren in 3 lsbs + uint first_child; + + } InnerNodeTableEntry; + + typedef struct QuadDataIndices + { + uint header_data[4]; + uint vert_idx[4]; + } QuadDataIndices; + + typedef struct RefitTreelet { + uint32_t startpoint_offset; + uint32_t numStartpoints; + uint32_t numNonTrivialStartpoints; + uint8_t maxDepth; + uint8_t depthLess64; // depth from bottom at which there are less 64 paths + uint8_t depthLess128;// depth from bottom at which there are less 128 paths + uint8_t depthLess256;// depth from bottom at which there are less 256 paths + } RefitTreelet; + + // if RefitTreelet has number of startpoints == 1 + // it should be reinterpreted as: + typedef struct 
RefitTreeletTrivial { + uint32_t theOnlyNodeIndex; + uint32_t numStartpoints; // have to be 1 or 0 + int32_t childrenOffsetOfTheNode; // 0th node based + uint8_t maxDepth; + uint8_t numChildrenOfTheNode; + } RefitTreeletTrivial; + + // 5:0 - depth after you die + // 31:6 - Index of the inner node + typedef uint32_t StartPoint; + + struct HwInstanceLeaf; + struct QuadLeaf; + struct ProceduralLeaf; + struct InternalNode; + + typedef struct HwInstanceLeaf HwInstanceLeaf; + typedef struct InternalNode InternalNode; + typedef struct QuadLeaf QuadLeaf; + typedef struct ProceduralLeaf ProceduralLeaf; + + GRL_INLINE uint32_t BackPointer_GetParentIndex( uint32_t bp ) + { + return bp >> 6; + } + GRL_INLINE uint32_t BackPointer_GetNumChildren( uint32_t bp ) + { + return (bp >> 3) & (7); + } + GRL_INLINE uint32_t BackPointer_GetRefitCount( uint32_t bp ) + { + return bp & 7; + } + GRL_INLINE bool BackPointer_IsRoot( uint32_t bp ) + { + return (bp >> 6) == 0x03FFFFFF; + } + + GRL_INLINE InternalNode* BVHBase_GetRootNode( const BVHBase* p ) + { + return (InternalNode*)( ((char*)p) + BVH_ROOT_NODE_OFFSET); + } + + GRL_INLINE AABB3f BVHBase_GetRootAABB(const BVHBase* p) + { + return p->Meta.bounds; + } + + GRL_INLINE InternalNode* BVHBase_GetInternalNodes(const BVHBase* p) + { + return (InternalNode*)(((char*)p) + BVH_ROOT_NODE_OFFSET); + } + GRL_INLINE InternalNode* BVHBase_GetInternalNodesEnd(const BVHBase* p) + { + return (InternalNode*)(((char*)p) + (size_t)(64u * p->nodeDataCur)); + } + GRL_INLINE uint32_t BVHBase_GetNumInternalNodes(const BVHBase* p) + { + return p->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64; + } + + + GRL_INLINE QuadLeaf* BVHBase_GetQuadLeaves(const BVHBase* p) + { + return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafStart)); + } + GRL_INLINE const QuadLeaf* BVHBase_GetQuadLeaves_End(const BVHBase* p) + { + return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafCur)); + } + + GRL_INLINE const ProceduralLeaf* BVHBase_GetProceduralLeaves_End(const BVHBase* p) + { + return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataCur)); + } + + GRL_INLINE ProceduralLeaf* BVHBase_GetProceduralLeaves(const BVHBase* p) + { + return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataStart)); + } + + GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves(const BVHBase* p ) + { + char* pRTASBits = (char*)p; + return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafStart)); + } + + GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves_End(const BVHBase* p ) + { + char* pRTASBits = (char*) p; + return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafEnd)); + } + + GRL_INLINE uint BVHBase_GetNumHWInstanceLeaves( const BVHBase* p ) + { + return (p->instanceLeafEnd - p->instanceLeafStart) / 2; + } + + GRL_INLINE uint* BVHBase_GetRefitStartPoints(const BVHBase* p) + { + return (uint32_t*)(((char*)p) + (size_t)(64u * p->refitStartPointDataStart)); + } + + GRL_INLINE uint BVHBase_GetRefitStartPointsSize(const BVHBase* p) + { + return 64u * (p->fatLeafTableStart - p->refitStartPointDataStart); + } + + GRL_INLINE uint StartPoint_GetDepth(StartPoint s) + { + return s & ((1 << 6) - 1); + } + + GRL_INLINE uint StartPoint_GetNodeIdx(StartPoint s) + { + return s >> 6; + } + + GRL_INLINE RefitTreelet* BVHBase_GetRefitTreeletDescs(const BVHBase* p) + { + return (RefitTreelet*)(((char*)p) + (size_t)(64u * p->refitTreeletsDataStart)); + } + + // this is treelet count as should be executed, ie. num of bottom treelets if there are top and bottoms. 
+ // to get real number of all treelets including tip, the formula is + // actualNumTreelets = refitTreeletCnt > 1 ? refitTreeletCnt + 1 : 1; + GRL_INLINE uint32_t* BVHBase_GetRefitTreeletCntPtr(BVHBase* p) + { + return &p->refitTreeletCnt; + } + + GRL_INLINE uint32_t BVHBase_GetRefitTreeletCnt(const BVHBase* p) + { + return p->refitTreeletCnt; + } + + GRL_INLINE uint32_t BVHBase_IsSingleTreelet(const BVHBase* p) + { + return p->refitTreeletCnt == 1; + } + + GRL_INLINE BackPointers* BVHBase_GetBackPointers(const BVHBase* p) + { + return (BackPointers*)(((char*)p) + (size_t)(64u * p->backPointerDataStart)); + } + + + GRL_INLINE LeafTableEntry* BVHBase_GetFatLeafTable(const BVHBase* p) + { + return (LeafTableEntry*)(((char*)p) + (size_t)(64u * p->fatLeafTableStart)); + } + GRL_INLINE InnerNodeTableEntry* BVHBase_GetInnerNodeTable(const BVHBase* p) + { + return (InnerNodeTableEntry*)(((char*)p) + (size_t)(64u * p->innerTableStart)); + } + GRL_INLINE QuadDataIndices* BVHBase_GetQuadDataIndicesTable(const BVHBase* p) + { + return (QuadDataIndices*)(((char*)p) + (size_t)(64u * p->quadIndicesDataStart)); + } + + GRL_INLINE unsigned* InnerNode_GetBackPointer( + BackPointers* backpointersStruct, + uint32_t inodeOffset /*in 64B units, from the earliest Inner node*/) + { + uint* backpointersArray = (uint*)backpointersStruct; + // BACKPOINTER_LAYOUT + uint new_index = inodeOffset; //<-layout canonical + //uint new_index = inodeOffset*16; //<-layout scattered + // uint new_index = (inodeOffset & (~0xFFFF)) | (((inodeOffset & 0xFF) << 8) | ((inodeOffset & 0xFF00) >> 8)); //<-layout hashed + + return backpointersArray + new_index; + } + + GRL_INLINE uint32_t BVHBase_GetRefitStructsDataSize(const BVHBase* p) + { + return 64u * (p->BVHDataEnd - p->backPointerDataStart); + } + + GRL_INLINE uint32_t BVHBase_GetBackpointersDataSize(const BVHBase* p) + { + return 64u * (p->refitTreeletsDataStart - p->backPointerDataStart); + } + + GRL_INLINE uint32_t* BVHBase_GetBVHDataEnd( const BVHBase* p ) + { + return (uint32_t*)(((char*)p) + (size_t)(64u * p->BVHDataEnd)); + } + + GRL_INLINE bool BVHBase_HasBackPointers( const BVHBase* p ) + { + return p->refitTreeletsDataStart > p->backPointerDataStart; + } + + GRL_INLINE const size_t BVHBase_GetNumQuads(const BVHBase* p) + { + return p->quadLeafCur - p->quadLeafStart; + } + + GRL_INLINE const size_t BVHBase_GetNumProcedurals(const BVHBase* p) + { + return p->proceduralDataCur - p->proceduralDataStart; + } + + GRL_INLINE const size_t BVHBase_GetNumInstances(const BVHBase* p) + { + return (p->instanceLeafEnd - p->instanceLeafStart) / 2; + } + + GRL_INLINE const size_t BVHBase_totalBytes(const BVHBase* p) + { + return p->BVHDataEnd * 64u; + } + + + + struct HwInstanceLeaf + { + /* first 64 bytes accessed during traversal */ + struct Part0 + { + //uint32_t shaderIndex : 24; + //uint32_t geomMask : 8; + uint32_t DW0; + + // uint32_t instanceContributionToHitGroupIndex : 24; + // uint32_t pad0 : 8 + // + // NOTE: Traversal shaders are implemented by aliasing instance leaves as procedural and sending them through the procedural path + // For a procedural instance, bit 29 should be set to 1, to disable "opaque culling" + // and bits 30 and 31 must be zero. 
See also the definition of the 'PrimLeafDesc' structure + uint32_t DW1; + + // uint64_t rootNodePtr : 48; + // uint64_t instFlags : 8; + // uint64_t pad1 : 8; + uint64_t DW2_DW3; + + // Vec3f world2obj_vx; // 1st row of Worl2Obj transform + float world2obj_vx_x; + float world2obj_vx_y; + float world2obj_vx_z; + + // Vec3f world2obj_vy; // 2nd row of Worl2Obj transform + float world2obj_vy_x; + float world2obj_vy_y; + float world2obj_vy_z; + + // Vec3f world2obj_vz; // 3rd row of Worl2Obj transform + float world2obj_vz_x; + float world2obj_vz_y; + float world2obj_vz_z; + + // Vec3f obj2world_p; // translation of Obj2World transform (on purpose in fist 64 bytes) + float obj2world_p_x; + float obj2world_p_y; + float obj2world_p_z; + } part0; + + /* second 64 bytes accessed during shading */ + // NOTE: Everything in this block is under SW control + struct Part1 + { + // uint64_t bvhPtr : 48; + // uint64_t pad : 16; + uint64_t DW0_DW1; + + uint32_t instanceID; + uint32_t instanceIndex; + + // Vec3f world2obj_vx; // 1st row of Worl2Obj transform + float obj2world_vx_x; + float obj2world_vx_y; + float obj2world_vx_z; + + // Vec3f world2obj_vy; // 2nd row of Worl2Obj transform + float obj2world_vy_x; + float obj2world_vy_y; + float obj2world_vy_z; + + // Vec3f world2obj_vz; // 3rd row of Worl2Obj transform + float obj2world_vz_x; + float obj2world_vz_y; + float obj2world_vz_z; + + // Vec3f obj2world_p; // translation of Obj2World transform (on purpose in fist 64 bytes) + float world2obj_p_x; + float world2obj_p_y; + float world2obj_p_z; + } part1; + }; + + __constant const uint64_t c_one = 1ul; + + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceMask( const HwInstanceLeaf* p ) + { + return p->part0.DW0 >> 24; + } + + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceContributionToHitGroupIndex( const HwInstanceLeaf* p ) + { + return p->part0.DW1 & 0x00ffffff; + } + + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceFlags( const HwInstanceLeaf* p ) + { + return (p->part0.DW2_DW3 >> 48) & 0xff; + } + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceID( const HwInstanceLeaf* p ) + { + return p->part1.instanceID; + } + + GRL_INLINE gpuva_t HwInstanceLeaf_GetBVH( const HwInstanceLeaf* p ) { return p->part1.DW0_DW1 & ((c_one << 48) - 1); } + GRL_INLINE gpuva_t HwInstanceLeaf_GetStartNode( const HwInstanceLeaf* p ) { return p->part0.DW2_DW3 & ((c_one << 48) - 1); } + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceIndex( const HwInstanceLeaf* p ) { return p->part1.instanceIndex; } + + GRL_INLINE void HwInstanceLeaf_GetTransform(struct HwInstanceLeaf* p, float* transform) + { + transform[0] = p->part1.obj2world_vx_x; + transform[1] = p->part1.obj2world_vy_x; + transform[2] = p->part1.obj2world_vz_x; + transform[3] = p->part0.obj2world_p_x; + transform[4] = p->part1.obj2world_vx_y; + transform[5] = p->part1.obj2world_vy_y; + transform[6] = p->part1.obj2world_vz_y; + transform[7] = p->part0.obj2world_p_y; + transform[8] = p->part1.obj2world_vx_z; + transform[9] = p->part1.obj2world_vy_z; + transform[10] = p->part1.obj2world_vz_z; + transform[11] = p->part0.obj2world_p_z; + } + + GRL_INLINE void HwInstanceLeaf_SetBVH( HwInstanceLeaf* p, gpuva_t b ) { + uint64_t mask = ((c_one << 48) - 1); + uint64_t v = p->part1.DW0_DW1; + v = (b & mask) | (v & ~mask); + p->part1.DW0_DW1 = v; + } + GRL_INLINE void HwInstanceLeaf_SetStartNode( HwInstanceLeaf* p, gpuva_t b ) { + uint64_t mask = ((c_one << 48) - 1); + uint64_t v = p->part0.DW2_DW3; + v = (b & mask) | (v & ~mask); + p->part0.DW2_DW3 = v; + } + GRL_INLINE void 
HwInstanceLeaf_SetStartNodeAndInstanceFlags( HwInstanceLeaf* p, + gpuva_t root, + uint8_t flags ) { + uint64_t mask = ((1ull << 48) - 1); + uint64_t v = (root & mask) | ((uint64_t)(flags)<<48); + p->part1.DW0_DW1 = v; + } + + struct InternalNode + { + float lower[3]; // world space origin of quantization grid + int32_t childOffset; // offset to all children in 64B multiples + + uint8_t nodeType; // the type of the node + uint8_t pad; // unused byte + + int8_t exp_x; // 2^exp_x is the size of the grid in x dimension + int8_t exp_y; // 2^exp_y is the size of the grid in y dimension + int8_t exp_z; // 2^exp_z is the size of the grid in z dimension + uint8_t nodeMask; // mask used for ray filtering + + struct ChildData + { + //uint8_t blockIncr : 2; // size of child in 64 byte blocks. Must be ==2 for instance leaves, <=2 for quad leaves. + //uint8_t startPrim : 4; // start primitive in fat leaf mode or child type in mixed mode + //uint8_t pad : 2; // unused bits + uint8_t bits; + } childData[6]; + + uint8_t lower_x[6]; // the quantized lower bounds in x-dimension + uint8_t upper_x[6]; // the quantized upper bounds in x-dimension + uint8_t lower_y[6]; // the quantized lower bounds in y-dimension + uint8_t upper_y[6]; // the quantized upper bounds in y-dimension + uint8_t lower_z[6]; // the quantized lower bounds in z-dimension + uint8_t upper_z[6]; // the quantized upper bounds in z-dimension + }; + + GRL_INLINE uint InternalNode_GetChildBlockIncr( const InternalNode* p, uint idx ) + { + return p->childData[idx].bits & 3; + } + GRL_INLINE uint InternalNode_GetChildStartPrim( const InternalNode* p, uint idx ) + { + return (p->childData[idx].bits>>2) & 0xf; + } + + GRL_INLINE uint8_t InternalNode_GetChildType( const InternalNode* p, uint idx ) + { + return (p->childData[idx].bits >> 2) & 0xF; + } + + GRL_INLINE void InternalNode_SetChildType( InternalNode* p, uint idx, uint type ) + { + uint bits = p->childData[idx].bits; + const uint mask = (0xF << 2); + bits = ((type << 2) & mask) | (bits & ~mask); + p->childData[idx].bits = (uint8_t)bits; + } + + GRL_INLINE bool InternalNode_IsChildValid( const InternalNode* p, size_t child ) + { + bool lower = p->lower_x[child] & 0x80; // invalid nodes are indicated by setting lower_msb = 1 and upper_msb=0 + bool upper = p->upper_x[child] & 0x80; + return !lower || upper; + } + + GRL_INLINE AABB3f InternalNode_GetChildAABB(const InternalNode* node, size_t i) + { + float4 lower, upper; + const float4 base = { node->lower[0], node->lower[1], node->lower[2], 0.0f }; + const int4 lower_i = { node->lower_x[i], node->lower_y[i], node->lower_z[i], 0 }; + const int4 upper_i = { node->upper_x[i], node->upper_y[i], node->upper_z[i], 0 }; + const int4 exp_i = { node->exp_x, node->exp_y, node->exp_z, 0 }; + lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8); + upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8); + AABB3f aabb3f = { + { lower.x, lower.y, lower.z }, + { upper.x, upper.y, upper.z } }; + return aabb3f; + } + + GRL_INLINE void* InternalNode_GetChildren( InternalNode* node) + { + return (void*)(((char*)node) + node->childOffset * 64); + } + + typedef struct PrimLeafDesc + { + //uint32_t shaderIndex : 24; // shader index used for shader record calculations + //uint32_t geomMask : 8; // geometry mask used for ray masking + uint32_t shaderIndex_geomMask; + + //uint32_t geomIndex : 29; // the geometry index specifies the n'th geometry of the scene + //PrimLeafType type : 1; // see above + //GeometryFlags geomFlags : 2; // 
geometry flags of this geometry + uint32_t geomIndex_flags; + } PrimLeafDesc; + + GRL_INLINE uint32_t PrimLeaf_GetShaderIndex( const PrimLeafDesc* p ) + { + return p->shaderIndex_geomMask & ((1 << 24) - 1); + } + GRL_INLINE uint32_t PrimLeaf_GetGeoIndex( const PrimLeafDesc* p ) + { + return p->geomIndex_flags & ((1<<29)-1); + } + GRL_INLINE uint32_t PrimLeaf_GetGeomFlags( const PrimLeafDesc* p ) + { + return (p->geomIndex_flags >> 30); + } + GRL_INLINE uint32_t PrimLeaf_GetType(const PrimLeafDesc* p) + { + return (p->geomIndex_flags >> 29) & 1; + } + + struct QuadLeaf + { + PrimLeafDesc leafDesc; + + uint32_t primIndex0; + + //uint32_t primIndex1Delta : 16; + //uint32_t j0 : 2; + //uint32_t j1 : 2; + //uint32_t j2 : 2; + //uint32_t last : 1; // last quad in list + //uint32_t pad : 9; + uint32_t DW1; + + float v[4][3]; + }; + + GRL_INLINE uint32_t QuadLeaf_GetPrimIndexDelta( const QuadLeaf* p ) + { + return p->DW1 & 0x0000ffff; + } + GRL_INLINE uint32_t QuadLeaf_GetPrimIndex0( const QuadLeaf* p ) + { + return p->primIndex0; + } + GRL_INLINE uint32_t QuadLeaf_GetPrimIndex1( const QuadLeaf* p ) + { + return p->primIndex0 + QuadLeaf_GetPrimIndexDelta(p); + } + GRL_INLINE bool QuadLeaf_IsSingleTriangle( const QuadLeaf* p ) + { + return QuadLeaf_GetPrimIndexDelta(p) == 0; + } + GRL_INLINE uint32_t QuadLeaf_GetSecondTriangleIndices( const QuadLeaf* p ) + { + return (p->DW1>>16) & 0x3f; + } + + GRL_INLINE void QuadLeaf_SetVertices( QuadLeaf* quad, float3 v0, float3 v1, float3 v2, float3 v3 ) + { + quad->v[0][0] = v0.x; + quad->v[0][1] = v0.y; + quad->v[0][2] = v0.z; + quad->v[1][0] = v1.x; + quad->v[1][1] = v1.y; + quad->v[1][2] = v1.z; + quad->v[2][0] = v2.x; + quad->v[2][1] = v2.y; + quad->v[2][2] = v2.z; + quad->v[3][0] = v3.x; + quad->v[3][1] = v3.y; + quad->v[3][2] = v3.z; + } + + + struct ProceduralLeaf { + PrimLeafDesc leafDesc; + + // Number of primitives + "last" bits. + // The meaning of this section is SW-defined and flexible + uint32_t DW1 ; + uint32_t _primIndex[13]; + } ; + +GRL_NAMESPACE_END(Gen12) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/include/GRLIntTypes.h b/src/intel/vulkan/grl/include/GRLIntTypes.h new file mode 100644 index 00000000000..573dbbc7481 --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLIntTypes.h @@ -0,0 +1,152 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +//******************************************************************************************** +// WARNING!!!!! 
+// +// This file is shared by OpenCL and C++ source code and must be a pure C header +// There should only be C structure definitions and trivial inline functions here +// +//******************************************************************************************** + +#pragma once + +#include "GRLOCLCompatibility.h" + +GRL_NAMESPACE_BEGIN(GRL) + + typedef uint32_t dword; + typedef uint64_t qword; + typedef qword gpuva_t; + + + enum_uint8( InstanceFlags ) + { + INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1, + INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2, + INSTANCE_FLAG_FORCE_OPAQUE = 0x4, + INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8, + }; + + enum_uint8( GeometryFlags ) + { + GEOMETRY_FLAG_NONE = 0x0, + GEOMETRY_FLAG_OPAQUE = 0x1, + GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2, + }; + + enum_uint8( GeometryType ) + { + GEOMETRY_TYPE_TRIANGLES = 0, + GEOMETRY_TYPE_PROCEDURAL = 1, + NUM_GEOMETRY_TYPES = 2 + }; + + // NOTE: Does NOT match DXR + enum_uint8( IndexFormat ) + { + INDEX_FORMAT_NONE = 0, // INDEX_FORMAT_NONE Indicates non-indexed geometry + INDEX_FORMAT_R16_UINT = 2, + INDEX_FORMAT_R32_UINT = 4, + INDEX_FORMAT_END = INDEX_FORMAT_R32_UINT + 1 + }; + + // NOTE: Does NOT match DXR + enum_uint8( VertexFormat ) + { + VERTEX_FORMAT_R32G32_FLOAT = 0, + VERTEX_FORMAT_R32G32B32_FLOAT = 1, + VERTEX_FORMAT_R16G16_FLOAT = 2, + VERTEX_FORMAT_R16G16B16A16_FLOAT = 3, + VERTEX_FORMAT_R16G16_SNORM = 4, + VERTEX_FORMAT_R16G16B16A16_SNORM = 5, + VERTEX_FORMAT_R16G16B16A16_UNORM = 6, + VERTEX_FORMAT_R16G16_UNORM = 7, + VERTEX_FORMAT_R10G10B10A2_UNORM = 8, + VERTEX_FORMAT_R8G8B8A8_UNORM = 9, + VERTEX_FORMAT_R8G8_UNORM = 10, + VERTEX_FORMAT_R8G8B8A8_SNORM = 11, + VERTEX_FORMAT_R8G8_SNORM = 12, + VERTEX_FORMAT_END = VERTEX_FORMAT_R8G8_SNORM + 1 + }; + + + + enum_uint32(RTASFlags) + { + // These flags match DXR + BUILD_FLAG_ALLOW_UPDATE = 1<<0, + BUILD_FLAG_ALLOW_COMPACTION = 1<<1, + BUILD_FLAG_PREFER_FAST_TRACE = 1<<2, + BUILD_FLAG_PREFER_FAST_BUILD = 1<<3, + BUILD_FLAG_MINIMIZE_MEMORY = 1<<4, + BUILD_FLAG_PERFORM_UPDATE = 1<<5, + + // internal flags start here + BUILD_FLAG_DISALLOW_REBRAID = 1<<16, + + BUILD_FLAG_ALL = 0x0001003f + }; + + enum_uint8(BVHType) + { + BVH_TYPE_NONE, // This is a sentinel for drivers to use when compiling out GRL on non-RT devices + BVH_TYPE_GEN12, + }; + + enum_uint8(PostBuildInfoType) + { + PBI_CURRENT_SIZE, + PBI_COMPACTED_SIZE, + PBI_DXR_TOOLS_VISUALIZATION_DESC, + PBI_DXR_SERIALIZATION_DESC, + }; + + enum_uint32(HazardTypes) + { + HAZARD_RTAS_READ = 1 << 0, + HAZARD_RTAS_WRITE = 1 << 1, + HAZARD_READ = 1 << 2, + HAZARD_WRITE = 1 << 3, + HAZARD_ALL = 0xf + }; + + enum_uint32(RaytracingAccelerationStructureType) + { + TOP_LEVEL = 0x0, + BOTTOM_LEVEL = 0x1, + }; + + typedef struct PostbuildInfoCurrentSize + { + uint64_t CurrentSizeInBytes; + } PostbuildInfoCurrentSize; + + typedef struct PostbuildInfoCompactedSize + { + uint64_t CompactedSizeInBytes; + } PostbuildInfoCompactedSize; + + typedef struct PostbuildInfoToolsVisualizationDesc + { + uint64_t DecodedSizeInBytes; + } PostbuildInfoToolsVisualizationDesc; + + typedef struct PostbuildInfoSerializationDesc + { + uint64_t SerializedSizeInBytes; + uint64_t NumBottomLevelAccelerationStructurePointers; + } PostbuildInfoSerializationDesc; + + typedef struct DecodeHeader + { + RaytracingAccelerationStructureType Type; + uint32_t NumDesc; + } DecodeHeader; + + +GRL_NAMESPACE_END(GRL) \ No newline at end of file diff --git a/src/intel/vulkan/grl/include/GRLOCLCompatibility.h 
b/src/intel/vulkan/grl/include/GRLOCLCompatibility.h new file mode 100644 index 00000000000..dd9ff2c271a --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLOCLCompatibility.h @@ -0,0 +1,205 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#ifdef __OPENCL_VERSION__ + +typedef uchar uint8_t; +typedef ushort uint16_t; +typedef uint uint32_t; +typedef ulong uint64_t; +typedef char int8_t; +typedef short int16_t; +typedef int int32_t; +typedef long int64_t; + +#else + +#include + +typedef uint8_t uchar; +typedef uint16_t ushort; +typedef uint32_t uint; +typedef uint64_t ulong; + +#define __constant +#define __global + +typedef struct uint2 +{ +#ifdef __cplusplus + uint2() {}; + uint2( uint ix, uint iy ) : x( ix ), y( iy ) {}; +#endif + uint x; + uint y; +} uint2; + +typedef struct uint3 +{ +#ifdef __cplusplus + uint3() {}; + uint3( uint ix, uint iy, uint iz ) : x( ix ), y( iy ), z( iz ) {}; +#endif + uint x; + uint y; + uint z; +} uint3; + +typedef struct int3 +{ + int32_t x; + int32_t y; + int32_t z; + +#ifdef __cplusplus + int3() {}; + int3(int32_t ix, int32_t iy, int32_t iz) : x(ix), y(iy), z(iz) {}; + + int3 operator+(const int32_t i) const { return int3(this->x + i, this->y + i, this->z + i); } + int3 operator<<(const int32_t i) const { return int3(this->x << i, this->y << i, this->z << i); } +#endif +} int3; + +typedef struct int4 +{ + int32_t x; + int32_t y; + int32_t z; + int32_t w; + +#ifdef __cplusplus + int4() {}; + int4(int32_t ix, int32_t iy, int32_t iz, int32_t iw) : x(ix), y(iy), z(iz), w(iw) {}; + + int4 operator+(const int32_t i) const { return int4(this->x + i, this->y + i, this->z + i, this->w + i); } + int4 operator-(const int32_t i) const { return int4(this->x - i, this->y - i, this->z - i, this->w - i); } + int4 operator<<(const int32_t i) const { return int4(this->x << i, this->y << i, this->z << i, this->w << i); } +#endif +} int4; + +typedef struct float3 +{ + float x; + float y; + float z; + +#ifdef __cplusplus + float3(){}; + float3( float ix, float iy, float iz ) : x(ix), y(iy), z(iz){}; + + float3 operator+( const float3& f3 ) { return float3( this->x + f3.x, this->y + f3.y, this->z + f3.z ); } + float3 operator*( const float& f ) { return float3( this->x * f, this->y * f, this->z * f ); } + float3 operator*( const float3& f3 ) const { return float3(this->x * f3.x, this->y * f3.y, this->z * f3.z); } + float3 operator-() { return float3(-this->x, -this->y, -this->z); } + float3 operator-( const float3& f3) { return float3(this->x - f3.x, this->y - f3.y, this->z - f3.z); } +#endif +} float3; + +typedef struct float4 +{ + float x; + float y; + float z; + float w; + +#ifdef __cplusplus + float4() {}; + float4( float ix, float iy, float iz, float iw ) : x( ix ), y( iy ), z( iz ), w( iw ) {}; + + float4 operator+(const float4& f4) const { return float4(this->x + f4.x, this->y + f4.y, this->z + f4.z, this->w + f4.w); } + float4 operator*(const float4& f4) const { return float4(this->x * f4.x, this->y * f4.y, this->z * f4.z, this->w * f4.w); } +#endif +} float4; + +#endif /* ! 
__OPENCL_VERSION__ */ + + +#ifndef __cplusplus + +#define GRL_NAMESPACE_BEGIN(x) +#define GRL_NAMESPACE_END(x) +#define GRL_OVERLOADABLE __attribute((overloadable)) +#define GRL_INLINE __attribute__((always_inline)) inline static + +# define enum_uint8(name) \ + typedef uint8_t name; \ + enum name##_uint32 +# define enum_uint16(name) \ + typedef uint16_t name; \ + enum name##_uint32 +# define enum_uint32(name) \ + typedef uint32_t name; \ + enum name##_uint32 + +#define OCL_BYTE_ALIGN(n) __attribute__ ((aligned (n))) +#define GRL_STATIC_ASSERT(condition,desc) + +#else /* C++ */ +#ifdef __OPENCL_VERSION__ +#error "OpenCL C++ not supported by this header" +#endif + +#define GRL_NAMESPACE_BEGIN(x) namespace x { +#define GRL_NAMESPACE_END(x) } +#define GRL_OVERLOADABLE +#define GRL_INLINE inline + +#define enum_uint8(N) enum N : uint8_t +#define enum_uint16(N) enum N : uint16_t +#define enum_uint32(N) enum N : uint32_t + +#define OCL_BYTE_ALIGN(n) +#define GRL_STATIC_ASSERT(condition,desc) static_assert( condition, desc ) + +#include + +inline float3 fmin(float3 a, float3 b) +{ + float3 o = { std::fmin(a.x, b.x), std::fmin(a.y, b.y), std::fmin(a.z, b.z) }; + return o; +} + +inline float3 fmax(float3 a, float3 b) +{ + float3 o = { std::fmax(a.x, b.x), std::fmax(a.y, b.y), std::fmax(a.z, b.z) }; + return o; +} + +inline float3 operator/(const float3& f3, const float& f) { return float3(f3.x / f, f3.y / f, f3.z / f); } + +inline float dot(const float3& a, const float3& b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +inline float as_float(uint32_t i) +{ + return *reinterpret_cast(&i); +} + +inline float3 as_float3(int3 i3) +{ + return *reinterpret_cast(&i3); +} + +inline float4 as_float4(int4 i4) +{ + return *reinterpret_cast(&i4); +} + +inline float4 convert_float4_rtn(int4 i4) +{ + return float4(static_cast(i4.x), static_cast(i4.y), static_cast(i4.z), static_cast(i4.w)); +} + +inline float4 convert_float4_rtp(int4 i4) +{ + return convert_float4_rtn(i4); +} + +#endif diff --git a/src/intel/vulkan/grl/include/GRLRTASCommon.h b/src/intel/vulkan/grl/include/GRLRTASCommon.h new file mode 100644 index 00000000000..1f2cda2ea0b --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLRTASCommon.h @@ -0,0 +1,142 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file is to contain structure definitions for RTAS-related meta-deta. +// The structures here should be generic enough to apply to any acceleration structure. +// If we ever move to KD-Trees or Octrees, this file should not need to change. +// + +//******************************************************************************************** +// WARNING!!!!! +// +// This file is shared by OpenCL and C++ source code and must be a pure C header +// There should only be C structure definitions and trivial inline functions here +// +//******************************************************************************************** + + +#pragma once +#include "GRLIntTypes.h" + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) + + typedef struct SerializationIdentifier + { + uint8_t Bytes[16]; + } SerializationIdentifier; + + GRL_STATIC_ASSERT(sizeof(SerializationIdentifier) == 16, "Wrong size!"); + + + // Header structure for RTAS serialization. + // This structure is binary-compatible with the DXR and Vulkan API definitions + typedef struct SerializationHeader + { + SerializationIdentifier DriverID; // DXR 'DriverOpaqueGUID'. 
Vulkan: 'driverUUID' + SerializationIdentifier GRLID; // DXR 'DriverOpaqueVersioningData'. Vulkan: 'accelerationStructureUUID' + + uint64_t SerializedSizeInBytesIncludingHeader; + uint64_t DeserializedSizeInBytes; + uint64_t InstanceHandleCount; + } SerializationHeader; + + GRL_STATIC_ASSERT(sizeof(SerializationHeader) == 56, "Wrong size!"); + + // This structure is binary-compatible with DXR and Vulkan 'InstanceDesc' structures + typedef struct InstanceDesc { + float Transform[3][4]; + uint32_t InstanceIDAndMask; // mask in 8 msbs + uint32_t InstanceContributionToHitGroupIndexAndFlags; // flags in 8 msbs + gpuva_t AccelerationStructureGPUVA; // NOTE: In GRL this is always a VA. Vulkan CPU builds use handles here, and these may need to be translated + } InstanceDesc; + GRL_STATIC_ASSERT(sizeof(InstanceDesc) == 64, "Wrong size!"); + + typedef struct GeoMetaData{ + uint32_t PrimitiveCount; + uint16_t Type; + uint16_t Flags; + } GeoMetaData; + GRL_STATIC_ASSERT(sizeof(GeoMetaData) == 8, "Wrong size!"); + + typedef struct AABB3f { + float lower[3]; + float upper[3]; + } AABB3f; + GRL_STATIC_ASSERT(sizeof(AABB3f) == 24, "Wrong size!"); + + enum_uint32(error_t_) { + error_t_no_error = 0x0, + error_t_internal_node_child_OOB = 0x1, + error_t_leaf_node_child_OOB = 0x2, + error_t_unrecognised_node_t = 0x4, + error_t_mixed_node_unsupported = 0x8, + error_t_instance_pointers_inconsistent = 0x10, + error_t_instance_pointed_root_not_internal = 0x20, + error_t_leaf_node_instance_child_missed_by_64B = 0x40, + error_t_internal_node_child_cycle = 0x80, + error_t_input_geo_insane = 0x100, + error_t_quad_leaf_broken = 0x200, + error_t_backpointer_not_reset = 0x400, + error_t_backpointer_wrong_children_num = 0x500, + error_t_backpointer_inconsitent_parent_child = 0x600, + error_t_backpointer_root_not_root_error = 0x700, + error_t_backpointer_OOB = 0x800, + error_t_backpointers_buffer_too_small = 0x900, + error_t_atomic_update_struct_fatleaf_count_oob = 0x1000, // for this and following: + error_t_atomic_update_struct_fatleaf_node_idx_oob = 0x2000, // offset_in_BVH is just index in fatleaf or inner node arrays + error_t_atomic_update_struct_fatleaf_backpointer_mismatch = 0x3000, + error_t_atomic_update_struct_fatleaf_num_children_error = 0x4000, + error_t_atomic_update_struct_fatleaf_children_non_leaf = 0x5000, + error_t_atomic_update_struct_inner_count_oob = 0x6000, + error_t_atomic_update_struct_inner_node_idx_oob = 0x7000, + error_t_atomic_update_struct_inner_node_child_idx_error = 0x8000, + error_t_atomic_update_struct_inner_num_children_error = 0x9000, + error_t_atomic_update_struct_inner_children_non_internal = 0xA000, + error_t_unknown = 1u << 31, + }; + + enum_uint32(error_phase_t) { + error_phase_t_unknown = 0, + error_phase_t_post_build_Morton = 1, + error_phase_t_post_build_Trivial = 2, + error_phase_t_post_build_NewSAH = 3, + error_phase_t_post_update = 4, + error_phase_t_pre_update = 5, + error_phase_t_post_copy_op = 6, + }; + + typedef struct ERROR_INFO { + error_t_ type; + uint offset_in_BVH; //in 64B units + error_phase_t when; + uint reserved; + } ERROR_INFO; + + // Meta-data common to all acceleration structures, which is needed to implement required functionality + // All RTAS structures must contain a struct of this type named 'Meta' + typedef struct RTASMetaData { + struct AABB3f bounds; + + uint32_t instanceDescsStart; // byte offset to array of original instance_descs used for build. 
Required for DXR visualization and serialization + uint32_t instanceCount; + + uint32_t geoDescsStart; // byte offset to array of 'GeoMetaData' matching input geos. Required for DXR visualization + uint32_t geoCount; + + uint64_t allocationSize; // Size of the memory allocation containing this RTAS + // This is the size given to the app in the prebuild info when the RTAS was first created + // If RTAS was compacted, this will be the compacted size + + ERROR_INFO errors; // only used in debug mode + } RTASMetaData; + + GRL_STATIC_ASSERT( sizeof(RTASMetaData) == 64, "Wrong size!"); + +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/include/GRLStructs.h b/src/intel/vulkan/grl/include/GRLStructs.h new file mode 100644 index 00000000000..c8af8313ffc --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLStructs.h @@ -0,0 +1,60 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "GRLIntTypes.h" + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(_INTERNAL) + + struct GeometryTriangles + { + gpuva_t pTransformBuffer; + gpuva_t pIndexBuffer; + gpuva_t pVertexBuffer; + qword VertexBufferByteStride; + dword IndexCount; + dword VertexCount; + IndexFormat IndexFormat; + VertexFormat VertexFormat; + }; + + struct GeometryProcedural + { + gpuva_t pAABBs_GPUVA; ///, <0,1,0>, <0,0,1> + float obx = Transform[0] * Transform[0] + Transform[4] * Transform[4] + Transform[8] * Transform[8]; + float oby = Transform[1] * Transform[1] + Transform[5] * Transform[5] + Transform[9] * Transform[9]; + float obz = Transform[2] * Transform[2] + Transform[6] * Transform[6] + Transform[10] * Transform[10]; + + float obb_sq_half_surf = obx * oby + oby * obz + obz * obx; + + return obb_sq_half_surf / aabb_sq_half_surf; + + // ex = 2.0 + // ey = 2.0 + // ez = 2.0 + // ex = 4.0 + // ey = 4.0 + // ez = 4.0 + // aabb_half_surf = 16+16 *2.0 + 2.0*2.0+ 2.0*2.0; = 12; + // aabb_sq_half_surf = 144; + // + // obx = 4.0; + // oby = 4.0; + // obz = 4.0; + // obb_sq_half_surf = 16 + 16+ 16; + // obb_sq_half_surf = 16.0 *3 = 48 +} + +GRL_INLINE void load_row_major_from_AffineSpace3f(struct AffineSpace3f in, float* out) +{ + out[0] = in.l.vx.x; + out[4] = in.l.vx.y; + out[8] = in.l.vx.z; + out[1] = in.l.vy.x; + out[5] = in.l.vy.y; + out[9] = in.l.vy.z; + out[2] = in.l.vz.x; + out[6] = in.l.vz.y; + out[10] = in.l.vz.z; + + out[3] = in.p.x; + out[7] = in.p.y; + out[11] = in.p.z; +} + +GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct AffineSpace3f xfm, float3 p) +{ + return xfmPoint(xfm.l, p) + xfm.p; +} + +/* compute inverse matrix */ +GRL_INLINE struct AffineSpace3f AffineSpace3f_invert(struct AffineSpace3f in) +{ + const struct LinearSpace3f il = LinearSpace3f_invert(in.l); + float3 ip = -xfmPoint(il, in.p); + return AffineSpace3f_Constructor(il, ip); +} + +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL)
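
The affine-transform helpers above pair naturally with the AABB3f type from GRLRTASCommon.h. As an illustration only (this helper is not part of the patch and the name AABB3f_transform_corners is hypothetical), here is a conservative box transform built solely from xfmPoint() and per-component min/max, in the style the headers already use:

GRL_INLINE struct AABB3f AABB3f_transform_corners( struct AffineSpace3f xfm, struct AABB3f box )
{
    struct AABB3f out;
    for (uint i = 0; i < 3; i++) { out.lower[i] = INFINITY; out.upper[i] = -INFINITY; }

    for (uint corner = 0; corner < 8; corner++)
    {
        // pick one of the 8 box corners, one bit per axis
        float3 p = { (corner & 1) ? box.upper[0] : box.lower[0],
                     (corner & 2) ? box.upper[1] : box.lower[1],
                     (corner & 4) ? box.upper[2] : box.lower[2] };

        float3 t = xfmPoint( xfm, p ); // linear part + translation, as defined above

        float tv[3] = { t.x, t.y, t.z };
        for (uint i = 0; i < 3; i++)
        {
            if (tv[i] < out.lower[i]) out.lower[i] = tv[i];
            if (tv[i] > out.upper[i]) out.upper[i] = tv[i];
        }
    }
    return out;
}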
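
The backpointer layout documented in BVHBase (bits 2:0 refit counter, bits 5:3 child count, bits 31:6 parent node index) ships only with decode accessors. A hypothetical encoder, shown purely to make the bit layout concrete (BackPointer_Pack is not a function in this patch):

// Pack a backpointer dword per the documented layout. The refit counter
// (bits 2:0) is MBZ at build time, so it is left at zero here.
GRL_INLINE uint32_t BackPointer_Pack( uint32_t parentIndex, uint32_t numChildren )
{
    return (parentIndex << 6) | (numChildren << 3);
}

// Round-trip against the accessors defined in GRLGen12.h:
//   uint32_t bp = BackPointer_Pack( 42, 6 );
//   BackPointer_GetParentIndex( bp ) == 42
//   BackPointer_GetNumChildren( bp ) == 6
//   BackPointer_GetRefitCount( bp )  == 0
// Packing with parentIndex == 0x03FFFFFF marks the root, which is exactly
// what BackPointer_IsRoot() tests for.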
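
Because child boxes are stored quantized against the parent node's grid, a debug or validation pass can recover a node's bound by unioning its dequantized children. The sketch below is an assumption about how such a pass might look (InternalNode_ComputeBounds is not part of the patch); it uses only InternalNode_IsChildValid() and InternalNode_GetChildAABB() as defined above:

GRL_INLINE struct AABB3f InternalNode_ComputeBounds( const InternalNode* node )
{
    struct AABB3f box;
    for (uint i = 0; i < 3; i++) { box.lower[i] = INFINITY; box.upper[i] = -INFINITY; }

    for (uint c = 0; c < 6; c++)
    {
        if (!InternalNode_IsChildValid( node, c ))
            continue; // skipped children are encoded with lower_msb=1, upper_msb=0

        struct AABB3f child = InternalNode_GetChildAABB( node, c );
        for (uint i = 0; i < 3; i++)
        {
            if (child.lower[i] < box.lower[i]) box.lower[i] = child.lower[i];
            if (child.upper[i] > box.upper[i]) box.upper[i] = child.upper[i];
        }
    }
    return box;
}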
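
QuadLeaf::DW1 carries the second-triangle description; the patch documents the bit layout but provides only getters. A hypothetical packer that is consistent with QuadLeaf_GetPrimIndexDelta() and QuadLeaf_GetSecondTriangleIndices() (QuadLeaf_PackDW1 is illustrative, not part of the patch):

// DW1 layout as documented above: primIndex1Delta in bits 15:0, the second
// triangle's vertex selectors j0/j1/j2 in bits 21:16, "last quad" flag in bit 22.
GRL_INLINE uint32_t QuadLeaf_PackDW1( uint32_t primIndex1Delta,
                                      uint32_t j0, uint32_t j1, uint32_t j2,
                                      bool last )
{
    return (primIndex1Delta & 0xffff)
         | ((j0 & 3) << 16)
         | ((j1 & 3) << 18)
         | ((j2 & 3) << 20)
         | ((last ? 1u : 0u) << 22);
}
// QuadLeaf_GetPrimIndexDelta() recovers bits 15:0, and
// QuadLeaf_GetSecondTriangleIndices() recovers the six j0/j1/j2 bits.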
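
Finally, the sub-group atomic merge helpers defined earlier in this patch are meant to be called uniformly by an entire sub-group: the helper reduces the six bound components across the sub-group and only the first lanes issue atomics. A minimal device-side sketch (OpenCL C); the kernel name, its arguments, and the assumption that sceneBounds is pre-initialized to an empty box are all hypothetical, not part of the patch:

__kernel void reduce_scene_bounds( global struct AABB3f* primBounds,
                                   uint numPrims,
                                   global struct AABB3f* sceneBounds )
{
    uint i = get_global_id( 0 );

    // Out-of-range work items contribute an empty box instead of returning
    // early, so the call below stays sub-group uniform.
    float3 lower = (float3)(  INFINITY,  INFINITY,  INFINITY );
    float3 upper = (float3)( -INFINITY, -INFINITY, -INFINITY );
    if (i < numPrims)
    {
        struct AABB3f b = primBounds[i];
        lower = AABB3f_load_lower( &b );
        upper = AABB3f_load_upper( &b );
    }

    // Six sub-group reductions, then only a few lanes touch global memory.
    AABB3f_atomic_merge_global_sub_group_lu( sceneBounds, lower, upper );
}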