diff --git a/src/intel/vulkan/bvh/encode.comp b/src/intel/vulkan/bvh/encode.comp index 8f7e6b873d8..b78f49a223b 100644 --- a/src/intel/vulkan/bvh/encode.comp +++ b/src/intel/vulkan/bvh/encode.comp @@ -11,11 +11,36 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; #include "anv_build_interface.h" #define ULP 1.1920928955078125e-7f +#define READY_TO_WRITE(offset) ((offset) < VK_NULL_BVH_OFFSET) +#define ASSIGNED_NODE_TO_ENCODE (gl_GlobalInvocationID.x < DEREF(args.header).ir_internal_node_count) + +/* Debugging helper: disable encoding by exiting early. Ensure compiler doesn't + * dead code eliminate by comparing to value that should never evaluate as true. + */ +#define DEBUG_DISABLE_WRITE 0 +#define DEBUG_EXIT_EARLY(val) (DEBUG_DISABLE_WRITE == 1) && ((val) != 123456) + +/* IR_NODE refers to memory that holds IR NODEs which are to be encoded. */ +#define IR_NODE uint32_t +#define NODE_OFFSET(node) (OFFSET(args.intermediate_bvh, ir_id_to_offset(node))) + +/* An offset in 64B blocks from args.output_bvh that points to output of + * encoded nodes. Can be a leaf or internal node. + */ +#define BLOCK uint32_t +#define BLOCK_OFFSET(block) (OFFSET(args.output_bvh, ANV_RT_BLOCK_SIZE * block)) layout(push_constant) uniform CONSTS { encode_args args; }; +void +debug_dump(uint32_t offset, uint32_t value) +{ + REF(uint32_t) msg = REF(uint32_t)(OFFSET(args.output_bvh, offset)); + DEREF(msg) = value; +} + uint32_t get_instance_flag(uint32_t src) { @@ -23,9 +48,34 @@ get_instance_flag(uint32_t src) return flags & 0xf; } +struct anv_cluster { + /* simd lane inside cluster: 0 .. 7 */ + uint32_t idx; + + /* ID of cluster: 0 .. globalInvocations.x/8-1 */ + uint32_t cluster_id; + + /* size = 8 */ + uint32_t size; +}; + +/* cluster_size has to be a power of two and <32. */ +void +anv_cluster_init(out anv_cluster cluster, uint32_t size) +{ + cluster.idx = gl_SubgroupInvocationID & (size - 1); + cluster.cluster_id = gl_SubgroupInvocationID / size; + cluster.size = size; +} + +#define anv_shuffle(cluster, cluster_idx, value) \ + subgroupShuffle(value, (gl_SubgroupInvocationID & (~(cluster.size - 1))) + cluster_idx) + void encode_leaf_node(uint32_t type, uint64_t src_node, uint64_t dst_node, REF(anv_accel_struct_header) dst_header) { + if (DEBUG_EXIT_EARLY(type)) + return; switch (type) { case vk_ir_node_triangle: { REF(anv_quad_leaf_node) quad_leaf = REF(anv_quad_leaf_node)(dst_node); @@ -201,9 +251,7 @@ encode_leaf_node(uint32_t type, uint64_t src_node, uint64_t dst_node, REF(anv_ac DEREF(dst_instance).part1.instance_index = src.instance_id; DEREF(dst_instance).part1.instance_id = src.custom_instance_and_mask & 0xffffff; - uint64_t instance_leaves_addr_base = args.output_bvh - args.output_bvh_offset + ANV_RT_BVH_HEADER_SIZE; uint64_t cnt = atomicAdd(DEREF(dst_header).instance_count, 1); - DEREF(INDEX(uint64_t, instance_leaves_addr_base, cnt)) = dst_node; break; } } @@ -241,21 +289,20 @@ aabb_size(vk_aabb input_aabb) * Otherwise, it's a mixed node. 
*/ uint8_t -determine_internal_node_type(uint32_t children[6], uint child_count) +determine_internal_node_type(anv_cluster cluster, uint32_t child, uint child_count) { if (child_count == 0) return uint8_t(ANV_NODE_TYPE_INVALID); - uint32_t type_of_first_child = ir_id_to_type(children[0]); - for (uint32_t i = 1; i < child_count; ++i) { - uint32_t type = ir_id_to_type(children[i]); - if(type != type_of_first_child){ - return uint8_t(ANV_NODE_TYPE_MIXED); - } - } + uint32_t type = ir_id_to_type(child); + uint32_t first_type_of_child = subgroupClusteredMin(type, 8); + uint32_t second_type_of_child = subgroupClusteredMax(type, 8); + + if (first_type_of_child != second_type_of_child) + return uint8_t(ANV_NODE_TYPE_MIXED); /* All children have same type. Now check what type they are. */ - switch (type_of_first_child){ + switch (first_type_of_child){ case vk_ir_node_triangle: return uint8_t(ANV_NODE_TYPE_QUAD); case vk_ir_node_aabb: @@ -289,22 +336,20 @@ quantize_bounds(vk_aabb aabb, vec3 base, i8vec3 exp) } void -encode_internal_node(uint32_t children[6], uint32_t child_block_offset_from_internal_node, uint child_count, - vec3 min_offset, vec3 max_offset, uint32_t bvh_block_offset) +encode_internal_node(uint32_t child, uint32_t child_block_offset_from_internal_node, + uint child_count, vk_aabb child_aabb, uint32_t bvh_block_offset, + anv_cluster cluster) { + if (DEBUG_EXIT_EARLY(child_count)) + return; REF(anv_internal_node) dst_node = REF(anv_internal_node)(OFFSET(args.output_bvh, ANV_RT_BLOCK_SIZE * bvh_block_offset)); - DEREF(dst_node).child_block_offset = child_block_offset_from_internal_node; - vk_aabb box; - box.min = min_offset; - box.max = max_offset; + box.min = subgroupClusteredMin(child_aabb.min, 8); + box.max = subgroupClusteredMax(child_aabb.max, 8); vk_aabb conservative_child_aabb = conservative_aabb(box); - DEREF(dst_node).lower[0] = conservative_child_aabb.min.x; - DEREF(dst_node).lower[1] = conservative_child_aabb.min.y; - DEREF(dst_node).lower[2] = conservative_child_aabb.min.z; float up = 1.0 + ULP; ivec3 exp; @@ -317,59 +362,63 @@ encode_internal_node(uint32_t children[6], uint32_t child_block_offset_from_inte exp.z += int((mant.z > (255.0f / 256.0f))); i8vec3 exponent_i8 = i8vec3(exp); - DEREF(dst_node).exp_x = max(int8_t(-128), exponent_i8.x); - DEREF(dst_node).exp_y = max(int8_t(-128), exponent_i8.y); - DEREF(dst_node).exp_z = max(int8_t(-128), exponent_i8.z); + i8vec3 exp_i8 = {max(int8_t(-128), exponent_i8.x), + max(int8_t(-128), exponent_i8.y), + max(int8_t(-128), exponent_i8.z)}; - i8vec3 exp_i8 = i8vec3(DEREF(dst_node).exp_x, DEREF(dst_node).exp_y, DEREF(dst_node).exp_z); + uint8_t node_type = determine_internal_node_type(cluster, child, child_count); - DEREF(dst_node).node_mask = uint8_t(0xff); - DEREF(dst_node).node_type = determine_internal_node_type(children, child_count); + if (cluster.idx == 0) { + DEREF(dst_node).child_block_offset = child_block_offset_from_internal_node; + DEREF(dst_node).lower[0] = conservative_child_aabb.min.x; + DEREF(dst_node).lower[1] = conservative_child_aabb.min.y; + DEREF(dst_node).lower[2] = conservative_child_aabb.min.z; + DEREF(dst_node).exp_x = exp_i8[0]; + DEREF(dst_node).exp_y = exp_i8[1]; + DEREF(dst_node).exp_z = exp_i8[2]; + DEREF(dst_node).node_mask = uint8_t(0xff); + DEREF(dst_node).node_type = node_type; + } - for (uint32_t i = 0; i < 6; i++) { - if (i < child_count) { - uint32_t type = ir_id_to_type(children[i]); - /* blockIncr and child_block_offset are how HW used to find children during traversal. 
- * If not set properly, gpu could hang. - */ - DEREF(dst_node).data[i].block_incr_and_start_prim = - type == vk_ir_node_instance ? uint8_t(2) : uint8_t(1); + uint32_t type = ir_id_to_type(child); + /* blockIncr and child_block_offset are how HW used to find children during traversal. + * If not set properly, gpu could hang. + */ + DEREF(dst_node).data[cluster.idx].block_incr_and_start_prim = + type == vk_ir_node_instance ? uint8_t(2) : uint8_t(1); - uint32_t offset = ir_id_to_offset(children[i]); + child_aabb = conservative_aabb(child_aabb); - vk_aabb child_aabb = - DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; + vk_aabb quantize_aabb = quantize_bounds(child_aabb, conservative_child_aabb.min, exp_i8); - child_aabb = conservative_aabb(child_aabb); + DEREF(dst_node).lower_x[cluster.idx] = uint8_t(quantize_aabb.min.x); + DEREF(dst_node).lower_y[cluster.idx] = uint8_t(quantize_aabb.min.y); + DEREF(dst_node).lower_z[cluster.idx] = uint8_t(quantize_aabb.min.z); + DEREF(dst_node).upper_x[cluster.idx] = uint8_t(quantize_aabb.max.x); + DEREF(dst_node).upper_y[cluster.idx] = uint8_t(quantize_aabb.max.y); + DEREF(dst_node).upper_z[cluster.idx] = uint8_t(quantize_aabb.max.z); - vk_aabb quantize_aabb = quantize_bounds(child_aabb, conservative_child_aabb.min, exp_i8); + /* for a mixed node, encode type of each children in startPrim in childdata */ + if (DEREF(dst_node).node_type == uint8_t(ANV_NODE_TYPE_MIXED)){ + uint32_t type = ir_id_to_type(child); + switch (type){ + case vk_ir_node_triangle: + DEREF(dst_node).data[cluster.idx].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_QUAD) << 2); + break; + case vk_ir_node_aabb: + DEREF(dst_node).data[cluster.idx].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_PROCEDURAL) << 2); + break; + case vk_ir_node_instance: + DEREF(dst_node).data[cluster.idx].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_INSTANCE) << 2); + break; + case vk_ir_node_internal: + DEREF(dst_node).data[cluster.idx].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_MIXED) << 2); + break; + } + } - DEREF(dst_node).lower_x[i] = uint8_t(quantize_aabb.min.x); - DEREF(dst_node).lower_y[i] = uint8_t(quantize_aabb.min.y); - DEREF(dst_node).lower_z[i] = uint8_t(quantize_aabb.min.z); - DEREF(dst_node).upper_x[i] = uint8_t(quantize_aabb.max.x); - DEREF(dst_node).upper_y[i] = uint8_t(quantize_aabb.max.y); - DEREF(dst_node).upper_z[i] = uint8_t(quantize_aabb.max.z); - - /* for a mixed node, encode type of each children in startPrim in childdata */ - if (DEREF(dst_node).node_type == uint8_t(ANV_NODE_TYPE_MIXED)){ - uint32_t type = ir_id_to_type(children[i]); - switch (type){ - case vk_ir_node_triangle: - DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_QUAD) << 2); - break; - case vk_ir_node_aabb: - DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_PROCEDURAL) << 2); - break; - case vk_ir_node_instance: - DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_INSTANCE) << 2); - break; - case vk_ir_node_internal: - DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_MIXED) << 2); - break; - } - } - } else { + if (cluster.idx == 0) { + for (uint32_t i = child_count; i < 6; i++) { /* Invalid Child Nodes: For invalid child nodes, the MSBs of lower and upper * x planes are flipped. 
In other words: * bool valid(int i) const { @@ -390,23 +439,115 @@ encode_internal_node(uint32_t children[6], uint32_t child_block_offset_from_inte } } +/* Collapse nodes until reaching 6 children, which typically can be + * 5 internal nodes, or run out of nodes to collapse which often + * happens at tips of tree. Tree is collapsed in direction of + * largest surface areas, resulting in a quality bvh tree. + * + * Early find_children phase defers node collapse as it's a + * speculative phase. Nodes are collapsed only if parent node + * is found to be real (not collapsed node.) + */ +uint32_t +find_children(vk_ir_box_node src, inout uint32_t children[6], + inout uint32_t collapsed_nodes[6], + out uint32_t collapsed_child_count, bool defer_collapse) +{ + uint32_t found_child_count = 0; + collapsed_child_count = 0; + + /* Initial node can have at most two children */ + for (uint32_t i = 0; i < 2; ++i) + if (src.children[i] != VK_BVH_INVALID_NODE) + children[found_child_count++] = src.children[i]; + + /* For this node, try to collapse binary to 6-ary children */ + while (found_child_count < 6) { + /* find vk_ir_node_internal children with largest surface areas */ + int32_t collapsed_child_index = -1; + float largest_surface_area = -INFINITY; + + for (int32_t i = 0; i < found_child_count; ++i) { + /* Only collapse internal nodes, not leaf nodes. */ + if (ir_id_to_type(children[i]) != vk_ir_node_internal) + continue; + + vk_aabb bounds = DEREF(REF(vk_ir_node)NODE_OFFSET(children[i])).aabb; + + float surface_area = aabb_surface_area(bounds); + if (surface_area > largest_surface_area) { + largest_surface_area = surface_area; + collapsed_child_index = i; + } + } + + if (collapsed_child_index != -1) { + /* If deferred, save nodes to collapse later */ + if (defer_collapse && collapsed_child_count < 6) + collapsed_nodes[collapsed_child_count] = + ir_id_to_offset(children[collapsed_child_index]); + collapsed_child_count++; + + /* Once I found a good vk_ir_node_internal child, try to connect myself + * to this child's children, i.e. my grandchildren. Grandchildren can be + * internal nodes or leaves. + */ + REF(vk_ir_box_node) child_node = + REF(vk_ir_box_node)NODE_OFFSET(children[collapsed_child_index]); + IR_NODE grandchildren[2] = DEREF(child_node).children; + uint32_t valid_grandchild_count = 0; + + if (grandchildren[1] != VK_BVH_INVALID_NODE) + ++valid_grandchild_count; + + if (grandchildren[0] != VK_BVH_INVALID_NODE) + ++valid_grandchild_count; + else + grandchildren[0] = grandchildren[1]; + + /* Grandchild now becomes my direct child, and can possibly be collapsed + * in the next iteration if found_child_count has not reached 6. + */ + if (valid_grandchild_count > 1) + children[found_child_count++] = grandchildren[1]; + + if (valid_grandchild_count > 0) + children[collapsed_child_index] = grandchildren[0]; + else { + /* This child doesn't have valid children, then I don't consider this + * child as my child anymore. This is possible depending on how and + * when lbvh/ploc algorithm marks a node as VK_BVH_INVALID_NODE. + */ + found_child_count--; + children[collapsed_child_index] = children[found_child_count]; + } + if (!defer_collapse) + DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; + } else + break; + } + return found_child_count; +} + void main() { - /* Encode.comp is dispatched through indirect dispatch with calculated groupCountX, - * but we can still overdispatch invocations, so we need a guard here. 
- * - * Also, we can't support more than 0xFFFFFFFF internal nodes due to SW - * limit we enforce on indirect workgroup count for signaling. + /* Each lane will process one vk_ir_node_internal. The root node is sitting + * at the end of the IR BVH, and we let the lane with + * gl_GlobalInvocationID.x == 0 to take care of it. To improve performance, + * we remap globalID to reduce chances that the same HW thread will + * need to handle it's immediate children too, reducing latency. This hashing + * algorithm spreads handling of a node's children to other threads. */ - if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count || - DEREF(args.header).ir_internal_node_count > 0xFFFFFFFF) - return; + uint32_t global_id_hash = (gl_GlobalInvocationID.x < + (DEREF(args.header).ir_internal_node_count & ~0xFF)) + ? (gl_GlobalInvocationID.x & 0xFFFFFF00) | + ((gl_GlobalInvocationID.x & 0x0F) << 4) | + ((gl_GlobalInvocationID.x & 0xF0) >> 4) + : gl_GlobalInvocationID.x; + uint32_t global_id = + DEREF(args.header).ir_internal_node_count - 1 - global_id_hash; - /* Each lane will process one vk_ir_node_internal. The root node is sitting at the end - * of the IR BVH, and we let the lane with gl_GlobalInvocationID.x == 0 to take care of it. - */ - uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; uint32_t intermediate_leaf_node_size; switch (args.geometry_type) { @@ -421,16 +562,24 @@ main() break; } - uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * intermediate_leaf_node_size; + /* Each invocation cluster encodes one internal node. */ + anv_cluster cluster; + anv_cluster_init(cluster, 8); + + uint32_t intermediate_leaf_nodes_size = + args.leaf_node_count * intermediate_leaf_node_size; REF(vk_ir_box_node) intermediate_internal_nodes = - REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); - REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id); + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, + intermediate_leaf_nodes_size); + REF(vk_ir_box_node) src_node = + INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id); vk_ir_box_node src = DEREF(src_node); bool is_root_node = gl_GlobalInvocationID.x == 0; - REF(anv_accel_struct_header) header = REF(anv_accel_struct_header)(args.output_bvh - args.output_bvh_offset); + REF(anv_accel_struct_header) header = + REF(anv_accel_struct_header)(args.output_bvh - args.output_bvh_offset); if (is_root_node) { DEREF(header).instance_flags = @@ -438,149 +587,154 @@ main() /* These will be removed when processing leaf nodes */ ANV_INSTANCE_FLAG_FORCE_OPAQUE | ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE; - /* Indicate where the next children should be encoded. Offset measured in number of 64B blocks and started from output_bvh */ + /* Tracks BLOCK where the next children should be encoded. */ DEREF(args.header).dst_node_offset = 1; - DEREF(header).instance_count = 0; } - for (;;) { - /* Make changes to the current node's BVH offset value visible. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + IR_NODE children[6] = {VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, + VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, + VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE}; + uint32_t collapsed_nodes[6]; + uint32_t collapsed_child_count; + uint32_t found_child_count; + uint32_t num_blocks_to_add; - /* Indicate where this internal node should be encoded. 
Offset measured in number of 64B blocks and started from output_bvh.*/ - uint32_t bvh_block_offset = is_root_node ? 0 : DEREF(src_node).bvh_offset; + /* Every simd lane is assigned an IR BVH internal node to encode. Since + * we are collapsing a binary tree into a hex tree, most simd lanes will + * never need to encode. + * + * To increase performance, have all IR BVH speculatively calculate which + * nodes they would collapse. Most of this work will be thrown away since + * over half the IR internal nodes never get written, but reduces latency. + */ + if (ASSIGNED_NODE_TO_ENCODE) { + found_child_count = find_children(src, children, collapsed_nodes, + collapsed_child_count, true); - /* The invocation that processes this node is spinning, since its parent hasn't told it bvh_offset */ - if (bvh_block_offset == VK_UNKNOWN_BVH_OFFSET) - continue; - - if (bvh_block_offset == VK_NULL_BVH_OFFSET) - break; - - uint32_t found_child_count = 0; - uint32_t children[6] = {VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, - VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE, - VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE}; - - /* Initially, this node can have at most two children (can be internal nodes or leaves). */ - for (uint32_t i = 0; i < 2; ++i) - if (src.children[i] != VK_BVH_INVALID_NODE) - children[found_child_count++] = src.children[i]; - - /* For this node, try to collapse binary to 6-ary children */ - while (found_child_count < 6) { - /* For each iteration, find a vk_ir_node_internal child that has largest surface area */ - int32_t collapsed_child_index = -1; - float largest_surface_area = -INFINITY; - - for (int32_t i = 0; i < found_child_count; ++i) { - /* If a child is a leaf (not vk_ir_node_internal), there's no need to collapse it. */ - if (ir_id_to_type(children[i]) != vk_ir_node_internal) - continue; - - vk_aabb bounds = - DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, - ir_id_to_offset(children[i]))).aabb; - - float surface_area = aabb_surface_area(bounds); - if (surface_area > largest_surface_area) { - largest_surface_area = surface_area; - collapsed_child_index = i; - } - } - - if (collapsed_child_index != -1) { - /* Once I found a good vk_ir_node_internal child, try to connect myself - * to this child's children, i.e. my grandchildren. Grandchildren can be - * internal nodes or leaves. - */ - REF(vk_ir_box_node) child_node = - REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, - ir_id_to_offset(children[collapsed_child_index])); - uint32_t grandchildren[2] = DEREF(child_node).children; - uint32_t valid_grandchild_count = 0; - - if (grandchildren[1] != VK_BVH_INVALID_NODE) - ++valid_grandchild_count; - - if (grandchildren[0] != VK_BVH_INVALID_NODE) - ++valid_grandchild_count; - else - grandchildren[0] = grandchildren[1]; - - /* Grandchild now becomes my direct child, and can possibly be collapsed - * in the next iteration if found_child_count has not reached 6. - */ - if (valid_grandchild_count > 1) - children[found_child_count++] = grandchildren[1]; - - if (valid_grandchild_count > 0) - children[collapsed_child_index] = grandchildren[0]; - else { - /* This child doesn't have valid children, then I don't consider this - * child as my child anymore. This is possible depending on how and - * when lbvh/ploc algorithm marks a node as VK_BVH_INVALID_NODE. - */ - found_child_count--; - children[collapsed_child_index] = children[found_child_count]; - } - - /* Finish collapsing, now I can mark this collapsed internal node as NULL, - * so whichever lane that would have processed it will return. 
- */ - DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; - } else - break; - } - - /* Count the number of instance children found. For each one found, it contributes to 2 blocks to dst_node_offset */ - uint32_t num_blocks_to_add = 0; + /* Count the number of instance children found. For each one found, + * it contributes to 2 blocks to dst_node_offset + */ + num_blocks_to_add = 0; for (uint32_t i = 0; i < found_child_count; ++i) { uint32_t type = ir_id_to_type(children[i]); num_blocks_to_add += (type == vk_ir_node_instance) ? 2 : 1; } + } - /* Used for finding where to encode children. Also, update dst_node_offset so other invocations know where to start encoding */ - uint32_t child_block_offset_from_output_bvh = atomicAdd(DEREF(args.header).dst_node_offset, num_blocks_to_add); + BLOCK bvh_block_offset = (is_root_node) ? 0 : + (ASSIGNED_NODE_TO_ENCODE ? VK_UNKNOWN_BVH_OFFSET + : VK_NULL_BVH_OFFSET); - /* This is one of the needed information in anv_internal_node */ - uint32_t child_block_offset_from_internal_node = child_block_offset_from_output_bvh - bvh_block_offset; + /* For all but the root internal node, nodes wait until their parent node + * informs them whether they are a valid child (valid bvh offset written) + * or were collapsed (VK_NULL_BVH_OFFSET written) and have no work to do. + */ + for (;;) { + /* Make changes to the current node's BVH offset value visible. */ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); - vec3 min_offset = vec3(INFINITY); - vec3 max_offset = vec3(-INFINITY); - for (uint32_t i = 0; i < found_child_count; ++i) { - /* Retrieve type and location of the child from IR BVH */ - uint32_t type = ir_id_to_type(children[i]); - uint32_t offset = ir_id_to_offset(children[i]); + /* Indicate where this internal node should be encoded. Offset measured + * in number of 64B blocks and started from output_bvh. + */ + if (bvh_block_offset == VK_UNKNOWN_BVH_OFFSET) + bvh_block_offset = DEREF(src_node).bvh_offset; - if (type == vk_ir_node_internal) { - REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset); - DEREF(child_node).bvh_offset = child_block_offset_from_output_bvh; - } else { - encode_leaf_node(type, args.intermediate_bvh + offset, - args.output_bvh + ANV_RT_BLOCK_SIZE * child_block_offset_from_output_bvh, - header); + /* The invocation that processes this node is spinning, since its parent + * hasn't told it bvh_offset + */ + BLOCK first_child_block; + if (READY_TO_WRITE(bvh_block_offset)) { + /* Used for finding where to encode children. Also, update dst_node_offset + * so other invocations know where to start encoding + */ + first_child_block = + atomicAdd(DEREF(args.header).dst_node_offset, num_blocks_to_add); + + /* Yes, we are potentially calling find_children again here. This is to + * handle an edge case where some bvh trees have nodes with only a single + * child. This can potentially lead to lots of nodes needing to be + * collapsed, overflowing the 6-element buffer allocated. To handle these + * rare cases we find the children again, immediately collapsing them as + * we find them. 
+ */ + if (collapsed_child_count > 6) + find_children(src, children, collapsed_nodes, collapsed_child_count, false); + + BLOCK child_offset = first_child_block; + for (uint32_t i = 0; i < found_child_count; ++i) { + /* Retrieve type and location of the child from IR BVH */ + uint32_t type = ir_id_to_type(children[i]); + + if (type == vk_ir_node_internal) { + REF(vk_ir_box_node) child_node = + REF(vk_ir_box_node)NODE_OFFSET(children[i]); + DEREF(child_node).bvh_offset = child_offset; + } + + child_offset += (type == vk_ir_node_instance) ? 2 : 1; } - vk_aabb child_aabb = - DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; - - min_offset = min(min_offset, child_aabb.min); - max_offset = max(max_offset, child_aabb.max); - - child_block_offset_from_output_bvh += (type == vk_ir_node_instance) ? 2 : 1; + /* Mark this collapsed internal node as NULL, + * so whichever lane that would have processed it will return. + */ + for (uint32_t i = 0; i < collapsed_child_count; i++) { + REF(vk_ir_box_node) child_node = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, collapsed_nodes[i]); + DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; + } + /* Make changes to the children's BVH offset value available to child threads. */ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } - /* Make changes to the children's BVH offset value available to the other invocations. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + /* While all work prior was performed with each simd lane working on + * separate nodes, encoding is achieved with all simd lanes in cluster + * working together on same internal node. Walk through each ready node + * and encode in concert. + */ + while (READY_TO_WRITE(subgroupClusteredMin(bvh_block_offset, 8))) { + /* Select next ready simd lane to write */ + uint32_t idx = (READY_TO_WRITE(bvh_block_offset)) ? cluster.idx : -1; + idx = subgroupClusteredMin(idx, 8); - encode_internal_node(children, child_block_offset_from_internal_node, - found_child_count, min_offset, max_offset, bvh_block_offset); - break; + /* Propagate src child and dest blocks of next simd lane to other lanes */ + IR_NODE child = VK_BVH_INVALID_NODE; + BLOCK child_block = anv_shuffle(cluster, idx, first_child_block); + BLOCK internal_node_block = anv_shuffle(cluster, idx, bvh_block_offset); + bvh_block_offset = (cluster.idx == idx) ? VK_NULL_BVH_OFFSET + : bvh_block_offset; + if (cluster.idx >= anv_shuffle(cluster, idx, found_child_count)) + continue; + for (uint32_t i = 0; ; i++) { + child = anv_shuffle(cluster, idx, children[i]); + if (i == cluster.idx) + break; + uint32_t type = ir_id_to_type(child); + child_block += (type == vk_ir_node_instance) ? 2 : 1; + } + + vk_aabb child_aabb = {vec3(INFINITY), vec3(-INFINITY)}; + if (child != VK_BVH_INVALID_NODE) + child_aabb = DEREF(REF(vk_ir_node)NODE_OFFSET(child)).aabb; + + uint32_t type = ir_id_to_type(child); + if (child != VK_BVH_INVALID_NODE && type != vk_ir_node_internal) + encode_leaf_node(type, NODE_OFFSET(child), + BLOCK_OFFSET(child_block), header); + + BLOCK child_block_offset = + anv_shuffle(cluster, 0, child_block) - internal_node_block; + encode_internal_node(child, child_block_offset, + anv_shuffle(cluster, idx, found_child_count), + child_aabb, internal_node_block, cluster); + } + + uint32_t is_done = (bvh_block_offset == VK_NULL_BVH_OFFSET) ? 
1 : 0; + if (subgroupClusteredAdd(is_done, 8) == 8) + break; } if (is_root_node) {
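
A few sketches follow of the arithmetic the rewritten encoder leans on; none of the helper functions below exist in the patch, and their names are purely illustrative.

The new anv_cluster/anv_shuffle pair splits the 32-wide subgroup into four clusters of 8 lanes that cooperate on a single internal node. The shuffle source lane is computed exactly as in the anv_shuffle macro: mask off the in-cluster bits to find the cluster's base lane, then add the requested in-cluster index.

uint32_t
anv_shuffle_src_lane(uint32_t invocation_id, uint32_t cluster_size, uint32_t cluster_idx)
{
   /* cluster_size must be a power of two, as required by anv_cluster_init().
    * e.g. invocation_id = 13, cluster_size = 8, cluster_idx = 3:
    *      cluster_base = 8, source lane = 11
    */
   uint32_t cluster_base = invocation_id & ~(cluster_size - 1u);
   return cluster_base + cluster_idx;
}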
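
determine_internal_node_type() no longer walks a children[6] array; each lane of the cluster holds one child's IR type and a pair of clustered reductions answers the "do all children share a type?" question. A minimal sketch of that test, relying on the same GL_KHR_shader_subgroup_clustered built-ins the shader already uses (helper name is illustrative):

bool
cluster_children_share_type(uint32_t child_type)
{
   /* The clustered min and max over the 8 lanes agree exactly when every
    * child has the same IR type; otherwise the node is ANV_NODE_TYPE_MIXED.
    */
   return subgroupClusteredMin(child_type, 8) == subgroupClusteredMax(child_type, 8);
}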
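
find_children() collapses the binary IR tree into up-to-6-wide nodes by repeatedly expanding the internal child with the largest surface area, biasing the collapse toward the subtrees that are most expensive to traverse. A simplified sketch of that selection step, assuming the child bounds have already been loaded into a local array (the real code reads them through NODE_OFFSET() on demand; the helper name is illustrative):

int32_t
pick_child_to_collapse(uint32_t child_ids[6], vk_aabb child_bounds[6], uint32_t child_count)
{
   int32_t best = -1;
   float best_area = -INFINITY;

   for (uint32_t i = 0; i < child_count; ++i) {
      /* Leaves are never collapsed, only vk_ir_node_internal children. */
      if (ir_id_to_type(child_ids[i]) != vk_ir_node_internal)
         continue;

      float area = aabb_surface_area(child_bounds[i]);
      if (area > best_area) {
         best_area = area;
         best = int32_t(i);
      }
   }
   return best;   /* -1 means nothing left to collapse */
}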
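
main() still hands the root node (the last IR internal node) to gl_GlobalInvocationID.x == 0 and still indexes the IR array with ir_internal_node_count - 1 - id, but the invocation ID is remapped first so that a parent and its immediate children are unlikely to land on the same HW thread and serialize the spin-wait. A sketch of the remap, equivalent to the inlined ternary in the patch (helper name is illustrative):

uint32_t
remap_global_id(uint32_t id, uint32_t ir_internal_node_count)
{
   /* IDs past the last full 256-wide window keep their original value so the
    * swapped result always stays inside the valid node range.
    */
   if (id >= (ir_internal_node_count & ~0xFFu))
      return id;

   /* Swap the two low nibbles: neighbouring IDs end up 16 apart. */
   return (id & 0xFFFFFF00u) |
          ((id & 0x0Fu) << 4) |
          ((id & 0xF0u) >> 4);
}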
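
Destination blocks are still handed out with a single atomicAdd on dst_node_offset per parent: an instance child occupies two 64B blocks, every other child one, both when sizing num_blocks_to_add and when walking to a particular child's block in the cooperative write loop. A sketch of that accounting (helper name and the child_types array are illustrative; the real loop shuffles the types across the cluster instead):

uint32_t
child_dst_block(uint32_t first_child_block, uint32_t child_types[6], uint32_t child_index)
{
   uint32_t block = first_child_block;
   for (uint32_t i = 0; i < child_index; ++i)
      block += (child_types[i] == vk_ir_node_instance) ? 2u : 1u;
   return block;
}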
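
Finally, the cooperative write loop picks the next node to encode by letting every lane whose bvh_block_offset is READY_TO_WRITE() advertise its in-cluster index and taking the clustered minimum; lanes with nothing to write advertise ~0u. The enclosing while() separately checks that at least one lane in the cluster still holds a READY_TO_WRITE() offset. A sketch of the selection (helper name is illustrative):

uint32_t
select_next_ready_lane(bool ready, uint32_t lane_in_cluster)
{
   uint32_t candidate = ready ? lane_in_cluster : 0xFFFFFFFFu;
   return subgroupClusteredMin(candidate, 8);
}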