anv: Implement encode shader to fit in ANV BVH
This shader constructs the ANV BVH from the IR BVH. More specifically,
each invocation takes care of one internal node. The internal nodes are
processed starting from the root node all the way down to the bottom
leaves. During processing, we keep track of where each internal node
should be encoded (tracked in vk_ir_box.bvh_offset) and where its leaves
should be encoded (tracked in vk_ir_header.dst_node_offset). The
processed BVH is in contiguous memory, which starts with the header,
followed by interleaved internal nodes and leaves. The node information
is also populated.

Rework: (Sagar)
- Return out-of-bounds threads early
- Mimic GRL internal node encoding
- Handle node mask
- Fix block_incr_and_start_prim
- Fix shader_index_and_geom_mask for instance node
- Fix instance flag
- Fix block_incr and instance_contribution_and_geom_flags to be
  zero-initialized
- Fix lower_x and upper_x to be properly flipped for invalid child
- For invalid node, clear blockIncr and set startPrim to INVALID
- Calculate things upfront and assign, cutting down more than ~200
  instructions

Co-authored-by: Kevin Chuang <kaiwenjon23@gmail.com>
Co-authored-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31588>
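A rough sketch of the resulting layout described above (offsets in 64B
blocks from output_bvh; not authoritative, see the shader below):

    accel struct base : anv_accel_struct_header, followed by the list
                        of instance leaf pointers
    output_bvh +0     : root anv_internal_node
    output_bvh +1...  : interleaved internal nodes and leaves, allocated
                        through vk_ir_header.dst_node_offset (instance
                        leaves take 2 blocks, everything else 1)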
This commit is contained in:
parent
692b5fa9f2
commit
2fe57947e3
1 changed file with 587 additions and 0 deletions
587
src/intel/vulkan/bvh/encode.comp
Normal file
@@ -0,0 +1,587 @@
/* Copyright © 2022 Friedrich Vock
 * Copyright © 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#version 460

#extension GL_GOOGLE_include_directive : require

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_EXT_shader_atomic_int64 : require

layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

#include "anv_build_helpers.h"
#include "anv_build_interface.h"

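/* 2^-23, i.e. one unit in the last place of a float32 mantissa in
 * [1, 2). Used below to pad boxes so that quantization and rounding
 * stay conservative.
 */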
#define ULP 1.1920928955078125e-7f

layout(push_constant) uniform CONSTS {
   encode_args args;
};

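/* Extract the 4 instance flag bits from the low byte of src. */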
uint64_t
get_instance_flag(uint32_t src)
{
   uint32_t flags = src & 0xff;
   return flags & 0xf;
}

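/* Encode one IR leaf node (triangle, AABB or instance) into its ANV
 * representation at dst_node, updating the aggregate instance flags
 * and instance count in the destination header along the way.
 */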
void
encode_leaf_node(uint32_t type, uint64_t src_node, uint64_t dst_node, REF(anv_accel_struct_header) dst_header)
{
   switch (type) {
   case vk_ir_node_triangle: {
      REF(anv_quad_leaf_node) quad_leaf = REF(anv_quad_leaf_node)(dst_node);

      vk_ir_triangle_node src = DEREF(REF(vk_ir_triangle_node)(src_node));
      uint32_t geometry_id_and_flags = src.geometry_id_and_flags & 0xffffff;

      /* sub-type (4 bits) encoded at bit index 24 */
      geometry_id_and_flags |= (ANV_SUB_TYPE_QUAD & 0xF) << 24;
      /* Disable opacity culling by default */
      geometry_id_and_flags |= (1 << 29);

      /* Disable the second triangle */
      uint32_t prim_index1_delta = 0;
      /* For now, blockIncr is always 1, so every quad leaf has its "last" bit set. */
      prim_index1_delta |= (1 << 22);

      DEREF(quad_leaf).prim_index1_delta = prim_index1_delta;

      if ((src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) {
         /* Geometry opaque (1 bit) is encoded at bit index 30 */
         geometry_id_and_flags |= (ANV_GEOMETRY_FLAG_OPAQUE << 30);
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE);
      } else {
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_OPAQUE);
      }

      DEREF(quad_leaf).prim_index0 = src.triangle_id;
      DEREF(quad_leaf).leaf_desc.geometry_id_and_flags = geometry_id_and_flags;

      /* shaderIndex is typically set to match geomIndex.
       * The geometry mask defaults to 0xFF.
       */
      DEREF(quad_leaf).leaf_desc.shader_index_and_geom_mask = 0xFF000000 | (geometry_id_and_flags & 0xffffff);

      /* Set up a single triangle */
      for (uint32_t i = 0; i < 3; i++) {
         for (uint32_t j = 0; j < 3; j++) {
            DEREF(quad_leaf).v[i][j] = src.coords[i][j];
         }
      }
      break;
   }
   case vk_ir_node_aabb: {
      REF(anv_procedural_leaf_node) aabb_leaf = REF(anv_procedural_leaf_node)(dst_node);

      vk_ir_aabb_node src = DEREF(REF(vk_ir_aabb_node)(src_node));
      uint32_t geometry_id_and_flags = src.geometry_id_and_flags & 0xffffff;

      /* sub-type (4 bits) encoded at bit index 24 */
      geometry_id_and_flags |= (ANV_SUB_TYPE_PROCEDURAL & 0xF) << 24;
      /* Disable opacity culling by default */
      geometry_id_and_flags |= (1 << 29);

      if ((src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) {
         geometry_id_and_flags |= (ANV_GEOMETRY_FLAG_OPAQUE << 30);
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE);
      } else {
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_OPAQUE);
      }

      DEREF(aabb_leaf).leaf_desc.geometry_id_and_flags = geometry_id_and_flags;

      /* shaderIndex is typically set to match geomIndex.
       * The geometry mask defaults to 0xFF.
       */
      DEREF(aabb_leaf).leaf_desc.shader_index_and_geom_mask = 0xFF000000 | (geometry_id_and_flags & 0xffffff);

      /* num primitives = 1 */
      uint32_t dw1 = 1;
      /* "last" has only 1 bit, and it is set. */
      dw1 |= (1 << 31);

      DEREF(aabb_leaf).DW1 = dw1;
      DEREF(aabb_leaf).primIndex[0] = src.primitive_id;
      break;
   }
   case vk_ir_node_instance: {
      vk_ir_instance_node src = DEREF(REF(vk_ir_instance_node)(src_node));

      REF(anv_instance_leaf) dst_instance = REF(anv_instance_leaf)(dst_node);

      REF(anv_accel_struct_header) blas_header = REF(anv_accel_struct_header)(src.base_ptr);
      uint64_t start_node_ptr = uint64_t(src.base_ptr) + DEREF(blas_header).rootNodeOffset;

      uint32_t sbt_offset_and_flags = src.sbt_offset_and_flags;

      uint32_t shader_index_and_geom_mask = 0;
      shader_index_and_geom_mask |= (src.custom_instance_and_mask & 0xff000000);
      DEREF(dst_instance).part0.shader_index_and_geom_mask = shader_index_and_geom_mask;

      uint32_t instance_contribution_and_geom_flags = 0;
      instance_contribution_and_geom_flags |= src.sbt_offset_and_flags & 0xffffff;
      instance_contribution_and_geom_flags |= (1 << 29);
      instance_contribution_and_geom_flags |=
         (get_instance_flag(src.sbt_offset_and_flags >> 24) == ANV_INSTANCE_FLAG_FORCE_OPAQUE ?
          ANV_GEOMETRY_FLAG_OPAQUE : 0) << 30;
      DEREF(dst_instance).part0.instance_contribution_and_geom_flags =
         instance_contribution_and_geom_flags;

      uint32_t instance_flags = DEREF(blas_header).instance_flags;
      if (((sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR |
                                           VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR)) != 0) {
         instance_flags &= ~(VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR |
                             VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR);
         instance_flags |= (sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR |
                                                           VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR);
      }

      DEREF(dst_instance).part0.start_node_ptr_and_inst_flags =
         start_node_ptr |
         (get_instance_flag(instance_flags | (src.sbt_offset_and_flags >> 24)) << 48);

      mat4 transform = mat4(src.otw_matrix);

      mat4 inv_transform = transpose(inverse(transpose(transform)));
      mat3x4 wto_matrix = mat3x4(inv_transform);
      mat3x4 otw_matrix = mat3x4(transform);

      /* Arrange the WTO transformation matrix in column-major order */
      DEREF(dst_instance).part0.world2obj_vx_x = wto_matrix[0][0];
      DEREF(dst_instance).part0.world2obj_vx_y = wto_matrix[1][0];
      DEREF(dst_instance).part0.world2obj_vx_z = wto_matrix[2][0];
      DEREF(dst_instance).part0.obj2world_p_x = otw_matrix[0][3];

      DEREF(dst_instance).part0.world2obj_vy_x = wto_matrix[0][1];
      DEREF(dst_instance).part0.world2obj_vy_y = wto_matrix[1][1];
      DEREF(dst_instance).part0.world2obj_vy_z = wto_matrix[2][1];
      DEREF(dst_instance).part0.obj2world_p_y = otw_matrix[1][3];

      DEREF(dst_instance).part0.world2obj_vz_x = wto_matrix[0][2];
      DEREF(dst_instance).part0.world2obj_vz_y = wto_matrix[1][2];
      DEREF(dst_instance).part0.world2obj_vz_z = wto_matrix[2][2];
      DEREF(dst_instance).part0.obj2world_p_z = otw_matrix[2][3];

      /* Arrange the OTW transformation matrix in column-major order */
      DEREF(dst_instance).part1.obj2world_vx_x = otw_matrix[0][0];
      DEREF(dst_instance).part1.obj2world_vx_y = otw_matrix[1][0];
      DEREF(dst_instance).part1.obj2world_vx_z = otw_matrix[2][0];
      DEREF(dst_instance).part1.world2obj_p_x = wto_matrix[0][3];

      DEREF(dst_instance).part1.obj2world_vy_x = otw_matrix[0][1];
      DEREF(dst_instance).part1.obj2world_vy_y = otw_matrix[1][1];
      DEREF(dst_instance).part1.obj2world_vy_z = otw_matrix[2][1];
      DEREF(dst_instance).part1.world2obj_p_y = wto_matrix[1][3];

      DEREF(dst_instance).part1.obj2world_vz_x = otw_matrix[0][2];
      DEREF(dst_instance).part1.obj2world_vz_y = otw_matrix[1][2];
      DEREF(dst_instance).part1.obj2world_vz_z = otw_matrix[2][2];
      DEREF(dst_instance).part1.world2obj_p_z = wto_matrix[2][3];

      DEREF(dst_instance).part1.bvh_ptr = src.base_ptr;
      DEREF(dst_instance).part1.instance_index = src.instance_id;
      DEREF(dst_instance).part1.instance_id = src.custom_instance_and_mask & 0xffffff;

      uint64_t instance_leaves_addr_base = args.output_bvh - args.output_bvh_offset + ANV_RT_BVH_HEADER_SIZE;
      uint64_t cnt = atomicAdd(DEREF(dst_header).instance_count, 1);
      DEREF(INDEX(uint64_t, instance_leaves_addr_base, cnt)) = dst_node;
      break;
   }
   }
}

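/* Pad an AABB by an ULP-scaled epsilon on each side so that later
 * floating-point rounding cannot shrink it below the true bounds.
 */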
vk_aabb
conservative_aabb(vk_aabb input_aabb)
{
   vk_aabb out_aabb;

   vec3 reduce_value = max(abs(input_aabb.min), abs(input_aabb.max));
   float err = ULP * max(reduce_value.x, max(reduce_value.y, reduce_value.z));

   out_aabb.min = input_aabb.min - vec3(err);
   out_aabb.max = input_aabb.max + vec3(err);

   return out_aabb;
}

void
aabb_extend(inout vk_aabb v1, vk_aabb v2)
{
   v1.min = min(v1.min, v2.min);
   v1.max = max(v1.max, v2.max);
}

vec3
aabb_size(vk_aabb input_aabb)
{
   return input_aabb.max - input_aabb.min;
}

/* Determine the node_type based on the types of the children.
 * If all children are leaves of the same type, this internal node is a
 * fat leaf; otherwise, it's a mixed node.
 */
uint8_t
determine_internal_node_type(uint32_t children[6], uint child_count)
{
   if (child_count == 0)
      return uint8_t(ANV_NODE_TYPE_INVALID);

   uint32_t type_of_first_child = ir_id_to_type(children[0]);
   for (uint32_t i = 1; i < child_count; ++i) {
      uint32_t type = ir_id_to_type(children[i]);
      if (type != type_of_first_child) {
         return uint8_t(ANV_NODE_TYPE_MIXED);
      }
   }

   /* All children have the same type. Now check what type that is. */
   switch (type_of_first_child) {
   case vk_ir_node_triangle:
      return uint8_t(ANV_NODE_TYPE_QUAD);
   case vk_ir_node_aabb:
      return uint8_t(ANV_NODE_TYPE_PROCEDURAL);
   case vk_ir_node_instance:
      return uint8_t(ANV_NODE_TYPE_INSTANCE);
   case vk_ir_node_internal:
      return uint8_t(ANV_NODE_TYPE_MIXED);
   default:
      return uint8_t(ANV_NODE_TYPE_INVALID);
   }
}

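/* Quantize child bounds to 8 bits per axis, relative to base with a
 * shared per-axis power-of-two scale: a quantized value q corresponds
 * to base + ldexp(float(q), exp - 8). Flooring the lower bounds and
 * ceiling the upper bounds keeps the quantized box conservative.
 */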
vk_aabb
quantize_bounds(vk_aabb aabb, vec3 base, i8vec3 exp)
{
   vk_aabb quant_aabb;
   vec3 lower = aabb.min - base;
   vec3 upper = aabb.max - base;

   vec3 qlower = ldexp(lower, -exp + 8);
   vec3 qupper = ldexp(upper, -exp + 8);

   qlower = min(max(floor(qlower), vec3(0.0)), vec3(255.0));
   qupper = min(max(ceil(qupper), vec3(0.0)), vec3(255.0));

   quant_aabb.min = qlower;
   quant_aabb.max = qupper;

   return quant_aabb;
}

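/* Encode one up-to-6-wide internal node at bvh_block_offset (in 64B
 * blocks from output_bvh): the relative child block offset, the
 * quantization base and per-axis exponents, and per-child quantized
 * bounds plus traversal metadata.
 */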
void
encode_internal_node(uint32_t children[6], uint32_t child_block_offset_from_internal_node, uint child_count,
                     vec3 min_offset, vec3 max_offset, uint32_t bvh_block_offset)
{
   REF(anv_internal_node) dst_node =
      REF(anv_internal_node)(OFFSET(args.output_bvh, ANV_RT_BLOCK_SIZE * bvh_block_offset));

   DEREF(dst_node).child_block_offset = child_block_offset_from_internal_node;

   vk_aabb box;
   box.min = min_offset;
   box.max = max_offset;

   vk_aabb conservative_child_aabb = conservative_aabb(box);
   DEREF(dst_node).lower[0] = conservative_child_aabb.min.x;
   DEREF(dst_node).lower[1] = conservative_child_aabb.min.y;
   DEREF(dst_node).lower[2] = conservative_child_aabb.min.z;

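   /* Pick the per-axis quantization exponents from the node's extent.
    * frexp() returns a mantissa in [0.5, 1); if it exceeds 255/256, bump
    * the exponent so the quantized upper bounds still fit in [0, 255].
    */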
   float up = 1.0 + ULP;
   ivec3 exp;

   vec3 len = aabb_size(conservative_child_aabb) * up;
   vec3 mant = frexp(len, exp);

   exp.x += int((mant.x > (255.0f / 256.0f)));
   exp.y += int((mant.y > (255.0f / 256.0f)));
   exp.z += int((mant.z > (255.0f / 256.0f)));

   i8vec3 exponent_i8 = i8vec3(exp);
   DEREF(dst_node).exp_x = max(int8_t(-128), exponent_i8.x);
   DEREF(dst_node).exp_y = max(int8_t(-128), exponent_i8.y);
   DEREF(dst_node).exp_z = max(int8_t(-128), exponent_i8.z);

   i8vec3 exp_i8 = i8vec3(DEREF(dst_node).exp_x, DEREF(dst_node).exp_y, DEREF(dst_node).exp_z);

   DEREF(dst_node).node_mask = uint8_t(0xff);
   DEREF(dst_node).node_type = determine_internal_node_type(children, child_count);

   for (uint32_t i = 0; i < 6; i++) {
      if (i < child_count) {
         uint32_t type = ir_id_to_type(children[i]);
         /* blockIncr and child_block_offset are how the HW finds children
          * during traversal. If they are not set properly, the GPU can hang.
          */
         DEREF(dst_node).data[i].block_incr_and_start_prim =
            type == vk_ir_node_instance ? uint8_t(2) : uint8_t(1);

         uint32_t offset = ir_id_to_offset(children[i]);

         vk_aabb child_aabb =
            DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;

         child_aabb = conservative_aabb(child_aabb);

         vk_aabb quantize_aabb = quantize_bounds(child_aabb, conservative_child_aabb.min, exp_i8);

         DEREF(dst_node).lower_x[i] = uint8_t(quantize_aabb.min.x);
         DEREF(dst_node).lower_y[i] = uint8_t(quantize_aabb.min.y);
         DEREF(dst_node).lower_z[i] = uint8_t(quantize_aabb.min.z);
         DEREF(dst_node).upper_x[i] = uint8_t(quantize_aabb.max.x);
         DEREF(dst_node).upper_y[i] = uint8_t(quantize_aabb.max.y);
         DEREF(dst_node).upper_z[i] = uint8_t(quantize_aabb.max.z);

         /* For a mixed node, encode the type of each child in startPrim in
          * the child data.
          */
         if (DEREF(dst_node).node_type == uint8_t(ANV_NODE_TYPE_MIXED)) {
            uint32_t type = ir_id_to_type(children[i]);
            switch (type) {
            case vk_ir_node_triangle:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_QUAD) << 2);
               break;
            case vk_ir_node_aabb:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_PROCEDURAL) << 2);
               break;
            case vk_ir_node_instance:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_INSTANCE) << 2);
               break;
            case vk_ir_node_internal:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_MIXED) << 2);
               break;
            }
         }
      } else {
         /* Invalid child nodes: the MSBs of the lower and upper x planes are
          * flipped. In other words:
          * bool valid(int i) const {
          *    return !(lower_x[i] & 0x80) || (upper_x[i] & 0x80);
          * }
          */
         DEREF(dst_node).lower_x[i] = uint8_t(0x80);
         DEREF(dst_node).lower_y[i] = uint8_t(0);
         DEREF(dst_node).lower_z[i] = uint8_t(0);
         DEREF(dst_node).upper_x[i] = uint8_t(0);
         DEREF(dst_node).upper_y[i] = uint8_t(0);
         DEREF(dst_node).upper_z[i] = uint8_t(0);

         /* In case the HW also references blockIncr, zero out the data. */
         DEREF(dst_node).data[i].block_incr_and_start_prim = uint8_t(0);
         DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_INVALID) << 2);
      }
   }
}

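/* One invocation per IR internal node, processed top-down: spin until
 * the parent has published this node's destination block, collapse the
 * binary node into up to 6 children, reserve destination blocks for the
 * children with a single atomicAdd, encode leaf children in place, and
 * finally encode this internal node itself.
 */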
void
main()
{
   /* encode.comp is dispatched indirectly with a computed groupCountX, but
    * we can still overdispatch invocations, so we need a guard here.
    *
    * Also, we can't support more than 0xFFFFFFFF internal nodes due to the
    * SW limit we enforce on the indirect workgroup count for signaling.
    */
   if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count ||
       DEREF(args.header).ir_internal_node_count > 0xFFFFFFFF)
      return;

   /* Each lane processes one vk_ir_node_internal. The root node sits at the
    * end of the IR BVH, and the lane with gl_GlobalInvocationID.x == 0 takes
    * care of it.
    */
   uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x;

   uint32_t intermediate_leaf_node_size;
   switch (args.geometry_type) {
   case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
      intermediate_leaf_node_size = SIZEOF(vk_ir_triangle_node);
      break;
   case VK_GEOMETRY_TYPE_AABBS_KHR:
      intermediate_leaf_node_size = SIZEOF(vk_ir_aabb_node);
      break;
   default: /* instances */
      intermediate_leaf_node_size = SIZEOF(vk_ir_instance_node);
      break;
   }

   uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * intermediate_leaf_node_size;

   REF(vk_ir_box_node) intermediate_internal_nodes =
      REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size);
   REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id);
   vk_ir_box_node src = DEREF(src_node);

   bool is_root_node = gl_GlobalInvocationID.x == 0;

   REF(anv_accel_struct_header) header = REF(anv_accel_struct_header)(args.output_bvh - args.output_bvh_offset);

   if (is_root_node) {
      DEREF(header).instance_flags =
         (args.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR ? ANV_INSTANCE_ALL_AABB : 0) |
         /* These will be removed when processing leaf nodes */
         ANV_INSTANCE_FLAG_FORCE_OPAQUE | ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE;

      /* Indicate where the next children should be encoded. The offset is
       * measured in 64B blocks, starting from output_bvh.
       */
      DEREF(args.header).dst_node_offset = 1;

      DEREF(header).instance_count = 0;
   }

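   /* This loop is a spin-wait: the body runs to completion at most once.
    * Nodes that were collapsed into an ancestor observe VK_NULL_BVH_OFFSET
    * and exit without encoding anything.
    */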
   for (;;) {
      /* Make changes to the current node's BVH offset value visible. */
      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

      /* Indicate where this internal node should be encoded. The offset is
       * measured in 64B blocks, starting from output_bvh.
       */
      uint32_t bvh_block_offset = is_root_node ? 0 : DEREF(src_node).bvh_offset;

      /* Keep spinning: the parent hasn't published this node's bvh_offset yet. */
      if (bvh_block_offset == VK_UNKNOWN_BVH_OFFSET)
         continue;

      if (bvh_block_offset == VK_NULL_BVH_OFFSET)
         break;

      uint32_t found_child_count = 0;
      uint32_t children[6] = {VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE,
                              VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE,
                              VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE};

      /* Initially, this node has at most two children (internal nodes or leaves). */
      for (uint32_t i = 0; i < 2; ++i)
         if (src.children[i] != VK_BVH_INVALID_NODE)
            children[found_child_count++] = src.children[i];

      /* Try to collapse this node's binary children into up to 6 children. */
      while (found_child_count < 6) {
         /* In each iteration, find the vk_ir_node_internal child with the
          * largest surface area.
          */
         int32_t collapsed_child_index = -1;
         float largest_surface_area = -INFINITY;

         for (int32_t i = 0; i < found_child_count; ++i) {
            /* If a child is a leaf (not vk_ir_node_internal), there's no need
             * to collapse it.
             */
            if (ir_id_to_type(children[i]) != vk_ir_node_internal)
               continue;

            vk_aabb bounds =
               DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh,
                                           ir_id_to_offset(children[i]))).aabb;

            float surface_area = aabb_surface_area(bounds);
            if (surface_area > largest_surface_area) {
               largest_surface_area = surface_area;
               collapsed_child_index = i;
            }
         }

         if (collapsed_child_index != -1) {
            /* Once a suitable vk_ir_node_internal child is found, connect this
             * node to that child's children, i.e. its grandchildren.
             * Grandchildren can be internal nodes or leaves.
             */
            REF(vk_ir_box_node) child_node =
               REF(vk_ir_box_node)OFFSET(args.intermediate_bvh,
                                         ir_id_to_offset(children[collapsed_child_index]));
            uint32_t grandchildren[2] = DEREF(child_node).children;
            uint32_t valid_grandchild_count = 0;

            if (grandchildren[1] != VK_BVH_INVALID_NODE)
               ++valid_grandchild_count;

            if (grandchildren[0] != VK_BVH_INVALID_NODE)
               ++valid_grandchild_count;
            else
               grandchildren[0] = grandchildren[1];

            /* A grandchild now becomes a direct child, and can possibly be
             * collapsed in a later iteration if found_child_count hasn't
             * reached 6 yet.
             */
            if (valid_grandchild_count > 1)
               children[found_child_count++] = grandchildren[1];

            if (valid_grandchild_count > 0)
               children[collapsed_child_index] = grandchildren[0];
            else {
               /* The collapsed child has no valid children, so stop treating
                * it as a child. This is possible depending on how and when
                * the LBVH/PLOC algorithm marks a node as VK_BVH_INVALID_NODE.
                */
               found_child_count--;
               children[collapsed_child_index] = children[found_child_count];
            }

            /* Collapsing is done; mark the collapsed internal node as NULL so
             * whichever lane would have processed it returns instead.
             */
            DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
         } else
            break;
      }

      /* Count the blocks needed by the children. Each instance child found
       * contributes 2 blocks to dst_node_offset, every other child 1.
       */
      uint32_t num_blocks_to_add = 0;
      for (uint32_t i = 0; i < found_child_count; ++i) {
         uint32_t type = ir_id_to_type(children[i]);
         num_blocks_to_add += (type == vk_ir_node_instance) ? 2 : 1;
      }

      /* Used to find where to encode the children. This also bumps
       * dst_node_offset so other invocations know where to start encoding.
       */
      uint32_t child_block_offset_from_output_bvh = atomicAdd(DEREF(args.header).dst_node_offset, num_blocks_to_add);

      /* This is one of the fields needed by anv_internal_node. */
      uint32_t child_block_offset_from_internal_node = child_block_offset_from_output_bvh - bvh_block_offset;

      vec3 min_offset = vec3(INFINITY);
      vec3 max_offset = vec3(-INFINITY);
      for (uint32_t i = 0; i < found_child_count; ++i) {
         /* Retrieve the type and location of the child from the IR BVH */
         uint32_t type = ir_id_to_type(children[i]);
         uint32_t offset = ir_id_to_offset(children[i]);

         if (type == vk_ir_node_internal) {
            REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset);
            DEREF(child_node).bvh_offset = child_block_offset_from_output_bvh;
         } else {
            encode_leaf_node(type, args.intermediate_bvh + offset,
                             args.output_bvh + ANV_RT_BLOCK_SIZE * child_block_offset_from_output_bvh,
                             header);
         }

         vk_aabb child_aabb =
            DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;

         min_offset = min(min_offset, child_aabb.min);
         max_offset = max(max_offset, child_aabb.max);

         child_block_offset_from_output_bvh += (type == vk_ir_node_instance) ? 2 : 1;
      }

      /* Make changes to the children's BVH offset values available to the
       * other invocations.
       */
      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

      encode_internal_node(children, child_block_offset_from_internal_node,
                           found_child_count, min_offset, max_offset, bvh_block_offset);

      break;
   }

   if (is_root_node) {
      DEREF(header).aabb = src.base.aabb;
      DEREF(header).rootNodeOffset = args.output_bvh_offset;
   }
}