mirror of https://gitlab.freedesktop.org/mesa/mesa.git
radv: Emit compressed primitive nodes on GFX12
radv: Emit compressed primitive nodes on GFX12

The normal encode pass writes batches to a section in build scratch memory. Those batches contain information about the internal node and the primitive nodes. The encoder is split in order to avoid the register pressure of the compressor and to maximize occupancy.

The compressor works in two passes because one pass cannot guarantee that every primitive node (except at most one) has at least two triangles. This guarantee is used to advertise a smaller acceleration structure size to the application.

During compression, every invocation processes at most two triangles. Groups of 8 invocations are used to support the maximum of 16 triangles per node that the hardware supports.

The first step of compression is loading the triangle(s). Shared vertices are deduplicated early to avoid doing it in the compression loop.

The compression loop tries to add triangles to a list of triangles until the computed node size needed for storing the triangles reaches the hardware node size. For this, each invocation first deduplicates vertices against the triangles that have already been picked. It then computes the node size of the picked triangles plus the candidate triangles of the current invocation. The candidate of the invocation that computed the smallest size is added to the list.

Because it may not be possible to fit every triangle into the same node, there can be multiple hardware nodes, which are written in parallel for optimal performance. If there are no nodes with only one triangle, all nodes are written. If there is such a node, compression of the batch is aborted and the index of the batch is written to build scratch memory.

The second compression pass repeats the steps above, but only for the aborted batches. Nodes with only one triangle can now be merged, and are. Because it cannot be determined during box node encode which triangles will be compressed together, the encoder also has to fix up the parent box node's child infos.

Reviewed-by: Natalie Vock <natalie.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36965>
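As an editorial illustration of the greedy selection described above, here is a minimal scalar sketch in C. It is not the driver's code: node_size_bits() is a hypothetical stand-in for the bit-exact size computation in the shader, the serial loop stands in for the one-candidate-per-invocation evaluation plus subgroup reduction, and the 1024-bit budget assumes a 128-byte hardware node.

/* Sketch only: greedily grow one primitive node until no candidate fits. */
#include <stdbool.h>
#include <stdint.h>

#define HW_NODE_BITS 1024u /* assumed 128-byte hardware node */

/* Hypothetical: bit size of a node holding `picked_count` pairs plus `candidate`. */
uint32_t node_size_bits(const uint32_t *picked, uint32_t picked_count, uint32_t candidate);

static uint32_t
pick_one_node(const uint32_t *pairs, bool *used, uint32_t pair_count, uint32_t *picked)
{
   uint32_t picked_count = 0;
   while (true) {
      uint32_t best = UINT32_MAX, best_bits = UINT32_MAX;
      for (uint32_t i = 0; i < pair_count; i++) { /* one invocation each on the GPU */
         if (used[i])
            continue;
         uint32_t bits = node_size_bits(picked, picked_count, pairs[i]);
         if (bits < best_bits) {
            best_bits = bits;
            best = i;
         }
      }
      /* Stop when no remaining candidate fits into the hardware node size. */
      if (best == UINT32_MAX || best_bits > HW_NODE_BITS)
         break;
      used[best] = true;
      picked[picked_count++] = pairs[best];
   }
   return picked_count;
}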
This commit is contained in:
parent c5f9fe5e3b
commit c18a7d0e2b

8 changed files with 949 additions and 18 deletions
@@ -21,6 +21,7 @@ TYPE(radv_gfx12_box_node, 4);
TYPE(radv_gfx12_instance_node, 8);
TYPE(radv_gfx12_instance_node_user_data, 4);
TYPE(radv_gfx12_primitive_node, 4);
+TYPE(radv_triangle_encode_task, 4);

uint32_t
id_to_offset(uint32_t id)

@@ -25,6 +25,8 @@
#define RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS (1u << (VK_BUILD_FLAG_COUNT + 4))
#define RADV_BUILD_FLAG_UPDATE_SINGLE_GEOMETRY (1u << (VK_BUILD_FLAG_COUNT + 5))
#define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6))
+#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 7))
+#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 8))

#define RADV_COPY_MODE_COPY 0
#define RADV_COPY_MODE_SERIALIZE 1

@@ -55,6 +57,15 @@ struct encode_gfx12_args {
   uint32_t geometry_type;
};

+struct encode_triangles_gfx12_args {
+   VOID_REF intermediate_bvh;
+   VOID_REF output_base;
+   REF(vk_ir_header) header;
+   uint32_t output_bvh_offset;
+   uint32_t leaf_node_offsets_offset;
+   uint32_t batches_size;
+};
+
struct header_args {
   REF(vk_ir_header) src;
   REF(radv_accel_struct_header) dst;

@@ -84,4 +95,11 @@ struct update_gfx12_args {
   vk_bvh_geometry_data geom_data0;
};

+#define RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_X 0
+#define RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Y 1
+#define RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Z 2
+#define RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X 3
+#define RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Y 4
+#define RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Z 5
+
#endif /* BUILD_INTERFACE_H */

@@ -195,4 +195,13 @@ struct radv_gfx12_primitive_node {
   uint32_t dwords[32];
};

+#define RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT 16
+#define RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT 8
+
+struct radv_triangle_encode_task {
+   uint32_t parent_offset;
+   /* The pair index is stored in the 4 high bits and the node index is stored in the low bits. */
+   uint32_t pair_index_node_index[RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT];
+};
+
#endif /* BVH_H */

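A quick illustration of that packing (the helper names are invented for this sketch; the shifts and mask match the encoder and decoder added in this commit):

#include <stdint.h>

/* Pair index in the 4 high bits, IR leaf node index in the low 28 bits. */
static inline uint32_t
pack_pair_index_node_index(uint32_t pair_index, uint32_t node_index)
{
   return (pair_index << 28) | (node_index & 0x0fffffff);
}

static inline uint32_t
unpack_node_index(uint32_t entry)
{
   return entry & 0x0fffffff; /* the shader masks with 0x0fffffff as well */
}
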
@@ -91,7 +91,8 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern
      right = RADV_BVH_INVALID_NODE;
   } else if (right != RADV_BVH_INVALID_NODE && ir_id_to_type(left) == vk_ir_node_triangle &&
              ir_id_to_type(right) == vk_ir_node_triangle &&
-             VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES)) {
+             (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) ||
+              VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES))) {
      second_child = right;
      right = RADV_BVH_INVALID_NODE;
   }

@@ -100,7 +101,8 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern
   }
   right = radv_read_invocation(cluster, collapse_index, right);

-  if (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES)) {
+  if (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) ||
+      VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) {
      bool is_valid_triangle = child != RADV_BVH_INVALID_NODE && ir_id_to_type(child) == vk_ir_node_triangle;
      uint32_t right_pair_mask =
         radv_ballot(cluster, is_valid_triangle && second_child == RADV_BVH_INVALID_NODE &&

@@ -142,6 +144,7 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern
   uint32_t dst_leaf_offset;
   uint32_t dst_internal_offset;
   if (cluster.invocation_index == 0) {
+     if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES))
         dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size);
      dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size);
   }

@@ -170,7 +173,17 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern
   extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
   uvec3 extent_exponents = floatBitsToUint(extent) >> 23;

-  uint32_t valid_child_count = child_leaf_node_count + child_internal_node_count;
+  uint32_t valid_child_count = child_internal_node_count;
+
+  uint32_t output_valid_child_count = valid_child_count;
+  /* Do not include triangle nodes if RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES because
+   * the count can only be computed by the encode pass.
+   */
+  if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES))
+     output_valid_child_count += child_leaf_node_count;
+
+  valid_child_count += child_leaf_node_count;
+
   if (cluster.invocation_index == 0) {
      DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0);
      DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0);

@@ -178,7 +191,7 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern
      DEREF(dst).parent_id = RADV_BVH_INVALID_NODE;
      DEREF(dst).origin = origin;
      DEREF(dst).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
-                                        (extent_exponents.z << 16) | ((valid_child_count - 1) << 28);
+                                        (extent_exponents.z << 16) | ((output_valid_child_count - 1) << 28);
      DEREF(dst).obb_matrix_index = 0x7f;
   }

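For reference, the child_count_exponents packing touched by this hunk can be spelled out as a small C helper (illustrative only; the field layout is inferred from the shifts above):

#include <stdint.h>

/* bits 0-7: x extent exponent, 8-15: y, 16-23: z, 28-31: valid child count - 1 */
static inline uint32_t
pack_child_count_exponents(uint32_t ex, uint32_t ey, uint32_t ez, uint32_t valid_child_count)
{
   return ex | (ey << 8) | (ez << 16) | ((valid_child_count - 1u) << 28);
}
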
@@ -199,6 +212,39 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern

         REF(radv_gfx12_box_node) child_box = REF(radv_gfx12_box_node)(dst_child_addr);
         DEREF(child_box).parent_id = node_id;
+     } else if (VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) {
+        /* We try to encode 16 (RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT) triangles into a single node. */
+        uint32_t batch_aligned_triangle_index;
+        if (cluster.invocation_index == radv_first_active_invocation(cluster)) {
+           /* Each invocation will encode a triangle pair. */
+           batch_aligned_triangle_index =
+              atomicAdd(DEREF(args.header).driver_internal[0], RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
+        }
+        batch_aligned_triangle_index =
+           radv_read_invocation(cluster, radv_first_active_invocation(cluster), batch_aligned_triangle_index);
+
+        VOID_REF triangle_tasks = OFFSET(args.header, SIZEOF(vk_ir_header));
+        REF(radv_triangle_encode_task) task =
+           INDEX(radv_triangle_encode_task, triangle_tasks,
+                 batch_aligned_triangle_index / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
+
+        if (cluster.invocation_index == radv_first_active_invocation(cluster))
+           DEREF(task).parent_offset = bvh_offset;
+
+        uint32_t triangle_pair_index = child_index - child_internal_node_count;
+
+        DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 0] =
+           (child_index << 28) | (ir_id_to_offset(child) / ir_leaf_node_size);
+        if (second_child != RADV_BVH_INVALID_NODE) {
+           DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 1] =
+              (child_index << 28) | (ir_id_to_offset(second_child) / ir_leaf_node_size);
+        } else {
+           DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 1] = RADV_BVH_INVALID_NODE;
+        }
+
+        if (child_leaf_node_count < RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT &&
+            cluster.invocation_index == radv_first_active_invocation(cluster))
+           DEREF(task).pair_index_node_index[child_leaf_node_count * 2] = RADV_BVH_INVALID_NODE;
      } else {
         if (VK_BUILD_FLAG(RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS)) {
            /* Write leaf node offset. */

692  src/amd/vulkan/bvh/encode_triangles_gfx12.comp  (new file)

@@ -0,0 +1,692 @@
/*
 * Copyright © 2025 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#version 460

#extension GL_GOOGLE_include_directive : require

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_shuffle : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_clustered : require

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

#define GFX12
#define USE_GLOBAL_SYNC

#include "vk_debug.h"

#include "build_helpers.h"
#include "build_interface.h"
#include "encode.h"
#include "invocation_cluster.h"

layout(push_constant) uniform CONSTS
{
   encode_triangles_gfx12_args args;
};

#define UNASSIGNED_VERTEX_INDICES 0xfffffffffffful

void
main()
{
   bool is_retry = VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY);

   uint32_t global_id = gl_GlobalInvocationID.x;

   /* Each invocation cluster handles one task. */
   radv_invocation_cluster cluster;
   radv_invocation_cluster_init(cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);

   uint32_t task_index = global_id / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
   if (is_retry) {
      VOID_REF retry_indices = OFFSET(args.header, SIZEOF(vk_ir_header) + args.batches_size);
      task_index = DEREF(INDEX(uint32_t, retry_indices, task_index));
   }

   VOID_REF triangle_tasks = OFFSET(args.header, SIZEOF(vk_ir_header));
   REF(radv_triangle_encode_task) task = INDEX(radv_triangle_encode_task, triangle_tasks, task_index);
   uint32_t pair_index_node_index0 = DEREF(task).pair_index_node_index[cluster.invocation_index * 2];
   uint32_t pair_index_node_index1 = DEREF(task).pair_index_node_index[cluster.invocation_index * 2 + 1];

   uint32_t total_pair_count = min(findLSB(radv_ballot(cluster, pair_index_node_index0 == RADV_BVH_INVALID_NODE)), 8u);

   if (cluster.invocation_index >= total_pair_count)
      return;

   uint32_t leaf_index0 = pair_index_node_index0 & 0x0fffffff;
   vk_ir_triangle_node node0 = DEREF(INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index0));

   uint32_t triangle_id0 = node0.triangle_id;
   uint32_t geometry_id0 = node0.geometry_id_and_flags & 0xffffff;
   bool opaque0 = (node0.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0;
   uint32_t triangle_id1 = triangle_id0;
   uint32_t geometry_id1 = geometry_id0;
   bool opaque1 = false;

   vec3 vertices[6];
   vertices[0] = vec3(node0.coords[0][0], node0.coords[0][1], node0.coords[0][2]);
   vertices[1] = vec3(node0.coords[1][0], node0.coords[1][1], node0.coords[1][2]);
   vertices[2] = vec3(node0.coords[2][0], node0.coords[2][1], node0.coords[2][2]);

   uint32_t pair_vertex_indices = 0x210;

   uint32_t pair_size = 1;
   if (pair_index_node_index1 != RADV_BVH_INVALID_NODE) {
      pair_size = 2;

      uint32_t leaf_index1 = pair_index_node_index1 & 0x0fffffff;
      vk_ir_triangle_node node1 = DEREF(INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index1));

      triangle_id1 = node1.triangle_id;
      geometry_id1 = node1.geometry_id_and_flags & 0xffffff;
      opaque1 = (node1.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0;

      vertices[3] = vec3(node1.coords[0][0], node1.coords[0][1], node1.coords[0][2]);
      vertices[4] = vec3(node1.coords[1][0], node1.coords[1][1], node1.coords[1][2]);
      vertices[5] = vec3(node1.coords[2][0], node1.coords[2][1], node1.coords[2][2]);

      pair_vertex_indices = 0x543210;

      /* Deduplicate vertices here so it does not have to be done during the compression loop. */
      for (uint32_t i = 0; i < 3; i++) {
         for (uint32_t j = 0; j < 3; j++) {
            if (vertices[3 + i] == vertices[j]) {
               uint32_t bit_offset = (i + 3) * 4;
               uint32_t clear_mask = ~(0xf << bit_offset);
               pair_vertex_indices = (pair_vertex_indices & clear_mask) | (j << bit_offset);
               break;
            }
         }
      }
   }
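
   /* Illustrative example of the packing above: with the default 0x543210,
    * nibble i simply refers to vertex slot i. If triangle 1 shares vertices so
    * that vertices[3] == vertices[1] and vertices[5] == vertices[0], the loop
    * rewrites pair_vertex_indices to 0x041210. */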

   /* Encode inside a loop. Every active invocation tries to compress with the previously chosen
    * nodes. The invocation with the smallest node size is chosen. TODO: Are there better heuristics?
    * If there are no new candidates because the node would be too large, encode the previously chosen nodes
    * and break out of the loop. In this case the first active invocation is chosen.
    */

   /* Each vertex is described by 8 bits. The highest 4 contain the invocation index and the low 4 bits contain the
    * array index.
    */
   uint64_t vertex_indices = UNASSIGNED_VERTEX_INDICES;
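   /* Illustrative example of that encoding: if vertex 2 of this pair maps to
    * array slot 1 of invocation 3, its byte is (3 << 4) | 1 = 0x31, stored at
    * bit offset 2 * 8. Six used bytes cover the up-to-six pair vertices. */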

   bool vertex_used[6] = {false, false, false, false, false, false};

   uint32_t hw_node_index = 0;
   uvec3 encode_vertex_payload_bit_size;
   uint32_t encode_trailing_zero_bits;
   uint32_t encode_geometry_id_base_bit_size;
   uint32_t encode_geometry_id_payload_bit_size;
   uint32_t encode_triangle_id_base_bit_size;
   uint32_t encode_triangle_id_payload_bit_size;
   uint32_t encode_indices_midpoint;

   uint32_t invocation_vertex_count = pair_index_node_index1 != RADV_BVH_INVALID_NODE ? 6 : 3;

   while (true) {
      /* assigned is true for every invocation whose triangles are already part of the node. */
      bool assigned = vertex_indices != UNASSIGNED_VERTEX_INDICES;
      uint32_t assigned_mask = radv_ballot(cluster, assigned);
      uint32_t first_assigned_invocation = findLSB(assigned_mask);
      uint32_t last_assigned_invocation = assigned_mask != 0 ? findMSB(assigned_mask) : 0;

      if (!assigned)
         vertex_indices = 0;

      bool found[6] = {false, false, false, false, false, false};

      /* At this point vertex_used is only set for assigned invocations since the rejected candidate invocations are
       * reset.
       */
      uint32_t vertex_count = 0;
      for (uint32_t i = 0; i < 6; i++)
         vertex_count += bitCount(radv_ballot(cluster, vertex_used[i]));

      for (uint32_t target_invocation = first_assigned_invocation; target_invocation <= last_assigned_invocation;
           target_invocation++) {

         if (((assigned_mask >> target_invocation) & 1) == 0)
            continue;

         vec3 target_vertices[6];
         bool target_vertex_used[6];
         for (uint32_t i = 0; i < 6; i++) {
            target_vertices[i] = radv_read_invocation(cluster, target_invocation, vertices[i]);
            target_vertex_used[i] = radv_read_invocation(cluster, target_invocation, vertex_used[i]);
         }

         uint32_t target_vertex_count = radv_read_invocation(cluster, target_invocation, invocation_vertex_count);

         if (!assigned) {
            for (uint32_t candidate_vertex_index = 0; candidate_vertex_index < invocation_vertex_count;
                 candidate_vertex_index++) {
               if (found[candidate_vertex_index])
                  continue;

               uint32_t assign_index = 0;

               for (uint32_t target_vertex_index = 0; target_vertex_index < target_vertex_count;
                    target_vertex_index++) {
                  if (target_vertex_used[target_vertex_index] &&
                      target_vertices[target_vertex_index] == vertices[candidate_vertex_index]) {
                     found[candidate_vertex_index] = true;
                     assign_index = target_vertex_index;
                  }
               }

               if (found[candidate_vertex_index])
                  vertex_indices |= uint64_t((target_invocation << 4) + assign_index)
                                    << uint64_t(candidate_vertex_index * 8);
            }
         }
      }

      /* Handle the remaining vertices that are not already present in the assigned invocations. */
      if (!assigned) {
         for (uint32_t i = 0; i < invocation_vertex_count; i++) {
            if (found[i])
               continue;

            uint32_t pair_vertex_index = (pair_vertex_indices >> (i * 4)) & 0xf;
            if (pair_vertex_index == i) {
               vertex_indices |= uint64_t((cluster.invocation_index << 4) + i) << uint64_t(i * 8);
               vertex_used[i] = true;
               vertex_count++;
            } else {
               uint64_t vertex_index = (vertex_indices >> uint64_t(pair_vertex_index * 8)) & 0xff;
               vertex_indices |= vertex_index << uint64_t(i * 8);
            }
         }
      }

      /* Compute the node layout and size. For assigned invocations, the values contain information about the node with
       * only the assigned triangles and for !assigned invocations, the current invocation is included.
       */

      uint32_t triangle_id_base_bit_size;
      uint32_t triangle_id_payload_bit_size;
      uint32_t geometry_id_base_bit_size;
      uint32_t geometry_id_payload_bit_size;
      for (uint32_t i = 0; i <= first_assigned_invocation; i++) {
         /* Determine the number of bits required to represent the node ids in the hw's encoding format.
          * Base and "offset" are masked and OR'd together, so look at the highest-ordered differing bit.
          */
         uint32_t triangle_id_base = radv_read_invocation(cluster, i, triangle_id0);
         triangle_id_base_bit_size = findMSB(triangle_id_base) + 1;
         uint32_t invoc_triangle_id_payload_bit_size =
            max(findMSB(triangle_id0 ^ triangle_id_base), findMSB(triangle_id1 ^ triangle_id_base)) + 1;
         triangle_id_payload_bit_size = subgroupClusteredMax(assigned ? invoc_triangle_id_payload_bit_size : 0,
                                                             RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);

         uint32_t geometry_id_base = radv_read_invocation(cluster, i, geometry_id0);
         geometry_id_base_bit_size = align(findMSB(geometry_id_base) + 1, 2);
         uint32_t invoc_geometry_id_payload_bit_size =
            max(findMSB(geometry_id0 ^ geometry_id_base), findMSB(geometry_id1 ^ geometry_id_base)) + 1;
         geometry_id_payload_bit_size = subgroupClusteredMax(assigned ? invoc_geometry_id_payload_bit_size : 0,
                                                             RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);

         if (!assigned) {
            triangle_id_payload_bit_size = max(triangle_id_payload_bit_size, invoc_triangle_id_payload_bit_size);
            geometry_id_payload_bit_size = max(geometry_id_payload_bit_size, invoc_geometry_id_payload_bit_size);
         }

         if (cluster.invocation_index <= i)
            break;
      }

      geometry_id_payload_bit_size = align(geometry_id_payload_bit_size, 2);

      /* vertex_used[0] is guaranteed to be true for at least one invocation. */
      uvec3 vertex_prefix = first_assigned_invocation == 0xffffffff
                               ? floatBitsToUint(vertices[0])
                               : radv_read_invocation(cluster, first_assigned_invocation, floatBitsToUint(vertices[0]));
      uvec3 vertex_payload_mask = uvec3(0);
      uint32_t vertex_non_zero_mask = 0;
      for (uint32_t i = 0; i < invocation_vertex_count; i++) {
         vertex_payload_mask |= vertex_prefix ^ floatBitsToUint(vertices[i]);
         vertex_non_zero_mask |=
            floatBitsToUint(vertices[i].x) | floatBitsToUint(vertices[i].y) | floatBitsToUint(vertices[i].z);
      }

      uint32_t trailing_zero_bits = min(findLSB(vertex_non_zero_mask), 32u);
      uvec3 vertex_payload_bit_size = min(findMSB(vertex_payload_mask), 31u) + 1;

      if (!assigned) {
         trailing_zero_bits = 32;
         vertex_payload_bit_size = uvec3(0);
      }

      trailing_zero_bits = subgroupClusteredMin(trailing_zero_bits, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
      vertex_payload_bit_size =
         subgroupClusteredMax(vertex_payload_bit_size, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);

      if (!assigned) {
         trailing_zero_bits = min(trailing_zero_bits, min(findLSB(vertex_non_zero_mask), 32u));
         vertex_payload_bit_size = max(vertex_payload_bit_size, min(findMSB(vertex_payload_mask), 31u) + 1);
      }

      vertex_payload_bit_size.x =
         vertex_payload_bit_size.x > trailing_zero_bits ? vertex_payload_bit_size.x - trailing_zero_bits : 1;
      vertex_payload_bit_size.y =
         vertex_payload_bit_size.y > trailing_zero_bits ? vertex_payload_bit_size.y - trailing_zero_bits : 1;
      vertex_payload_bit_size.z =
         vertex_payload_bit_size.z > trailing_zero_bits ? vertex_payload_bit_size.z - trailing_zero_bits : 1;

      uvec3 vertex_base_bit_size = uvec3(32 - trailing_zero_bits) - vertex_payload_bit_size;

      uint32_t required_bit_size = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE;

      required_bit_size += vertex_base_bit_size.x + vertex_base_bit_size.y + vertex_base_bit_size.z;
      required_bit_size +=
         vertex_count * (vertex_payload_bit_size.x + vertex_payload_bit_size.y + vertex_payload_bit_size.z);

      uint32_t pair_count = bitCount(assigned_mask);
      if (!assigned)
         pair_count++;

      required_bit_size += geometry_id_base_bit_size + (pair_count * 2 - 1) * geometry_id_payload_bit_size;
      uint32_t indices_midpoint = required_bit_size;
      required_bit_size += triangle_id_base_bit_size + (pair_count * 2 - 1) * triangle_id_payload_bit_size;

      uint32_t triangle_pair_descs_size = RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE * pair_count;
      required_bit_size += triangle_pair_descs_size;
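
      /* Illustrative example of this budget: the header written below is 52
       * bits (the sum of the bit_writer_write calls in this shader). With
       * trailing_zero_bits = 0, 16-bit vertex payloads, 14 unique vertices,
       * 4 pairs, a 24-bit geometry id base with 0 payload bits and a 16-bit
       * triangle id base with 4 payload bits, the node needs
       * 52 + 3 * 16 + 14 * 48 + 24 + (16 + 7 * 4) + 4 * 29 = 956 bits, which
       * still fits a 1024-bit (128-byte) hardware node. */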

      if (vertex_count > 15)
         required_bit_size = RADV_GFX12_BVH_NODE_SIZE * 8 + 1;

      /* This is only relevant for unassigned invocations. If every invocation is assigned, the 0xffffffff will force a
       * final flush.
       */
      uint32_t min_required_bit_size =
         subgroupClusteredMin(assigned ? 0xffffffff : required_bit_size, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);

      /* The last iteration always needs to write the remaining triangles. */
      if (min_required_bit_size > RADV_GFX12_BVH_NODE_SIZE * 8) {
         if (assigned) {
            encode_vertex_payload_bit_size = vertex_payload_bit_size;
            encode_trailing_zero_bits = trailing_zero_bits;
            encode_geometry_id_base_bit_size = geometry_id_base_bit_size;
            encode_geometry_id_payload_bit_size = geometry_id_payload_bit_size;
            encode_triangle_id_base_bit_size = triangle_id_base_bit_size;
            encode_triangle_id_payload_bit_size = triangle_id_payload_bit_size;
            encode_indices_midpoint = indices_midpoint;
            break;
         } else {
            hw_node_index++;

            vertex_indices = UNASSIGNED_VERTEX_INDICES;
            for (uint32_t i = 0; i < 6; i++)
               vertex_used[i] = false;
         }
      } else {
         uint32_t chosen_invocation =
            findMSB(radv_ballot(cluster, !assigned && required_bit_size == min_required_bit_size));
         if (cluster.invocation_index != chosen_invocation && !assigned) {
            vertex_indices = UNASSIGNED_VERTEX_INDICES;
            for (uint32_t i = 0; i < 6; i++)
               vertex_used[i] = false;
         }
      }
   }

   uint32_t hw_node_count = subgroupClusteredMax(hw_node_index, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) + 1;

   uint32_t pair_index;
   uint32_t pair_base_index = 0;
   uint32_t pair_count;
   uint32_t first_active_in_node;
   uint32_t node_mask;
   uint32_t node_invocations;
   for (uint32_t i = 0; i < hw_node_count; i++) {
      uint32_t current_node_mask = radv_ballot(cluster, hw_node_index == i);
      if (hw_node_index == i) {
         node_mask = current_node_mask;
         pair_count = bitCount(node_mask);
         first_active_in_node = findLSB(node_mask);
         pair_index = bitCount(node_mask & ((1u << cluster.invocation_index) - 1));
         node_invocations = subgroupClusteredOr(cluster.invocation_index << (pair_index * 4),
                                                RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
         break;
      }
      pair_base_index += bitCount(current_node_mask);
   }

   bool is_single_prim_node = pair_count == 1 && pair_index_node_index1 == RADV_BVH_INVALID_NODE;

   /* If there is a node that contains only one primitive, abort this encoding attempt and retry during a second pass
    * which will pair such nodes. This needs a separate pass so that the allocated nodes of two batches can be
    * guaranteed to be close enough since primitive nodes can only have small relative offsets. The retry pass has
    * RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (is_retry) set.
    */
   uint32_t single_prim_node_invoc = findLSB(radv_ballot(cluster, is_single_prim_node));
   bool has_single_prim_node = radv_ballot(cluster, is_single_prim_node) != 0;
   if (!is_retry && has_single_prim_node) {
      if (cluster.invocation_index == 0) {
         uint32_t retry_base_invocation =
            atomicAdd(DEREF(args.header).driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X],
                      RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);

         uint32_t retry_batch_index_index = retry_base_invocation / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
         VOID_REF retry_indices = OFFSET(args.header, SIZEOF(vk_ir_header) + args.batches_size);
         DEREF(INDEX(uint32_t, retry_indices, retry_batch_index_index)) = task_index;
      }

      return;
   }

   if (is_retry) {
      /* Move the single primitive node to the end since it needs to offset into the next batch. */
      uint32_t single_prim_pair_base_index = radv_read_invocation(cluster, single_prim_node_invoc, pair_base_index);

      if (pair_base_index > single_prim_pair_base_index)
         pair_base_index--;
      if (is_single_prim_node)
         pair_base_index = total_pair_count - 1;
   }

   REF(radv_gfx12_box_node) parent_node =
      REF(radv_gfx12_box_node)(args.output_base + args.output_bvh_offset + DEREF(task).parent_offset);
   uint32_t first_leaf_child_index = (DEREF(parent_node).child_count_exponents >> 28) + 1;
   if (first_leaf_child_index == 0x10)
      first_leaf_child_index = 0;

   /* Two batches are always combined into one during the retry pass if there is a second batch. The goal is to merge
    * all primitive nodes with just one triangle (except one if there is an odd number of such nodes). Since the
    * compression loop above can always merge at least two nodes, the following assumptions should hold:
    *
    * - There is at most one primitive node with only one triangle in a batch.
    * - This primitive has the max hw_node_index in this batch.
    *
    * If there is a second batch, the first batch will allocate one less primitive node. This is the triangle that will
    * be merged into the second batch which we know has the highest hw_node_index/dst_offset. The second batch starts in
    * dst memory where the primitive that was removed from the first batch should have been. The merged triangle can be
    * referenced in two different ways:
    *
    * - If the batch contains only one triangle, the primitive_base_id is changed to point at the merged node.
    * - Otherwise the node size inside the child info before the moved triangle child info is set to skip ahead to the
    *   merged primitive node in the second batch.
    */

   if (is_retry) {
      assert(bitCount(radv_ballot(cluster, is_single_prim_node)) == 1,
             "radv: encode_triangles_gfx12: There must be exactly one node with only one triangle.\n");
      assert(!is_single_prim_node || hw_node_index == hw_node_count - 1,
             "radv: encode_triangles_gfx12: The single triangle primitive node must be last.\n");
   }

   radv_invocation_cluster alloc_cluster = cluster;
   uint32_t alloc_hw_node_count = hw_node_count;
   bool has_second_batch = false;
   bool jump_to_second_batch = false;
   uint32_t single_prim_node_invocs[2];
   if (is_retry) {
      radv_invocation_cluster_init(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT * 2);

      has_second_batch = (radv_ballot(alloc_cluster, true) >> RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) != 0;

      single_prim_node_invocs[0] = radv_read_invocation(alloc_cluster, 0, single_prim_node_invoc);
      single_prim_node_invocs[1] =
         radv_read_invocation(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT, single_prim_node_invoc) +
         RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;

      if (has_second_batch) {
         alloc_hw_node_count =
            radv_read_invocation(alloc_cluster, 0, hw_node_count) +
            radv_read_invocation(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT, hw_node_count) - 1;

         jump_to_second_batch = alloc_cluster.invocation_index < RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
         if (is_single_prim_node) {
            encode_vertex_payload_bit_size = uvec3(32);
            encode_trailing_zero_bits = 0;
            encode_geometry_id_base_bit_size = 24;
            encode_geometry_id_payload_bit_size = 24;
            encode_triangle_id_base_bit_size = 24;
            encode_triangle_id_payload_bit_size = 24;
            encode_indices_midpoint = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE + 32 * 9 * 2 + 24 * 2;

            vertex_indices = 0;
            for (uint32_t i = 0; i < 6; i++) {
               vertex_used[i] = true;
               vertex_indices |= uint64_t((cluster.invocation_index << 4) + i) << uint64_t(i * 8);
            }

            vertices[3] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[0]);
            vertices[4] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[1]);
            vertices[5] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[2]);

            triangle_id1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], triangle_id0);
            geometry_id1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], geometry_id0);
            opaque1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], opaque0);

            /* Indicate that there is a second node. The actual value of pair_index_node_index1 is not used. */
            pair_index_node_index1 = 0;
         }
      }
   }

   /* Allocate space for the primitive node. */
   uint32_t dst_offset;
   if (cluster.invocation_index == 0) {
      if (alloc_cluster.invocation_index == 0)
         dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, alloc_hw_node_count * RADV_GFX12_BVH_NODE_SIZE);
      dst_offset = radv_read_invocation(alloc_cluster, 0, dst_offset);

      if (alloc_cluster.invocation_index == RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) {
         dst_offset +=
            radv_read_invocation(alloc_cluster, 0, hw_node_count) * RADV_GFX12_BVH_NODE_SIZE - RADV_GFX12_BVH_NODE_SIZE;
      }

      DEREF(parent_node).primitive_base_id = pack_node_id(dst_offset, 0);
      DEREF(parent_node).child_count_exponents = (DEREF(parent_node).child_count_exponents & 0x0fffffff) |
                                                 ((first_leaf_child_index + total_pair_count - 1) << 28);
   }
   dst_offset = radv_read_invocation(cluster, 0, dst_offset) + hw_node_index * RADV_GFX12_BVH_NODE_SIZE;

   uint32_t second_dst_offset = radv_read_invocation(alloc_cluster, single_prim_node_invocs[1], dst_offset);
   bool rewrite_primitive_base_id = jump_to_second_batch && total_pair_count == 1;
   if (rewrite_primitive_base_id)
      DEREF(parent_node).primitive_base_id = pack_node_id(second_dst_offset, 0);

   radv_gfx12_box_child child = DEREF(parent_node).children[first_leaf_child_index + cluster.invocation_index];

   memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                 gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

   if (pair_index < pair_count - 1)
      child.dword2 = child.dword2 & 0xffffff;

   uint32_t jump_size = radv_read_invocation(alloc_cluster, single_prim_node_invocs[1], dst_offset) - dst_offset;
   if (jump_to_second_batch && !rewrite_primitive_base_id && pair_base_index + pair_index == (total_pair_count - 1) - 1)
      child.dword2 = (child.dword2 & 0xffffff) | ((jump_size / RADV_GFX12_BVH_NODE_SIZE) << 28);

   /* Update the node type because it encodes the pair index which cannot be known in advance.
    * The BVH8 encoding uses 4 bits for the type. The high bit is used to reference up to 8 pairs.
    */
   child.dword2 |= ((pair_index & 0x3) << 24);
   if (pair_index >= 4)
      child.dword2 |= (8 << 24);
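
   /* Worked example of the type update above: pair_index 5 yields type bits
    * ((5 & 0x3) | 8) = 0x9 in dword2 bits 24-27. */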

   DEREF(parent_node).children[first_leaf_child_index + pair_base_index + pair_index] = child;

   /* Return because the triangle is written by the second batch. */
   if (is_single_prim_node && jump_to_second_batch)
      return;

   VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_offset;

   bit_writer writer;
   bit_writer_init(writer, dst_leaf_addr);

   if (cluster.invocation_index == first_active_in_node) {
      bit_writer_write(writer, encode_vertex_payload_bit_size.x - 1, 5); /* x_vertex_bits_minus_one */
      bit_writer_write(writer, encode_vertex_payload_bit_size.y - 1, 5); /* y_vertex_bits_minus_one */
      bit_writer_write(writer, encode_vertex_payload_bit_size.z - 1, 5); /* z_vertex_bits_minus_one */
      bit_writer_write(writer, encode_trailing_zero_bits, 5); /* trailing_zero_bits */
      bit_writer_write(writer, encode_geometry_id_base_bit_size / 2, 4); /* geometry_index_base_bits_div_2 */
      bit_writer_write(writer, encode_geometry_id_payload_bit_size / 2, 4); /* geometry_index_bits_div_2 */
      bit_writer_write(writer, pair_count - 1, 3); /* triangle_pair_count_minus_one */
      bit_writer_write(writer, 0, 1); /* vertex_type */
      bit_writer_write(writer, encode_triangle_id_base_bit_size, 5); /* primitive_index_base_bits */
      bit_writer_write(writer, encode_triangle_id_payload_bit_size, 5); /* primitive_index_bits */
      bit_writer_write(writer, encode_indices_midpoint, 10);

      uvec3 vertex_prefix = floatBitsToUint(vertices[0]);
      uvec3 vertex_base_bit_size = uvec3(32 - encode_trailing_zero_bits) - encode_vertex_payload_bit_size;
      if (vertex_base_bit_size.x > 0) {
         bit_writer_write(writer, vertex_prefix.x >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.x),
                          vertex_base_bit_size.x);
      }
      if (vertex_base_bit_size.y > 0) {
         bit_writer_write(writer, vertex_prefix.y >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.y),
                          vertex_base_bit_size.y);
      }
      if (vertex_base_bit_size.z > 0) {
         bit_writer_write(writer, vertex_prefix.z >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.z),
                          vertex_base_bit_size.z);
      }
   }

   uint32_t vertex_used_mask[6];
   for (uint32_t processed_node_index = 0; processed_node_index < hw_node_count; processed_node_index++) {
      if (processed_node_index != hw_node_index)
         continue;

      for (uint32_t vertex_index = 0; vertex_index < 6; vertex_index++)
         vertex_used_mask[vertex_index] = radv_ballot(cluster, vertex_used[vertex_index]);
   }

   for (uint32_t vertex_index = 0; vertex_index < 6; vertex_index++) {
      uvec3 vertex = floatBitsToUint(vertices[vertex_index]) >> encode_trailing_zero_bits;
      vertex = vertex & uvec3((1ul << uint64_t(encode_vertex_payload_bit_size.x)) - 1,
                              (1ul << uint64_t(encode_vertex_payload_bit_size.y)) - 1,
                              (1ul << uint64_t(encode_vertex_payload_bit_size.z)) - 1);

      for (uint32_t i = 0; i < pair_count; i++) {
         uint32_t invocation = (node_invocations >> (i * 4)) & 0xf;
         if ((vertex_used_mask[vertex_index] & (1u << invocation)) == 0)
            continue;

         uvec3 current_vertex = radv_read_invocation(cluster, invocation, vertex);

         if (cluster.invocation_index == first_active_in_node) {
            bit_writer_write(writer, current_vertex.x, encode_vertex_payload_bit_size.x);
            bit_writer_write(writer, current_vertex.y, encode_vertex_payload_bit_size.y);
            bit_writer_write(writer, current_vertex.z, encode_vertex_payload_bit_size.z);
         }
      }
   }

   if (encode_geometry_id_payload_bit_size > 0) {
      uint32_t geometry_id_payload_mask =
         (encode_geometry_id_payload_bit_size == 32) ? 0xffffffff : ((1u << encode_geometry_id_payload_bit_size) - 1);
      uint32_t geometry_id_payloads[2] = {
         geometry_id0 & geometry_id_payload_mask,
         geometry_id1 & geometry_id_payload_mask,
      };

      for (int32_t i = int32_t(pair_count) - 1; i >= 0; i--) {
         uint32_t invocation = (node_invocations >> (uint32_t(i) * 4)) & 0xf;

         uint32_t payload0 = radv_read_invocation(cluster, invocation, geometry_id_payloads[0]);
         uint32_t payload1 = radv_read_invocation(cluster, invocation, geometry_id_payloads[1]);
         if (cluster.invocation_index == first_active_in_node) {
            bit_writer_write(writer, payload1, encode_geometry_id_payload_bit_size);
            if (invocation != first_active_in_node)
               bit_writer_write(writer, payload0, encode_geometry_id_payload_bit_size);
         }
      }
   }

   if (cluster.invocation_index == first_active_in_node) {
      bit_writer_write(writer, geometry_id0, encode_geometry_id_base_bit_size);
      bit_writer_write(writer, triangle_id0, encode_triangle_id_base_bit_size);
   }

   if (encode_triangle_id_payload_bit_size > 0) {
      uint32_t triangle_id_payload_mask =
         (encode_triangle_id_payload_bit_size == 32) ? 0xffffffff : ((1u << encode_triangle_id_payload_bit_size) - 1);
      uint32_t triangle_id_payloads[2] = {
         triangle_id0 & triangle_id_payload_mask,
         triangle_id1 & triangle_id_payload_mask,
      };

      for (uint32_t i = 0; i < pair_count; i++) {
         uint32_t invocation = (node_invocations >> (i * 4)) & 0xf;

         uint32_t payload0 = radv_read_invocation(cluster, invocation, triangle_id_payloads[0]);
         uint32_t payload1 = radv_read_invocation(cluster, invocation, triangle_id_payloads[1]);
         if (cluster.invocation_index == first_active_in_node) {
            if (invocation != first_active_in_node)
               bit_writer_write(writer, payload0, encode_triangle_id_payload_bit_size);
            bit_writer_write(writer, payload1, encode_triangle_id_payload_bit_size);
         }
      }
   }

   if (cluster.invocation_index == first_active_in_node) {
      uint32_t triangle_pair_descs_size = RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE * pair_count;
      uint32_t target = 32 * 32 - triangle_pair_descs_size;
      uint32_t skip_count = target - writer.total_count;
      if (skip_count <= 32)
         bit_writer_write(writer, 0, skip_count);
      else
         bit_writer_skip_to(writer, target);
   }

   uint32_t encoded_vertex_indices = 0;
   for (uint32_t i = 0; i < 6; i++) {
      uint32_t vertex_index = uint32_t((vertex_indices >> (i * 8)) & 0xff);
      uint32_t invocation = vertex_index >> 4;
      uint32_t array_index = vertex_index & 0xf;

      uint32_t encoded_index = bitCount(vertex_used_mask[array_index] & ((1u << invocation) - 1));
      for (uint32_t j = 0; j < 5; j++) {
         if (array_index > j) {
            encoded_index += bitCount(vertex_used_mask[j]);
         }
      }

      encoded_vertex_indices |= (encoded_index << (i * 4));
   }

   for (int32_t i = int32_t(pair_count) - 1; i >= 0; i--) {
      uint32_t invocation = (node_invocations >> (uint32_t(i) * 4)) & 0xf;

      bool has_second_triangle =
         radv_read_invocation(cluster, invocation, pair_index_node_index1 != RADV_BVH_INVALID_NODE);
      bool current_opaque0 = radv_read_invocation(cluster, invocation, opaque0);
      bool current_opaque1 = radv_read_invocation(cluster, invocation, opaque1);
      uint32_t current_encoded_vertex_indices = radv_read_invocation(cluster, invocation, encoded_vertex_indices);

      if (cluster.invocation_index == first_active_in_node) {
         bit_writer_write(writer, 1, 1); /* prim_range_stop */
         bit_writer_write(writer, 0, 1); /* tri1_double_sided */
         bit_writer_write(writer, (has_second_triangle && current_opaque1) ? 1 : 0, 1); /* tri1_opaque */
         bit_writer_write(writer, has_second_triangle ? (current_encoded_vertex_indices >> 12) : 0,
                          12); /* tri1_v0_index, tri1_v1_index, tri1_v2_index */
         bit_writer_write(writer, 0, 1); /* tri0_double_sided */
         bit_writer_write(writer, current_opaque0 ? 1 : 0, 1); /* tri0_opaque */
         bit_writer_write(writer, current_encoded_vertex_indices & 0xfff,
                          12); /* tri0_v0_index, tri0_v1_index, tri0_v2_index */
      }
   }

   if (cluster.invocation_index == first_active_in_node)
      bit_writer_finish(writer);
}

@@ -14,6 +14,9 @@ bvh_shaders = [
  [
    'encode_gfx12.comp',
    'encode_gfx12',
+ ],[
+   'encode_triangles_gfx12.comp',
+   'encode_triangles_gfx12',
  ],
  [
    'encode.comp',

@@ -120,6 +120,7 @@ enum radv_meta_object_key_type {
   RADV_META_OBJECT_KEY_BVH_COPY,
   RADV_META_OBJECT_KEY_BVH_COPY_BLAS_ADDRS_GFX12,
   RADV_META_OBJECT_KEY_BVH_ENCODE,
+  RADV_META_OBJECT_KEY_BVH_ENCODE_TRIANGLES_GFX12,
   RADV_META_OBJECT_KEY_BVH_UPDATE,
   RADV_META_OBJECT_KEY_BVH_HEADER,
};

@@ -33,6 +33,10 @@ static const uint32_t encode_gfx12_spv[] = {
#include "bvh/encode_gfx12.spv.h"
};

+static const uint32_t encode_triangles_gfx12_spv[] = {
+#include "bvh/encode_triangles_gfx12.spv.h"
+};
+
static const uint32_t header_spv[] = {
#include "bvh/header.spv.h"
};

@@ -71,6 +75,7 @@ enum radv_encode_key_bits {
   RADV_ENCODE_KEY_COMPACT = (1 << 0),
   RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 1),
   RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 2),
+  RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 3),
};

static void

@@ -148,12 +153,16 @@ radv_get_acceleration_structure_layout(struct radv_device *device,
      /* root node */
      offset += internal_node_size;

+     if (!(state->config.encode_key[0] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)) {
         accel_struct->leaf_nodes_offset = offset;
         offset += bvh_leaf_size * state->leaf_node_count;
+     }

      accel_struct->internal_nodes_offset = offset;
      /* Factor out the root node. */
      offset += internal_node_size * (internal_count - 1);
+     if (state->config.encode_key[0] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)
+        offset += bvh_leaf_size * state->leaf_node_count;

      accel_struct->size = offset;
   }

@@ -230,6 +239,23 @@ radv_get_as_size(VkDevice _device, const struct vk_acceleration_structure_build_
   return accel_struct.size;
}

+static uint32_t
+radv_get_triangle_batches_size(const struct vk_acceleration_structure_build_state *state)
+{
+   return state->leaf_node_count * sizeof(struct radv_triangle_encode_task);
+}
+
+static VkDeviceSize
+radv_get_encode_scratch_size(VkDevice _device, const struct vk_acceleration_structure_build_state *state)
+{
+   if (state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) {
+      uint32_t retry_batch_indices_size = state->leaf_node_count * sizeof(uint32_t);
+      return radv_get_triangle_batches_size(state) + retry_batch_indices_size;
+   }
+
+   return 0;
+}
+
static VkDeviceSize
radv_get_update_scratch_size(VkDevice _device, const struct vk_acceleration_structure_build_state *state)
{

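A worked example of this scratch sizing (the struct layout is the one added to bvh.h in this commit; the triangle count is made up):

#include <stdint.h>
#include <stdio.h>

/* Mirrors struct radv_triangle_encode_task from this commit:
 * 4 + 16 * 4 = 68 bytes per task slot, allocated per leaf node. */
struct radv_triangle_encode_task {
   uint32_t parent_offset;
   uint32_t pair_index_node_index[16];
};

int
main(void)
{
   uint32_t leaf_node_count = 1000000; /* example BLAS with 1M triangles */
   uint64_t batches_size = (uint64_t)leaf_node_count * sizeof(struct radv_triangle_encode_task);
   uint64_t retry_indices_size = (uint64_t)leaf_node_count * sizeof(uint32_t);
   /* 68 MB of batches + 4 MB of retry indices = 72 MB of encode scratch. */
   printf("%llu bytes\n", (unsigned long long)(batches_size + retry_indices_size));
   return 0;
}
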
@@ -267,7 +293,7 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
      if (!(state->build_info->flags & (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR |
                                        VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) &&
          geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
-        encode_key |= RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12;
+        encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
   }

   if (state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR)

@@ -275,6 +301,7 @@

   state->config.encode_key[0] = encode_key;
   state->config.encode_key[1] = encode_key;
+  state->config.encode_key[2] = encode_key;

   uint32_t update_key = 0;
   if (state->build_info->srcAccelerationStructure == state->build_info->dstAccelerationStructure)

@@ -351,6 +378,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key)
      flags |= RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS;
   if (key & RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12)
      flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES;
+  if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)
+     flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES;

   return flags;
}

@@ -438,11 +467,22 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const struct vk_acceleration
      },
      .dst_node_offset = layout.internal_nodes_offset - layout.bvh_offset,
      .dst_leaf_node_offset = layout.leaf_nodes_offset - layout.bvh_offset,
+     .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_X] = 0,
+     .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Y] = 1,
+     .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Z] = 1,
+     .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X] = 0,
+     .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Y] = 1,
+     .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Z] = 1,
   };

+  uint32_t header_update_size =
+     offsetof(struct vk_ir_header, driver_internal) - offsetof(struct vk_ir_header, sync_data);
+  if (state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)
+     header_update_size = sizeof(struct vk_ir_header) - offsetof(struct vk_ir_header, sync_data);
+
   const uint8_t *update_data = ((const uint8_t *)&header + offsetof(struct vk_ir_header, sync_data));
   radv_update_memory_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, sync_data), update_data,
-                        sizeof(struct vk_ir_header) - offsetof(struct vk_ir_header, sync_data));
+                        header_update_size);
   if (radv_device_physical(device)->info.cp_sdma_ge_use_system_memory_scope)
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2;

@@ -467,6 +507,118 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const struct vk_acceleration
   radv_compute_dispatch(cmd_buffer, &dispatch);
}

+static VkResult
+radv_encode_triangles_bind_pipeline_gfx12(VkCommandBuffer commandBuffer,
+                                          const struct vk_acceleration_structure_build_state *state)
+{
+   bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
+   if (!compress_triangles)
+      return VK_SUCCESS;
+
+   /* Wait for internal encoding to finish. */
+   vk_barrier_compute_w_to_compute_r(commandBuffer);
+
+   radv_bvh_build_bind_pipeline(commandBuffer, RADV_META_OBJECT_KEY_BVH_ENCODE_TRIANGLES_GFX12,
+                                encode_triangles_gfx12_spv, sizeof(encode_triangles_gfx12_spv),
+                                sizeof(struct encode_triangles_gfx12_args), 0);
+
+   return VK_SUCCESS;
+}
+
+static void
+radv_encode_triangles_gfx12(VkCommandBuffer commandBuffer, const struct vk_acceleration_structure_build_state *state)
+{
+   bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
+   if (!compress_triangles)
+      return;
+
+   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   VK_FROM_HANDLE(vk_acceleration_structure, dst, state->build_info->dstAccelerationStructure);
+   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
+
+   uint64_t intermediate_header_addr = state->build_info->scratchData.deviceAddress + state->scratch.header_offset;
+   uint64_t intermediate_bvh_addr = state->build_info->scratchData.deviceAddress + state->scratch.ir_offset;
+
+   struct acceleration_structure_layout layout;
+   radv_get_acceleration_structure_layout(device, state, &layout);
+
+   const struct encode_triangles_gfx12_args args = {
+      .intermediate_bvh = intermediate_bvh_addr,
+      .output_base = vk_acceleration_structure_get_va(dst),
+      .header = intermediate_header_addr,
+      .output_bvh_offset = layout.bvh_offset,
+      .leaf_node_offsets_offset = layout.leaf_node_offsets_offset,
+      .batches_size = radv_get_triangle_batches_size(state),
+   };
+   radv_bvh_build_set_args(commandBuffer, &args, sizeof(args));
+
+   struct radv_dispatch_info dispatch = {
+      .unaligned = true,
+      .indirect_va = intermediate_header_addr +
+                     offsetof(struct vk_ir_header, driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_X]),
+   };
+
+   radv_compute_dispatch(cmd_buffer, &dispatch);
+}
+
+static VkResult
+radv_encode_triangles_retry_bind_pipeline_gfx12(VkCommandBuffer commandBuffer,
+                                                const struct vk_acceleration_structure_build_state *state)
+{
+   bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
+   if (!compress_triangles)
+      return VK_SUCCESS;
+
+   /* Wait for the first triangle compression pass to finish. */
+   vk_barrier_compute_w_to_compute_r(commandBuffer);
+   vk_barrier_compute_w_to_indirect_compute_r(commandBuffer);
+
+   radv_bvh_build_bind_pipeline(commandBuffer, RADV_META_OBJECT_KEY_BVH_ENCODE_TRIANGLES_GFX12,
+                                encode_triangles_gfx12_spv, sizeof(encode_triangles_gfx12_spv),
+                                sizeof(struct encode_triangles_gfx12_args),
+                                RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY);
+
+   return VK_SUCCESS;
+}
+
+static void
+radv_encode_triangles_retry_gfx12(VkCommandBuffer commandBuffer,
+                                  const struct vk_acceleration_structure_build_state *state)
+{
+   bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
+   if (!compress_triangles)
+      return;
+
+   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   VK_FROM_HANDLE(vk_acceleration_structure, dst, state->build_info->dstAccelerationStructure);
+   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
+
+   uint64_t intermediate_header_addr = state->build_info->scratchData.deviceAddress + state->scratch.header_offset;
+   uint64_t intermediate_bvh_addr = state->build_info->scratchData.deviceAddress + state->scratch.ir_offset;
+
+   struct acceleration_structure_layout layout;
+   radv_get_acceleration_structure_layout(device, state, &layout);
+
+   const struct encode_triangles_gfx12_args args = {
+      .intermediate_bvh = intermediate_bvh_addr,
+      .output_base = vk_acceleration_structure_get_va(dst),
+      .header = intermediate_header_addr,
+      .output_bvh_offset = layout.bvh_offset,
+      .leaf_node_offsets_offset = layout.leaf_node_offsets_offset,
+      .batches_size = radv_get_triangle_batches_size(state),
+   };
+   radv_bvh_build_set_args(commandBuffer, &args, sizeof(args));
+
+   struct radv_dispatch_info dispatch = {
+      .unaligned = true,
+      .indirect_va =
+         intermediate_header_addr +
+         offsetof(struct vk_ir_header, driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X]),
+   };
+
+   radv_compute_dispatch(cmd_buffer, &dispatch);
+}
+
static VkResult
radv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, const struct vk_acceleration_structure_build_state *state)
{

@@ -806,20 +958,29 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
      .get_build_config = radv_get_build_config,
      .get_as_size = radv_get_as_size,
      .get_update_scratch_size = radv_get_update_scratch_size,
-     .encode_bind_pipeline[1] = radv_init_header_bind_pipeline,
-     .encode_as[1] = radv_init_header,
      .init_update_scratch = radv_init_update_scratch,
      .update_bind_pipeline[0] = radv_update_bind_pipeline,
   };

   if (radv_use_bvh8(pdev)) {
      device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as_gfx12;
+     device->meta_state.accel_struct_build.build_ops.get_encode_scratch_size = radv_get_encode_scratch_size;
      device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline_gfx12;
      device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as_gfx12;
+     device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[1] =
+        radv_encode_triangles_bind_pipeline_gfx12;
+     device->meta_state.accel_struct_build.build_ops.encode_as[1] = radv_encode_triangles_gfx12;
+     device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[2] =
+        radv_encode_triangles_retry_bind_pipeline_gfx12;
+     device->meta_state.accel_struct_build.build_ops.encode_as[2] = radv_encode_triangles_retry_gfx12;
+     device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[3] = radv_init_header_bind_pipeline;
+     device->meta_state.accel_struct_build.build_ops.encode_as[3] = radv_init_header;
   } else {
      device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as;
      device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline;
      device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as;
+     device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[1] = radv_init_header_bind_pipeline;
+     device->meta_state.accel_struct_build.build_ops.encode_as[1] = radv_init_header;
      device->meta_state.accel_struct_build.build_ops.leaf_spirv_override = leaf_spv;
      device->meta_state.accel_struct_build.build_ops.leaf_spirv_override_size = sizeof(leaf_spv);
   }