diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h index 4f9d418319b..a63a534d349 100644 --- a/src/amd/vulkan/bvh/build_helpers.h +++ b/src/amd/vulkan/bvh/build_helpers.h @@ -21,6 +21,7 @@ TYPE(radv_gfx12_box_node, 4); TYPE(radv_gfx12_instance_node, 8); TYPE(radv_gfx12_instance_node_user_data, 4); TYPE(radv_gfx12_primitive_node, 4); +TYPE(radv_triangle_encode_task, 4); uint32_t id_to_offset(uint32_t id) diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h index 22e76c3f181..861d59eb0bb 100644 --- a/src/amd/vulkan/bvh/build_interface.h +++ b/src/amd/vulkan/bvh/build_interface.h @@ -18,13 +18,15 @@ #define VOID_REF uint64_t #endif -#define RADV_BUILD_FLAG_COMPACT (1u << (VK_BUILD_FLAG_COUNT + 0)) -#define RADV_BUILD_FLAG_BVH8 (1u << (VK_BUILD_FLAG_COUNT + 1)) -#define RADV_BUILD_FLAG_UPDATE_IN_PLACE (1u << (VK_BUILD_FLAG_COUNT + 2)) -#define RADV_BUILD_FLAG_NO_INFS (1u << (VK_BUILD_FLAG_COUNT + 3)) -#define RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS (1u << (VK_BUILD_FLAG_COUNT + 4)) -#define RADV_BUILD_FLAG_UPDATE_SINGLE_GEOMETRY (1u << (VK_BUILD_FLAG_COUNT + 5)) -#define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6)) +#define RADV_BUILD_FLAG_COMPACT (1u << (VK_BUILD_FLAG_COUNT + 0)) +#define RADV_BUILD_FLAG_BVH8 (1u << (VK_BUILD_FLAG_COUNT + 1)) +#define RADV_BUILD_FLAG_UPDATE_IN_PLACE (1u << (VK_BUILD_FLAG_COUNT + 2)) +#define RADV_BUILD_FLAG_NO_INFS (1u << (VK_BUILD_FLAG_COUNT + 3)) +#define RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS (1u << (VK_BUILD_FLAG_COUNT + 4)) +#define RADV_BUILD_FLAG_UPDATE_SINGLE_GEOMETRY (1u << (VK_BUILD_FLAG_COUNT + 5)) +#define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6)) +#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 7)) +#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 8)) #define RADV_COPY_MODE_COPY 0 #define RADV_COPY_MODE_SERIALIZE 1 @@ -55,6 +57,15 @@ struct encode_gfx12_args { uint32_t geometry_type; }; +struct encode_triangles_gfx12_args { + VOID_REF intermediate_bvh; + VOID_REF output_base; + REF(vk_ir_header) header; + uint32_t output_bvh_offset; + uint32_t leaf_node_offsets_offset; + uint32_t batches_size; +}; + struct header_args { REF(vk_ir_header) src; REF(radv_accel_struct_header) dst; @@ -84,4 +95,11 @@ struct update_gfx12_args { vk_bvh_geometry_data geom_data0; }; +#define RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_X 0 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Y 1 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Z 2 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X 3 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Y 4 +#define RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Z 5 + #endif /* BUILD_INTERFACE_H */ diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h index 54e4d424a91..f6e867df6bb 100644 --- a/src/amd/vulkan/bvh/bvh.h +++ b/src/amd/vulkan/bvh/bvh.h @@ -195,4 +195,13 @@ struct radv_gfx12_primitive_node { uint32_t dwords[32]; }; +#define RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT 16 +#define RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT 8 + +struct radv_triangle_encode_task { + uint32_t parent_offset; + /* The pair index is stored in the 4 high bits and the node index is stored in the low bits. 
*/ + uint32_t pair_index_node_index[RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT]; +}; + #endif /* BVH_H */ diff --git a/src/amd/vulkan/bvh/encode_gfx12.comp b/src/amd/vulkan/bvh/encode_gfx12.comp index c59548d0647..0ee77dd9545 100644 --- a/src/amd/vulkan/bvh/encode_gfx12.comp +++ b/src/amd/vulkan/bvh/encode_gfx12.comp @@ -91,7 +91,8 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern right = RADV_BVH_INVALID_NODE; } else if (right != RADV_BVH_INVALID_NODE && ir_id_to_type(left) == vk_ir_node_triangle && ir_id_to_type(right) == vk_ir_node_triangle && - VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES)) { + (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) || + VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES))) { second_child = right; right = RADV_BVH_INVALID_NODE; } @@ -100,7 +101,8 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern } right = radv_read_invocation(cluster, collapse_index, right); - if (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES)) { + if (VK_BUILD_FLAG(RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES) || + VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) { bool is_valid_triangle = child != RADV_BVH_INVALID_NODE && ir_id_to_type(child) == vk_ir_node_triangle; uint32_t right_pair_mask = radv_ballot(cluster, is_valid_triangle && second_child == RADV_BVH_INVALID_NODE && @@ -142,7 +144,8 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern uint32_t dst_leaf_offset; uint32_t dst_internal_offset; if (cluster.invocation_index == 0) { - dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size); + if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) + dst_leaf_offset = atomicAdd(DEREF(args.header).dst_leaf_node_offset, child_leaf_nodes_size); dst_internal_offset = atomicAdd(DEREF(args.header).dst_node_offset, child_internal_nodes_size); } dst_leaf_offset = radv_read_invocation(cluster, 0, dst_leaf_offset); @@ -170,7 +173,17 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000); uvec3 extent_exponents = floatBitsToUint(extent) >> 23; - uint32_t valid_child_count = child_leaf_node_count + child_internal_node_count; + uint32_t valid_child_count = child_internal_node_count; + + uint32_t output_valid_child_count = valid_child_count; + /* Do not include triangle nodes if RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES because + * the count can only be computed by the encode pass. 
+ */ + if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) + output_valid_child_count += child_leaf_node_count; + + valid_child_count += child_leaf_node_count; + if (cluster.invocation_index == 0) { DEREF(dst).internal_base_id = pack_node_id(dst_internal_offset, 0); DEREF(dst).primitive_base_id = pack_node_id(dst_leaf_offset, 0); @@ -178,7 +191,7 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern DEREF(dst).parent_id = RADV_BVH_INVALID_NODE; DEREF(dst).origin = origin; DEREF(dst).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) | - (extent_exponents.z << 16) | ((valid_child_count - 1) << 28); + (extent_exponents.z << 16) | ((output_valid_child_count - 1) << 28); DEREF(dst).obb_matrix_index = 0x7f; } @@ -199,6 +212,39 @@ encode_gfx12(uint32_t ir_leaf_node_size, REF(vk_ir_box_node) intermediate_intern REF(radv_gfx12_box_node) child_box = REF(radv_gfx12_box_node)(dst_child_addr); DEREF(child_box).parent_id = node_id; + } else if (VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES)) { + /* We try to encode 16 (RADV_TRIANGLE_ENCODE_TASK_TRIANGLE_COUNT) triangles into a single node. */ + uint32_t batch_aligned_triangle_index; + if (cluster.invocation_index == radv_first_active_invocation(cluster)) { + /* Each invocation will encode a triangle pair. */ + batch_aligned_triangle_index = + atomicAdd(DEREF(args.header).driver_internal[0], RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + } + batch_aligned_triangle_index = + radv_read_invocation(cluster, radv_first_active_invocation(cluster), batch_aligned_triangle_index); + + VOID_REF triangle_tasks = OFFSET(args.header, SIZEOF(vk_ir_header)); + REF(radv_triangle_encode_task) task = + INDEX(radv_triangle_encode_task, triangle_tasks, + batch_aligned_triangle_index / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + if (cluster.invocation_index == radv_first_active_invocation(cluster)) + DEREF(task).parent_offset = bvh_offset; + + uint32_t triangle_pair_index = child_index - child_internal_node_count; + + DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 0] = + (child_index << 28) | (ir_id_to_offset(child) / ir_leaf_node_size); + if (second_child != RADV_BVH_INVALID_NODE) { + DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 1] = + (child_index << 28) | (ir_id_to_offset(second_child) / ir_leaf_node_size); + } else { + DEREF(task).pair_index_node_index[triangle_pair_index * 2 + 1] = RADV_BVH_INVALID_NODE; + } + + if (child_leaf_node_count < RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT && + cluster.invocation_index == radv_first_active_invocation(cluster)) + DEREF(task).pair_index_node_index[child_leaf_node_count * 2] = RADV_BVH_INVALID_NODE; } else { if (VK_BUILD_FLAG(RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS)) { /* Write leaf node offset. 
*/ diff --git a/src/amd/vulkan/bvh/encode_triangles_gfx12.comp b/src/amd/vulkan/bvh/encode_triangles_gfx12.comp new file mode 100644 index 00000000000..b28169ff7d0 --- /dev/null +++ b/src/amd/vulkan/bvh/encode_triangles_gfx12.comp @@ -0,0 +1,692 @@ +/* + * Copyright © 2025 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_KHR_memory_scope_semantics : require +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_clustered : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +#define GFX12 +#define USE_GLOBAL_SYNC + +#include "vk_debug.h" + +#include "build_helpers.h" +#include "build_interface.h" +#include "encode.h" +#include "invocation_cluster.h" + +layout(push_constant) uniform CONSTS +{ + encode_triangles_gfx12_args args; +}; + +#define UNASSIGNED_VERTEX_INDICES 0xfffffffffffful + +void +main() +{ + bool is_retry = VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY); + + uint32_t global_id = gl_GlobalInvocationID.x; + + /* Each invocation cluster handles one task. */ + radv_invocation_cluster cluster; + radv_invocation_cluster_init(cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + uint32_t task_index = global_id / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT; + if (is_retry) { + VOID_REF retry_indices = OFFSET(args.header, SIZEOF(vk_ir_header) + args.batches_size); + task_index = DEREF(INDEX(uint32_t, retry_indices, task_index)); + } + + VOID_REF triangle_tasks = OFFSET(args.header, SIZEOF(vk_ir_header)); + REF(radv_triangle_encode_task) task = INDEX(radv_triangle_encode_task, triangle_tasks, task_index); + uint32_t pair_index_node_index0 = DEREF(task).pair_index_node_index[cluster.invocation_index * 2]; + uint32_t pair_index_node_index1 = DEREF(task).pair_index_node_index[cluster.invocation_index * 2 + 1]; + + uint32_t total_pair_count = min(findLSB(radv_ballot(cluster, pair_index_node_index0 == RADV_BVH_INVALID_NODE)), 8u); + + if (cluster.invocation_index >= total_pair_count) + return; + + uint32_t leaf_index0 = pair_index_node_index0 & 0x0fffffff; + vk_ir_triangle_node node0 = DEREF(INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index0)); + + uint32_t triangle_id0 = node0.triangle_id; + uint32_t geometry_id0 = node0.geometry_id_and_flags & 0xffffff; + bool opaque0 = (node0.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0; + uint32_t triangle_id1 = triangle_id0; + uint32_t geometry_id1 = geometry_id0; + bool opaque1 = false; + + vec3 vertices[6]; + vertices[0] = vec3(node0.coords[0][0], node0.coords[0][1], node0.coords[0][2]); + vertices[1] = vec3(node0.coords[1][0], node0.coords[1][1], node0.coords[1][2]); + vertices[2] = vec3(node0.coords[2][0], node0.coords[2][1], node0.coords[2][2]); + + uint32_t pair_vertex_indices = 0x210; + + uint32_t pair_size = 1; + if (pair_index_node_index1 != RADV_BVH_INVALID_NODE) { + pair_size = 2; + 
+      uint32_t leaf_index1 = pair_index_node_index1 & 0x0fffffff;
+      vk_ir_triangle_node node1 = DEREF(INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index1));
+
+      triangle_id1 = node1.triangle_id;
+      geometry_id1 = node1.geometry_id_and_flags & 0xffffff;
+      opaque1 = (node1.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0;
+
+      vertices[3] = vec3(node1.coords[0][0], node1.coords[0][1], node1.coords[0][2]);
+      vertices[4] = vec3(node1.coords[1][0], node1.coords[1][1], node1.coords[1][2]);
+      vertices[5] = vec3(node1.coords[2][0], node1.coords[2][1], node1.coords[2][2]);
+
+      pair_vertex_indices = 0x543210;
+
+      /* Deduplicate vertices here so it does not have to be done during the compression loop. */
+      for (uint32_t i = 0; i < 3; i++) {
+         for (uint32_t j = 0; j < 3; j++) {
+            if (vertices[3 + i] == vertices[j]) {
+               uint32_t bit_offset = (i + 3) * 4;
+               uint32_t clear_mask = ~(0xf << bit_offset);
+               pair_vertex_indices = (pair_vertex_indices & clear_mask) | (j << bit_offset);
+               break;
+            }
+         }
+      }
+   }
+
+   /* Encode inside a loop. Every active invocation tries to compress with the previously chosen
+    * nodes. The invocation with the smallest node size is chosen. TODO: Are there better heuristics?
+    * If there are no new candidates because the node would be too large, encode the previously chosen nodes
+    * and break out of the loop. In this case the first active invocation is chosen.
+    */
+
+   /* Each vertex is described by 8 bits. The highest 4 contain the invocation index and the low 4 bits contain the
+    * array index.
+    */
+   uint64_t vertex_indices = UNASSIGNED_VERTEX_INDICES;
+
+   bool vertex_used[6] = {false, false, false, false, false, false};
+
+   uint32_t hw_node_index = 0;
+   uvec3 encode_vertex_payload_bit_size;
+   uint32_t encode_trailing_zero_bits;
+   uint32_t encode_geometry_id_base_bit_size;
+   uint32_t encode_geometry_id_payload_bit_size;
+   uint32_t encode_triangle_id_base_bit_size;
+   uint32_t encode_triangle_id_payload_bit_size;
+   uint32_t encode_indices_midpoint;
+
+   uint32_t invocation_vertex_count = pair_index_node_index1 != RADV_BVH_INVALID_NODE ? 6 : 3;
+
+   while (true) {
+      /* assigned is true for every invocation whose triangles are already part of the node. */
+      bool assigned = vertex_indices != UNASSIGNED_VERTEX_INDICES;
+      uint32_t assigned_mask = radv_ballot(cluster, assigned);
+      uint32_t first_assigned_invocation = findLSB(assigned_mask);
+      uint32_t last_assigned_invocation = assigned_mask != 0 ? findMSB(assigned_mask) : 0;
+
+      if (!assigned)
+         vertex_indices = 0;
+
+      bool found[6] = {false, false, false, false, false, false};
+
+      /* At this point vertex_used is only set for assigned invocations since the rejected candidate invocations are
+       * reset.
+ */ + uint32_t vertex_count = 0; + for (uint32_t i = 0; i < 6; i++) + vertex_count += bitCount(radv_ballot(cluster, vertex_used[i])); + + for (uint32_t target_invocation = first_assigned_invocation; target_invocation <= last_assigned_invocation; + target_invocation++) { + + if (((assigned_mask >> target_invocation) & 1) == 0) + continue; + + vec3 target_vertices[6]; + bool target_vertex_used[6]; + for (uint32_t i = 0; i < 6; i++) { + target_vertices[i] = radv_read_invocation(cluster, target_invocation, vertices[i]); + target_vertex_used[i] = radv_read_invocation(cluster, target_invocation, vertex_used[i]); + } + + uint32_t target_vertex_count = radv_read_invocation(cluster, target_invocation, invocation_vertex_count); + + if (!assigned) { + for (uint32_t candidate_vertex_index = 0; candidate_vertex_index < invocation_vertex_count; + candidate_vertex_index++) { + if (found[candidate_vertex_index]) + continue; + + uint32_t assign_index = 0; + + for (uint32_t target_vertex_index = 0; target_vertex_index < target_vertex_count; + target_vertex_index++) { + if (target_vertex_used[target_vertex_index] && + target_vertices[target_vertex_index] == vertices[candidate_vertex_index]) { + found[candidate_vertex_index] = true; + assign_index = target_vertex_index; + } + } + + if (found[candidate_vertex_index]) + vertex_indices |= uint64_t((target_invocation << 4) + assign_index) + << uint64_t(candidate_vertex_index * 8); + } + } + } + + /* Handle the remaining vertices that are not already present in the assigned invocations. */ + if (!assigned) { + for (uint32_t i = 0; i < invocation_vertex_count; i++) { + if (found[i]) + continue; + + uint32_t pair_vertex_index = (pair_vertex_indices >> (i * 4)) & 0xf; + if (pair_vertex_index == i) { + vertex_indices |= uint64_t((cluster.invocation_index << 4) + i) << uint64_t(i * 8); + vertex_used[i] = true; + vertex_count++; + } else { + uint64_t vertex_index = (vertex_indices >> uint64_t(pair_vertex_index * 8)) & 0xff; + vertex_indices |= vertex_index << uint64_t(i * 8); + } + } + } + + /* Compute the node layout and size. For assigned invocations, the values contain information about the node with + * only the assigned triangles and for !assigned invocations, the current invocation is included. + */ + + uint32_t triangle_id_base_bit_size; + uint32_t triangle_id_payload_bit_size; + uint32_t geometry_id_base_bit_size; + uint32_t geometry_id_payload_bit_size; + for (uint32_t i = 0; i <= first_assigned_invocation; i++) { + /* Determine the number of bits required to represent the node ids in the hw's encoding format. + * Base and "offset" are masked and OR'd together, so look at the highest-ordered differing bit. + */ + uint32_t triangle_id_base = radv_read_invocation(cluster, i, triangle_id0); + triangle_id_base_bit_size = findMSB(triangle_id_base) + 1; + uint32_t invoc_triangle_id_payload_bit_size = + max(findMSB(triangle_id0 ^ triangle_id_base), findMSB(triangle_id1 ^ triangle_id_base)) + 1; + triangle_id_payload_bit_size = subgroupClusteredMax(assigned ? invoc_triangle_id_payload_bit_size : 0, + RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + uint32_t geometry_id_base = radv_read_invocation(cluster, i, geometry_id0); + geometry_id_base_bit_size = align(findMSB(geometry_id_base) + 1, 2); + uint32_t invoc_geometry_id_payload_bit_size = + max(findMSB(geometry_id0 ^ geometry_id_base), findMSB(geometry_id1 ^ geometry_id_base)) + 1; + geometry_id_payload_bit_size = subgroupClusteredMax(assigned ? 
invoc_geometry_id_payload_bit_size : 0, + RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + if (!assigned) { + triangle_id_payload_bit_size = max(triangle_id_payload_bit_size, invoc_triangle_id_payload_bit_size); + geometry_id_payload_bit_size = max(geometry_id_payload_bit_size, invoc_geometry_id_payload_bit_size); + } + + if (cluster.invocation_index <= i) + break; + } + + geometry_id_payload_bit_size = align(geometry_id_payload_bit_size, 2); + + /* vertex_used[0] is guaranteed to be true for at least one invocation. */ + uvec3 vertex_prefix = first_assigned_invocation == 0xffffffff + ? floatBitsToUint(vertices[0]) + : radv_read_invocation(cluster, first_assigned_invocation, floatBitsToUint(vertices[0])); + uvec3 vertex_payload_mask = uvec3(0); + uint32_t vertex_non_zero_mask = 0; + for (uint32_t i = 0; i < invocation_vertex_count; i++) { + vertex_payload_mask |= vertex_prefix ^ floatBitsToUint(vertices[i]); + vertex_non_zero_mask |= + floatBitsToUint(vertices[i].x) | floatBitsToUint(vertices[i].y) | floatBitsToUint(vertices[i].z); + } + + uint32_t trailing_zero_bits = min(findLSB(vertex_non_zero_mask), 32u); + uvec3 vertex_payload_bit_size = min(findMSB(vertex_payload_mask), 31u) + 1; + + if (!assigned) { + trailing_zero_bits = 32; + vertex_payload_bit_size = uvec3(0); + } + + trailing_zero_bits = subgroupClusteredMin(trailing_zero_bits, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + vertex_payload_bit_size = + subgroupClusteredMax(vertex_payload_bit_size, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + if (!assigned) { + trailing_zero_bits = min(trailing_zero_bits, min(findLSB(vertex_non_zero_mask), 32u)); + vertex_payload_bit_size = max(vertex_payload_bit_size, min(findMSB(vertex_payload_mask), 31u) + 1); + } + + vertex_payload_bit_size.x = + vertex_payload_bit_size.x > trailing_zero_bits ? vertex_payload_bit_size.x - trailing_zero_bits : 1; + vertex_payload_bit_size.y = + vertex_payload_bit_size.y > trailing_zero_bits ? vertex_payload_bit_size.y - trailing_zero_bits : 1; + vertex_payload_bit_size.z = + vertex_payload_bit_size.z > trailing_zero_bits ? vertex_payload_bit_size.z - trailing_zero_bits : 1; + + uvec3 vertex_base_bit_size = uvec3(32 - trailing_zero_bits) - vertex_payload_bit_size; + + uint32_t required_bit_size = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE; + + required_bit_size += vertex_base_bit_size.x + vertex_base_bit_size.y + vertex_base_bit_size.z; + required_bit_size += + vertex_count * (vertex_payload_bit_size.x + vertex_payload_bit_size.y + vertex_payload_bit_size.z); + + uint32_t pair_count = bitCount(assigned_mask); + if (!assigned) + pair_count++; + + required_bit_size += geometry_id_base_bit_size + (pair_count * 2 - 1) * geometry_id_payload_bit_size; + uint32_t indices_midpoint = required_bit_size; + required_bit_size += triangle_id_base_bit_size + (pair_count * 2 - 1) * triangle_id_payload_bit_size; + + uint32_t triangle_pair_descs_size = RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE * pair_count; + required_bit_size += triangle_pair_descs_size; + + if (vertex_count > 15) + required_bit_size = RADV_GFX12_BVH_NODE_SIZE * 8 + 1; + + /* This is only relevant for unassigned invocations. If every invocation is assigned, the 0xffffffff will force a + * final flush. + */ + uint32_t min_required_bit_size = + subgroupClusteredMin(assigned ? 0xffffffff : required_bit_size, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + /* The last iteration always needs to write the remaining triangles. 
*/ + if (min_required_bit_size > RADV_GFX12_BVH_NODE_SIZE * 8) { + if (assigned) { + encode_vertex_payload_bit_size = vertex_payload_bit_size; + encode_trailing_zero_bits = trailing_zero_bits; + encode_geometry_id_base_bit_size = geometry_id_base_bit_size; + encode_geometry_id_payload_bit_size = geometry_id_payload_bit_size; + encode_triangle_id_base_bit_size = triangle_id_base_bit_size; + encode_triangle_id_payload_bit_size = triangle_id_payload_bit_size; + encode_indices_midpoint = indices_midpoint; + break; + } else { + hw_node_index++; + + vertex_indices = UNASSIGNED_VERTEX_INDICES; + for (uint32_t i = 0; i < 6; i++) + vertex_used[i] = false; + } + } else { + uint32_t chosen_invocation = + findMSB(radv_ballot(cluster, !assigned && required_bit_size == min_required_bit_size)); + if (cluster.invocation_index != chosen_invocation && !assigned) { + vertex_indices = UNASSIGNED_VERTEX_INDICES; + for (uint32_t i = 0; i < 6; i++) + vertex_used[i] = false; + } + } + } + + uint32_t hw_node_count = subgroupClusteredMax(hw_node_index, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) + 1; + + uint32_t pair_index; + uint32_t pair_base_index = 0; + uint32_t pair_count; + uint32_t first_active_in_node; + uint32_t node_mask; + uint32_t node_invocations; + for (uint32_t i = 0; i < hw_node_count; i++) { + uint32_t current_node_mask = radv_ballot(cluster, hw_node_index == i); + if (hw_node_index == i) { + node_mask = current_node_mask; + pair_count = bitCount(node_mask); + first_active_in_node = findLSB(node_mask); + pair_index = bitCount(node_mask & ((1u << cluster.invocation_index) - 1)); + node_invocations = subgroupClusteredOr(cluster.invocation_index << (pair_index * 4), + RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + break; + } + pair_base_index += bitCount(current_node_mask); + } + + bool is_single_prim_node = pair_count == 1 && pair_index_node_index1 == RADV_BVH_INVALID_NODE; + + /* If there is a node that contains only one primitive, abort this encoding attempt and retry during a second pass + * which will pair such nodes. This needs a separate pass so that the allocated nodes of two batches can be + * guaranteed to be close enough since primitive nodes can only have small relative offsets. The retry pass has + * RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (is_retry) set. + */ + uint32_t single_prim_node_invoc = findLSB(radv_ballot(cluster, is_single_prim_node)); + bool has_single_prim_node = radv_ballot(cluster, is_single_prim_node) != 0; + if (!is_retry && has_single_prim_node) { + if (cluster.invocation_index == 0) { + uint32_t retry_base_invocation = + atomicAdd(DEREF(args.header).driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X], + RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT); + + uint32_t retry_batch_index_index = retry_base_invocation / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT; + VOID_REF retry_indices = OFFSET(args.header, SIZEOF(vk_ir_header) + args.batches_size); + DEREF(INDEX(uint32_t, retry_indices, retry_batch_index_index)) = task_index; + } + + return; + } + + if (is_retry) { + /* Move the single primitive node to the end since it needs to offset into the next batch. 
*/
+      uint32_t single_prim_pair_base_index = radv_read_invocation(cluster, single_prim_node_invoc, pair_base_index);
+
+      if (pair_base_index > single_prim_pair_base_index)
+         pair_base_index--;
+      if (is_single_prim_node)
+         pair_base_index = total_pair_count - 1;
+   }
+
+   REF(radv_gfx12_box_node) parent_node =
+      REF(radv_gfx12_box_node)(args.output_base + args.output_bvh_offset + DEREF(task).parent_offset);
+   uint32_t first_leaf_child_index = (DEREF(parent_node).child_count_exponents >> 28) + 1;
+   if (first_leaf_child_index == 0x10)
+      first_leaf_child_index = 0;
+
+   /* Two batches are always combined into one during the retry pass if there is a second batch. The goal is to merge
+    * all primitive nodes with just one triangle (except one if there is an odd number of such nodes). Since the
+    * compression loop above can always merge at least two nodes, the following assumptions should hold:
+    *
+    * - There is at most one primitive node with only one triangle in a batch.
+    * - This primitive has the max hw_node_index in this batch.
+    *
+    * If there is a second batch, the first batch will allocate one less primitive node. This is the triangle that will
+    * be merged into the second batch which we know has the highest hw_node_index/dst_offset. The second batch starts in
+    * dst memory where the primitive that was removed from the first batch should have been. The merged triangle can be
+    * referenced in two different ways:
+    *
+    * - If the batch contains only one triangle, the primitive_base_id is changed to point at the merged node.
+    * - Otherwise the node size inside the child info before the moved triangle child info is set to skip ahead to the
+    *   merged primitive node in the second batch.
+    */
+
+   if (is_retry) {
+      assert(bitCount(radv_ballot(cluster, is_single_prim_node)) == 1,
+             "radv: encode_triangles_gfx12: There must be exactly one node with only one triangle.\n");
+      assert(!is_single_prim_node || hw_node_index == hw_node_count - 1,
+             "radv: encode_triangles_gfx12: The single triangle primitive node must be last.\n");
+   }
+
+   radv_invocation_cluster alloc_cluster = cluster;
+   uint32_t alloc_hw_node_count = hw_node_count;
+   bool has_second_batch = false;
+   bool jump_to_second_batch = false;
+   uint32_t single_prim_node_invocs[2];
+   if (is_retry) {
+      radv_invocation_cluster_init(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT * 2);
+
+      has_second_batch = (radv_ballot(alloc_cluster, true) >> RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) != 0;
+
+      single_prim_node_invocs[0] = radv_read_invocation(alloc_cluster, 0, single_prim_node_invoc);
+      single_prim_node_invocs[1] =
+         radv_read_invocation(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT, single_prim_node_invoc) +
+         RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
+
+      if (has_second_batch) {
+         alloc_hw_node_count =
+            radv_read_invocation(alloc_cluster, 0, hw_node_count) +
+            radv_read_invocation(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT, hw_node_count) - 1;
+
+         jump_to_second_batch = alloc_cluster.invocation_index < RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
+         if (is_single_prim_node) {
+            encode_vertex_payload_bit_size = uvec3(32);
+            encode_trailing_zero_bits = 0;
+            encode_geometry_id_base_bit_size = 24;
+            encode_geometry_id_payload_bit_size = 24;
+            encode_triangle_id_base_bit_size = 24;
+            encode_triangle_id_payload_bit_size = 24;
+            encode_indices_midpoint = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE + 32 * 9 * 2 + 24 * 2;
+
+            vertex_indices = 0;
+            for (uint32_t i = 0; i < 6; i++) {
+               vertex_used[i] = true;
+
vertex_indices |= uint64_t((cluster.invocation_index << 4) + i) << uint64_t(i * 8); + } + + vertices[3] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[0]); + vertices[4] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[1]); + vertices[5] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[2]); + + triangle_id1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], triangle_id0); + geometry_id1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], geometry_id0); + opaque1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], opaque0); + + /* Indicate that there is a second node. The actual value of pair_index_node_index1 is not used. */ + pair_index_node_index1 = 0; + } + } + } + + /* Allocate space for the primitive node. */ + uint32_t dst_offset; + if (cluster.invocation_index == 0) { + if (alloc_cluster.invocation_index == 0) + dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, alloc_hw_node_count * RADV_GFX12_BVH_NODE_SIZE); + dst_offset = radv_read_invocation(alloc_cluster, 0, dst_offset); + + if (alloc_cluster.invocation_index == RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) { + dst_offset += + radv_read_invocation(alloc_cluster, 0, hw_node_count) * RADV_GFX12_BVH_NODE_SIZE - RADV_GFX12_BVH_NODE_SIZE; + } + + DEREF(parent_node).primitive_base_id = pack_node_id(dst_offset, 0); + DEREF(parent_node).child_count_exponents = (DEREF(parent_node).child_count_exponents & 0x0fffffff) | + ((first_leaf_child_index + total_pair_count - 1) << 28); + } + dst_offset = radv_read_invocation(cluster, 0, dst_offset) + hw_node_index * RADV_GFX12_BVH_NODE_SIZE; + + uint32_t second_dst_offset = radv_read_invocation(alloc_cluster, single_prim_node_invocs[1], dst_offset); + bool rewrite_primitive_base_id = jump_to_second_batch && total_pair_count == 1; + if (rewrite_primitive_base_id) + DEREF(parent_node).primitive_base_id = pack_node_id(second_dst_offset, 0); + + radv_gfx12_box_child child = DEREF(parent_node).children[first_leaf_child_index + cluster.invocation_index]; + + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + + if (pair_index < pair_count - 1) + child.dword2 = child.dword2 & 0xffffff; + + uint32_t jump_size = radv_read_invocation(alloc_cluster, single_prim_node_invocs[1], dst_offset) - dst_offset; + if (jump_to_second_batch && !rewrite_primitive_base_id && pair_base_index + pair_index == (total_pair_count - 1) - 1) + child.dword2 = (child.dword2 & 0xffffff) | ((jump_size / RADV_GFX12_BVH_NODE_SIZE) << 28); + + /* Update the node type because it encodes the pair index which cannot be known in advance. + * The BVH8 encoding uses 4 bits for the type. The high bit is used to reference up to 8 pairs. + */ + child.dword2 |= ((pair_index & 0x3) << 24); + if (pair_index >= 4) + child.dword2 |= (8 << 24); + + DEREF(parent_node).children[first_leaf_child_index + pair_base_index + pair_index] = child; + + /* Return because the triangle is written by the second batch. 
*/ + if (is_single_prim_node && jump_to_second_batch) + return; + + VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_offset; + + bit_writer writer; + bit_writer_init(writer, dst_leaf_addr); + + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, encode_vertex_payload_bit_size.x - 1, 5); /* x_vertex_bits_minus_one */ + bit_writer_write(writer, encode_vertex_payload_bit_size.y - 1, 5); /* y_vertex_bits_minus_one */ + bit_writer_write(writer, encode_vertex_payload_bit_size.z - 1, 5); /* z_vertex_bits_minus_one */ + bit_writer_write(writer, encode_trailing_zero_bits, 5); /* trailing_zero_bits */ + bit_writer_write(writer, encode_geometry_id_base_bit_size / 2, 4); /* geometry_index_base_bits_div_2 */ + bit_writer_write(writer, encode_geometry_id_payload_bit_size / 2, 4); /* geometry_index_bits_div_2 */ + bit_writer_write(writer, pair_count - 1, 3); /* triangle_pair_count_minus_one */ + bit_writer_write(writer, 0, 1); /* vertex_type */ + bit_writer_write(writer, encode_triangle_id_base_bit_size, 5); /* primitive_index_base_bits */ + bit_writer_write(writer, encode_triangle_id_payload_bit_size, 5); /* primitive_index_bits */ + bit_writer_write(writer, encode_indices_midpoint, 10); + + uvec3 vertex_prefix = floatBitsToUint(vertices[0]); + uvec3 vertex_base_bit_size = uvec3(32 - encode_trailing_zero_bits) - encode_vertex_payload_bit_size; + if (vertex_base_bit_size.x > 0) { + bit_writer_write(writer, vertex_prefix.x >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.x), + vertex_base_bit_size.x); + } + if (vertex_base_bit_size.y > 0) { + bit_writer_write(writer, vertex_prefix.y >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.y), + vertex_base_bit_size.y); + } + if (vertex_base_bit_size.z > 0) { + bit_writer_write(writer, vertex_prefix.z >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.z), + vertex_base_bit_size.z); + } + } + + uint32_t vertex_used_mask[6]; + for (uint32_t processed_node_index = 0; processed_node_index < hw_node_count; processed_node_index++) { + if (processed_node_index != hw_node_index) + continue; + + for (uint32_t vertex_index = 0; vertex_index < 6; vertex_index++) + vertex_used_mask[vertex_index] = radv_ballot(cluster, vertex_used[vertex_index]); + } + + for (uint32_t vertex_index = 0; vertex_index < 6; vertex_index++) { + uvec3 vertex = floatBitsToUint(vertices[vertex_index]) >> encode_trailing_zero_bits; + vertex = vertex & uvec3((1ul << uint64_t(encode_vertex_payload_bit_size.x)) - 1, + (1ul << uint64_t(encode_vertex_payload_bit_size.y)) - 1, + (1ul << uint64_t(encode_vertex_payload_bit_size.z)) - 1); + + for (uint32_t i = 0; i < pair_count; i++) { + uint32_t invocation = (node_invocations >> (i * 4)) & 0xf; + if ((vertex_used_mask[vertex_index] & (1u << invocation)) == 0) + continue; + + uvec3 current_vertex = radv_read_invocation(cluster, invocation, vertex); + + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, current_vertex.x, encode_vertex_payload_bit_size.x); + bit_writer_write(writer, current_vertex.y, encode_vertex_payload_bit_size.y); + bit_writer_write(writer, current_vertex.z, encode_vertex_payload_bit_size.z); + } + } + } + + if (encode_geometry_id_payload_bit_size > 0) { + uint32_t geometry_id_payload_mask = + (encode_geometry_id_payload_bit_size == 32) ? 
0xffffffff : ((1u << encode_geometry_id_payload_bit_size) - 1); + uint32_t geometry_id_payloads[2] = { + geometry_id0 & geometry_id_payload_mask, + geometry_id1 & geometry_id_payload_mask, + }; + + for (int32_t i = int32_t(pair_count) - 1; i >= 0; i--) { + uint32_t invocation = (node_invocations >> (uint32_t(i) * 4)) & 0xf; + + uint32_t payload0 = radv_read_invocation(cluster, invocation, geometry_id_payloads[0]); + uint32_t payload1 = radv_read_invocation(cluster, invocation, geometry_id_payloads[1]); + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, payload1, encode_geometry_id_payload_bit_size); + if (invocation != first_active_in_node) + bit_writer_write(writer, payload0, encode_geometry_id_payload_bit_size); + } + } + } + + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, geometry_id0, encode_geometry_id_base_bit_size); + bit_writer_write(writer, triangle_id0, encode_triangle_id_base_bit_size); + } + + if (encode_triangle_id_payload_bit_size > 0) { + uint32_t triangle_id_payload_mask = + (encode_triangle_id_payload_bit_size == 32) ? 0xffffffff : ((1u << encode_triangle_id_payload_bit_size) - 1); + uint32_t triangle_id_payloads[2] = { + triangle_id0 & triangle_id_payload_mask, + triangle_id1 & triangle_id_payload_mask, + }; + + for (uint32_t i = 0; i < pair_count; i++) { + uint32_t invocation = (node_invocations >> (i * 4)) & 0xf; + + uint32_t payload0 = radv_read_invocation(cluster, invocation, triangle_id_payloads[0]); + uint32_t payload1 = radv_read_invocation(cluster, invocation, triangle_id_payloads[1]); + if (cluster.invocation_index == first_active_in_node) { + if (invocation != first_active_in_node) + bit_writer_write(writer, payload0, encode_triangle_id_payload_bit_size); + bit_writer_write(writer, payload1, encode_triangle_id_payload_bit_size); + } + } + } + + if (cluster.invocation_index == first_active_in_node) { + uint32_t triangle_pair_descs_size = RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE * pair_count; + uint32_t target = 32 * 32 - triangle_pair_descs_size; + uint32_t skip_count = target - writer.total_count; + if (skip_count <= 32) + bit_writer_write(writer, 0, skip_count); + else + bit_writer_skip_to(writer, target); + } + + uint32_t encoded_vertex_indices = 0; + for (uint32_t i = 0; i < 6; i++) { + uint32_t vertex_index = uint32_t((vertex_indices >> (i * 8)) & 0xff); + uint32_t invocation = vertex_index >> 4; + uint32_t array_index = vertex_index & 0xf; + + uint32_t encoded_index = bitCount(vertex_used_mask[array_index] & ((1u << invocation) - 1)); + for (uint32_t j = 0; j < 5; j++) { + if (array_index > j) { + encoded_index += bitCount(vertex_used_mask[j]); + } + } + + encoded_vertex_indices |= (encoded_index << (i * 4)); + } + + for (int32_t i = int32_t(pair_count) - 1; i >= 0; i--) { + uint32_t invocation = (node_invocations >> (uint32_t(i) * 4)) & 0xf; + + bool has_second_triangle = + radv_read_invocation(cluster, invocation, pair_index_node_index1 != RADV_BVH_INVALID_NODE); + bool current_opaque0 = radv_read_invocation(cluster, invocation, opaque0); + bool current_opaque1 = radv_read_invocation(cluster, invocation, opaque1); + uint32_t current_encoded_vertex_indices = radv_read_invocation(cluster, invocation, encoded_vertex_indices); + + if (cluster.invocation_index == first_active_in_node) { + bit_writer_write(writer, 1, 1); /* prim_range_stop */ + bit_writer_write(writer, 0, 1); /* tri1_double_sided */ + bit_writer_write(writer, (has_second_triangle && current_opaque1) ? 
1 : 0, 1); /* tri1_opaque */ + bit_writer_write(writer, has_second_triangle ? (current_encoded_vertex_indices >> 12) : 0, + 12); /* tri1_v0_index, tri1_v1_index, tri1_v2_index */ + bit_writer_write(writer, 0, 1); /* tri0_double_sided */ + bit_writer_write(writer, current_opaque0 ? 1 : 0, 1); /* tri0_opaque */ + bit_writer_write(writer, current_encoded_vertex_indices & 0xfff, + 12); /* tri0_v0_index, tri0_v1_index, tri0_v2_index */ + } + } + + if (cluster.invocation_index == first_active_in_node) + bit_writer_finish(writer); +} diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build index 654692cc31c..3320ef67428 100644 --- a/src/amd/vulkan/bvh/meson.build +++ b/src/amd/vulkan/bvh/meson.build @@ -14,6 +14,9 @@ bvh_shaders = [ [ 'encode_gfx12.comp', 'encode_gfx12', + ],[ + 'encode_triangles_gfx12.comp', + 'encode_triangles_gfx12', ], [ 'encode.comp', diff --git a/src/amd/vulkan/meta/radv_meta.h b/src/amd/vulkan/meta/radv_meta.h index 4401ff72c6c..715d31e87e6 100644 --- a/src/amd/vulkan/meta/radv_meta.h +++ b/src/amd/vulkan/meta/radv_meta.h @@ -120,6 +120,7 @@ enum radv_meta_object_key_type { RADV_META_OBJECT_KEY_BVH_COPY, RADV_META_OBJECT_KEY_BVH_COPY_BLAS_ADDRS_GFX12, RADV_META_OBJECT_KEY_BVH_ENCODE, + RADV_META_OBJECT_KEY_BVH_ENCODE_TRIANGLES_GFX12, RADV_META_OBJECT_KEY_BVH_UPDATE, RADV_META_OBJECT_KEY_BVH_HEADER, }; diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index 4f5cde9491f..3d8734e5068 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -33,6 +33,10 @@ static const uint32_t encode_gfx12_spv[] = { #include "bvh/encode_gfx12.spv.h" }; +static const uint32_t encode_triangles_gfx12_spv[] = { +#include "bvh/encode_triangles_gfx12.spv.h" +}; + static const uint32_t header_spv[] = { #include "bvh/header.spv.h" }; @@ -71,6 +75,7 @@ enum radv_encode_key_bits { RADV_ENCODE_KEY_COMPACT = (1 << 0), RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 1), RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 2), + RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 3), }; static void @@ -148,12 +153,16 @@ radv_get_acceleration_structure_layout(struct radv_device *device, /* root node */ offset += internal_node_size; - accel_struct->leaf_nodes_offset = offset; - offset += bvh_leaf_size * state->leaf_node_count; + if (!(state->config.encode_key[0] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)) { + accel_struct->leaf_nodes_offset = offset; + offset += bvh_leaf_size * state->leaf_node_count; + } accel_struct->internal_nodes_offset = offset; /* Factor out the root node. 
*/ offset += internal_node_size * (internal_count - 1); + if (state->config.encode_key[0] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) + offset += bvh_leaf_size * state->leaf_node_count; accel_struct->size = offset; } @@ -230,6 +239,23 @@ radv_get_as_size(VkDevice _device, const struct vk_acceleration_structure_build_ return accel_struct.size; } +static uint32_t +radv_get_triangle_batches_size(const struct vk_acceleration_structure_build_state *state) +{ + return state->leaf_node_count * sizeof(struct radv_triangle_encode_task); +} + +static VkDeviceSize +radv_get_encode_scratch_size(VkDevice _device, const struct vk_acceleration_structure_build_state *state) +{ + if (state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) { + uint32_t retry_batch_indices_size = state->leaf_node_count * sizeof(uint32_t); + return radv_get_triangle_batches_size(state) + retry_batch_indices_size; + } + + return 0; +} + static VkDeviceSize radv_get_update_scratch_size(VkDevice _device, const struct vk_acceleration_structure_build_state *state) { @@ -267,7 +293,7 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s if (!(state->build_info->flags & (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR | VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) && geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) - encode_key |= RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12; + encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; } if (state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) @@ -275,6 +301,7 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s state->config.encode_key[0] = encode_key; state->config.encode_key[1] = encode_key; + state->config.encode_key[2] = encode_key; uint32_t update_key = 0; if (state->build_info->srcAccelerationStructure == state->build_info->dstAccelerationStructure) @@ -351,6 +378,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key) flags |= RADV_BUILD_FLAG_WRITE_LEAF_NODE_OFFSETS; if (key & RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12) flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES; + if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) + flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES; return flags; } @@ -438,11 +467,22 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const struct vk_acceleration }, .dst_node_offset = layout.internal_nodes_offset - layout.bvh_offset, .dst_leaf_node_offset = layout.leaf_nodes_offset - layout.bvh_offset, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_X] = 0, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Y] = 1, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_Z] = 1, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X] = 0, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Y] = 1, + .driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_Z] = 1, }; + uint32_t header_update_size = + offsetof(struct vk_ir_header, driver_internal) - offsetof(struct vk_ir_header, sync_data); + if (state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12) + header_update_size = sizeof(struct vk_ir_header) - offsetof(struct vk_ir_header, sync_data); + const uint8_t *update_data = ((const uint8_t *)&header + offsetof(struct vk_ir_header, sync_data)); radv_update_memory_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, sync_data), update_data, - sizeof(struct vk_ir_header) - offsetof(struct vk_ir_header, sync_data)); + header_update_size); if 
(radv_device_physical(device)->info.cp_sdma_ge_use_system_memory_scope) cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2; @@ -467,6 +507,118 @@ radv_encode_as_gfx12(VkCommandBuffer commandBuffer, const struct vk_acceleration radv_compute_dispatch(cmd_buffer, &dispatch); } +static VkResult +radv_encode_triangles_bind_pipeline_gfx12(VkCommandBuffer commandBuffer, + const struct vk_acceleration_structure_build_state *state) +{ + bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + if (!compress_triangles) + return VK_SUCCESS; + + /* Wait for internal encoding to finish. */ + vk_barrier_compute_w_to_compute_r(commandBuffer); + + radv_bvh_build_bind_pipeline(commandBuffer, RADV_META_OBJECT_KEY_BVH_ENCODE_TRIANGLES_GFX12, + encode_triangles_gfx12_spv, sizeof(encode_triangles_gfx12_spv), + sizeof(struct encode_triangles_gfx12_args), 0); + + return VK_SUCCESS; +} + +static void +radv_encode_triangles_gfx12(VkCommandBuffer commandBuffer, const struct vk_acceleration_structure_build_state *state) +{ + bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + if (!compress_triangles) + return; + + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, dst, state->build_info->dstAccelerationStructure); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + uint64_t intermediate_header_addr = state->build_info->scratchData.deviceAddress + state->scratch.header_offset; + uint64_t intermediate_bvh_addr = state->build_info->scratchData.deviceAddress + state->scratch.ir_offset; + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, state, &layout); + + const struct encode_triangles_gfx12_args args = { + .intermediate_bvh = intermediate_bvh_addr, + .output_base = vk_acceleration_structure_get_va(dst), + .header = intermediate_header_addr, + .output_bvh_offset = layout.bvh_offset, + .leaf_node_offsets_offset = layout.leaf_node_offsets_offset, + .batches_size = radv_get_triangle_batches_size(state), + }; + radv_bvh_build_set_args(commandBuffer, &args, sizeof(args)); + + struct radv_dispatch_info dispatch = { + .unaligned = true, + .indirect_va = intermediate_header_addr + + offsetof(struct vk_ir_header, driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_INVOCATIONS_X]), + }; + + radv_compute_dispatch(cmd_buffer, &dispatch); +} + +static VkResult +radv_encode_triangles_retry_bind_pipeline_gfx12(VkCommandBuffer commandBuffer, + const struct vk_acceleration_structure_build_state *state) +{ + bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + if (!compress_triangles) + return VK_SUCCESS; + + /* Wait for the first triangle compression pass to finish. 
*/ + vk_barrier_compute_w_to_compute_r(commandBuffer); + vk_barrier_compute_w_to_indirect_compute_r(commandBuffer); + + radv_bvh_build_bind_pipeline(commandBuffer, RADV_META_OBJECT_KEY_BVH_ENCODE_TRIANGLES_GFX12, + encode_triangles_gfx12_spv, sizeof(encode_triangles_gfx12_spv), + sizeof(struct encode_triangles_gfx12_args), + RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY); + + return VK_SUCCESS; +} + +static void +radv_encode_triangles_retry_gfx12(VkCommandBuffer commandBuffer, + const struct vk_acceleration_structure_build_state *state) +{ + bool compress_triangles = state->config.encode_key[2] & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12; + if (!compress_triangles) + return; + + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, dst, state->build_info->dstAccelerationStructure); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + uint64_t intermediate_header_addr = state->build_info->scratchData.deviceAddress + state->scratch.header_offset; + uint64_t intermediate_bvh_addr = state->build_info->scratchData.deviceAddress + state->scratch.ir_offset; + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, state, &layout); + + const struct encode_triangles_gfx12_args args = { + .intermediate_bvh = intermediate_bvh_addr, + .output_base = vk_acceleration_structure_get_va(dst), + .header = intermediate_header_addr, + .output_bvh_offset = layout.bvh_offset, + .leaf_node_offsets_offset = layout.leaf_node_offsets_offset, + .batches_size = radv_get_triangle_batches_size(state), + }; + radv_bvh_build_set_args(commandBuffer, &args, sizeof(args)); + + struct radv_dispatch_info dispatch = { + .unaligned = true, + .indirect_va = + intermediate_header_addr + + offsetof(struct vk_ir_header, driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X]), + }; + + radv_compute_dispatch(cmd_buffer, &dispatch); +} + static VkResult radv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, const struct vk_acceleration_structure_build_state *state) { @@ -806,20 +958,29 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) .get_build_config = radv_get_build_config, .get_as_size = radv_get_as_size, .get_update_scratch_size = radv_get_update_scratch_size, - .encode_bind_pipeline[1] = radv_init_header_bind_pipeline, - .encode_as[1] = radv_init_header, .init_update_scratch = radv_init_update_scratch, .update_bind_pipeline[0] = radv_update_bind_pipeline, }; if (radv_use_bvh8(pdev)) { device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as_gfx12; + device->meta_state.accel_struct_build.build_ops.get_encode_scratch_size = radv_get_encode_scratch_size; device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline_gfx12; device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[1] = + radv_encode_triangles_bind_pipeline_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_as[1] = radv_encode_triangles_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[2] = + radv_encode_triangles_retry_bind_pipeline_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_as[2] = radv_encode_triangles_retry_gfx12; + device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[3] = radv_init_header_bind_pipeline; + device->meta_state.accel_struct_build.build_ops.encode_as[3] = radv_init_header; } 
else {
      device->meta_state.accel_struct_build.build_ops.update_as[0] = radv_update_as;
      device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[0] = radv_encode_bind_pipeline;
      device->meta_state.accel_struct_build.build_ops.encode_as[0] = radv_encode_as;
+     device->meta_state.accel_struct_build.build_ops.encode_bind_pipeline[1] = radv_init_header_bind_pipeline;
+     device->meta_state.accel_struct_build.build_ops.encode_as[1] = radv_init_header;
      device->meta_state.accel_struct_build.build_ops.leaf_spirv_override = leaf_spv;
      device->meta_state.accel_struct_build.build_ops.leaf_spirv_override_size = sizeof(leaf_spv);
   }
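
Reviewer note, not part of the patch: encode_gfx12.comp packs each radv_triangle_encode_task entry as the child/pair index within the parent box node in the high 4 bits and the IR triangle leaf index in the low 28 bits, and a partially filled task is terminated with RADV_BVH_INVALID_NODE, which encode_triangles_gfx12.comp detects via the findLSB(radv_ballot(...)) at the top of main(). Below is a minimal C sketch of that decoding, assuming RADV_BVH_INVALID_NODE is the all-ones sentinel from bvh.h; decode_task_entry and task_entry are hypothetical names used only for illustration.

   /* Illustrative sketch only: how one packed pair_index_node_index entry is interpreted. */
   #include <stdbool.h>
   #include <stdint.h>

   #define RADV_BVH_INVALID_NODE 0xffffffffu /* assumed all-ones sentinel, as in bvh.h */

   struct task_entry {
      bool valid;           /* false once the terminator entry is reached */
      uint32_t child_index; /* high 4 bits: child/pair index within the parent box node */
      uint32_t leaf_index;  /* low 28 bits: index of the vk_ir_triangle_node in the intermediate BVH */
   };

   static struct task_entry
   decode_task_entry(uint32_t packed)
   {
      struct task_entry e;
      e.valid = packed != RADV_BVH_INVALID_NODE;
      e.child_index = packed >> 28;
      e.leaf_index = packed & 0x0fffffff;
      return e;
   }

The producer stores ir_id_to_offset(child) / ir_leaf_node_size in the low bits, so the consumer can index the IR triangle array directly with leaf_index, as the INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index0) lookup in the shader does.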