diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h index 8cd90ea2840..e2be4fc4651 100644 --- a/src/amd/vulkan/bvh/bvh.h +++ b/src/amd/vulkan/bvh/bvh.h @@ -98,6 +98,8 @@ struct radv_accel_struct_header { struct radv_ir_node { radv_aabb aabb; + /* Generic normalized cost of not merging this node. */ + float cost; }; #define FINAL_TREE_PRESENT 0 diff --git a/src/amd/vulkan/bvh/leaf.comp b/src/amd/vulkan/bvh/leaf.comp index a338969dc5a..959244b8f74 100644 --- a/src/amd/vulkan/bvh/leaf.comp +++ b/src/amd/vulkan/bvh/leaf.comp @@ -208,6 +208,7 @@ build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint3 DEREF(node).instance_id = global_id; DEREF(node).base.aabb = bounds; + DEREF(node).base.cost = 0.0; return true; } @@ -268,6 +269,7 @@ main(void) } DEREF(node).base.aabb = bounds; + DEREF(node).base.cost = 0.0; DEREF(node).triangle_id = global_id; DEREF(node).geometry_id_and_flags = args.geometry_id; @@ -289,6 +291,7 @@ main(void) } DEREF(node).base.aabb = bounds; + DEREF(node).base.cost = 0.0; DEREF(node).primitive_id = global_id; DEREF(node).geometry_id_and_flags = args.geometry_id; } else { diff --git a/src/amd/vulkan/bvh/ploc_internal.comp b/src/amd/vulkan/bvh/ploc_internal.comp index e6189e768e2..7c06bf07426 100644 --- a/src/amd/vulkan/bvh/ploc_internal.comp +++ b/src/amd/vulkan/bvh/ploc_internal.comp @@ -101,6 +101,9 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t return aggregate_sums[gl_SubgroupID] + subgroupBallotExclusiveBitCount(ballot); } +/* Relative cost of increasing the BVH depth. Deep BVHs will require more backtracking. 
*/ +#define BVH_LEVEL_COST 0.2 + uint32_t push_node(uint32_t children[2]) { @@ -113,6 +116,8 @@ push_node(uint32_t children[2]) total_bounds.min = vec3(INFINITY); total_bounds.max = vec3(-INFINITY); + float cost = 0.0; + for (uint i = 0; i < 2; ++i) { if (children[i] != RADV_BVH_INVALID_NODE) { VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i])); @@ -121,12 +126,15 @@ push_node(uint32_t children[2]) total_bounds.min = min(total_bounds.min, bounds.min); total_bounds.max = max(total_bounds.max, bounds.max); + + cost += DEREF(child).cost; } DEREF(dst_node).children[i] = children[i]; } DEREF(dst_node).base.aabb = total_bounds; + DEREF(dst_node).base.cost = cost * 0.5 + BVH_LEVEL_COST; DEREF(dst_node).in_final_tree = FINAL_TREE_UNKNOWN; return dst_id; } @@ -152,6 +160,7 @@ decode_neighbour_offset(uint32_t encoded_offset) #define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS]; +shared float shared_costs[NUM_PLOC_LDS_ITEMS]; shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS]; uint32_t @@ -177,18 +186,37 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base, REF(radv_ir_node) node = REF(radv_ir_node)(addr); shared_bounds[i - lds_base] = DEREF(node).aabb; + shared_costs[i - lds_base] = DEREF(node).cost; } } float -combined_node_area(uint32_t lds_base, uint32_t i, uint32_t j) +combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j) { radv_aabb combined_bounds; combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min); combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max); - vec3 size = combined_bounds.max - combined_bounds.min; + float area = aabb_surface_area(combined_bounds); - return size.x * size.y + size.y * size.z + size.z * size.x; + /* p_a and p_b are the probabilities that i or j are hit by a ray: + * Assuming that the current node is hit (p = 1) and the probability of 
hitting a node + * is proportional to its surface area, p = area * c with p = 1 for the current node. + * -> c = 1 / area + * + * We can use those probabilities to limit the impact of child cost to be proportional to + * its hit probability. (Child cost is the cost of not merging a node which increases with + * tree depth for internal nodes) + * + * Dividing area by both relative costs will make it more likely that we merge nodes with + * a high child cost. + */ + float p_i = aabb_surface_area(shared_bounds[i - lds_base]) / area; + float p_j = aabb_surface_area(shared_bounds[j - lds_base]) / area; + + float combined_cost = + (1.0 + shared_costs[i - lds_base] * p_i) * (1.0 + shared_costs[j - lds_base] * p_j); + + return area / combined_cost; } shared uint32_t shared_aggregate_sum; @@ -240,7 +268,7 @@ main(void) for (uint32_t j = max(i + 1, base_index - neigbourhood_overlap); j <= i + right_bound; ++j) { - float sah = combined_node_area(lds_base, i, j); + float sah = combined_node_cost(lds_base, i, j); uint32_t i_encoded_offset = encode_neighbour_offset(sah, i, j); uint32_t j_encoded_offset = encode_neighbour_offset(sah, j, i); min_offset = min(min_offset, i_encoded_offset); @@ -276,7 +304,7 @@ main(void) if (preferred_pair != i) { uint32_t encoded_min_sah = nearest_neighbour_indices[i - lds_base] & (~PLOC_OFFSET_MASK); - float sah = combined_node_cost(lds_base, i, preferred_pair); + float sah = combined_node_cost(lds_base, i, preferred_pair); uint32_t encoded_sah = floatBitsToUint(sah) & (~PLOC_OFFSET_MASK); uint32_t encoded_offset = encode_neighbour_offset(sah, i, preferred_pair); if (encoded_sah <= encoded_min_sah) {