radv/bvh: Adjust sah cost based on depth

Adds a cost field to radv_ir_node and uses it to model the cost of tree depth. This improves framerates by 2% if my benchmarking is correct.

Reviewed-by: Adam Jackson <ajax@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19756>
2025-12-24 02:20:11 +01:00 · 2022-11-15 18:36:09 +01:00 · 2022-11-15 18:36:09 +01:00 · 6f45c98b58
commit 6f45c98b58
parent 2ba55ec504
3 changed files with 38 additions and 5 deletions
--- a/src/amd/vulkan/bvh/bvh.h
+++ b/src/amd/vulkan/bvh/bvh.h
@ -98,6 +98,8 @@ struct radv_accel_struct_header {

 struct radv_ir_node {
   radv_aabb aabb;
+   /* Generic normalized cost of not merging this node. */
+   float cost;
 };

 #define FINAL_TREE_PRESENT 0
--- a/src/amd/vulkan/bvh/leaf.comp
+++ b/src/amd/vulkan/bvh/leaf.comp
@ -208,6 +208,7 @@ build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint3
   DEREF(node).instance_id = global_id;

   DEREF(node).base.aabb = bounds;
+   DEREF(node).base.cost = 0.0;

   return true;
 }
@ -268,6 +269,7 @@ main(void)
         }

      DEREF(node).base.aabb = bounds;
+      DEREF(node).base.cost = 0.0;

      DEREF(node).triangle_id = global_id;
      DEREF(node).geometry_id_and_flags = args.geometry_id;
@ -289,6 +291,7 @@ main(void)
         }

      DEREF(node).base.aabb = bounds;
+      DEREF(node).base.cost = 0.0;
      DEREF(node).primitive_id = global_id;
      DEREF(node).geometry_id_and_flags = args.geometry_id;
   } else {
--- a/src/amd/vulkan/bvh/ploc_internal.comp
+++ b/src/amd/vulkan/bvh/ploc_internal.comp
@ -101,6 +101,9 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t
   return aggregate_sums[gl_SubgroupID] + subgroupBallotExclusiveBitCount(ballot);
 }

+/* Relative cost of increasing the BVH depth. Deep BVHs will require more backtracking. */
+#define BVH_LEVEL_COST 0.2
+
 uint32_t
 push_node(uint32_t children[2])
 {
@ -113,6 +116,8 @@ push_node(uint32_t children[2])
   total_bounds.min = vec3(INFINITY);
   total_bounds.max = vec3(-INFINITY);

+   float cost = 0.0;
+
   for (uint i = 0; i < 2; ++i) {
      if (children[i] != RADV_BVH_INVALID_NODE) {
         VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i]));
@ -121,12 +126,15 @@ push_node(uint32_t children[2])

         total_bounds.min = min(total_bounds.min, bounds.min);
         total_bounds.max = max(total_bounds.max, bounds.max);
+
+         cost += DEREF(child).cost;
      }

      DEREF(dst_node).children[i] = children[i];
   }

   DEREF(dst_node).base.aabb = total_bounds;
+   DEREF(dst_node).base.cost = cost * 0.5 + BVH_LEVEL_COST;
   DEREF(dst_node).in_final_tree = FINAL_TREE_UNKNOWN;
   return dst_id;
 }
@ -152,6 +160,7 @@ decode_neighbour_offset(uint32_t encoded_offset)
 #define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD

 shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
+shared float shared_costs[NUM_PLOC_LDS_ITEMS];
 shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS];

 uint32_t
@ -177,18 +186,37 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base,
      REF(radv_ir_node) node = REF(radv_ir_node)(addr);

      shared_bounds[i - lds_base] = DEREF(node).aabb;
+      shared_costs[i - lds_base] = DEREF(node).cost;
   }
 }

 float
-combined_node_area(uint32_t lds_base, uint32_t i, uint32_t j)
+combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
 {
   radv_aabb combined_bounds;
   combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min);
   combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max);
-   vec3 size = combined_bounds.max - combined_bounds.min;
+   float area = aabb_surface_area(combined_bounds);

-   return size.x * size.y + size.y * size.z + size.z * size.x;
+   /* p_i and p_j are the probabilities that i or j are hit by a ray:
+    *    Assuming that the current node is hit (p = 1) and the probability of hitting a node
+    *    is proportional to its surface area, p = area * c with p = 1 for the current node.
+    *    -> c = 1 / area
+    *
+    * We can use those probabilities to limit the impact of child cost to be proportional to
+    * its hit probability. (Child cost is the cost of not merging a node which increases with
+    * tree depth for internal nodes)
+    *
+    * Dividing area by both relative costs will make it more likely that we merge nodes with
+    * a high child cost.
+    */
+   float p_i = aabb_surface_area(shared_bounds[i - lds_base]) / area;
+   float p_j = aabb_surface_area(shared_bounds[j - lds_base]) / area;
+
+   float combined_cost =
+      (1.0 + shared_costs[i - lds_base] * p_i) * (1.0 + shared_costs[j - lds_base] * p_j);
+
+   return area / combined_cost;
 }

 shared uint32_t shared_aggregate_sum;
@ -240,7 +268,7 @@ main(void)
            for (uint32_t j = max(i + 1, base_index - neigbourhood_overlap); j <= i + right_bound;
                 ++j) {

-               float sah = combined_node_area(lds_base, i, j);
+               float sah = combined_node_cost(lds_base, i, j);
               uint32_t i_encoded_offset = encode_neighbour_offset(sah, i, j);
               uint32_t j_encoded_offset = encode_neighbour_offset(sah, j, i);
               min_offset = min(min_offset, i_encoded_offset);
@ -276,7 +304,7 @@ main(void)
            if (preferred_pair != i) {
               uint32_t encoded_min_sah =
                  nearest_neighbour_indices[i - lds_base] & (~PLOC_OFFSET_MASK);
-               float sah = combined_node_area(lds_base, i, preferred_pair);
+               float sah = combined_node_cost(lds_base, i, preferred_pair);
               uint32_t encoded_sah = floatBitsToUint(sah) & (~PLOC_OFFSET_MASK);
               uint32_t encoded_offset = encode_neighbour_offset(sah, i, preferred_pair);
               if (encoded_sah <= encoded_min_sah) {