radv/bvh: Adjust sah cost based on depth

Adds a cost field to radv_ir_node and uses it to model the cost of tree
depth. This improves framerates by 2% if my benchmarking is correct.

Reviewed-by: Adam Jackson <ajax@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19756>
This commit is contained in:
Konstantin Seurer 2022-11-15 18:36:09 +01:00 committed by Marge Bot
parent 2ba55ec504
commit 6f45c98b58
3 changed files with 38 additions and 5 deletions

View file

@ -98,6 +98,8 @@ struct radv_accel_struct_header {
/* Base data of an intermediate-representation (IR) BVH node. The leaf and internal node
 * types visible in the builders access it through a `base` member.
 */
struct radv_ir_node {
/* Bounding box of the node; for internal nodes this is the union of the child bounds. */
radv_aabb aabb;
/* Generic normalized cost of not merging this node.
 * Leaf builders initialize it to 0.0; an internal node stores half the sum of its
 * children's costs plus a constant per-level cost, so the value grows with subtree depth.
 */
float cost;
};
#define FINAL_TREE_PRESENT 0

View file

@ -208,6 +208,7 @@ build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint3
DEREF(node).instance_id = global_id;
DEREF(node).base.aabb = bounds;
DEREF(node).base.cost = 0.0;
return true;
}
@ -268,6 +269,7 @@ main(void)
}
DEREF(node).base.aabb = bounds;
DEREF(node).base.cost = 0.0;
DEREF(node).triangle_id = global_id;
DEREF(node).geometry_id_and_flags = args.geometry_id;
@ -289,6 +291,7 @@ main(void)
}
DEREF(node).base.aabb = bounds;
DEREF(node).base.cost = 0.0;
DEREF(node).primitive_id = global_id;
DEREF(node).geometry_id_and_flags = args.geometry_id;
} else {

View file

@ -101,6 +101,9 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t
return aggregate_sums[gl_SubgroupID] + subgroupBallotExclusiveBitCount(ballot);
}
/* Relative cost of increasing the BVH depth. Deep BVHs will require more backtracking. */
#define BVH_LEVEL_COST 0.2
uint32_t
push_node(uint32_t children[2])
{
@ -113,6 +116,8 @@ push_node(uint32_t children[2])
total_bounds.min = vec3(INFINITY);
total_bounds.max = vec3(-INFINITY);
float cost = 0.0;
for (uint i = 0; i < 2; ++i) {
if (children[i] != RADV_BVH_INVALID_NODE) {
VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i]));
@ -121,12 +126,15 @@ push_node(uint32_t children[2])
total_bounds.min = min(total_bounds.min, bounds.min);
total_bounds.max = max(total_bounds.max, bounds.max);
cost += DEREF(child).cost;
}
DEREF(dst_node).children[i] = children[i];
}
DEREF(dst_node).base.aabb = total_bounds;
DEREF(dst_node).base.cost = cost * 0.5 + BVH_LEVEL_COST;
DEREF(dst_node).in_final_tree = FINAL_TREE_UNKNOWN;
return dst_id;
}
@ -152,6 +160,7 @@ decode_neighbour_offset(uint32_t encoded_offset)
#define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD
shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
shared float shared_costs[NUM_PLOC_LDS_ITEMS];
shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS];
uint32_t
@ -177,18 +186,37 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base,
REF(radv_ir_node) node = REF(radv_ir_node)(addr);
shared_bounds[i - lds_base] = DEREF(node).aabb;
shared_costs[i - lds_base] = DEREF(node).cost;
}
}
float
combined_node_area(uint32_t lds_base, uint32_t i, uint32_t j)
combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
{
radv_aabb combined_bounds;
combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min);
combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max);
vec3 size = combined_bounds.max - combined_bounds.min;
float area = aabb_surface_area(combined_bounds);
return size.x * size.y + size.y * size.z + size.z * size.x;
/* p_i and p_j are the probabilities that node i or node j is hit by a ray:
* Assuming that the current node is hit (p = 1) and the probability of hitting a node
* is proportional to its surface area, p = area * c with p = 1 for the current node.
* -> c = 1 / area
*
* We can use those probabilities to limit the impact of child cost to be proportional to
* its hit probability. (Child cost is the cost of not merging a node which increases with
* tree depth for internal nodes)
*
* Dividing area by both relative costs will make it more likely that we merge nodes with
* a high child cost.
*/
float p_i = aabb_surface_area(shared_bounds[i - lds_base]) / area;
float p_j = aabb_surface_area(shared_bounds[j - lds_base]) / area;
float combined_cost =
(1.0 + shared_costs[i - lds_base] * p_i) * (1.0 + shared_costs[j - lds_base] * p_j);
return area / combined_cost;
}
shared uint32_t shared_aggregate_sum;
@ -240,7 +268,7 @@ main(void)
for (uint32_t j = max(i + 1, base_index - neigbourhood_overlap); j <= i + right_bound;
++j) {
float sah = combined_node_area(lds_base, i, j);
float sah = combined_node_cost(lds_base, i, j);
uint32_t i_encoded_offset = encode_neighbour_offset(sah, i, j);
uint32_t j_encoded_offset = encode_neighbour_offset(sah, j, i);
min_offset = min(min_offset, i_encoded_offset);
@ -276,7 +304,7 @@ main(void)
if (preferred_pair != i) {
uint32_t encoded_min_sah =
nearest_neighbour_indices[i - lds_base] & (~PLOC_OFFSET_MASK);
float sah = combined_node_area(lds_base, i, preferred_pair);
float sah = combined_node_cost(lds_base, i, preferred_pair);
uint32_t encoded_sah = floatBitsToUint(sah) & (~PLOC_OFFSET_MASK);
uint32_t encoded_offset = encode_neighbour_offset(sah, i, preferred_pair);
if (encoded_sah <= encoded_min_sah) {