diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h index 8cd90ea2840..e2be4fc4651 100644 --- a/src/amd/vulkan/bvh/bvh.h +++ b/src/amd/vulkan/bvh/bvh.h @@ -98,6 +98,8 @@ struct radv_accel_struct_header { struct radv_ir_node { radv_aabb aabb; + /* Generic normalized cost of not merging this node. */ + float cost; }; #define FINAL_TREE_PRESENT 0 diff --git a/src/amd/vulkan/bvh/leaf.comp b/src/amd/vulkan/bvh/leaf.comp index a338969dc5a..959244b8f74 100644 --- a/src/amd/vulkan/bvh/leaf.comp +++ b/src/amd/vulkan/bvh/leaf.comp @@ -208,6 +208,7 @@ build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint3 DEREF(node).instance_id = global_id; DEREF(node).base.aabb = bounds; + DEREF(node).base.cost = 0.0; return true; } @@ -268,6 +269,7 @@ main(void) } DEREF(node).base.aabb = bounds; + DEREF(node).base.cost = 0.0; DEREF(node).triangle_id = global_id; DEREF(node).geometry_id_and_flags = args.geometry_id; @@ -289,6 +291,7 @@ main(void) } DEREF(node).base.aabb = bounds; + DEREF(node).base.cost = 0.0; DEREF(node).primitive_id = global_id; DEREF(node).geometry_id_and_flags = args.geometry_id; } else { diff --git a/src/amd/vulkan/bvh/ploc_internal.comp b/src/amd/vulkan/bvh/ploc_internal.comp index e6189e768e2..7c06bf07426 100644 --- a/src/amd/vulkan/bvh/ploc_internal.comp +++ b/src/amd/vulkan/bvh/ploc_internal.comp @@ -101,6 +101,9 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t return aggregate_sums[gl_SubgroupID] + subgroupBallotExclusiveBitCount(ballot); } +/* Relative cost of increasing the BVH depth. Deep BVHs will require more backtracking. 
*/ +#define BVH_LEVEL_COST 0.2 + uint32_t push_node(uint32_t children[2]) { @@ -113,6 +116,8 @@ push_node(uint32_t children[2]) total_bounds.min = vec3(INFINITY); total_bounds.max = vec3(-INFINITY); + float cost = 0.0; + for (uint i = 0; i < 2; ++i) { if (children[i] != RADV_BVH_INVALID_NODE) { VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i])); @@ -121,12 +126,15 @@ push_node(uint32_t children[2]) total_bounds.min = min(total_bounds.min, bounds.min); total_bounds.max = max(total_bounds.max, bounds.max); + + cost += DEREF(child).cost; } DEREF(dst_node).children[i] = children[i]; } DEREF(dst_node).base.aabb = total_bounds; + DEREF(dst_node).base.cost = cost * 0.5 + BVH_LEVEL_COST; DEREF(dst_node).in_final_tree = FINAL_TREE_UNKNOWN; return dst_id; } @@ -152,6 +160,7 @@ decode_neighbour_offset(uint32_t encoded_offset) #define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS]; +shared float shared_costs[NUM_PLOC_LDS_ITEMS]; shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS]; uint32_t @@ -177,18 +186,37 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base, REF(radv_ir_node) node = REF(radv_ir_node)(addr); shared_bounds[i - lds_base] = DEREF(node).aabb; + shared_costs[i - lds_base] = DEREF(node).cost; } } float -combined_node_area(uint32_t lds_base, uint32_t i, uint32_t j) +combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j) { radv_aabb combined_bounds; combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min); combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max); - vec3 size = combined_bounds.max - combined_bounds.min; + float area = aabb_surface_area(combined_bounds); - return size.x * size.y + size.y * size.z + size.z * size.x; + /* p_a and p_b are the probabilities that i or j are hit by a ray: + * Assuming that the current node is hit (p = 1) and the probability of 
hitting a node + * is proportional to its surface area, p = area * c with p = 1 for the current node. + * -> c = 1 / area + * + * We can use those probabilities to limit the impact of child cost to be proportional to + * its hit probability. (Child cost is the cost of not merging a node which increases with + * tree depth for internal nodes) + * + * Dividing area by both relative costs will make it more likely that we merge nodes with + * a high child cost. + */ + float p_i = aabb_surface_area(shared_bounds[i - lds_base]) / area; + float p_j = aabb_surface_area(shared_bounds[j - lds_base]) / area; + + float combined_cost = + (1.0 + shared_costs[i - lds_base] * p_i) * (1.0 + shared_costs[j - lds_base] * p_j); + + return area / combined_cost; } shared uint32_t shared_aggregate_sum; @@ -240,7 +268,7 @@ main(void) for (uint32_t j = max(i + 1, base_index - neigbourhood_overlap); j <= i + right_bound; ++j) { - float sah = combined_node_area(lds_base, i, j); + float sah = combined_node_cost(lds_base, i, j); uint32_t i_encoded_offset = encode_neighbour_offset(sah, i, j); uint32_t j_encoded_offset = encode_neighbour_offset(sah, j, i); min_offset = min(min_offset, i_encoded_offset); @@ -276,7 +304,7 @@ main(void) if (preferred_pair != i) { uint32_t encoded_min_sah = nearest_neighbour_indices[i - lds_base] & (~PLOC_OFFSET_MASK); - float sah = combined_node_cost(lds_base, i, preferred_pair); + float sah = combined_node_cost(lds_base, i, preferred_pair); uint32_t encoded_sah = floatBitsToUint(sah) & (~PLOC_OFFSET_MASK); uint32_t encoded_offset = encode_neighbour_offset(sah, i, preferred_pair); if (encoded_sah <= encoded_min_sah) {