mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-20 19:58:19 +02:00
Only files under src/amd/vulkan/** are concerned. Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28599>
395 lines
16 KiB
Text
395 lines
16 KiB
Text
/*
|
|
* Copyright © 2022 Bas Nieuwenhuizen
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#version 460
|
|
|
|
#extension GL_GOOGLE_include_directive : require
|
|
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
|
#extension GL_EXT_scalar_block_layout : require
|
|
#extension GL_EXT_buffer_reference : require
|
|
#extension GL_EXT_buffer_reference2 : require
|
|
#extension GL_KHR_memory_scope_semantics : require
|
|
#extension GL_KHR_shader_subgroup_vote : require
|
|
#extension GL_KHR_shader_subgroup_arithmetic : require
|
|
#extension GL_KHR_shader_subgroup_ballot : require
|
|
|
|
layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
#define USE_GLOBAL_SYNC
|
|
#include "build_interface.h"
|
|
|
|
TYPE(ploc_prefix_scan_partition, 4);
|
|
|
|
layout(push_constant) uniform CONSTS
|
|
{
|
|
ploc_args args;
|
|
};
|
|
|
|
shared uint32_t exclusive_prefix_sum;
|
|
shared uint32_t aggregate_sums[PLOC_WORKGROUP_SIZE / 64];
|
|
|
|
/*
|
|
* Global prefix scan over all workgroups to find out the index of the collapsed node to write.
|
|
* See https://research.nvidia.com/sites/default/files/publications/nvr-2016-002.pdf
|
|
* One partition = one workgroup in this case.
|
|
*/
|
|
uint32_t
|
|
prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t task_index)
|
|
{
|
|
if (gl_LocalInvocationIndex == 0) {
|
|
exclusive_prefix_sum = 0;
|
|
if (task_index >= gl_WorkGroupSize.x) {
|
|
REF(ploc_prefix_scan_partition) current_partition =
|
|
REF(ploc_prefix_scan_partition)(INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x));
|
|
|
|
REF(ploc_prefix_scan_partition) previous_partition = current_partition - 1;
|
|
|
|
while (true) {
|
|
/* See if this previous workgroup already set their inclusive sum */
|
|
if (atomicLoad(DEREF(previous_partition).inclusive_sum, gl_ScopeDevice,
|
|
gl_StorageSemanticsBuffer,
|
|
gl_SemanticsAcquire | gl_SemanticsMakeVisible) != 0xFFFFFFFF) {
|
|
atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).inclusive_sum);
|
|
break;
|
|
} else {
|
|
atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).aggregate);
|
|
previous_partition -= 1;
|
|
}
|
|
}
|
|
/* Set the inclusive sum for the next workgroups */
|
|
atomicStore(DEREF(current_partition).inclusive_sum,
|
|
DEREF(current_partition).aggregate + exclusive_prefix_sum, gl_ScopeDevice,
|
|
gl_StorageSemanticsBuffer, gl_SemanticsRelease | gl_SemanticsMakeAvailable);
|
|
}
|
|
}
|
|
|
|
if (subgroupElect())
|
|
aggregate_sums[gl_SubgroupID] = subgroupBallotBitCount(ballot);
|
|
barrier();
|
|
|
|
if (gl_LocalInvocationID.x < PLOC_WORKGROUP_SIZE / 64) {
|
|
aggregate_sums[gl_LocalInvocationID.x] =
|
|
exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]);
|
|
}
|
|
barrier();
|
|
|
|
return aggregate_sums[gl_SubgroupID] + subgroupBallotExclusiveBitCount(ballot);
|
|
}
|
|
|
|
/* Relative cost of increasing the BVH depth. Deep BVHs will require more backtracking. */
|
|
#define BVH_LEVEL_COST 0.2
|
|
|
|
uint32_t
|
|
push_node(uint32_t children[2], radv_aabb bounds[2])
|
|
{
|
|
uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1);
|
|
uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node);
|
|
uint32_t dst_id = pack_ir_node_id(dst_offset, radv_ir_node_internal);
|
|
REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset));
|
|
|
|
radv_aabb total_bounds;
|
|
total_bounds.min = vec3(INFINITY);
|
|
total_bounds.max = vec3(-INFINITY);
|
|
|
|
for (uint i = 0; i < 2; ++i) {
|
|
VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i]));
|
|
REF(radv_ir_node) child = REF(radv_ir_node)(node);
|
|
|
|
total_bounds.min = min(total_bounds.min, bounds[i].min);
|
|
total_bounds.max = max(total_bounds.max, bounds[i].max);
|
|
|
|
DEREF(dst_node).children[i] = children[i];
|
|
}
|
|
|
|
DEREF(dst_node).base.aabb = total_bounds;
|
|
DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET;
|
|
return dst_id;
|
|
}
|
|
|
|
#define PLOC_NEIGHBOURHOOD 16
|
|
#define PLOC_OFFSET_MASK ((1 << 5) - 1)
|
|
|
|
uint32_t
|
|
encode_neighbour_offset(float sah, uint32_t i, uint32_t j)
|
|
{
|
|
int32_t offset = int32_t(j) - int32_t(i);
|
|
uint32_t encoded_offset = offset + PLOC_NEIGHBOURHOOD - (offset > 0 ? 1 : 0);
|
|
return (floatBitsToUint(sah) & (~PLOC_OFFSET_MASK)) | encoded_offset;
|
|
}
|
|
|
|
int32_t
|
|
decode_neighbour_offset(uint32_t encoded_offset)
|
|
{
|
|
int32_t offset = int32_t(encoded_offset & PLOC_OFFSET_MASK) - PLOC_NEIGHBOURHOOD;
|
|
return offset + (offset >= 0 ? 1 : 0);
|
|
}
|
|
|
|
#define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD
|
|
|
|
shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
|
|
shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS];
|
|
|
|
uint32_t
|
|
load_id(VOID_REF ids, uint32_t iter, uint32_t index)
|
|
{
|
|
if (iter == 0)
|
|
return DEREF(REF(key_id_pair)(INDEX(key_id_pair, ids, index))).id;
|
|
else
|
|
return DEREF(REF(uint32_t)(INDEX(uint32_t, ids, index)));
|
|
}
|
|
|
|
void
|
|
load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base,
|
|
uint32_t neighbourhood_overlap, uint32_t search_bound)
|
|
{
|
|
for (uint32_t i = task_index - 2 * neighbourhood_overlap; i < search_bound;
|
|
i += gl_WorkGroupSize.x) {
|
|
uint32_t id = load_id(ids, iter, i);
|
|
if (id == RADV_BVH_INVALID_NODE)
|
|
continue;
|
|
|
|
VOID_REF addr = OFFSET(args.bvh, ir_id_to_offset(id));
|
|
REF(radv_ir_node) node = REF(radv_ir_node)(addr);
|
|
|
|
shared_bounds[i - lds_base] = DEREF(node).aabb;
|
|
}
|
|
}
|
|
|
|
float
|
|
combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
|
|
{
|
|
radv_aabb combined_bounds;
|
|
combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min);
|
|
combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max);
|
|
return aabb_surface_area(combined_bounds);
|
|
}
|
|
|
|
shared uint32_t shared_aggregate_sum;
|
|
|
|
void
|
|
main(void)
|
|
{
|
|
VOID_REF src_ids = args.ids_0;
|
|
VOID_REF dst_ids = args.ids_1;
|
|
|
|
/* We try to use LBVH for BVHs where we know there will be less than 5 leaves,
|
|
* but sometimes all leaves might be inactive */
|
|
if (DEREF(args.header).active_leaf_count <= 2) {
|
|
if (gl_GlobalInvocationID.x == 0) {
|
|
uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1);
|
|
uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node);
|
|
REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset));
|
|
|
|
radv_aabb total_bounds;
|
|
total_bounds.min = vec3(INFINITY);
|
|
total_bounds.max = vec3(-INFINITY);
|
|
|
|
uint32_t i = 0;
|
|
for (; i < DEREF(args.header).active_leaf_count; i++) {
|
|
uint32_t child_id = DEREF(INDEX(key_id_pair, src_ids, i)).id;
|
|
|
|
if (child_id != RADV_BVH_INVALID_NODE) {
|
|
VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(child_id));
|
|
REF(radv_ir_node) child = REF(radv_ir_node)(node);
|
|
radv_aabb bounds = DEREF(child).aabb;
|
|
|
|
total_bounds.min = min(total_bounds.min, bounds.min);
|
|
total_bounds.max = max(total_bounds.max, bounds.max);
|
|
}
|
|
|
|
DEREF(dst_node).children[i] = child_id;
|
|
}
|
|
for (; i < 2; i++)
|
|
DEREF(dst_node).children[i] = RADV_BVH_INVALID_NODE;
|
|
|
|
DEREF(dst_node).base.aabb = total_bounds;
|
|
DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET;
|
|
}
|
|
return;
|
|
}
|
|
|
|
atomicCompSwap(DEREF(args.header).sync_data.task_counts[0], 0xFFFFFFFF,
|
|
DEREF(args.header).active_leaf_count);
|
|
atomicCompSwap(DEREF(args.header).sync_data.current_phase_end_counter, 0xFFFFFFFF,
|
|
DIV_ROUND_UP(DEREF(args.header).active_leaf_count, gl_WorkGroupSize.x));
|
|
REF(ploc_prefix_scan_partition)
|
|
partitions = REF(ploc_prefix_scan_partition)(args.prefix_scan_partitions);
|
|
|
|
uint32_t task_index = fetch_task(args.header, false);
|
|
|
|
for (uint iter = 0;; ++iter) {
|
|
uint32_t current_task_count = task_count(args.header);
|
|
if (task_index == TASK_INDEX_INVALID)
|
|
break;
|
|
|
|
/* Find preferred partners and merge them */
|
|
PHASE (args.header) {
|
|
uint32_t base_index = task_index - gl_LocalInvocationID.x;
|
|
uint32_t neighbourhood_overlap = min(PLOC_NEIGHBOURHOOD, base_index);
|
|
uint32_t double_neighbourhood_overlap = min(2 * PLOC_NEIGHBOURHOOD, base_index);
|
|
/* Upper bound to where valid nearest node indices are written. */
|
|
uint32_t write_bound =
|
|
min(current_task_count, base_index + gl_WorkGroupSize.x + PLOC_NEIGHBOURHOOD);
|
|
/* Upper bound to where valid nearest node indices are searched. */
|
|
uint32_t search_bound =
|
|
min(current_task_count, base_index + gl_WorkGroupSize.x + 2 * PLOC_NEIGHBOURHOOD);
|
|
uint32_t lds_base = base_index - double_neighbourhood_overlap;
|
|
|
|
load_bounds(src_ids, iter, task_index, lds_base, neighbourhood_overlap, search_bound);
|
|
|
|
for (uint32_t i = gl_LocalInvocationID.x; i < NUM_PLOC_LDS_ITEMS; i += gl_WorkGroupSize.x)
|
|
nearest_neighbour_indices[i] = 0xFFFFFFFF;
|
|
barrier();
|
|
|
|
for (uint32_t i = task_index - double_neighbourhood_overlap; i < write_bound;
|
|
i += gl_WorkGroupSize.x) {
|
|
uint32_t right_bound = min(search_bound - 1 - i, PLOC_NEIGHBOURHOOD);
|
|
|
|
uint32_t fallback_pair = i == 0 ? (i + 1) : (i - 1);
|
|
uint32_t min_offset = encode_neighbour_offset(INFINITY, i, fallback_pair);
|
|
|
|
for (uint32_t j = max(i + 1, base_index - neighbourhood_overlap); j <= i + right_bound;
|
|
++j) {
|
|
|
|
float sah = combined_node_cost(lds_base, i, j);
|
|
uint32_t i_encoded_offset = encode_neighbour_offset(sah, i, j);
|
|
uint32_t j_encoded_offset = encode_neighbour_offset(sah, j, i);
|
|
min_offset = min(min_offset, i_encoded_offset);
|
|
atomicMin(nearest_neighbour_indices[j - lds_base], j_encoded_offset);
|
|
}
|
|
if (i >= base_index - neighbourhood_overlap)
|
|
atomicMin(nearest_neighbour_indices[i - lds_base], min_offset);
|
|
}
|
|
|
|
if (gl_LocalInvocationID.x == 0)
|
|
shared_aggregate_sum = 0;
|
|
barrier();
|
|
|
|
for (uint32_t i = task_index - neighbourhood_overlap; i < write_bound;
|
|
i += gl_WorkGroupSize.x) {
|
|
uint32_t left_bound = min(i, PLOC_NEIGHBOURHOOD);
|
|
uint32_t right_bound = min(search_bound - 1 - i, PLOC_NEIGHBOURHOOD);
|
|
/*
|
|
* Workaround for a worst-case scenario in PLOC: If the combined area of
|
|
* all nodes (in the neighbourhood) is the same, then the chosen nearest
|
|
* neighbour is the first neighbour. However, this means that no nodes
|
|
* except the first two will find each other as nearest neighbour. Therefore,
|
|
* only one node is contained in each BVH level. By first testing if the immediate
|
|
* neighbour on one side is the nearest, all immediate neighbours will be merged
|
|
* on every step.
|
|
*/
|
|
uint32_t preferred_pair;
|
|
if ((i & 1) != 0)
|
|
preferred_pair = i - min(left_bound, 1);
|
|
else
|
|
preferred_pair = i + min(right_bound, 1);
|
|
|
|
if (preferred_pair != i) {
|
|
uint32_t encoded_min_sah =
|
|
nearest_neighbour_indices[i - lds_base] & (~PLOC_OFFSET_MASK);
|
|
float sah = combined_node_cost(lds_base, i, preferred_pair);
|
|
uint32_t encoded_sah = floatBitsToUint(sah) & (~PLOC_OFFSET_MASK);
|
|
uint32_t encoded_offset = encode_neighbour_offset(sah, i, preferred_pair);
|
|
if (encoded_sah <= encoded_min_sah) {
|
|
nearest_neighbour_indices[i - lds_base] = encoded_offset;
|
|
}
|
|
}
|
|
}
|
|
barrier();
|
|
|
|
bool has_valid_node = true;
|
|
|
|
if (task_index < current_task_count) {
|
|
uint32_t base_index = task_index - gl_LocalInvocationID.x;
|
|
|
|
uint32_t neighbour_index =
|
|
task_index +
|
|
decode_neighbour_offset(nearest_neighbour_indices[task_index - lds_base]);
|
|
uint32_t other_neighbour_index =
|
|
neighbour_index +
|
|
decode_neighbour_offset(nearest_neighbour_indices[neighbour_index - lds_base]);
|
|
uint32_t id = load_id(src_ids, iter, task_index);
|
|
if (other_neighbour_index == task_index) {
|
|
if (task_index < neighbour_index) {
|
|
uint32_t neighbour_id = load_id(src_ids, iter, neighbour_index);
|
|
uint32_t children[2] = {id, neighbour_id};
|
|
radv_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]};
|
|
|
|
DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = push_node(children, bounds);
|
|
DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, neighbour_index))) =
|
|
RADV_BVH_INVALID_NODE;
|
|
} else {
|
|
/* We still store in the other case so we don't destroy the node id needed to
|
|
* create the internal node */
|
|
has_valid_node = false;
|
|
}
|
|
} else {
|
|
DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = id;
|
|
}
|
|
|
|
/* Compact - prepare prefix scan */
|
|
uvec4 ballot = subgroupBallot(has_valid_node);
|
|
|
|
uint32_t aggregate_sum = subgroupBallotBitCount(ballot);
|
|
if (subgroupElect())
|
|
atomicAdd(shared_aggregate_sum, aggregate_sum);
|
|
}
|
|
|
|
barrier();
|
|
/*
|
|
* The paper proposes initializing all partitions to an invalid state
|
|
* and only computing aggregates afterwards. We skip that step and
|
|
* initialize the partitions to a valid state. This also simplifies
|
|
* the look-back, as there will never be any blocking due to invalid
|
|
* partitions.
|
|
*/
|
|
if (gl_LocalInvocationIndex == 0) {
|
|
REF(ploc_prefix_scan_partition)
|
|
current_partition = REF(ploc_prefix_scan_partition)(
|
|
INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x));
|
|
DEREF(current_partition).aggregate = shared_aggregate_sum;
|
|
if (task_index < gl_WorkGroupSize.x) {
|
|
DEREF(current_partition).inclusive_sum = shared_aggregate_sum;
|
|
} else {
|
|
DEREF(current_partition).inclusive_sum = 0xFFFFFFFF;
|
|
}
|
|
}
|
|
|
|
if (task_index == 0)
|
|
set_next_task_count(args.header, task_count(args.header));
|
|
}
|
|
|
|
/* Compact - prefix scan and update */
|
|
PHASE (args.header) {
|
|
uint32_t current_task_count = task_count(args.header);
|
|
|
|
uint32_t id = task_index < current_task_count
|
|
? DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index)))
|
|
: RADV_BVH_INVALID_NODE;
|
|
uvec4 ballot = subgroupBallot(id != RADV_BVH_INVALID_NODE);
|
|
|
|
uint32_t new_offset = prefix_scan(ballot, partitions, task_index);
|
|
if (task_index >= current_task_count)
|
|
continue;
|
|
|
|
if (id != RADV_BVH_INVALID_NODE) {
|
|
DEREF(REF(uint32_t)(INDEX(uint32_t, src_ids, new_offset))) = id;
|
|
++new_offset;
|
|
}
|
|
|
|
if (task_index == current_task_count - 1) {
|
|
set_next_task_count(args.header, new_offset);
|
|
if (new_offset == 1)
|
|
DEREF(args.header).sync_data.next_phase_exit_flag = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|