mesa/src/intel/vulkan/grl/gpu/bvh_rebraid.cl
Jason Ekstrand 5f948503e4 anv: Import GRL
GRL, or Graphics Library for Ray-tracing, is a library we share with the
Windows drivers for doing BVH builds on the GPU.  It consists of a few
headers shared between CL and C code, a bunch of CL kernels, and some
GRL meta-kernels in their own format.

Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16970>
2022-09-28 05:38:37 +00:00


//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "AABB.h"
#include "GRLGen12.h"
#include "api_interface.h"
#include "common.h"
#include "qbvh6.h"
#define MAX_SPLITS_PER_INSTANCE 64
#define NUM_REBRAID_BINS 32
#define NUM_CHILDREN 6
#define MAX_NODE_OFFSET 65535 // can't open nodes whose offsets exceed this
// OCL/DPC++ *SHOULD* have a uniform keyword... but they don't... so I'm making my own
#define uniform
#define varying
#define SGPRINT_UNIFORM(fmt,val) {sub_group_barrier(CLK_LOCAL_MEM_FENCE); if( get_sub_group_local_id() == 0 ) { printf(fmt,val); }}
#define SGPRINT_6x(prefix,fmt,type,val) {\
type v0 = sub_group_broadcast( val, 0 );\
type v1 = sub_group_broadcast( val, 1 );\
type v2 = sub_group_broadcast( val, 2 );\
type v3 = sub_group_broadcast( val, 3 );\
type v4 = sub_group_broadcast( val, 4 );\
type v5 = sub_group_broadcast( val, 5 );\
sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
if( get_sub_group_local_id() == 0 ) { \
printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \
v0,v1,v2,v3,v4,v5);}}
#define SGPRINT_16x(prefix,fmt,type,val) {\
type v0 = sub_group_broadcast( val, 0 );\
type v1 = sub_group_broadcast( val, 1 );\
type v2 = sub_group_broadcast( val, 2 );\
type v3 = sub_group_broadcast( val, 3 );\
type v4 = sub_group_broadcast( val, 4 );\
type v5 = sub_group_broadcast( val, 5 );\
type v6 = sub_group_broadcast( val, 6 );\
type v7 = sub_group_broadcast( val, 7 );\
type v8 = sub_group_broadcast( val, 8 );\
type v9 = sub_group_broadcast( val, 9 );\
type v10 = sub_group_broadcast( val, 10 );\
type v11 = sub_group_broadcast( val, 11 );\
type v12 = sub_group_broadcast( val, 12 );\
type v13 = sub_group_broadcast( val, 13 );\
type v14 = sub_group_broadcast( val, 14 );\
type v15 = sub_group_broadcast( val, 15 );\
sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
if( get_sub_group_local_id() == 0 ) { \
printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \
fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \
v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}}
#if 1
#define GRL_ATOMIC_INC(addr) atomic_add(addr, 1);
#else
#define GRL_ATOMIC_INC(addr) atomic_inc(addr);
#endif
#if 0
#define LOOP_TRIPWIRE_INIT uint _loop_trip=0;
#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name) \
_loop_trip++;\
if ( _loop_trip > max_iterations )\
{\
printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!!\n" );\
printf( name"\n");\
break;\
}
#else
#define LOOP_TRIPWIRE_INIT
#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name)
#endif
typedef struct SGHeap
{
uint32_t key_value;
bool lane_mask;
} SGHeap;
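// NOTE (added commentary, not in the original file): SGHeap is not a classic array heap.
// It is an unordered set spread across the subgroup: each lane owns at most one element,
// lane_mask marks occupancy, and key_value packs a 16-bit sort key (quantized area) in the
// high half and a 16-bit payload (node offset) in the low half. Min/max extraction is done
// with sub_group_reduce_min/max over the packed words, so capacity equals the subgroup
// size (16 for these kernels).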
GRL_INLINE void SGHeap_init(uniform SGHeap *h)
{
h->lane_mask = false;
h->key_value = 0xbaadf00d;
}
GRL_INLINE bool SGHeap_full(uniform SGHeap *h)
{
return sub_group_all(h->lane_mask);
}
GRL_INLINE bool SGHeap_empty(uniform SGHeap *h)
{
return sub_group_all(!h->lane_mask);
}
GRL_INLINE bool SGHeap_get_lane_mask(uniform SGHeap *h)
{
return h->lane_mask;
}
GRL_INLINE uint16_t SGHeap_get_lane_values(uniform SGHeap *h)
{
return (h->key_value & 0xffff);
}
GRL_INLINE ushort isolate_lowest_bit( ushort m )
{
return m & ~(m - 1);
}
// lane i receives the index of the ith set bit in mask.
GRL_INLINE ushort subgroup_bit_rank( uniform ushort mask )
{
varying ushort lane = get_sub_group_local_id();
ushort idx = 16;
for ( uint i = 0; i < NUM_CHILDREN; i++ )
{
ushort lo = isolate_lowest_bit( mask );
mask = mask ^ lo;
idx = (lane == i) ? lo : idx;
}
return ctz( idx );
}
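// Worked example (illustrative, not from the original source): for mask = 0b10110 the set
// bits are at positions 1, 2 and 4. The loop hands lane 0 the isolated bit 0b00010, lane 1
// the bit 0b00100 and lane 2 the bit 0b10000, so after ctz() lanes 0..2 hold 1, 2 and 4.
// Lanes beyond popcount(mask) keep the sentinel 16 and therefore return ctz(16) = 4.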
// push a set of elements spread across a subgroup. Return mask of elements that were not pushed
GRL_INLINE uint16_t SGHeap_vectorized_push(uniform SGHeap *h, varying uint16_t key, varying uint16_t value, uniform ushort push_mask)
{
#if 0 // an attempt to make this algorithm branchless
varying uint key_value = (((uint)key) << 16) | ((uint)value);
uniform ushort free_mask = intel_sub_group_ballot( !h->lane_mask );
varying ushort free_slot_idx = subgroup_bit_prefix_exclusive( free_mask ); // for each heap slot, what is its position in a compacted list of free slots (prefix sum)
varying ushort push_idx = subgroup_bit_prefix_exclusive( push_mask ); // for each lane, what is its position in a compacted list of pushing lanes (prefix sum)
uniform ushort num_pushes = min( popcount( free_mask ), popcount( push_mask ) );
varying ushort push_index = subgroup_bit_rank( push_mask ); // lane i gets the index of the i'th set bit in push_mask
varying uint shuffled = intel_sub_group_shuffle( key_value, intel_sub_group_shuffle( push_index, free_slot_idx ) );
varying bool pushed = false;
if ( !h->lane_mask && free_slot_idx < num_pushes )
{
h->lane_mask = true;
h->key_value = shuffled;
pushed = true;
}
return push_mask & intel_sub_group_ballot( push_idx >= num_pushes );
#else
varying uint lane = get_sub_group_local_id();
varying uint key_value = (((uint)key) << 16) | ((uint)value);
uniform ushort free_mask = intel_sub_group_ballot(!h->lane_mask);
// TODO_OPT: Look for some clever way to remove this loop
while (free_mask && push_mask)
{
// insert first active child into first available lane
uniform uint child_id = ctz(push_mask);
uniform uint victim_lane = ctz(free_mask);
uniform uint kv = sub_group_broadcast( key_value, child_id );
if (victim_lane == lane)
{
h->lane_mask = true;
h->key_value = kv;
}
push_mask ^= (1 << child_id);
free_mask ^= (1 << victim_lane);
}
return push_mask;
#endif
}
// push an item onto a heap that is full except for one slot
GRL_INLINE void SGHeap_push_and_fill(uniform SGHeap *h, uniform uint16_t key, uniform uint16_t value)
{
uniform uint32_t key_value = (((uint)key) << 16) | value;
if (!h->lane_mask)
{
h->lane_mask = true;
h->key_value = key_value; // only one lane will be active at this point
}
}
// pop the min item from a full heap
GRL_INLINE void SGHeap_full_pop_min(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out)
{
varying uint lane = get_sub_group_local_id();
uniform uint kv = sub_group_reduce_min(h->key_value);
if (h->key_value == kv)
h->lane_mask = false;
*key_out = (kv >> 16);
*value_out = (kv & 0xffff);
}
// pop the max item from a heap
GRL_INLINE void SGHeap_pop_max(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out)
{
uniform uint lane = get_sub_group_local_id();
uniform uint kv = sub_group_reduce_max(h->lane_mask ? h->key_value : 0);
if (h->key_value == kv)
h->lane_mask = false;
*key_out = (kv >> 16);
*value_out = (kv & 0xffff);
}
GRL_INLINE void SGHeap_printf( SGHeap* heap )
{
uint key = heap->key_value >> 16;
uint value = heap->key_value & 0xffff;
if ( get_sub_group_local_id() == 0)
printf( "HEAP: \n" );
SGPRINT_16x( " mask: ", "%6u ", bool, heap->lane_mask );
SGPRINT_16x( " key : ", "0x%04x ", uint, key );
SGPRINT_16x( " val : ", "0x%04x ", uint, value );
}
GRL_INLINE float transformed_aabb_halfArea(float3 lower, float3 upper, const float *Transform)
{
// Compute transformed extent per 'transform_aabb'. Various terms cancel
float3 Extent = upper - lower;
float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]);
float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]);
float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]);
return (ex * ey) + (ey * ez) + (ex * ez);
}
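// Why the translation and corner positions drop out (added explanation): for an affine
// transform the world-space extent is ex = |T[0]|*E.x + |T[1]|*E.y + |T[2]|*E.z (and
// likewise for y with T[4..6] and z with T[8..10]); the translation in T[3]/T[7]/T[11]
// shifts lower and upper equally, so it cancels in the difference. The half-area
// ex*ey + ey*ez + ex*ez therefore needs only the extent, which is what lets callers skip
// adding the node's 'lower' offsets before calling this function.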
GRL_INLINE uint16_t quantize_area(float relative_area)
{
// clamp relative area at 0.25 (1/4 of root area)
// and apply a non-linear distribution because most things in real scenes are small
relative_area = pow(min(1.0f, relative_area * 4.0f), 0.125f);
return convert_ushort_rtn( relative_area * 65535.0f );
}
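// Worked example (illustrative numbers): an instance whose transformed root box covers
// 1/16 of the TLAS surface area has relative_area = 0.0625, so min(1, 0.0625*4) = 0.25 and
// 0.25^0.125 ~= 0.841, giving a key of roughly 0.841 * 65535 ~= 55108. The 1/8th power
// spreads the many small instances across the 16-bit range instead of bunching them near 0.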
GRL_INLINE varying uint16_t SUBGROUP_get_child_areas(uniform InternalNode *n,
uniform const float *Transform,
uniform float relative_area_scale)
{
varying uint16_t area;
varying uint16_t lane = get_sub_group_local_id();
varying int exp_x = n->exp_x;
varying int exp_y = n->exp_y;
varying int exp_z = n->exp_z;
{
// decode the AABB positions. Lower in the bottom 6 lanes, upper in the top
uniform uint8_t *px = &n->lower_x[0];
uniform uint8_t *py = &n->lower_y[0];
uniform uint8_t *pz = &n->lower_z[0];
varying float fx = convert_float(px[lane]);
varying float fy = convert_float(py[lane]);
varying float fz = convert_float(pz[lane]);
fx = n->lower[0] + bitShiftLdexp(fx, exp_x - 8);
fy = n->lower[1] + bitShiftLdexp(fy, exp_y - 8);
fz = n->lower[2] + bitShiftLdexp(fz, exp_z - 8);
// transform the AABBs to world space
varying float3 lower = (float3)(fx, fy, fz);
varying float3 upper = intel_sub_group_shuffle(lower, lane + 6);
{
// TODO_OPT: This is only utilizing 6 lanes.
// We might be able to do better by vectorizing the calculation differently
float a1 = transformed_aabb_halfArea( lower, upper, Transform );
float a2 = a1 * relative_area_scale;
area = quantize_area( a2 );
}
}
return area;
}
GRL_INLINE ushort get_child_area(
InternalNode* n,
ushort child,
const float* Transform,
float relative_area_scale )
{
uint16_t area;
uint16_t lane = get_sub_group_local_id();
int exp_x = n->exp_x;
int exp_y = n->exp_y;
int exp_z = n->exp_z;
// decode the quantized AABB of the requested child
float3 lower, upper;
lower.x = convert_float( n->lower_x[child] );
lower.y = convert_float( n->lower_y[child] );
lower.z = convert_float( n->lower_z[child] );
upper.x = convert_float( n->upper_x[child] );
upper.y = convert_float( n->upper_y[child] );
upper.z = convert_float( n->upper_z[child] );
lower.x = bitShiftLdexp( lower.x, exp_x - 8 ); // NOTE: the node's 'lower' field cancels out, so don't add it
lower.y = bitShiftLdexp( lower.y, exp_y - 8 ); // see transformed_aabb_halfArea
lower.z = bitShiftLdexp( lower.z, exp_z - 8 );
upper.x = bitShiftLdexp( upper.x, exp_x - 8 );
upper.y = bitShiftLdexp( upper.y, exp_y - 8 );
upper.z = bitShiftLdexp( upper.z, exp_z - 8 );
float a1 = transformed_aabb_halfArea( lower, upper, Transform );
float a2 = a1 * relative_area_scale;
area = quantize_area( a2 );
return area;
}
GRL_INLINE varying int SUBGROUP_get_child_offsets(uniform InternalNode *n)
{
varying uint lane = get_sub_group_local_id();
varying uint child = (lane < NUM_CHILDREN) ? lane : 0;
varying uint block_incr = InternalNode_GetChildBlockIncr( n, child );
//varying uint prefix = sub_group_scan_exclusive_add( block_incr );
varying uint prefix;
if ( NUM_CHILDREN == 6 )
{
prefix = block_incr + intel_sub_group_shuffle_up( 0u, block_incr, 1u );
prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 2 );
prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 4 );
prefix = prefix - block_incr;
}
return n->childOffset + prefix;
}
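// The three shuffle_up steps above are a Kogge-Stone inclusive scan (add the value 1, then
// 2, then 4 lanes back), and subtracting block_incr turns it into an exclusive prefix sum.
// Example (illustrative): block_incr = {1,2,1,1,2,1} scans to {1,3,4,5,7,8}, so prefix =
// {0,1,3,4,5,7} and child i sits n->childOffset + prefix[i] 64-byte blocks after this node.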
// compute the maximum number of leaf nodes that will be produced given 'num_splits' node openings
GRL_INLINE uint get_num_nodes(uint num_splits, uint max_children)
{
// each split consumes one node and replaces it with N nodes
// there is initially one node
// number of nodes is thus: N*s + 1 - s ==> (N-1)*s + 1
return (max_children - 1) * num_splits + 1;
}
// compute the number of node openings that can be performed given a fixed extra node budget
GRL_INLINE uint get_num_splits(uint num_nodes, uint max_children)
{
// inverse of get_num_nodes: x = (n-1)s + 1
// s = (x-1)/(n-1)
if (num_nodes == 0)
return 0;
return (num_nodes - 1) / (max_children - 1);
}
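// Worked example of the budget math (illustrative): with max_children = 6 each split
// consumes one node and emits up to 6, a net gain of 5. A budget of 101 nodes therefore
// allows get_num_splits(101, 6) = (101 - 1) / 5 = 20 splits, and get_num_nodes(20, 6) =
// 5 * 20 + 1 = 101 confirms the two functions are inverses up to the integer division.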
GRL_INLINE uint get_rebraid_bin_index(uint16_t quantized_area, uint NUM_BINS)
{
// arrange bins in descending order by size
float relative_area = quantized_area * (1.0f/65535.0f);
relative_area = 1.0f - relative_area; // arrange bins largest to smallest
size_t bin = round(relative_area * (NUM_BINS - 1));
return bin;
}
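// Example (illustrative): with NUM_BINS = 32, quantized_area = 65535 (the largest
// instances) gives relative_area = 1.0 and bin = round(0 * 31) = 0, while quantized_area
// = 0 lands in bin 31. Bin 0 holds the most profitable splits, which is why the budgeting
// code below walks the bins in increasing index order.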
GRL_INLINE global InternalNode *get_node(global BVHBase *base, int incr)
{
global char *ptr = (((global char *)base) + BVH_ROOT_NODE_OFFSET); // NOTE: Assuming this will be hoisted out of inner loops
return (global InternalNode *)(ptr + incr * 64);
}
GRL_INLINE bool is_aabb_valid(float3 lower, float3 upper)
{
return all(isfinite(lower)) &&
all(isfinite(upper)) &&
all(lower <= upper);
}
GRL_INLINE bool is_node_openable(InternalNode *n)
{
// TODO_OPT: Optimize me by fetching dwords instead of looping over bytes
// TODO_OPT: Pre-compute openability and pack into the pad byte next to the nodeType field?
bool openable = n->nodeType == NODE_TYPE_INTERNAL;
if ( openable )
{
for ( uint i = 0; i < NUM_CHILDREN; i++ )
{
bool valid = InternalNode_IsChildValid( n, i );
uint childType = InternalNode_GetChildType( n, i );
openable = openable & (!valid || (childType == NODE_TYPE_INTERNAL));
}
}
return openable;
}
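// Added note: a node is only "openable" when it is an internal node and all of its valid
// children are internal nodes as well. When a node is opened its children become the
// entry points stored in the resulting primrefs (see build_opened_primrefs below), and the
// check here ensures those entry points are themselves internal nodes rather than leaves.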
GRL_INLINE bool SUBGROUP_can_open_root(
uniform global BVHBase *bvh_base,
uniform const struct GRL_RAYTRACING_INSTANCE_DESC* instance
)
{
if (bvh_base == 0 || GRL_get_InstanceMask(instance) == 0)
return false;
// TODO_OPT: SG-vectorize this AABB test
uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
if (!is_aabb_valid(root_lower, root_upper))
return false;
uniform global InternalNode *node = get_node(bvh_base, 0);
if ( node->nodeType != NODE_TYPE_INTERNAL )
return false;
varying bool openable = true;
varying uint lane = get_sub_group_local_id();
if (lane < NUM_CHILDREN)
{
varying uint childType = InternalNode_GetChildType(node, lane);
varying bool valid = InternalNode_IsChildValid(node, lane);
openable = childType == NODE_TYPE_INTERNAL || !valid;
}
return sub_group_all(openable);
}
GRL_INLINE
varying uint2
SUBGROUP_count_instance_splits(uniform global struct AABB3f *geometry_bounds,
uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance)
{
uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure;
if (!SUBGROUP_can_open_root(bvh_base, instance))
return (uint2)(0, 0);
uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds);
uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
uniform uint16_t quantized_area = quantize_area(transformed_aabb_halfArea(root_lower, root_upper, instance->Transform) * relative_area_scale);
uniform uint16_t node_offs = 0;
uniform SGHeap heap;
uniform uint num_splits = 0;
SGHeap_init(&heap);
varying uint sg_split_counts_hi = 0; // cross-subgroup bin counters
varying uint sg_split_counts_lo = 0;
uniform global InternalNode* node_array = get_node( bvh_base, 0 );
LOOP_TRIPWIRE_INIT;
while (1)
{
uniform global InternalNode* node = node_array + node_offs;
// count this split
uniform uint bin = get_rebraid_bin_index(quantized_area, NUM_REBRAID_BINS);
varying uint lane = get_sub_group_local_id();
sg_split_counts_hi += ((lane + 16) == bin) ? 1 : 0;
sg_split_counts_lo += (lane == bin) ? 1 : 0;
// open this node and push all of its openable children to heap
varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node);
varying bool sg_openable = 0;
if (lane < NUM_CHILDREN && sg_offs <= MAX_NODE_OFFSET)
if (InternalNode_IsChildValid(node, lane))
sg_openable = is_node_openable( node_array + sg_offs);
uniform uint openable_children = intel_sub_group_ballot(sg_openable);
if ( openable_children )
{
varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale );
if ( !SGHeap_full( &heap ) )
{
openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children );
}
while ( openable_children )
{
// pop min element
uniform uint16_t min_area;
uniform uint16_t min_offs;
SGHeap_full_pop_min( &heap, &min_area, &min_offs );
// eliminate all children smaller than heap minimum
openable_children &= intel_sub_group_ballot( sg_area > min_area );
if ( openable_children )
{
// if any children survived,
// kick out the heap minimum and replace it with the first child; otherwise we will re-push the minimum
uniform uint child_id = ctz( openable_children );
openable_children ^= (1 << child_id);
min_area = sub_group_broadcast( sg_area, child_id );
min_offs = sub_group_broadcast( sg_offs, child_id );
}
// re-insert onto heap
SGHeap_push_and_fill( &heap, min_area, min_offs );
// repeat until all children are accounted for. Multiple children may still fit in the
// heap, because the heap minimum has changed and needs to be recomputed
}
}
num_splits++;
if (num_splits == MAX_SPLITS_PER_INSTANCE)
break;
if (SGHeap_empty(&heap))
break;
// get next node from heap
SGHeap_pop_max(&heap, &quantized_area, &node_offs);
LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_count_splits" );
}
return (uint2)(sg_split_counts_lo, sg_split_counts_hi);
}
typedef struct RebraidBuffers
{
global uint *bin_split_counts; // [num_bins]
global uint *bin_instance_counts; // [num_bins]
global uint *instance_bin_counts; // num_instances * num_bins
} RebraidBuffers;
GRL_INLINE RebraidBuffers cast_rebraid_buffers(global uint *scratch, uint instanceID)
{
RebraidBuffers b;
b.bin_split_counts = scratch;
b.bin_instance_counts = scratch + NUM_REBRAID_BINS;
b.instance_bin_counts = scratch + (2 + instanceID) * NUM_REBRAID_BINS;
return b;
}
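// Scratch layout implied by the offsets above (added summary):
//   dwords [0, 32)               bin_split_counts    - total requested splits per bin
//   dwords [32, 64)              bin_instance_counts - instances requesting splits per bin
//   dwords [(2+i)*32, (3+i)*32)  instance_bin_counts - per-bin counts for instance i
// A sizing helper would look roughly like this (hypothetical sketch; the real allocation
// is owned by the GRL host code, not this file):
GRL_INLINE uint rebraid_scratch_size_in_dwords( uint num_instances )
{
    return (2 + num_instances) * NUM_REBRAID_BINS;
}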
///////////////////////////////////////////////////////////////////////////////////////////
// Compute AABB
// Dispatch one work item per instance
///////////////////////////////////////////////////////////////////////////////////////////
GRL_INLINE void rebraid_compute_AABB(
global struct BVHBase* bvh,
global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance)
{
// don't open null rtas
global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure;
struct AABB new_primref;
if (bvh_base != 0)
{
float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
const float *Transform = instance->Transform;
if (is_aabb_valid(root_lower, root_upper))
{
new_primref = AABBfromAABB3f(transform_aabb(root_lower, root_upper, Transform));
}
else
{
// degenerate instance which might be updated to be non-degenerate
// use AABB position to guide BVH construction
//
new_primref.lower.x = Transform[3];
new_primref.lower.y = Transform[7];
new_primref.lower.z = Transform[11];
new_primref.upper = new_primref.lower;
}
}
else
{
AABB_init(&new_primref);
}
struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
if (get_sub_group_local_id() == 0)
{
AABB3f_atomic_merge_global_lu(&bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz );
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
rebraid_computeAABB_DXR_instances(
global struct BVHBase* bvh,
global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances)
{
const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
rebraid_compute_AABB(bvh, instances + instanceID);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
rebraid_computeAABB_DXR_instances_indirect(
global struct BVHBase* bvh,
global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
global struct IndirectBuildRangeInfo const * const indirect_data)
{
const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
(((global char*)instances) + indirect_data->primitiveOffset);
rebraid_compute_AABB(bvh, instances + instanceID);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
rebraid_computeAABB_DXR_instances_pointers(
global struct BVHBase* bvh,
global void *instances_in)
{
global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
(global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
rebraid_compute_AABB(bvh, instances[instanceID]);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
rebraid_computeAABB_DXR_instances_pointers_indirect(
global struct BVHBase* bvh,
global void *instances_in,
global struct IndirectBuildRangeInfo const * const indirect_data)
{
instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset;
global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
(global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
rebraid_compute_AABB(bvh, instances[instanceID]);
}
///////////////////////////////////////////////////////////////////////////////////////////
// Init scratch: Dispatch one work group
///////////////////////////////////////////////////////////////////////////////////////////
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(64, 1, 1))) void kernel rebraid_init_scratch(global uint *scratch)
{
scratch[get_local_id(0) + get_group_id(0)*get_local_size(0)] = 0;
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel rebraid_chase_instance_pointers(global struct GRL_RAYTRACING_INSTANCE_DESC *instances_out,
global void *instance_buff)
{
global const struct GRL_RAYTRACING_INSTANCE_DESC **instances_in =
(global const struct GRL_RAYTRACING_INSTANCE_DESC **)instance_buff;
instances_out[get_local_id(0)] = *instances_in[get_local_id(0)];
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel rebraid_chase_instance_pointers_indirect(
global struct GRL_RAYTRACING_INSTANCE_DESC* instances_out,
global void* instance_buff,
global struct IndirectBuildRangeInfo const* const indirect_data)
{
instance_buff = ((global char*)instance_buff) + indirect_data->primitiveOffset;
global const struct GRL_RAYTRACING_INSTANCE_DESC**
instances_in = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instance_buff;
instances_out[get_local_id(0)] = *instances_in[get_local_id(0)];
}
///////////////////////////////////////////////////////////////////////////////////////////
// Count splits
///////////////////////////////////////////////////////////////////////////////////////////
GRL_INLINE void DEBUG_SUBGROUP_print_split_counts( uniform uint instanceID, varying uint split_counts_lo, varying uint split_counts_hi )
{
uniform uint vals[32] = {
sub_group_broadcast( split_counts_lo, 0 ), sub_group_broadcast( split_counts_lo, 1 ),
sub_group_broadcast( split_counts_lo, 2 ), sub_group_broadcast( split_counts_lo, 3 ),
sub_group_broadcast( split_counts_lo, 4 ), sub_group_broadcast( split_counts_lo, 5 ),
sub_group_broadcast( split_counts_lo, 6 ), sub_group_broadcast( split_counts_lo, 7 ),
sub_group_broadcast( split_counts_lo, 8 ), sub_group_broadcast( split_counts_lo, 9 ),
sub_group_broadcast( split_counts_lo, 10 ), sub_group_broadcast( split_counts_lo, 11 ),
sub_group_broadcast( split_counts_lo, 12 ), sub_group_broadcast( split_counts_lo, 13 ),
sub_group_broadcast( split_counts_lo, 14 ), sub_group_broadcast( split_counts_lo, 15 ),
sub_group_broadcast( split_counts_hi, 0 ), sub_group_broadcast( split_counts_hi, 1 ),
sub_group_broadcast( split_counts_hi, 2 ), sub_group_broadcast( split_counts_hi, 3 ),
sub_group_broadcast( split_counts_hi, 4 ), sub_group_broadcast( split_counts_hi, 5 ),
sub_group_broadcast( split_counts_hi, 6 ), sub_group_broadcast( split_counts_hi, 7 ),
sub_group_broadcast( split_counts_hi, 8 ), sub_group_broadcast( split_counts_hi, 9 ),
sub_group_broadcast( split_counts_hi, 10 ), sub_group_broadcast( split_counts_hi, 11 ),
sub_group_broadcast( split_counts_hi, 12 ), sub_group_broadcast( split_counts_hi, 13 ),
sub_group_broadcast( split_counts_hi, 14 ), sub_group_broadcast( split_counts_hi, 15 )
};
if ( get_sub_group_local_id() == 0 )
{
printf(
"Instance: %4u "
"%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u "
"%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u \n"
,
instanceID,
vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7],
vals[8], vals[9], vals[10], vals[11], vals[12], vals[13], vals[14], vals[15],
vals[16], vals[17], vals[18], vals[19], vals[20], vals[21], vals[22], vals[23],
vals[24], vals[25], vals[26], vals[27], vals[28], vals[29], vals[30], vals[31]
);
}
}
GRL_INLINE void do_rebraid_count_splits_SG(
uniform global struct BVHBase* bvh,
uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
uniform global uint *rebraid_scratch)
{
uniform const uint instanceID = get_sub_group_global_id();
uniform RebraidBuffers buffers = cast_rebraid_buffers(rebraid_scratch,instanceID);
varying uint lane = get_sub_group_local_id();
varying uint2 splits = SUBGROUP_count_instance_splits(&bvh->Meta.bounds, instances + instanceID);
varying uint split_counts_lo = splits.x;
varying uint split_counts_hi = splits.y;
// write this instance's per-bin counts
global uint* counts = buffers.instance_bin_counts;
intel_sub_group_block_write2( counts, splits );
// update the per-bin split and instance counters
if (split_counts_lo > 0)
{
atomic_add(&buffers.bin_split_counts[lane], split_counts_lo);
GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane]);
}
if (split_counts_hi > 0)
{
atomic_add(&buffers.bin_split_counts[lane + 16], split_counts_hi);
GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane + 16]);
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
rebraid_count_splits_SG(
uniform global struct BVHBase* bvh,
uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
uniform global uint *rebraid_scratch)
{
do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
rebraid_count_splits_SG_indirect(
uniform global struct BVHBase* bvh,
uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
uniform global uint *rebraid_scratch,
global struct IndirectBuildRangeInfo const * const indirect_data)
{
instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
(((global char*)instances) + indirect_data->primitiveOffset);
do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch);
}
#define HEAP_SIZE 16
#define COUNT_SPLITS_WG_SIZE 16
struct SLMHeapNode
{
short offs;
ushort area;
};
struct SLMHeap
{
struct SLMHeapNode nodes[HEAP_SIZE];
ushort size;
ushort min_key;
};
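// Added note: SLMHeap is the scalar, SLM-resident counterpart of SGHeap used by the
// work-item (non-subgroup) counting path below. It is a bounded binary max-heap stored in
// level order; when full, a push first rejects keys <= min_key and otherwise evicts a
// smallest leaf, so the heap keeps the HEAP_SIZE largest candidates seen so far.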
GRL_INLINE bool SLMHeapNode_Greater( struct SLMHeapNode a, struct SLMHeapNode b )
{
return a.area > b.area;
}
GRL_INLINE ushort SLMHeapNode_UnpackKey( struct SLMHeapNode a )
{
return a.area;
}
GRL_INLINE void SLMHeapNode_Unpack( struct SLMHeapNode a, ushort* area_out, short* offs_out )
{
*area_out = a.area;
*offs_out = a.offs;
}
GRL_INLINE struct SLMHeapNode SLMHeapNode_Pack( ushort area, short offs )
{
struct SLMHeapNode n;
n.offs = offs;
n.area = area;
return n;
}
GRL_INLINE void SLMHeap_Init( struct SLMHeap* heap )
{
heap->size = 0;
heap->min_key = 0xffff;
}
GRL_INLINE bool SLMHeap_empty( struct SLMHeap* heap )
{
return heap->size == 0;
}
GRL_INLINE bool SLMHeap_full( struct SLMHeap* heap )
{
return heap->size == HEAP_SIZE;
}
GRL_INLINE void SLMHeap_push( struct SLMHeap* heap, ushort area, short offs )
{
ushort insert_pos;
if ( SLMHeap_full( heap ) )
{
ushort current_min_key = heap->min_key;
if ( area <= current_min_key )
return; // don't push stuff that's smaller than the current minimum
// search for the minimum element
// The heap is laid out in level order, so it is sufficient to search only the last half
ushort last_leaf = HEAP_SIZE - 1;
ushort first_leaf = (last_leaf / 2) + 1;
// as we search, keep track of what the new min-key will be so we can cull future pushes
ushort new_min_key = area;
ushort min_pos = 0;
do
{
ushort idx = first_leaf++;
ushort current_key = SLMHeapNode_UnpackKey( heap->nodes[idx] );
bool found_min_pos = (min_pos == 0) && (current_key == current_min_key);
if ( found_min_pos )
min_pos = idx;
else
new_min_key = min( current_key, new_min_key );
} while ( first_leaf != last_leaf );
heap->min_key = new_min_key;
insert_pos = min_pos;
}
else
{
insert_pos = heap->size++;
heap->min_key = min( area, heap->min_key );
}
heap->nodes[insert_pos] = SLMHeapNode_Pack( area, offs );
// heap-up
while ( insert_pos )
{
ushort parent = insert_pos / 2;
struct SLMHeapNode parent_node = heap->nodes[parent];
struct SLMHeapNode current_node = heap->nodes[insert_pos];
if ( SLMHeapNode_Greater( parent_node, current_node ) )
break;
heap->nodes[insert_pos] = parent_node;
heap->nodes[parent] = current_node;
insert_pos = parent;
}
}
bool SLMHeap_pop_max( struct SLMHeap* heap, ushort* area_out, short* offs_out )
{
if ( SLMHeap_empty( heap ) )
return false;
SLMHeapNode_Unpack( heap->nodes[0], area_out, offs_out );
// heap down
ushort size = heap->size;
ushort idx = 0;
do
{
ushort left = 2 * idx + 1;
ushort right = 2 * idx + 2;
if ( left >= size )
break;
if ( right >= size )
{
heap->nodes[idx] = heap->nodes[left];
break;
}
struct SLMHeapNode left_node = heap->nodes[left];
struct SLMHeapNode right_node = heap->nodes[right];
bool go_left = SLMHeapNode_Greater( left_node, right_node );
heap->nodes[idx] = go_left ? left_node : right_node;
idx = go_left ? left : right;
} while ( 1 );
heap->size = size - 1;
return true;
}
void SLMHeap_Print( struct SLMHeap* heap )
{
printf( " size=%u min=%u {", heap->size, heap->min_key );
for ( uint i = 0; i < heap->size; i++ )
printf( "%04x:%04x", heap->nodes[i].area, heap->nodes[i].offs );
}
GRL_INLINE bool can_open_root(
global struct BVHBase* bvh_base,
const struct GRL_RAYTRACING_INSTANCE_DESC* instance
)
{
float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds );
float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds );
if ( !is_aabb_valid( root_lower, root_upper ) || GRL_get_InstanceMask(instance) == 0 )
return false;
global InternalNode* node = get_node( bvh_base, 0 );
if ( node->nodeType != NODE_TYPE_INTERNAL )
return false;
return is_node_openable( node );
}
GRL_INLINE void count_instance_splits(
global struct AABB3f* geometry_bounds,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
local ushort* bin_split_counts,
local struct SLMHeap* heap
)
{
global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure;
SLMHeap_Init( heap );
float relative_area_scale = 1.0f / AABB3f_halfArea( geometry_bounds );
float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds );
float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds );
ushort quantized_area = quantize_area( transformed_aabb_halfArea( root_lower, root_upper, instance->Transform ) * relative_area_scale );
short node_offs = 0;
ushort num_splits = 0;
global InternalNode* node_array = get_node( bvh_base, 0 );
while ( 1 )
{
global InternalNode* node = node_array + node_offs;
// count this split
uint bin = get_rebraid_bin_index( quantized_area, NUM_REBRAID_BINS );
bin_split_counts[bin]++;
// open this node and push children to heap
// TODO_OPT: Restructure this control flow to prevent different lanes from skipping different loop iterations and diverging
// TODO_OPT: Precompute openability masks in BLAS nodes at build time... one bit for self and N bits for each child
int offs = node->childOffset;
for ( ushort i = 0; i < NUM_CHILDREN; i++ )
{
if ( InternalNode_IsChildValid( node, i ) )
{
if ( offs >= SHRT_MIN && offs <= SHRT_MAX )
{
if ( is_node_openable( node_array + offs ) )
{
ushort area = get_child_area( node, i, instance->Transform, relative_area_scale );
SLMHeap_push( heap, area, (short)offs );
}
}
}
offs += InternalNode_GetChildBlockIncr( node, i );
}
num_splits++;
if ( num_splits == MAX_SPLITS_PER_INSTANCE )
break;
if ( !SLMHeap_pop_max( heap, &quantized_area, &node_offs ) )
break;
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__( (reqd_work_group_size( COUNT_SPLITS_WG_SIZE, 1, 1 )) )
void kernel
rebraid_count_splits(
global struct BVHBase* bvh_base,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
global uint* rebraid_scratch,
uint num_instances
)
{
local struct SLMHeap heap[COUNT_SPLITS_WG_SIZE];
local ushort split_counts[COUNT_SPLITS_WG_SIZE][NUM_REBRAID_BINS];
// initialize stuff
// TODO_OPT: transpose this and subgroup-vectorize it so that
// block-writes can be used
for ( uint i = 0; i < NUM_REBRAID_BINS; i++ )
split_counts[get_local_id( 0 )][i] = 0;
// count splits for this thread's instance
uniform uint base_instance = get_group_id( 0 ) * get_local_size( 0 );
uint instanceID = base_instance + get_local_id( 0 );
if ( instanceID < num_instances )
{
global BVHBase* bvh_base = (global BVHBase*)instances[instanceID].AccelerationStructure;
if ( can_open_root( bvh_base, &instances[instanceID] ) )
{
count_instance_splits( &bvh_base->Meta.bounds,
&instances[instanceID],
&split_counts[get_local_id( 0 )][0],
&heap[get_local_id(0)] );
}
}
barrier( CLK_LOCAL_MEM_FENCE );
RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID );
// reduce bins
for ( uint bin = get_local_id( 0 ); bin < NUM_REBRAID_BINS; bin += get_local_size( 0 ) )
{
// TODO_OPT: There's probably a better way to arrange this computation
uint bin_split_count = 0;
uint bin_instance_count = 0;
for ( uint i = 0; i < COUNT_SPLITS_WG_SIZE; i++ )
{
uint s = split_counts[i][bin];
bin_split_count += s;
bin_instance_count += (s > 0) ? 1 : 0;
}
if ( bin_split_count > 0 )
{
atomic_add( &buffers.bin_split_counts[bin], bin_split_count );
atomic_add( &buffers.bin_instance_counts[bin], bin_instance_count );
}
}
// write out bin counts for each instance
for ( uniform uint i = get_sub_group_id(); i < COUNT_SPLITS_WG_SIZE; i += get_num_sub_groups() )
{
uniform uint iid = base_instance + i;
if ( iid >= num_instances )
break;
global uint* instance_bin_counts = cast_rebraid_buffers( rebraid_scratch, iid ).instance_bin_counts;
for ( uniform ushort j = 0; j < NUM_REBRAID_BINS; j += get_sub_group_size() )
{
uint count = split_counts[i][j + get_sub_group_local_id() ];
intel_sub_group_block_write( instance_bin_counts + j, count );
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////
// Build PrimRefs
///////////////////////////////////////////////////////////////////////////////////////////
GRL_INLINE uint get_instance_split_count(RebraidBuffers buffers, uint instanceID, uint available_splits)
{
global uint* instance_desired_split_count = buffers.instance_bin_counts;
global uint *bin_split_counts = buffers.bin_split_counts;
global uint *bin_instance_counts = buffers.bin_instance_counts;
uint total_splits = 0;
uint remaining_available_splits = available_splits;
uint max_bin = 0;
uint desired_splits_this_bin = 0;
uint instance_splits = 0;
do
{
// stop when we reach a level where we can't satisfy the demand
desired_splits_this_bin = instance_desired_split_count[max_bin];
uint total_bin_splits = bin_split_counts[max_bin];
if (total_bin_splits > remaining_available_splits)
break;
// we have enough budget to give all instances everything they want at this level, so do it
remaining_available_splits -= total_bin_splits;
instance_splits += desired_splits_this_bin;
desired_splits_this_bin = 0;
max_bin++;
} while (max_bin < NUM_REBRAID_BINS);
if (max_bin < NUM_REBRAID_BINS)
{
// we have more split demand than we have splits available. The current bin is the last one that gets any splits
// distribute the leftovers as evenly as possible to instances that want them
if (desired_splits_this_bin > 0)
{
// this instance wants splits. how many does it want?
uint desired_total = instance_splits + desired_splits_this_bin;
// distribute to all instances as many as possible
uint count = bin_instance_counts[max_bin];
uint whole = remaining_available_splits / count;
remaining_available_splits -= whole * count;
// distribute remainder to lower numbered instances
size_t partial = (instanceID < remaining_available_splits) ? 1 : 0;
// give the instance its share.
instance_splits += whole + partial;
instance_splits = min(instance_splits, desired_total); // don't give it more than it needs
}
}
return instance_splits;
}
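// Worked example of the distribution above (illustrative numbers): say available_splits =
// 100 and the global per-bin totals are 30, 40 and 50 for bins 0..2. Bins 0 and 1 fit the
// budget (70 spent, 30 left), so every instance receives its full demand for those bins.
// Bin 2 does not fit, so the remaining 30 splits are shared by the bin_instance_counts[2]
// instances that want bin-2 splits: each gets 30 / count, the lowest instance IDs absorb
// the 30 % count remainder, and the result is clamped so no instance receives more than it
// asked for through bin 2.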
GRL_INLINE void build_unopened_primref(
struct AABB3f* centroid_bounds,
global __const BVHBase *bvh_base,
global volatile uint *primref_counter,
global struct AABB *primref_buffer,
global __const float *Transform,
uint instanceID,
float matOverhead,
ushort instanceMask)
{
float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
struct AABB primRef;
AABB_init( &primRef );
uint bvhoffset = (uint)BVH_ROOT_NODE_OFFSET;
if (is_aabb_valid(root_lower, root_upper) && instanceMask != 0)
{
primRef = AABBfromAABB3f(compute_xfm_bbox(Transform, BVHBase_GetRootNode(bvh_base), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &bvh_base->Meta.bounds, matOverhead));
}
else
{
primRef.lower.x = Transform[3];
primRef.lower.y = Transform[7];
primRef.lower.z = Transform[11];
primRef.upper.xyz = primRef.lower.xyz;
instanceMask = 0;
bvhoffset = NO_NODE_OFFSET;
}
primRef.lower.w = as_float(instanceID | (instanceMask << 24));
primRef.upper.w = as_float(bvhoffset);
float3 centroid = primRef.lower.xyz + primRef.upper.xyz;
centroid_bounds->lower[0] = centroid.x;
centroid_bounds->upper[0] = centroid.x;
centroid_bounds->lower[1] = centroid.y;
centroid_bounds->upper[1] = centroid.y;
centroid_bounds->lower[2] = centroid.z;
centroid_bounds->upper[2] = centroid.z;
uint place = GRL_ATOMIC_INC(primref_counter);
primref_buffer[place] = primRef;
}
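// PrimRef encoding used by both the unopened path above and the opened path below
// (restating what the stores imply): lower.w packs the instance index in the low 24 bits
// with the 8-bit instance mask in the top byte, and upper.w holds the byte offset of the
// BLAS node that the TLAS leaf should enter: BVH_ROOT_NODE_OFFSET for an unopened
// instance, offset*64 + BVH_ROOT_NODE_OFFSET for an opened child, or NO_NODE_OFFSET for a
// degenerate instance.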
GRL_INLINE void build_opened_primrefs(
varying bool lane_mask,
varying uint offset,
varying InternalNode* node,
varying struct AABB3f* centroid_bounds,
uniform global BVHBase *bvh_base,
uniform volatile global uint *primref_counter,
uniform global struct AABB *primref_buffer,
uniform uint instanceID,
uniform const float *Transform,
uniform float matOverhead,
varying ushort instanceMask)
{
// TODO_OPT: This function is often called with <= 6 active lanes
// If lanes are sparse, consider jumping to a sub-group vectorized variant...
if (lane_mask)
{
varying uint place = GRL_ATOMIC_INC(primref_counter);
struct AABB box = AABBfromAABB3f(compute_xfm_bbox(Transform, node, XFM_BOX_NOT_REFINED_CLIPPED, &bvh_base->Meta.bounds, matOverhead));
box.lower.w = as_float(instanceID | (instanceMask << 24));
box.upper.w = as_float(offset * 64 + (uint)BVH_ROOT_NODE_OFFSET);
primref_buffer[place] = box;
AABB3f_extend_point( centroid_bounds, box.lower.xyz + box.upper.xyz );
}
}
GRL_INLINE void SUBGROUP_open_nodes(
uniform global struct AABB3f *geometry_bounds,
uniform uint split_limit,
uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance,
uniform uint instanceID,
uniform volatile global uint *primref_counter,
uniform global struct AABB *primref_buffer,
varying struct AABB3f* centroid_bounds,
float transformOverhead)
{
uniform SGHeap heap;
SGHeap_init(&heap);
uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds);
uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure;
uniform uint16_t node_offs = 0;
varying uint lane = get_sub_group_local_id();
uniform InternalNode* node_array = get_node( bvh_base, 0 );
LOOP_TRIPWIRE_INIT;
while ( 1 )
{
uniform InternalNode *node = node_array + node_offs;
varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node);
varying bool sg_valid = false;
varying bool sg_openable = false;
if (lane < NUM_CHILDREN)
{
sg_valid = InternalNode_IsChildValid(node, lane);
if (sg_valid && (sg_offs <= MAX_NODE_OFFSET))
{
sg_openable = is_node_openable( node_array + sg_offs);
}
}
uniform uint16_t valid_children = intel_sub_group_ballot(sg_valid);
uniform uint16_t openable_children = intel_sub_group_ballot(sg_openable);
uniform uint16_t unopenable_children = valid_children & (~openable_children);
if ( openable_children )
{
varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale );
// try to push all openable children to the heap
if ( !SGHeap_full( &heap ) )
{
openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children );
}
// we have more openable children than will fit in the heap
// process these one by one.
// TODO: Try re-writing with sub_group_any() and see if compiler does a better job
while ( openable_children )
{
// pop min element
uniform uint16_t min_area;
uniform uint16_t min_offs;
SGHeap_full_pop_min( &heap, &min_area, &min_offs );
// eliminate all children smaller than heap minimum.
// mark eliminated children as unopenable
varying uint culled_children = openable_children & intel_sub_group_ballot( sg_area <= min_area );
unopenable_children ^= culled_children;
openable_children &= ~culled_children;
if ( openable_children )
{
// if any children survived the purge
// find the first such child and swap its offset for the one from the heap
//
uniform uint child_id = ctz( openable_children );
uniform uint16_t old_min_offs = min_offs;
min_area = sub_group_broadcast( sg_area, child_id );
min_offs = sub_group_broadcast( sg_offs, child_id );
if ( lane == child_id )
sg_offs = old_min_offs;
openable_children ^= (1 << child_id);
unopenable_children ^= (1 << child_id);
}
SGHeap_push_and_fill( &heap, min_area, min_offs );
}
}
if (unopenable_children)
{
varying bool sg_create_primref = ((1 << lane) & unopenable_children);
build_opened_primrefs(sg_create_primref, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance));
}
--split_limit;
if (split_limit == 0)
{
// split limit exceeded
// create primrefs for all remaining openable nodes in heap
varying bool sg_mask = SGHeap_get_lane_mask(&heap);
sg_offs = SGHeap_get_lane_values(&heap);
build_opened_primrefs(sg_mask, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance));
break;
}
// NOTE: the heap should never be empty. If it is, the instance was given too many splits.
// get next node from heap
uint16_t quantized_area;
SGHeap_pop_max(&heap, &quantized_area, &node_offs);
LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_build_primrefs" );
}
}
#define OPEN_QUEUE_SIZE 256
#define OPEN_QUEUE_NUM_SGS 16
typedef struct OpenQueueEntry
{
uint instanceID;
ushort num_splits;
} OpenQueueEntry;
typedef struct OpenQueue
{
uint num_produced;
uint num_consumed;
OpenQueueEntry Q[OPEN_QUEUE_SIZE];
} OpenQueue;
uniform uint SUBGROUP_GetNextQueueEntry( local OpenQueue* queue )
{
uint next = 0;
if ( get_sub_group_local_id() == 0 )
next = GRL_ATOMIC_INC( &queue->num_consumed );
return sub_group_broadcast( next, 0 );
}
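// Added note on the queue protocol used below: in the first phase each work item that
// decides its instance needs opening appends one entry with an atomic increment of
// num_produced; after the barrier, each subgroup repeatedly claims entries through
// num_consumed until the queue drains. Every work item enqueues at most one entry and
// OPEN_QUEUE_SIZE equals the work-group size, so the queue cannot overflow.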
GRL_INLINE void do_rebraid_build_primrefs(
local struct AABB3f* SLM_CentroidBounds,
local OpenQueue* SLM_Q,
global struct Globals* globals,
global struct BVHBase* base,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
global uint* rebraid_scratch,
global struct AABB* primref_buffer,
uint extra_primref_count,
uint num_instances)
{
varying uint instanceID = get_sub_group_size() * get_sub_group_global_id() + get_sub_group_local_id();
uniform volatile global uint* primref_counter = &globals->numPrimitives;
uniform RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID );
uniform uint available_splits = get_num_splits( extra_primref_count, NUM_CHILDREN );
varying struct AABB3f centroidBounds;
AABB3f_init( &centroidBounds );
if ( get_local_id( 0 ) == 0 )
{
SLM_Q->num_produced = 0;
SLM_Q->num_consumed = 0;
AABB3f_init( SLM_CentroidBounds );
}
barrier( CLK_LOCAL_MEM_FENCE );
// assign splits to unopened instances. Build primrefs for unsplit instances in vectorized form
varying uint num_splits = 0;
if ( instanceID < num_instances )
{
num_splits = get_instance_split_count( buffers, instanceID, available_splits );
if ( num_splits == 0 )
{
varying global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID;
varying global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure;
if ( bvh_base != 0 )
{
build_unopened_primref( &centroidBounds, bvh_base, primref_counter, primref_buffer, instance->Transform, instanceID, 0.0f, GRL_get_InstanceMask(instance));
}
}
else
{
// defer opened instances
uint place = GRL_ATOMIC_INC( &SLM_Q->num_produced );
SLM_Q->Q[place].instanceID = instanceID;
SLM_Q->Q[place].num_splits = (ushort)num_splits;
}
}
barrier( CLK_LOCAL_MEM_FENCE );
// if there were opened instances, process them, one per subgroup
uniform uint num_produced = SLM_Q->num_produced;
uniform uint next = SUBGROUP_GetNextQueueEntry( SLM_Q );
while ( next < num_produced )
{
uniform uint instanceID = SLM_Q->Q[next].instanceID;
uniform uint num_splits = SLM_Q->Q[next].num_splits;
uniform global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID;
float transformOverhead =
#if FINE_TRANSFORM_NODE_BOX
transformation_bbox_surf_overhead(instance->Transform);
#else
0.0f;
#endif
SUBGROUP_open_nodes(
&base->Meta.bounds,
num_splits,
instance,
instanceID,
primref_counter,
primref_buffer,
&centroidBounds,
transformOverhead);
next = SUBGROUP_GetNextQueueEntry( SLM_Q );
}
// reduce the centroid bounds AABB
struct AABB3f reduced = AABB3f_sub_group_reduce( &centroidBounds );
if ( get_sub_group_local_id() == 0 )
AABB3f_atomic_merge_localBB_nocheck( SLM_CentroidBounds, &reduced );
barrier( CLK_LOCAL_MEM_FENCE );
if( get_local_id(0) == 0 )
{
atomic_min( (global float*) (&globals->centroidBounds.lower) + 0, SLM_CentroidBounds->lower[0] );
atomic_min( (global float*) (&globals->centroidBounds.lower) + 1, SLM_CentroidBounds->lower[1] );
atomic_min( (global float*) (&globals->centroidBounds.lower) + 2, SLM_CentroidBounds->lower[2] );
atomic_max( (global float*) (&globals->centroidBounds.upper) + 0, SLM_CentroidBounds->upper[0] );
atomic_max( (global float*) (&globals->centroidBounds.upper) + 1, SLM_CentroidBounds->upper[1] );
atomic_max( (global float*) (&globals->centroidBounds.upper) + 2, SLM_CentroidBounds->upper[2] );
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) )
__attribute__( (intel_reqd_sub_group_size( 16 )) )
void kernel rebraid_build_primrefs(
global struct Globals* globals,
global struct BVHBase* base,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
global uint* rebraid_scratch,
global struct AABB* primref_buffer,
uint extra_primref_count,
uint num_instances)
{
local struct AABB3f SLM_CentroidBounds;
local OpenQueue SLM_Q;
do_rebraid_build_primrefs(
&SLM_CentroidBounds,
&SLM_Q,
globals,
base,
instance_buffer,
rebraid_scratch,
primref_buffer,
extra_primref_count,
num_instances);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) )
__attribute__( (intel_reqd_sub_group_size( 16 )) )
void kernel rebraid_build_primrefs_indirect(
global struct Globals* globals,
global struct BVHBase* base,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
global uint* rebraid_scratch,
global struct AABB* primref_buffer,
global struct IndirectBuildRangeInfo const * const indirect_data,
uint extra_primref_count )
{
local struct AABB3f SLM_CentroidBounds;
local OpenQueue SLM_Q;
instance_buffer = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
(((global char*)instance_buffer) + indirect_data->primitiveOffset);
do_rebraid_build_primrefs(
&SLM_CentroidBounds,
&SLM_Q,
globals,
base,
instance_buffer,
rebraid_scratch,
primref_buffer,
extra_primref_count,
indirect_data->primitiveCount);
}
///////////////////////////////////////////////////////////////////////////////////////////
// Misc
///////////////////////////////////////////////////////////////////////////////////////////
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
ISA_TEST(global InternalNode *n, global uint *out, global float *xform, float scale)
{
out[get_sub_group_local_id()] = InternalNode_IsChildValid(n, get_sub_group_local_id());
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__( (reqd_work_group_size( 1, 1, 1 )) ) void kernel
DEBUG_PRINT(
global struct Globals* globals,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
global uint* rebraid_scratch,
global struct AABB* primref_buffer,
dword num_extra,
dword input_instances )
{
#if 0
// validate primrefs
if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
{
uint refs = globals->numPrimitives;
for ( uint i = 0; i < refs; i++ )
{
if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) ||
any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) ||
any( isnan(primref_buffer[i].lower.xyz) ) ||
any( isnan(primref_buffer[i].upper.xyz) ) )
{
struct AABB box = primref_buffer[i];
printf( "BAD BOX: %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ),
box.lower.x, box.lower.y, box.lower.z,
box.upper.x, box.upper.y, box.upper.z,
as_uint( box.lower.w ) );
}
const uint instIndex = PRIMREF_instanceID(&primref_buffer[i]); // TODO: Refactor me. We should not be using struct AABB for primRefs
const uint rootByteOffset = as_uint( primref_buffer[i].upper.w ); // It should be struct PrimRef
if ( instIndex >= input_instances )
printf( "BAD INSTANCE INDEX: %u", i );
else
{
global struct BVHBase* blas = (global struct BVHBase*)instance_buffer[instIndex].AccelerationStructure;
if ( blas )
{
struct InternalNode* start = BVHBase_GetInternalNodes( blas );
struct InternalNode* end = BVHBase_GetInternalNodesEnd( blas );
InternalNode* entryPoint = (struct InternalNode*)((char*)instance_buffer[instIndex].AccelerationStructure + rootByteOffset);
if ( entryPoint < start || entryPoint >= end )
printf( "BAD ENTRYPOINT: %u\n", i );
if ( (rootByteOffset & 63) != 0 )
printf( "MISALIGNED ENTRYPOInt: %u\n", i );
}
}
}
}
#endif
#if 0
if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
printf( "REBRAIDED: %u\n", globals->numPrimitives );
// print instance bin information
if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
{
printf( "REBRAIDED: %u\n", globals->numPrimitives );
for( uint i=0; i<231; i++ )
{
RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch,i );
printf( " ID:%4u ", i );
for ( uint j = 0; j < NUM_REBRAID_BINS; j++ )
{
global uint* count = buffers.instance_bin_counts;
printf( " %2u ", count[j] );
}
printf( "\n" );
}
}
#endif
#if 0
if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
{
printf( "Instances: %u\n", globals->numPrimitives );
for ( uint i = 0; i < globals->numPrimitives; i++ )
{
if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) ||
any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) )
{
struct AABB box = primref_buffer[i];
printf( " %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ),
box.lower.x, box.lower.y, box.lower.z,
box.upper.x, box.upper.y, box.upper.z,
as_uint( box.lower.w ) );
}
}
}
#endif
}