diff --git a/src/intel/vulkan/grl/gpu/AABB.h b/src/intel/vulkan/grl/gpu/AABB.h new file mode 100644 index 00000000000..11d848e3c09 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/AABB.h @@ -0,0 +1,450 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "shared.h" +#include "intrinsics.h" +#ifndef __OPENCL_VERSION__ +#include "stdio.h" +#endif + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +/* ====== QUAD ENCODING config ====== */ + +#define QUAD_GEOMID_BITS 27 // dxr limit is 2^24 geos... we have headroom +#define QUAD_PRIMID_DIFF_BITS (32 - QUAD_GEOMID_BITS) +#define QUAD_GEOMID_MASK ((1<lower = (float4)(INFINITY, INFINITY, INFINITY, 0); + aabb->upper = -(float4)(INFINITY, INFINITY, INFINITY, 0); +} + +GRL_INLINE uint PRIMREF_geomID( PrimRef* aabb) +{ + const uint v = as_uint(aabb->lower.w); + return v & QUAD_GEOMID_MASK; +} + +GRL_INLINE uint PRIMREF_primID0( PrimRef* aabb) +{ + return as_uint( aabb->upper.w ) & QUAD_PRIMID_MASK; +} + +GRL_INLINE uint PRIMREF_primID1( PrimRef* aabb) +{ + const uint v = as_uint(aabb->lower.w); + const uint primID0 = as_uint(aabb->upper.w) & QUAD_PRIMID_MASK; + const uint deltaID = v >> QUAD_GEOMID_BITS; + const uint primID1 = primID0 + deltaID; + return primID1; +} + +GRL_INLINE uint PRIMREF_geomFlags( PrimRef* aabb ) +{ + const uint v = as_uint( aabb->upper.w ); + return (v >> QUAD_PRIMID_BITS) ; +} + +GRL_INLINE uint PRIMREF_instanceIndex( PrimRef* aabb ) +{ + return as_uint(aabb->lower.w) & INSTANCE_ID_MASK; +} + +GRL_INLINE uchar PRIMREF_instanceMask( PrimRef* aabb ) +{ + return as_uint(aabb->lower.w) >> INSTANCE_ID_BITS; +} + +GRL_INLINE void PRIMREF_setProceduralMetaData( PrimRef* primref, uint geomID, uint primID, uint geomFlags ) +{ + /* encode geomID, primID */ + uint flags = (geomFlags << QUAD_PRIMID_BITS); + primref->lower.w = as_float( geomID ); + primref->upper.w = as_float( primID | flags ); +} + +GRL_INLINE void PRIMREF_setQuadMetaData( PrimRef* primref, uint primID0, uint primID1, uint geomID, uint geomFlags ) +{ + const uint primID_diff = primID1 - primID0; + uint flags = geomFlags << QUAD_PRIMID_BITS; + + primref->lower.w = as_float( geomID | (primID_diff << QUAD_GEOMID_BITS) ); + primref->upper.w = as_float( (primID0 | flags) ); +} + +GRL_INLINE void PRIMREF_setAABB( PrimRef* primref, float3 lower, float3 upper ) +{ + primref->lower.xyz = lower.xyz; + primref->upper.xyz = upper.xyz; +} + +GRL_INLINE PrimRef PRIMREF_set_instance( float3 lower, float3 upper, uint instanceIndex, uint instanceMask, uint rootOffset, bool is_procedural ) +{ + PrimRef new_ref; + new_ref.lower.xyz = lower; + new_ref.lower.w = as_float(instanceIndex | (instanceMask << 24)); + new_ref.upper.xyz = upper; + new_ref.upper.w = as_float(rootOffset + (is_procedural? 
0x80000000 : 0)); + return new_ref; +} + +GRL_INLINE bool PRIMREF_isProceduralInstance( PrimRef* primref ) +{ + return (as_uint(primref->upper.w) & 0x80000000) != 0; +} + +GRL_INLINE uint PRIMREF_instanceRootNodeOffset(PrimRef* primref) +{ + return (as_uint(primref->upper.w) & 0x7fffffff); +} + +GRL_INLINE float3 PRIMREF_lower( PrimRef* primref ) +{ + return primref->lower.xyz; +} +GRL_INLINE float3 PRIMREF_upper( PrimRef* primref ) +{ + return primref->upper.xyz; +} + +GRL_INLINE void AABB_extend(struct AABB *aabb, struct AABB *v) +{ + aabb->lower = min(aabb->lower, v->lower); + aabb->upper = max(aabb->upper, v->upper); +} + +GRL_INLINE void AABB_extend_point(struct AABB *aabb, const float4 p) +{ + aabb->lower = min(aabb->lower, p); + aabb->upper = max(aabb->upper, p); +} + +GRL_INLINE void AABB_extendlu(struct AABB *aabb, const float4 lower, const float4 upper) +{ + aabb->lower = min(aabb->lower, lower); + aabb->upper = max(aabb->upper, upper); +} + +GRL_INLINE struct AABB AABB_enlarge(struct AABB *aabb, const float v) +{ + struct AABB box; + box.lower = aabb->lower - (float4)v; + box.upper = aabb->upper + (float4)v; + return box; +} + +GRL_INLINE void AABB_intersect(struct AABB *aabb, struct AABB *v) +{ + aabb->lower = max(aabb->lower, v->lower); + aabb->upper = min(aabb->upper, v->upper); +} + +GRL_INLINE float4 AABB_size(struct AABB *aabb) +{ + return aabb->upper - aabb->lower; +} + +GRL_INLINE float4 AABB_centroid2(struct AABB *aabb) +{ + return aabb->lower + aabb->upper; +} + +GRL_INLINE float AABB_halfArea(struct AABB *aabb) +{ + const float4 d = AABB_size(aabb); + return halfarea(d.xyz); +} + +GRL_INLINE float AABB_intersecion_size(struct AABB* aabb, struct AABB* v) +{ + struct AABB temp = *aabb; + AABB_intersect(&temp, v); + float4 len = AABB_size(&temp); + float ret = 0.0f; + if (len.x >= 0.0f && len.y >= 0.0f && len.z >= 0.0f) { + float3 v = { len.x, len.y, len.z }; + ret = halfarea(v); + } + return ret; +} + +GRL_INLINE bool AABB_subset(struct AABB* small, struct AABB* big) +{ + const int4 b0 = small->lower >= big->lower; + const int4 b1 = small->upper <= big->upper; + const int4 b = b0 & b1; + return b.x & b.y & b.z; +} + +GRL_INLINE struct AABB AABBfromAABB3f(const struct AABB3f box) +{ + struct AABB box4d = { + {box.lower[0], box.lower[1], box.lower[2], 0.0f}, + {box.upper[0], box.upper[1], box.upper[2], 0.0f} + }; + return box4d; +} + +GRL_INLINE struct AABB3f AABB3fFromAABB(const struct AABB box) +{ + struct AABB3f box3d = { + {box.lower[0], box.lower[1], box.lower[2]}, + {box.upper[0], box.upper[1], box.upper[2]} + }; + return box3d; +} + +GRL_INLINE bool AABB_verify(struct AABB* aabb) +{ + bool error = false; + if (aabb->lower.x > aabb->upper.x) + error = true; + if (aabb->lower.y > aabb->upper.y) + error = true; + if (aabb->lower.z > aabb->upper.z) + error = true; + if (!isfinite(aabb->lower.x)) + error = true; + if (!isfinite(aabb->lower.y)) + error = true; + if (!isfinite(aabb->lower.z)) + error = true; + if (!isfinite(aabb->upper.x)) + error = true; + if (!isfinite(aabb->upper.y)) + error = true; + if (!isfinite(aabb->upper.z)) + error = true; + return error; +} + +GRL_INLINE void AABB_print(struct AABB* aabb) +{ + printf("AABB {\n area = %f\n lower = %f\n upper = %f\n geomID = %i primID0 = %i primID1 = %i\n aabb->lower.w = %x aabb->upper.w = %x }\n", + AABB_halfArea(aabb), + aabb->lower.xyz, + aabb->upper.xyz, + PRIMREF_geomID(aabb), + PRIMREF_primID0(aabb), + PRIMREF_primID1(aabb), + as_uint(aabb->lower.w), + as_uint(aabb->upper.w)); +} + +#ifdef 
__OPENCL_VERSION__ + +GRL_INLINE PrimRef PrimRef_sub_group_shuffle(PrimRef* primRef, const uint slotID) +{ + PrimRef shuffledPrimref; + shuffledPrimref.lower.x = intel_sub_group_shuffle(primRef->lower.x, slotID); + shuffledPrimref.lower.y = intel_sub_group_shuffle(primRef->lower.y, slotID); + shuffledPrimref.lower.z = intel_sub_group_shuffle(primRef->lower.z, slotID); + shuffledPrimref.lower.w = intel_sub_group_shuffle(primRef->lower.w, slotID); + shuffledPrimref.upper.x = intel_sub_group_shuffle(primRef->upper.x, slotID); + shuffledPrimref.upper.y = intel_sub_group_shuffle(primRef->upper.y, slotID); + shuffledPrimref.upper.z = intel_sub_group_shuffle(primRef->upper.z, slotID); + shuffledPrimref.upper.w = intel_sub_group_shuffle(primRef->upper.w, slotID); + return shuffledPrimref; +} + +GRL_INLINE struct AABB AABB_sub_group_broadcast(struct AABB *aabb, const uint slotID) +{ + struct AABB bounds; + bounds.lower.x = sub_group_broadcast(aabb->lower.x, slotID); + bounds.lower.y = sub_group_broadcast(aabb->lower.y, slotID); + bounds.lower.z = sub_group_broadcast(aabb->lower.z, slotID); + bounds.lower.w = 0; + bounds.upper.x = sub_group_broadcast(aabb->upper.x, slotID); + bounds.upper.y = sub_group_broadcast(aabb->upper.y, slotID); + bounds.upper.z = sub_group_broadcast(aabb->upper.z, slotID); + bounds.upper.w = 0; + return bounds; +} +GRL_INLINE struct AABB AABB_sub_group_shuffle(struct AABB* aabb, const uint slotID) +{ + struct AABB bounds; + bounds.lower.x = intel_sub_group_shuffle(aabb->lower.x, slotID); + bounds.lower.y = intel_sub_group_shuffle(aabb->lower.y, slotID); + bounds.lower.z = intel_sub_group_shuffle(aabb->lower.z, slotID); + bounds.lower.w = 0; + bounds.upper.x = intel_sub_group_shuffle(aabb->upper.x, slotID); + bounds.upper.y = intel_sub_group_shuffle(aabb->upper.y, slotID); + bounds.upper.z = intel_sub_group_shuffle(aabb->upper.z, slotID); + bounds.upper.w = 0; + return bounds; +} + +GRL_INLINE uint AABB_sub_group_shuffle_coordPerLane(struct AABB* aabb, const uint slotID) +{ + float coordData[8] = { + sub_group_broadcast(aabb->lower.x, slotID), + sub_group_broadcast(aabb->lower.y, slotID), + sub_group_broadcast(aabb->lower.z, slotID), + sub_group_broadcast(aabb->lower.w, slotID), + sub_group_broadcast(aabb->upper.x, slotID), + sub_group_broadcast(aabb->upper.y, slotID), + sub_group_broadcast(aabb->upper.z, slotID), + sub_group_broadcast(aabb->upper.w, slotID) }; + + uint coordDataFiltered; + const uint lane = get_sub_group_local_id(); + if (lane < 8) coordDataFiltered = as_uint(coordData[lane]); + return coordDataFiltered; +} + +GRL_INLINE struct AABB AABB_sub_group_reduce(struct AABB *aabb) +{ + struct AABB bounds; + bounds.lower.x = sub_group_reduce_min(aabb->lower.x); + bounds.lower.y = sub_group_reduce_min(aabb->lower.y); + bounds.lower.z = sub_group_reduce_min(aabb->lower.z); + bounds.lower.w = 0; + bounds.upper.x = sub_group_reduce_max(aabb->upper.x); + bounds.upper.y = sub_group_reduce_max(aabb->upper.y); + bounds.upper.z = sub_group_reduce_max(aabb->upper.z); + bounds.upper.w = 0; + return bounds; +} + + +GRL_INLINE struct AABB AABB_sub_group_reduce_N6( struct AABB* aabb ) +{ + float3 l = aabb->lower.xyz; + float3 u = aabb->upper.xyz; + l = min( l, intel_sub_group_shuffle_down( l, l, 4 ) ); + l = min( l, intel_sub_group_shuffle_down( l, l, 2 ) ); + l = min( l, intel_sub_group_shuffle_down( l, l, 1 ) ); + u = max( u, intel_sub_group_shuffle_down( u, u, 4 ) ); + u = max( u, intel_sub_group_shuffle_down( u, u, 2 ) ); + u = max( u, intel_sub_group_shuffle_down( u, u, 1 ) 
); + + struct AABB bounds; + bounds.lower.x = l.x; + bounds.lower.y = l.y; + bounds.lower.z = l.z; + bounds.lower.w = 0; + bounds.upper.x = u.x; + bounds.upper.y = u.y; + bounds.upper.z = u.z; + bounds.upper.w = 0; + return bounds; +} + + +GRL_INLINE struct AABB AABB_work_group_reduce(struct AABB *aabb) +{ + struct AABB bounds; + bounds.lower.x = work_group_reduce_min(aabb->lower.x); + bounds.lower.y = work_group_reduce_min(aabb->lower.y); + bounds.lower.z = work_group_reduce_min(aabb->lower.z); + bounds.upper.x = work_group_reduce_max(aabb->upper.x); + bounds.upper.y = work_group_reduce_max(aabb->upper.y); + bounds.upper.z = work_group_reduce_max(aabb->upper.z); + return bounds; +} + +GRL_INLINE struct AABB AABB_sub_group_scan_exclusive_min_max(struct AABB *aabb) +{ + struct AABB bounds; + bounds.lower.x = sub_group_scan_exclusive_min(aabb->lower.x); + bounds.lower.y = sub_group_scan_exclusive_min(aabb->lower.y); + bounds.lower.z = sub_group_scan_exclusive_min(aabb->lower.z); + bounds.lower.w = 0; + bounds.upper.x = sub_group_scan_exclusive_max(aabb->upper.x); + bounds.upper.y = sub_group_scan_exclusive_max(aabb->upper.y); + bounds.upper.z = sub_group_scan_exclusive_max(aabb->upper.z); + bounds.upper.w = 0; + return bounds; +} + +GRL_INLINE struct AABB AABB_sub_group_scan_inclusive_min_max(struct AABB *aabb) +{ + struct AABB bounds; + bounds.lower.x = sub_group_scan_inclusive_min(aabb->lower.x); + bounds.lower.y = sub_group_scan_inclusive_min(aabb->lower.y); + bounds.lower.z = sub_group_scan_inclusive_min(aabb->lower.z); + bounds.lower.w = 0; + bounds.upper.x = sub_group_scan_inclusive_max(aabb->upper.x); + bounds.upper.y = sub_group_scan_inclusive_max(aabb->upper.y); + bounds.upper.z = sub_group_scan_inclusive_max(aabb->upper.z); + bounds.upper.w = 0; + return bounds; +} + +GRL_INLINE void AABB_global_atomic_merge(global struct AABB *global_aabb, struct AABB *aabb) +{ + atomic_min((volatile __global float *)&global_aabb->lower + 0, aabb->lower.x); + atomic_min((volatile __global float *)&global_aabb->lower + 1, aabb->lower.y); + atomic_min((volatile __global float *)&global_aabb->lower + 2, aabb->lower.z); + atomic_max((volatile __global float *)&global_aabb->upper + 0, aabb->upper.x); + atomic_max((volatile __global float *)&global_aabb->upper + 1, aabb->upper.y); + atomic_max((volatile __global float *)&global_aabb->upper + 2, aabb->upper.z); +} + +GRL_INLINE void AABB_global_atomic_merge_lu(global struct AABB* global_aabb, float3 lower, float3 upper ) +{ + atomic_min((volatile __global float*) & global_aabb->lower + 0, lower.x); + atomic_min((volatile __global float*) & global_aabb->lower + 1, lower.y); + atomic_min((volatile __global float*) & global_aabb->lower + 2, lower.z); + atomic_max((volatile __global float*) & global_aabb->upper + 0, upper.x); + atomic_max((volatile __global float*) & global_aabb->upper + 1, upper.y); + atomic_max((volatile __global float*) & global_aabb->upper + 2, upper.z); +} + +GRL_INLINE void AABB_global_atomic_merge_sub_group_lu(uniform global struct AABB* aabb, float3 lower, float3 upper) +{ + uint lane = get_sub_group_local_id(); + float l[3]; + l[0] = sub_group_reduce_min(lower.x); + l[1] = sub_group_reduce_min(lower.y); + l[2] = sub_group_reduce_min(lower.z); + float u[3]; + u[0] = sub_group_reduce_max(upper.x); + u[1] = sub_group_reduce_max(upper.y); + u[2] = sub_group_reduce_max(upper.z); + + if (lane < 3) + { + atomic_min((global float*)&aabb->lower + lane, l[lane]); + atomic_max((global float*)&aabb->upper + lane, u[lane]); + } +} + + 
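A minimal usage sketch, assuming an OpenCL caller of the helpers above (the kernel name and the `prim_bounds`/`num_prims` parameters are illustrative, not part of GRL): each work-item folds one primitive's bounds into a private AABB, the sub-group reduces once, and a single lane publishes the result with the atomic merge.

// Illustrative caller of the AABB helpers defined above; not GRL code.
kernel void reduce_scene_bounds( global struct AABB* scene_bounds,
                                 global struct AABB* prim_bounds,
                                 uint num_prims )
{
    uint tid = get_global_id(0);

    struct AABB box;
    AABB_init( &box );                    // empty box: +inf lower, -inf upper

    if ( tid < num_prims )
    {
        struct AABB prim = prim_bounds[tid];  // copy to private memory
        AABB_extend( &box, &prim );           // grow by this work-item's primitive
    }

    // one min/max reduction per sub-group, then a single global atomic merge
    struct AABB sg_box = AABB_sub_group_reduce( &box );
    if ( get_sub_group_local_id() == 0 )
        AABB_global_atomic_merge( scene_bounds, &sg_box );
}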
+GRL_INLINE void AABB_local_atomic_merge(local struct AABB *aabb, const float4 lower, const float4 upper) +{ + if (lower.x < aabb->lower.x) + atomic_min((local float *)&aabb->lower + 0, lower.x); + if (lower.y < aabb->lower.y) + atomic_min((local float *)&aabb->lower + 1, lower.y); + if (lower.z < aabb->lower.z) + atomic_min((local float *)&aabb->lower + 2, lower.z); + if (upper.x > aabb->upper.x) + atomic_max((local float *)&aabb->upper + 0, upper.x); + if (upper.y > aabb->upper.y) + atomic_max((local float *)&aabb->upper + 1, upper.y); + if (upper.z > aabb->upper.z) + atomic_max((local float *)&aabb->upper + 2, upper.z); +} +#endif + +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/api_interface.h b/src/intel/vulkan/grl/gpu/api_interface.h new file mode 100644 index 00000000000..71a1fff6327 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/api_interface.h @@ -0,0 +1,840 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once +#include "GRLStructs.h" +#include "shared.h" +#include "libs/lsc_intrinsics.h" + +typedef struct Geo GRL_RAYTRACING_GEOMETRY_DESC; + +typedef struct GRL_RAYTRACING_AABB +{ + float MinX; + float MinY; + float MinZ; + float MaxX; + float MaxY; + float MaxZ; +} GRL_RAYTRACING_AABB; + +GRL_INLINE void GLR_set_raytracing_aabb(GRL_RAYTRACING_AABB* dest, struct AABB* source) +{ + dest->MinX = source->lower.x; + dest->MinY = source->lower.y; + dest->MinZ = source->lower.z; + dest->MaxX = source->upper.x; + dest->MaxY = source->upper.y; + dest->MaxZ = source->upper.z; +} + +GRL_INLINE uint3 GRL_load_triangle(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint triID) +{ + global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer; + uint index_format = geomDesc->Desc.Triangles.IndexFormat; + + if (index_format == INDEX_FORMAT_R32_UINT) + { + const uint* data = (const uint*)(indices + triID * 3 * 4); + return (uint3)(data[0], data[1], data[2]); + } + else if (index_format == INDEX_FORMAT_NONE) + { + return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2); + } + else + { + const ushort* data = (const ushort*)(indices + triID * 3 * 2); + return (uint3)(data[0], data[1], data[2]); + } +} + +GRL_INLINE uint3 GRL_load_indices_from_buffer(global char* indices, const uint index_format, const uint triID) +{ + if (index_format == INDEX_FORMAT_R32_UINT) + { + return load_uint3_L1C_L3C((global uint3*)(indices + triID * 3 * 4), 0); + } + else if (index_format == INDEX_FORMAT_NONE) + { + return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2); + } + else + { + const ushort* data = (const ushort*)(indices + triID * 3 * 2); + return (uint3)(data[0], data[1], data[2]); + } +} + +// Load all 3 indices from one triangle, and a single index from another +GRL_INLINE uint4 GRL_load_quad_indices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint triID, uint triID_1, ushort fourth_vert) +{ + global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer; + uint index_format = geomDesc->Desc.Triangles.IndexFormat; + + if (index_format == INDEX_FORMAT_R32_UINT) + { + const uint* data0 = (const uint*)(indices + triID * 3 * 4); + const uint* data1 = (const uint*)(indices + triID_1 * 3 * 4); + return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]); + } + else if (index_format == INDEX_FORMAT_NONE) + { + return (uint4)(triID * 3, triID * 3 + 1, triID * 3 + 2, triID_1 * 3 + fourth_vert); + } + else + { + const ushort* data0 = (const 
ushort*)(indices + triID * 3 * 2); + const ushort* data1 = (const ushort*)(indices + triID_1 * 3 * 2); + return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]); + } +} + +GRL_INLINE void GRL_set_Type(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, GeometryType type) +{ + geomDesc->Type = type; +} + +GRL_INLINE GeometryType GRL_get_Type(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Type; +} + +GRL_INLINE void GRL_set_Flags(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint8_t flags) +{ + geomDesc->Flags = flags; +} + +GRL_INLINE uint8_t GRL_get_Flags(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Flags; +} + +GRL_INLINE void GRL_set_triangles_Transform(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t transform) +{ + geomDesc->Desc.Triangles.pTransformBuffer = transform; +} + +GRL_INLINE gpuva_t GRL_get_triangles_Transform(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.pTransformBuffer; +} + +GRL_INLINE void GRL_set_triangles_IndexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, IndexFormat format) +{ + geomDesc->Desc.Triangles.IndexFormat = format; +} + +GRL_INLINE IndexFormat GRL_get_triangles_IndexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.IndexFormat; +} + +GRL_INLINE void GRL_set_triangles_VertexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, VertexFormat format) +{ + geomDesc->Desc.Triangles.VertexFormat = format; +} + +GRL_INLINE VertexFormat GRL_get_triangles_VertexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.VertexFormat; +} + +GRL_INLINE void GRL_set_triangles_IndexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count) +{ + geomDesc->Desc.Triangles.IndexCount = count; +} + +GRL_INLINE dword GRL_get_triangles_IndexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.IndexCount; +} + +GRL_INLINE void GRL_set_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count) +{ + geomDesc->Desc.Triangles.VertexCount = count; +} + +GRL_INLINE dword GRL_get_triangles_VertexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.VertexCount; +} + +GRL_INLINE void GRL_set_triangles_IndexBuffer(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t buffer) +{ + geomDesc->Desc.Triangles.pIndexBuffer = buffer; +} + +GRL_INLINE gpuva_t GRL_get_triangles_IndexBuffer(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.pIndexBuffer; +} + +GRL_INLINE void GRL_set_triangles_VertexBuffer_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address) +{ + geomDesc->Desc.Triangles.pVertexBuffer = address; +} + +GRL_INLINE gpuva_t GRL_get_triangles_VertexBuffer_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.pVertexBuffer; +} + +GRL_INLINE void GRL_set_triangles_VertexBuffer_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, unsigned long stride) +{ + geomDesc->Desc.Triangles.VertexBufferByteStride = stride; +} + +GRL_INLINE unsigned long GRL_get_triangles_VertexBuffer_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Triangles.VertexBufferByteStride; +} + +GRL_INLINE unsigned long GRL_get_triangles_IndexFormatSizeInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return (unsigned long)(geomDesc->Desc.Triangles.IndexFormat); +} + +GRL_INLINE void GRL_set_procedurals_AABBCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count) +{ + 
geomDesc->Desc.Procedural.AABBCount = count; +} + +GRL_INLINE dword GRL_get_procedurals_AABBCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Procedural.AABBCount; +} + +GRL_INLINE void GRL_set_procedurals_AABBs_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address) +{ + geomDesc->Desc.Procedural.pAABBs_GPUVA = address; +} + +GRL_INLINE gpuva_t GRL_get_procedurals_AABBs_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Procedural.pAABBs_GPUVA; +} + +GRL_INLINE void GRL_set_procedurals_AABBs_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, qword stride) +{ + geomDesc->Desc.Procedural.AABBByteStride = stride; +} + +GRL_INLINE qword GRL_get_procedurals_AABBs_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) +{ + return geomDesc->Desc.Procedural.AABBByteStride; +} + +GRL_INLINE uint GRL_is_procedural(GRL_RAYTRACING_GEOMETRY_DESC* desc) +{ + return desc->Type == (unsigned char)GEOMETRY_TYPE_PROCEDURAL; +} + +GRL_INLINE uint GRL_is_triangle(GRL_RAYTRACING_GEOMETRY_DESC* desc) +{ + return desc->Type != (unsigned char)GEOMETRY_TYPE_PROCEDURAL; +} + +GRL_INLINE unsigned int GRL_get_ShaderIndex_Mask(GRL_RAYTRACING_GEOMETRY_DESC* desc) +{ + return 0x00FFFFFF; +} + +GRL_INLINE dword GRL_atomic_add_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* desc, dword value) +{ + return atomic_add((global uint*) & desc->Desc.Triangles.VertexCount, value); +} + +GRL_INLINE unsigned int GRL_get_primitive_count(GRL_RAYTRACING_GEOMETRY_DESC* desc) +{ + if (GRL_is_triangle(desc)) + { + if (desc->Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE) + { + return desc->Desc.Triangles.VertexCount / 3; + } + else + { + return desc->Desc.Triangles.IndexCount / 3; + } + } + else + { + return desc->Desc.Procedural.AABBCount; + } +} + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable // to leaf half values + +GRL_INLINE float snorm_to_float(short v) +{ + return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 32767.0f))); // FIXME: do we have intrinsic for this? +} + +GRL_INLINE float snorm8_to_float(signed char v) +{ + return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 127.0f))); // FIXME: do we have intrinsic for this? +} + +GRL_INLINE float unorm_to_float(unsigned short v) +{ + return min(1.0f, max(0.0f, ((float)v) * (1.0f / 65535.0f))); // FIXME: do we have intrinsic for this? +} + +//only lower 10 bits of v are used +GRL_INLINE float unorm10_to_float(unsigned v) +{ + const unsigned short mask = (unsigned short)((1u << 10u) - 1u); + const unsigned short v10 = (unsigned short)v & mask; + return min(1.0f, max(0.0f, ((float)v10) * (1.0f / 1023.0f))); // FIXME: do we have intrinsic for this? +} + +GRL_INLINE float unorm8_to_float(unsigned char v) +{ + return min(1.0f, max(0.0f, ((float)v) * (1.0f / 255.0f))); // FIXME: do we have intrinsic for this? 
+} + +GRL_INLINE float4 GRL_load_vertex(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint vtxID) +{ + float4 v = (float4)(0, 0, 0, 0); + global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer; + uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride; + uint vertex_format = geomDesc->Desc.Triangles.VertexFormat; + + if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) + { + const float* data = (const float*)(vertices + vtxID * vertex_stride); + v = (float4)(data[0], data[1], data[2], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT) + { + const float* data = (const float*)(vertices + vtxID * vertex_stride); + v = (float4)(data[0], data[1], 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) + { + const half* data = (const half*)(vertices + vtxID * vertex_stride); + v = (float4)(data[0], data[1], data[2], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT) + { + const half* data = (const half*)(vertices + vtxID * vertex_stride); + v = (float4)(data[0], data[1], 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) + { + const short* data = (const short*)(vertices + vtxID * vertex_stride); + v = (float4)(snorm_to_float(data[0]), + snorm_to_float(data[1]), + snorm_to_float(data[2]), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM) + { + const short* data = (const short*)(vertices + vtxID * vertex_stride); + v = (float4)(snorm_to_float(data[0]), + snorm_to_float(data[1]), + 0.0f, + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) + { + const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm_to_float(data[0]), + unorm_to_float(data[1]), + unorm_to_float(data[2]), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM) + { + const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm_to_float(data[0]), + unorm_to_float(data[1]), + 0.0f, + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) + { + const unsigned data = *(const unsigned*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm10_to_float(data), + unorm10_to_float((data >> 10)), + unorm10_to_float((data >> 20)), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) + { + const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm8_to_float(data[0]), + unorm8_to_float(data[1]), + unorm8_to_float(data[2]), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM) + { + const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); + v = (float4)(unorm8_to_float(data[0]), + unorm8_to_float(data[1]), + 0.0f, + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) + { + const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); + v = (float4)(snorm8_to_float(data[0]), + snorm8_to_float(data[1]), + snorm8_to_float(data[2]), + 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM) + { + const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); + v = (float4)(snorm8_to_float(data[0]), + snorm8_to_float(data[1]), + 0.0f, + 0.0f); + } + + /* perform vertex transformation */ + if (geomDesc->Desc.Triangles.pTransformBuffer) + { + global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer; + const float x = xfm[0] * v.x + xfm[1] * v.y + xfm[2] * v.z + 
xfm[3]; + const float y = xfm[4] * v.x + xfm[5] * v.y + xfm[6] * v.z + xfm[7]; + const float z = xfm[8] * v.x + xfm[9] * v.y + xfm[10] * v.z + xfm[11]; + v = (float4)(x, y, z, 0.0f); + } + + return v; +} + +GRL_INLINE void GRL_load_triangle_vertices(global char* vertices, const uint vertex_format, const uint vertex_stride, global float* transform_buffer, const uint vtx0ID, const uint vtx1ID, const uint vtx2ID, float4* out) +{ + if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) + { + const float3 data0 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx0ID * vertex_stride), 0)); + const float3 data1 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx1ID * vertex_stride), 0)); + const float3 data2 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx2ID * vertex_stride), 0)); + out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f); + out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f); + out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT) + { + const float* data0 = (const float*)(vertices + vtx0ID * vertex_stride); + const float* data1 = (const float*)(vertices + vtx1ID * vertex_stride); + const float* data2 = (const float*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f); + out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f); + out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) + { + const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride); + const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride); + const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f); + out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f); + out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT) + { + const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride); + const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride); + const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f); + out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f); + out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) + { + const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride); + const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride); + const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]), 0.0f); + out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]), 0.0f); + out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM) + { + const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride); + const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride); + const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f, 0.0f); + out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f, 0.0f); + out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) + { + const 
unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride); + const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride); + const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]), 0.0f); + out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]), 0.0f); + out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM) + { + const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride); + const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride); + const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f, 0.0f); + out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f, 0.0f); + out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) + { + const unsigned data0 = *(const unsigned*)(vertices + vtx0ID * vertex_stride); + const unsigned data1 = *(const unsigned*)(vertices + vtx1ID * vertex_stride); + const unsigned data2 = *(const unsigned*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm10_to_float(data0), unorm10_to_float(data0 >> 10), unorm10_to_float(data0 >> 20), 0.0f); + out[1] = (float4)(unorm10_to_float(data1), unorm10_to_float(data1 >> 10), unorm10_to_float(data1 >> 20), 0.0f); + out[2] = (float4)(unorm10_to_float(data2), unorm10_to_float(data2 >> 10), unorm10_to_float(data2 >> 20), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); + const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); + const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]), 0.0f); + out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]), 0.0f); + out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); + const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); + const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f, 0.0f); + out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f, 0.0f); + out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f, 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); + const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); + const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]), 0.0f); + out[1] = 
(float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]), 0.0f); + out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); + const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); + const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); + out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f, 0.0f); + out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f, 0.0f); + out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f, 0.0f); + } + + /* perform vertex transformation */ + if (transform_buffer) + { + global float* xfm = (global float*)transform_buffer; + for (uint i = 0; i < 3; ++i) + { + const float x = xfm[0] * out[i].x + xfm[1] * out[i].y + xfm[2] * out[i].z + xfm[3]; + const float y = xfm[4] * out[i].x + xfm[5] * out[i].y + xfm[6] * out[i].z + xfm[7]; + const float z = xfm[8] * out[i].x + xfm[9] * out[i].y + xfm[10] * out[i].z + xfm[11]; + out[i] = (float4)(x, y, z, 0.0f); + } + } +} + +GRL_INLINE void GRL_load_quad_vertices_no_stride(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + float3* out0, float3* out1, float3* out2, float3* out3, + const uint4 vtxID, const uint vertex_format, global char* vertices) +{ + float3 v0, v1, v2, v3; + + if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) + { + const float* data0 = (const float*)(vertices + vtxID.x); + const float* data1 = (const float*)(vertices + vtxID.y); + const float* data2 = (const float*)(vertices + vtxID.z); + const float* data3 = (const float*)(vertices + vtxID.w); + v0 = (float3)(data0[0], data0[1], data0[2]); + v1 = (float3)(data1[0], data1[1], data1[2]); + v2 = (float3)(data2[0], data2[1], data2[2]); + v3 = (float3)(data3[0], data3[1], data3[2]); + } + else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT) + { + const float* data0 = (const float*)(vertices + vtxID.x); + const float* data1 = (const float*)(vertices + vtxID.y); + const float* data2 = (const float*)(vertices + vtxID.z); + const float* data3 = (const float*)(vertices + vtxID.w); + v0 = (float3)(data0[0], data0[1], 0.0f); + v1 = (float3)(data1[0], data1[1], 0.0f); + v2 = (float3)(data2[0], data2[1], 0.0f); + v3 = (float3)(data3[0], data3[1], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) + { + const half* data0 = (const half*)(vertices + vtxID.x); + const half* data1 = (const half*)(vertices + vtxID.y); + const half* data2 = (const half*)(vertices + vtxID.z); + const half* data3 = (const half*)(vertices + vtxID.w); + v0 = (float3)(data0[0], data0[1], data0[2]); + v1 = (float3)(data1[0], data1[1], data1[2]); + v2 = (float3)(data2[0], data2[1], data2[2]); + v3 = (float3)(data3[0], data3[1], data3[2]); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT) + { + const half* data0 = (const half*)(vertices + vtxID.x); + const half* data1 = (const half*)(vertices + vtxID.y); + const half* data2 = (const half*)(vertices + vtxID.z); + const half* data3 = (const half*)(vertices + vtxID.w); + v0 = (float3)(data0[0], data0[1], 0.0f); + v1 = (float3)(data1[0], data1[1], 0.0f); + v2 = (float3)(data2[0], data2[1], 0.0f); + v3 = (float3)(data3[0], data3[1], 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) + { + const short* data0 = (const short*)(vertices + 
vtxID.x); + const short* data1 = (const short*)(vertices + vtxID.y); + const short* data2 = (const short*)(vertices + vtxID.z); + const short* data3 = (const short*)(vertices + vtxID.w); + v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2])); + v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2])); + v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2])); + v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), snorm_to_float(data3[2])); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM) + { + const short* data0 = (const short*)(vertices + vtxID.x); + const short* data1 = (const short*)(vertices + vtxID.y); + const short* data2 = (const short*)(vertices + vtxID.z); + const short* data3 = (const short*)(vertices + vtxID.w); + v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f); + v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f); + v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f); + v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) + { + const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x); + const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y); + const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z); + const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w); + v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2])); + v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2])); + v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2])); + v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), unorm_to_float(data3[2])); + } + else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM) + { + const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x); + const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y); + const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z); + const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w); + v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f); + v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f); + v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f); + v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) + { + const unsigned data0 = *(const unsigned*)(vertices + vtxID.x); + const unsigned data1 = *(const unsigned*)(vertices + vtxID.y); + const unsigned data2 = *(const unsigned*)(vertices + vtxID.z); + const unsigned data3 = *(const unsigned*)(vertices + vtxID.w); + v0 = (float3)(unorm10_to_float(data0), unorm10_to_float((data0 >> 10)), unorm10_to_float((data0 >> 20))); + v1 = (float3)(unorm10_to_float(data1), unorm10_to_float((data1 >> 10)), unorm10_to_float((data1 >> 20))); + v2 = (float3)(unorm10_to_float(data2), unorm10_to_float((data2 >> 10)), unorm10_to_float((data2 >> 20))); + v3 = (float3)(unorm10_to_float(data3), unorm10_to_float((data3 >> 10)), unorm10_to_float((data3 >> 20))); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x); + const unsigned char* data1 = (const unsigned 
char*)(vertices + vtxID.y); + const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z); + const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w); + v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2])); + v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2])); + v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2])); + v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), unorm8_to_float(data3[2])); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM) + { + const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x); + const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y); + const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z); + const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w); + v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f); + v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f); + v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f); + v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), 0.0f); + } + else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) + { + const signed char* data0 = (const signed char*)(vertices + vtxID.x); + const signed char* data1 = (const signed char*)(vertices + vtxID.y); + const signed char* data2 = (const signed char*)(vertices + vtxID.z); + const signed char* data3 = (const signed char*)(vertices + vtxID.w); + v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2])); + v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2])); + v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2])); + v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), snorm8_to_float(data3[2])); + } + else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM) + { + const signed char* data0 = (const signed char*)(vertices + vtxID.x); + const signed char* data1 = (const signed char*)(vertices + vtxID.y); + const signed char* data2 = (const signed char*)(vertices + vtxID.z); + const signed char* data3 = (const signed char*)(vertices + vtxID.w); + v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f); + v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f); + v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f); + v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), 0.0f); + } + else + { + v0 = (float3)(0.0f, 0.0f, 0.0f); + v1 = (float3)(0.0f, 0.0f, 0.0f); + v2 = (float3)(0.0f, 0.0f, 0.0f); + v3 = (float3)(0.0f, 0.0f, 0.0f); + } + + + /* perform vertex transformation */ + if (geomDesc->Desc.Triangles.pTransformBuffer) + { + global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer; + + v0.xyz = (float3)( + xfm[0] * v0.x + xfm[1] * v0.y + xfm[2] * v0.z + xfm[3], + xfm[4] * v0.x + xfm[5] * v0.y + xfm[6] * v0.z + xfm[7], + xfm[8] * v0.x + xfm[9] * v0.y + xfm[10] * v0.z + xfm[11] + ); + + v1.xyz = (float3)( + xfm[0] * v1.x + xfm[1] * v1.y + xfm[2] * v1.z + xfm[3], + xfm[4] * v1.x + xfm[5] * v1.y + xfm[6] * v1.z + xfm[7], + xfm[8] * v1.x + xfm[9] * v1.y + xfm[10] * v1.z + xfm[11] + ); + + v2.xyz = (float3)( + xfm[0] * v2.x + xfm[1] * v2.y + xfm[2] * v2.z + xfm[3], + xfm[4] * v2.x + xfm[5] * v2.y + xfm[6] * v2.z + xfm[7], + xfm[8] * v2.x + xfm[9] * v2.y + 
xfm[10] * v2.z + xfm[11] + ); + + v3.xyz = (float3)( + xfm[0] * v3.x + xfm[1] * v3.y + xfm[2] * v3.z + xfm[3], + xfm[4] * v3.x + xfm[5] * v3.y + xfm[6] * v3.z + xfm[7], + xfm[8] * v3.x + xfm[9] * v3.y + xfm[10] * v3.z + xfm[11] + ); + } + + *out0 = v0; + *out1 = v1; + *out2 = v2; + *out3 = v3; +} + + +GRL_INLINE void GRL_load_quad_vertices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + float3* out0, float3* out1, float3* out2, float3* out3, + uint4 vtxID) +{ + global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer; + uint vertex_format = geomDesc->Desc.Triangles.VertexFormat; + uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride; + + vtxID *= vertex_stride; + + GRL_load_quad_vertices_no_stride(geomDesc, out0, out1, out2, out3, + vtxID, vertex_format, vertices); +} + + +GRL_INLINE GRL_RAYTRACING_AABB GRL_load_aabb(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint primID) +{ + global char* aabb0 = (global char*)geomDesc->Desc.Procedural.pAABBs_GPUVA; + global char* aabb = aabb0 + (primID * geomDesc->Desc.Procedural.AABBByteStride); + return *(global GRL_RAYTRACING_AABB*)aabb; +} + +// same as for d3d12 +typedef struct GRL_RAYTRACING_INSTANCE_DESC +{ + float Transform[12]; + // unsigned int InstanceID : 24; + // unsigned int InstanceMask : 8; + uint32_t DW0; + // unsigned int InstanceContributionToHitGroupIndex : 24; + // unsigned int Flags : 8; + uint32_t DW1; + global char* AccelerationStructure; +} GRL_RAYTRACING_INSTANCE_DESC; + +GRL_INLINE float GRL_get_transform(const GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column) +{ + return d->Transform[row * 4 + column]; +} + +GRL_INLINE uint32_t GRL_get_instanceID(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return d->DW0 & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t GRL_get_InstanceMask(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return d->DW0 >> 24; +} + +GRL_INLINE uint32_t GRL_get_InstanceContributionToHitGroupIndex(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return d->DW1 & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t GRL_get_InstanceFlags(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return d->DW1 >> 24; +} + +GRL_INLINE gpuva_t GRL_get_AccelerationStructure(const GRL_RAYTRACING_INSTANCE_DESC* d) +{ + return (gpuva_t)d->AccelerationStructure; +} + +GRL_INLINE void GRL_set_transform(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column, float value) +{ + d->Transform[row * 4 + column] = value; +} + +GRL_INLINE void GRL_set_instanceID(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t id) +{ + d->DW0 &= 255 << 24; + d->DW0 |= id & ((1 << 24) - 1); +} + +GRL_INLINE void GRL_set_InstanceMask(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t mask) +{ + d->DW0 &= ((1 << 24) - 1); + d->DW0 |= mask << 24; +} + +GRL_INLINE void GRL_set_InstanceContributionToHitGroupIndex(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t contribution) +{ + d->DW1 &= 255 << 24; + d->DW1 |= contribution & ((1 << 24) - 1); +} + +GRL_INLINE void GRL_set_InstanceFlags(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t flags) +{ + d->DW1 &= ((1 << 24) - 1); + d->DW1 |= flags << 24; +} + +GRL_INLINE void GRL_set_AccelerationStructure(GRL_RAYTRACING_INSTANCE_DESC* d, gpuva_t address) +{ + d->AccelerationStructure = (global char*)address; +} diff --git a/src/intel/vulkan/grl/gpu/atomic_update.cl b/src/intel/vulkan/grl/gpu/atomic_update.cl new file mode 100644 index 00000000000..5171a122dc1 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/atomic_update.cl @@ -0,0 +1,1112 @@ +// +// Copyright (C) 
2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "GRLGen12.h" + +#include "bvh_build_refit.h" +#include "bvh_build_treelet_refit.h" + + +struct RefitScratch +{ + float lower[3]; + uint mask; + float upper[3]; + uint _pad; + +}; + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel +init_refit_scratch( + global struct BVHBase* bvh, + global struct RefitScratch* scratch ) +{ + uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); + + if ( tid < BVHBase_GetNumInternalNodes(bvh) ) + { + float4 v = (float4) (FLT_MAX,FLT_MAX,FLT_MAX,0); + store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 0, as_uint4(v) ); + store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 1, as_uint4(v) ); + } +} + +bool is_fat_leaf( InternalNode* curNode ) +{ + return curNode->nodeType != BVH_INTERNAL_NODE; // TODO: Not enough for traversal shaders!! if ts enabled need to check child types +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel +build_fatleaf_table( + global struct BVHBase* bvh ) +{ + uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); + + if ( tid < BVHBase_GetNumInternalNodes(bvh) ) + { + InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid; + + if ( is_fat_leaf(curNode) ) + { + uint offs = atomic_inc_global( &bvh->fatLeafCount ); + + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + uint bp = *InnerNode_GetBackPointer(backPointers, tid); + + LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+offs; + leaf->backpointer = bp; + leaf->inner_node_index = tid; + leaf->leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart; + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel +build_fatleaf_table_new_update( + global struct Globals *globals, + global struct BVHBase* bvh ) +{ + uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); + + if ( tid < BVHBase_GetNumInternalNodes(bvh) ) + { + InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid; + + if ( is_fat_leaf(curNode) ) + { + // This implementation uses fatleaf table structure but it is actually quad table + // Also tested implementation that process 2 fatleafs per SIMD line as we iterate over the children + // but performance was worse + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + uint bp = *InnerNode_GetBackPointer(backPointers, tid); + uint fatLeafTableStart = bvh->fatLeafTableStart; + + uint leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart; + uint numChildren = (bp >> 3) & 0x7; + + uint quad_leaf_table_index = leaf_index; + + // Check if num children is outside of the % 256 work group + // If so, move these cases to the offset after numQuads and push them to the leftovers part + // where fatleaves are stored every 8th pos with additional padding + // This way we will not have the case in leftovers table where single fatleaf has children in 2 separate work groups + + uint prev_group = leaf_index & 255; + uint next_group = (leaf_index + (numChildren - 1)) & 255; + uint slm_pos = prev_group; + bool is_leftover = prev_group > next_group; + + if(is_leftover) + { + LeafTableEntry* leafBase = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index)); + uint numQuads_aligned_256 = (globals->numPrimitives + 255) & ~255; + + uint leftovers_offset = atomic_add_global( &bvh->quadLeftoversCountNewAtomicUpdate, 8 ); + + for(uint 
i = 0; i < BVH_NODE_N6; i++) + { + uint pos = (i < numChildren) ? i : 0; + LeafTableEntry* leaf_null = &leafBase[pos]; + leaf_null->leaf_index = -1 << 3; + } + + quad_leaf_table_index = numQuads_aligned_256 + leftovers_offset; + slm_pos = leftovers_offset & 255; + } + + LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index)); + + for(uint i = 0; i < BVH_NODE_N6; i++) + { + uint pos = (i < numChildren) ? i : 0; + LeafTableEntry* leafCur = &leaf[pos]; + leafCur->backpointer = bp; + leafCur->inner_node_index = (tid << 8) | slm_pos; + leafCur->leaf_index = (leaf_index << 3) | pos; + } + + // Need to clean the unused area where we pad to 8 for leftovers + if(is_leftover) + { + for(uint i = 1; i < 8; i++) + { + uint pos = (i >= numChildren) ? i : 7; + LeafTableEntry* leafCur = &leaf[pos]; + leafCur->leaf_index = -1 << 3; + } + } + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel +build_innernode_table( + global struct BVHBase* bvh ) +{ + uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); + + if ( tid < BVHBase_GetNumInternalNodes(bvh) ) + { + InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid; + + if ( !is_fat_leaf( curNode ) ) + { + uint offs = atomic_inc_global( &bvh->innerCount ); + + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + uint bp = *InnerNode_GetBackPointer(backPointers, tid); + + InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh)+offs; + inner->node_index_and_numchildren = (tid<<3) | ((bp>>3) &7); + inner->first_child = tid + curNode->childOffset; + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) void kernel +fixup_quad_table( + global struct BVHBase* bvh ) +{ + // This kernel has 2 work groups that set the magic number for unused data in + // fatleaf table. One work group for thelast group of the first part where quads are packed, + // second one for the last group of the part where quads are stored padded + + uint numQuads = BVHBase_GetNumQuads(bvh); + uint numQuadLeftovers = bvh->quadLeftoversCountNewAtomicUpdate; + uint numQuadLeftovers_aligned_256 = (numQuadLeftovers + 255) & ~255; + + uint numQuads_aligned_256 = (numQuads + 255) & ~255; + uint quadOffsetEnd = numQuads_aligned_256 + get_group_id(0) * numQuadLeftovers_aligned_256; + uint quadOffsetStart = quadOffsetEnd - 256; + + uint quads_number_last_group = (get_group_id(0) == 0) ? numQuads : numQuads_aligned_256 + numQuadLeftovers; + + uint leftovers = quadOffsetEnd - quads_number_last_group; + + uint tid = get_local_id(0) > (255 - leftovers) ? 
get_local_id(0) : 256 - leftovers; + + if(leftovers != 0) + { + LeafTableEntry* leafBvh = BVHBase_GetFatLeafTable(bvh); + + LeafTableEntry* leaf = &leafBvh[quadOffsetStart + tid]; + leaf->leaf_index = -1 << 3; + } + + if(get_group_id(0) == 1 && get_local_id(0) == 0) + bvh->quadTableSizeNewAtomicUpdate = quadOffsetEnd; +} + + +// updates one quad leaf and gets BBOX contatining it +GRL_INLINE void refit_bottom_child_quad_WB( + global struct QuadLeaf* quad, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + struct AABB* childAABB) +{ + /* get the geomID and primID0/1 for both quad triangles */ + const uint geomID = PrimLeaf_GetGeoIndex(&quad->leafDesc); + const uint primID0 = quad->primIndex0; + const uint primID1 = primID0 + QuadLeaf_GetPrimIndexDelta(quad); + ushort fourth_vert = 0; + + if (primID1 != primID0) + { + ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(quad); + fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert; + fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert; + } + + global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc + geomID; + + uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert); + + // read the indices of the 4 verts we want + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices); + + childAABB->lower.xyz = min( min( vtx0, vtx1 ), min(vtx2,vtx3) ); + childAABB->upper.xyz = max( max( vtx0, vtx1 ), max(vtx2,vtx3) ); + + float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x ); + float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y ); + float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z ); + + global uint4* dst_verts = (global uint4*) &(quad->v[0][0]); + store_uint4_L1WB_L3WB( dst_verts, 0, as_uint4(pack0) ); + store_uint4_L1WB_L3WB( dst_verts, 1, as_uint4(pack1) ); + store_uint4_L1WB_L3WB( dst_verts, 2, as_uint4(pack2) ); +} + +inline uchar4 uchar4_shuffle_down( uchar4 v, uint offs ) +{ + uint vi = as_uint(v); + return as_uchar4(intel_sub_group_shuffle_down(vi,vi,offs)); +} +inline uchar4 uchar4_broadcast( uchar4 v, uint offs ) +{ + uint vi = as_uint(v); + return as_uchar4(sub_group_broadcast(vi,offs)); +} + +GRL_INLINE void sg_InternalNode_setFields( + struct InternalNode* node, + struct AABB reduced_aabb, + const int offset, const uint nodeType, struct AABB* input_aabb, + const uint numChildren, const uchar nodeMask ) +{ + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + struct AABB conservative_aabb = conservativeAABB(&reduced_aabb); + const float3 org = conservative_aabb.lower.xyz; + + const float3 len = AABB_size(&conservative_aabb).xyz * up; + int3 exp; + const float3 mant = frexp_vec3(len, &exp); + exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); + + uchar4 lower_uchar = 0x80; + uchar4 upper_uchar = 0; + + ushort lane = get_sub_group_local_id(); + ushort simd8_id = lane/8; + ushort logical_lane = lane%8; + + if( logical_lane < numChildren ) + { + struct AABB child_aabb = conservativeAABB( input_aabb ); // conservative ??? 
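+            // Quantize this child's box into the parent's local 8-bit grid: offsets are
+            // taken relative to 'org' and scaled by the per-axis exponents computed above,
+            // rounding the lower bound down and the upper bound up so the quantized box
+            // remains conservative.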
+ + float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) ); + lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) ); + float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) ); + upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) ); + lower_uchar.xyz = convert_uchar3_rtn( lower ); + upper_uchar.xyz = convert_uchar3_rtp( upper ); + } + + uchar4 lo0 = lower_uchar; + uchar4 lo1 = uchar4_shuffle_down( lower_uchar, 1 ); + uchar4 lo2 = uchar4_shuffle_down( lower_uchar, 2 ); + uchar4 lo3 = uchar4_shuffle_down( lower_uchar, 3 ); + uchar4 lo4 = uchar4_shuffle_down( lower_uchar, 4 ); + uchar4 lo5 = uchar4_shuffle_down( lower_uchar, 5 ); + + uchar4 hi0 = upper_uchar; + uchar4 hi1 = uchar4_shuffle_down( upper_uchar,1 ); + uchar4 hi2 = uchar4_shuffle_down( upper_uchar,2 ); + uchar4 hi3 = uchar4_shuffle_down( upper_uchar,3 ); + uchar4 hi4 = uchar4_shuffle_down( upper_uchar,4 ); + uchar4 hi5 = uchar4_shuffle_down( upper_uchar,5 ); + + if( logical_lane == 0 ) + { + uchar childBlockStride = 0x01 + (uint)(nodeType == NODE_TYPE_INSTANCE); + + uint4 block0 = (uint4)(as_uint(org.x), as_uint(org.y), as_uint(org.z), offset); + + char3 exp_char = (char3)(exp.x,exp.y,exp.z); + + uint4 block1 = (uint4)( + as_uint((uchar4)(nodeType, 0 /* padding */, exp_char.x, exp_char.y)), + as_uint((uchar4)(exp_char.z, nodeMask, childBlockStride, childBlockStride)) , + as_uint((uchar4)(childBlockStride, childBlockStride, childBlockStride, childBlockStride)) , + as_uint((uchar4)(lo0.x,lo1.x,lo2.x,lo3.x)) + ); + + uint4 block2 = (uint4)( + as_uint((uchar4)(lo4.x,lo5.x,hi0.x,hi1.x)) , + as_uint((uchar4)(hi2.x,hi3.x,hi4.x,hi5.x)) , + as_uint((uchar4)(lo0.y,lo1.y,lo2.y,lo3.y)) , + as_uint((uchar4)(lo4.y,lo5.y,hi0.y,hi1.y)) + ); + + uint4 block3 = (uint4)( + as_uint((uchar4)(hi2.y,hi3.y,hi4.y,hi5.y)), + as_uint((uchar4)(lo0.z,lo1.z,lo2.z,lo3.z)), + as_uint((uchar4)(lo4.z,lo5.z,hi0.z,hi1.z)), + as_uint((uchar4)(hi2.z,hi3.z,hi4.z,hi5.z)) + ); + + global uint4* pNode = (global uint4*)node; + +#if 0 + printf( + "block0 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" + "block1 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" + "block2 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" + "block3 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" , + block0.x,block0.y,block0.z,block0.w, + pNode[0].x, pNode[0].y, pNode[0].z, pNode[0].w, + block1.x,block1.y,block1.z,block1.w, + pNode[1].x, pNode[1].y, pNode[1].z, pNode[1].w, + block2.x,block2.y,block2.z,block2.w, + pNode[2].x, pNode[2].y, pNode[2].z, pNode[2].w , + block3.x,block3.y,block3.z,block3.w, + pNode[3].x, pNode[3].y, pNode[3].z, pNode[3].w ); +#endif + + store_uint4_L1WB_L3WB( pNode, 0, block0 ); + store_uint4_L1WB_L3WB( pNode, 1, block1 ); + store_uint4_L1WB_L3WB( pNode, 2, block2 ); + store_uint4_L1WB_L3WB( pNode, 3, block3 ); + } + +} + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +void kernel +traverse_aabbs_quad( + global struct BVHBase* bvh, + global struct RefitScratch* scratch, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc + ) +{ + + uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh); + varying ushort lane = get_sub_group_local_id(); + + uniform uint num_leaves = bvh->fatLeafCount; + + local struct RefitScratch local_scratch[256]; + if( get_local_id(0) < min(num_nodes,256u) ) + { + for( uint i=0; i<3; i++ ){ + local_scratch[get_local_id(0)].lower[i] = FLT_MAX; + local_scratch[get_local_id(0)].upper[i] = FLT_MAX; + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + + 
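+    // Lane layout for the fat-leaf refit below: each sub-group is split into
+    // logical SIMD8 slices (two per SIMD16 sub-group).  One slice handles one
+    // fat leaf; its first numChildren logical lanes (at most 6 quad children)
+    // each refit one quad, and AABB_sub_group_reduce_N6 then folds the per-child
+    // boxes into the fat-leaf bounds before the bottom-up atomic_min pass.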
ushort SIMD8_PER_SG = get_sub_group_size()/8; + ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG; + ushort simd8_local_id = get_sub_group_local_id()/8; + ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id; + ushort logical_lane = lane%8; + + uniform uint fatleaf_index = simd8_id + get_group_id(0)*SIMD8_PER_WG; + + + if ( fatleaf_index < num_leaves ) + { + LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+fatleaf_index; + uint innerNodeIdx = leaf->inner_node_index; + uint bp = leaf->backpointer; + uint leaf_index = leaf->leaf_index; + + varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx; + varying QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index; + + uint childOffs = (((char*)quad) - ((char*)curNode))/64; + + varying struct AABB childrenBox; + AABB_init(&childrenBox); + + uint numChildren = (bp >> 3) & 0x7; + if (logical_lane < numChildren) + { + refit_bottom_child_quad_WB( + (global struct QuadLeaf*) &quad[logical_lane], + geomDesc, + &childrenBox ); + } + + struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == simd8_local_id; + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + childOffs, + NODE_TYPE_QUAD, + &childrenBox, + numChildren, + 0xff ); + + // atomic min operation vectorized across 6 lanes + // [ lower.xyz ][-][upper.xyz][-] + // + // Lanes 3 and 7 are inactive. 'upper' is negated + bool atomic_mask = (1<> 6); + + // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= 256 + if(atomic_mask && parent != 0x03FFFFFF) + { + while( parent >= 256 ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( ((global float*) &(scratch[innerNodeIdx]))+logical_lane, v ); + parent = bp >> 6; + } + while( parent != 0x03FFFFFF ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( ((local float*) &(local_scratch[innerNodeIdx]))+logical_lane, v ); + parent = bp >> 6; + } + } + + } + + + barrier( CLK_LOCAL_MEM_FENCE ); + num_nodes = min(num_nodes,256u); + + local float* in = (local float*)&local_scratch[0]; + global float* out = (global float*)&scratch[0]; + + for (uint i = get_local_id(0); i < num_nodes*6; i += 256 ) + { + // since we want to save [ lower.xyz ][-][upper.xyz][-] i.e 0,1,2, 4,5,6 etc. 
we need to offset +1 for every triplet + uint idx = i + (i/3); + + float v = in[idx]; + if( v != FLT_MAX ) + atomic_min( out + idx , v ); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) +void kernel +write_inner_nodes( + global struct BVHBase* bvh, + global struct RefitScratch* scratch + ) +{ + uint SIMD8_PER_SG = get_sub_group_size()/8; + uniform uint node_id = SIMD8_PER_SG * get_sub_group_global_id() + (get_sub_group_local_id()/8); + varying ushort lane = get_sub_group_local_id() % 8; + varying uint num_inners = bvh->innerCount; + + if ( node_id < num_inners ) + { + InnerNodeTableEntry* entry = BVHBase_GetInnerNodeTable(bvh) + node_id; + uint node_index = entry->node_index_and_numchildren>>3; + uint numChildren = entry->node_index_and_numchildren & 7; + uint first_child = entry->first_child; + + varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+node_index; + + varying struct AABB childAABB; + AABB_init(&childAABB); + + if( lane < numChildren ) + { + uint child = first_child + lane; + childAABB.lower.x = scratch[child].lower[0]; + childAABB.lower.y = scratch[child].lower[1]; + childAABB.lower.z = scratch[child].lower[2]; + childAABB.upper.x = -scratch[child].upper[0]; + childAABB.upper.y = -scratch[child].upper[1]; + childAABB.upper.z = -scratch[child].upper[2]; + } + + varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8); + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + first_child - node_index, + NODE_TYPE_INTERNAL, + &childAABB, + numChildren, + 0xff ); + + } + + if (node_id == 0 && lane == 0 ) + { + bvh->Meta.bounds.lower[0] = scratch[0].lower[0]; + bvh->Meta.bounds.lower[1] = scratch[0].lower[1]; + bvh->Meta.bounds.lower[2] = scratch[0].lower[2]; + bvh->Meta.bounds.upper[0] = -scratch[0].upper[0]; + bvh->Meta.bounds.upper[1] = -scratch[0].upper[1]; + bvh->Meta.bounds.upper[2] = -scratch[0].upper[2]; + } + +} + + + +#if 1 +#define SLM_BOX_COUNT 1024 + +struct AABB load_box( uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes ) +{ + if( place < SLM_BOX_COUNT ) + return local_boxes[place]; + else + return extra_boxes[place-SLM_BOX_COUNT]; +} + +void store_box( struct AABB box, uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes ) +{ + if (place < SLM_BOX_COUNT) + { + local_boxes[place] = box; + } + else + { + global uint4* ptr = (global uint4*)&extra_boxes[place-SLM_BOX_COUNT]; + store_uint4_L1WB_L3WB( ptr, 0, as_uint4(box.lower) ); + store_uint4_L1WB_L3WB( ptr+1, 0, as_uint4(box.upper) ); + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(512, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel +update_single_group_quads( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct AABB* extra_boxes +) +{ + uniform uint tid = get_sub_group_global_id(); + uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh); + uniform uint num_leaves = bvh->fatLeafCount; + uniform uint num_inners = bvh->innerCount; + 
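+    // Single-work-group refit: boxes for the first SLM_BOX_COUNT (1024) nodes
+    // live in SLM, the rest spill to the global 'extra_boxes' buffer via
+    // load_box/store_box.  Upper bounds are kept negated so the bottom-up pass
+    // can use one vectorized atomic_min for both lower and upper bounds.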
+ varying ushort lane = get_sub_group_local_id(); + + local struct AABB local_boxes[SLM_BOX_COUNT]; // == 32KB + + // initialize nodes + for (uint i = get_local_id( 0 ); i < num_nodes; i+= get_local_size(0)) + { + struct AABB tmp; + AABB_init(&tmp); + tmp.upper = -tmp.upper; + store_box( tmp, i, local_boxes, extra_boxes ); + } + + + if( num_nodes > SLM_BOX_COUNT ) + mem_fence_workgroup_default(); + + barrier( CLK_LOCAL_MEM_FENCE ); + + + ushort SIMD8_PER_SG = get_sub_group_size()/8; + ushort NUM_SIMD8 = get_num_sub_groups()*SIMD8_PER_SG; + ushort simd8_local_id = get_sub_group_local_id()/8; + ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id; + ushort logical_lane = lane%8; + + + for ( uint i = simd8_id; i < num_leaves; i+= NUM_SIMD8 ) + { + LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+i; + uint innerNodeIdx = leaf->inner_node_index; + uint bp = leaf->backpointer; + uint leaf_index = leaf->leaf_index; + + varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx; + QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index; + + uint childOffs = (((char*)quad) - ((char*)curNode))/64; + + varying struct AABB childrenBox; + AABB_init(&childrenBox); + + uint numChildren = (bp >> 3) & 0x7; + if (logical_lane < numChildren) + { + + refit_bottom_child_quad_WB( + (global struct QuadLeaf*) &quad[logical_lane], + geomDesc, + &childrenBox ); + } + + struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == simd8_local_id; + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + + if( logical_lane == 0 ) + { + struct AABB negated = reduce_bounds; + negated.upper = -negated.upper; + store_box( negated, innerNodeIdx, local_boxes, extra_boxes ); + } + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + childOffs, + NODE_TYPE_QUAD, + &childrenBox, + numChildren, + 0xff ); + + + // atomic min operation vectorized across 6 lanes + // [ lower.xyz ][-][upper.xyz][-] + // + // Lanes 3 and 7 are inactive. 'upper' is negated + uint lmod = logical_lane % 4; + uint ldiv = logical_lane / 4; + float vlo = reduce_bounds.lower.x; + float vhi = reduce_bounds.upper.x; + vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo; + vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi; + vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo; + vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi; + float v = (ldiv == 0) ? 
vlo : -vhi; + bool atomic_mask = (1<> 6); + + // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= SLM_BOX_COUNT + if(atomic_mask && parent != 0x03FFFFFF) + { + while( parent >= SLM_BOX_COUNT ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( ((global float*) &(extra_boxes[innerNodeIdx-SLM_BOX_COUNT]))+logical_lane, v ); + parent = bp >> 6; + } + while( parent != 0x03FFFFFF ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( ((local float*) &(local_boxes[innerNodeIdx]))+logical_lane, v ); + parent = bp >> 6; + } + } + + } + + if( num_nodes > SLM_BOX_COUNT ) + mem_fence_workgroup_default(); + + barrier( CLK_LOCAL_MEM_FENCE ); + + for ( uint i = simd8_id; i < num_inners; i+= NUM_SIMD8 ) + { + InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh) + i; + uint node_index = inner->node_index_and_numchildren>>3; + uint numChildren = inner->node_index_and_numchildren & 7; + uint first_child = inner->first_child; + + varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+ node_index; + + //if (curNode->nodeType == BVH_INTERNAL_NODE) // TODO: Needs updating for traversal shaders + { // TODO: Consider using an inner node table or UC load to avoid polluting LSC with these reads + uint child = first_child + logical_lane; + + bool child_valid = (logical_lane < numChildren); + + struct AABB childAABB; + AABB_init(&childAABB); + if (child_valid) + { + childAABB = load_box( child, local_boxes, extra_boxes ); + childAABB.upper = -childAABB.upper; + } + + varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8); + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + first_child - node_index, + NODE_TYPE_INTERNAL, + &childAABB, + numChildren, + 0xff ); + } + } + + + if (get_sub_group_id() == 0 && lane == 0 ) + { + bvh->Meta.bounds.lower[0] = local_boxes[0].lower.x; + bvh->Meta.bounds.lower[1] = local_boxes[0].lower.y; + bvh->Meta.bounds.lower[2] = local_boxes[0].lower.z; + bvh->Meta.bounds.upper[0] = -local_boxes[0].upper.x; + bvh->Meta.bounds.upper[1] = -local_boxes[0].upper.y; + bvh->Meta.bounds.upper[2] = -local_boxes[0].upper.z; + } + +} +#endif + +GRL_INLINE void traverse_aabbs_new_update_func( + global struct BVHBase* bvh, + global char* vertices, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct RefitScratch* scratch, + uint vertex_format, + local struct AABB3f* children_AABBs, + local uint* num_fat_leaves, + local struct LeafTableEntry* leafTable_local, + const bool single_geo + ) +{ + // The first part of the kernel with vertices loads/stores is executed with quad per work item, + // using previously prepared QuadDataIndices to get the quad data and vert indices + // Second part of the kernel that does the reduction, update fatleaf ain bvh and bottom up is + // executed per simd. 
+ // For bottom up tested also with local part (using local scratch) but since there is not enough SLM additional + // barriers were needed to clean and reuse SLM, which curretnly kills performance. Could be worth to revisit + // on future gens. + + varying uint lid = get_local_id(0); + varying uint tid = lid + get_group_id(0)*get_local_size(0); + + num_fat_leaves[0] = 0; + leafTable_local[lid].leaf_index = -1 << 3; + + LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * bvh->fatLeafTableStart + 12 * tid)); + uint innerNodeIdx_mem = leaf->inner_node_index; + uint bp = leaf->backpointer; + uint leaf_index_mem = leaf->leaf_index; + + uint numChildren = (bp >> 3) & 0x7; + + uint leaf_index = leaf_index_mem >> 3; + uint slm_child_offset = leaf_index_mem & 0x7; + + uint innerNodeIdx = innerNodeIdx_mem >> 8; + uint slm_pos_main = innerNodeIdx_mem & 0xFF; + + uint first_el_of_group = get_group_id(0)*get_local_size(0); + uint quadsNum = BVHBase_GetNumQuads(bvh); + uint expected_tid = first_el_of_group < quadsNum ? first_el_of_group : quadsNum - 1; + + // Skip writes when not all children for single fatleaf are present in this work group + bool skip_tid = leaf_index == 0x1FFFFFFF; + leaf_index = skip_tid ? expected_tid : leaf_index; + + // Compute bounding box for quads + varying struct AABB3f childrenBox; + + tid = leaf_index + slm_child_offset; + + // Read vertex indices and quad header from separate buffer + uint quadIndicesStart = bvh->quadIndicesDataStart; + varying struct QuadDataIndices* vertex_indice_ptr = (QuadDataIndices*)(((char*)bvh) + (64u * quadIndicesStart + 32 * tid)); + QuadDataIndices vertexMap = vertex_indice_ptr[0]; + + varying global uint4* bounds = (global uint4*)((char*)bvh + (64*bvh->quadLeafStart + 64*tid) ); + uint4 quad_data = (uint4)(vertexMap.header_data[0], vertexMap.header_data[1], vertexMap.header_data[2], vertexMap.header_data[3]); + uint4 indices = (uint4)(vertexMap.vert_idx[0], vertexMap.vert_idx[1], vertexMap.vert_idx[2], vertexMap.vert_idx[3]); + + global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc; + + if(!single_geo) + { + uint geomID = vertexMap.header_data[0] & 0xFFFFFF; + desc += geomID; + vertices = (global char*)desc->Desc.Triangles.pVertexBuffer; + vertex_format = desc->Desc.Triangles.VertexFormat; + } + + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices_no_stride(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices, vertex_format, vertices); + + for(uint i = 0; i < 3; i++) + childrenBox.lower[i] = min( min( vtx0[i], vtx1[i] ), min(vtx2[i],vtx3[i]) ); + + for(uint i = 0; i < 3; i++) + childrenBox.upper[i] = max( max( vtx0[i], vtx1[i] ), max(vtx2[i],vtx3[i]) ); + + float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x ); + float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y ); + float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z ); + + // Store quad data in bvh + // Make sure this goes without partial writes to get best perf + store_uint4_L1WB_L3WB( bounds, 0, quad_data ); + store_uint4_L1WB_L3WB( bounds, 1, as_uint4(pack0) ); + store_uint4_L1WB_L3WB( bounds, 2, as_uint4(pack1) ); + store_uint4_L1WB_L3WB( bounds, 3, as_uint4(pack2) ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + struct AABB reduce_bounds; + + if(!skip_tid) + { + // Store AABB in SLM, to be used later for children quantization in fatleaf + children_AABBs[slm_pos_main + slm_child_offset] = childrenBox; + + if(slm_child_offset == 0) + { + uint offset = atomic_inc_local(&num_fat_leaves[0]); + leafTable_local[offset].inner_node_index = innerNodeIdx_mem; + 
leafTable_local[offset].backpointer = bp; + leafTable_local[offset].leaf_index = leaf_index_mem; + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + varying ushort lane = get_sub_group_local_id(); + ushort SIMD8_PER_SG = get_sub_group_size()/8; + ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG; + ushort simd8_local_id = get_sub_group_local_id()/8; + ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id; + ushort logical_lane = lane%8; + + uint fatleaves_aligned_32 = (num_fat_leaves[0] + 31) & ~31; + + for(uint offset = 0; offset < fatleaves_aligned_32; offset += 32) + { + uniform uint fatleaf_index = simd8_id + offset; + uint innerNodeIdx_mem = leafTable_local[fatleaf_index].inner_node_index; + uint bp = leafTable_local[fatleaf_index].backpointer; + uint leaf_index_mem = leafTable_local[fatleaf_index].leaf_index; + + uint numChildren = (bp >> 3) & 0x7; + + uint leaf_index = leaf_index_mem >> 3; + uint slm_child_offset = leaf_index_mem & 0x7; + + uint innerNodeIdx = innerNodeIdx_mem >> 8; + uint slm_pos_main = innerNodeIdx_mem & 0xFF; + + bool skip_tid = leaf_index == 0x1FFFFFFF; + bool active_lane = (logical_lane < numChildren); + uint lane_children = active_lane ? logical_lane : 0; + + fatleaf_index = leaf_index; + + varying InternalNode* curNode = (InternalNode*)(((char*)bvh) + (BVH_ROOT_NODE_OFFSET + 64 * innerNodeIdx)); + + global struct Quad *quads = (global struct Quad *)((char*)bvh + 64*bvh->quadLeafStart ); + + varying struct AABB childrenBox_bu; + AABB_init(&childrenBox_bu); + + if(!skip_tid) + childrenBox_bu = AABBfromAABB3f(children_AABBs[slm_pos_main + lane_children]); + + struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox_bu); + struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); + + for (uint i = 1; i < SIMD8_PER_SG; i++) + { + struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); + int3 is_upper_lane = ((uint3)(i)) == simd8_local_id; + reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); + reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); + } + + if(!skip_tid) + { + uint quad_offset = 64u * bvh->quadLeafStart + 64 * fatleaf_index; + varying QuadLeaf* quad = (QuadLeaf*)(((char*)bvh) + quad_offset); + uint childOffs = (((char*)quad) - ((char*)curNode))/64; + + sg_InternalNode_setFields( + curNode, + reduce_bounds, + childOffs, + NODE_TYPE_QUAD, + &childrenBox_bu, + numChildren, + 0xff ); + + bool atomic_mask = (1<> 6); + + global float* parent_v = (global float*) &(scratch[parent]) + logical_lane; + + if(atomic_mask && (*parent_v >= v) && (parent != 0x03FFFFFF)) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + atomic_min( parent_v, v ); + parent = bp >> 6; + + if(parent != 0x03FFFFFF) + { + while( parent != 0x03FFFFFF ) + { + innerNodeIdx = parent; + bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + + global float* parent_v_global = (global float*) &(scratch[innerNodeIdx]) + logical_lane; + if(*parent_v_global >= v) + atomic_min( parent_v_global, v ); + else + break; + + parent = bp >> 6; + } + } + } + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +void kernel +traverse_aabbs_new_update( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct RefitScratch* scratch + ) +{ + varying uint lid = get_local_id(0); + 
varying uint tid = lid + get_group_id(0)*get_local_size(0); + + local struct AABB3f children_AABBs[256]; + local struct LeafTableEntry leafTable_local[256]; + local uint num_fat_leaves; + + traverse_aabbs_new_update_func(bvh, (global char*)geomDesc /* not used */, geomDesc, scratch, (uint)-1 /* not used */, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], false); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +void kernel +traverse_aabbs_new_update_single_geo( + global struct BVHBase* bvh, + global char* vertices, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct RefitScratch* scratch, + const uint vertex_format + ) +{ + varying uint lid = get_local_id(0); + varying uint tid = lid + get_group_id(0)*get_local_size(0); + + local struct AABB3f children_AABBs[256]; + local struct LeafTableEntry leafTable_local[256]; + local uint num_fat_leaves; + + if(vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32B32_FLOAT, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R32G32_FLOAT) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32_FLOAT, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_FLOAT, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16_FLOAT) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_FLOAT, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_SNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16_SNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_SNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R16G16_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R10G10B10A2_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R8G8_UNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_UNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, 
VERTEX_FORMAT_R8G8B8A8_SNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else if(vertex_format == VERTEX_FORMAT_R8G8_SNORM) + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_SNORM, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); + else + traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, (uint)-1, + &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); +} diff --git a/src/intel/vulkan/grl/gpu/atomic_update.grl b/src/intel/vulkan/grl/gpu/atomic_update.grl new file mode 100644 index 00000000000..9e1d6923d4a --- /dev/null +++ b/src/intel/vulkan/grl/gpu/atomic_update.grl @@ -0,0 +1,198 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module atomic_update; + +kernel_module atomic_update ("atomic_update.cl") +{ + links lsc_intrinsics; + kernel init_refit_scratch < kernelFunction = "init_refit_scratch" >; + kernel traverse_aabbs_quad < kernelFunction = "traverse_aabbs_quad" >; + kernel write_inner_nodes < kernelFunction = "write_inner_nodes" >; + kernel build_fatleaf_table < kernelFunction = "build_fatleaf_table" >; + kernel build_innernode_table < kernelFunction = "build_innernode_table" >; + + kernel update_single_group_quads < kernelFunction = "update_single_group_quads" >; + + kernel build_fatleaf_table_new_update < kernelFunction = "build_fatleaf_table_new_update" >; + kernel fixup_quad_table < kernelFunction = "fixup_quad_table" >; + kernel traverse_aabbs_new_update < kernelFunction = "traverse_aabbs_new_update" >; + kernel traverse_aabbs_new_update_single_geo < kernelFunction = "traverse_aabbs_new_update_single_geo" >; +} + +import struct MKBuilderState "structs.grl"; + +// this metakernel only initializes registers for use in a batching loop by "init_refit_scratch" +metakernel init_refit_scratch_metakernel_registers() +{ + REG0.hi = 0; + REG1 = 3; + REG2 = 63; + REG3 = 4; + REG4 = 2; + + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; +} + +metakernel init_refit_scratch( qword bvh_base, qword scratch)//, dword max_inner_nodes ) +{ + REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!! + define C_3 REG1; + define C_63 REG2; + define C_4 REG3; + define C_2 REG4; + + REG0 = REG0 - C_3; // nodedataCurr - fixed offset + REG0 = REG0 + C_63; // + 63 + REG0 = REG0 >> C_4; // >> 4 + REG0 = REG0 >> C_2; // >> 2 == >> 6 == /64 + + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect init_refit_scratch//( (max_inner_nodes+63)/64, 1, 1 ) + args(bvh_base,scratch); + +} + +metakernel build_node_tables( qword bvh_base ) +{ + REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!! + REG1 = 2; + REG2 = 63; + REG3 = 4; + REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!! + + REG0 = REG0 - REG4; // nodedataCurr - fixed offset + REG0 = REG0 + REG2; // + 63 + REG0 = REG0 >> REG3; // >> 4 + REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64 + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect build_fatleaf_table//( (max_inner_nodes+63)/64, 1, 1 ) + args(bvh_base); + dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 ) + args(bvh_base); +} + +metakernel build_node_tables_new_update( MKBuilderState state, qword bvh_base ) +{ + REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!! + REG1 = 2; + REG2 = 63; + REG3 = 4; + REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!! 
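+    // The sequence below computes ceil((nodedataCurr - fixed_offset) / 64), i.e.
+    // one 64-thread work group per 64 internal nodes.  The divide is split into
+    // >>4 then >>2 (== >>6) because the command streamer only supports
+    // power-of-two shift amounts (see the note in build_refit.grl).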
+ + REG0 = REG0 - REG4; // nodedataCurr - fixed offset + REG0 = REG0 + REG2; // + 63 + REG0 = REG0 >> REG3; // >> 4 + REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64 + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect build_fatleaf_table_new_update//( (max_inner_nodes+63)/64, 1, 1 ) + args(state.build_globals, bvh_base); + dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 ) + args(bvh_base); +} + +metakernel fixup_quad_table( qword bvh_base ) +{ + dispatch fixup_quad_table(2,1,1) + args(bvh_base); +} + +// this metakernel only initializes registers for use in a batching loop by "traverse_aabbs_quad" and "write_inner_nodes" +metakernel init_traverse_aabbs_quad_and_write_inner_nodes() +{ + REG0.hi = 0; + REG1 = 1; + REG2 = 31; + REG3 = 4; + REG4 = 2; + REG5 = 7; + REG6 = 255; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; +} + +metakernel traverse_aabbs_quad( qword bvh_base, qword scratch, qword geos)//, dword max_inner_nodes ) +{ + + REG0.lo = load_dword( bvh_base + 64 ); // TODO: DOn't hardcode! + define C_1 REG1; + define C_31 REG2; + define C_4 REG3; + + REG0 = REG0 + C_31; // + 31 + REG0 = REG0 >> C_4; // >> 4 + REG0 = REG0 >> C_1; // >> 1 == >> 5 == /32 + + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect traverse_aabbs_quad//( (max_inner_nodes+32)/32, 1, 1 ) + args(bvh_base,scratch,geos); +} + +metakernel write_inner_nodes( qword bvh_base, qword scratch )//, dword max_inner_nodes ) +{ + REG0.lo = load_dword( bvh_base + 68 ); // TODO: DOn't hardcode! + define C_1 REG1; + define C_2 REG4; + define C_7 REG5; + + REG0 = REG0 + C_7; // + 7 + REG0 = REG0 >> C_2; // >> 2 + REG0 = REG0 >> C_1; // >> 1 ==> >> 3 (/8) + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect write_inner_nodes//( (max_inner_nodes+7)/8, 1, 1 ) + args(bvh_base,scratch); +} + +metakernel update_single_group_quads( qword bvh_base, qword geos, qword aabbs ) +{ + dispatch update_single_group_quads(1,1,1) //( (max_inner_nodes+1)/2, 1, 1 ) + args(bvh_base,geos,aabbs); +} + +metakernel traverse_aabbs_new_update( qword bvh_base, qword geos, qword scratch ) +{ + REG0.lo = load_dword( bvh_base + 84 ); // TODO: DOn't hardcode! + define C_255 REG6; + define C_4 REG3; + + REG0 = REG0 + C_255; // + 255 + REG0 = REG0 >> C_4; // >> 4 + REG0 = REG0 >> C_4; // >> 4 == >> 8 == /32 + + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect traverse_aabbs_new_update//( (max_inner_nodes+255)/256, 1, 1 ) + args(bvh_base, geos, scratch); +} + +metakernel traverse_aabbs_new_update_single_geo( qword bvh_base, qword vertices, qword geos, qword scratch, dword vertex_format ) +{ + REG0.lo = load_dword( bvh_base + 84 ); // TODO: DOn't hardcode! 
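+    // Same dispatch math as traverse_aabbs_new_update above: round the count
+    // loaded from the BVH header up to a multiple of 256, then shift right by 4
+    // twice (== >>8, i.e. divide by 256) to launch one 256-wide work group per
+    // 256 work items (the kernel processes one quad per work item).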
+ define C_255 REG6; + define C_4 REG3; + + REG0 = REG0 + C_255; // + 255 + REG0 = REG0 >> C_4; // >> 4 + REG0 = REG0 >> C_4; // >> 4 == >> 8 == /32 + + DISPATCHDIM_X = REG0.lo; + + dispatch_indirect traverse_aabbs_new_update_single_geo//( (max_inner_nodes+255)/256, 1, 1 ) + args(bvh_base, vertices, geos, scratch, vertex_format); +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/binned_sah_shared.h b/src/intel/vulkan/grl/gpu/binned_sah_shared.h new file mode 100644 index 00000000000..8b22f6612cd --- /dev/null +++ b/src/intel/vulkan/grl/gpu/binned_sah_shared.h @@ -0,0 +1,265 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file contains structure definitions shared by GRL OCL kernels and host code +// + +#include "GRLGen12.h" +#pragma once + +#define BFS_NUM_BINS 16 +#define BFS_NUM_VCONTEXTS 256 +#define BFS_MAX_DEPTH 32 + +#define TRIVIAL_BUILD_THRESHOLD 6 +#define SINGLE_WG_BUILD_THRESHOLD 256 + +#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384 + + +typedef uchar vcontext_id_t; + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(GPUBVHBuilder) + +struct BFS_Split +{ + float sah; + int dim; + int pos; +}; + + +struct BFS_BinInfo +{ + float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6] + // The 6 are lower(xyz) and -upper(xyz) + // bins use negated-max so that we can use vectorized mins instead of min/max pairs + uint counts[3 * BFS_NUM_BINS]; +}; + +enum_uint8(SAHBuildFlags) +{ + SAH_FLAG_NEED_BACKPOINTERS = 1, // identifies a mixed internal node where each child can have a different type + SAH_FLAG_NEED_MASKS = 2 +}; + +struct SAHBuildGlobals +{ + qword p_primref_index_buffers; + qword p_primrefs_buffer; + qword p_bvh2; + qword p_globals; // TODO: deprecate this + qword p_bvh_base; + gpuva_t p_qnode_root_buffer; + + dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks' + dword num_primrefs; + dword leaf_size; + dword leaf_type; + + dword root_buffer_num_produced; + dword root_buffer_num_produced_hi; + dword root_buffer_num_consumed; + dword root_buffer_num_consumed_hi; + dword root_buffer_num_to_consume; + dword root_buffer_num_to_consume_hi; +}; + +struct SAHBuildBuffersInfo +{ + gpuva_t p_globals; + gpuva_t p_primref_index_buffers; + gpuva_t p_primrefs_buffer; + gpuva_t p_bvh2; + gpuva_t p_bvh_base; + gpuva_t p_qnode_root_buffer; + dword sah_globals_flags; + dword _pad; + gpuva_t _pad2; +}; + +typedef union LRBounds +{ + struct + { + struct AABB3f left_centroid_bounds; + struct AABB3f left_geom_bounds; + struct AABB3f right_centroid_bounds; + struct AABB3f right_geom_bounds; + } boxes; + struct + { + float Array[24]; + } scalars; +} LRBounds; + + +struct VContext +{ + uint dispatch_primref_begin; // range of primrefs for this task + uint dispatch_primref_end; + uint bvh2_root; // BVH2 root node for this task + uint tree_depth; // depth of this node in the tree + uint num_left; // primref counts + uint num_right; + uint lr_mask; // lower 8b : left mask. 
upper 8b : right mask + uint batch_index; + + // pass1 global working state and output + struct BFS_Split split; + struct BFS_BinInfo global_bin_info; + + // pass2 global working state and output + LRBounds lr_bounds; +}; + + + +struct BFSDispatchRecord +{ + ushort batch_index; + ushort context_id; +}; + + +struct BFSDispatchQueue +{ + uint num_dispatches; + uint wg_count[BFS_NUM_VCONTEXTS]; + struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS]; +}; + +struct BFS1SpillStackEntry +{ + uint primref_begin; + uint primref_end; + uint bvh2_root; + ushort tree_depth; + ushort batch_index; +}; + +struct BFS1SpillStack +{ + uint size; + struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH]; +}; + +struct QNodeGlobalRootBufferEntry +{ + uint bvh2_node; + uint qnode; + uint build_idx; + uint _pad; +}; + +struct QNodeGlobalRootBuffer +{ + uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM + struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2]; +}; + +struct DFSDispatchRecord +{ + uint primref_base; + uint bvh2_base; + uint batch_index; + ushort num_primrefs; + ushort tree_depth; +}; + + +struct DFSDispatchQueue +{ + struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2]; +}; + +#define VCONTEXT_STATE_EXECUTING 0 +#define VCONTEXT_STATE_UNALLOCATED 1 + +union SchedulerUnion +{ + struct VContextScheduler + { + ///////////////////////////////////////////////////////////// + // State data used for communication with command streamer + // NOTE: This part must match definition in 'new_sah_builder.grl' + ///////////////////////////////////////////////////////////// + + dword num_bfs_wgs; + dword num_dfs_wgs; + + dword scheduler_postsync; + dword _pad1; + + dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). + dword num_single_builds; // number of single-wg builds (#primrefs < threshold) + + dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass + dword batched_build_loop_mask; // value is 0 if #builds <= #contexts. else 1 command streamer uses this as a loop condition + + ///////////////////////////////////////////////////////////// + + dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer + dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer + + dword vcontext_state[BFS_NUM_VCONTEXTS]; + + struct BFSDispatchQueue bfs_queue; + struct DFSDispatchQueue dfs_queue; + + struct VContext contexts[BFS_NUM_VCONTEXTS]; + + struct BFS1SpillStack bfs2_spill_stack; + } vContextScheduler; + + struct QnodeScheduler + { + dword num_qnode_grb_curr_entries; + dword num_qnode_grb_new_entries; + + dword scheduler_postsync; + dword _pad1; + + dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). + dword num_single_builds; // number of single-wg builds (#primrefs < threshold) + + dword batched_builds_to_process; + dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer + + ///////////////////////////////////////////////////////////// + + dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer + dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer + + struct QNodeGlobalRootBuffer qnode_global_root_buffer; + } qnodeScheduler; +}; + + +struct BVH2Node +{ + struct AABB3f box; + uint meta_u; // leaf: primref start. 
inner: offset from node to its first child + uint meta_ss; + //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes + //uchar is_inner; // 1 if inner, 0 if leaf + //uchar mask; +}; + +struct BVH2 +{ + uint num_nodes; + uint _pad[7]; // align to 32B +}; + + +GRL_NAMESPACE_END(GPUBVHBuilder) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/build_leaf.grl b/src/intel/vulkan/grl/gpu/build_leaf.grl new file mode 100644 index 00000000000..7b154d03b43 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/build_leaf.grl @@ -0,0 +1,206 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module leaf_builder; + +kernel_module leaf_kernels ("bvh_build_leaf.cl") +{ + links lsc_intrinsics; + + kernel opencl_kernel_primref_to_quads < kernelFunction="primref_to_quads" >; + kernel opencl_kernel_primref_to_procedurals < kernelFunction="primref_to_procedurals" >; + kernel opencl_kernel_create_HW_instance_nodes < kernelFunction="create_HW_instance_nodes" >; + kernel opencl_kernel_create_HW_instance_nodes_pointers < kernelFunction="create_HW_instance_nodes_pointers" >; +} + +import struct MKBuilderState "structs.grl"; +import struct MKSizeEstimate "structs.grl"; + +const Instances_GROUPSIZE = 16; + +metakernel buildLeafDXR_instances( + MKBuilderState state, + qword build_primref_index_buffers, + qword srcInstanceDescrArray, + dword stride, + dword offset, + dword numPrims) +{ + define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE; + dispatch opencl_kernel_create_HW_instance_nodes(num_groups,1,1) args( + state.build_globals, + build_primref_index_buffers, + state.build_primref_buffer, + state.bvh_buffer, + srcInstanceDescrArray, + stride, + offset); +} + +metakernel buildLeafDXR_instances_indirect( + MKBuilderState state, + qword build_primref_index_buffers, + qword srcInstanceDescrArray, + qword indirectBuildRangeInfo, + dword stride, + dword offset) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // Instances_GROUPSIZE - 1 + C_4 = 4; // log_2(Instances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_create_HW_instance_nodes args( + state.build_globals, + build_primref_index_buffers, + state.build_primref_buffer, + state.bvh_buffer, + srcInstanceDescrArray, + stride, + offset); +} + +metakernel buildLeafDXR_instances_pointers( + MKBuilderState state, + qword build_primref_index_buffers, + qword srcInstanceDescrArrayPtr, + dword stride, + dword offset, + dword numPrims) +{ + define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE; + dispatch opencl_kernel_create_HW_instance_nodes_pointers(num_groups,1,1) args( + state.build_globals, + build_primref_index_buffers, + state.build_primref_buffer, + state.bvh_buffer, + srcInstanceDescrArrayPtr, + stride, + offset); +} + +metakernel buildLeafDXR_instances_pointers_indirect( + MKBuilderState state, + qword build_primref_index_buffers, + qword srcInstanceDescrArrayPtr, + qword indirectBuildRangeInfo, + dword stride, + dword offset) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = 
load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // Instances_GROUPSIZE - 1 + C_4 = 4; // log_2(Instances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_create_HW_instance_nodes_pointers args( + state.build_globals, + build_primref_index_buffers, + state.build_primref_buffer, + state.bvh_buffer, + srcInstanceDescrArrayPtr, + stride, + offset); +} + +metakernel buildLeafDXR_procedurals( + MKBuilderState state, + qword build_primref_index_buffers, + dword stride, + dword offset, + qword p_numPrimitives) +{ + define C_1 REG0; + define REG_PRIMS_PER_WG REG1; + define REG_PRIMS_PER_WG_SHR REG2; + + C_1 = 1; + REG_PRIMS_PER_WG = 16; + REG_PRIMS_PER_WG_SHR = 4;// We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements) + + define reg_numPrimitives REG3; + define reg_num_wgs REG4; + + reg_numPrimitives = load_dword(p_numPrimitives); + reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG; + reg_num_wgs = reg_num_wgs - C_1; + reg_num_wgs = reg_num_wgs >> REG_PRIMS_PER_WG_SHR; + + DISPATCHDIM_X = reg_num_wgs; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_primref_to_procedurals args( + state.build_globals, + state.build_primref_buffer, + build_primref_index_buffers, + state.bvh_buffer, + state.geomDesc_buffer, + stride, + offset); +} + +metakernel buildLeafDXR_quads( + MKBuilderState state, + qword build_primref_index_buffers, + dword stride, + dword offset, + qword p_numPrimitives, + dword allow_update) +{ + define C_1 REG0; + define REG_PRIMS_PER_WG REG1; + define SHIFT REG2; + + C_1 = 1; + REG_PRIMS_PER_WG = 32; + SHIFT = 4;// We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements) + + define reg_numPrimitives REG3; + define reg_num_wgs REG4; + + reg_numPrimitives = load_dword(p_numPrimitives); + reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG; + reg_num_wgs = reg_num_wgs - C_1; + reg_num_wgs = reg_num_wgs >> SHIFT; + reg_num_wgs = reg_num_wgs >> C_1; + + DISPATCHDIM_X = reg_num_wgs; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_primref_to_quads args( + state.build_globals, + state.build_primref_buffer, + build_primref_index_buffers, + state.bvh_buffer, + state.geomDesc_buffer, + stride, + offset, + allow_update); +} diff --git a/src/intel/vulkan/grl/gpu/build_primref.grl b/src/intel/vulkan/grl/gpu/build_primref.grl new file mode 100644 index 00000000000..33728bd01f6 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/build_primref.grl @@ -0,0 +1,229 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module build_primref; + +kernel_module primref_kernels ("bvh_build_primref.cl") +{ + links lsc_intrinsics; + + kernel opencl_kernel_primrefs_from_DXR_instances < kernelFunction="primrefs_from_DXR_instances" >; + kernel opencl_kernel_primrefs_from_DXR_instances_indirect < kernelFunction="primrefs_from_DXR_instances_indirect" >; + kernel opencl_kernel_primrefs_from_DXR_instances_pointers < kernelFunction="primrefs_from_DXR_instances_pointers" >; + kernel opencl_kernel_primrefs_from_DXR_instances_pointers_indirect < kernelFunction="primrefs_from_DXR_instances_pointers_indirect" >; + + kernel opencl_kernel_triangles_to_primrefs < kernelFunction="triangles_to_primrefs" >; + kernel opencl_kernel_triangles_to_primrefs_indirect < 
kernelFunction="triangles_to_primrefs_indirect" >; + kernel opencl_kernel_procedurals_to_primrefs < kernelFunction="procedurals_to_primrefs" >; + kernel opencl_kernel_procedurals_to_primrefs_indirect < kernelFunction="procedurals_to_primrefs_indirect" >; +} + +import struct MKBuilderState "structs.grl"; +import struct MKSizeEstimate "structs.grl"; + + +const PrimirefsFromInstances_GROUPSIZE = 16; + +metakernel buildPrimirefsFromInstances( + qword instanceDescBuff, + MKSizeEstimate estimate, + MKBuilderState build_state, + dword allowUpdate) +{ + define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE); + dispatch opencl_kernel_primrefs_from_DXR_instances(num_groups,1,1) args( + build_state.build_globals, + build_state.bvh_buffer, + instanceDescBuff, + estimate.numPrimitives, + build_state.build_primref_buffer, + allowUpdate); +} + +metakernel buildPrimirefsFromInstancesIndirect( + qword instanceDescBuff, + qword indirectBuildRangeInfo, + MKBuilderState build_state, + dword allowUpdate) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 + C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + instanceDescBuff, + indirectBuildRangeInfo, + build_state.build_primref_buffer, + allowUpdate); +} + +metakernel buildPrimirefsFromInstancesArrOfPtrs( + qword instanceDescPtrArrayBuff, + MKSizeEstimate estimate, + MKBuilderState build_state, + dword allowUpdate) +{ + define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE); + dispatch opencl_kernel_primrefs_from_DXR_instances_pointers(num_groups,1,1) args( + build_state.build_globals, + build_state.bvh_buffer, + instanceDescPtrArrayBuff, + estimate.numPrimitives, + build_state.build_primref_buffer, + allowUpdate); +} + +metakernel buildPrimirefsFromInstancesArrOfPtrsIndirect( + qword instanceDescPtrArrayBuff, + qword indirectBuildRangeInfo, + MKSizeEstimate estimate, + MKBuilderState build_state, + dword allowUpdate) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 + C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_pointers_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + instanceDescPtrArrayBuff, + build_state.build_primref_buffer, + indirectBuildRangeInfo, + allowUpdate); +} + + + + +metakernel primrefs_from_tris( + MKBuilderState build_state, + MKSizeEstimate estimate, + qword geo_ptr, + dword geom_id, + dword geom_flags, + dword num_prims) +{ + define num_threads ((num_prims+15)/16); + dispatch opencl_kernel_triangles_to_primrefs(num_threads,1,1) args( 
+ build_state.build_globals, + build_state.bvh_buffer, + build_state.build_primref_buffer, + geo_ptr, + (geom_id & 0x00ffffff) + (geom_flags<<24), + num_prims); +} + +metakernel primrefs_from_tris_indirect( + MKBuilderState build_state, + MKSizeEstimate estimate, + qword geo_ptr, + qword indirectBuildRangeInfo, + dword geom_id, + dword geom_flags) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 + C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_triangles_to_primrefs_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.build_primref_buffer, + geo_ptr, + indirectBuildRangeInfo, + (geom_id & 0x00ffffff) + (geom_flags << 24)); +} + +metakernel primrefs_from_proc( + MKBuilderState build_state, + MKSizeEstimate estimate, + qword geo_ptr, + dword geom_id, + dword geom_flags, + dword num_prims) +{ + define num_threads ((num_prims+15)/16); + dispatch opencl_kernel_procedurals_to_primrefs(num_threads,1,1) args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.build_primref_buffer, + geo_ptr, + (geom_id & 0x00ffffff) + (geom_flags<<24), + num_prims); +} + +metakernel primrefs_from_proc_indirect( + MKBuilderState build_state, + MKSizeEstimate estimate, + qword geo_ptr, + qword indirectBuildRangeInfo, + dword geom_id, + dword geom_flags) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 + C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_procedurals_to_primrefs_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.build_primref_buffer, + geo_ptr, + indirectBuildRangeInfo, + (geom_id & 0x00ffffff) + (geom_flags<<24)); +} diff --git a/src/intel/vulkan/grl/gpu/build_refit.grl b/src/intel/vulkan/grl/gpu/build_refit.grl new file mode 100644 index 00000000000..46d6e76add2 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/build_refit.grl @@ -0,0 +1,324 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module build_refit; + +kernel_module morton_kernels ("bvh_build_refit.cl") +{ + links lsc_intrinsics; + + kernel update_instance_leaves < kernelFunction="update_instance_leaves" >; + kernel refit_indirect_sg < kernelFunction="Refit_indirect_sg" >; + kernel update_instance_leaves_indirect < kernelFunction="update_instance_leaves_indirect" >; + + +} + +const INSTANCE_LEAF_GROUP_SIZE = 16; +const REFIT_GROUP_SIZE = 8; + +metakernel update_instance_leaves( + qword bvh, + qword dxrInstancesArray, + qword dxrInstancesPtrArray, + qword instance_leaf_aabbs, + dword num_instances ) +{ + define num_groups (num_instances + INSTANCE_LEAF_GROUP_SIZE - 1) / INSTANCE_LEAF_GROUP_SIZE; + + dispatch update_instance_leaves(num_groups, 1, 1) args( + bvh, + 
dxrInstancesArray, + dxrInstancesPtrArray, + instance_leaf_aabbs); +} + +metakernel update_instance_leaves_indirect( + qword bvh, + qword dxrInstancesArray, + qword dxrInstancesPtrArray, + qword instance_leaf_aabbs, + qword indirectBuildRangeInfo) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // INSTANCE_LEAF_GROUP_SIZE - 1 + C_4 = 4; // log_2(INSTANCE_LEAF_GROUP_SIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / INSTANCE_LEAF_GROUP_SIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect update_instance_leaves_indirect args( + bvh, + dxrInstancesArray, + dxrInstancesPtrArray, + instance_leaf_aabbs, + indirectBuildRangeInfo); +} + +/* +metakernel refit( + qword bvh, + qword geomDesc, + qword instance_aabbs, + dword dispatchSize ) +{ + define num_groups (dispatchSize + REFIT_GROUP_SIZE - 1) / REFIT_GROUP_SIZE; + + dispatch refit(num_groups, 1, 1) args( + bvh, + geomDesc, + instance_aabbs); +} + +const REFIT_SIMD_SIZE = 8; +const REFIT_SIMD_SIZE_SHIFT = 3; + +metakernel refit_indirect( + qword bvh, + qword bvh_inner_nodes_start_value, + qword bvh_inner_nodes_end, + qword geomDesc, + qword instance_aabbs ) +{ + define cRoundingSIMD REG4; + define TWO REG3; + define ONE REG5; + cRoundingSIMD = (REFIT_SIMD_SIZE - 1); + + TWO = 2; + ONE = 1; + + REG0 = bvh_inner_nodes_start_value; + REG1 = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + cRoundingSIMD; + REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer + REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect refit_indirect args( + bvh, + geomDesc, + instance_aabbs); + +} +*/ + +metakernel refit_indirect_sg( + qword bvh, + qword bvh_inner_nodes_start_value, + qword bvh_inner_nodes_end, + qword geomDesc, + qword instance_aabbs ) +{ + + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect refit_indirect_sg args( + bvh, + geomDesc, + instance_aabbs); + +} +/* +//////////////////////////////////////////////////////////////// +// constructing treelets +// phase 1: mark nodes that will be roots of bottom treelets +// also for each node leave a number of startpoints that are under it and max depth of the path from the node +metakernel find_refit_treelets( + qword bvh, + qword treelet_node_data, + qword scratch_startpoints, + qword startpointAlloc, + qword bvh_inner_nodes_start_value, + qword bvh_inner_nodes_end ) +{ + define cRoundingSIMD REG4; + define TWO REG3; + define ONE REG5; + cRoundingSIMD = (REFIT_SIMD_SIZE - 1); + + TWO = 2; + ONE = 1; + + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + cRoundingSIMD; + REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer + REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. 
+ + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect find_refit_treelets args( + bvh, + treelet_node_data, + scratch_startpoints, + startpointAlloc); +} + + +//////////////////////////////////////////////////////////////// +// constructing treelets +// phase 2 totally parallel, run threads up to assign startpoints to given treelet +// +metakernel assign_refit_startpoints_to_treelets( + qword bvh, + qword treelet_node_data, + qword scratch_startpoints, + qword bvh_inner_nodes_start_value, + qword bvh_inner_nodes_end ) +{ + define cRoundingSIMD REG4; + define TWO REG3; + define ONE REG5; + cRoundingSIMD = (REFIT_SIMD_SIZE - 1); + + TWO = 2; + ONE = 1; + + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + cRoundingSIMD; + REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer + REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect assign_refit_startpoints_to_treelets args( + bvh, + treelet_node_data, + scratch_startpoints); +} + + +//////////////////////////////////////////////////////////////// +// constructing treelets +// phase 3 local work: group per treelet, sort the startpoints in treelets ?// by length of the path +metakernel finalize_treelets_in_groups( + qword bvh, + qword scratch_startpoints, + qword ptrNumTreelets ) +{ + REG0 = load_qword(ptrNumTreelets); + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect finalize_treelets_in_groups args( + bvh, + scratch_startpoints); +} + + +//////////////////////////////////////////////////////////////// +// Updating treelets +// phase 1 update vertex and generate boxes for vertices +// + +const PER_GROUP_ELEMENTS_ROUNDING = 15; +const PER_GROUP_ELEMENTS_SHIFT = 4; + +metakernel init_treelets_refit(qword pSquashGroupsCountToReset) +{ + REG1 = 0; + store_qword(pSquashGroupsCountToReset, REG1); + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + //REG4 = PER_GROUP_ELEMENTS_SHIFT; + //REG5.hi = PER_GROUP_ELEMENTS_ROUNDING; + //REG5.lo = 0; +} + +metakernel update_quads( + qword scratch_box, + qword bvh, + qword input, + dword numPrimsDividedBy32, + qword bigSquashInput) +{ + //REG0 = load_qword(quads_nodes_begin_end_pair); + //REG1.hi = REG0.lo; // this holds inner nodes begin + //REG2 = REG0 - REG1; + //REG2 = REG2 + REG5; + //REG2 = REG2 >> REG4; + //DISPATCHDIM_X = REG2.hi; + + dispatch refit_quads(numPrimsDividedBy32, 1, 1) args( + bvh, + input, + scratch_box, + numPrimsDividedBy32, + bigSquashInput ); +} + +// +//////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////// +// +// phase 1 or 2 - update primitives as well as bottom up refit internal nodes +// in single dispatch (in single group per tree) +metakernel refit_tree_by_group_including_quads( + qword squashed_inputs, + dword numBuilds +) +{ + dispatch refit_tree_per_group(numBuilds, 1, 1) args( + squashed_inputs); +} +// +//////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////// +// +// phase 2 bottom up refit internal nodes +// +metakernel refit_treelet_per_group( + qword bigSquashInput, + qword ptrNumTreelets) +{ + DISPATCHDIM_X = load_dword(ptrNumTreelets); + + dispatch_indirect refit_treelet_per_group args( + 
bigSquashInput); +} +// +//////////////////////////////////////////////////////////////// + +#endif +*/ diff --git a/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl new file mode 100644 index 00000000000..d72f192056e --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl @@ -0,0 +1,4823 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "binned_sah_shared.h" + +#include "libs/lsc_intrinsics.h" +#include "intrinsics.h" +#include "AABB.h" +#include "AABB3f.h" + +#include "qbvh6.h" +#include "common.h" + +#include "libs/lsc_intrinsics.h" + +#define SGPRINT_16x(prefix,fmt,type,val) {\ + type v0 = sub_group_broadcast( val, 0 );\ + type v1 = sub_group_broadcast( val, 1 );\ + type v2 = sub_group_broadcast( val, 2 );\ + type v3 = sub_group_broadcast( val, 3 );\ + type v4 = sub_group_broadcast( val, 4 );\ + type v5 = sub_group_broadcast( val, 5 );\ + type v6 = sub_group_broadcast( val, 6 );\ + type v7 = sub_group_broadcast( val, 7 );\ + type v8 = sub_group_broadcast( val, 8 );\ + type v9 = sub_group_broadcast( val, 9 );\ + type v10 = sub_group_broadcast( val, 10 );\ + type v11 = sub_group_broadcast( val, 11 );\ + type v12 = sub_group_broadcast( val, 12 );\ + type v13 = sub_group_broadcast( val, 13 );\ + type v14 = sub_group_broadcast( val, 14 );\ + type v15 = sub_group_broadcast( val, 15 );\ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ + if( get_sub_group_local_id() == 0 ) { \ + printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \ + fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \ + v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}} + + +#define SGPRINT_6x(prefix,fmt,type,val) {\ + type v0 = sub_group_broadcast( val, 0 );\ + type v1 = sub_group_broadcast( val, 1 );\ + type v2 = sub_group_broadcast( val, 2 );\ + type v3 = sub_group_broadcast( val, 3 );\ + type v4 = sub_group_broadcast( val, 4 );\ + type v5 = sub_group_broadcast( val, 5 );\ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ + if( get_sub_group_local_id() == 0 ) { \ + printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \ + v0,v1,v2,v3,v4,v5);}} + +#define BFS_WG_SIZE 512 + +#define BFS_NUM_VCONTEXTS 256 // must be multiple of 64 + +#define TREE_ARITY 6 + +#define DFS_WG_SIZE 256 +#define DFS_THRESHOLD 256 + + +void BFSDispatchQueue_print(struct BFSDispatchQueue* q, uint n) +{ + for (uint i = 0; i < q->num_dispatches; i++) + printf(" %u,ctx=%u,batch=%u\n", q->wg_count[i], q->records[i].context_id, q->records[i].batch_index); +} + +void VContextScheduler_print(struct VContextScheduler* scheduler) +{ + if (get_local_id(0) == 0) + { + printf("SCHEDULER:\n"); + printf(" bfs=%u dfs=%u\n", scheduler->num_bfs_wgs, scheduler->num_dfs_wgs); + + printf("BFS QUEUE:\n"); + BFSDispatchQueue_print(&scheduler->bfs_queue, scheduler->num_bfs_wgs); + + + printf("DFS QUEUE\n"); + for (uint i = 0; i < scheduler->num_dfs_wgs; i++) + { + struct DFSDispatchRecord* r = &scheduler->dfs_queue.records[i]; + printf(" (%u-%u) root=%u depth=%u batch_index=%u\n", + r->primref_base, r->primref_base + r->num_primrefs, + r->bvh2_base, r->tree_depth, r->batch_index); + } + + printf("CONTEXTS:\n"); + for (uint i = 0; i < BFS_NUM_VCONTEXTS; i++) + { + if (scheduler->vcontext_state[i] != VCONTEXT_STATE_UNALLOCATED) + { + printf(" context: %u state=%u\n", i, scheduler->vcontext_state[i]); + printf(" prims: %u-%u\n", scheduler->contexts[i].dispatch_primref_begin, scheduler->contexts[i].dispatch_primref_end); + printf(" depth: %u\n", scheduler->contexts[i].tree_depth); + printf(" root: %u\n", 
scheduler->contexts[i].bvh2_root); + printf(" batch: %u\n", scheduler->contexts[i].batch_index); + } + } + + + + } + +} + + +inline float3 select_min(float3 v, bool mask) +{ + return (float3)(mask ? v.x : (float)(INFINITY), + mask ? v.y : (float)(INFINITY), + mask ? v.z : (float)(INFINITY)); +} +inline float3 select_max(float3 v, bool mask) +{ + return (float3)(mask ? v.x : -(float)(INFINITY), + mask ? v.y : -(float)(INFINITY), + mask ? v.z : -(float)(INFINITY)); +} + +/////////////////////////////////////////////////////////////////////////// + +// The 'LRBounds' structure uses negated-max to allow +// both atomic_min and atomic_max to be issued fused into one message + +struct AABB3f LRBounds_get_left_centroid( LRBounds* b ) +{ + struct AABB3f* pbox = &b->boxes.left_centroid_bounds; + return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); +} +struct AABB3f LRBounds_get_right_centroid( LRBounds* b ) +{ + struct AABB3f* pbox = &b->boxes.right_centroid_bounds; + return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); +} +struct AABB3f LRBounds_get_left_geom( LRBounds* b ) +{ + struct AABB3f* pbox = &b->boxes.left_geom_bounds; + return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); +} +struct AABB3f LRBounds_get_right_geom( LRBounds* b ) +{ + struct AABB3f* pbox = &b->boxes.right_geom_bounds; + return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); +} + + +void LRBounds_merge_left( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax ) +{ + // All of the input vectors have come from sub-group reductions and are thus uniform + // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs + // The code below should result in 1 atomic_min message and a simularly large stack of movs + + float mergeVal0 = INFINITY; + float mergeVal1 = INFINITY; + uint i = get_sub_group_local_id(); + + // insert the various merge values into one register + // We use two parallel variables here to enable some ILP + + uint imod = (i>=6) ? (i-6) : i; + mergeVal0 = (imod==0) ? CMin.x : mergeVal0; + mergeVal1 = (imod==0) ? GMin.x : mergeVal1; + + mergeVal0 = (imod==1) ? CMin.y : mergeVal0; + mergeVal1 = (imod==1) ? GMin.y : mergeVal1; + + mergeVal0 = (imod==2) ? CMin.z : mergeVal0; + mergeVal1 = (imod==2) ? GMin.z : mergeVal1; + + mergeVal0 = (imod==3) ? -CMax.x : mergeVal0; + mergeVal1 = (imod==3) ? -GMax.x : mergeVal1; + + mergeVal0 = (imod==4) ? -CMax.y : mergeVal0; + mergeVal1 = (imod==4) ? -GMax.y : mergeVal1; + + mergeVal0 = (imod==5) ? -CMax.z : mergeVal0; + mergeVal1 = (imod==5) ? -GMax.z : mergeVal1; + + float merge = (i<6) ? 
mergeVal0 : mergeVal1; + if( i < 12 ) + atomic_min( &b->scalars.Array[i], merge ); + + //atomic_min( &b->boxes.left_centroid_bounds.lower[0], CMin.x ); + //atomic_min( &b->boxes.left_centroid_bounds.lower[1], CMin.y ); + //atomic_min( &b->boxes.left_centroid_bounds.lower[2], CMin.z ); + //atomic_min( &b->boxes.left_centroid_bounds.upper[0], -CMax.x ); + //atomic_min( &b->boxes.left_centroid_bounds.upper[1], -CMax.y ); + //atomic_min( &b->boxes.left_centroid_bounds.upper[2], -CMax.z ); + //atomic_min( &b->boxes.left_geom_bounds.lower[0], GMin.x ); + //atomic_min( &b->boxes.left_geom_bounds.lower[1], GMin.y ); + //atomic_min( &b->boxes.left_geom_bounds.lower[2], GMin.z ); + //atomic_min( &b->boxes.left_geom_bounds.upper[0], -GMax.x ); + //atomic_min( &b->boxes.left_geom_bounds.upper[1], -GMax.y ); + //atomic_min( &b->boxes.left_geom_bounds.upper[2], -GMax.z ); +} + +void LRBounds_merge_right( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax ) +{ + // All of the input vectors have come from sub-group reductions and are thus uniform + // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs + // The code below should result in 1 atomic_min message and a simularly large stack of movs + + float mergeVal0 = INFINITY; + float mergeVal1 = INFINITY; + uint i = get_sub_group_local_id(); + + // insert the various merge values into one register + // We use two parallel variables here to enable some ILP + + uint imod = (i>=6) ? (i-6) : i; + mergeVal0 = (imod==0) ? CMin.x : mergeVal0; + mergeVal1 = (imod==0) ? GMin.x : mergeVal1; + + mergeVal0 = (imod==1) ? CMin.y : mergeVal0; + mergeVal1 = (imod==1) ? GMin.y : mergeVal1; + + mergeVal0 = (imod==2) ? CMin.z : mergeVal0; + mergeVal1 = (imod==2) ? GMin.z : mergeVal1; + + mergeVal0 = (imod==3) ? -CMax.x : mergeVal0; + mergeVal1 = (imod==3) ? -GMax.x : mergeVal1; + + mergeVal0 = (imod==4) ? -CMax.y : mergeVal0; + mergeVal1 = (imod==4) ? -GMax.y : mergeVal1; + + mergeVal0 = (imod==5) ? -CMax.z : mergeVal0; + mergeVal1 = (imod==5) ? -GMax.z : mergeVal1; + + float merge = (i<6) ? 
mergeVal0 : mergeVal1; + if( i < 12 ) + atomic_min( &b->scalars.Array[i+12], merge ); + + //atomic_min( &b->boxes.right_centroid_bounds.lower[0], CMin.x ); + //atomic_min( &b->boxes.right_centroid_bounds.lower[1], CMin.y ); + //atomic_min( &b->boxes.right_centroid_bounds.lower[2], CMin.z ); + //atomic_min( &b->boxes.right_centroid_bounds.upper[0], -CMax.x ); + //atomic_min( &b->boxes.right_centroid_bounds.upper[1], -CMax.y ); + //atomic_min( &b->boxes.right_centroid_bounds.upper[2], -CMax.z ); + //atomic_min( &b->boxes.right_geom_bounds.lower[0], GMin.x ); + //atomic_min( &b->boxes.right_geom_bounds.lower[1], GMin.y ); + //atomic_min( &b->boxes.right_geom_bounds.lower[2], GMin.z ); + //atomic_min( &b->boxes.right_geom_bounds.upper[0], -GMax.x ); + //atomic_min( &b->boxes.right_geom_bounds.upper[1], -GMax.y ); + //atomic_min( &b->boxes.right_geom_bounds.upper[2], -GMax.z ); +} + +void LRBounds_merge( global LRBounds* globalBounds, local LRBounds* localBounds ) +{ + uint i = get_local_id(0); + if( i < 24 ) + atomic_min(&globalBounds->scalars.Array[i], localBounds->scalars.Array[i] ); +} + + +void LRBounds_init( LRBounds* bounds ) +{ + uint i = get_local_id(0) * 4; + if( i < 24 ) + { + // compiler should merge it into a 4xdword send + bounds->scalars.Array[i+0] = INFINITY; + bounds->scalars.Array[i+1] = INFINITY; + bounds->scalars.Array[i+2] = INFINITY; + bounds->scalars.Array[i+3] = INFINITY; + } + +} + + +inline void LRBounds_init_subgroup( LRBounds* bounds) +{ + uint sg_size = get_sub_group_size(); + uint lane = get_sub_group_local_id(); + + for (uint i = lane * 4; i < 24; i += sg_size * 4) + { + // compiler should merge it into a 4xdword send + bounds->scalars.Array[i+0] = INFINITY; + bounds->scalars.Array[i+1] = INFINITY; + bounds->scalars.Array[i+2] = INFINITY; + bounds->scalars.Array[i+3] = INFINITY; + } + +} + +/////////////////////////////////////////////////////////////////////////// + +inline void BinInfo_init(struct BFS_BinInfo* bin_info) +{ + for (uint id = get_local_id(0) * 4; id < 18 * BFS_NUM_BINS; id += get_local_size(0) * 4) + { + float inf = INFINITY; + // compiler should merge it into a 4xdword send + bin_info->min_max[id+0] = inf; + bin_info->min_max[id+1] = inf; + bin_info->min_max[id+2] = inf; + bin_info->min_max[id+3] = inf; + } + for (uint id = get_local_id(0) * 4; id < 3 * BFS_NUM_BINS; id += get_local_size(0) * 4) + { + // compiler should merge it into a 4xdword send + bin_info->counts[id+0] = 0; + bin_info->counts[id+1] = 0; + bin_info->counts[id+2] = 0; + bin_info->counts[id+3] = 0; + } +} + + +// copy global to local +inline void BinInfo_copy( local struct BFS_BinInfo* local_bin_info, global struct BFS_BinInfo* global_bin_info ) +{ + for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0)) + { + float inf = INFINITY ; + float f = global_bin_info->min_max[id]; + local_bin_info->min_max[id] = f; + } + for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0)) + { + local_bin_info->counts[id] = global_bin_info->counts[id]; + } +} + +inline void BinInfo_init_subgroup(struct BFS_BinInfo* bin_info) +{ + uint sg_size = get_sub_group_size(); + uint lane = get_sub_group_local_id(); + + for (uint i = lane * 4; i < 3 * BFS_NUM_BINS; i += sg_size * 4) + { + // compiler should merge it into a 4xdword send + bin_info->counts[i+0] = 0; + bin_info->counts[i+1] = 0; + bin_info->counts[i+2] = 0; + bin_info->counts[i+3] = 0; + } + + + for (uint i = lane * 4; i < 18 * BFS_NUM_BINS; i += sg_size * 4) + { + // compiler should merge it into 
a 4xdword send + bin_info->min_max[i+0] = INFINITY; + bin_info->min_max[i+1] = INFINITY; + bin_info->min_max[i+2] = INFINITY; + bin_info->min_max[i+3] = INFINITY; + } + +} + +float3 shuffle_down_float3( float3 a, float3 b, uint delta ) +{ + return (float3)( + intel_sub_group_shuffle_down( a.x, b.x, delta ), + intel_sub_group_shuffle_down( a.y, b.y, delta ), + intel_sub_group_shuffle_down( a.z, b.z, delta ) + ); +} + + + + +void BinInfo_primref_ballot_loop( local struct BFS_BinInfo* bin_info, uint axis, uint bin, float3 lower, float3 upper, bool active_lane ) +{ + local float* bins_min = &bin_info->min_max[0]; + local float* bins_max = &bin_info->min_max[3]; + + varying uint place = (bin + axis*BFS_NUM_BINS); + varying uint lane = get_sub_group_local_id(); + + uniform uint active_mask = intel_sub_group_ballot(active_lane); + + while( active_mask ) + { + uniform uint leader = ctz( active_mask ); + uniform uint lead_place = intel_sub_group_shuffle( place, leader ); + varying bool matching_bin = lead_place == place && active_lane; + + varying float3 lo = (float3)(INFINITY,INFINITY,INFINITY); + varying float3 hi = (float3)(-INFINITY,-INFINITY,-INFINITY); + if (matching_bin) + { + lo = lower.xyz; + hi = upper.xyz; + } + + lo = sub_group_reduce_min_float3( lo ); + hi = sub_group_reduce_max_float3( hi ); + + { + // atomic min operation vectorized across 6 lanes + // [ lower.xyz ][-][upper.xyz][-] + // + // Lanes 3 and 7 are inactive + + uint lmod = lane % 4; + uint ldiv = lane / 4; + float vlo = lo.x; + float vhi = hi.x; + vlo = (lmod == 1) ? lo.y : vlo; + vhi = (lmod == 1) ? hi.y : vhi; + vlo = (lmod == 2) ? lo.z : vlo; + vhi = (lmod == 2) ? hi.z : vhi; + + float v = (ldiv == 0) ? vlo : -vhi; + + if( (1<min_max[ 6*lead_place + lmod + 3*ldiv ], v ); + } + + //if( lane == 0 ) + // atomic_add_local(&bin_info->counts[lead_place], popcount(active_mask & intel_sub_group_ballot(matching_bin)) ); + + active_mask = active_mask & intel_sub_group_ballot(!matching_bin); + } +} + +inline void BinInfo_add_primref(struct BinMapping* binMapping, local struct BFS_BinInfo* bin_info, PrimRef* primref, bool active_lane ) +{ + + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4( (p - binMapping->ofs) * binMapping->scale ); + + BinInfo_primref_ballot_loop( bin_info, 0, i.x, lower.xyz, upper.xyz, active_lane ); + BinInfo_primref_ballot_loop( bin_info, 1, i.y, lower.xyz, upper.xyz, active_lane ); + BinInfo_primref_ballot_loop( bin_info, 2, i.z, lower.xyz, upper.xyz, active_lane ); + + if (active_lane) + { + atomic_inc_local( &bin_info->counts[i.x + 0 * BFS_NUM_BINS] ); + atomic_inc_local( &bin_info->counts[i.y + 1 * BFS_NUM_BINS] ); + atomic_inc_local( &bin_info->counts[i.z + 2 * BFS_NUM_BINS] ); + } +} + +inline void BinInfo_merge(global struct BFS_BinInfo* global_info, local struct BFS_BinInfo* local_info) +{ + uint id = get_local_id(0); + for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0)) + { + float v = local_info->min_max[id]; + if( v != INFINITY ) + atomic_min(&global_info->min_max[id], v); + } + for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0)) + { + uint c = local_info->counts[id]; + if( c ) + atomic_add_global(&global_info->counts[id], c); + } +} + +inline struct AABB3f BinInfo_get_AABB(struct BFS_BinInfo* bin_info, ushort bin, ushort axis) +{ + float* min = &bin_info->min_max[6*(bin + axis*BFS_NUM_BINS)]; + float* max = min + 3; + struct AABB3f box; + for (uint 
i = 0; i < 3; i++) + { + box.lower[i] = min[i]; + box.upper[i] = -max[i]; + } + + return box; +} + +inline uint3 BinInfo_get_counts(struct BFS_BinInfo* bin_info, ushort bin) +{ + uint3 counts; + counts.x = bin_info->counts[bin + 0 * BFS_NUM_BINS]; // TODO: block load these + counts.y = bin_info->counts[bin + 1 * BFS_NUM_BINS]; + counts.z = bin_info->counts[bin + 2 * BFS_NUM_BINS]; + return counts; +} +inline uint BinInfo_get_count(struct BFS_BinInfo* bin_info, ushort bin, ushort axis) +{ + return bin_info->counts[bin + axis * BFS_NUM_BINS]; +} + + +void BVH2_Initialize( struct BVH2* bvh ) +{ + bvh->num_nodes = 1; +} + +inline bool BVH2_IsInnerNode( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + return (n->meta_ss & 0x10000) != 0; +} +inline uint BVH2_GetRoot( struct BVH2* bvh ) +{ + return 0; +} + +////////////////////////////////////////////// +// BVH2NodeMetaData funcs +////////////////////////////////////////////// +struct BVH2NodeMetaData +{ + uint meta_u; // leaf: primref start. inner: offset from node to its first child + uint meta_ss; +}; + +inline struct BVH2NodeMetaData BVH2_GetNodeMetaData( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + struct BVH2NodeMetaData meta; + meta.meta_u = n->meta_u; + meta.meta_ss = n->meta_ss; + return meta; +} + +inline bool BVH2NodeMetaData_IsInnerNode( struct BVH2NodeMetaData* meta ) +{ + return (meta->meta_ss & 0x10000) != 0; +} + +inline ushort BVH2NodeMetaData_GetLeafPrimCount( struct BVH2NodeMetaData* meta ) +{ + return meta->meta_ss & 0xffff; +} + +inline uint BVH2NodeMetaData_GetLeafPrimStart( struct BVH2NodeMetaData* meta ) +{ + return meta->meta_u; +} + +inline uint BVH2NodeMetaData_GetMask( struct BVH2NodeMetaData* meta ) +{ + return (meta->meta_ss>>24); +} + +////////////////////////////////////////////// + +inline ushort BVH2_GetLeafPrimCount( struct BVH2* bvh, uint node_index ) +{ + struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; + return n->meta_ss & 0xffff; +} +inline uint BVH2_GetLeafPrimStart( struct BVH2* bvh, uint node_index ) +{ + struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; + return n->meta_u; +} +inline uint2 BVH2_GetChildIndices( struct BVH2* bvh, uint node_index ) +{ + struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; + uint2 idx; + idx.x = n->meta_u; + idx.y = idx.x + (n->meta_ss & 0xffff); + return idx; +} + +inline float BVH2_GetNodeArea( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + return AABB3f_halfArea( &n->box ); +} + + +inline struct AABB3f BVH2_GetNodeBox( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + return n->box; +} +inline void BVH2_SetNodeBox( global struct BVH2* bvh, uint node_index, struct AABB3f* box ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + n->box = *box; +} + +inline void BVH2_SetNodeBox_lu( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + AABB3f_set( &n->box, lower, upper ); +} + +inline void BVH2_InitNodeBox( struct BVH2* bvh, uint node_index ) +{ + struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; + AABB3f_init( &n->box ); +} + +inline struct AABB BVH2_GetAABB( 
global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + struct AABB r; + r.lower.xyz = AABB3f_load_lower( &n->box ); + r.upper.xyz = AABB3f_load_upper( &n->box ); + return r; +} + +inline void BVH2_WriteInnerNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint2 child_offsets, uint mask ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + n->box = *box; + n->meta_u = child_offsets.x; + n->meta_ss = 0x10000 + (child_offsets.y - child_offsets.x) + (mask<<24); + // n->is_inner = true; +} + +inline void BVH2_WriteLeafNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint prim_start, uint prim_count, uint mask ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + n->box = *box; + n->meta_u = prim_start; + n->meta_ss = prim_count + (mask<<24); + // n->is_inner = true; +} + +inline uint BVH2_GetMask( global struct BVH2* bvh, uint node_index ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + return (n->meta_ss>>24); +} + + +uint BVH2_AllocateNodes( global struct BVH2* bvh, uint num_nodes ) +{ + return atomic_add_global( &bvh->num_nodes, num_nodes ); +} + +inline void BVH2_AtomicMergeNodeBox( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper ) +{ + global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; + AABB3f_atomic_merge_global_lu( &n->box, lower, upper ); +} + + +void BVH2_print( global struct BVH2* bvh, uint start_node ) +{ + if ( get_local_id( 0 ) == 0 && get_sub_group_id() == 0 ) + { + uint num_nodes = bvh->num_nodes; + + uint2 stack[BFS_MAX_DEPTH * 2]; + uint sp = 0; + + printf( "allocated_nodes=%u\n", num_nodes ); + + stack[sp++] = (uint2)(start_node, 0); + while ( sp > 0 ) + { + uint2 data = stack[--sp]; + uint node = data.x; + uint depth = data.y; + + for ( uint i = 0; i < depth; i++ ) + printf( " " ); + + if ( BVH2_IsInnerNode( bvh, node ) ) + { + uint2 kids = BVH2_GetChildIndices( bvh, node ); + printf( " %5u: inner: %u %u \n", node, kids.x, kids.y ); + stack[sp++] = (uint2)(kids.y, depth + 1); + stack[sp++] = (uint2)(kids.x, depth + 1); + + struct AABB3f l = BVH2_GetNodeBox( bvh, kids.x ); + struct AABB3f r = BVH2_GetNodeBox( bvh, kids.y ); + struct AABB3f p = BVH2_GetNodeBox( bvh, node ); + + float3 pl = AABB3f_load_lower( &p ); + float3 pu = AABB3f_load_upper( &p ); + float3 ll = AABB3f_load_lower( &l ); + float3 lu = AABB3f_load_upper( &l ); + float3 rl = AABB3f_load_lower( &r ); + float3 ru = AABB3f_load_upper( &r ); + if ( any( ll < pl ) || any( rl < pl ) || + any( lu > pu ) || any( ru > pu ) ) + { + for ( uint i = 0; i < depth; i++ ) + printf( " " ); + + printf( "BAD_BOUNDS!!!!!!!! %u\n", node ); + } + + + } + else + { + + uint start = BVH2_GetLeafPrimStart( bvh, node ); + uint count = BVH2_GetLeafPrimCount( bvh, node ); + printf( " %5u: leaf: start=%u count=%u\n ",node,start,count ); + + } + } + } + barrier( CLK_LOCAL_MEM_FENCE ); +} + + +global uint* SAHBuildGlobals_GetPrimrefIndices_In( struct SAHBuildGlobals* globals, bool odd_pass ) +{ + uint num_refs = globals->num_primrefs; + global uint* ib = (global uint*) globals->p_primref_index_buffers; + return ib + (odd_pass ? 
num_refs : 0); +} + +global uint* SAHBuildGlobals_GetPrimrefIndices_Out( struct SAHBuildGlobals* globals, bool odd_pass ) +{ + uint num_refs = globals->num_primrefs; + global uint* ib = (global uint*) globals->p_primref_index_buffers; + return ib + (odd_pass ? 0 : num_refs); +} + +global PrimRef* SAHBuildGlobals_GetPrimrefs( struct SAHBuildGlobals* globals ) +{ + return (global PrimRef*) globals->p_primrefs_buffer; +} + +global struct BVH2* SAHBuildGlobals_GetBVH2( struct SAHBuildGlobals* globals ) +{ + return (global struct BVH2*)globals->p_bvh2; +} + +uint SAHBuildGlobals_GetLeafSizeInBytes( struct SAHBuildGlobals* globals ) +{ + return globals->leaf_size; +} + +uint SAHBuildGlobals_GetLeafType( struct SAHBuildGlobals* globals ) +{ + return globals->leaf_type; +} + +uint SAHBuildGlobals_GetInternalNodeType( struct SAHBuildGlobals* globals ) +{ + return NODE_TYPE_INTERNAL; +} + +global struct BVHBase* SAHBuildGlobals_GetBVHBase( struct SAHBuildGlobals* globals ) +{ + return (global struct BVHBase*) globals->p_bvh_base; +} + +uint SAHBuildGlobals_GetTotalPrimRefs( struct SAHBuildGlobals* globals ) +{ + return globals->num_primrefs; +} + +inline bool SAHBuildGlobals_NeedBackPointers( struct SAHBuildGlobals* globals ) +{ + return globals->flags & SAH_FLAG_NEED_BACKPOINTERS; +} +inline bool SAHBuildGlobals_NeedMasks( struct SAHBuildGlobals* globals ) +{ + return globals->flags & SAH_FLAG_NEED_MASKS; +} + + +void SAHBuildGlobals_print( struct SAHBuildGlobals* globals ) +{ + if ( get_local_id( 0 ) == 0 ) + { + printf( "SAHBuildGlobals: %p\n", globals ); + printf( " p_primref_index_buffers =%p\n", globals->p_primref_index_buffers ); + printf( " p_primrefs_buffer =%p\n", globals->p_primrefs_buffer ); + printf( " p_bvh2 =%p\n", globals->p_bvh2 ); + printf( " p_globals =%p\n", globals->p_globals ); + printf( " p_bvh_base =%p\n", globals->p_bvh_base ); + printf( " num_primrefs = %u\n", globals->num_primrefs ); + printf( " leaf_size = %u\n", globals->leaf_size ); + printf( " leaf_type = %u\n", globals->leaf_type ); + printf( " p_qnode_buffer = %p\n", globals->p_qnode_root_buffer); + } + + barrier( CLK_LOCAL_MEM_FENCE ); +} + + +uint get_num_wgs(uint thread_count, uint WG_SIZE) +{ + return (thread_count + WG_SIZE - 1) / WG_SIZE; +} + + + + + +struct BFSDispatchArgs +{ + global struct VContextScheduler* scheduler; + global struct VContext* context; + global struct BVH2* bvh2; + global uint* primref_index_in; + global uint* primref_index_out; + global PrimRef* primref_buffer; + + uint wg_primref_begin; + uint wg_primref_end; + uint dispatch_primref_begin; + uint dispatch_primref_end; + uint context_id; + uint num_wgs; + uint bvh2_root; + uint global_num_primrefs; + bool do_mask_processing; +}; + + + + +// TODO_OPT: Enable larger WGs +// We need a way to do this in a portable fashion. 
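+//
+// Note on the primref index buffers referenced by BFSDispatchArgs above: the
+// SAHBuildGlobals_GetPrimrefIndices_In/Out helpers select opposite halves of a single
+// index allocation, so successive BFS partitioning passes ping-pong between the two
+// halves as the tree depth alternates between even and odd. A minimal sketch of that
+// selection, assuming the allocation holds 2 * num_primrefs indices (names here are
+// illustrative):
+//
+//    global uint* in  = index_base + (odd_pass ? num_primrefs : 0);
+//    global uint* out = index_base + (odd_pass ? 0 : num_primrefs);
+//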
+// Gen12 can support larger WGs than Gen9 can +// +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) +kernel void +begin( global struct VContextScheduler* scheduler, + dword leaf_size, + dword leaf_type, + global uint* primref_index_buffers, + global PrimRef* primref_buffer, + global struct BVH2* bvh2, + global struct BVHBase* bvh_base, + global struct Globals* globals, + global struct SAHBuildGlobals* sah_globals, + global uint2* qnode_root_buffer, + dword sah_globals_flags + ) +{ + dword num_primrefs = globals->numPrimitives; + if ( get_local_id( 0 ) == 0 ) + { + sah_globals->p_primrefs_buffer = (qword) primref_buffer; + sah_globals->p_primref_index_buffers = (qword)primref_index_buffers; + sah_globals->p_bvh2 = (qword) bvh2; + sah_globals->p_bvh_base = (qword) bvh_base; + sah_globals->leaf_size = leaf_size; + sah_globals->leaf_type = leaf_type; + sah_globals->num_primrefs = num_primrefs; + sah_globals->p_globals = (qword) globals; + sah_globals->p_qnode_root_buffer = (gpuva_t) qnode_root_buffer; + sah_globals->flags = sah_globals_flags; + + // initialize the spill stack + scheduler->bfs2_spill_stack.size = 0; + + // initialize BVH2 node counter + BVH2_Initialize( bvh2 ); + + // configure first vcontext for first build + scheduler->contexts[0].dispatch_primref_begin = 0; + scheduler->contexts[0].dispatch_primref_end = num_primrefs; + scheduler->contexts[0].bvh2_root = BVH2_GetRoot( bvh2 ); + scheduler->contexts[0].tree_depth = 0; + scheduler->contexts[0].batch_index = 0; + + scheduler->bfs_queue.records[0].context_id = 0; + + scheduler->contexts[0].num_left = 0; + scheduler->contexts[0].num_right = 0; + scheduler->contexts[0].lr_mask = 0; + + // copy centroid bounds into the BVH2 root node' + BVH2_SetNodeBox_lu( bvh2, BVH2_GetRoot( bvh2 ), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz ); + + // zero the trivial build counters.. these are only used by the batch-build path + // but single-wg QNode path (if used) depends on them + scheduler->num_trivial_builds = 0; + scheduler->num_single_builds = 0; + + // initialize the root-buffer counters + sah_globals->root_buffer_num_produced = 0; + sah_globals->root_buffer_num_produced_hi = 0; + sah_globals->root_buffer_num_consumed = 0; + sah_globals->root_buffer_num_consumed_hi = 0; + } + + // initialize vcontext states + for ( uint i = get_local_id( 0 ); i < BFS_NUM_VCONTEXTS; i += get_local_size( 0 ) ) + scheduler->vcontext_state[i] = (i==0) ? VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED; + + // initialize global bin info in vcontext - only context[0] will be used in first iteration + BinInfo_init( &scheduler->contexts[0].global_bin_info ); + LRBounds_init( &scheduler->contexts[0].lr_bounds ); + + // barrier( CLK_GLOBAL_MEM_FENCE ); // lsc flush ... driver now does these as part of COMPUTE_WALKER +} + +// TODO_OPT: Enable larger WGs +// We need a way to do this in a portable fashion. +// Gen12 can support larger WGs than Gen9 can +// + + +// TODO_OPT: Enable larger WGs +// We need a way to do this in a portable fashion. +// Gen12 can support larger WGs than Gen9 can +// +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(512, 1, 1))) +kernel void +categorize_builds_and_init_scheduler( + global struct VContextScheduler* scheduler, + global gpuva_t* globals_ptrs, // OCL-C does not allow kernel parameters to be pointer-to-pointer, so we trick it... 
+ global struct SAHBuildBuffersInfo* buffers_info, + global struct SAHBuildGlobals* builds_out, + dword num_builds +) +{ + local uint num_trivial; + local uint num_single; + local uint num_full; + + if (get_group_id(0) == 0) // first workgroup performs build categorization + { + if (get_local_id(0) == 0) + { + num_trivial = 0; + num_single = 0; + num_full = 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // first pass, count builds of each type + uint triv = 0; + uint single = 0; + uint full = 0; + for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0)) + { + global struct Globals* globals = (global struct Globals*) globals_ptrs[i]; + dword num_refs = globals->numPrimitives; + + if (num_refs <= TRIVIAL_BUILD_THRESHOLD) + triv++; + else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD) + single++; + else + full++; + } + + // merge counts across work-group. These variables are now offsets into this thread's ranges + triv = atomic_add_local(&num_trivial, triv); + single = atomic_add_local(&num_single, single); + full = atomic_add_local(&num_full, full); + + barrier(CLK_LOCAL_MEM_FENCE); + + global struct SAHBuildGlobals* trivial_builds_out = builds_out; + global struct SAHBuildGlobals* single_builds_out = builds_out + num_trivial; + global struct SAHBuildGlobals* full_builds_out = builds_out + num_trivial + num_single; + + for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0)) + { + global struct Globals* globals = (global struct Globals*) globals_ptrs[i]; + global struct SAHBuildBuffersInfo* buffers = &buffers_info[i]; + + dword num_refs = globals->numPrimitives; + dword leaf_type = globals->leafPrimType; + dword leaf_size = globals->leafSize; + + global struct SAHBuildGlobals* place; + if (num_refs <= TRIVIAL_BUILD_THRESHOLD) + place = trivial_builds_out + (triv++); + else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD) + place = single_builds_out + (single++); + else + place = full_builds_out + (full++); + + place->p_primref_index_buffers = buffers->p_primref_index_buffers; + place->p_primrefs_buffer = buffers->p_primrefs_buffer; + place->p_bvh2 = buffers->p_bvh2; + place->p_bvh_base = buffers->p_bvh_base; + place->p_globals = (gpuva_t)globals; + place->num_primrefs = num_refs; + place->leaf_size = leaf_size; + place->leaf_type = leaf_type; + place->flags = buffers->sah_globals_flags; + place->p_qnode_root_buffer = buffers->p_qnode_root_buffer; + + // only initialize BVH2 if it will actually be used by the build + // trivial passes will not use it + if( num_refs > SINGLE_WG_BUILD_THRESHOLD ) + { + // initialize BVH2 node counter + global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(place); + BVH2_Initialize(bvh2); + + // copy centroid bounds into the BVH2 root node' + BVH2_SetNodeBox_lu(bvh2, BVH2_GetRoot(bvh2), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz); + } + } + + if (get_local_id(0) == 0) + { + scheduler->num_trivial_builds = num_trivial; + scheduler->num_single_builds = num_single; + scheduler->batched_build_offset = num_trivial + num_single; + scheduler->batched_build_count = num_full; + } + } + else // second workgroup initializes the scheduler + { + // initialize vcontext states + for (uint i = get_local_id(0); i < BFS_NUM_VCONTEXTS; i += get_local_size(0)) + scheduler->vcontext_state[i] = (i == 0) ? 
VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED; + + // initialize global bin info in vcontexts + for (uint i = get_sub_group_id(); i < BFS_NUM_VCONTEXTS; i += get_num_sub_groups()) + BinInfo_init_subgroup(&scheduler->contexts[i].global_bin_info); + + // initialize the spill stack + if (get_local_id(0) == 0) + scheduler->bfs2_spill_stack.size = 0; + } + + //barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE );// lsc flush ... driver now does these as part of COMPUTE_WALKER +} + + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BFS_NUM_VCONTEXTS, 1, 1))) +kernel void +begin_batchable( + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* sah_globals +) +{ + ushort scheduler_build_offset = scheduler->batched_build_offset; + ushort scheduler_num_builds = scheduler->batched_build_count; + + ushort num_builds = min( scheduler_num_builds, (ushort)BFS_NUM_VCONTEXTS ); + + uint num_wgs = 0; + + ushort tid = get_local_id(0); + if ( tid < num_builds ) + { + ushort batch_index = scheduler_build_offset + tid; + + uint num_primrefs = sah_globals[batch_index].num_primrefs; + + // configure first vcontext for first build + scheduler->contexts[tid].dispatch_primref_begin = 0; + scheduler->contexts[tid].dispatch_primref_end = num_primrefs; + scheduler->contexts[tid].bvh2_root = BVH2_GetRoot( SAHBuildGlobals_GetBVH2(&sah_globals[batch_index]) ); + scheduler->contexts[tid].tree_depth = 0; + scheduler->contexts[tid].batch_index = batch_index; + scheduler->vcontext_state[tid] = VCONTEXT_STATE_EXECUTING; + + scheduler->contexts[tid].num_left = 0; + scheduler->contexts[tid].num_right = 0; + scheduler->contexts[tid].lr_mask = 0; + + num_wgs = get_num_wgs( num_primrefs, BFS_WG_SIZE ); + + scheduler->bfs_queue.wg_count[tid] = num_wgs; + scheduler->bfs_queue.records[tid].batch_index = batch_index; + scheduler->bfs_queue.records[tid].context_id = tid; + } + + num_wgs = work_group_reduce_add(num_wgs); + + if (tid == 0) + { + // write out build count and offset for next BFS iteration + scheduler->batched_build_offset = scheduler_build_offset + num_builds; + scheduler->batched_build_count = scheduler_num_builds - num_builds; + + // write out initial WG count and loop termination mask for command streamer to consume + scheduler->batched_build_wg_count = num_wgs; + scheduler->batched_build_loop_mask = (scheduler_num_builds > num_builds) ? 
1 : 0; + + scheduler->bfs_queue.num_dispatches = num_builds; + } + + for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() ) + BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info ); + + for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() ) + LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds ); +} + + + +bool is_leaf( uint num_refs ) +{ + return num_refs <= TREE_ARITY; +} + +bool is_dfs( uint num_refs ) +{ + return num_refs > TREE_ARITY&& num_refs <= DFS_THRESHOLD; +} + +bool is_bfs( uint num_refs ) +{ + return num_refs > DFS_THRESHOLD; +} + +int2 is_leaf_2( uint2 num_refs ) +{ + return num_refs.xy <= TREE_ARITY; +} +int2 is_bfs_2( uint2 num_refs ) +{ + return num_refs.xy > DFS_THRESHOLD; +} + +int2 is_dfs_2( uint2 num_refs ) +{ + return num_refs.xy > TREE_ARITY && num_refs.xy <= DFS_THRESHOLD; +} + +#if 0 +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +sg_scheduler( global struct VContextScheduler* scheduler ) +{ + local struct BFS1SpillStackEntry SLM_local_spill_stack[BFS_NUM_VCONTEXTS]; + local uchar SLM_context_state[BFS_NUM_VCONTEXTS]; + local vcontext_id_t SLM_free_list[BFS_NUM_VCONTEXTS]; + local vcontext_id_t SLM_exec_list[BFS_NUM_VCONTEXTS]; + + + varying ushort lane = get_sub_group_local_id(); + + uniform uint free_list_size = 0; + uniform uint exec_list_size = 0; + + // read context states, build lists of free and executing contexts + for (varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size()) + { + uchar state = scheduler->vcontext_state[i]; + SLM_context_state[i] = state; + + uniform ushort exec_mask = intel_sub_group_ballot(state == VCONTEXT_STATE_EXECUTING); + + varying ushort prefix_exec = subgroup_bit_prefix_exclusive(exec_mask); + varying ushort prefix_free = lane - prefix_exec; + varying ushort exec_list_pos = exec_list_size + prefix_exec; + varying ushort free_list_pos = free_list_size + prefix_free; + + if (state == VCONTEXT_STATE_EXECUTING) + SLM_exec_list[exec_list_pos] = i; + else + SLM_free_list[free_list_pos] = i; + + uniform ushort num_exec = popcount(exec_mask); + exec_list_size += num_exec; + free_list_size += get_sub_group_size() - num_exec; + } + + uniform uint total_bfs_dispatches = 0; + uniform uint total_dfs_dispatches = 0; + uniform uint bfs_spill_stack_size = 0; + uniform uint total_bfs_wgs = 0; + + // process executing context. accumulate bfs/dfs dispatches and free-list entries + for (uint i = 0; i < exec_list_size; i+= get_sub_group_size() ) + { + varying ushort num_dfs_dispatches = 0; + varying ushort num_bfs_spills = 0; + + varying ushort num_bfs_children; + varying ushort context_id; + struct VContext* context; + varying uint num_left ; + varying uint num_right ; + varying uint primref_begin ; + varying uint primref_end ; + varying uint depth ; + + bool active_lane = (i + lane) < exec_list_size; + if ( active_lane ) + { + context_id = SLM_exec_list[i + lane]; + context = &scheduler->contexts[context_id]; + + num_left = context->num_left; + num_right = context->num_right; + primref_begin = context->dispatch_primref_begin; + primref_end = context->dispatch_primref_end; + depth = context->tree_depth; + + // get dispatch counts + + num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right); + num_bfs_children = is_bfs(num_left) + is_bfs(num_right); + num_bfs_spills = (num_bfs_children == 2) ? 
1 : 0; + } + + // allocate space for DFS, BFS dispatches, and BFS spills + varying uint dfs_pos = total_dfs_dispatches + sub_group_scan_exclusive_add(num_dfs_dispatches); + varying ushort mask_bfs_spills = intel_sub_group_ballot(num_bfs_children & 2); // spill if #children == 2 + varying ushort mask_bfs_dispatches = intel_sub_group_ballot(num_bfs_children & 3); // dispatch if #children == 1 or 2 + varying uint bfs_spill_pos = bfs_spill_stack_size + subgroup_bit_prefix_exclusive(mask_bfs_spills); + varying uint bfs_dispatch_pos = total_bfs_dispatches + subgroup_bit_prefix_exclusive(mask_bfs_dispatches); + + total_dfs_dispatches += sub_group_reduce_add(num_dfs_dispatches); + bfs_spill_stack_size += popcount(mask_bfs_spills); + total_bfs_dispatches += popcount(mask_bfs_dispatches); + + varying uint num_bfs_wgs = 0; + if (active_lane) + { + if (num_dfs_dispatches) + { + if (is_dfs(num_left)) + { + scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin; + scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left; + scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->left_bvh2_root; + scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; + dfs_pos++; + } + if (is_dfs(num_right)) + { + scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left; + scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right; + scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->right_bvh2_root; + scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; + } + } + + uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right); + if (num_bfs_children == 2) + { + // spill the right child.. push an entry onto local spill stack + SLM_local_spill_stack[bfs_spill_pos].primref_begin = primref_begin + num_left; + SLM_local_spill_stack[bfs_spill_pos].primref_end = primref_end; + SLM_local_spill_stack[bfs_spill_pos].bvh2_root = context->right_bvh2_root; + SLM_local_spill_stack[bfs_spill_pos].tree_depth = depth + 1; + + // setup BFS1 dispatch for left child + context->dispatch_primref_end = primref_begin + num_left; + context->bvh2_root = context->left_bvh2_root; + context->tree_depth = depth + 1; + num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE); + + scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs; + scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id; + } + else if (num_bfs_children == 1) + { + // setup BFS1 dispatch for whichever child wants it + if (is_bfs(num_left)) + { + // bfs on left child + context->dispatch_primref_end = context->dispatch_primref_begin + num_left; + context->bvh2_root = context->left_bvh2_root; + context->tree_depth = depth + 1; + num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE); + } + else + { + // bfs on right child + context->dispatch_primref_begin = context->dispatch_primref_begin + num_left; + context->bvh2_root = context->right_bvh2_root; + context->tree_depth = depth + 1; + num_bfs_wgs = get_num_wgs(num_right, BFS_WG_SIZE); + } + + scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs; + scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id; + } + else + { + // no bfs dispatch.. 
this context is now free + SLM_context_state[context_id] = VCONTEXT_STATE_UNALLOCATED; + } + } + + // count bfs work groups + total_bfs_wgs += sub_group_reduce_add(num_bfs_wgs); + + // add newly deallocated contexts to the free list + uniform uint free_mask = intel_sub_group_ballot( active_lane && num_bfs_children == 0); + varying uint free_list_pos = free_list_size + subgroup_bit_prefix_exclusive(free_mask); + free_list_size += popcount(free_mask); + + if ( free_mask & (1<bfs2_spill_stack.size; + + if(bfs_spill_stack_size < free_list_size && memory_spill_stack_size > 0 ) + { + uniform uint read_count = min(free_list_size - bfs_spill_stack_size, memory_spill_stack_size); + + for (varying uint i = lane; i < read_count; i+= get_sub_group_size()) + SLM_local_spill_stack[bfs_spill_stack_size + i] = scheduler->bfs2_spill_stack.entries[memory_spill_stack_size - 1 - i]; + + bfs_spill_stack_size += read_count; + memory_spill_stack_size -= read_count; + } + + // steal pending BFS work and assign it to free contexts + uniform uint num_steals = min(bfs_spill_stack_size, free_list_size); + + for (uniform uint i = 0; i < num_steals; i += get_sub_group_size()) + { + varying uint num_bfs_wgs = 0; + + if (i + lane < num_steals) + { + uint context_id = SLM_free_list[i+lane]; + struct VContext* context = &scheduler->contexts[context_id]; + struct BFS1SpillStackEntry entry = SLM_local_spill_stack[i+lane]; + + context->dispatch_primref_begin = entry.primref_begin; + context->dispatch_primref_end = entry.primref_end; + context->bvh2_root = entry.bvh2_root; + context->tree_depth = entry.tree_depth; + + num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE); + + scheduler->bfs_queue.wg_count[total_bfs_dispatches + i + lane] = num_bfs_wgs; + scheduler->bfs_queue.records[total_bfs_dispatches + i + lane].context_id = context_id; + + SLM_context_state[context_id] = VCONTEXT_STATE_EXECUTING; + } + + total_bfs_wgs += sub_group_reduce_add( num_bfs_wgs ); + } + + total_bfs_dispatches += num_steals; + + // write out excess spills to global spill stack + uniform uint extra_spills = bfs_spill_stack_size - num_steals; + for (varying uint i = lane; i < extra_spills; i += get_sub_group_size()) + { + scheduler->bfs2_spill_stack.entries[memory_spill_stack_size + i] = SLM_local_spill_stack[num_steals+i]; + } + + + // write out modified context states + for ( varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size()) + scheduler->vcontext_state[i] = SLM_context_state[i]; + + + if (get_local_id(0) == 0) + { + // write out new memory stack size + scheduler->bfs2_spill_stack.size = memory_spill_stack_size + extra_spills; + + // store workgroup counters + scheduler->bfs_queue.num_dispatches = total_bfs_dispatches; + scheduler->num_bfs_wgs = total_bfs_wgs; + scheduler->num_dfs_wgs = total_dfs_dispatches; + } + + // barrier(CLK_GLOBAL_MEM_FENCE); // make memory writes globally visible// lsc flush ... 
driver now does these as part of COMPUTE_WALKER +} +#endif + +#define SCHEDULER_SG_SIZE 16 +#define SCHEDULER_WG_SIZE BFS_NUM_VCONTEXTS +#define SCHEDULER_NUM_SGS (SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE) + + +struct BFSDispatchArgs get_bfs_args_from_record_batchable( + struct BFSDispatchRecord* record, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ); + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(SCHEDULER_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(SCHEDULER_SG_SIZE))) +kernel void +scheduler(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS1SpillStackEntry SLM_local_spill_stack[2 * BFS_NUM_VCONTEXTS]; + local uint SLM_local_spill_stack_size; + local uint SLM_dfs_dispatch_count; + + if (get_local_id(0) == 0) + { + SLM_local_spill_stack_size = 0; + SLM_dfs_dispatch_count = 0; + } + + uint context_id = get_local_id(0); + uint state = scheduler->vcontext_state[context_id]; + uint initial_state = state; + + uint batch_index = 0; + global struct VContext* context = &scheduler->contexts[context_id]; + + barrier(CLK_LOCAL_MEM_FENCE); + + + uint global_spill_stack_size = scheduler->bfs2_spill_stack.size; + + + if (state == VCONTEXT_STATE_EXECUTING) + { + uint left_bvh2_root; + uint right_bvh2_root; + + uint num_left = context->num_left; + uint num_right = context->num_right; + + uint primref_begin = context->dispatch_primref_begin; + uint primref_end = context->dispatch_primref_end; + + uint depth = context->tree_depth; + uint batch_index = context->batch_index; + + struct BFSDispatchRecord record; + record.context_id = context_id; + record.batch_index = context->batch_index; + + struct BFSDispatchArgs args = get_bfs_args_from_record_batchable( &record, scheduler, sah_globals); + + // do cleanup of bfs_pass2 + { + // compute geom bounds + struct AABB3f left_geom_bounds; + struct AABB3f right_geom_bounds; + struct AABB3f left_centroid_bounds; + struct AABB3f right_centroid_bounds; + uint2 lr_counts = (uint2)(num_left, num_right); + + { + left_centroid_bounds = LRBounds_get_left_centroid( &context->lr_bounds ); + left_geom_bounds = LRBounds_get_left_geom( &context->lr_bounds ); + right_centroid_bounds = LRBounds_get_right_centroid( &context->lr_bounds ); + right_geom_bounds = LRBounds_get_right_geom( &context->lr_bounds ); + } + + int2 v_is_leaf = is_leaf_2( lr_counts ); + int2 v_is_dfs = is_dfs_2( lr_counts ); + int2 v_is_bfs = is_bfs_2( lr_counts ); + uint left_mask = args.do_mask_processing ? context->lr_mask & 0xff : 0xff; + uint right_mask = args.do_mask_processing ? (context->lr_mask & 0xff00) >> 8 : 0xff; + + // how many BVH2 nodes do we need to allocate? For DFS, we need to pre-allocate full subtree + uint2 lr_node_counts = select( (uint2)(1,1), (2*lr_counts-1), v_is_dfs ); + uint left_node_count = lr_node_counts.x; + uint right_node_count = lr_node_counts.y; + + // allocate the nodes + uint first_node = BVH2_AllocateNodes( args.bvh2, left_node_count + right_node_count ); + + // point our root node at its children + left_bvh2_root = first_node; + right_bvh2_root = first_node + left_node_count; + + // store combined geom bounds in the root node's AABB.. 
we previously stored centroid bounds there + // but node creation requires geom bounds + struct AABB3f geom_bounds = left_geom_bounds; + AABB3f_extend(&geom_bounds, &right_geom_bounds); + BVH2_WriteInnerNode( args.bvh2, args.bvh2_root, &geom_bounds, (uint2)(left_bvh2_root,right_bvh2_root), left_mask | right_mask ); + +// printf(" node: %u mask: %x\n", args.bvh2_root, left_mask|right_mask ); + + // store the appropriate AABBs in the child nodes + // - BFS passes need centroid bounds + // - DFS passes need geom bounds + // Here we also write leaf connectivity information (prim start+count) + // this will be overwritten later if we are creating an inner node + struct AABB3f left_box, right_box; + left_box = AABB3f_select( left_geom_bounds, left_centroid_bounds, v_is_bfs.xxx ); + right_box = AABB3f_select( right_geom_bounds, right_centroid_bounds, v_is_bfs.yyy ); + + uint left_start = primref_begin; + uint right_start = primref_begin + num_left; + BVH2_WriteLeafNode( args.bvh2, left_bvh2_root, &left_box, left_start, num_left, left_mask ); + BVH2_WriteLeafNode( args.bvh2, right_bvh2_root, &right_box, right_start, num_right, right_mask ); + + // make input and output primref index buffers consistent in the event we're creating a leaf + // There should only ever be one leaf created, otherwise we'd have done a DFS pass sooner + if (any( v_is_leaf.xy )) + { + uint start = v_is_leaf.x ? left_start : right_start; + uint num_refs = v_is_leaf.x ? num_left : num_right; + + for(uint i = 0; i < num_refs; i++) + { + args.primref_index_in[start + i] = args.primref_index_out[start + i]; + } + } + } + + // when BFS2 finishes, we need to dispatch two child tasks. + // DFS dispatches can run free and do not need a context + // BFS dispatches need a context. + // In the case where both of the child nodes are BFS, the current context can immediately run one of the child dispatches + // and the other is spilled for an unallocated context to pick up + + uint num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right); + if (num_dfs_dispatches) + { + uint dfs_pos = atomic_add_local(&SLM_dfs_dispatch_count, num_dfs_dispatches); + if (is_dfs(num_left)) + { + scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin; + scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left; + scheduler->dfs_queue.records[dfs_pos].bvh2_base = left_bvh2_root; + scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; + scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index; + dfs_pos++; + } + if (is_dfs(num_right)) + { + scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left; + scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right; + scheduler->dfs_queue.records[dfs_pos].bvh2_base = right_bvh2_root; + scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; + scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index; + } + } + + uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right); + if (num_bfs_children) + { + uint place = atomic_add_local(&SLM_local_spill_stack_size, num_bfs_children); + if (is_bfs(num_left)) + { + SLM_local_spill_stack[place].primref_begin = primref_begin; + SLM_local_spill_stack[place].primref_end = primref_begin + num_left; + SLM_local_spill_stack[place].bvh2_root = left_bvh2_root; + SLM_local_spill_stack[place].tree_depth = depth + 1; + SLM_local_spill_stack[place].batch_index = batch_index; + place++; + } + if (is_bfs(num_right)) + { + SLM_local_spill_stack[place].primref_begin = primref_begin + num_left; + 
SLM_local_spill_stack[place].primref_end = primref_end; + SLM_local_spill_stack[place].bvh2_root = right_bvh2_root; + SLM_local_spill_stack[place].tree_depth = depth + 1; + SLM_local_spill_stack[place].batch_index = batch_index; + place++; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint local_spill_stack_size = SLM_local_spill_stack_size; + + struct BFS1SpillStackEntry entry; + state = VCONTEXT_STATE_UNALLOCATED; + if (context_id < local_spill_stack_size) + { + // pull BFS work from the local spill stack if there's enough work there + entry = SLM_local_spill_stack[context_id]; + state = VCONTEXT_STATE_EXECUTING; + } + else if ((context_id - local_spill_stack_size) < (global_spill_stack_size)) + { + // if there isn't enough work on the local stack, consume from the global one + uint global_pos = (global_spill_stack_size - 1) - (context_id - local_spill_stack_size); + entry = scheduler->bfs2_spill_stack.entries[global_pos]; + state = VCONTEXT_STATE_EXECUTING; + } + + // contexts which received work set themselves up for the next BFS1 dispatch + uint num_bfs_wgs = 0; + uint num_bfs_dispatches = 0; + if (state == VCONTEXT_STATE_EXECUTING) + { + context->dispatch_primref_begin = entry.primref_begin; + context->dispatch_primref_end = entry.primref_end; + context->bvh2_root = entry.bvh2_root; + context->tree_depth = entry.tree_depth; + context->batch_index = entry.batch_index; + + context->num_left = 0; + context->num_right = 0; + context->lr_mask = 0; + + batch_index = entry.batch_index; + num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE); + num_bfs_dispatches = 1; + } + + + if (local_spill_stack_size > BFS_NUM_VCONTEXTS) + { + // write out additional spills if we produced more work than we can consume + uint excess_spills = local_spill_stack_size - BFS_NUM_VCONTEXTS; + uint write_base = global_spill_stack_size; + uint lid = get_local_id(0); + if (lid < excess_spills) + scheduler->bfs2_spill_stack.entries[write_base + lid] = SLM_local_spill_stack[BFS_NUM_VCONTEXTS + lid]; + + if (lid == 0) + scheduler->bfs2_spill_stack.size = global_spill_stack_size + excess_spills; + } + else if (global_spill_stack_size > 0) + { + // otherwise, if we consumed any spills from the global stack, update the stack size + if (get_local_id(0) == 0) + { + uint global_spills_consumed = min(global_spill_stack_size, BFS_NUM_VCONTEXTS - local_spill_stack_size); + scheduler->bfs2_spill_stack.size = global_spill_stack_size - global_spills_consumed; + } + } + + + // Do various WG reductions.. 
the code below is a hand-written version of the following: + // + // uint bfs_dispatch_queue_pos = work_group_scan_exclusive_add( num_bfs_dispatches ); + // uint reduce_num_bfs_wgs = work_group_reduce_add(num_bfs_wgs); + // uint reduce_num_bfs_dispatches = work_group_reduce_add(num_bfs_dispatches); + uint bfs_dispatch_queue_pos; + uint reduce_num_bfs_dispatches; + uint reduce_num_bfs_wgs; + local uint partial_dispatches[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE]; + local uint partial_wgs[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE]; + { + partial_dispatches[get_sub_group_id()] = sub_group_reduce_add(num_bfs_dispatches); + partial_wgs[get_sub_group_id()] = sub_group_reduce_add(num_bfs_wgs); + + uint sg_prefix = sub_group_scan_exclusive_add(num_bfs_dispatches); + + uint prefix_dispatches = 0; + uint total_dispatches = 0; + uint total_wgs = 0; + ushort lane = get_sub_group_local_id(); + + barrier(CLK_LOCAL_MEM_FENCE); + + for (ushort i = 0; i < SCHEDULER_NUM_SGS; i += SCHEDULER_SG_SIZE) // this loop is intended to be fully unrolled after compilation + { + uint p_dispatch = partial_dispatches[i + lane]; + uint p_wg = partial_wgs[i + lane]; + + prefix_dispatches += (i + lane < get_sub_group_id()) ? p_dispatch : 0; + total_dispatches += p_dispatch; + total_wgs += p_wg; + } + + bfs_dispatch_queue_pos = sg_prefix + sub_group_reduce_add(prefix_dispatches); + reduce_num_bfs_dispatches = sub_group_reduce_add(total_dispatches); + reduce_num_bfs_wgs = sub_group_reduce_add(total_wgs); + } + + // insert records into BFS queue + if (num_bfs_dispatches) + { + scheduler->bfs_queue.wg_count[bfs_dispatch_queue_pos] = num_bfs_wgs; + scheduler->bfs_queue.records[bfs_dispatch_queue_pos].context_id = context_id; + scheduler->bfs_queue.records[bfs_dispatch_queue_pos].batch_index = batch_index; + } + + + // store modified vcontext state if it has changed + if (initial_state != state) + scheduler->vcontext_state[context_id] = state; + + + // store workgroup counters + if (get_local_id(0) == 0) + { + scheduler->bfs_queue.num_dispatches = reduce_num_bfs_dispatches; + scheduler->num_bfs_wgs = reduce_num_bfs_wgs; + scheduler->num_dfs_wgs = SLM_dfs_dispatch_count; + } + + const uint contexts_to_clear = min( (uint)BFS_NUM_VCONTEXTS, (uint)(local_spill_stack_size+global_spill_stack_size) ); + + for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() ) + BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info ); + + for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() ) + LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds ); +} + +#if 0 +uint record_search( struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue ) +{ + uint group = get_group_id(0); + ushort lane = get_sub_group_local_id(); + uint num_dispatches = queue->num_dispatches; + uint base = 0; + for (uint i = 0; i < num_dispatches; i += get_sub_group_size()) + { + uint counts = intel_sub_group_block_read(&queue->wg_count[i]); + + for (uint j = 0; j < get_sub_group_size(); j++) + { + uint n = sub_group_broadcast(counts, j); + if (group < n) + { + *record_out = queue->records[i + j]; + return group; + } + group -= n; + } + } + + return 0; // NOTE: unreachable in practice +} +#endif + + +uint record_search(struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue) +{ + uint group = get_group_id(0); + + uint num_dispatches = queue->num_dispatches; + + uint dispatch_id = 0; + uint local_id = 0; + uint i = 0; + do + { + uint counts = 
intel_sub_group_block_read(&queue->wg_count[i]); + uint prefix = sub_group_scan_exclusive_add(counts); + + uint g = group - prefix; + uint ballot = intel_sub_group_ballot(g < counts); + if (ballot) + { + uint lane = ctz(ballot); + dispatch_id = i + lane; + local_id = intel_sub_group_shuffle(g, lane); + break; + } + + group -= sub_group_broadcast(prefix + counts, get_sub_group_size() - 1); + + i += get_sub_group_size(); + } while (i < num_dispatches); + + + *record_out = queue->records[dispatch_id]; + return local_id; +} + + + + +struct BFSDispatchArgs get_bfs_args(struct BFSDispatchRecord* record, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals, uint local_group_id) +{ + uint context_id = record->context_id; + struct VContext* context = &scheduler->contexts[context_id]; + bool odd_pass = context->tree_depth & 1; + + struct BFSDispatchArgs args; + args.scheduler = scheduler; + args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, odd_pass ); + args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, odd_pass ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); + args.wg_primref_begin = context->dispatch_primref_begin + local_group_id * BFS_WG_SIZE; + args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, context->dispatch_primref_end ); + args.dispatch_primref_begin = context->dispatch_primref_begin; + args.dispatch_primref_end = context->dispatch_primref_end; + args.context_id = context_id; + args.context = &scheduler->contexts[context_id]; + args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE; + args.bvh2_root = context->bvh2_root; + args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); + args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); + args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals ); + return args; +} + +struct BFSDispatchArgs get_bfs_args_queue( global struct BFSDispatchQueue* queue, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals ) +{ + + // TODO_OPT: Load this entire prefix array into SLM instead of searching.. + // Or use sub-group ops + + struct BFSDispatchRecord record; + uint local_group_id = record_search(&record, queue); + + return get_bfs_args(&record, scheduler, globals, local_group_id); +} + + +struct BFSDispatchArgs get_bfs_args_from_record( struct BFSDispatchRecord* record, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals ) +{ + return get_bfs_args(record, scheduler, globals, 0); +} + + +struct BFSDispatchArgs get_bfs_args_batchable( + global struct BFSDispatchQueue* queue, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ) +{ + + // TODO_OPT: Load this entire prefix array into SLM instead of searching.. 
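+    // (record_search below walks the wg_count prefix sums with sub-group scans to map
+    //  this work-group's flat group id onto a dispatch record plus a group-local id.
+    //  Worked example, purely illustrative: with wg_count = {3, 5, 2}, global group 6
+    //  falls past the first 3 groups of dispatch 0 and resolves to dispatch 1 with
+    //  local group id 3.)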
+ // Or use sub-group ops + + struct BFSDispatchRecord record; + uint local_group_id = record_search(&record, queue); + + global struct SAHBuildGlobals* globals = globals_buffer + record.batch_index; + + return get_bfs_args(&record, scheduler, globals, local_group_id); +} + + +struct BFSDispatchArgs get_bfs_args_from_record_batchable( + struct BFSDispatchRecord* record, + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ) +{ + global struct SAHBuildGlobals* globals = globals_buffer + record->batch_index; + + return get_bfs_args(record, scheduler, globals, 0); +} + +struct BFSDispatchArgs get_bfs_args_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals ) +{ + uint context_id = 0; + + uint num_refs = SAHBuildGlobals_GetTotalPrimRefs( globals ); + + struct BFSDispatchArgs args; + args.scheduler = scheduler; + args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, false ); + args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, false ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); + args.wg_primref_begin = get_group_id(0) * BFS_WG_SIZE; + args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, num_refs ); + args.dispatch_primref_begin = 0; + args.dispatch_primref_end = num_refs; + args.context_id = context_id; + args.context = &scheduler->contexts[context_id]; + args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE; + args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); + args.bvh2_root = BVH2_GetRoot( args.bvh2 ); + args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); + args.do_mask_processing = SAHBuildGlobals_NeedMasks(globals); + return args; +} + + +inline void BinMapping_init( struct BinMapping* binMapping, struct AABB3f* centBounds, const uint bins ) +{ + const float4 eps = 1E-34f; + const float4 omega = 1E+34f; + float3 l = AABB3f_load_lower( centBounds ); + float3 u = AABB3f_load_upper( centBounds ); + float4 diag; + diag.xyz = max( eps.xyz, u - l ); + diag.w = 0; + float4 scale = (float4)(0.99f * (float)bins) / diag; + scale = select( (float4)(0.0f), scale, diag > eps ); + scale = select( (float4)(0.0f), scale, diag < omega ); + binMapping->scale = scale; + binMapping->ofs.xyz = l.xyz; + binMapping->ofs.w = 0; +} + + +inline ulong getBestSplit( float3 sah, uint ID, const float4 scale, const ulong defaultSplit ) +{ + ulong splitX = (((ulong)as_uint( sah.x )) << 32) | ((uint)ID << 2) | 0; + ulong splitY = (((ulong)as_uint( sah.y )) << 32) | ((uint)ID << 2) | 1; + ulong splitZ = (((ulong)as_uint( sah.z )) << 32) | ((uint)ID << 2) | 2; + /* ignore zero sized dimensions */ + splitX = select( splitX, defaultSplit, (ulong)(scale.x == 0) ); + splitY = select( splitY, defaultSplit, (ulong)(scale.y == 0) ); + splitZ = select( splitZ, defaultSplit, (ulong)(scale.z == 0) ); + ulong bestSplit = min( min( splitX, splitY ), splitZ ); + bestSplit = sub_group_reduce_min( bestSplit ); + return bestSplit; +} + + + +inline float left_to_right_area16( struct AABB3f* low ) +{ + struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low ); + return halfArea_AABB3f( &low_prefix ); +} + +inline uint left_to_right_counts16( uint low ) +{ + return sub_group_scan_exclusive_add( low ); +} + +inline float right_to_left_area16( struct AABB3f* low ) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 
- subgroupLocalID; + struct AABB3f low_reverse = AABB3f_sub_group_shuffle( low, ID ); + struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse ); + const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID ); + return low_area; +} + +inline uint right_to_left_counts16( uint low ) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + const uint low_reverse = intel_sub_group_shuffle( low, ID ); + const uint low_prefix = sub_group_scan_inclusive_add( low_reverse ); + return intel_sub_group_shuffle( low_prefix, ID ); +} + +inline float2 left_to_right_area32( struct AABB3f* low, struct AABB3f* high ) +{ + struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low ); + struct AABB3f low_reduce = AABB3f_sub_group_reduce( low ); + struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max( high ); + AABB3f_extend( &high_prefix, &low_reduce ); + const float low_area = halfArea_AABB3f( &low_prefix ); + const float high_area = halfArea_AABB3f( &high_prefix ); + return (float2)(low_area, high_area); +} + +inline uint2 left_to_right_counts32( uint low, uint high ) +{ + const uint low_prefix = sub_group_scan_exclusive_add( low ); + const uint low_reduce = sub_group_reduce_add( low ); + const uint high_prefix = sub_group_scan_exclusive_add( high ); + return (uint2)(low_prefix, low_reduce + high_prefix); +} + +inline float2 right_to_left_area32( struct AABB3f* low, struct AABB3f* high ) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + struct AABB3f low_reverse = AABB3f_sub_group_shuffle( high, ID ); + struct AABB3f high_reverse = AABB3f_sub_group_shuffle( low, ID ); + struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse ); + struct AABB3f low_reduce = AABB3f_sub_group_reduce( &low_reverse ); + struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max( &high_reverse ); + AABB3f_extend( &high_prefix, &low_reduce ); + const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &high_prefix ), ID ); + const float high_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID ); + return (float2)(low_area, high_area); +} + +inline uint2 right_to_left_counts32( uint low, uint high ) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + const uint low_reverse = intel_sub_group_shuffle( high, ID ); + const uint high_reverse = intel_sub_group_shuffle( low, ID ); + const uint low_prefix = sub_group_scan_inclusive_add( low_reverse ); + const uint low_reduce = sub_group_reduce_add( low_reverse ); + const uint high_prefix = sub_group_scan_inclusive_add( high_reverse ) + low_reduce; + return (uint2)(intel_sub_group_shuffle( high_prefix, ID ), intel_sub_group_shuffle( low_prefix, ID )); +} + +inline uint fastDivideBy6_uint( uint v ) +{ +#if 1 + const ulong u = (ulong)v >> 1; + return (uint)((u * 0x55555556ul) >> 32); +#else + return v / 6; +#endif +} + +inline uint3 fastDivideBy6_uint3( uint3 v ) +{ + return (uint3)(fastDivideBy6_uint( v.x ), fastDivideBy6_uint( v.y ), fastDivideBy6_uint( v.z )); +} + +#define SAH_LOG_BLOCK_SHIFT 2 + +inline struct BFS_Split BinInfo_reduce( struct BFS_BinInfo* binInfo, const float4 scale ) +{ + const uint subgroupLocalID = 
get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + struct AABB3f boundsX = BinInfo_get_AABB( binInfo, subgroupLocalID, 0 ); + + const float lr_areaX = left_to_right_area16( &boundsX ); + const float rl_areaX = right_to_left_area16( &boundsX ); + + struct AABB3f boundsY = BinInfo_get_AABB( binInfo, subgroupLocalID, 1 ); + + const float lr_areaY = left_to_right_area16( &boundsY ); + const float rl_areaY = right_to_left_area16( &boundsY ); + + struct AABB3f boundsZ = BinInfo_get_AABB( binInfo, subgroupLocalID, 2 ); + + const float lr_areaZ = left_to_right_area16( &boundsZ ); + const float rl_areaZ = right_to_left_area16( &boundsZ ); + + const uint3 counts = BinInfo_get_counts( binInfo, subgroupLocalID ); + + const uint lr_countsX = left_to_right_counts16( counts.x ); + const uint rl_countsX = right_to_left_counts16( counts.x ); + const uint lr_countsY = left_to_right_counts16( counts.y ); + const uint rl_countsY = right_to_left_counts16( counts.y ); + const uint lr_countsZ = left_to_right_counts16( counts.z ); + const uint rl_countsZ = right_to_left_counts16( counts.z ); + + const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ); + const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ); + + const uint3 lr_count = fastDivideBy6_uint3( (uint3)(lr_countsX, lr_countsY, lr_countsZ) + 6 - 1 ); + const uint3 rl_count = fastDivideBy6_uint3( (uint3)(rl_countsX, rl_countsY, rl_countsZ) + 6 - 1 ); + float3 sah = fma( lr_area, convert_float3( lr_count ), rl_area * convert_float3( rl_count ) ); + + /* first bin is invalid */ + sah.x = select( (float)(INFINITY), sah.x, subgroupLocalID != 0 ); + sah.y = select( (float)(INFINITY), sah.y, subgroupLocalID != 0 ); + sah.z = select( (float)(INFINITY), sah.z, subgroupLocalID != 0 ); + + const ulong defaultSplit = (((ulong)as_uint( (float)(INFINITY) )) << 32); + + const ulong bestSplit = getBestSplit( sah, subgroupLocalID, scale, defaultSplit ); + + struct BFS_Split split; + split.sah = as_float( (uint)(bestSplit >> 32) ); + split.dim = (uint)bestSplit & 3; + split.pos = (uint)bestSplit >> 2; + + return split; +} + + +struct BFS_BinInfoReduce3_SLM +{ + uint sah[3*BFS_NUM_BINS]; +}; + + + +inline struct BFS_Split BinInfo_reduce3( local struct BFS_BinInfoReduce3_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale ) +{ + // process each bin/axis combination across sub-groups + for (uint i = get_sub_group_id(); i < 3 * BFS_NUM_BINS; i += get_num_sub_groups()) + { + uint my_bin = i % BFS_NUM_BINS; + uint my_axis = i / BFS_NUM_BINS; + + float3 left_lower = (float3)(INFINITY,INFINITY,INFINITY); + float3 left_upper = -left_lower; + float3 right_lower = (float3)(INFINITY,INFINITY,INFINITY); + float3 right_upper = -right_lower; + + // load the other bins and assign them to the left or to the right + // of this subgroup's bin + uint lane = get_sub_group_local_id(); + struct AABB3f sg_bins = BinInfo_get_AABB(binInfo,lane,my_axis); + + bool is_left = (lane < my_bin); + float3 lower = AABB3f_load_lower(&sg_bins); + float3 upper = AABB3f_load_upper(&sg_bins); + + float3 lower_l = select_min( lower, is_left ); + float3 upper_l = select_max( upper, is_left ); + float3 lower_r = select_min( lower, !is_left ); + float3 upper_r = select_max( upper, !is_left ); + + lower_l = sub_group_reduce_min_float3( lower_l ); + lower_r = sub_group_reduce_min_float3( lower_r ); + upper_l = sub_group_reduce_max_float3( upper_l ); + upper_r = sub_group_reduce_max_float3( upper_r ); + float3 dl = upper_l - lower_l; + float3 dr = upper_r - lower_r; + 
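+        // Half-area of a box with extents d = (dx, dy, dz) is dx*dy + dx*dz + dy*dz,
+        // i.e. the SAH surface-area term up to a constant factor of 2; the two lines
+        // below factor it as dx*(dy + dz) + dy*dz to save one multiply.
+        // Worked example: a 1x2x3 box gives 1*2 + 1*3 + 2*3 = 11, half of its surface area 22.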
float area_l = dl.x* (dl.y + dl.z) + (dl.y * dl.z); + float area_r = dr.x* (dr.y + dr.z) + (dr.y * dr.z); + + // get the counts + uint sg_bin_count = BinInfo_get_count(binInfo, lane, my_axis); + uint count_l = (is_left) ? sg_bin_count : 0; + uint count_r = (is_left) ? 0 : sg_bin_count; + count_l = sub_group_reduce_add(count_l); + count_r = sub_group_reduce_add(count_r); + + // compute sah + count_l = fastDivideBy6_uint(count_l + 6 - 1); + count_r = fastDivideBy6_uint(count_r + 6 - 1); + float lr_partial = area_l * count_l; + float rl_partial = area_r * count_r; + float sah = lr_partial + rl_partial; + + // first bin is invalid + sah = select((float)(INFINITY), sah, my_bin != 0); + + // ignore zero sized dimensions + sah = select( sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) ); + sah = select( sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) ); + sah = select( sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) ); + + // tuck the axis into the bottom bits of sah cost. + // The result is an integer between 0 and +inf (7F800000) + // If we have 3 axes with infinite sah cost, we will select axis 0 + slm->sah[i] = (as_uint(sah)&~0x3) | my_axis; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // reduce split candidates down to one subgroup + // sah is strictly positive, so integer compares can be used + // which results in a faster sub_group_reduce_min() + // + uint best_sah = 0xffffffff; + + uint lid = get_sub_group_local_id(); + if (lid < BFS_NUM_BINS) + { + best_sah = slm->sah[lid]; + lid += BFS_NUM_BINS; + best_sah = min( best_sah, slm->sah[lid] ); + lid += BFS_NUM_BINS; + best_sah = min( best_sah, slm->sah[lid] ); + } + + uint reduced_bestsah = sub_group_reduce_min( best_sah ); + uint best_bin = ctz(intel_sub_group_ballot(best_sah == reduced_bestsah)); + uint best_axis = as_uint(reduced_bestsah) & 0x3; + + struct BFS_Split ret; + ret.sah = as_float(reduced_bestsah); + ret.dim = best_axis; + ret.pos = best_bin; + return ret; +} + + +struct BFS_BinInfoReduce_SLM +{ + struct + { + float sah; + uint bin; + } axisInfo[3]; +}; + + + +inline struct BFS_Split BinInfo_reduce2( local struct BFS_BinInfoReduce_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale, uint num_primrefs) +{ + ushort my_axis = get_sub_group_id(); + ushort my_bin = get_sub_group_local_id(); + + if (my_axis < 3) + { + struct AABB3f aabb = BinInfo_get_AABB(binInfo, my_bin, my_axis); + uint count = BinInfo_get_count(binInfo, my_bin, my_axis); + + float lr_area = left_to_right_area16(&aabb); + float rl_area = right_to_left_area16(&aabb); + + uint lr_count = sub_group_scan_exclusive_add(count); + uint rl_count = num_primrefs - lr_count; + + lr_count = fastDivideBy6_uint(lr_count + 6 - 1); + rl_count = fastDivideBy6_uint(rl_count + 6 - 1); + float lr_partial = lr_area * lr_count; + float rl_partial = rl_area * rl_count; + float sah = lr_partial + rl_partial; + + // first bin is invalid + sah = select((float)(INFINITY), sah, my_bin != 0); + + float best_sah = sub_group_reduce_min( sah ); + uint best_bin = ctz(intel_sub_group_ballot(sah == best_sah)); + + // ignore zero sized dimensions + best_sah = select( best_sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) ); + best_sah = select( best_sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) ); + best_sah = select( best_sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) ); + + if (get_sub_group_local_id() == 0) + { + slm->axisInfo[my_axis].sah = best_sah; + slm->axisInfo[my_axis].bin = best_bin; + } + } + barrier( CLK_LOCAL_MEM_FENCE ); + + float sah = 
(float)(INFINITY); + if( get_sub_group_local_id() < 3 ) + sah = slm->axisInfo[get_sub_group_local_id()].sah; + + float bestsah = min(sub_group_broadcast(sah, 0), min(sub_group_broadcast(sah, 1), sub_group_broadcast(sah, 2))); + uint bestAxis = ctz( intel_sub_group_ballot(bestsah == sah) ); + + struct BFS_Split split; + split.sah = bestsah; + split.dim = bestAxis; + split.pos = slm->axisInfo[bestAxis].bin; + return split; +} + + +inline bool is_left( struct BinMapping* binMapping, struct BFS_Split* split, struct AABB* primref ) +{ + const uint dim = split->dim; + const float lower = primref->lower[dim]; + const float upper = primref->upper[dim]; + const float c = lower + upper; + const uint pos = convert_uint_rtz( (c - binMapping->ofs[dim]) * binMapping->scale[dim] ); + return pos < split->pos; +} + +struct BFS_Pass1_SLM +{ + struct BFS_BinInfo bin_info; +// struct BFS_BinInfoReduce3_SLM reduce3; +}; + + +void DO_BFS_pass1( local struct BFS_Pass1_SLM* slm, + uint thread_primref_id, + bool thread_primref_valid, + struct BFSDispatchArgs args + ) +{ + local struct BFS_BinInfo* local_bin_info = &slm->bin_info; + global struct VContext* context = args.context; + struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root ); // root AABB is initialized to centroid bounds + + struct BinMapping bin_mapping; + BinMapping_init( &bin_mapping, ¢roid_bounds, BFS_NUM_BINS ); + + // fetch this thread's primref + PrimRef ref; + if ( thread_primref_valid ) + ref = args.primref_buffer[thread_primref_id]; + + // init bin info + BinInfo_init( local_bin_info ); + + // fence on local bin-info init + barrier( CLK_LOCAL_MEM_FENCE ); + + // merge this thread's primref into local bin info + BinInfo_add_primref( &bin_mapping, local_bin_info, &ref, thread_primref_valid ); + + // fence on local bin-info update + barrier( CLK_LOCAL_MEM_FENCE ); + + BinInfo_merge(&context->global_bin_info, local_bin_info); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size(BFS_WG_SIZE,1,1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass1_indexed( + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS_Pass1_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals ); + + bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end; + uint thread_primref_id = 0; + if ( thread_primref_valid ) + thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )]; + + DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args ); +} + + +__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass1_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS_Pass1_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals ); + + uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 ); + bool thread_primref_valid = thread_primref_id < args.wg_primref_end; + + DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass1_indexed_batchable( + global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ) +{ + local struct BFS_Pass1_SLM slm; + struct 
BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer ); + + bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end; + uint thread_primref_id = 0; + if (thread_primref_valid) + thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)]; + + DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass1_initial_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) +{ + local struct BFS_Pass1_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer ); + + uint thread_primref_id = args.wg_primref_begin + get_local_id(0); + bool thread_primref_valid = thread_primref_id < args.wg_primref_end; + + DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// BVH2 construction -- BFS Phase Pass2 +/// +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct BFS_Pass2_SLM +{ + struct BFS_BinInfoReduce3_SLM reduce3; + //struct AABB3f left_centroid_bounds; + //struct AABB3f right_centroid_bounds; + //struct AABB3f left_geom_bounds; + //struct AABB3f right_geom_bounds; + LRBounds lr_bounds; + uint left_count; + uint right_count; + uint lr_mask; + uint left_primref_base; + uint right_primref_base; +// uint num_wgs; + +// uint output_indices[BFS_WG_SIZE]; +}; + + + + + + + +void DO_BFS_pass2( + local struct BFS_Pass2_SLM* slm, + uint thread_primref_id, + bool thread_primref_valid, + struct BFSDispatchArgs args +) +{ + global struct VContext* context = args.context; + + struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root ); + + // load the thread's primref + PrimRef ref; + if ( thread_primref_valid ) + ref = args.primref_buffer[thread_primref_id]; + + struct BinMapping bin_mapping; + BinMapping_init( &bin_mapping, ¢roid_bounds, BFS_NUM_BINS ); + + // initialize working SLM space + LRBounds_init(&slm->lr_bounds); + if(get_local_id(0) == 0) + { + slm->left_count = 0; + slm->right_count = 0; + + if( args.do_mask_processing ) + slm->lr_mask = 0; + } + + // compute split - every workgroup does the same computation + // local barrier inside BinInfo_reduce3 + struct BFS_Split split = BinInfo_reduce3( &slm->reduce3, &context->global_bin_info,bin_mapping.scale ); + + uint wg_prim_count = args.wg_primref_end - args.wg_primref_begin; + + // partition primrefs into L/R subsets... + bool go_left = false; + if (split.sah == (float)(INFINITY)) // no valid split, split in the middle.. 
This can happen due to floating-point limit cases in huge scenes + go_left = get_local_id(0) < (wg_prim_count / 2); + else + go_left = is_left( &bin_mapping, &split, &ref ); + + // assign this primref a position in the output array, and expand corresponding centroid-bounds + uint local_index; + { + float3 centroid = ref.lower.xyz + ref.upper.xyz; + + uint l_ballot = intel_sub_group_ballot( go_left && thread_primref_valid ); + uint r_ballot = intel_sub_group_ballot( !go_left && thread_primref_valid ); + if (l_ballot) + { + bool active_lane = l_ballot & (1 << get_sub_group_local_id()); + float3 Cmin, Cmax, Gmin, Gmax; + Cmin = select_min( centroid.xyz, active_lane ); + Cmax = select_max( centroid.xyz, active_lane ); + Gmin = select_min( ref.lower.xyz, active_lane ); + Gmax = select_max( ref.upper.xyz, active_lane ); + + Cmin = sub_group_reduce_min_float3( Cmin ); + Cmax = sub_group_reduce_max_float3( Cmax ); + Gmin = sub_group_reduce_min_float3( Gmin ); + Gmax = sub_group_reduce_max_float3( Gmax ); + + LRBounds_merge_left( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax ); + } + + if (r_ballot) + { + bool active_lane = r_ballot & (1 << get_sub_group_local_id()); + float3 Cmin, Cmax, Gmin, Gmax; + Cmin = select_min(centroid.xyz, active_lane); + Cmax = select_max(centroid.xyz, active_lane); + Gmin = select_min(ref.lower.xyz, active_lane); + Gmax = select_max(ref.upper.xyz, active_lane); + + Cmin = sub_group_reduce_min_float3(Cmin); + Cmax = sub_group_reduce_max_float3(Cmax); + Gmin = sub_group_reduce_min_float3(Gmin); + Gmax = sub_group_reduce_max_float3(Gmax); + + LRBounds_merge_right( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax ); + } + + if( args.do_mask_processing ) + { + uint mask =0; + if (thread_primref_valid) + { + mask = PRIMREF_instanceMask(&ref) ; + mask = go_left ? mask : mask<<8; + } + + // TODO OPT: there is no 'sub_group_reduce_or' and IGC does not do the reduction trick + // for atomics on sub-group uniform addresses + for( uint i= get_sub_group_size()/2; i>0; i/= 2) + mask = mask | intel_sub_group_shuffle_down(mask,mask,i); + if( get_sub_group_local_id() == 0 ) + atomic_or_local( &slm->lr_mask, mask ); + } + + uint l_base = 0; + uint r_base = 0; + if( get_sub_group_local_id() == 0 && l_ballot ) + l_base = atomic_add_local( &slm->left_count, popcount(l_ballot) ); + if( get_sub_group_local_id() == 0 && r_ballot ) + r_base = atomic_add_local( &slm->right_count, popcount(r_ballot) ); + + sub_group_barrier( CLK_LOCAL_MEM_FENCE ); + l_base = sub_group_broadcast(l_base,0); + r_base = sub_group_broadcast(r_base,0); + + l_base = l_base + subgroup_bit_prefix_exclusive( l_ballot ); + r_base = r_base + subgroup_bit_prefix_exclusive( r_ballot ); + + local_index = (go_left) ? 
l_base : r_base; + } + + + barrier( CLK_LOCAL_MEM_FENCE ); + + // merge local into global + // TODO_OPT: Look at spreading some of this across subgroups + if ( get_sub_group_id() == 0 ) + { + // allocate primref space for this wg and merge local/global centroid bounds + uint num_left = slm->left_count; + { + if (num_left && get_sub_group_local_id() == 0) + { + num_left = atomic_add_global( &context->num_left, num_left ); + slm->left_primref_base = args.dispatch_primref_begin + num_left; + } + } + uint num_right = slm->right_count; + { + if (num_right && get_sub_group_local_id() == 0) + { + num_right = atomic_add_global( &context->num_right, num_right ); + slm->right_primref_base = (args.dispatch_primref_end - 1) - num_right; + } + } + + if( args.do_mask_processing && get_sub_group_local_id() == 0 ) + atomic_or_global( &context->lr_mask, slm->lr_mask ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + LRBounds_merge( &context->lr_bounds, &slm->lr_bounds ); + + // move thread's primref ID into correct position in output index buffer + if (thread_primref_valid) + { + uint pos = go_left ? slm->left_primref_base + local_index + : slm->right_primref_base - local_index; + + args.primref_index_out[pos] = thread_primref_id; + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +BFS_pass2_indexed( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS_Pass2_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals ); + + bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end; + uint thread_primref_id = 0; + if ( thread_primref_valid ) + thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )]; + + DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +BFS_pass2_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) +{ + local struct BFS_Pass2_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals ); + + uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 ); + bool thread_primref_valid = thread_primref_id < args.wg_primref_end; + + DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args ); +} + + +__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass2_indexed_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) +{ + local struct BFS_Pass2_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer ); + + bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end; + uint thread_primref_id = 0; + if (thread_primref_valid) + thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)]; + + DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args); + +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +BFS_pass2_initial_batchable(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* 
globals_buffer) +{ + local struct BFS_Pass2_SLM slm; + struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer ); + + uint thread_primref_id = args.wg_primref_begin + get_local_id(0); + bool thread_primref_valid = thread_primref_id < args.wg_primref_end; + + DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args); +} + + + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// BVH2 construction -- DFS Phase +/// +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct DFSArgs +{ + uint primref_base; + uint global_bvh2_base; + bool do_mask_processing; + ushort num_primrefs; + global uint* primref_indices_in; + global uint* primref_indices_out; + global PrimRef* primref_buffer; + global struct BVH2* global_bvh2; +}; + + +struct DFSPrimRefAABB +{ + half lower[3]; + half upper[3]; +}; + +void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb ) +{ + bb->lower[0] = 1; + bb->lower[1] = 1; + bb->lower[2] = 1; + bb->upper[0] = 0; + bb->upper[1] = 0; + bb->upper[2] = 0; +} + +void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v ) +{ + aabb->lower[0] = min( aabb->lower[0], v->lower[0] ); + aabb->lower[1] = min( aabb->lower[1], v->lower[1] ); + aabb->lower[2] = min( aabb->lower[2], v->lower[2] ); + aabb->upper[0] = max( aabb->upper[0], v->upper[0] ); + aabb->upper[1] = max( aabb->upper[1], v->upper[1] ); + aabb->upper[2] = max( aabb->upper[2], v->upper[2] ); +} + +half DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb ) +{ + const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]); + return fma( d.x, (d.y + d.z), d.y * d.z ); +} + +struct DFSPrimRef +{ + struct DFSPrimRefAABB aabb; + ushort2 meta; +}; + +void DFSPrimRef_SetBVH2Root( struct DFSPrimRef* ref, ushort root ) +{ + ref->meta.y = root; +} + +uint DFSPrimRef_GetInputIndex( struct DFSPrimRef* ref ) +{ + return ref->meta.x; +} + +uint DFSPrimRef_GetBVH2Parent( struct DFSPrimRef* ref ) +{ + return ref->meta.y; +} + + +struct PrimRefSet +{ + struct DFSPrimRefAABB AABB[DFS_WG_SIZE]; + ushort2 meta[DFS_WG_SIZE]; + uint input_indices[DFS_WG_SIZE]; +}; + + + + +local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id ) +{ + return &refs->AABB[id]; +} +struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id ) +{ + struct DFSPrimRef r; + r.aabb = refs->AABB[id]; + r.meta = refs->meta[id]; + return r; +} +void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id ) +{ + refs->AABB[id] = ref.aabb; + refs->meta[id] = ref.meta; +} + +void PrimRefSet_SetPrimRef_FullPrecision( struct AABB3f* root_aabb, local struct PrimRefSet* refs, PrimRef ref, ushort id ) +{ + float3 root_l = AABB3f_load_lower( root_aabb ); + float3 root_u = AABB3f_load_upper( root_aabb ); + float3 d = root_u - root_l; + float scale = 1.0f / max( d.x, max( d.y, d.z ) ); + + float3 l = ref.lower.xyz; + float3 u = ref.upper.xyz; + half3 lh = convert_half3_rtz( (l - root_l) * scale ); + half3 uh = convert_half3_rtp( (u - root_l) * scale ); + + refs->AABB[id].lower[0] = lh.x; + refs->AABB[id].lower[1] = lh.y; + refs->AABB[id].lower[2] = lh.z; + refs->AABB[id].upper[0] = uh.x; + refs->AABB[id].upper[1] = uh.y; + refs->AABB[id].upper[2] = uh.z; + refs->meta[id].x = id; + refs->meta[id].y = 0; +} + + + +void DFS_CreatePrimRefSet( struct DFSArgs 
args, + local struct PrimRefSet* prim_refs ) +{ + ushort id = get_local_id( 0 ); + ushort num_primrefs = args.num_primrefs; + + struct AABB3f box = BVH2_GetNodeBox( args.global_bvh2, args.global_bvh2_base ); + if ( id < num_primrefs ) + { + PrimRef ref = args.primref_buffer[args.primref_indices_in[id]]; + prim_refs->input_indices[id] = args.primref_indices_in[id]; + PrimRefSet_SetPrimRef_FullPrecision( &box, prim_refs, ref, id ); + } +} + +struct ThreadRangeInfo +{ + uchar start; + uchar local_num_prims; + uchar bvh2_root; + bool active; +}; + +struct BVHBuildLocals // size: ~3.8K +{ + uchar2 axis_and_left_count[ DFS_WG_SIZE ]; + struct ThreadRangeInfo range[ DFS_WG_SIZE ]; + uint sah[ DFS_WG_SIZE ]; +}; + +#define LOCAL_BVH2_NODE_COUNT (2*(DFS_WG_SIZE) -1) + +struct LocalBVH2 +{ + uint nodes[LOCAL_BVH2_NODE_COUNT]; + uint num_nodes; + + // bit layout is for a node is + // uchar child_ptr; // this is right_child_index >> 1. right child's msb is always 0 + // uchar primref_base; // index of the node's first primref. will be 0 at the root + // uchar parent_dist; // distance in nodes from this node to its parent + // uchar prim_counter; // number of prims in this subtree. For a complete tree (256 prims), the root may be off by 1 + + // for a WG size of 256, 8b is enough for parent distance, because the tree is built in level order + // the maximum distance between parent and child occurs for a complete tree. + // in this scenario the left-most leaf has index 255, its parent has index 127, the deltas to the children are 128 and 129 +}; + + +void LocalBVH2_Initialize( struct LocalBVH2* bvh2, ushort num_prims ) +{ + bvh2->num_nodes = 1; + bvh2->nodes[0] = min(num_prims,(ushort)255); +} + + + +void LocalBVH2_Initialize_Presplit(struct LocalBVH2* bvh2, ushort num_prims, ushort left_count, ushort right_count ) +{ + bvh2->num_nodes = 3; + bvh2->nodes[0] = min(num_prims, (ushort)255); + + ushort bvh2_root = 0; + ushort child_place = 1; + + uint child_ptr = (child_place + 1) >> 1; + bvh2->nodes[bvh2_root] |= (child_ptr) << 24; + + uint parent_dist = child_place - bvh2_root; + + // initialize child nodes + ushort primref_base_left = 0; + ushort primref_base_right = left_count; + uint left = (primref_base_left << 16) + ((parent_dist << 8)) + left_count; + uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8) + right_count; + bvh2->nodes[child_place] = left; + bvh2->nodes[child_place + 1] = right; +} + + +void LocalBVH2_CreateInnerNode( local struct LocalBVH2* bvh2, ushort bvh2_root, uint primref_base_left, uint primref_base_right ) +{ + ushort child_place = atomic_add_local( &(bvh2-> num_nodes), 2 ); + + uint child_ptr = (child_place + 1) >> 1; + bvh2->nodes[bvh2_root] |= (child_ptr) << 24; + + uint parent_dist = child_place - bvh2_root; + + // initialize child nodes + uint left = (primref_base_left << 16) + ((parent_dist << 8)); + uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8); + bvh2->nodes[child_place] = left; + bvh2->nodes[child_place + 1] = right; +} + +ushort2 LocalBVH2_GetChildIndices( struct LocalBVH2* bvh2, ushort bvh2_root ) +{ + ushort right_idx = (bvh2->nodes[bvh2_root] & 0xff000000) >> 23; + return (ushort2)(right_idx - 1, right_idx); +} + + +ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* bvh2, ushort bvh2_root ) +{ + // increment only the lower 8 bits. 
Algorithm will not overflow by design + return atomic_inc_local( &bvh2->nodes[bvh2_root] ) & 0xff; +} + +ushort LocalBVH2_SetLeafPrimCount(local struct LocalBVH2* bvh2, ushort bvh2_root, ushort count) +{ + return bvh2->nodes[bvh2_root] |= (count& 0xff); +} + +bool LocalBVH2_IsRoot( struct LocalBVH2* bvh2, ushort node_id ) +{ + return node_id == 0; +} + +ushort LocalBVH2_GetLeafPrimrefStart( struct LocalBVH2* bvh2, ushort bvh2_node_id ) +{ + return (bvh2->nodes[bvh2_node_id] >> 16) & 255; +} + +bool LocalBVH2_IsLeftChild( struct LocalBVH2* bvh2, ushort parent_node, ushort current_node ) +{ + return (current_node & 1); // nodes are allocated in pairs. first node is root, left child is an odd index +} + +ushort LocalBVH2_GetParent( struct LocalBVH2* bvh2, ushort node ) +{ + return node - ((bvh2->nodes[node] >> 8) & 255); +} + +uint LocalBVH2_GetNodeCount( struct LocalBVH2* bvh2 ) +{ + return bvh2->num_nodes; +} + +bool LocalBVH2_IsLeaf( struct LocalBVH2* bvh2, ushort node_index ) +{ + return (bvh2->nodes[node_index] & 255) <= TREE_ARITY; +} + +ushort LocalBVH2_GetLeafPrimCount( struct LocalBVH2* bvh2, ushort node_index ) +{ + return (bvh2->nodes[node_index] & 255); +} + +void DFS_ConstructBVH2( local struct LocalBVH2* bvh2, + local struct PrimRefSet* prim_refs, + ushort bvh2_root, + ushort prim_range_start, + ushort local_num_prims, + ushort global_num_prims, + local struct BVHBuildLocals* locals, + local uint* num_active_threads ) +{ + ushort tid = get_local_id( 0 ); + ushort primref_position = tid; + + bool active_thread = tid < global_num_prims; + + // Handle cases where initial binner creates leaves + if ( active_thread && local_num_prims <= TREE_ARITY ) + { + struct DFSPrimRef ref = PrimRefSet_GetPrimRef(prim_refs, primref_position); + DFSPrimRef_SetBVH2Root(&ref, bvh2_root); + PrimRefSet_SetPrimRef(prim_refs, ref, primref_position); + active_thread = false; + if (primref_position == prim_range_start) + atomic_sub_local(num_active_threads, local_num_prims); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + locals->range[ tid ].start = prim_range_start; + locals->range[ tid ].local_num_prims = local_num_prims; + locals->range[ tid ].bvh2_root = bvh2_root; + locals->range[ tid ].active = active_thread; + + do + { + if(active_thread && prim_range_start == primref_position) + locals->sah[primref_position] = UINT_MAX; + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( active_thread ) + { + local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); + + // each thread evaluates a possible split candidate. 
Scan primrefs and compute sah cost + // do this axis-by-axis to keep register pressure low + float best_sah = INFINITY; + ushort best_axis = 3; + ushort best_count = 0; + + struct DFSPrimRefAABB box_left[3]; + struct DFSPrimRefAABB box_right[3]; + float CSplit[3]; + ushort count_left[3]; + + for ( ushort axis = 0; axis < 3; axis++ ) + { + DFSPrimRefAABB_init( &box_left[axis] ); + DFSPrimRefAABB_init( &box_right[axis] ); + + CSplit[axis] = my_box->lower[axis] + my_box->upper[axis]; + count_left[axis] = 0; + } + + // scan primrefs in our subtree and partition using this thread's prim as a split plane + { + struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start ); + + for ( ushort p = 1; p < local_num_prims; p++ ) + { + struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration + + for( ushort axis = 0; axis < 3; axis++ ) + { + float c = box.lower[axis] + box.upper[axis]; + + if ( c < CSplit[axis] ) + { + // this primitive is to our left. + DFSPrimRefAABB_extend( &box_left[axis], &box ); + count_left[axis]++; + } + else + { + // this primitive is to our right + DFSPrimRefAABB_extend( &box_right[axis], &box ); + } + } + + box = next_box; + } + + // last iteration without preloading box + for( ushort axis = 0; axis < 3; axis++ ) + { + float c = box.lower[axis] + box.upper[axis]; + + if ( c < CSplit[axis] ) + { + // this primitive is to our left. + DFSPrimRefAABB_extend( &box_left[axis], &box ); + count_left[axis]++; + } + else + { + // this primitive is to our right + DFSPrimRefAABB_extend( &box_right[axis], &box ); + } + } + + } + + for ( ushort axis = 0; axis < 3; axis++ ) + { + float Al = DFSPrimRefAABB_halfArea( &box_left[axis] ); + float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] ); + + // Avoid NANs in SAH calculation in the corner case where all prims go right + // In this case we set Al=Ar, because such a split will only be selected if all primrefs + // are co-incident.. 
In that case, we will fall back to split-in-the-middle and both subtrees + // should store the same quantized area value + if ( count_left[axis] == 0 ) + Al = Ar; + + // compute sah cost + ushort count_right = local_num_prims - count_left[axis]; + float sah = Ar * count_right + Al * count_left[axis]; + + // keep this split if it is better than the previous one, or if the previous one was a corner-case + if ( sah < best_sah || best_count == 0 ) + { + // yes, keep it + best_axis = axis; + best_sah = sah; + best_count = count_left[axis]; + } + } + + // write split information to SLM + locals->axis_and_left_count[primref_position].x = best_axis; + locals->axis_and_left_count[primref_position].y = best_count; + uint sah = as_uint(best_sah); + // break ties by axis to ensure deterministic split selection + // otherwise builder can produce non-deterministic tree structure run to run + // based on the ordering of primitives (which can vary due to non-determinism in atomic counters) + // Embed split axis and index into sah value; compute min over sah and max over axis + sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | tid ); + + // reduce on split candidates in our local subtree and decide the best one + atomic_min_local( &locals->sah[ prim_range_start ], sah); + } + + + barrier( CLK_LOCAL_MEM_FENCE ); + + ushort split_index = locals->sah[ prim_range_start ] & 255; + ushort split_axis = locals->axis_and_left_count[split_index].x; + ushort split_left_count = locals->axis_and_left_count[split_index].y; + + if ( (primref_position == split_index) && active_thread ) + { + // first thread in a given subtree creates the inner node + ushort start_left = prim_range_start; + ushort start_right = prim_range_start + split_left_count; + if ( split_left_count == 0 ) + start_right = start_left + (local_num_prims / 2); // handle split-in-the-middle case + + LocalBVH2_CreateInnerNode( bvh2, bvh2_root, start_left, start_right ); + } + + + barrier( CLK_LOCAL_MEM_FENCE ); + + struct DFSPrimRef ref; + ushort new_primref_position; + + if ( active_thread ) + { + ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); + bool go_left; + + if ( split_left_count == 0 ) + { + // We chose a split with no left-side prims + // This will only happen if all primrefs are located in the exact same position + // In that case, fall back to split-in-the-middle + split_left_count = (local_num_prims / 2); + go_left = (primref_position - prim_range_start < split_left_count); + } + else + { + // determine what side of the split this thread's primref belongs on + local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); + local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index ); + float c = my_box->lower[split_axis] + my_box->upper[split_axis]; + float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis]; + go_left = c < Csplit; + } + + // adjust state variables for next loop iteration + bvh2_root = (go_left) ? kids.x : kids.y; + local_num_prims = (go_left) ? split_left_count : (local_num_prims - split_left_count); + prim_range_start = (go_left) ? 
prim_range_start : prim_range_start + split_left_count; + + // determine the new primref position by incrementing a counter in the destination subtree + new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root ); + + // load our primref from its previous position + ref = PrimRefSet_GetPrimRef( prim_refs, primref_position ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( active_thread ) + { + // write our primref into its sorted position and note which node it went in + DFSPrimRef_SetBVH2Root( &ref, bvh2_root ); + PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position ); + primref_position = new_primref_position; + + + // deactivate all threads whose subtrees are small enough to form a leaf + if ( local_num_prims <= TREE_ARITY ) + { + active_thread = false; + if( primref_position == prim_range_start ) + atomic_sub_local( num_active_threads, local_num_prims ); + } + + locals->range[ primref_position ].start = prim_range_start; + locals->range[ primref_position ].local_num_prims = local_num_prims; + locals->range[ primref_position ].bvh2_root = bvh2_root; + locals->range[ primref_position ].active = active_thread; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // if we'll have next iteration then load from SLM + if(*num_active_threads) + { + prim_range_start = locals->range[ tid ].start; + local_num_prims = locals->range[ tid ].local_num_prims; + bvh2_root = locals->range[ tid ].bvh2_root; + active_thread = locals->range[ tid ].active; + primref_position = tid; + } + else + { + break; + } + + } while ( true ); + +} + + +#define REFIT_BIT_DWORDS (LOCAL_BVH2_NODE_COUNT - DFS_WG_SIZE)/32 + +struct RefitBits +{ + uint bits[REFIT_BIT_DWORDS]; +}; + +struct DFS_SLM +{ + union + { + struct LocalBVH2 bvh2; + struct { + struct AABB3f centroid_bounds; + uint left_count; + uint right_count; + struct BFS_BinInfo bins; + struct BFS_BinInfoReduce3_SLM reduce3; + } binning; + + } u1; + + union + { + struct { + struct PrimRefSet prim_refs; + struct BVHBuildLocals locals; + } pass0; + + struct AABB3f node_boxes[LOCAL_BVH2_NODE_COUNT]; + + } u2; + + union + { + uchar bytes[DFS_WG_SIZE]; + uint dwords[DFS_WG_SIZE/4]; + } mask_info; + + struct RefitBits refit_bits; + +}; + + +void DFS_InitialBinningPass( + local struct BFS_BinInfo* bins, + local struct BFS_BinInfoReduce3_SLM* reduce3, + uniform local struct AABB3f* centroid_bounds, + local struct PrimRefSet* refs, + local uint* left_counter, + local uint* right_counter, + ushort num_refs ) +{ + uint tid = get_local_id(0); + + // initialize SLM structures + if (tid == 0) + { + AABB3f_init(centroid_bounds); + *left_counter = 0; + *right_counter = 0; + } + + BinInfo_init(bins); + + PrimRef ref; + struct DFSPrimRef dfs_ref; + + if (tid < num_refs) + { + dfs_ref = PrimRefSet_GetPrimRef(refs, tid); + struct DFSPrimRefAABB box = dfs_ref.aabb; + ref.lower.xyz = (float3)(box.lower[0], box.lower[1], box.lower[2]); + ref.upper.xyz = (float3)(box.upper[0], box.upper[1], box.upper[2]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // compute centroid bounds so that we can bin + if (tid < num_refs) + { + float3 centroid = ref.lower.xyz + ref.upper.xyz; + Uniform_AABB3f_atomic_merge_local_sub_group_lu(centroid_bounds, centroid, centroid); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // add primrefs to bins + struct BinMapping mapping; + BinMapping_init(&mapping, centroid_bounds, BFS_NUM_BINS); + + BinInfo_add_primref( &mapping, bins, &ref, tidu1.bvh2; + + global struct BVH2* global_bvh2 = args.global_bvh2; + + PrimRef ref; + uint parent_node; + + { 
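+        // Pass 0: build the local BVH2 topology in SLM.
+        //  - DFS_CreatePrimRefSet quantizes this WG's primrefs into half-precision boxes,
+        //  - an initial binned split seeds the tree when the primref count exceeds 32,
+        //  - DFS_ConstructBVH2 then partitions the primrefs top-down into LocalBVH2 leaves.
+        // On exit each thread keeps its full-precision PrimRef and the leaf it landed in
+        // (parent_node) for the bottom-up refit that follows this scope.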
+ local struct BVHBuildLocals* locals = &slm->u2.pass0.locals; + local struct PrimRefSet* prim_refs = &slm->u2.pass0.prim_refs; + + DFS_CreatePrimRefSet(args, prim_refs); + + uint local_id = get_local_id(0); + + ushort bvh2_root = 0; + ushort prim_range_start = 0; + ushort local_num_prims = args.num_primrefs; + + if(local_id == 0) + *num_active_threads = local_num_prims; + + // barrier for DFS_CreatePrimRefSet and num_active_threads + barrier(CLK_LOCAL_MEM_FENCE); + + // initial binning pass if number of primrefs is large + if( args.num_primrefs > 32 ) + { + DFS_InitialBinningPass(&slm->u1.binning.bins, &slm->u1.binning.reduce3, &slm->u1.binning.centroid_bounds, prim_refs, + &slm->u1.binning.left_count, &slm->u1.binning.right_count, args.num_primrefs); + + barrier(CLK_LOCAL_MEM_FENCE); + + ushort left_count = slm->u1.binning.left_count; + ushort right_count = args.num_primrefs - left_count; + if (get_local_id(0) == 0) + LocalBVH2_Initialize_Presplit(bvh2, args.num_primrefs, left_count, right_count); + + bvh2_root = (local_id < left_count) ? 1 : 2; + local_num_prims = (local_id < left_count) ? left_count : right_count; + prim_range_start = (local_id < left_count) ? 0 : left_count; + } + else + { + if (get_local_id(0) == 0) + LocalBVH2_Initialize(bvh2, args.num_primrefs); + } + + DFS_ConstructBVH2( bvh2, prim_refs, bvh2_root, prim_range_start, local_num_prims, args.num_primrefs, locals, num_active_threads); + + // move the prim refs into their sorted position + // keep this thread's primref around for later use + if ( local_id < args.num_primrefs ) + { + struct DFSPrimRef dfs_ref = PrimRefSet_GetPrimRef( prim_refs, local_id ); + + uint input_id = DFSPrimRef_GetInputIndex( &dfs_ref ); + + parent_node = DFSPrimRef_GetBVH2Parent( &dfs_ref ); + + uint primref_index = prim_refs->input_indices[input_id]; + ref = args.primref_buffer[primref_index]; + args.primref_indices_out[local_id] = primref_index; + args.primref_indices_in[local_id] = primref_index; + // these buffers are not read again until the end of kernel + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + } + + + // initialize flags for determining when subtrees are done refit + if ( get_local_id( 0 ) < REFIT_BIT_DWORDS ) + slm->refit_bits.bits[get_local_id( 0 )] = 0; + + + // stash full-precision primref AABBs in slm storage + local struct AABB3f* slm_boxes = &slm->u2.node_boxes[0]; + bool active_thread = get_local_id( 0 ) < args.num_primrefs; + if( active_thread ) + { + AABB3f_set( &slm_boxes[get_local_id( 0 )], ref.lower.xyz, ref.upper.xyz ); + + // stash instance masks in SLM storage + if( args.do_mask_processing ) + slm->mask_info.bytes[get_local_id(0)] = PRIMREF_instanceMask( &ref ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // Refit leaf nodes + uint box_index; + if ( active_thread ) + { + // the thread for the first primref in every leaf is the one that will ascend + // remaining threads merge their AABB/mask into the first one and terminate + uint first_ref = LocalBVH2_GetLeafPrimrefStart( bvh2, parent_node ); + if ( first_ref != get_local_id( 0 ) ) + { + AABB3f_atomic_merge_local_lu( &slm_boxes[first_ref], ref.lower.xyz, ref.upper.xyz ); + + if( args.do_mask_processing ) + { + uint dword_index = first_ref/4; + uint shift = (first_ref%4)*8; + uint mask = PRIMREF_instanceMask(&ref) << shift; + atomic_or_local( &slm->mask_info.dwords[dword_index], mask ); + } + active_thread = false; // switch off all primref threads except the first one + } + + box_index = first_ref; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( active_thread ) + { + 
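+        // One surviving thread per leaf (the owner of the leaf's first primref) writes the
+        // leaf node out to the global BVH2 and then starts the upward walk.
+        // Each parent has a flag bit in refit_bits that its children toggle with an atomic XOR:
+        // the first child to arrive sees the bit clear and retires, the second sees it set and
+        // proceeds to merge both child boxes at the parent, so every inner node is refit once.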
uint current_node = parent_node; + parent_node = LocalBVH2_GetParent( bvh2, current_node ); + + // write out the leaf node's AABB + uint num_prims = LocalBVH2_GetLeafPrimCount( bvh2, current_node ); + uint prim_offs = args.primref_base + LocalBVH2_GetLeafPrimrefStart( bvh2, current_node ); + + uint mask = 0xff; + if( args.do_mask_processing ) + mask = slm->mask_info.bytes[box_index]; + + BVH2_WriteLeafNode( global_bvh2, args.global_bvh2_base + current_node, &slm_boxes[box_index], prim_offs, num_prims, mask ); + + // we no longer need the BVH2 bits for this node, so re-purpose the memory to store the AABB index + bvh2->nodes[current_node] = box_index; + + // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed + uint thread_mask = (1 << (parent_node % 32)); + if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], thread_mask ) & thread_mask) == 0 ) + active_thread = false; + } + + // count how many active threads in sub_group we have and increment wg's number of active threads + uint sg_active = sub_group_reduce_add(active_thread ? 1 : 0); + if(get_sub_group_local_id() == 0) + { + atomic_add_local(num_active_threads, sg_active); + } + + // refit internal nodes: + // walk up the tree and refit AABBs + + do + { + barrier( CLK_LOCAL_MEM_FENCE ); // we need this barrier because we need to make sure all threads read num_active_threads before modifying it + if ( active_thread ) + { + uint current_node = parent_node; + parent_node = LocalBVH2_GetParent( bvh2, current_node ); + + // pull left/right box indices from current node + ushort2 kids = LocalBVH2_GetChildIndices( bvh2, current_node ); + + uint left_box = bvh2->nodes[kids.x]; + uint right_box = bvh2->nodes[kids.y]; + + struct AABB3f left = slm_boxes[left_box]; + struct AABB3f right = slm_boxes[right_box]; + AABB3f_extend( &left, &right ); + + uint2 child_offsets = (uint2)( + args.global_bvh2_base + kids.x, + args.global_bvh2_base + kids.y); + + uint mask = 0xff; + if( args.do_mask_processing ) + { + mask = slm->mask_info.bytes[left_box] + | slm->mask_info.bytes[right_box]; + slm->mask_info.bytes[left_box] = mask; + } + + BVH2_WriteInnerNode( args.global_bvh2, args.global_bvh2_base+current_node, &left, child_offsets, mask ); + + slm_boxes[left_box] = left; + bvh2->nodes[current_node] = left_box; + + // stop at the root + if ( LocalBVH2_IsRoot( bvh2, current_node ) ) + { + active_thread = false; + atomic_dec_local(num_active_threads); + } + else + { + // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed + uint mask = (1 << (parent_node % 32)); + if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], mask ) & mask) == 0 ) + { + active_thread = false; + atomic_dec_local(num_active_threads); + } + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + } while ( *num_active_threads > 0 ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size(DFS_WG_SIZE,1,1) )) +__attribute__( (intel_reqd_sub_group_size(16)) ) +kernel void +DFS( global struct VContextScheduler* scheduler, + global struct SAHBuildGlobals* globals_buffer ) +{ + local struct DFS_SLM slm; + local struct DFSDispatchRecord record; + local uint num_active_threads; + + if ( get_local_id( 0 ) == 0 ) + { + // pop an entry off the DFS dispatch queue + //uint wg_index = atomic_dec_global( &scheduler->num_dfs_wgs ) - 1; + //record = scheduler->dfs_queue.records[wg_index]; + + // TODO: The version above races, but is considerably faster... 
investigate + uint wg_index = get_group_id(0); + record = scheduler->dfs_queue.records[wg_index]; + write_mem_fence( CLK_LOCAL_MEM_FENCE ); + atomic_dec_global( &scheduler->num_dfs_wgs ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + + bool odd_pass = record.tree_depth & 1; + + global struct SAHBuildGlobals* sah_globals = globals_buffer + record.batch_index; + + struct DFSArgs args; + args.num_primrefs = record.num_primrefs; + args.primref_indices_in = SAHBuildGlobals_GetPrimrefIndices_In( sah_globals, odd_pass ); + args.primref_indices_out = SAHBuildGlobals_GetPrimrefIndices_Out( sah_globals, odd_pass ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( sah_globals ); + args.global_bvh2 = SAHBuildGlobals_GetBVH2( sah_globals ); + args.primref_indices_in += record.primref_base; + args.primref_indices_out += record.primref_base; + args.primref_base = record.primref_base; + args.global_bvh2_base = record.bvh2_base; + args.do_mask_processing = SAHBuildGlobals_NeedMasks( sah_globals ); + + Do_DFS( args, &slm, &num_active_threads ); + +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// BVH2 to BVH6 +/// +///////////////////////////////////////////////////////////////////////////////////////////////// + + + +struct BuildFlatTreeArgs +{ + ushort leaf_size_in_bytes; + ushort leaf_type; + ushort inner_node_type; + bool do_mask_processing; + + global uint* primref_indices; + global PrimRef* primref_buffer; + global struct Globals* globals; + global struct BVHBase* bvh_base; + global struct BVH2* bvh2; +}; + + +// lane i in the return value is the index of the ith largest primref in the input +// the return value can be used with shuffle() to move data into its sorted position +// the elements of 'key' must be unique.. only the first 6 elements are sorted +varying ushort SUBGROUP_get_sort_indices_N6( varying uint key ) +{ + // each lane computes the number of items larger than it + // this is its position in the descending order + // TODO_OPT: Compiler can vectorize these uint16 adds by packing into lower and upper halves of same GPR.... make sure it does it + // if compiler is not generating optimal code, consider moving to Cm + + varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0; + varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0; + varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0; + varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0; + varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0; + varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0; + varying ushort a = cmp0 + cmp2 + cmp4; + varying ushort b = cmp1 + cmp3 + cmp5; + varying ushort num_larger = a + b; + + // each lane determines which of the input elements it should pull + varying ushort lane = get_sub_group_local_id(); + a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0; + b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0; + a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0; + b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0; + a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0; + b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0; + return a + b; +} + +uint SUBGROUP_area_to_sort_key( varying float area, uniform ushort num_children ) +{ + varying ushort lane = get_sub_group_local_id(); + area = (lane < num_children) ? 
area : 0; // put inactive nodes last + + // drop LSBs and break ties by lane number to ensure unique keys + // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal. + // If we do not do this it can lead to non-deterministic tree structure + return (as_uint(area) & 0xffffff80) + (lane^(get_sub_group_size()-1)); +} + +// lane i in the return value is the index of the ith largest primref in the input +// the return value can be used with shuffle() to move data into its sorted position +// the elements of 'key' must be unique.. only the first 6 elements are sorted +varying ushort SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16( varying uint key ) +{ + // each lane computes the number of items larger than it + // this is its position in the descending order + // TODO_OPT: Compiler can vectorize these uint16 adds by packing into lower and upper halves of same GPR.... make sure it does it + // if compiler is not generating optimal code, consider moving to Cm + + varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0; + varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0; + varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0; + varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0; + varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0; + varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0; + varying ushort a = cmp0 + cmp2 + cmp4; + varying ushort b = cmp1 + cmp3 + cmp5; + varying ushort num_larger = a + b; + + varying ushort cmp0_1 = (sub_group_broadcast(key, 8) > key) ? 1 : 0; + varying ushort cmp1_1 = (sub_group_broadcast(key, 9) > key) ? 1 : 0; + varying ushort cmp2_1 = (sub_group_broadcast(key, 10) > key) ? 1 : 0; + varying ushort cmp3_1 = (sub_group_broadcast(key, 11) > key) ? 1 : 0; + varying ushort cmp4_1 = (sub_group_broadcast(key, 12) > key) ? 1 : 0; + varying ushort cmp5_1 = (sub_group_broadcast(key, 13) > key) ? 1 : 0; + varying ushort a_1 = cmp0_1 + cmp2_1 + cmp4_1; + varying ushort b_1 = cmp1_1 + cmp3_1 + cmp5_1; + varying ushort num_larger_1 = a_1 + b_1; + + // each lane determines which of the input elements it should pull + varying ushort lane = get_sub_group_local_id(); + if(lane < 8) + { + a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0; + b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0; + a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0; + b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0; + a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0; + b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0; + } + else + { + a = (sub_group_broadcast(num_larger_1, 8) == lane-8) ? 8 : 8; + b = (sub_group_broadcast(num_larger_1, 9) == lane-8) ? 1 : 0; + a += (sub_group_broadcast(num_larger_1, 10) == lane-8) ? 2 : 0; + b += (sub_group_broadcast(num_larger_1, 11) == lane-8) ? 3 : 0; + a += (sub_group_broadcast(num_larger_1, 12) == lane-8) ? 4 : 0; + b += (sub_group_broadcast(num_larger_1, 13) == lane-8) ? 5 : 0; + } + + return a + b; +} + +uint SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16( varying float area, uniform ushort num_children ) +{ + varying ushort lane = get_sub_group_local_id() % 8; + area = (lane < num_children) ? area : 0; // put inactive nodes last + + // drop LSBs and break ties by lane number to ensure unique keys + // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal. 
+ // If we do not do this it can lead to non-deterministic tree structure + return (as_uint(area) & 0xffffff80) + (lane^7); +} + +ushort SUBGROUP_BuildFlatTreeNode( + uniform struct BuildFlatTreeArgs args, + uniform uint bvh2_root, + uniform struct InternalNode* qnode, + uniform uint qnode_index, + varying uint3* sg_children_out // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z) + // if a leaf is created, receives number of primrefs (z) +) // return value is the number of child nodes or 0 for a leaf +{ + global struct BVH2* bvh2 = args.bvh2; + varying ushort lane = get_sub_group_local_id(); + + global struct BVHBase* base = args.bvh_base; + + + if ( !BVH2_IsInnerNode( bvh2, bvh2_root ) ) + { + uniform ushort num_prims = BVH2_GetLeafPrimCount( bvh2, bvh2_root ); + uniform uint primref_start = BVH2_GetLeafPrimStart( bvh2, bvh2_root ); + varying uint primref_index = primref_start + ((lane < num_prims) ? lane : 0); + + varying uint ref_id = args.primref_indices[primref_index]; + varying PrimRef ref = args.primref_buffer[ref_id]; + uniform char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); + uniform char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes; + + uniform int offset = (int)(leaf_mem - (char*)qnode); + offset = offset >> 6; + + varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&ref), num_prims ); + varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key); + ref = PrimRef_sub_group_shuffle(&ref, sort_index); + ref_id = intel_sub_group_shuffle(ref_id, sort_index); + + if (lane < num_prims) + args.primref_indices[primref_index] = ref_id; + + uint global_num_prims = args.globals->numPrimitives; + char* bvh_mem = (char*) args.bvh_base; + + if(lane < num_prims) + args.primref_indices[primref_index + global_num_prims] = qnode - (struct InternalNode*)bvh_mem; + + if (args.leaf_type == NODE_TYPE_INSTANCE) + subgroup_setInstanceQBVHNodeN( offset, &ref, num_prims, (struct QBVHNodeN*)qnode, lane < num_prims ? PRIMREF_instanceMask(&ref) : 0 ); + else + subgroup_setQBVHNodeN( offset, args.leaf_type, &ref, num_prims, (struct QBVHNodeN*)qnode, BVH_NODE_DEFAULT_MASK ); + + sg_children_out->z = num_prims; + return 0; + } + else + { + // collapse BVH2 into BVH6. + // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough + uniform ushort num_children = 2; + + uniform uint2 kids = BVH2_GetChildIndices( bvh2, bvh2_root ); + varying uint sg_bvh2_node = kids.x; + if ( lane == 1 ) + sg_bvh2_node = kids.y; + + do + { + // choose the inner node with maximum area to replace. + // Its left child goes in its old location. Its right child goes in a new lane + + // TODO_OPT: We re-read the AABBs again and again to compute area + // ... store per-lane boxes instead and pre-compute areas + + varying float sg_area = BVH2_GetNodeArea( bvh2, sg_bvh2_node ); + varying bool sg_is_inner = BVH2_IsInnerNode( bvh2, sg_bvh2_node ); + sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf + + uniform float max_area = sub_group_reduce_max_N6( sg_area ); + varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner; + uniform uint mask = intel_sub_group_ballot( sg_reducable ); + + // TODO_OPT: Some of these ops seem redundant.. 
look at trimming further + + if ( mask == 0 ) + break; + + // choose the inner node with maximum area to replace + uniform ushort victim_child = ctz( mask ); + uniform uint victim_node = sub_group_broadcast( sg_bvh2_node, victim_child ); + kids = BVH2_GetChildIndices( bvh2, victim_node ); + + if ( lane == victim_child ) + sg_bvh2_node = kids.x; + else if ( lane == num_children ) + sg_bvh2_node = kids.y; + + num_children++; + + } while ( num_children < TREE_ARITY ); + + // allocate inner node space + uniform uint kids_offset; + if (get_sub_group_local_id() == 0) + kids_offset = allocate_inner_nodes( args.bvh_base, num_children ); + kids_offset = sub_group_broadcast(kids_offset, 0); + + uniform struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset); + uniform int offset = (int)((char*)kid - (char*)qnode) >> 6; + +#if 0 + uniform uint kids_offset; + if ( get_sub_group_local_id() == 0 ) + kids_offset = alloc_node_mem( args.globals, sizeof( struct QBVHNodeN ) * num_children ); + kids_offset = sub_group_broadcast( kids_offset, 0 ); + + + // create inner node + uniform struct QBVHNodeN* kid = (struct QBVHNodeN*) ((char*)(args.bvh_base) + kids_offset); + uniform int offset = (int)((char*)kid - (char*)qnode) >> 6; +#endif + uniform uint child_type = args.inner_node_type; + + // sort child nodes in descending order by AABB area + varying struct AABB box = BVH2_GetAABB( bvh2, sg_bvh2_node ); + varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&box), num_children ); + varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key); + box = AABB_sub_group_shuffle(&box, sort_index); + sg_bvh2_node = intel_sub_group_shuffle(sg_bvh2_node, sort_index); + + uniform uint node_mask = (args.do_mask_processing) ? BVH2_GetMask( bvh2, bvh2_root ) : 0xff; + + subgroup_setQBVHNodeN( offset, child_type, &box, num_children, (struct QBVHNodeN*)qnode, node_mask ); + + // return child information + *sg_children_out = (uint3)(sg_bvh2_node, qnode_index + offset + get_sub_group_local_id(), num_children ); + return num_children; + } +} + +ushort SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16( + uniform struct BuildFlatTreeArgs args, + varying uint bvh2_root, + varying struct InternalNode* qnode_base, + varying uint qnode_index, + varying uint3* sg_children_out, // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z) + // if a leaf is created, receives number of primrefs (z) + bool active_lane +) // return value is the number of child nodes or 0 for a leaf +{ + global struct BVH2* bvh2 = args.bvh2; + varying ushort SIMD16_lane = get_sub_group_local_id(); + varying ushort SIMD8_lane = get_sub_group_local_id() % 8; + varying ushort SIMD8_id = get_sub_group_local_id() / 8; + varying ushort lane = get_sub_group_local_id(); + global struct BVHBase* base = args.bvh_base; + + struct BVH2NodeMetaData nodeMetaData = BVH2_GetNodeMetaData( bvh2, bvh2_root ); + + bool is_leaf = active_lane && !BVH2NodeMetaData_IsInnerNode( &nodeMetaData ); + bool is_inner = active_lane && BVH2NodeMetaData_IsInnerNode( &nodeMetaData ); + + uchar mask = BVH_NODE_DEFAULT_MASK; + if(is_inner) + mask = (args.do_mask_processing) ? 
BVH2NodeMetaData_GetMask( &nodeMetaData ) : 0xff; + + int offset; + + varying struct InternalNode* qnode = qnode_base + qnode_index; + // TOOD: we don't need unions, I left them only for readability + union { + uint num_prims; + uint num_children; + } lane_num_data; + + union { + PrimRef ref; // this is in fact AABB + struct AABB box; + } lane_box_data; + + union { + uint ref_id; + uint sg_bvh2_node; + } lane_id_data; + + // for leafs + varying uint primref_index; + + if(is_leaf) + { + lane_num_data.num_prims = BVH2NodeMetaData_GetLeafPrimCount( &nodeMetaData ); + uint primref_start = BVH2NodeMetaData_GetLeafPrimStart( &nodeMetaData ); + primref_index = primref_start + ((SIMD8_lane < lane_num_data.num_prims) ? SIMD8_lane : 0); + + lane_id_data.ref_id = args.primref_indices[primref_index]; + lane_box_data.ref = args.primref_buffer[lane_id_data.ref_id]; + char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); + char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes; + + offset = (int)(leaf_mem - (char*)qnode); + offset = offset >> 6; + } + + + if(intel_sub_group_ballot(is_inner)) + { + // collapse BVH2 into BVH6. + // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough + + uint2 kids; + if(is_inner) + { + lane_num_data.num_children = 2; + kids = BVH2_GetChildIndices( bvh2, bvh2_root ); + + lane_id_data.sg_bvh2_node = kids.x; + if ( SIMD8_lane == 1 ) + lane_id_data.sg_bvh2_node = kids.y; + } + + bool active = is_inner; + do + { + // choose the inner node with maximum area to replace. + // Its left child goes in its old location. Its right child goes in a new lane + + // TODO_OPT: We re-read the AABBs again and again to compute area + // ... store per-lane boxes instead and pre-compute areas + + varying float sg_area = 0; + varying bool sg_is_inner = false; + if(active) + { + sg_area = BVH2_GetNodeArea( bvh2, lane_id_data.sg_bvh2_node ); + sg_is_inner = BVH2_IsInnerNode( bvh2, lane_id_data.sg_bvh2_node ); + sg_area = (sg_is_inner && SIMD8_lane < lane_num_data.num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf + } + + float max_area = sub_group_reduce_max_N6_2xSIMD8_in_SIMD16( sg_area ); + varying bool sg_reducable = max_area == sg_area && sg_is_inner && (SIMD8_lane < lane_num_data.num_children); + uint mask = intel_sub_group_ballot( sg_reducable ) & (0xFF << SIMD8_id * 8); // we'll end up with two different masks for two SIMD8 in SIMD16 due to bits masking + + // TODO_OPT: Some of these ops seem redundant.. look at trimming further + + if ( mask == 0 ) + active = false; + + // choose the inner node with maximum area to replace + ushort victim_child = ctz( mask ); + uint victim_node = intel_sub_group_shuffle( lane_id_data.sg_bvh2_node, victim_child ); + if(active) + { + kids = BVH2_GetChildIndices( bvh2, victim_node ); + + if ( SIMD16_lane == victim_child ) // we use SIMD16_lane, cause victim_child was calculated based on SIMD16 i.e. second node will have victim from 8..13 + lane_id_data.sg_bvh2_node = kids.x; + else if ( SIMD8_lane == lane_num_data.num_children ) + lane_id_data.sg_bvh2_node = kids.y; + + lane_num_data.num_children++; + + if(lane_num_data.num_children >= TREE_ARITY) + active = false; + } + + } while ( intel_sub_group_ballot(active) ); // if any active, then continue + + // sum children from both halfs of SIMD16 to allocate nodes only once per sub_group + uniform ushort num_children = is_inner ? 
lane_num_data.num_children : 0; + uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); + uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); + + num_children = first_SIMD8_num_children + second_SIMD8_num_children; + uint kids_offset; + + // allocate inner node space + if(num_children && SIMD16_lane == 0) + kids_offset = allocate_inner_nodes( args.bvh_base, num_children ); + kids_offset = sub_group_broadcast(kids_offset, 0); + if((is_inner)) + { + kids_offset += SIMD8_id * first_SIMD8_num_children; + + struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset); + + offset = (int)((char*)kid - (char*)qnode) >> 6; + lane_box_data.box = BVH2_GetAABB( bvh2, lane_id_data.sg_bvh2_node ); + } + } + + // sort child nodes in descending order by AABB area + varying uint key = SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16(AABB_halfArea(&lane_box_data.box), lane_num_data.num_children ); + varying ushort sort_index = SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16(key); + lane_box_data.box = PrimRef_sub_group_shuffle(&lane_box_data.box, sort_index); + lane_id_data.sg_bvh2_node = intel_sub_group_shuffle(lane_id_data.sg_bvh2_node, sort_index); + + char* bvh_mem = (char*) args.bvh_base; + if (is_leaf && SIMD8_lane < lane_num_data.num_prims) + { + args.primref_indices[primref_index] = lane_id_data.ref_id; + args.primref_indices[primref_index + args.globals->numPrimitives] = qnode - (struct InternalNode*)bvh_mem; + } + + bool degenerated = false; + uint node_type = is_leaf ? args.leaf_type : args.inner_node_type; + + if(args.leaf_type == NODE_TYPE_INSTANCE) + degenerated = subgroup_setInstanceBox_2xSIMD8_in_SIMD16(&lane_box_data.box, lane_num_data.num_children, &mask, SIMD8_lane < lane_num_data.num_prims ? PRIMREF_instanceMask(&lane_box_data.ref) : 0, is_leaf); + + subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, node_type, &lane_box_data.box, lane_num_data.num_children, mask, (struct QBVHNodeN*)(qnode), degenerated, active_lane); + + // return child information + if(is_inner) + { + sg_children_out->x = lane_id_data.sg_bvh2_node; + sg_children_out->y = qnode_index + offset + SIMD8_lane; + } + + sg_children_out->z = lane_num_data.num_children; + + return is_inner ? 
lane_num_data.num_children : 0; +} + +void check_primref_integrity( global struct SAHBuildGlobals* globals ) +{ + global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); + global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, 0 ); + dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); + if ( get_local_id( 0 ) == 0 ) + { + for ( uint i = 0; i < num_primrefs; i++ ) + { + primref_out[i] = 0; + } + + for ( uint i = 0; i < num_primrefs; i++ ) + primref_out[primref_in[i]]++; + + for ( uint i = 0; i < num_primrefs; i++ ) + if ( primref_out[i] != 1 ) + printf( "Foo: %u %u\n", i, primref_out[i] ); + } +} + + + + +void check_bvh2(global struct SAHBuildGlobals* globals ) +{ + global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(globals); + global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); + global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out(globals, 0); + dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs(globals); + + if (get_local_id(0) == 0) + { + for (uint i = 0; i < num_primrefs; i++) + primref_out[i] = 0; + + uint stack[256]; + uint sp=0; + uint r = BVH2_GetRoot(bvh2); + stack[sp++] = r; + while (sp) + { + r = stack[--sp]; + if (BVH2_IsInnerNode(bvh2,r)) + { + uint2 kids = BVH2_GetChildIndices( bvh2, r); + if (kids.x >= bvh2->num_nodes || kids.y >= bvh2->num_nodes) + { + printf("BVH2!! Bad node index found!\n"); + return; + } + + stack[sp++] = kids.x; + stack[sp++] = kids.y; + } + else + { + uint ref = BVH2_GetLeafPrimStart(bvh2,r); + uint count = BVH2_GetLeafPrimCount(bvh2,r); + if( count == 0 ) + { + printf("BVH2!! Empty leaf found!\n"); + return; + } + for (uint i = 0; i < count; i++) + { + if (ref + i > num_primrefs) + { + printf("BVH2!! Bad leaf range!\n"); + return; + } + uint c = primref_out[ref+i]; + if (c != 0) + { + printf("BVH2!! overlapped prim ranges\n"); + return; + } + primref_out[ref+i] = 1; + if (primref_in[ref + i] >= num_primrefs) + { + printf("BAD PRIMREF ID FOUND!\n"); + return; + } + } + } + } + } + + printf("bvh2 is ok!\n"); +} + + +#if 0 +// TODO_OPT: Enable larger WGs. 
WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size(256,1,1)) ) +__attribute__( (intel_reqd_sub_group_size(8) ) ) +kernel void +build_qnodes( global struct SAHBuildGlobals* globals, global struct VContextScheduler* scheduler ) +{ + globals = globals + (scheduler->num_trivial_builds + scheduler->num_single_builds); + globals = globals + get_group_id(0); + + + struct BuildFlatTreeArgs args; + args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals ); + args.leaf_type = SAHBuildGlobals_GetLeafType( globals ); + args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals ); + args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); + args.bvh_base = SAHBuildGlobals_GetBVHBase( globals ); + args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); + args.globals = (global struct Globals*) globals->p_globals; + args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals ); + + dword alloc_backpointers = SAHBuildGlobals_NeedBackPointers( globals ); + global uint2* root_buffer = (global uint2*) globals->p_qnode_root_buffer; + global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base ); + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); + + local uint nodes_produced; + if ( get_sub_group_id() == 0 ) + { + // allocate first node + if (get_sub_group_local_id() == 0) + allocate_inner_nodes( args.bvh_base, 1 ); + + // first subgroup does first node + varying uint3 children_info; + uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, BVH2_GetRoot(args.bvh2), qnodes, 0, &children_info ); + + if ( get_sub_group_local_id() < num_children ) + root_buffer[get_sub_group_local_id()] = children_info.xy; + + if ( alloc_backpointers ) + { + // set root's backpointer + if( get_sub_group_local_id() == 0 ) + back_pointers[0] = (0xffffffc0) | (children_info.z << 3); + + // point child backpointers at the parent + if( get_sub_group_local_id() < num_children ) + back_pointers[children_info.y] = 0; + } + + if ( get_sub_group_local_id() == 0 ) + nodes_produced = num_children; + } + + barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE ); + + + uniform uint buffer_index = get_sub_group_id(); + uniform bool sg_active = buffer_index < nodes_produced; + + while ( work_group_any( sg_active ) ) + { + if( sg_active ) + { + uniform uint bvh2_node = root_buffer[buffer_index].x; + uniform uint qnode_index = root_buffer[buffer_index].y; + + // build a node + varying uint3 children_info; + uniform ushort num_children = SUBGROUP_BuildFlatTreeNode( args, bvh2_node, qnodes + qnode_index, qnode_index, &children_info ); + + // handle backpointers + if ( alloc_backpointers ) + { + // update this node's backpointer with child count + if ( get_sub_group_local_id() == 0 ) + back_pointers[qnode_index] |= (children_info.z << 3); + + // point child backpointers at parent + if ( get_sub_group_local_id() < num_children ) + back_pointers[children_info.y] = (qnode_index << 6); + } + + if ( num_children ) + { + // allocate space in the child buffer + uint root_buffer_position = 0; + if ( get_sub_group_local_id() == 0 ) + root_buffer_position = atomic_add_local( &nodes_produced, num_children ); + root_buffer_position = sub_group_broadcast( root_buffer_position, 0 ); + + // store child indices in root buffer + if ( get_sub_group_local_id() < num_children ) + 
root_buffer[root_buffer_position + get_sub_group_local_id()] = children_info.xy; + } + } + + // sync everyone + work_group_barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, + memory_scope_work_group ); + + + if( sg_active ) + buffer_index += get_num_sub_groups(); + + sg_active = (buffer_index < nodes_produced); + } +} +#endif + + + + + + + +inline bool buffer_may_overflow( uint capacity, uint current_size, uint elements_processed_per_sub_group ) +{ + uint num_consumed = min( get_num_sub_groups() * elements_processed_per_sub_group, current_size ); + uint space_available = (capacity - current_size) + num_consumed; + uint space_needed = TREE_ARITY * num_consumed; + return space_available < space_needed; +} + +inline uint build_qnodes_pc( + global struct SAHBuildGlobals* globals, + bool alloc_backpointers, + bool process_masks, + uint first_qnode, + uint first_bvh2_node, + + local uint2* SLM_local_root_buffer, + local uint* SLM_ring_tail, + const uint RING_SIZE +) + +{ + struct BuildFlatTreeArgs args; + args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals ); + args.leaf_type = SAHBuildGlobals_GetLeafType( globals ); + args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals ); + args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); + args.bvh_base = SAHBuildGlobals_GetBVHBase( globals ); + args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); + args.globals = (global struct Globals*) globals->p_globals; + args.do_mask_processing = process_masks; + + global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base ); + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); + + // first subgroup adds first node + if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0) + { + SLM_local_root_buffer[0].x = first_bvh2_node; + SLM_local_root_buffer[0].y = first_qnode; + *SLM_ring_tail = 1; + + } + + uint ring_head = 0; + uint ring_tail = 1; + uint ring_size = 1; + + barrier( CLK_LOCAL_MEM_FENCE ); + + const uniform uint elements_processed_in_sg = 2; + + while ( ring_size > 0 && !buffer_may_overflow( RING_SIZE, ring_size, elements_processed_in_sg ) ) + { + ushort SIMD16_lane = get_sub_group_local_id(); + + // SIMD16 as 2xSIMD8 + ushort SIMD8_lane = get_sub_group_local_id() % 8; + ushort SIMD8_id = get_sub_group_local_id() / 8; + bool active_lane; + + uniform uint nodes_consumed = min( get_num_sub_groups() * elements_processed_in_sg, ring_size ); // times two because we process two nodes in subgroup + uniform bool sg_active = get_sub_group_id() * elements_processed_in_sg < nodes_consumed; + ushort num_children = 0; + varying uint3 children_info = 0; + + uint bvh2_node = 0; + uint qnode_index = 0; + + if (sg_active) + { + ushort consumed_pos = get_sub_group_id() * elements_processed_in_sg + SIMD8_id; + active_lane = consumed_pos < nodes_consumed ? true : false; + consumed_pos = consumed_pos < nodes_consumed ? 
consumed_pos : consumed_pos-1; + + uint buffer_index = (ring_head + consumed_pos) % RING_SIZE; + + bvh2_node = SLM_local_root_buffer[buffer_index].x; + qnode_index = SLM_local_root_buffer[buffer_index].y; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + if (sg_active) + { + // build a node + num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, bvh2_node, qnodes, qnode_index, &children_info, active_lane); + + // handle backpointers + // TODO_OPT: This should be separate shaders not a runtime branch + // doing it this way for now because GRLTLK does not make dynamic shader selection on host very easy. + // this needs to change... GRLTLK should + + if (alloc_backpointers && active_lane) + { + // update this node's backpointer with child count + if (SIMD8_lane == 0) + back_pointers[qnode_index] |= (children_info.z << 3); + + // point child backpointers at parent + if (SIMD8_lane < num_children) + back_pointers[children_info.y] = (qnode_index << 6); + } + + // save data + + uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); + uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); + uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children; + + uint root_buffer_position = 0; + + // allocate space in the child buffer + if (SIMD16_lane == 0 && SIMD16_num_children) + root_buffer_position = atomic_add_local(SLM_ring_tail, SIMD16_num_children); + + root_buffer_position = sub_group_broadcast( root_buffer_position, 0 ); + root_buffer_position += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16 + + // store child indices in root buffer + if (SIMD8_lane < num_children) + { + uint store_pos = (root_buffer_position + SIMD8_lane) % RING_SIZE; + SLM_local_root_buffer[store_pos] = children_info.xy; + } + } + + // sync everyone + barrier( CLK_LOCAL_MEM_FENCE ); + + ring_head += nodes_consumed; + ring_tail = *SLM_ring_tail; + ring_size = ring_tail - ring_head; + } + + return ring_head; +} + + + + +inline void amplify_and_spill( + global struct SAHBuildGlobals* globals, + dword alloc_backpointers, + uint first_qnode, + uint first_bvh2_node, + global uint2* global_root_buffer, + local uint* root_buffer_counter, + const uint RING_SIZE +) + +{ + struct BuildFlatTreeArgs args; + args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals); + args.leaf_type = SAHBuildGlobals_GetLeafType(globals); + args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals); + args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals); + args.bvh_base = SAHBuildGlobals_GetBVHBase(globals); + args.bvh2 = SAHBuildGlobals_GetBVH2(globals); + args.globals = (global struct Globals*) globals->p_globals; + + global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base); + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base); + + + varying uint3 children_info; + uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, first_bvh2_node, qnodes + first_qnode, first_qnode, &children_info); + + if (alloc_backpointers) + { + // set first node's backpointer + if (get_sub_group_local_id() == 0) + { + // if first node is root, use root sentinel in backpointer + // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread) + uint bp = 0xffffffc0; + if (first_qnode != 0) + bp = 
back_pointers[first_qnode]; + bp |= (children_info.z << 3); + + back_pointers[first_qnode] = bp; + } + + // point child backpointers at the parent + if (get_sub_group_local_id() < num_children) + back_pointers[children_info.y] = (first_qnode << 6); + } + + if (num_children) + { + uint spill_pos = 0; + if (get_sub_group_local_id() == 0) + spill_pos = atomic_add_local(root_buffer_counter,num_children); + + spill_pos = sub_group_broadcast(spill_pos, 0); + + if (get_sub_group_local_id() < num_children) + global_root_buffer[spill_pos+get_sub_group_local_id()] = children_info.xy; + } + +} + + + + +inline void build_qnodes_pc_kickoff_func( + global struct SAHBuildGlobals* globals, + global uint2* root_buffer, + bool alloc_backpointers, + bool process_masks, + + local uint2* SLM_local_root_buffer, + local uint* SLM_spill_pos, + local uint* SLM_ring_tail, + int RING_SIZE +) +{ + // allocate first node + if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0 ) + allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(globals), 1 ); + + *SLM_spill_pos=0; + + uint ring_head = build_qnodes_pc( globals, alloc_backpointers, process_masks, + 0, BVH2_GetRoot(SAHBuildGlobals_GetBVH2(globals)), SLM_local_root_buffer, SLM_ring_tail, RING_SIZE ); + + + uint n = *SLM_ring_tail - ring_head; + if (n > 0) + { +#if 0 + // do an additional round of amplification so we can get more nodes into the root buffer and go wider in the next phase + /// JDB TODO: this is causing hangs on DG2 for metro, so disabling for now... + for (uint i = get_sub_group_id(); i < n; i+= get_num_sub_groups() ) + { + uint consume_pos = (ring_head + i) % RING_SIZE; + uniform uint bvh2_root = SLM_local_root_buffer[consume_pos].x; + uniform uint qnode_root = SLM_local_root_buffer[consume_pos].y; + + amplify_and_spill( globals, alloc_backpointers, qnode_root, bvh2_root, root_buffer, SLM_spill_pos, RING_SIZE ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); +#else + for (uint i = get_local_id(0); i < n; i += get_local_size(0)) + root_buffer[i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; +#endif + + if (get_local_id(0) == 0) + { + globals->root_buffer_num_produced = n; + globals->root_buffer_num_produced_hi = 0; + globals->root_buffer_num_consumed = 0; + globals->root_buffer_num_consumed_hi = 0; + } + } +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 256, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +build_qnodes_pc_kickoff( + global struct SAHBuildGlobals* globals, + global uint2* root_buffer, + dword sah_flags +) +{ + bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; + bool process_masks = sah_flags & SAH_FLAG_NEED_MASKS; + + + const int RING_SIZE = 64; + + local uint2 SLM_local_root_buffer[RING_SIZE]; + local uint SLM_spill_pos; + local uint SLM_ring_tail; + + build_qnodes_pc_kickoff_func(globals, + root_buffer, + alloc_backpointers, + process_masks, + SLM_local_root_buffer, + &SLM_spill_pos, + &SLM_ring_tail, + RING_SIZE + ); +} + + + + +inline void build_qnodes_pc_amplify_func( + global struct SAHBuildGlobals* globals, + global uint2* root_buffer, + bool alloc_backpointers, + bool process_masks, + + local uint2* SLM_local_root_buffer, + local uint* SLM_broadcast, + local uint* SLM_ring_tail, + int RING_SIZE + ) +{ + // TODO_OPT: Probably don't need this atomic.. 
could clear 'num_consumed' every time + // and just use get_group_id() + // + + if (get_local_id(0) == 0) + *SLM_broadcast = atomic_inc_global(&globals->root_buffer_num_consumed); + + barrier( CLK_LOCAL_MEM_FENCE ); + + uniform uint consume_pos = *SLM_broadcast; + uniform uint bvh2_root = root_buffer[consume_pos].x; + uniform uint qnode_root = root_buffer[consume_pos].y; + + uint ring_head = build_qnodes_pc(globals, alloc_backpointers,process_masks, + qnode_root, bvh2_root, SLM_local_root_buffer, SLM_ring_tail, RING_SIZE); + + // TODO_OPT: Instead of spilling the nodes, do one more round of amplification and write + // generated children directly into the root buffer. This should allow faster amplification + + // spill root buffer contents + uint n = *SLM_ring_tail - ring_head; + if (n > 0) + { + + if (get_local_id(0) == 0) + *SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n); + + barrier( CLK_LOCAL_MEM_FENCE ); + uint produce_pos = *SLM_broadcast; + + for (uint i = get_local_id(0); i < n; i += get_local_size(0)) + root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; + } +} + + + + + +// Process two nodes per wg during amplification phase. +// DOing it this way ensures maximum parallelism +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void +build_qnodes_pc_amplify( + global struct SAHBuildGlobals* globals, + global uint2* root_buffer, + dword sah_flags ) +{ + bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; + + struct BuildFlatTreeArgs args; + args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals); + args.leaf_type = SAHBuildGlobals_GetLeafType(globals); + args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals); + args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); + args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals); + args.bvh_base = SAHBuildGlobals_GetBVHBase(globals); + args.bvh2 = SAHBuildGlobals_GetBVH2(globals); + args.globals = (global struct Globals*) globals->p_globals; + args.do_mask_processing = sah_flags & SAH_FLAG_NEED_MASKS; + + global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base); + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base); + + ushort SIMD16_lane = get_sub_group_local_id(); + + // SIMD16 as 2xSIMD8 + ushort SIMD8_lane = get_sub_group_local_id() % 8; + ushort SIMD8_id = get_sub_group_local_id() / 8; + bool active_lane = false; + + uint consume_pos; + consume_pos = globals->root_buffer_num_consumed + get_group_id(0) * 2; // times 2 because we process two nodes in workgroup + consume_pos += SIMD8_id; + + active_lane = consume_pos < globals->root_buffer_num_to_consume ? true : false; + consume_pos = consume_pos < globals->root_buffer_num_to_consume ? 
consume_pos : consume_pos-1; + + uint first_bvh2_node = root_buffer[consume_pos].x; + uint first_qnode = root_buffer[consume_pos].y; + + varying uint3 children_info; + ushort num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, first_bvh2_node, qnodes, first_qnode, &children_info, active_lane); + + if (alloc_backpointers && active_lane) + { + // set first node's backpointer + if (SIMD8_lane == 0) + { + // if first node is root, use root sentinel in backpointer + // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread) + uint bp = 0xffffffc0; + if (first_qnode != 0) + bp = back_pointers[first_qnode]; + bp |= (children_info.z << 3); + + back_pointers[first_qnode] = bp; + } + + // point child backpointers at the parent + if (SIMD8_lane < num_children) + back_pointers[children_info.y] = (first_qnode << 6); + } + + // save data + { + // sum children from both halfs of SIMD16 to do only one atomic per sub_group + uint produce_pos; + uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); + uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); + uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children; + + if (SIMD16_lane == 0 && SIMD16_num_children) + produce_pos = atomic_add_global(&globals->root_buffer_num_produced, SIMD16_num_children); + + produce_pos = sub_group_broadcast(produce_pos, 0); + produce_pos += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16 + + if (SIMD8_lane < num_children) + { + root_buffer[produce_pos + SIMD8_lane] = children_info.xy; + } + } +} + + +////////// +// +// Batched version of qnode creation +// +////////// + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +kernel void +build_qnodes_init_scheduler_batched(global struct QnodeScheduler* scheduler, dword num_builds, dword num_max_qnode_global_root_buffer_entries) +{ + + scheduler->batched_build_offset = scheduler->num_trivial_builds + scheduler->num_single_builds; + scheduler->batched_build_count = num_builds - scheduler->batched_build_offset; + scheduler->num_max_qnode_global_root_buffer_entries = num_max_qnode_global_root_buffer_entries; + + const uint num_builds_to_process = scheduler->batched_build_count; + const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; + + scheduler->batched_builds_to_process = num_builds_to_process; + scheduler->num_qnode_grb_curr_entries = (num_builds_to_process + 15) / 16; // here we store number of workgroups for "build_qnodes_begin_batchable" kernel + scheduler->num_qnode_grb_new_entries = num_builds_to_process; + scheduler->qnode_global_root_buffer.curr_entries_offset = max_qnode_grb_entries; +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +build_qnodes_begin_batchable(global struct QnodeScheduler* scheduler, + global struct SAHBuildGlobals* builds_globals) +{ + const uint tid = get_group_id(0) * get_local_size(0) + get_local_id(0); + + const uint num_builds_to_process = scheduler->batched_builds_to_process; + + if(tid < num_builds_to_process) + { + const uint build_idx = scheduler->batched_build_offset + tid; + + uint bvh2_node = BVH2_GetRoot(SAHBuildGlobals_GetBVH2(&builds_globals[build_idx])); + uint qnode = 0; + struct QNodeGlobalRootBufferEntry entry = { bvh2_node, qnode, build_idx, 1}; + 
scheduler->qnode_global_root_buffer.entries[tid] = entry; + + builds_globals[build_idx].root_buffer_num_produced = 0; + builds_globals[build_idx].root_buffer_num_produced_hi = 0; + builds_globals[build_idx].root_buffer_num_consumed = 0; + builds_globals[build_idx].root_buffer_num_consumed_hi = 0; + + // allocate first node for this build + //allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx]), 1 ); + SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx])->nodeDataCur++; + } +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 1, 1, 1 )) ) +kernel void +build_qnodes_scheduler(global struct QnodeScheduler* scheduler) +{ + const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; + + uint new_entries = min(scheduler->num_qnode_grb_new_entries, max_qnode_grb_entries); + + scheduler->num_qnode_grb_curr_entries = new_entries; + scheduler->num_qnode_grb_new_entries = 0; + scheduler->qnode_global_root_buffer.curr_entries_offset = scheduler->qnode_global_root_buffer.curr_entries_offset ? 0 : max_qnode_grb_entries; +} + + + + +// TODO_OPT: Enable larger WGs. WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 32, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +build_qnodes_pc_amplify_batched( + global struct SAHBuildGlobals* builds_globals, + global struct QnodeScheduler* scheduler + ) +{ + const uint group_id = get_group_id(0); + + global struct QNodeGlobalRootBuffer* global_root_buffer = &scheduler->qnode_global_root_buffer; + const uint curr_entries_offset = global_root_buffer->curr_entries_offset; + struct QNodeGlobalRootBufferEntry entry = global_root_buffer->entries[curr_entries_offset + group_id]; + + const uint build_id = entry.build_idx; + + global struct SAHBuildGlobals* globals = &builds_globals[build_id]; + global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer; + bool alloc_backpointers = SAHBuildGlobals_NeedBackPointers(globals); + bool process_masks = SAHBuildGlobals_NeedMasks(globals); + + const int RING_SIZE = 32; // for 2 SGs, 16 should result in 2 rounds: one SG produces 6, then 2 SGs consume 2 and produce 12 + // for 4 SGs, 32 results in 2 rounds: one SG produces 6, 4 SGs consume 4 and produce 24, resulting in 26 + + local uint2 SLM_local_root_buffer[RING_SIZE]; + local uint SLM_broadcast; + local uint SLM_ring_tail; + local uint SLM_grb_broadcast; + + + //// This below can be moved to separate function if needed for TLAS //// + + uniform uint bvh2_root = entry.bvh2_node; + uniform uint qnode_root = entry.qnode; + + uint ring_head = build_qnodes_pc(globals, alloc_backpointers, process_masks, + qnode_root, bvh2_root, SLM_local_root_buffer, &SLM_ring_tail, RING_SIZE); + + // spill root buffer contents + uint n = SLM_ring_tail - ring_head; + if (n > 0) + { + const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; + + if (get_local_id(0) == 0) + { + SLM_grb_broadcast = atomic_add_global(&scheduler->num_qnode_grb_new_entries, n); + + if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, then make space in build's root_buffer + SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n); + else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then make space in build's root_buffer + SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n - 
(max_qnode_grb_entries - SLM_grb_broadcast)); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + uint produce_pos = SLM_broadcast; + + uint grb_produce_num = n; // grb stands for global_root_buffer + uint lrb_produce_num = 0; // lrb stands for local root buffer, meaning this build's root_buffer + + if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, don't write to it + { + grb_produce_num = 0; + lrb_produce_num = n; + } + else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then decrease amount of entries and store rest in build's root buffer + { + grb_produce_num = max_qnode_grb_entries - SLM_grb_broadcast; + lrb_produce_num = n - grb_produce_num; + } + + // save data to global_root_buffer + for(uint i = get_local_id(0); i < grb_produce_num; i += get_local_size(0)) + { + const uint2 slm_record = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; + + struct QNodeGlobalRootBufferEntry new_entry; + new_entry.bvh2_node = slm_record.x; + new_entry.qnode = slm_record.y; + new_entry.build_idx = entry.build_idx; + + const uint new_entries_offset = curr_entries_offset ? 0 : max_qnode_grb_entries; + global_root_buffer->entries[new_entries_offset + SLM_grb_broadcast + i] = new_entry; + } + + // if anything left, write to build's root buffer + for (uint i = get_local_id(0); i < lrb_produce_num; i += get_local_size(0)) + root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i + grb_produce_num) % RING_SIZE]; + } +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void +build_qnodes_try_to_fill_grb_batched( + global struct SAHBuildGlobals* builds_globals, + global struct QnodeScheduler* scheduler + ) +{ + const uint build_id = scheduler->batched_build_offset + get_group_id(0); + global struct SAHBuildGlobals* globals = &builds_globals[build_id]; + global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer; + + global struct QNodeGlobalRootBuffer* qnode_root_buffer = (global struct QNodeGlobalRootBuffer*)&scheduler->qnode_global_root_buffer; + + const uint num_produced = globals->root_buffer_num_produced; + const uint num_consumed = globals->root_buffer_num_consumed; + const uint entries = num_produced - num_consumed; // entries to build's root buffer + + if(!entries) + return; + + uint global_root_buffer_offset; + if(get_local_id(0) == 0) + global_root_buffer_offset = atomic_add_global(&scheduler->num_qnode_grb_new_entries, entries); + + global_root_buffer_offset = sub_group_broadcast(global_root_buffer_offset, 0); + + const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; + + if(global_root_buffer_offset >= max_qnode_grb_entries) // if global_root_buffer is full, then return + return; + + uint global_root_buffer_produce_num = entries; + if(global_root_buffer_offset + entries >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then reduce number of entries to push + global_root_buffer_produce_num = max_qnode_grb_entries - global_root_buffer_offset; + + for(uint i = get_local_id(0); i < global_root_buffer_produce_num; i += get_local_size(0)) + { + const uint2 entry = root_buffer[num_consumed + i]; + + struct QNodeGlobalRootBufferEntry new_entry; + new_entry.bvh2_node = entry.x; + new_entry.qnode = entry.y; + new_entry.build_idx = build_id; + + const uint new_entries_offset = qnode_root_buffer->curr_entries_offset ? 
0 : max_qnode_grb_entries; + qnode_root_buffer->entries[new_entries_offset + global_root_buffer_offset + i] = new_entry; + } + + if(get_local_id(0) == 0) + globals->root_buffer_num_consumed += global_root_buffer_produce_num; +} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl new file mode 100644 index 00000000000..1f64ef3fbe2 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl @@ -0,0 +1,2025 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "intrinsics.h" +#include "AABB3f.h" +#include "AABB.h" +#include "GRLGen12.h" +#include "quad.h" +#include "common.h" +#include "instance.h" + +#include "api_interface.h" + +#include "binned_sah_shared.h" + + +#if 0 +#define LOOP_TRIPWIRE_INIT uint _loop_trip=0; + +#define LOOP_TRIPWIRE_INCREMENT(max_iterations) \ + _loop_trip++;\ + if ( _loop_trip > max_iterations )\ + {\ + if( get_local_id(0) == 0 )\ + printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!! group=%u\n", get_group_id(0) );\ + break;\ + } +#else + +#define LOOP_TRIPWIRE_INIT +#define LOOP_TRIPWIRE_INCREMENT(max_iterations) + +#endif + + +// ========================================================= +// DFS +// ========================================================= + +// there are 128 threads x SIMD16 == 2048 lanes in a DSS +// There is 128KB of SLM. Upper limit of 64KB per WG, so target is 2 groups of 1024 lanes @ 64K each +// --> Full occupancy requires using less than 64B per lane +// +// Groups of 256 lanes gives us 16KB per group +// + +// We use subgroups very heavily here in order to avoid +// use of per-thread scratch space for intermediate values + +#define DFS_WG_SIZE 256 +#define DFS_NUM_SUBGROUPS 16 +#define DFS_BVH2_NODE_COUNT (2*(DFS_WG_SIZE)-1) +#define TREE_ARITY 6 + +// FlatTree node limits: +// these are the derivations if we always collapse to one primitive and pack nodes as tightly as possible +// If BVH2 construction is allowed to terminate early and place multiple prims in a leaf, these numbers will be too low +#if 0 + +// maximum flattree size is the number of inner nodes in a full M-ary tree with one leaf per primitive +// This is given by I = (L-1)/(M-1) +// For a 256 thread workgroup, L=256, M=6, this gives: 51 +#define DFS_MAX_FLATTREE_NODES 51 + + +// A flattree leaf is a node which contains only primitives. +// +// The maximum number of leaves is related to the number of nodes as: +// L(N) = ((M-1)*N + 1) / M +// +#define DFS_MAX_FLATTREE_LEAFS 43 // = 43 for 256 thread WG (L=256, M=6) + +#else + +// This is the result of estimate_qbvh6_nodes(256) + +#define DFS_MAX_FLATTREE_LEAFS 256 +#define DFS_MAX_FLATTREE_NODES 307 // 256 fat-leaves + 51 inner nodes. 
51 = ceil(256/5) +#define DFS_MAX_FLATTREE_DEPTH 52 // number of inner nodes in the worst-case tree + +#endif + +#define uniform +#define varying + + +struct DFSArgs +{ + global struct BVHBase* bvh_base; + global PrimRef* primref_buffer; + ushort leaf_node_type; + ushort inner_node_type; + ushort leaf_size_in_bytes; + bool need_backpointers; + bool need_masks; + ushort num_primrefs; + global uint* primref_index_buffer; +}; + + +struct DFSPrimRefAABB +{ + half lower[3]; + half upper[3]; +}; + +GRL_INLINE void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb ) +{ + bb->lower[0] = 1; + bb->lower[1] = 1; + bb->lower[2] = 1; + bb->upper[0] = 0; + bb->upper[1] = 0; + bb->upper[2] = 0; +} + +GRL_INLINE void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v ) +{ + aabb->lower[0] = min( aabb->lower[0], v->lower[0] ); + aabb->lower[1] = min( aabb->lower[1], v->lower[1] ); + aabb->lower[2] = min( aabb->lower[2], v->lower[2] ); + aabb->upper[0] = max( aabb->upper[0], v->upper[0] ); + aabb->upper[1] = max( aabb->upper[1], v->upper[1] ); + aabb->upper[2] = max( aabb->upper[2], v->upper[2] ); +} + +GRL_INLINE float DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb ) +{ + const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]); + return fma( d.x, (d.y + d.z), d.y * d.z ); +} + +GRL_INLINE struct DFSPrimRefAABB DFSPrimRefAABB_sub_group_reduce( struct DFSPrimRefAABB* aabb ) +{ + struct DFSPrimRefAABB bounds; + bounds.lower[0] = sub_group_reduce_min( aabb->lower[0] ); + bounds.lower[1] = sub_group_reduce_min( aabb->lower[1] ); + bounds.lower[2] = sub_group_reduce_min( aabb->lower[2] ); + bounds.upper[0] = sub_group_reduce_max( aabb->upper[0] ); + bounds.upper[1] = sub_group_reduce_max( aabb->upper[1] ); + bounds.upper[2] = sub_group_reduce_max( aabb->upper[2] ); + return bounds; +} + +struct DFSPrimRef +{ + struct DFSPrimRefAABB aabb; + uint2 meta; +}; + +struct PrimRefMeta +{ + uchar2 meta; +}; + +GRL_INLINE uint PrimRefMeta_GetInputIndex( struct PrimRefMeta* it ) +{ + return it->meta.x; +} +GRL_INLINE uint PrimRefMeta_GetInstanceMask( struct PrimRefMeta* it ) +{ + return it->meta.y; +} + + +struct PrimRefSet +{ + struct AABB3f root_aabb; + struct DFSPrimRefAABB AABB[DFS_WG_SIZE]; + uint2 meta[DFS_WG_SIZE]; + +}; + +GRL_INLINE local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id ) +{ + return &refs->AABB[id]; +} + +GRL_INLINE float PrimRefSet_GetMaxAABBArea( local struct PrimRefSet* refs ) +{ + float3 root_l = AABB3f_load_lower( &refs->root_aabb ); + float3 root_u = AABB3f_load_upper( &refs->root_aabb ); + float3 d = root_u - root_l; + float scale = 1.0f / max( d.x, max( d.y, d.z ) ); + + half3 dh = convert_half3_rtp( d * scale ); + return fma( dh.x, (dh.y + dh.z), dh.y * dh.z ); +} + +GRL_INLINE float3 ulp3( float3 v ) { + + return fabs(v) * FLT_EPSILON; +} + +GRL_INLINE struct AABB PrimRefSet_ConvertAABB( local struct PrimRefSet* refs, struct DFSPrimRefAABB* box ) +{ + float3 root_l = AABB3f_load_lower( &refs->root_aabb ); + float3 root_u = AABB3f_load_upper( &refs->root_aabb ); + float3 d = root_u - root_l; + float scale = max( d.x, max( d.y, d.z ) ); + + float3 l = convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) ); + float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) ); + l = l * scale + root_l ; + u = u * scale + root_l ; + + // clamping is necessary in case that a vertex lies exactly in the upper AABB plane. 
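+    // (assuming the root AABB bounds every primref, as it is constructed to, the min() clamp below can
+    //  only trim rounded-up overshoot past the root bounds; it never cuts into the original box.)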
+ // If we use unclamped values, roundoff error in the scale factor calculation can cause us + // to snap to a flattened AABB that lies outside of the original one, resulting in missed geometry. + u = min( u, root_u ); + l = min( l, root_u ); + + struct AABB r; + r.lower.xyz = l.xyz; + r.upper.xyz = u.xyz; + return r; +} + +GRL_INLINE PrimRef PrimRefSet_GetFullPrecisionAABB( local struct PrimRefSet* refs, ushort id ) +{ + struct AABB r; + r = PrimRefSet_ConvertAABB( refs, &refs->AABB[id] ); + r.lower.w = 0; + r.upper.w = 0; + return r; +} + +GRL_INLINE uint PrimRefSet_GetInputIndex( local struct PrimRefSet* refs, ushort id ) +{ + return refs->meta[id].x; +} + +GRL_INLINE uint PrimRefSet_GetInstanceMask( local struct PrimRefSet* refs, ushort id ) +{ + return refs->meta[id].y; +} +GRL_INLINE struct PrimRefMeta PrimRefSet_GetMeta( local struct PrimRefSet* refs, ushort id ) +{ + struct PrimRefMeta meta; + meta.meta.x = refs->meta[id].x; + meta.meta.y = refs->meta[id].y; + return meta; +} + + +GRL_INLINE struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id ) +{ + struct DFSPrimRef r; + r.aabb = refs->AABB[id]; + r.meta = refs->meta[id]; + return r; +} + + +GRL_INLINE void PrimRefSet_SetPrimRef_FullPrecision( local struct PrimRefSet* refs, PrimRef ref, ushort id ) +{ + + float3 root_l = AABB3f_load_lower( &refs->root_aabb ); + float3 root_u = AABB3f_load_upper( &refs->root_aabb ); + float3 d = root_u - root_l; + float scale = 1.0f / max(d.x, max(d.y,d.z)); + + float3 l = ref.lower.xyz; + float3 u = ref.upper.xyz; + half3 lh = convert_half3_rtz( (l - root_l) * scale ); + half3 uh = convert_half3_rtp( (u - root_l) * scale ); + + refs->AABB[id].lower[0] = lh.x; + refs->AABB[id].lower[1] = lh.y; + refs->AABB[id].lower[2] = lh.z; + refs->AABB[id].upper[0] = uh.x; + refs->AABB[id].upper[1] = uh.y; + refs->AABB[id].upper[2] = uh.z; + refs->meta[id].x = id; + refs->meta[id].y = PRIMREF_instanceMask(&ref); + + +} + +GRL_INLINE void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id ) +{ + refs->AABB[id] = ref.aabb; + refs->meta[id] = ref.meta; +} + +GRL_INLINE struct AABB3f PrimRefSet_GetRootAABB( local struct PrimRefSet* refs ) +{ + return refs->root_aabb; +} + +GRL_INLINE void SUBGROUP_PrimRefSet_Initialize( local struct PrimRefSet* refs ) +{ + if ( get_sub_group_local_id() == 0 ) + AABB3f_init( &refs->root_aabb ); // TODO_OPT: subgroup-vectorized version of AABB3f_init +} + + +GRL_INLINE void PrimRefSet_Printf( local struct PrimRefSet* refs, ushort num_prims ) +{ + + barrier( CLK_LOCAL_MEM_FENCE ); + if ( get_local_id( 0 ) == 0 ) + { + printf( "Scene AABB:\n" ); + struct AABB3f rootBox = PrimRefSet_GetRootAABB( refs ); + AABB3f_print( &rootBox ); + + float ma = PrimRefSet_GetMaxAABBArea( refs ); + + for ( uint i = 0; i < num_prims; i++ ) + { + printf( "Ref: %u\n", i ); + struct AABB r = PrimRefSet_GetFullPrecisionAABB( refs, i ); + AABB_print( &r ); + + float a = DFSPrimRefAABB_halfArea( PrimRefSet_GetAABBPointer( refs, i ) ); + printf( "Scaled Area: %f / %f = %f \n", a, ma, a / ma ); + + } + } + barrier( CLK_LOCAL_MEM_FENCE ); +} + + + +GRL_INLINE void PrimRefSet_CheckBounds( local struct PrimRefSet* refs, ushort num_prims, PrimRef* primref_buffer ) +{ + + barrier( CLK_LOCAL_MEM_FENCE ); + if ( get_local_id( 0 ) == 0 ) + { + + for ( uint i = 0; i < num_prims; i++ ) + { + PrimRef ref = primref_buffer[i]; + struct AABB r2 = PrimRefSet_GetFullPrecisionAABB( refs, i ); + + struct DFSPrimRefAABB* box = &refs->AABB[i]; + float3 l = 
convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) ); + float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) ); + + printf( " halfs:{%x,%x,%x}{%x,%x,%x}\n", as_uint(l.x), as_uint(l.y), as_uint(l.z), as_uint(u.x), as_uint(u.y), as_uint(u.z) ); + + printf( " {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%u,%u,%u,%u,%u,%u}\n", + ref.lower.x, ref.lower.y, ref.lower.z, r2.lower.x, r2.lower.y, r2.lower.z, + ref.upper.x, ref.upper.y, ref.upper.z, r2.upper.x, r2.upper.y, r2.upper.z, + r2.lower.x <= ref.lower.x, + r2.lower.y <= ref.lower.y, + r2.lower.z <= ref.lower.z, + + r2.upper.x >= ref.upper.x, + r2.upper.y >= ref.upper.y, + r2.upper.z >= ref.upper.z ); + + } + + } + barrier( CLK_LOCAL_MEM_FENCE ); +} + + + +struct LocalBVH2 +{ + uint num_nodes; + uint nodes[DFS_BVH2_NODE_COUNT]; + + // nodes are a bitfield: + // bits 8:0 (9b) ==> number of primrefs in this subtree + // + // bits 17:9 (9b) ==> for an inner node: contains offset to a pair of children + // ==> for a leaf node: contains index of the first primref in this leaf + // + // bits 30:18 (13b) ==> quantized AABB area (relative to root box) + // bit 31 (1b) ==> is_inner flag + // + // NOTE: The left child offset of any node is always odd.. therefore, it is possible to recover a bit if we need it + // by storing only the 8 MSBs +}; + +#define DFS_BVH2_AREA_QUANT 8191.0f + + + +GRL_INLINE void SUBGROUP_LocalBVH2_Initialize( local struct LocalBVH2* tree, ushort num_prims ) +{ + tree->num_nodes = 1; // include the root node + tree->nodes[0] = num_prims; // initialize root node as a leaf containing the full subtree + +} + +GRL_INLINE void LocalBVH2_CreateInnerNode( local struct LocalBVH2* tree, ushort node_index, + ushort start_left, ushort start_right, + ushort quantized_left_area, ushort quantized_right_area ) +{ + uint child_pos = atomic_add_local( &tree->num_nodes, 2 ); + + // set the inner node flag and child position in the parent + // leave the other bits intact + uint parent_node = tree->nodes[node_index]; + parent_node |= 0x80000000; + parent_node = (parent_node & ~(0x1ff<<9)) | (child_pos << 9); + tree->nodes[node_index] = parent_node; + + // setup children as leaf nodes with prim-count zero + uint left_child = (convert_uint(start_left) << 9) | (convert_uint( quantized_left_area ) << 18); + uint right_child = (convert_uint(start_right) << 9) | (convert_uint( quantized_right_area ) << 18); + tree->nodes[child_pos] = left_child; + tree->nodes[child_pos + 1] = right_child; + +} + +GRL_INLINE ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* tree, ushort node_index ) +{ + // increment only the lower bits. 
Given correct tree construction algorithm this will not overflow into MSBs + return (atomic_inc_local( &tree->nodes[node_index] )) & 0x1ff; +} + +GRL_INLINE ushort LocalBVH2_GetNodeArea( local struct LocalBVH2* tree, ushort nodeID ) +{ + return (tree->nodes[nodeID] >> 18) & 0x1FFF; +} + +GRL_INLINE bool LocalBVH2_IsInnerNode( local struct LocalBVH2* tree, ushort nodeID ) +{ + return (tree->nodes[nodeID] & 0x80000000) != 0; +} + + +GRL_INLINE ushort2 LocalBVH2_GetChildIndices( local struct LocalBVH2* tree, ushort nodeID ) +{ + ushort idx = ((tree->nodes[nodeID] >> 9) & 0x1FF); + return (ushort2)(idx, idx + 1); +} + +GRL_INLINE ushort LocalBVH2_GetSubtreePrimCount( local struct LocalBVH2* tree, ushort node ) +{ + return tree->nodes[node] & 0x1FF; +} + +GRL_INLINE ushort LocalBVH2_GetLeafPrimStart( local struct LocalBVH2* tree, ushort node ) +{ + return ((tree->nodes[node] >> 9) & 0x1FF); +} + + +GRL_INLINE void LocalBVH2_Printf( local struct LocalBVH2* tree ) +{ + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( get_local_id( 0 ) == 0 ) + { + printf( "Nodes: %u\n", tree->num_nodes ); + + for ( uint i = 0; i < tree->num_nodes; i++ ) + { + uint num_prims = LocalBVH2_GetSubtreePrimCount( tree, i ); + printf( "%3u : 0x%08x %3u 0x%04x ", i, tree->nodes[i], num_prims, LocalBVH2_GetNodeArea(tree,i) ); + if ( LocalBVH2_IsInnerNode( tree, i ) ) + { + ushort2 kids = LocalBVH2_GetChildIndices( tree, i ); + printf( " INNER ( %3u %3u )\n", kids.x, kids.y ); + } + else + { + printf( " LEAF {" ); + for ( uint j = 0; j < num_prims; j++ ) + printf( " %3u ", LocalBVH2_GetLeafPrimStart( tree, i ) + j ); + printf( "}\n" ); + } + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); +} + +struct FlatTreeInnerNode +{ + uint DW0; // lower 16b are index of corresponding LocalBVH2 node.. Bits 30:16 are an atomic flag used during refit. 
Bit 31 is a leaf marker + ushort parent_index; + ushort first_child; + uchar index_in_parent; + uchar num_children; + + //struct DFSPrimRefAABB AABB; +}; + +struct FlatTree +{ + uint num_nodes; + uint qnode_byte_offset; // byte offset from the BVHBase to the flat-tree's first QNode + uint qnode_base_index; + + struct FlatTreeInnerNode nodes[DFS_MAX_FLATTREE_NODES]; + uchar primref_back_pointers[DFS_WG_SIZE]; +}; + +GRL_INLINE void FlatTree_Printf( local struct FlatTree* flat_tree ) +{ + barrier( CLK_LOCAL_MEM_FENCE ); + if ( get_local_id( 0 ) == 0 ) + { + printf( "NumNodes: %u\n", flat_tree->num_nodes ); + for ( uint i = 0; i < flat_tree->num_nodes; i++ ) + { + ushort bvh2_node = flat_tree->nodes[i].DW0 & 0xffff; + printf( "%2u Parent: %2u Index_in_parent: %u, NumKids: %u FirstKid: %3u bvh2: %3u DW0: 0x%x\n", + i, + flat_tree->nodes[i].parent_index, + flat_tree->nodes[i].index_in_parent, + flat_tree->nodes[i].num_children, + flat_tree->nodes[i].first_child, + bvh2_node, + flat_tree->nodes[i].DW0 ); + } + } + barrier( CLK_LOCAL_MEM_FENCE ); +} + + + + +GRL_INLINE ushort FlatTree_GetNodeCount( local struct FlatTree* flat_tree ) +{ + return flat_tree->num_nodes; +} + +GRL_INLINE uint FlatTree_GetParentIndex( local struct FlatTree* flat_tree, ushort id ) +{ + return flat_tree->nodes[id].parent_index; +} + +GRL_INLINE ushort FlatTree_GetBVH2Root( local struct FlatTree* flat_tree, ushort node_index ) +{ + return (flat_tree->nodes[node_index].DW0) & 0xffff; +} + +GRL_INLINE ushort FlatTree_GetNumChildren( local struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->nodes[node_index].num_children; +} + +GRL_INLINE bool FlatTree_IsLeafNode( local struct FlatTree* flat_tree, ushort node_index ) +{ + return (flat_tree->nodes[node_index].DW0 & 0x80000000) != 0; +} + + +GRL_INLINE uint FlatTree_GetQNodeByteOffset( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->qnode_byte_offset + node_index * sizeof(struct QBVHNodeN); +} + +GRL_INLINE uint FlatTree_GetQNodeIndex( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->qnode_base_index + node_index; +} + +GRL_INLINE void FlatTree_AllocateQNodes( struct FlatTree* flat_tree, struct DFSArgs args ) +{ + uint node_base = 64*allocate_inner_nodes( args.bvh_base, flat_tree->num_nodes ); + flat_tree->qnode_base_index = (node_base - BVH_ROOT_NODE_OFFSET) / sizeof( struct QBVHNodeN ); + flat_tree->qnode_byte_offset = node_base; +} + +GRL_INLINE ushort FlatTree_GetFirstChild( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->nodes[node_index].first_child; +} + +GRL_INLINE ushort FlatTree_GetPrimRefStart( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->nodes[node_index].first_child; +} +GRL_INLINE ushort FlatTree_GetPrimRefCount( struct FlatTree* flat_tree, ushort node_index ) +{ + return flat_tree->nodes[node_index].num_children; +} + +GRL_INLINE uint FlatTree_BuildBackPointer( local struct FlatTree* flat_tree, ushort node_index ) +{ + uint parent_index = flat_tree->nodes[node_index].parent_index + flat_tree->qnode_base_index; + parent_index = (parent_index << 6) | (FlatTree_GetNumChildren( flat_tree, node_index ) << 3); + return parent_index; +} + + +GRL_INLINE void SUBGROUP_FlatTree_Initialize( uniform local struct FlatTree* flat_tree, struct DFSArgs args ) +{ + if ( get_sub_group_local_id() == 0 ) + { + flat_tree->num_nodes = 1; + flat_tree->nodes[0].DW0 = 0; // point first node at BVH2 root node, which is assumed to be at index zero + } + +} +/* +GRL_INLINE void 
SUBGROUP_FlatTree_ReduceAndSetAABB( uniform local struct FlatTree* flat_tree, + uniform ushort node_index, + varying local struct DFSPrimRefAABB* box ) +{ + // TODO_OPT: Replace this with an optimized reduction which exploits the fact that we only ever have 6 active lanes + // Try using the "negated max" trick here to compute min/max simultaneously, with max in top 6 lanes + // This will replace 6 reductions with 3 + + // TODO_OPT: This only utilizes up to 6 SIMD lanes. We can use up to 12 of them by putting + // min into even lanes, and -max into odd lanes, and using a manual min-reduction on pairs of lanes + + struct DFSPrimRefAABB bb = DFSPrimRefAABB_sub_group_reduce( box ); + if( get_sub_group_local_id() ) + flat_tree->nodes[node_index].AABB = bb; +} +*/ + +GRL_INLINE void SUBGROUP_FlatTree_CreateInnerNode( uniform local struct FlatTree* flat_tree, + uniform ushort flat_tree_root, + varying ushort sg_child_bvh2_root, + uniform ushort num_children ) +{ + uniform uint lane = get_sub_group_local_id(); + + // increment counter to allocate new nodes.. set required root node fields + uniform uint child_base; + if ( lane == 0 ) + { + child_base = atomic_add_local( &flat_tree->num_nodes, num_children ); + flat_tree->nodes[flat_tree_root].first_child = (uchar) child_base; + flat_tree->nodes[flat_tree_root].num_children = num_children; + + // initialize mask bits for this node's live children + uint child_mask = ((1 << num_children) - 1) << 16; + flat_tree->nodes[flat_tree_root].DW0 |= child_mask; + } + + child_base = sub_group_broadcast( child_base, 0 ); + + // initialize child nodes + if ( lane < num_children ) + { + varying uint child = child_base + lane; + flat_tree->nodes[child].DW0 = sg_child_bvh2_root; + flat_tree->nodes[child].index_in_parent = lane; + flat_tree->nodes[child].parent_index = flat_tree_root; + } + +} + + + +GRL_INLINE void SUBGROUP_FlatTree_CreateLeafNode( uniform local struct FlatTree* flat_tree, + uniform ushort flat_tree_root, + uniform ushort primref_start, + uniform ushort num_prims ) +{ + ushort lane = get_sub_group_local_id(); + if ( lane < num_prims ) + { + flat_tree->primref_back_pointers[primref_start + lane] = (uchar) flat_tree_root; + if ( lane == 0 ) + { + flat_tree->nodes[flat_tree_root].first_child = (uchar) primref_start; + flat_tree->nodes[flat_tree_root].num_children = (uchar) num_prims; + flat_tree->nodes[flat_tree_root].DW0 |= 0x80000000; + } + } +} + + +GRL_INLINE uniform bool SUBGROUP_FlatTree_SignalRefitComplete( uniform local struct FlatTree* flat_tree, uniform ushort* p_node_index ) +{ + uniform ushort node_index = *p_node_index; + uniform ushort parent = flat_tree->nodes[node_index].parent_index; + uniform ushort index_in_parent = flat_tree->nodes[node_index].index_in_parent; + + // clear the corresponding mask bit in the parent node + uniform uint child_mask = (0x10000 << index_in_parent); + uniform uint old_mask_bits = 0; + if( get_sub_group_local_id() == 0 ) + old_mask_bits = atomic_xor( &flat_tree->nodes[parent].DW0, child_mask ); + + old_mask_bits = sub_group_broadcast( old_mask_bits, 0 ); + + // if we cleared the last mask bit, this subgroup proceeds up the tree and refits the next node + // otherwise, it looks for something else to do + if ( ((old_mask_bits^child_mask) & 0xffff0000) == 0 ) + { + *p_node_index = parent; + return true; + } + + return false; +} + +/* +GRL_INLINE local struct DFSPrimRefAABB* FlatTree_GetChildAABB( local struct FlatTree* flat_tree, + local struct PrimRefSet* prim_refs, + ushort node_index, ushort child_index ) 
+{ + ushort child_id = FlatTree_GetFirstChild( flat_tree, node_index ) + child_index; + + if( !FlatTree_IsLeafNode( flat_tree, node_index ) ) + return &flat_tree->nodes[child_id].AABB; + else + return PrimRefSet_GetAABBPointer( prim_refs, child_id ); +} +*/ +GRL_INLINE uint FlatTree_GetPrimRefBackPointer( local struct FlatTree* flat_tree, ushort primref_index ) +{ + return flat_tree->primref_back_pointers[primref_index] * sizeof(struct QBVHNodeN) + flat_tree->qnode_byte_offset; +} + + +GRL_INLINE void FlatTree_check_boxes(local struct FlatTree* flat_tree, + global struct AABB* primref_buffer, + local struct AABB3f* boxes, + local struct PrimRefMeta* meta ) + +{ + barrier(CLK_LOCAL_MEM_FENCE); + if (get_local_id(0) == 0) + { + printf("checking flattree bounds...\n"); + + for (uint i = 0; i < flat_tree->num_nodes; i++) + { + struct AABB rb; + rb.lower.xyz = AABB3f_load_lower(&boxes[i]); + rb.upper.xyz = AABB3f_load_upper(&boxes[i]); + + uint offs = FlatTree_GetFirstChild( flat_tree, i ); + uint count = FlatTree_GetNumChildren( flat_tree, i ); + + for (uint c = 0; c < count; c++) + { + struct AABB lb; + if (FlatTree_IsLeafNode( flat_tree, i )) + { + lb = primref_buffer[ PrimRefMeta_GetInputIndex( &meta[offs+c] ) ]; + } + else + { + lb.lower.xyz = AABB3f_load_lower(&boxes[ offs+c ]); + lb.upper.xyz = AABB3f_load_upper(&boxes[ offs+c ]); + } + + if( !AABB_subset( &lb, &rb ) ) + printf("Bad bounds!! child %u of %u %f : %f %f : %f %f : %f %f : %f %f : %f %f : %f \n", + c, i , + rb.lower.x, rb.upper.x, rb.lower.y, rb.upper.y, rb.lower.z, rb.upper.z, + lb.lower.x, lb.upper.x, lb.lower.y, lb.upper.y, lb.lower.z, lb.upper.z + ); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); +} + + +struct FlatTreeScheduler +{ + int num_leafs; + uint writeout_produce_count; + uint writeout_consume_count; + uint active_subgroups; + uint num_built_nodes; + uint num_levels; // number of depth levels in the tree + + //uchar leaf_indices[DFS_MAX_FLATTREE_LEAFS]; // indices of leaf FlatTree nodes to be refitted + //uchar writeout_indices[DFS_MAX_FLATTREE_NODES]; // indices of flattree nodes to be written out or collapsed + + ushort level_ordered_nodes[DFS_MAX_FLATTREE_NODES]; // node indices sorted by depth (pre-order, high depth before low depth) + ushort level_start[DFS_MAX_FLATTREE_DEPTH]; // first node at given level in the level-ordered node array + uint level_count[DFS_MAX_FLATTREE_DEPTH]; // number of nodes at given level +}; + +GRL_INLINE void SUBGROUP_FlatTreeScheduler_Initialize( uniform local struct FlatTreeScheduler* scheduler ) +{ + scheduler->num_built_nodes = 0; + scheduler->num_leafs = 0; + scheduler->writeout_produce_count = 0; + scheduler->writeout_consume_count = 0; + scheduler->active_subgroups = DFS_NUM_SUBGROUPS; +} +/* +GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueLeafForRefit( uniform local struct FlatTreeScheduler* scheduler, + uniform ushort leaf ) +{ + if ( get_sub_group_local_id() == 0 ) + scheduler->leaf_indices[atomic_inc( &scheduler->num_leafs )] = leaf; +}*/ + +GRL_INLINE void SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node ) +{ + if ( get_sub_group_local_id() == 0 ) + atomic_inc_local( &scheduler->num_built_nodes ); +} + +GRL_INLINE uint FlatTreeScheduler_GetNumBuiltNodes( uniform local struct FlatTreeScheduler* scheduler ) +{ + return scheduler->num_built_nodes; +} + +/* +GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node ) +{ + if ( 
get_sub_group_local_id() == 0 ) + scheduler->writeout_indices[atomic_inc( &scheduler->writeout_produce_count )] = node; +}*/ + +/* +GRL_INLINE bool SUBGROUP_FlatTreeScheduler_GetRefitTask( uniform local struct FlatTreeScheduler* scheduler, uniform ushort* leaf_idx ) +{ + // schedule the leaves in reverse order to ensure that later leaves + // complete before earlier ones.. This prevents contention during the WriteOut stage + // + // There is a barrier between this function and 'QueueLeafForRefit' so we can safely decrement the same counter + // that we incremented earlier + varying int idx = 0; + if( get_sub_group_local_id() == 0 ) + idx = atomic_dec( &scheduler->num_leafs ); + + sub_group_barrier( CLK_LOCAL_MEM_FENCE ); + idx = sub_group_broadcast( idx, 0 ); + + if ( idx <= 0 ) + return false; + + *leaf_idx = scheduler->leaf_indices[idx-1]; + return true; +}*/ + +/* +// Signal the scheduler that a subgroup has reached the DONE state. +// Return true if this is the last subgroup to be done +void SUBGROUP_FlatTreeScheduler_SubGroupDone( local struct FlatTreeScheduler* scheduler ) +{ + if ( get_sub_group_local_id() == 0 ) + atomic_dec( &scheduler->active_subgroups ); +} +*/ + +/* + +#define STATE_SCHEDULE_REFIT 0x1234 +#define STATE_SCHEDULE_WRITEOUT 0x5679 +#define STATE_REFIT 0xabcd +#define STATE_WRITEOUT 0xefef +#define STATE_DONE 0xaabb + +// Get a flattree node to write out. Returns the new scheduler state +GRL_INLINE ushort SUBGROUP_FlatTreeScheduler_GetWriteOutTask( uniform local struct FlatTreeScheduler* scheduler, + uniform ushort num_nodes, + uniform ushort* node_idx ) +{ + uniform ushort return_state = STATE_WRITEOUT; + uniform ushort idx = 0; + if ( get_sub_group_local_id() == 0 ) + { + idx = atomic_inc( &scheduler->writeout_consume_count ); + + if ( idx >= scheduler->writeout_produce_count ) + { + // more consumers than there are produced tasks.... 
+ + if ( scheduler->writeout_produce_count == num_nodes ) + { + // if all nodes have been written out, flattening is done + return_state = STATE_DONE; + } + else + { + // some writeout tasks remain, and have not been produced by refit threads yet + // we need to put this one back + atomic_dec( &scheduler->writeout_consume_count ); + return_state = STATE_SCHEDULE_WRITEOUT; + } + } + else + { + // scheduled successfully + idx = scheduler->writeout_indices[idx]; + } + } + + *node_idx = sub_group_broadcast( idx, 0 ); + return sub_group_broadcast( return_state, 0 ); + +} +*/ + + +/* +GRL_INLINE void FlatTreeScheduler_Printf( local struct FlatTreeScheduler* scheduler ) +{ + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( get_local_id( 0 ) == 0 ) + { + printf( "***SCHEDULER***\n" ); + printf( "built_nodes=%u active_sgs=%u leafs=%u wo_p=%u wo_c=%u\n", scheduler->num_built_nodes, scheduler->active_subgroups, scheduler->num_leafs, + scheduler->writeout_produce_count, scheduler->writeout_consume_count ); + printf( "leafs for refit: {" ); + + int nleaf = max( scheduler->num_leafs, 0 ); + + for ( uint i = 0; i < nleaf; i++ ) + printf( "%u ", scheduler->leaf_indices[i] ); + printf( "}\n" ); + + printf( "writeout queue: %u:%u {", scheduler->writeout_produce_count, scheduler->writeout_consume_count ); + for ( uint i = 0; i < scheduler->writeout_produce_count; i++ ) + printf( "%u ", scheduler->writeout_indices[i] ); + printf( "}\n" ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + +} +*/ + + +GRL_INLINE void SUBGROUP_BuildFlatTreeNode( local struct LocalBVH2* bvh2, + local struct FlatTree* flat_tree, + local struct FlatTreeScheduler* scheduler, + uniform ushort flat_tree_root ) +{ + varying ushort lane = get_sub_group_local_id(); + varying ushort bvh2_root = FlatTree_GetBVH2Root( flat_tree, flat_tree_root ); + + if ( !LocalBVH2_IsInnerNode( bvh2, bvh2_root ) ) + { + uniform ushort num_prims = LocalBVH2_GetSubtreePrimCount( bvh2, bvh2_root ); + uniform ushort primref_start = LocalBVH2_GetLeafPrimStart( bvh2, bvh2_root ); + + SUBGROUP_FlatTree_CreateLeafNode( flat_tree, flat_tree_root, primref_start, num_prims ); + } + else + { + // collapse BVH2 into BVH6. + // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough + uniform ushort num_children = 2; + + uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); + varying ushort sg_bvh2_node = kids.x; + if ( lane == 1 ) + sg_bvh2_node = kids.y; + + do + { + // choose the inner node with maximum area to replace. + // Its left child goes in its old location. Its right child goes in a new lane + + varying ushort sg_area = LocalBVH2_GetNodeArea( bvh2, sg_bvh2_node ); + varying bool sg_is_inner = LocalBVH2_IsInnerNode( bvh2, sg_bvh2_node ); + sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf + + uniform ushort max_area = sub_group_reduce_max( sg_area ); + varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner; + uniform uint mask = intel_sub_group_ballot( sg_reducable ); + + // TODO_OPT: Some of these ops seem redundant.. look at trimming further + // TODO_OPT: sub_group_reduce_max results in too many instructions...... unroll the loop and specialize it.. 
+ // or ask IGC to give us a version that declares a static maximum number of subgroups to use + + if ( mask == 0 ) + break; + + // choose the inner node with maximum area to replace + uniform ushort victim_child = ctz( mask ); + uniform ushort victim_node = sub_group_broadcast( sg_bvh2_node, victim_child ); + uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, victim_node ); + + if ( lane == victim_child ) + sg_bvh2_node = kids.x; + else if ( lane == num_children ) + sg_bvh2_node = kids.y; + + + num_children++; + + + }while ( num_children < TREE_ARITY ); + + SUBGROUP_FlatTree_CreateInnerNode( flat_tree, flat_tree_root, sg_bvh2_node, num_children ); + } + +} + + +GRL_INLINE void SUBGROUP_DFS_BuildFlatTree( uniform local struct LocalBVH2* bvh2, + uniform local struct FlatTree* flat_tree, + uniform local struct FlatTreeScheduler* scheduler + ) +{ + + uniform ushort flat_tree_node_index = get_sub_group_id(); + uniform ushort num_nodes = 1; + uniform ushort num_built = 0; + + uint tid = get_local_id(0); + if (tid < DFS_MAX_FLATTREE_DEPTH) + { + scheduler->level_start[tid] = DFS_MAX_FLATTREE_NODES; + scheduler->level_count[tid] = 0; + scheduler->num_levels = 0; + } + + LOOP_TRIPWIRE_INIT; + + do + { + // process one flat tree node per sub group, as many as are available + // + // The first pass will only run one sub-group, the second up to 6, the third up to 36, and so on + // nodes will be processed in breadth-first order, but they are not guaranteed to be stored in this order + // due to use of atomic counters for node allocation + // + if ( flat_tree_node_index < num_nodes ) + { + SUBGROUP_BuildFlatTreeNode( bvh2, flat_tree, scheduler, flat_tree_node_index ); + SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( scheduler, flat_tree_node_index ); + flat_tree_node_index += get_num_sub_groups(); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // bump up the node count if new nodes were created + // stop as soon as all flattree nodes have been processed + num_nodes = FlatTree_GetNodeCount( flat_tree ); + num_built = FlatTreeScheduler_GetNumBuiltNodes( scheduler ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + LOOP_TRIPWIRE_INCREMENT( 300 ); + + } while ( num_built < num_nodes ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + + // determine depth of each node, compute node ranges and counts for each depth level, + // and prepare a depth-ordered node index array + uint depth = 0; + uint level_pos = 0; + for( uint i=tid; ilevel_count[depth] ); + + // compute total number of levels + atomic_max_local( &scheduler->num_levels, depth+1 ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + for( uint i=tid; ilevel_count[d]; + + scheduler->level_start[depth] = level_start; + + // scatter node indices into level-ordered node array + scheduler->level_ordered_nodes[level_start + level_pos] = tid; + } + + barrier( CLK_LOCAL_MEM_FENCE ); + +} + +/* +GRL_INLINE bool SUBGROUP_RefitNode( uniform local struct FlatTree* flat_tree, + uniform local struct PrimRefSet* prim_refs, + uniform ushort* p_node_index ) +{ + + // fetch and reduce child AABBs across the subgroup + uniform ushort node_index = *p_node_index; + uniform ushort num_kids = FlatTree_GetNumChildren( flat_tree, node_index ); + varying ushort sg_child_index = (get_sub_group_local_id() < num_kids) ? 
get_sub_group_local_id() : 0; + + varying local struct DFSPrimRefAABB* box = FlatTree_GetChildAABB( flat_tree, prim_refs, node_index, sg_child_index ); + + SUBGROUP_FlatTree_ReduceAndSetAABB( flat_tree, node_index, box ); + + if ( node_index == 0 ) + return false; // if we just refitted the root, we can stop now + + // signal the parent node that this node was refitted. If this was the last child to be refitted + // returns true and sets 'node_index' to the parent node, so that this thread can continue refitting + return SUBGROUP_FlatTree_SignalRefitComplete( flat_tree, p_node_index ); +}*/ + +GRL_INLINE struct QBVHNodeN* qnode_ptr( BVHBase* bvh_mem, uint byte_offset ) +{ + return (struct QBVHNodeN*)(((char*)bvh_mem) + byte_offset); +} + +GRL_INLINE void SUBGROUP_WriteQBVHNode( + uniform local struct FlatTree* flat_tree, + uniform local struct PrimRefMeta* primref_meta, + uniform local struct AABB3f* boxes, + uniform ushort flat_tree_root, + uniform struct DFSArgs args, + uniform local uchar* masks + ) +{ + + + uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root ); + uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root ); + + varying ushort lane = get_sub_group_local_id(); + varying ushort sg_child_index = (lane < num_children) ? lane : 0; + + uniform ushort child_base = FlatTree_GetFirstChild( flat_tree, flat_tree_root ); + + varying struct AABB sg_box4; + if (FlatTree_IsLeafNode( flat_tree, flat_tree_root )) + { + // fetch AABBs for primrefs + sg_box4 = args.primref_buffer[ PrimRefMeta_GetInputIndex( &primref_meta[child_base + sg_child_index] ) ]; + + } + else + { + // fetch AABBs for child nodes + sg_box4.lower.xyz = AABB3f_load_lower( &boxes[child_base+sg_child_index] ); + sg_box4.upper.xyz = AABB3f_load_upper( &boxes[child_base+sg_child_index] ); + } + + + struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) ); + + uniform int offset; + uniform uint child_type; + if ( is_leaf ) + { + char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); + + leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes; + + offset = (int)(leaf_mem - (char*)qnode); + child_type = args.leaf_node_type; + } + else + { + struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) ); + offset = (int) ((char*)kid - (char*)qnode); + child_type = args.inner_node_type; + } + offset = offset >> 6; + + if (child_type == NODE_TYPE_INSTANCE) + { + uint instanceMask = PrimRefMeta_GetInstanceMask( &primref_meta[child_base + sg_child_index] ); + subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? 
instanceMask : 0 ); + } + else + { + uint mask = BVH_NODE_DEFAULT_MASK; + if( args.need_masks ) + mask = masks[flat_tree_root]; + + subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode, mask ); + } + + if ( args.need_backpointers ) + { + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); + uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root ); + uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root ); + back_pointers[idx] = bp; + } + + /* + // TODO_OPT: Eventually this section should also handle leaf splitting due to mixed primref types + // For now this is done by the leaf creation pipeline, but that path should probably be refactored + // such that all inner node creation is done in one place + + uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root ); + uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root ); + + varying ushort lane = get_sub_group_local_id(); + varying ushort sg_child_index = (lane < num_children) ? lane : 0; + + varying local struct DFSPrimRefAABB* sg_box = FlatTree_GetChildAABB( flat_tree, prim_refs, flat_tree_root, sg_child_index ); + + varying struct AABB sg_box4 = PrimRefSet_ConvertAABB( prim_refs, sg_box ); + + struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) ); + + uniform int offset; + uniform uint child_type; + if ( is_leaf ) + { + char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); + + leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes; + + offset = (int)(leaf_mem - (char*)qnode); + child_type = args.leaf_node_type; + } + else + { + struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) ); + offset = (int) ((char*)kid - (char*)qnode); + child_type = args.inner_node_type; + } + offset = offset >> 6; + + if (child_type == NODE_TYPE_INSTANCE) + { + uint instanceMask = PrimRefSet_GetInstanceMask( prim_refs, FlatTree_GetPrimRefStart(flat_tree, flat_tree_root) + lane ); + subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? 
instanceMask : 0 ); + } + else + subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode ); + + if ( args.need_backpointers ) + { + global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); + uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root ); + uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root ); + back_pointers[idx] = bp; + } + */ +} + +/* +GRL_INLINE void SUBGROUP_DFS_RefitAndWriteOutFlatTree( + uniform local struct FlatTree* flat_tree, + uniform local struct PrimRefSet* prim_refs, + uniform local struct FlatTreeScheduler* scheduler, + uniform struct DFSArgs args) +{ + + uniform ushort state = STATE_SCHEDULE_REFIT; + uniform ushort node_index = 0; + uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree); + + { + LOOP_TRIPWIRE_INIT; + + bool active = true; + bool continue_refit = false; + while (1) + { + if (active) + { + if (continue_refit || SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index)) + { + continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index); + } + else + { + active = false; + if (get_sub_group_local_id() == 0) + atomic_dec(&scheduler->active_subgroups); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); // finish all atomics + if (scheduler->active_subgroups == 0) + break; + barrier(CLK_LOCAL_MEM_FENCE); // finish all checks.. prevent race between thread which loops around and thread which doesn't + + LOOP_TRIPWIRE_INCREMENT(200); + } + } + + for (uint i = get_sub_group_id(); i < num_nodes; i += get_num_sub_groups()) + SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, i, args); + + barrier(CLK_LOCAL_MEM_FENCE); + + + // JDB: Version below attempts to interleave refit and qnode write-out + // This could theoretically reduce thread idle time, but it is more complex and does more atomics for scheduling + +#if 0 + // after we've constructed the flat tree (phase 1), there are two things that need to happen: + // PHASE 2: Refit the flat tree, computing all of the node ABBs + // PHASE 3: Write the nodes out to memory + // + // all of this is sub-group centric. Different subgroups can execute phases 2 and 3 concurrently + // + + // TODO_OPT: The scheduling algorithm might need to be re-thought. + // Fused EUs are very hard to reason about. It's possible that by scheduling independent + // SGs in this way we would lose a lot of performance due to fused EU serialization. 
+ // Needs to be tested experimentally if such a thing is possible + + uniform ushort state = STATE_SCHEDULE_REFIT; + uniform ushort node_index = 0; + uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree); + + LOOP_TRIPWIRE_INIT; + + do + { + // barrier necessary to protect access to scheduler->active_subgroups + barrier(CLK_LOCAL_MEM_FENCE); + + if (state == STATE_SCHEDULE_REFIT) + { + if (SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index)) + state = STATE_REFIT; + else + state = STATE_SCHEDULE_WRITEOUT; // fallthrough + } + if (state == STATE_SCHEDULE_WRITEOUT) + { + state = SUBGROUP_FlatTreeScheduler_GetWriteOutTask(scheduler, num_nodes, &node_index); + if (state == STATE_DONE) + SUBGROUP_FlatTreeScheduler_SubGroupDone(scheduler); + } + + + // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask' + // Note that in theory we could have the write-out tasks spin until the refit tasks clear, which would make this barrier unnecessary + // However, we cannot do this safely on SKUs which do not support independent subgroup forward progress. + barrier(CLK_LOCAL_MEM_FENCE); + + if (state == STATE_REFIT) + { + uniform ushort prev_node = node_index; + uniform bool continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index); + + SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut(scheduler, prev_node); + + if (!continue_refit) + state = STATE_SCHEDULE_REFIT; + } + else if (state == STATE_WRITEOUT) + { + SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, node_index, args); + state = STATE_SCHEDULE_WRITEOUT; + } + // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask' + barrier(CLK_LOCAL_MEM_FENCE); + + LOOP_TRIPWIRE_INCREMENT(200); + + } while (scheduler->active_subgroups > 0); + +#endif +} +*/ + +GRL_INLINE void DFS_CreatePrimRefSet( struct DFSArgs args, + local struct PrimRefSet* prim_refs ) +{ + ushort id = get_local_id( 0 ); + ushort num_primrefs = args.num_primrefs; + + + PrimRef ref; + struct AABB3f local_aabb; + if ( id < num_primrefs ) + { + ref = args.primref_buffer[id]; + AABB3f_set_lower( &local_aabb, ref.lower.xyz ); + AABB3f_set_upper( &local_aabb, ref.upper.xyz ); + } + else + { + AABB3f_init( &local_aabb ); + } + + AABB3f_atomic_merge_localBB_nocheck( &prim_refs->root_aabb, &local_aabb ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( id < num_primrefs ) + PrimRefSet_SetPrimRef_FullPrecision( prim_refs, ref, id ); +} + + + +struct BVHBuildLocals +{ + float Al[DFS_WG_SIZE]; + float Ar[DFS_WG_SIZE]; + uchar2 axis_and_left_count[ DFS_WG_SIZE ]; + uint sah[DFS_WG_SIZE]; + uint num_active_threads; +}; + + +GRL_INLINE void DFS_ConstructBVH2( local struct LocalBVH2* bvh2, + local struct PrimRefSet* prim_refs, + ushort num_prims, + local struct BVHBuildLocals* locals ) +{ + ushort tid = get_local_id( 0 ); + + ushort bvh2_root = 0; + ushort prim_range_start = 0; + ushort primref_position = tid; + + bool active_thread = tid < num_prims; + float root_area = PrimRefSet_GetMaxAABBArea( prim_refs ); + float area_scale = DFS_BVH2_AREA_QUANT / root_area; + + locals->num_active_threads = num_prims; + barrier( CLK_LOCAL_MEM_FENCE ); + + LOOP_TRIPWIRE_INIT; + + do + { + if(active_thread && prim_range_start == primref_position) + locals->sah[primref_position] = UINT_MAX; + + if ( active_thread ) + { + local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); + + // each thread evaluates a possible split candidate. 
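+            // (Reader's note -- sketch of the scheme, not normative.)  Every live thread owns one primref, and all
+            // threads of a subtree share the contiguous range [prim_range_start, prim_range_start + num_prims).
+            // Each thread proposes its own primref as the split candidate: per axis the (unnormalized) centroid
+            // c = lower[axis] + upper[axis] is the splitting coordinate, the range is partitioned around it, and
+            // the candidate is scored with the SAH surrogate
+            //     cost = Al * Nl + Ar * Nr
+            // (Al/Ar = half-areas of the left/right bounds, Nl/Nr = prim counts).  The cheapest candidate in the
+            // range wins via the atomic_min further below; ties are broken deterministically by packing the axis
+            // and the candidate's position into the low bits of the float bit pattern.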
Scan primrefs and compute sah cost + // do this axis-by-axis to keep register pressure low + float best_sah = INFINITY; + ushort best_axis = 3; + ushort best_count = 0; + float best_al = INFINITY; + float best_ar = INFINITY; + + struct DFSPrimRefAABB box_left[3]; + struct DFSPrimRefAABB box_right[3]; + float CSplit[3]; + ushort count_left[3]; + + for ( ushort axis = 0; axis < 3; axis++ ) + { + DFSPrimRefAABB_init( &box_left[axis] ); + DFSPrimRefAABB_init( &box_right[axis] ); + + CSplit[axis] = my_box->lower[axis] + my_box->upper[axis]; + count_left[axis] = 0; + } + + // scan primrefs in our subtree and partition using this thread's prim as a split plane + { + struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start ); + + for ( ushort p = 1; p < num_prims; p++ ) + { + struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration + + for( ushort axis = 0; axis < 3; axis++ ) + { + float c = box.lower[axis] + box.upper[axis]; + + if ( c < CSplit[axis] ) + { + // this primitive is to our left. + DFSPrimRefAABB_extend( &box_left[axis], &box ); + count_left[axis]++; + } + else + { + // this primitive is to our right + DFSPrimRefAABB_extend( &box_right[axis], &box ); + } + } + + box = next_box; + } + + // last iteration without preloading box + for( ushort axis = 0; axis < 3; axis++ ) + { + float c = box.lower[axis] + box.upper[axis]; + + if ( c < CSplit[axis] ) + { + // this primitive is to our left. + DFSPrimRefAABB_extend( &box_left[axis], &box ); + count_left[axis]++; + } + else + { + // this primitive is to our right + DFSPrimRefAABB_extend( &box_right[axis], &box ); + } + } + } + + for ( ushort axis = 0; axis < 3; axis++ ) + { + float Al = DFSPrimRefAABB_halfArea( &box_left[axis] ); + float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] ); + + // Avoid NANs in SAH calculation in the corner case where all prims go right + // In this case we set Al=Ar, because such a split will only be selected if all primrefs + // are co-incident.. 
In that case, we will fall back to split-in-the-middle and both subtrees + // should store the same quantized area value + if ( count_left[axis] == 0 ) + Al = Ar; + + // compute sah cost + ushort count_right = num_prims - count_left[axis]; + float sah = Ar * count_right + Al * count_left[axis]; + + // keep this split if it is better than the previous one, or if the previous one was a corner-case + if ( sah < best_sah || best_count == 0 ) + { + // yes, keep it + best_axis = axis; + best_sah = sah; + best_count = count_left[axis]; + best_al = Al; + best_ar = Ar; + } + } + + + // write split information to SLM + locals->Al[primref_position] = best_al; + locals->Ar[primref_position] = best_ar; + locals->axis_and_left_count[primref_position].x = best_axis; + locals->axis_and_left_count[primref_position].y = best_count; + + uint sah = as_uint(best_sah); + // break ties by axis to ensure deterministic split selection + // otherwise builder can produce non-deterministic tree structure run to run + // based on the ordering of primitives (which can vary due to non-determinism in atomic counters) + // Embed split axis and index into sah value; compute min over sah and max over axis + sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | primref_position ); + + // reduce on split candidates in our local subtree and decide the best one + atomic_min_local( &locals->sah[ prim_range_start ], sah); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + ushort split_index = locals->sah[ prim_range_start ] & 255; + ushort split_axis = locals->axis_and_left_count[split_index].x; + ushort split_left_count = locals->axis_and_left_count[split_index].y; + float split_al = locals->Al[split_index]; + float split_ar = locals->Ar[split_index]; + + if ( (primref_position == prim_range_start) && active_thread ) + { + // first thread in a given subtree creates the inner node + ushort quantized_left_area = convert_ushort_rtn( split_al * area_scale ); + ushort quantized_right_area = convert_ushort_rtn( split_ar * area_scale ); + ushort start_left = prim_range_start; + ushort start_right = prim_range_start + split_left_count; + if ( split_left_count == 0 ) + start_right = start_left + (num_prims / 2); // handle split-in-the-middle case + + LocalBVH2_CreateInnerNode( bvh2, bvh2_root, + start_left, start_right, + quantized_left_area, quantized_right_area ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + struct DFSPrimRef ref; + ushort new_primref_position; + + if ( active_thread ) + { + ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); + bool go_left; + + if ( split_left_count == 0 ) + { + // We chose a split with no left-side prims + // This will only happen if all primrefs are located in the exact same position + // In that case, fall back to split-in-the-middle + split_left_count = (num_prims / 2); + go_left = (primref_position - prim_range_start < split_left_count); + } + else + { + // determine what side of the split this thread's primref belongs on + local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); + local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index ); + float c = my_box->lower[split_axis] + my_box->upper[split_axis]; + float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis]; + go_left = c < Csplit; + } + + // adjust state variables for next loop iteration + bvh2_root = (go_left) ? kids.x : kids.y; + num_prims = (go_left) ? split_left_count : (num_prims - split_left_count); + prim_range_start = (go_left) ? 
prim_range_start : prim_range_start + split_left_count; + + // determine the new primref position by incrementing a counter in the destination subtree + new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root ); + + // load our primref from its previous position + ref = PrimRefSet_GetPrimRef( prim_refs, primref_position ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + if ( active_thread ) + { + // write our primref into its sorted position + PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position ); + primref_position = new_primref_position; + + // deactivate all threads whose subtrees are small enough to form a leaf + if ( num_prims <= TREE_ARITY ) + { + active_thread = false; + atomic_dec_local( &locals->num_active_threads ); + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + LOOP_TRIPWIRE_INCREMENT( 50 ); + + + } while ( locals->num_active_threads > 0 ); + + +} + + + +// fast path for #prims <= TREE_ARITY +GRL_INLINE void Trivial_DFS( struct DFSArgs args ) +{ + + ushort tid = get_local_id( 0 ); + + PrimRef myRef; + AABB_init( &myRef ); + if( tid < args.num_primrefs ) + myRef = args.primref_buffer[tid]; + + uint node_offset; + if ( tid == 0 ) + node_offset = 64*allocate_inner_nodes( args.bvh_base, 1 ); + node_offset = sub_group_broadcast(node_offset,0); + + char* bvh_mem = (char*) args.bvh_base; + struct QBVHNodeN* qnode = (struct QBVHNodeN*) (bvh_mem + node_offset); + + uint child_type = args.leaf_node_type; + uint prim_base = args.bvh_base->quadLeafStart*64 ; + + char* leaf_mem = bvh_mem + prim_base; + int offset = (int)( leaf_mem - (char*)qnode ); + + if (child_type == NODE_TYPE_INSTANCE) + { + subgroup_setInstanceQBVHNodeN( offset >> 6, &myRef, args.num_primrefs, qnode, tid < args.num_primrefs ? PRIMREF_instanceMask(&myRef) : 0 ); + } + else + subgroup_setQBVHNodeN( offset >> 6, child_type, &myRef, args.num_primrefs, qnode, BVH_NODE_DEFAULT_MASK ); + + if ( tid < args.num_primrefs ) + { + global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs; + uint bp = node_offset; + + // TODO_OPT: Leaf creation pipeline can be made simpler by having a sideband buffer containing + // fatleaf index + position in fatleaf for each primref, instead of forcing leaf creation shader to reconstruct it + // should also probably do the fat-leaf splitting here + args.primref_buffer[tid] = myRef; + args.primref_index_buffer[tid] = tid; + + primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN); + + if ( tid == 0 && args.need_backpointers ) + { + uint bp = ((uint)-1) << 6; + bp |= (args.num_primrefs) << 3; + *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) = bp; + } + } +} + + + + + +void SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( uniform local struct FlatTree* flat_tree, + uniform local struct FlatTreeScheduler* flat_scheduler, + uniform local struct AABB3f* boxes, + uniform local struct PrimRefMeta* primref_meta, + uniform global struct AABB* primref_buffer, + uniform local uchar* masks, + bool need_masks ) + +{ + uniform int num_levels = (int) flat_scheduler->num_levels; + varying ushort lane = get_sub_group_local_id(); + + // iterate over depth levels in the tree... 
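+    // (Reader's note, non-normative.)  level_ordered_nodes / level_start / level_count were filled at the end of
+    // SUBGROUP_DFS_BuildFlatTree by walking each node's parent_index chain to find its depth and bucketing the
+    // nodes per level.  Walking the buckets from the deepest level upward (with the work group synchronizing
+    // between levels) means every child box -- or, for leaf nodes, primref box -- is final before its parent is
+    // reduced, so the levels act as a bottom-up refit schedule,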
deepest to shallowest + for (uniform int level = num_levels - 1; level >= 0; level--) + { + // loop over a range of flattree nodes at this level, one node per sub-group + // TODO_OPT: Try and enable this code to process two nodes in a SIMD16 subgroup + uniform ushort level_start = flat_scheduler->level_start[level]; + uniform ushort level_node_count = flat_scheduler->level_count[level]; + + for (uniform ushort i = get_sub_group_id(); i < level_node_count; i += get_num_sub_groups()) + { + uniform ushort node_index = flat_scheduler->level_ordered_nodes[ level_start + i ]; + + varying struct AABB box; + AABB_init(&box); + + uniform uint child_base = FlatTree_GetFirstChild( flat_tree, node_index ); + uniform uint num_children = FlatTree_GetNumChildren( flat_tree, node_index ); + varying uint child_index = child_base + ((laneflat_tree, args ); + else if ( get_sub_group_id() == 2 ) + SUBGROUP_LocalBVH2_Initialize( &slm->u.s1.bvh2, args.num_primrefs ); + else if ( get_sub_group_id() == 4 ) + SUBGROUP_FlatTreeScheduler_Initialize( &slm->flat_scheduler ); + else if ( get_sub_group_id() == 6 ) + SUBGROUP_PrimRefSet_Initialize( &slm->u.s1.prim_refs ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + // load the PrimRefs + DFS_CreatePrimRefSet( args, &slm->u.s1.prim_refs ); + + // build the BVH2 + DFS_ConstructBVH2( &slm->u.s1.bvh2, &slm->u.s1.prim_refs, args.num_primrefs, &slm->u.s1.bvh2_locals ); + + // copy out metadata for primrefs now that they have been sorted + if( tid < args.num_primrefs ) + { + slm->primitive_meta[tid] = PrimRefSet_GetMeta( &slm->u.s1.prim_refs, tid ); + } + barrier( CLK_LOCAL_MEM_FENCE ); + + // collapse into a FlatTree + SUBGROUP_DFS_BuildFlatTree( &slm->u.s1.bvh2, &slm->flat_tree, &slm->flat_scheduler ); + + // allocate output QBVH6 nodes + if ( get_local_id( 0 ) == 0 ) + FlatTree_AllocateQNodes( &slm->flat_tree, args ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( &slm->flat_tree, &slm->flat_scheduler, &slm->u.s2.boxes[0], slm->primitive_meta, args.primref_buffer, slm->u.s2.masks, args.need_masks ); + + //FlatTree_Printf( &slm->flat_tree ); + //FlatTree_check_boxes ( &slm->flat_tree, args.primref_buffer, &slm->u.s2.boxes[0], slm->primitive_meta ); + + SUBGROUP_DFS_WriteNodes( &slm->flat_tree, &slm->u.s2.boxes[0], slm->primitive_meta, args, slm->u.s2.masks ); + + + // generate sorted primref index buffer and backpointers to feed the leaf creation pipeilne + if ( tid < args.num_primrefs ) + { + uint input_index = PrimRefMeta_GetInputIndex(&slm->primitive_meta[tid]); + + uint bp = FlatTree_GetPrimRefBackPointer( &slm->flat_tree, tid ); + global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs; + + args.primref_index_buffer[tid] = input_index; + + primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN); + + if ( tid == 0 && args.need_backpointers ) + { + *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) |= ((uint)-1) << 6; + } + } +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void DFS( global struct Globals* globals, + global char* bvh_mem, + global PrimRef* primref_buffer, + global uint* primref_index_buffer, + uint alloc_backpointers + ) +{ + struct DFSArgs args; + args.bvh_base = (global struct BVHBase*) bvh_mem; + args.leaf_node_type = globals->leafPrimType; + args.inner_node_type = NODE_TYPE_INTERNAL; + args.leaf_size_in_bytes = globals->leafSize; + args.primref_buffer = 
primref_buffer; + args.need_backpointers = alloc_backpointers != 0; + args.num_primrefs = globals->numPrimitives; + args.primref_index_buffer = primref_index_buffer; + args.need_masks = args.leaf_node_type == NODE_TYPE_INSTANCE; + + if ( args.num_primrefs <= TREE_ARITY ) + { + // TODO_OPT: This decision should be made using indirect dispatch + if( get_sub_group_id() == 0 ) + Trivial_DFS( args ); + return; + } + + local struct Single_WG_build_SLM slm; + + execute_single_WG_build( args, &slm ); +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void DFS_single_wg( + global struct Globals* globals, + global char* bvh_mem, + global PrimRef* primref_buffer, + global uint* primref_index_buffer, + uint sah_flags +) +{ + struct DFSArgs args; + args.bvh_base = (global struct BVHBase*) bvh_mem; + args.leaf_node_type = globals->leafPrimType; + args.inner_node_type = NODE_TYPE_INTERNAL; + args.leaf_size_in_bytes = globals->leafSize; + args.primref_buffer = primref_buffer; + args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; + args.num_primrefs = globals->numPrimitives; + args.primref_index_buffer = primref_index_buffer; + args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS; + + local struct Single_WG_build_SLM slm; + + execute_single_WG_build( args, &slm ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +kernel void DFS_trivial( + global struct Globals* globals, + global char* bvh_mem, + global PrimRef* primref_buffer, + global uint* primref_index_buffer, + uint sah_flags +) +{ + struct DFSArgs args; + args.bvh_base = (global struct BVHBase*) bvh_mem; + args.leaf_node_type = globals->leafPrimType; + args.inner_node_type = NODE_TYPE_INTERNAL; + args.leaf_size_in_bytes = globals->leafSize; + args.primref_buffer = primref_buffer; + args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; + args.num_primrefs = globals->numPrimitives; + args.primref_index_buffer = primref_index_buffer; + args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS; + + Trivial_DFS( args ); +} + + +struct DFSArgs dfs_args_from_sah_globals( global struct SAHBuildGlobals* sah_globals ) +{ + struct DFSArgs args; + args.bvh_base = (global struct BVHBase*) sah_globals->p_bvh_base; + args.leaf_node_type = sah_globals->leaf_type; + args.inner_node_type = NODE_TYPE_INTERNAL; + args.leaf_size_in_bytes = sah_globals->leaf_size; + args.primref_buffer = (global PrimRef*) sah_globals->p_primrefs_buffer; + args.need_backpointers = sah_globals->flags & SAH_FLAG_NEED_BACKPOINTERS; + args.num_primrefs = sah_globals->num_primrefs; + args.primref_index_buffer = (global uint*) sah_globals->p_primref_index_buffers; + args.need_masks = sah_globals->flags & SAH_FLAG_NEED_MASKS; + + return args; +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(DFS_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +kernel void DFS_single_wg_batchable( + global struct SAHBuildGlobals* globals_buffer, + global struct VContextScheduler* scheduler +) +{ + global struct SAHBuildGlobals* sah_globals = globals_buffer + scheduler->num_trivial_builds + get_group_id(0); + + struct DFSArgs args = dfs_args_from_sah_globals( sah_globals ); + + local struct Single_WG_build_SLM slm; + + execute_single_WG_build(args, &slm); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) 
+__attribute__((intel_reqd_sub_group_size(16))) +kernel void DFS_trivial_batchable( + global struct SAHBuildGlobals* globals_buffer +) +{ + global struct SAHBuildGlobals* sah_globals = globals_buffer + get_group_id(0); + + struct DFSArgs args = dfs_args_from_sah_globals(sah_globals); + + Trivial_DFS(args); +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl b/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl new file mode 100644 index 00000000000..bb220b30612 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl @@ -0,0 +1,357 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" +#include "instance.h" + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(32, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel +primref_to_quads(global struct Globals *globals, + global struct AABB *primref, + global char *primref_index, + global char *bvh_mem, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + const uint stride, + const uint offset, + const uint allow_update) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart ); + uint quadIndicesStart = bvh->quadIndicesDataStart; + + const uint numPrimitives = globals->numPrimitives; + uint i = get_group_id( 0 ) * get_local_size( 0 ) + get_local_id(0); + if (i < numPrimitives) + { + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + + const uint primrefID = *(uint *)(primref_index + i * stride + offset); + + const uint geomID = PRIMREF_geomID(&primref[primrefID]); + const uint primID0 = PRIMREF_primID0(&primref[primrefID]); + const uint primID1 = PRIMREF_primID1(&primref[primrefID]); + const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]); + + const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0); + const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1); + + const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1); + + uint vertex_stride = geomDesc[geomID].Desc.Triangles.VertexBufferByteStride; + + const uint4 indices = q.a; + + const uint mask = 0xff; // FIXME: hardcoded mask + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices); + + uint j0 = q.lb.x; + uint j1 = q.lb.y; + uint j2 = q.lb.z; + uint shaderIndex = (mask << 24) | geomID; + uint geomIndex = geomID | (geomFlags << 30); + uint primIndex0 = primID0; + const uint delta = primID1 - primID0; + const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4)); + uint primIndex1Delta = delta | (j << 16) | (1 << 22); + + uint4 pack0 = (uint4)(shaderIndex, geomIndex, primIndex0, primIndex1Delta); + float4 pack1 = (float4)(vtx0.x, vtx0.y, vtx0.z, vtx1.x); + float4 pack2 = (float4)(vtx1.y, vtx1.z, vtx2.x, vtx2.y); + float4 pack3 = (float4)(vtx2.z, vtx3.x, vtx3.y, vtx3.z); + + global uint4* dst = (global uint4*)&quads[i]; + store_uint4_L1WB_L3WB(dst, 0, pack0); + store_uint4_L1WB_L3WB(dst, 1, as_uint4(pack1)); + store_uint4_L1WB_L3WB(dst, 2, as_uint4(pack2)); + store_uint4_L1WB_L3WB(dst, 3, as_uint4(pack3)); + + if(allow_update) + { + global uint4* vertex_indice_ptr = (global uint4*)(((char*)bvh) + (64u * quadIndicesStart + 32 * i)); + + uint4 pack_indices = (uint4) ( indices.x , indices.y, indices.z, indices.w ); + + store_uint4_L1WB_L3WB( vertex_indice_ptr, 0, pack0 ); + store_uint4_L1WB_L3WB( vertex_indice_ptr, 1, 
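+            // (reader's note) ... second dword4 of the side-band record: the four vertex indices scaled to byte
+            // offsets (index * VertexBufferByteStride), stored next to the pack0 header above -- presumably what
+            // a later update/refit pass re-reads to fetch fresh vertex positions when allow_update is set.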
pack_indices * vertex_stride); + } + + if (i == 0) + bvh->quadLeafCur += numPrimitives ; + } + + + +#if 0 + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart ); + + const uint numPrimitives = globals->numPrimitives; + const uint startID = get_group_id( 0 ) * get_local_size( 0 ); + const uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives); + + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + const uint primrefID = *(uint *)(primref_index + i * stride + offset); + + const uint geomID = PRIMREF_geomID(&primref[primrefID]); + const uint primID0 = PRIMREF_primID0(&primref[primrefID]); + const uint primID1 = PRIMREF_primID1(&primref[primrefID]); + const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]); + + const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0); + const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1); + + const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1); + + const uint4 indices = q.a; + const uint mask = 0xff; // FIXME: hardcoded mask + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices); + + setQuad(&quads[i], (float4)(vtx0,0), (float4)(vtx1,0), (float4)(vtx2,0), (float4)(vtx3,0), q.lb.x, q.lb.y, q.lb.z, geomID, primID0, primID1, mask, geomFlags ); + } + + if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0) + bvh->quadLeafCur += numPrimitives ; +#endif +} + +GRL_INLINE void create_procedural_leaf(global struct Globals *globals, + global struct AABB *primref, + local uint *primrefids, + uint numProcedurals, + struct QBVHNodeN *qnode, + global char *bvh_mem, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + if (get_local_id(0) >= 8) + return; + + global struct BVHBase* bvh_base = (global struct BVHBase*)bvh_mem; + + /* first read geomID of all primitives */ + uint primrefID = -1; + uint geomID = -1; + uint geomFlags = 0; + if (get_local_id(0) < numProcedurals) + { + primrefID = primrefids[get_local_id(0)]; + geomID = PRIMREF_geomID(&primref[primrefID]); + geomFlags = PRIMREF_geomFlags( &primref[primrefID] ); + } + + // cannot sort by geomID as bounds in parent node are then wrong + //ulong geomID_primrefID = (((ulong)geomID) << 32) | ((ulong)primrefID); + //geomID_primrefID = sort8_ascending_ulong(geomID_primrefID); + //geomID = geomID_primrefID >> 32; + //primrefID = geomID_primrefID; + + /* We have to split at geomID boundaries into multiple leaves. This + * block calculates the lane where a leaf starts and ends. */ + const uint geomIDprev = intel_sub_group_shuffle_up(0xFFFFFFFFu, geomID, 1u); + const uint geomIDnext = intel_sub_group_shuffle_down(geomID, 0xFFFFFFFFu, 1u); + const uint leaf_start = geomIDprev != geomID; + const uint leaf_end = geomIDnext != geomID; + const uint leaf_start_next = intel_sub_group_shuffle_down(leaf_start, 0u, 1u); + + /* This computes which leaf a lane processes. E.g. form geomID = + * [3,3,4,4,4,0] we get leaf_id = [0,0,1,1,1,2] */ + //const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); // FIXME: exclusive? + + /* This computes the n'th primitive a lane processes inside its + * leaf. For the example above we compute leaf_prim = + * [0,1,0,1,2,0]. */ + const uint leaf_prim = get_local_id(0) - sub_group_scan_inclusive_max(leaf_start ? 
get_local_id(0) : 0); + + /* from here on we allocate data and write to memory, thus only + * lanes that process a primitive should continue. */ + if (get_local_id(0) >= numProcedurals) + return; + + /* Here we allocate a single memory block for each required + * ProceduralLeaf node. We do this from a single lane to ensure + * the allocation is contiguous. */ + uint leaf_base_offset = 0; + uint n_leafs = sub_group_reduce_add(leaf_start); + if (get_local_id(0) == 0) + leaf_base_offset = allocate_procedural_leaves( bvh_base, n_leafs ); + leaf_base_offset = sub_group_broadcast(leaf_base_offset, 0); + + /* Compute the leaf offset for each lane. */ + uint leaf_offset = leaf_base_offset + sub_group_scan_inclusive_add(leaf_start) - 1; + + struct ProceduralLeaf *pleaf = ((global struct ProceduralLeaf *)(bvh_mem)) + leaf_offset; + + /* write the procedural leaf headers */ + if (leaf_end) + { + pleaf->leafDesc.shaderIndex_geomMask = 0xFF000000 | (geomID & 0x00FFFFFF); // FIXME: use accessor function. Future extensions may have shaderIndex != geomID + pleaf->leafDesc.geomIndex_flags = geomID | (geomFlags<<30); // FIXME: Use setter function + pleaf->DW1 = 0xFFFFFFF0 | (leaf_prim + 1); // !!! + } + /* write the procedural leaf primIDs */ + pleaf->_primIndex[leaf_prim] = PRIMREF_primID0(&primref[primrefID]); + + /* update leaf node offset inside parent node */ + if (get_local_id(0) == 0) + { + QBVH6Node_set_offset(qnode, pleaf); + QBVH6Node_set_type(qnode, NODE_TYPE_PROCEDURAL); + } + + /* Let parent node children point to proper procedural leaf block + * and primitive. */ + qnode->childData[get_local_id(0)] = leaf_start_next | (leaf_prim << 2); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +GRL_ANNOTATE_BIG_REG_REQ +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +primref_to_procedurals(global struct Globals *globals, + global struct AABB *primref, + global char *primref_index, + global char *bvh_mem, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + const uint stride, + const uint offset) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + + const uint numPrimitives = globals->numPrimitives; + uint startID = get_group_id( 0 ) * get_local_size( 0 ); + uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives); + + uint offset1 = stride * globals->numPrimitives; + if (stride == 8) + offset1 = 4; + + uint prev_start_back_pointer = startID == 0 ? -1 : *(uint *)(primref_index + (startID-1) * stride + offset1); + /* start at leaf start */ + while (startID < numPrimitives) + { + const uint back_pointer = *(uint *)(primref_index + startID * stride + offset1); + if (back_pointer != prev_start_back_pointer) + break; + startID++; + } + + uint prev_end_back_pointer = *(uint *)(primref_index + (endID-1) * stride + offset1); + /* end at next leaf start */ + while (endID < numPrimitives) + { + const uint back_pointer = *(uint *)(primref_index + endID * stride + offset1); + if (back_pointer != prev_end_back_pointer) + break; + endID++; + } + + local uint procedurals[16]; + + for (uint lid = startID + get_local_id(0); lid < endID + get_local_id(0);) + { + /* load leaf start points and back_pointer */ + const uint primrefID = *(uint *)(primref_index + lid * stride + offset); + uint back_pointer = *(uint *)(primref_index + lid * stride + offset1); + uint prev_back_pointer = get_local_id(0) == 0 ? 
-1 : *(uint *)(primref_index + (lid-1) * stride + offset1); + + const uint leaf_start = back_pointer != prev_back_pointer; + uint leaf_start_back_pointer = sub_group_broadcast(back_pointer, 0); + + /* compute number of primitives inside the leaf starting at lid */ + const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); + uint numPrimitives = 0; + if (back_pointer == leaf_start_back_pointer && lid < endID) + numPrimitives = sub_group_reduce_add(1); + numPrimitives = sub_group_broadcast(numPrimitives, 0); + + procedurals[get_local_id(0)] = primrefID; + + struct QBVHNodeN *qnode = (struct QBVHNodeN *)bvh_mem + back_pointer; + + create_procedural_leaf(globals, primref, procedurals, numPrimitives, qnode, bvh_mem, geomDesc); + + lid += numPrimitives; + } +} + +GRL_INLINE void create_HW_instance_leaf( + global struct BVHBase* bvh, + global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc, + uint dstLeafId, + uint instanceIndex, + uint rootNodeByteOffset, + uint instanceMask) +{ + /* convert DXR instance to instance leaf node */ + global struct HwInstanceLeaf* leaves = (__global struct HwInstanceLeaf*)BVHBase_quadLeaves(bvh); + HwInstanceLeaf_Constructor(&leaves[dstLeafId], instDesc, instanceIndex, rootNodeByteOffset, instanceMask); +} + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel create_HW_instance_nodes( + global const struct Globals *globals, + global char *primref_index, + global struct AABB *primref, + global struct BVHBase *bvh, + global struct GRL_RAYTRACING_INSTANCE_DESC *src_instances, + uint32_t stride, + uint32_t offset) +{ + uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id(); + uint num_prims = globals->numPrimitives; + if (dstLeafId >= num_prims) + return; + if( dstLeafId == 0 ) + bvh->instanceLeafEnd += 2*num_prims; + + /* get instance ID */ + const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset); + const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]); + const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]); + const uint instMask = PRIMREF_instanceMask(&primref[primrefID]); + create_HW_instance_leaf(bvh, &src_instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask ); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel create_HW_instance_nodes_pointers( + global const struct Globals *globals, + global char *primref_index, + global struct AABB *primref, + global struct BVHBase *bvh, + global void *instances_in, + uint32_t stride, + uint32_t offset) +{ + uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id(); + uint num_prims = globals->numPrimitives; + if (dstLeafId >= num_prims) + return; + if (dstLeafId == 0) + bvh->instanceLeafEnd += 2 * num_prims; + + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + /* get instance ID */ + const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset); + const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]); + const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]); + const uint instMask = PRIMREF_instanceMask(&primref[primrefID]); + create_HW_instance_leaf(bvh, instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask ); +} diff --git 
a/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl b/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl new file mode 100644 index 00000000000..bc9cf590f51 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl @@ -0,0 +1,556 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" + +#define GRID_SIZE 1024 + +/* + This presplit item contains for each primitive a number of splits to + perform (priority) and the primref index. + */ + +struct PresplitItem +{ + unsigned int index; + float priority; +}; + +/* + + This function splits a line v0->v1 at position pos in dimension dim + and merges the bounds for the left and right line segments into + lbounds and rbounds. + + */ + +GRL_INLINE void splitLine(const uint dim, + const float pos, + const float4 v0, + const float4 v1, + struct AABB *lbounds, + struct AABB *rbounds) +{ + const float v0d = v0[dim]; + const float v1d = v1[dim]; + + /* this point is on left side */ + if (v0d <= pos) + AABB_extend_point(lbounds, v0); + + /* this point is on right side */ + if (v0d >= pos) + AABB_extend_point(rbounds, v0); + + /* the edge crosses the splitting location */ + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) + { + const float f = (pos - v0d) / (v1d - v0d); + const float4 c = f * (v1 - v0) + v0; + AABB_extend_point(lbounds, c); + AABB_extend_point(rbounds, c); + } +} + +/* + + This function splits a clipped triangle v0,v1,v2 with bounds prim at + position pos in dimension dim and merges the bounds for the left and + right clipped triangle fragments into lbounds and rbounds. + + */ + +GRL_INLINE void splitTriangle(struct AABB *prim, + const uint dim, + const float pos, + const float4 v0, + const float4 v1, + const float4 v2, + struct AABB *lbounds, + struct AABB *rbounds) +{ + /* clip each triangle edge */ + splitLine(dim, pos, v0, v1, lbounds, rbounds); + splitLine(dim, pos, v1, v2, lbounds, rbounds); + splitLine(dim, pos, v2, v0, lbounds, rbounds); + + /* the triangle itself was clipped already, thus clip against triangle bounds */ + AABB_intersect(lbounds, prim); + AABB_intersect(rbounds, prim); +} + +float calculate_priority(struct AABB *prim, global GRL_RAYTRACING_GEOMETRY_DESC *geom) +{ + /* calculate projected area of first triangles */ + const uint primID0 = PRIMREF_primID0(prim); + const uint3 tri0 = GRL_load_triangle(geom, primID0); + const float4 av0 = GRL_load_vertex(geom, tri0.x); + const float4 av1 = GRL_load_vertex(geom, tri0.y); + const float4 av2 = GRL_load_vertex(geom, tri0.z); + const float area_tri0 = areaProjectedTriangle(av0, av1, av2); + + /* calculate projected area of second triangle */ + const uint primID1 = PRIMREF_primID1(prim); + const uint3 tri1 = GRL_load_triangle(geom, primID1); + const float4 bv0 = GRL_load_vertex(geom, tri1.x); + const float4 bv1 = GRL_load_vertex(geom, tri1.y); + const float4 bv2 = GRL_load_vertex(geom, tri1.z); + const float area_tri1 = areaProjectedTriangle(bv0, bv1, bv2); + + /* as priority we use the AABB area */ + const float area_aabb = AABB_halfArea(prim); + float priority = area_aabb; + + /* prefer triangles with a large potential SAH gain. 
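+       (Reader's note, non-normative.)  Concretely:
+           priority = area(primref AABB) * min( 4, area(primref AABB) / max(1e-12, area(tri0) + area(tri1)) )
+       so a quad whose box is much larger than the projected area of its two triangles (long, thin or diagonal
+       geometry, where splitting recovers the most surface area) gets boosted, capped at 4x.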
*/ + const float area_tris = area_tri0 + area_tri1; + const float area_ratio = min(4.0f, area_aabb / max(1E-12f, area_tris)); + priority *= area_ratio; + + /* ignore too small primitives */ + //const float4 size = AABB_size(prim); + //const float max_size = max(size.x,max(size.y,size.z)); + //if (max_size < 0.5f*max_scene_size/GRID_SIZE) + // priority = 0.0f; + + return priority; +} + +/* + + This kernel calculates for each primitive an estimated splitting priority. + + */ + + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel compute_num_presplits(global struct Globals *globals, + global struct BVHBase* bvh_base, + global struct AABB *primref, + global struct PresplitItem *presplit, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + //assert(sizeof(PresplitItem) == sizeof_PresplitItem); + + /* calculate the range of primitives each work group should process */ + const uint numPrimitives = globals->numPrimitives; + const uint startID = (get_group_id(0) + 0) * numPrimitives / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numPrimitives / get_num_groups(0); + + /* get scene bounding box size */ + const float3 scene_size = AABB3f_size(&bvh_base->Meta.bounds); + const float max_scene_size = max(scene_size.x, max(scene_size.y, scene_size.z)); + + /* each work group iterates over its range of primitives */ + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + const uint geomID = PRIMREF_geomID(&primref[i]); + + /* splitting heuristic for triangles */ + if (GRL_is_triangle(&geomDesc[geomID])) + { + presplit[i].index = i; + presplit[i].priority = calculate_priority(&primref[i], &geomDesc[geomID]); + } + + /* splitting of procedurals is not supported */ + else if (GRL_is_procedural(&geomDesc[geomID])) + { + presplit[i].index = i; + presplit[i].priority = 0.0f; + } + + else + { + //assert(false); + } + } + + if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0) + globals->numOriginalPrimitives = globals->numPrimitives; +} + +/* + + This kernel computes the sum of all priorities. 
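+  Each work item serially accumulates a contiguous block of roughly N/J
+  priorities and the partial sums are combined with work_group_reduce_add.
+  The result is stored directly to globals->presplitPrioritySum, so the kernel
+  is presumably meant to be launched as a single work group.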
+ + */ + + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +priority_sum(global struct Globals *globals, + global struct PresplitItem *presplit, + uint numPrimitivesToSplit) +{ + const uint N = globals->numPrimitives; + const uint j = get_local_id(0); + const uint J = get_local_size(0); + const uint BLOCKSIZE = (N + J - 1) / J; + const uint start = min((j + 0) * BLOCKSIZE, N); + const uint end = min((j + 1) * BLOCKSIZE, N); + + float prioritySum = 0; + for (uint i = start; i < end; i++) + prioritySum += presplit[i].priority; + + prioritySum = work_group_reduce_add(prioritySum); + globals->presplitPrioritySum = prioritySum; + +#if 0 + work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + float scale = 1.0f; + for (uint i = 0; i < 10; i++) + { + //if (j == 0) + //printf("prioritySum = %f\n",scale*prioritySum); + + uint numSplits = 0; + for (uint i = start; i < end; i++) + numSplits += presplit[i].priority / (scale*prioritySum)*numPrimitivesToSplit; + + numSplits = work_group_reduce_add(numSplits); + + if (numSplits > numPrimitivesToSplit) + break; + + //if (j == 0) + // printf("numSplits = %i (%i)\n",numSplits,numPrimitivesToSplit); + + globals->presplitPrioritySum = scale * prioritySum; + scale -= 0.05f; + } +#endif +} + +GRL_INLINE void heapify_down(struct AABB *array, uint size) +{ + /* we start at the root */ + uint cur_node_id = 0; + struct AABB *cur_node = array; + + while (true) + { + int larger_node_id = cur_node_id; + struct AABB *larger_node = cur_node; + + /* check if left child is largest */ + const int left_node_id = 2 * cur_node_id + 1; + struct AABB *left_node = &array[left_node_id]; + if (left_node_id < size && AABB_halfArea(left_node) > AABB_halfArea(larger_node)) + { + larger_node_id = left_node_id; + larger_node = left_node; + } + + /* check if right child is largest */ + const int right_node_id = 2 * cur_node_id + 2; + struct AABB *right_node = &array[right_node_id]; + if (right_node_id < size && AABB_halfArea(right_node) > AABB_halfArea(larger_node)) + { + larger_node_id = right_node_id; + larger_node = right_node; + } + + /* if current node is largest heap property is fulfilled and we are done */ + if (larger_node_id == cur_node_id) + break; + + /* otherwise we swap cur and largest */ + struct AABB tmp = *cur_node; + *cur_node = *larger_node; + *larger_node = tmp; + + /* we continue downwards with the largest node */ + cur_node_id = larger_node_id; + cur_node = larger_node; + } +} + +GRL_INLINE void heapify_up(struct AABB *array, uint cur_node_id) +{ + /* stop if we start at the root */ + if (cur_node_id == 0) + return; + + struct AABB *cur_node = &array[cur_node_id]; + + /* we loop until we reach the root node */ + while (cur_node_id) + { + /* get parent node */ + uint parent_node_id = (cur_node_id - 1) / 2; + struct AABB *parent_node = &array[parent_node_id]; + + /* if parent is larger then current we fulfill the heap property and can terminate */ + if (AABB_halfArea(parent_node) > AABB_halfArea(cur_node)) + break; + + /* otherwise we swap cur and parent */ + struct AABB tmp = *cur_node; + *cur_node = *parent_node; + *parent_node = tmp; + + /* and continue upwards */ + cur_node_id = parent_node_id; + cur_node = parent_node; + } +} + +/* splits a quad primref */ +GRL_INLINE void splitQuadPrimRef(global GRL_RAYTRACING_GEOMETRY_DESC *geom, + struct AABB *cur, uint dim, float fsplit, + struct AABB *left, struct AABB *right) +{ + /* left and right bounds 
to compute */ + AABB_init(left); + AABB_init(right); + + /* load first triangle and split it */ + const uint primID0 = PRIMREF_primID0(cur); + const uint3 tri0 = GRL_load_triangle(geom, primID0); + const float4 av0 = GRL_load_vertex(geom, tri0.x); + const float4 av1 = GRL_load_vertex(geom, tri0.y); + const float4 av2 = GRL_load_vertex(geom, tri0.z); + splitTriangle(cur, dim, fsplit, av0, av1, av2, left, right); + + /* load second triangle and split it */ + const uint primID1 = PRIMREF_primID1(cur); + const uint3 tri1 = GRL_load_triangle(geom, primID1); + const float4 bv0 = GRL_load_vertex(geom, tri1.x); + const float4 bv1 = GRL_load_vertex(geom, tri1.y); + const float4 bv2 = GRL_load_vertex(geom, tri1.z); + splitTriangle(cur, dim, fsplit, bv0, bv1, bv2, left, right); + + /* copy the PrimRef payload into left and right */ + left->lower.w = cur->lower.w; + left->upper.w = cur->upper.w; + right->lower.w = cur->lower.w; + right->upper.w = cur->upper.w; +} + +/* + + This kernel performs the actual pre-splitting. It selects split + locations based on an implicit octree over the scene. + + */ + +#define USE_HEAP 0 +#define HEAP_SIZE 32u + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +//__attribute__((intel_reqd_sub_group_size(16))) +void kernel +perform_presplits(global struct Globals *globals, + global struct BVHBase* bvh_base, + global struct AABB *primref, + global struct PresplitItem *presplit, + global char *bvh_mem, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + uint numPrimitivesToSplit) +{ + /* calculate the range of primitives each work group should process */ + const uint numPrimitives = globals->numPrimitives; + int pstart = globals->numOriginalPrimitives - numPrimitivesToSplit; + pstart = max(0, pstart); + const uint numPrimitivesToProcess = globals->numPrimitives - pstart; + const uint startID = (get_group_id(0) + 0) * numPrimitivesToProcess / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numPrimitivesToProcess / get_num_groups(0); + + /* calculates the 3D grid */ + float4 grid_base; + grid_base.xyz = AABB3f_load_lower( &bvh_base->Meta.bounds ); + grid_base.w = 0; + + float4 grid_extend; + grid_extend.xyz = AABB3f_size(&bvh_base->Meta.bounds); + grid_extend.w=0; + + grid_extend = max(grid_extend.x, max(grid_extend.y, grid_extend.z)); + const float4 grid_scale = select(GRID_SIZE / grid_extend, 0.0f, grid_extend == 0.0f); + const float inv_grid_size = 1.0f / GRID_SIZE; + + /* we have to update centroid bounds */ + struct AABB centroidBounds; + AABB_init(¢roidBounds); + + /* initialize heap */ + struct AABB heap[HEAP_SIZE]; + uint heap_size = 0; + + /* each work group iterates over its range of primitives */ + for (uint j = startID + get_local_id(0); j < endID; j += get_local_size(0)) + { + /* array is in ascending order */ + //const uint ID = numPrimitives-1-j; + const uint ID = pstart + j; + const float prob = presplit[ID].priority; + const uint i = presplit[ID].index; + const uint geomID = PRIMREF_geomID(&primref[i]); + + /* do not split primitives with low splitting priority */ + if (prob <= 0.0f) + continue; + + /* we support splitting only for triangles */ + if (!GRL_is_triangle(&geomDesc[geomID])) + continue; + + /* compute number of split primitives to produce */ + uint numSplitPrims = prob / globals->presplitPrioritySum * numPrimitivesToSplit; + numSplitPrims = min(HEAP_SIZE, numSplitPrims); + + /* stop if not splits have to get performed */ + if (numSplitPrims <= 1) + continue; + + /* add primref to heap 
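+           and keep splitting until numSplitPrims fragments exist: each
+           iteration takes the fragment at heap_pos, snaps its bounds to the
+           GRID_SIZE grid, and uses the highest differing bit of the two
+           corners' morton codes to pick the finest implicit-octree boundary
+           separating them (level = diff / 3, axis = diff % 3), then splits the
+           fragment at that plane and keeps both halves. Fragments whose
+           corners land in the same grid cell (equal codes) cannot be split
+           further and are skipped. With USE_HEAP disabled the "heap" is just a
+           flat array walked round-robin via heap_pos.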
*/ + heap[0] = primref[i]; + heap_size = 1; + uint heap_pos = 0; + + /* iterate until all splits are done */ + uint prims = 1; + uint last_heap_size = heap_size; + while (prims < numSplitPrims) + { + /* map the primitive bounds to the grid */ + const float4 lower = heap[heap_pos].lower; + const float4 upper = heap[heap_pos].upper; + const float4 glower = (lower - grid_base) * grid_scale + 0.2f; + const float4 gupper = (upper - grid_base) * grid_scale - 0.2f; + uint4 ilower = convert_uint4_rtz(glower); + uint4 iupper = convert_uint4_rtz(gupper); + + /* this ignores dimensions that are empty */ + if (glower.x >= gupper.x) + iupper.x = ilower.x; + if (glower.y >= gupper.y) + iupper.y = ilower.y; + if (glower.z >= gupper.z) + iupper.z = ilower.z; + + /* Now we compute a morton code for the lower and upper grid + * coordinates. */ + const uint lower_code = bitInterleave3D(ilower); + const uint upper_code = bitInterleave3D(iupper); + + /* if all bits are equal then we cannot split */ + if (lower_code == upper_code) + { +#if !USE_HEAP + prims++; // !!!!!!! + + heap_pos++; + if (heap_pos == last_heap_size) + { + heap_pos = 0; + last_heap_size = heap_size; + } + continue; +#else + if (heap_size == 1) + break; + + const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1); + primref[offset] = heap[heap_pos]; + + presplit[offset].index = offset; + presplit[offset].priority = calculate_priority(&heap[heap_pos], &geomDesc[geomID]); + + heap[0] = heap[--heap_size]; + heapify_down(heap, heap_size); + continue; +#endif + } + + /* We find the bit position of the first differing bit from the + * top down. This bit indicates a split position inside an + * implicit octree. */ + const uint diff = 31 - clz(lower_code ^ upper_code); + + /* compute octree level and dimension to perform the split in */ + const uint level = diff / 3; + const uint dim = diff % 3; + + /* now we compute the grid position of the split */ + const uint isplit = iupper[dim] & ~((1 << level) - 1); + + /* compute world space position of split */ + const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend[dim]; + + /* split primref into left and right part */ + struct AABB left, right; + splitQuadPrimRef(&geomDesc[geomID], &heap[heap_pos], dim, fsplit, &left, &right); + prims++; + + /* update centroid bounds */ + AABB_extend_point(¢roidBounds, AABB_centroid2(&left)); + AABB_extend_point(¢roidBounds, AABB_centroid2(&right)); + +#if !USE_HEAP + + heap[heap_pos] = left; + heap[heap_size] = right; + heap_size++; + + heap_pos++; + if (heap_pos == last_heap_size) + { + heap_pos = 0; + last_heap_size = heap_size; + } +#else + + /* insert left element into heap */ + heap[0] = left; + heapify_down(heap, heap_size); + + /* insert right element into heap */ + heap[heap_size] = right; + heapify_up(heap, heap_size); + + heap_size++; +#endif + } + + /* copy primities to primref array */ + primref[i] = heap[0]; + + presplit[ID].index = i; + presplit[ID].priority = calculate_priority(&heap[0], &geomDesc[geomID]); + + for (uint k = 1; k < heap_size; k++) + { + const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1); + primref[offset] = heap[k]; + + presplit[offset].index = offset; + presplit[offset].priority = calculate_priority(&heap[k], &geomDesc[geomID]); + } + } + + /* merge centroid bounds into global bounds */ + centroidBounds = AABB_sub_group_reduce(¢roidBounds); + if (get_sub_group_local_id() == 0) + AABB_global_atomic_merge(&globals->centroidBounds, ¢roidBounds); + + 
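+    // Every work item in this group must finish splitting and the sub-group
+    // bounds merges above before the group takes part in the Globals_OnFinish
+    // handshake below; presumably only the last group to arrive sees it return
+    // true and then publishes the grown primitive count.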
work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + /* update number of primitives on finish */ + if (Globals_OnFinish(globals)) + { + globals->numPrimitives = globals->numPrimitives + globals->numSplittedPrimitives; + globals->numSplittedPrimitives = 0; + + /* update first build record */ // FIXME: should be done in builder itself + global struct BuildRecord *record = (global struct BuildRecord *)(bvh_mem + bvh_base->quadLeafStart*64); + record->end = globals->numPrimitives; + } +} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.cl b/src/intel/vulkan/grl/gpu/bvh_build_primref.cl new file mode 100644 index 00000000000..1dd9a3cdd92 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_primref.cl @@ -0,0 +1,674 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" +#include "instance.h" + +#include "bvh_build_primref.h" + +//#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable +//int sub_group_non_uniform_any(int predicate); + +#define WINDOW_SIZE 16 + +/* Representation of two merged triangles. */ +struct QuadIndices +{ + uint primID0, primID1; + uint v0, v1, v2, v3; +}; + +/* + + This function calculates a PrimRef from a merged quad and writes + this PrimRef to memory. + + */ +GRL_INLINE void create_prim_ref(const uint geomID, + const struct QuadIndices quad, + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + struct AABB *geometryBounds, + struct AABB *centroidBounds, + global uint *numPrimitives, + global struct AABB *primref) +{ + + /* load quad vertices */ + const float4 vtx0 = GRL_load_vertex(geomDesc, quad.v0); // FIXME: these multiple load_vertex calls should get merged + const float4 vtx1 = GRL_load_vertex(geomDesc, quad.v1); + const float4 vtx2 = GRL_load_vertex(geomDesc, quad.v2); + const float4 vtx3 = GRL_load_vertex(geomDesc, quad.v3); + + /* calculate bounds for quad */ + float4 lower = min(min(vtx0, vtx1), min(vtx2, vtx3)); + float4 upper = max(max(vtx0, vtx1), max(vtx2, vtx3)); + + /* extend geometry and centroid bounds */ + const float4 centroid2 = lower + upper; + AABB_extendlu(geometryBounds, lower, upper); + AABB_extendlu(centroidBounds, centroid2, centroid2); + + PrimRef ref; + PRIMREF_setAABB( &ref, lower.xyz, upper.xyz ); + PRIMREF_setQuadMetaData( &ref, quad.primID0, quad.primID1, geomID, GRL_get_Flags( geomDesc ) ); + + /* store primref to memory */ + const uint offset = atomic_add_global(numPrimitives, 1); + primref[offset] = ref; +} + +/* + + This function calculates a PrimRef from a procedural and writes + this PrimRef to memory. 
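+  As in the quad path above, the output slot is reserved with an atomic
+  increment of numPrimitives, so primrefs end up tightly packed but in a
+  nondeterministic order.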
+ + */ +GRL_INLINE void create_prim_ref_procedural(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, + const uint geomID, + const uint primID, + struct AABB *geometryBounds, + struct AABB *centroidBounds, + global uint *numPrimitives, + global struct AABB *primref) +{ + /* load aabb from memory */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); + + /* extend geometry and centroid bounds */ + float4 lower = (float4)(aabb.MinX, aabb.MinY, aabb.MinZ, 0.0f); + float4 upper = (float4)(aabb.MaxX, aabb.MaxY, aabb.MaxZ, 0.0f); + const float4 centroid2 = lower + upper; + AABB_extendlu(geometryBounds, lower, upper); + AABB_extendlu(centroidBounds, centroid2, centroid2); + + /* encode geomID, primID */ + uint geomFlags = GRL_get_Flags(&geomDesc[geomID]); + + PrimRef ref; + PRIMREF_setAABB( &ref, lower.xyz, upper.xyz ); + PRIMREF_setProceduralMetaData( &ref, geomID, primID, geomFlags ); + + /* store primref to memory */ + const uint offset = atomic_add_global(numPrimitives, 1); + primref[offset] = ref; +} + +/* + + This function performs a binary search to calculate the geomID and + primID of the i'th primitive of the scene. For the search a + prefix_sum array is used that stores for each location j the sum of + the number of primitives of all meshes k with k 1) + { + const uint m = (l + r) / 2; + k = prefix_sum[m]; + if (k <= i) + { + l = m; + } + else if (i < k) + { + r = m; + } + } + + struct GeomPrimID id; + id.geomID = l; + id.primID = i - prefix_sum[l]; + return id; +} + +/* + + Checks if a vertex contains only finite floating point numbers. + + */ + +GRL_INLINE bool isfinite_vertex(float4 vtx) +{ + return isfinite(vtx.x) && isfinite(vtx.y) && isfinite(vtx.z); +} + + +/* + Create primrefs from array of instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +primrefs_from_DXR_instances(global struct Globals *globals, + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + uint numInstances, + global struct AABB *primrefs, + uint allowUpdate) +{ + const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < numInstances) + { + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + 0, + allowUpdate); + } +} + +/* + Create primrefs from array of instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel +primrefs_from_DXR_instances_indirect(global struct Globals *globals, + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + global struct IndirectBuildRangeInfo* indirect_data, + global struct AABB *primrefs, + uint allowUpdate) +{ + // TODO: On DG2, we have 8 dwords of 'inline data' which can be pushed + // directly to the kernel. THe rest of the kernel args are pulled using + // loads from memory. 
It may be more efficient to put 'numInstances' and + // 'allowUpdate' into 'globals' + + const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; + + if (instanceIndex < indirect_data->primitiveCount) + { + instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instances) + indirect_data->primitiveOffset); + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + 0, + allowUpdate); + } +} + +/* + Create primrefs from array of pointers to instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +primrefs_from_DXR_instances_pointers(global struct Globals *globals, + global struct BVHBase* bvh, + global void *instances_in, + uint numInstances, + global struct AABB *primrefs, + uint allowUpdate) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < numInstances) + { + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + 0, + allowUpdate); + } +} + +/* + Create primrefs from array of pointers to instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel +primrefs_from_DXR_instances_pointers_indirect(global struct Globals *globals, + global struct BVHBase* bvh, + global void *instances_in, + global struct AABB *primrefs, + global struct IndirectBuildRangeInfo* indirect_data, + uint allowUpdate) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; + + if (instanceIndex < indirect_data->primitiveCount) + { + instances = (global const struct GRL_RAYTRACING_INSTANCE_DESC**) + (((global char*)instances) + indirect_data->primitiveOffset); + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + 0, + allowUpdate); + } +} + + +/////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// + +bool can_pair( uint3 a, uint3 b ) +{ + bool match0 = any( a.xxx == b.xyz ) ? 1 : 0; + bool match1 = any( a.yyy == b.xyz ) ? 1 : 0; + bool match2 = any( a.zzz == b.xyz ) ? 1 : 0; + return (match0 + match1 + match2) >= 2; +} + +void reduce_bounds( + float3 lower, + float3 upper, + global struct Globals* globals, + global struct BVHBase* bvh ) +{ + + // reduce centroid bounds... 
make sure to exclude lanes with invalid AABBs + float3 cent = lower + upper; + float3 cent_lower = select( (float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper); + float3 cent_upper = select(-(float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper); + + // reduce geo bounds + AABB3f_atomic_merge_global_sub_group_lu( &bvh->Meta.bounds, lower, upper ); + AABB_global_atomic_merge_sub_group_lu(&globals->centroidBounds, cent_lower, cent_upper ); +} + + +struct TriState +{ + bool valid; + uint prim_index; + uint pairing; + uint3 indices; + float3 lower; + float3 upper; +}; + +#define NOT_PAIRED 0xffffffff + +void load_triangle_data(uniform global char* index_buffer, + uniform const uint index_format, + uniform global char* vertex_buffer, + uniform const uint vertex_format, + uniform const uint vertex_stride, + uniform global float* transform_buffer, + uniform uint total_vert_count, + struct TriState* state, + float4* v) +{ + state->indices = GRL_load_indices_from_buffer(index_buffer, index_format, state->prim_index ); + + const uint last_vertex = total_vert_count - 1; + const uint x = min(state->indices.x, last_vertex); + const uint y = min(state->indices.y, last_vertex); + const uint z = min(state->indices.z, last_vertex); + + GRL_load_triangle_vertices(vertex_buffer, vertex_format, vertex_stride, transform_buffer, x, y, z, v); +} + +struct TriState load_triangle( uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uniform uint base, + uniform uint num_prims, + uniform uint total_vert_count ) +{ + + struct TriState state; + state.pairing = NOT_PAIRED; + state.valid = false; + state.prim_index = base + get_sub_group_local_id(); + state.lower = (float3)(INFINITY, INFINITY, INFINITY); + state.upper = -(float3)(INFINITY, INFINITY, INFINITY); + + if (state.prim_index < num_prims) + { + state.valid = true; + float4 v[3]; + load_triangle_data((global char*)geomDesc->Desc.Triangles.pIndexBuffer, + geomDesc->Desc.Triangles.IndexFormat, + (global char*)geomDesc->Desc.Triangles.pVertexBuffer, + geomDesc->Desc.Triangles.VertexFormat, + geomDesc->Desc.Triangles.VertexBufferByteStride, + (global float*)geomDesc->Desc.Triangles.pTransformBuffer, + total_vert_count, + &state, + v); + + if (state.indices.x >= total_vert_count || state.indices.y >= total_vert_count || state.indices.z >= total_vert_count || + !isfinite_vertex(v[0]) || !isfinite_vertex(v[1]) || !isfinite_vertex(v[2]) || + state.indices.x == state.indices.y || state.indices.x == state.indices.z || state.indices.y == state.indices.z) + { + state.valid = false; + } + else + { + state.lower.xyz = min(v[2].xyz, min(v[1].xyz, v[0].xyz)); + state.upper.xyz = max(v[2].xyz, max(v[1].xyz, v[0].xyz)); + } + } + return state; +} + +void broadcast_triangles_local( struct TriState* state ) +{ + varying uint my_prim = state->prim_index; + varying uint my_pairing = state->pairing; + varying float3 my_lower = state->lower; + varying float3 my_upper = state->upper; + varying bool valid = state->valid; + varying uint3 indices = state->indices; + + for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++) + { + // don't broadcast invalid prims + if ( !sub_group_broadcast( valid, broadcast_lane ) ) + continue; + + uint broadcast_pairing = sub_group_broadcast(my_pairing, broadcast_lane); + uint broadcast_prim = sub_group_broadcast(my_prim, broadcast_lane); + + if (broadcast_pairing == NOT_PAIRED) + { + // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it + bool pairable = 
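+                // a lane other than the broadcaster can accept it only if it
+                // is itself valid, still unpaired, and shares at least two
+                // vertex indices with the broadcast triangle (see can_pair)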
false; + uint3 other_indices = sub_group_broadcast_uint3( indices, broadcast_lane ); + if (broadcast_prim != my_prim && my_pairing == NOT_PAIRED && valid ) + { + pairable = can_pair( indices, other_indices ); + } + + + uint pairable_lane = ctz(intel_sub_group_ballot(pairable)); + if (valid && pairable_lane < get_sub_group_size()) + { + // pair the broadcast primitive with the first lane that can accept it + float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane); + float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane); + if (get_sub_group_local_id() == pairable_lane) + { + my_pairing = broadcast_prim; + my_lower.xyz = min(my_lower.xyz, broadcast_lower); + my_upper.xyz = max(my_upper.xyz, broadcast_upper); + } + + // pair the broadcast primitive with the same that was paired to it + uint pairable_prim = sub_group_broadcast(my_pairing, pairable_lane); + if (get_sub_group_local_id() == broadcast_lane) + { + my_pairing = pairable_prim; + } + } + } + else + { + // + // if this lane was already paired with the broadcasting tri + // in an earlier loop iteration, then record the pairing in this lane's registers + float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane); + float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane); + if (broadcast_pairing == my_prim) + { + my_pairing = broadcast_prim; + my_lower.xyz = min(my_lower.xyz, broadcast_lower); + my_upper.xyz = max(my_upper.xyz, broadcast_upper); + } + } + } + + state->pairing = my_pairing; + state->lower = my_lower; + state->upper = my_upper; +} + + +void broadcast_triangles_nonlocal(struct TriState* state, const struct TriState* other ) +{ + varying uint my_prim = state->prim_index; + varying uint my_pairing = state->pairing; + varying float3 my_lower = state->lower; + varying float3 my_upper = state->upper; + varying bool valid = state->valid; + varying uint3 indices = state->indices; + + for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++) + { + // don't broadcast invalid prims + if (!sub_group_broadcast(other->valid, broadcast_lane)) + continue; + + uint broadcast_pairing = sub_group_broadcast(other->pairing, broadcast_lane); + uint broadcast_prim = sub_group_broadcast(other->prim_index, broadcast_lane); + + if (broadcast_pairing == NOT_PAIRED) + { + // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it + bool pairable = false; + if ( my_pairing == NOT_PAIRED && valid ) + { + uint3 other_indices = sub_group_broadcast_uint3(other->indices, broadcast_lane); + pairable = can_pair(indices, other_indices); + } + + // pair the broadcast primitive with the first lane that can accept it + uint pairable_mask = intel_sub_group_ballot(pairable); + if (valid && (ctz(pairable_mask) == get_sub_group_local_id())) + { + my_pairing = broadcast_prim; + my_lower.xyz = min(my_lower.xyz, sub_group_broadcast_float3(other->lower.xyz, broadcast_lane)); + my_upper.xyz = max(my_upper.xyz, sub_group_broadcast_float3(other->upper.xyz, broadcast_lane)); + } + } + + } + + state->pairing = my_pairing; + state->lower = my_lower; + state->upper = my_upper; +} + +GRL_INLINE void do_triangles_to_primrefs( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uint geomID_and_flags, + const uint num_prims) +{ + uint geomID = geomID_and_flags & 0x00ffffff; + uint geom_flags = geomID_and_flags >> 24; + 
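+    // geomID_and_flags packs the geometry index into the low 24 bits and the
+    // GRL geometry flags into the top 8 bits, e.g. geomID 5 with flags 0x1
+    // arrives as 0x01000005. Each 16-lane sub-group then loads 16 consecutive
+    // triangles and attempts to pair them into quads before emitting primrefs.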
uint prim_base = get_group_id(0) * get_local_size(0); + uint total_vert_count = GRL_get_triangles_VertexCount(geomDesc); + + struct TriState tri = load_triangle( geomDesc, prim_base, num_prims, total_vert_count ); + broadcast_triangles_local( &tri ); + + + // we will produce output if the lane creates a triangle (my_pairing == NOT_PAIRED) + // or for the lane corresponding to the larger of two triangles + bool will_write = (tri.pairing > tri.prim_index) && tri.valid; + uint write_mask = intel_sub_group_ballot(will_write); + uint write_offs = subgroup_bit_prefix_exclusive( write_mask ); + uint write_count = popcount(write_mask); + + // allocate space in primref buffer + uint write_base; + if( get_sub_group_local_id() == 0 ) + write_base = atomic_add_global( &globals->numPrimitives, write_count ); + write_offs += sub_group_broadcast( write_base, 0 ); + + uint primID0 = tri.prim_index; + uint primID1 = (tri.pairing != NOT_PAIRED) ? tri.pairing : tri.prim_index; + + if (will_write) + { + PrimRef ref; + PRIMREF_setAABB(&ref, tri.lower.xyz, tri.upper.xyz); + PRIMREF_setQuadMetaData(&ref, primID0, primID1, geomID, geom_flags); + uint8 val = (uint8)( + as_uint(ref.lower.x), as_uint(ref.lower.y), as_uint(ref.lower.z), as_uint(ref.lower.w), + as_uint(ref.upper.x), as_uint(ref.upper.y), as_uint(ref.upper.z), as_uint(ref.upper.w)); + store_uint8_L1WB_L3WB((global uint8*)(primref + write_offs), 0, val); + } + + reduce_bounds( tri.lower, tri.upper, globals, bvh ); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +triangles_to_primrefs( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uint geomID_and_flags, + uint num_prims + ) +{ + do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +triangles_to_primrefs_indirect( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct IndirectBuildRangeInfo* indirect_data, + uint geomID_and_flags) +{ + const uint num_prims = indirect_data->primitiveCount; + do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); +} + +GRL_INLINE void do_procedurals_to_primrefs( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uint geomID_and_flags, + const uint num_prims) +{ + uint geomID = geomID_and_flags & 0x00ffffff; + uint geomFlags = geomID_and_flags >> 24; + + uint primID = get_group_id(0) * get_local_size(0) + get_sub_group_local_id(); + + bool create_primref = false; + float3 lower = (float3)(INFINITY, INFINITY, INFINITY); + float3 upper = -(float3)(INFINITY, INFINITY, INFINITY); + if (primID < num_prims) + { + /* check if procedural is valid */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(geomDesc, primID); + const bool valid_min = isfinite(aabb.MinX) && isfinite(aabb.MinY) && isfinite(aabb.MinZ); + const bool valid_max = isfinite(aabb.MaxX) && isfinite(aabb.MaxY) && isfinite(aabb.MaxZ); + if (valid_min & valid_max) + { + /* load aabb from memory */ + float3 l = (float3)(aabb.MinX, aabb.MinY, aabb.MinZ); + float3 u = (float3)(aabb.MaxX, aabb.MaxY, aabb.MaxZ); + + // convert degenerate boxes to points at the box 
centroid + lower = min( l, u ); + upper = max( l, u ); + + create_primref = true; + } + } + + uint write_mask = intel_sub_group_ballot(create_primref); + uint write_offs = subgroup_bit_prefix_exclusive(write_mask); + uint write_count = popcount(write_mask); + + // allocate space in primref buffer + uint write_base; + if (get_sub_group_local_id() == 0) + write_base = atomic_add_global(&globals->numPrimitives, write_count); + write_offs += sub_group_broadcast(write_base, 0); + + // write the primref + if (create_primref) + { + PrimRef ref; + PRIMREF_setAABB(&ref, lower.xyz, upper.xyz); + PRIMREF_setProceduralMetaData(&ref, geomID, primID, geomFlags); + primref[write_offs] = ref; + } + + reduce_bounds(lower, upper, globals, bvh); + +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +procedurals_to_primrefs( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uint geomID_and_flags, + uint num_prims + ) +{ + do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +procedurals_to_primrefs_indirect( + global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global const struct IndirectBuildRangeInfo* indirect_data, + uint geomID_and_flags + ) +{ + const uint num_prims = indirect_data->primitiveCount; + do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); +} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.h b/src/intel/vulkan/grl/gpu/bvh_build_primref.h new file mode 100644 index 00000000000..25e2d3df194 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_primref.h @@ -0,0 +1,246 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#if 0 +/* + +Create primrefs from array of instance descriptors. + +*/ + +void store_instance_primref( + global struct BVHBase* top_bvh, + global struct Globals* globals, + global PrimRef* primrefs, + bool alloc_primref, + PrimRef new_primref ) +{ + uint allocatePrimref = alloc_primref ? 1 : 0; + uint index = 0; + uint numAllocations = sub_group_reduce_add(allocatePrimref); + + if (get_sub_group_local_id() == 0) + { + index = atomic_add_global(&globals->numPrimitives, numAllocations); + } + + index = sub_group_broadcast(index, 0); + index = index + sub_group_scan_exclusive_add(allocatePrimref); + + if (allocatePrimref) + { + primrefs[index] = new_primref; + } + + struct AABB centroidBounds; + centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref); + struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref); + struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(¢roidBounds); + + if (get_sub_group_local_id() == 0) + { + AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz); + AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds); + } +} + + + +// Compute transformed blas AABB. 
Returns false if instance is degenerate +bool create_instance_primref( + PrimRef* ref_out, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, + global struct BVHBase* bvh, + uint instanceMask, + uint instanceIndex + ) +{ + struct AABB3f bbox; + bool alloc_primref = false; + uint rootNodeOffset = NO_NODE_OFFSET; + if (bvh != 0) + { + alloc_primref = true; + AABB3f AS_bounds = BVHBase_GetRootAABB(bvh); + + const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]); + const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]); + + if (!valid_min || !valid_max || instanceMask == 0) + { + // degenerated instance case + + // TODO this should be under if ( allocate backpointers ) + { + // we have to allocate the primref because this instance can be updated to non-degenerated + // take the origin of the instance as a bounding box. + + bbox.lower[0] = instance->Transform[3]; + bbox.lower[1] = instance->Transform[7]; + bbox.lower[2] = instance->Transform[11]; + bbox.upper[0] = instance->Transform[3]; + bbox.upper[1] = instance->Transform[7]; + bbox.upper[2] = instance->Transform[11]; + instanceMask = 0; + } + } + else + { + rootNodeOffset = BVH_ROOT_NODE_OFFSET; + float transformOverhead = 0.0f; + bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead); + } + } + + *ref_out = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, 0); + return alloc_primref; +} + +GRL_INLINE void primrefs_from_instances( + global struct Globals* globals, + global struct BVHBase* top_bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, + uint instanceIndex, + global struct AABB* primrefs) +{ + bool alloc_primref = false; + PrimRef new_primref; + AABB_init(&new_primref); + + if (instance) + { + uint mask = GRL_get_InstanceMask(instance); + global struct BVHBase* bvh = (global struct BVHBase*)instance->AccelerationStructure; + alloc_primref = create_instance_primref(&new_primref, instance, bvh, mask, instanceIndex); + } + + store_instance_primref(top_bvh, globals, primrefs, alloc_primref, new_primref); +} +#endif + +#if 1 +GRL_INLINE void primrefs_from_instances( + global struct Globals* globals, + global struct BVHBase* top_bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, + uint instanceIndex, + global struct AABB* primrefs, + global GRL_RAYTRACING_AABB* procedural_aabb, + uint allowUpdate + ) +{ + struct AABB3f bbox; + uint allocatePrimref = 0; + + uint rootNodeOffset = NO_NODE_OFFSET; + uint instanceMask = 0; + + bool is_procedural = (procedural_aabb != 0); + + if( instance ) + { + instanceMask = GRL_get_InstanceMask(instance) ; + if ( is_procedural ) + { + // procedural instance primref + allocatePrimref = 1; + + float3 lower = (float3)(procedural_aabb->MinX, procedural_aabb->MinY, procedural_aabb->MinZ); + float3 upper = (float3)(procedural_aabb->MaxX, procedural_aabb->MaxY, procedural_aabb->MaxZ); + + if (instanceMask == 0 || any(lower > upper)) + { + bbox.lower[0] = instance->Transform[3]; + bbox.lower[1] = instance->Transform[7]; + bbox.lower[2] = instance->Transform[11]; + bbox.upper[0] = instance->Transform[3]; + bbox.upper[1] = instance->Transform[7]; + bbox.upper[2] = instance->Transform[11]; + instanceMask = 0; + } + else + { + bbox = transform_aabb(lower, upper, instance->Transform); + } + } + else + { + 
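+            // For a non-procedural instance the primref is the BLAS root AABB
+            // transformed into world space. Degenerate instances (non-finite
+            // bounds or mask 0) only get a primref when updates are allowed;
+            // they are collapsed to a point at the instance origin with mask 0
+            // and no root offset, so traversal ignores them but a later update
+            // can restore them.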
// HW-instance primref + + global struct BVHBase* bvh = instance ? + (global struct BVHBase*)instance->AccelerationStructure : + 0; + + if (bvh != 0) + { + AABB3f AS_bounds = BVHBase_GetRootAABB(bvh); + + const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]); + const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]); + + + if (valid_min && valid_max && instanceMask != 0) + { + allocatePrimref = 1; + rootNodeOffset = BVH_ROOT_NODE_OFFSET; + float transformOverhead = 0.0f; + bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead); + } + else if (allowUpdate) + { + // degenerated instance case + // we have to allocate the primref because this instance can be updated to non-degenerated + // take the origin of the instance as a bounding box. + allocatePrimref = 1; + bbox.lower[0] = instance->Transform[3]; + bbox.lower[1] = instance->Transform[7]; + bbox.lower[2] = instance->Transform[11]; + bbox.upper[0] = instance->Transform[3]; + bbox.upper[1] = instance->Transform[7]; + bbox.upper[2] = instance->Transform[11]; + instanceMask = 0; + } + } + } + } + + uint index = 0; + uint numAllocations = sub_group_reduce_add(allocatePrimref); + + if (get_sub_group_local_id() == 0) + { + index = atomic_add_global(&globals->numPrimitives, numAllocations); + } + + index = sub_group_broadcast(index, 0); + index = index + sub_group_scan_exclusive_add(allocatePrimref); + + struct AABB new_primref; + struct AABB centroidBounds; + if (allocatePrimref) + { + new_primref = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, is_procedural); + primrefs[index] = new_primref; + centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref); + } + else + { + AABB_init(&new_primref); + AABB_init(¢roidBounds); + } + + + struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref); + struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(¢roidBounds); + + if (get_sub_group_local_id() == 0) + { + AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz); + AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds); + } +} +#endif diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.cl b/src/intel/vulkan/grl/gpu/bvh_build_refit.cl new file mode 100644 index 00000000000..bcda2fa54ec --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_refit.cl @@ -0,0 +1,491 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "bvh_build_refit.h" +#include "api_interface.h" +#include "common.h" + + + + + +#if 0 +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) +void kernel +update_instance_leaves( global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch +) +{ + uint num_leaves = BVHBase_GetNumHWInstanceLeaves( bvh ); + uint id = get_local_id( 0 ) + get_local_size( 0 ) * get_group_id( 0 ); + if ( id >= num_leaves ) + return; + + global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray = + (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray; + global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray = + (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr; + + global struct HwInstanceLeaf* leafs = (global 
struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh ); + + /* iterate over all children of the instance node and get their bounds */ + + uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex( &leafs[id] ); + global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL; + if ( dxrInstancesArray != NULL ) + instance = &instancesArray[instanceIdx]; + else + instance = instancesPtrArray[instanceIdx]; + + struct AffineSpace3f xfm = AffineSpace3f_load_row_major( instance->Transform ); + global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure; + struct AABB3f newSubtreeBounds = instanceBvh->Meta.bounds; + struct AABB3f bbox = AABB3f_transform( xfm, newSubtreeBounds ); // JDB TODO: Use faster abs-matrix method + + const bool valid_min = isfinite( bbox.lower[0] ) && isfinite( bbox.lower[1] ) && isfinite( bbox.lower[2] ); + const bool valid_max = isfinite( bbox.upper[0] ) && isfinite( bbox.upper[1] ) && isfinite( bbox.upper[2] ); + + uint mask = GRL_get_InstanceMask(instance); + + uint offset = instanceBvh->rootNodeOffset; + if ( !valid_min || !valid_max ) + { + bbox.lower[0] = xfm.p.x; + bbox.lower[1] = xfm.p.y; + bbox.lower[2] = xfm.p.z; + bbox.upper[0] = xfm.p.x; + bbox.upper[1] = xfm.p.y; + bbox.upper[2] = xfm.p.z; + offset = NO_NODE_OFFSET; + mask = 0; + } + + instance_aabb_scratch[id] = bbox; + + HwInstanceLeaf_Constructor( &leafs[id], instance, instanceIdx, offset, mask ); // TODO: No instance opening for refittable BVH +} +#endif + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +update_instance_leaves(global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch +) +{ + uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh); + uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); + if (id >= num_leaves) + return; + + DO_update_instance_leaves( + bvh, + dxrInstancesArray, + dxrInstancesPtr, + instance_aabb_scratch, + id, + 0 ); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +update_instance_leaves_indirect(global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch, + global struct IndirectBuildRangeInfo* indirect_data) +{ + uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh); + uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); + if (id >= num_leaves) + return; + + DO_update_instance_leaves( + bvh, + dxrInstancesArray + indirect_data->primitiveOffset, + dxrInstancesPtr, + instance_aabb_scratch, + id, + 0 ); +} + +#if 0 +/* + + This kernel refit a BVH. The algorithm iterates over all BVH nodes + to find all leaf nodes, which is where refitting starts. For these + leaf nodes bounds get recalculated and then propagates up the tree. + + One kernel instance considers a range of inner nodes as startpoints. 
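+  Propagation relies on the per-node backpointers: bits 6 and up hold the
+  parent index, bits 3..5 the child count, and bits 0..2 a counter of already
+  refitted children that refit_bottom_up bumps atomically, so only the work
+  item that finishes the last child of a node walks up to its parent and each
+  node is written exactly once.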
+ */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(8, 1, 1))) void kernel refit( + global struct BVHBase *bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, + global struct AABB3f* instance_leaf_aabbs ) +{ + /* here we temporarily store the bounds for the children of a node */ + struct AABB childrenAABB[BVH_NODE_N6]; + + /* get pointer to inner nodes and back pointers */ + global struct QBVHNodeN *inner_nodes = BVHBase_rootNode(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + /* construct range of nodes that each work group will process */ + const uint numInnerNodes = BVHBase_numNodes(bvh); + const uint startID = (get_group_id(0) + 0) * numInnerNodes / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numInnerNodes / get_num_groups(0); + + /* each workgroup iterates over its range of nodes */ + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + global struct QBVHNodeN* curNode = &inner_nodes[i]; + uint numChildren = refit_bottom(bvh, geosArray, + instance_leaf_aabbs, + curNode, + childrenAABB, + *InnerNode_GetBackPointer(backPointers, i)); + if (numChildren != 0) + { + /* update bounds of node */ + QBVHNodeN_setBounds(curNode, childrenAABB, numChildren); + + /* refit upper parts of the BVH */ + // TODO: this will not gonna work for mixed nodes + refit_bottom_up(curNode, bvh, childrenAABB, numChildren); + } + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(8, 1, 1))) +void kernel Find_refit_treelets( + global struct BVHBase* bvh, + global TreeletNodeData* treelets, + global uint* scratchStartpoints, + global uint* startpointAlloc) +{ + find_refit_treelets(bvh, + treelets, + scratchStartpoints, + startpointAlloc); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel Assign_refit_startpoints_to_treelets( + global struct BVHBase* bvh, + global TreeletNodeData* treelets, + global uint* scratchStartpoints) +{ + assign_refit_startpoints_to_treelets(bvh, treelets, scratchStartpoints); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(128, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel Finalize_treelets_in_groups( + global struct BVHBase* bvh, + global uint* scratchStartpoints ) +{ + local uint depths[FINALIZE_TREELETS_SLM_DEPTHS_SPACE]; + + finalize_treelets_in_groups(bvh, scratchStartpoints, depths); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel Refit_quads_tree_per_group(global SquashedInput* psqinputs) +{ + uint group_id = get_group_id(0); + SquashedInput sqinput = psqinputs[group_id]; + global struct BVHBase* bvh = sqinput.pBvh; + uint numLeaves = BVHBase_GetNumQuads(bvh); + global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh); + + global void* input = sqinput.pInput; + global struct AABB* bbox_scratch = sqinput.bbox_scratch; + + uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64; + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; + uint id = get_local_id(0); + + for (uint leaf_id = id; leaf_id < numLeaves; leaf_id += get_local_size(0)) + { + struct AABB theAABB; + refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB); + theAABB.lower.w = as_float(0xABBADEFFu); + bbox_scratch[leafsIndexOffset + leaf_id] = theAABB; + } +} + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL 
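+// Refit_quads: the quad-leaf refit work is spread over numGroupsExecuted
+// groups, each recomputing the AABBs of a slice of leaves into bbox_scratch
+// (tagged with 0xABBADEFF in lower.w). The first sub-group of group 0 also
+// reserves and fills one SquashedInputGroupDesc per bottom treelet so the
+// per-treelet pass that follows can be dispatched from that list.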
+__attribute__((reqd_work_group_size(32, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel Refit_quads( + global struct BVHBase* bvh, + global void* input, + global struct AABB* bbox_scratch, + uint numGroupsExecuted, + global SquashedInputGroupDesc* sqinput) +{ + uint numLeafs = BVHBase_GetNumQuads(bvh); + if (numLeafs == 0) return; + global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh); + + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; + uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64; + + uint numLeafsPerGr = (numLeafs + (numGroupsExecuted - 1)) / numGroupsExecuted; + + uint id_start = get_group_id(0) * numLeafsPerGr + get_local_id(0); + uint id_end = min(id_start + numLeafsPerGr, numLeafs); + for (uint id = id_start; id < id_end; id+= get_local_size(0)) + { + struct AABB theAABB; + refit_bottom_child_quad(leafs + id, geosArray, &theAABB); + theAABB.lower.w = as_float(0xABBADEFFu); + bbox_scratch[leafsIndexOffset + id] = theAABB; + } + + if (get_group_id(0) == 0 && get_local_id(0) < 16) + { + + uint groupnr; + uint treeletCnt = *BVHBase_GetRefitTreeletCntPtr(bvh); + if (get_sub_group_local_id() == 0) { + groupnr = atomic_add_global(&sqinput->totalNumGroups, treeletCnt); + } + groupnr = sub_group_broadcast(groupnr, 0); + for (uint subtree = get_sub_group_local_id(); subtree < treeletCnt; subtree += get_sub_group_size()) + { + uint gr = groupnr + subtree; + //printf("tree %llx, treelet %d/%d, grId %d, numStartpoints %d\n", bvh, subtree,treeletCnt, gr, BVHBase_GetRefitTreeletDescs(bvh)[subtree].numStartpoints); + sqinput[gr].bvh = (qword)bvh; + sqinput[gr].scratch = (qword)bbox_scratch; + sqinput[gr].groupInTree = subtree; + } + //if (get_local_id(0)==0 && treeletCnt > 1) + //{ + // printf("tree %llx, tip treelet %d/%d = numStartpoints %d depth %d\n", bvh, treeletCnt, treeletCnt, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].numStartpoints, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].maxDepth); + //} + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel +Refit_tree_per_group_quad( + global SquashedInput* psqinputs) +{ + uint group_id = get_group_id(0); + SquashedInput sqinput = psqinputs[group_id]; + global struct BVHBase* bvh = sqinput.pBvh; + global struct AABB* bbox_scratch = sqinput.bbox_scratch; + global void* pInput = sqinput.pInput; + local Treelet_by_single_group_locals loc; + + if (*BVHBase_GetRefitTreeletCntPtr(bvh) == 0) + return; + +#if REFIT_DEBUG_CHECKS + uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh); + if (bottoms_cnt != 1) { + if (get_local_id(0) == 0) + { + printf("Error: this tree has more than 1 treelets!\n"); + } + return; + } +#endif + + /* get pointer to inner nodes and back pointers */ + uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); + + // uniform per group + uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh); + + uint numLeafs = bvh->quadLeafCur - bvh->quadLeafStart; + + if (numLeafs == 0) { return; } + + uint numLeafsByOneThread = (numLeafs + (get_local_size(0) - 1)) / get_local_size(0); + + update_quads(bvh, pInput, bbox_scratch, get_local_id(0), numLeafsByOneThread); + + mem_fence_workgroup_default(); work_group_barrier(0); + + RefitTreelet trltDsc = *pTrltDsc; + + refit_treelet_by_single_group( + bbox_scratch, + &loc, + bvh, + trltDsc, + false, + true); + + if (trltDsc.maxDepth > 0) + { + 
mem_fence_workgroup_default(); work_group_barrier(0); + post_refit_encode_qnode_tree_per_group(bbox_scratch,bvh); + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel +Refit_treelet_per_group( + global SquashedInputGroupDesc* sqinput) +{ + uint group_id = get_group_id(0); + global struct AABB* bbox_scratch = (global struct AABB* )sqinput[group_id].scratch; + global struct BVHBase* bvh = (global struct BVHBase* )sqinput[group_id].bvh; + group_id = sqinput[group_id].groupInTree; + + /* get pointer to inner nodes and back pointers */ + uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); + + uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh); + + // uniform per group + uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh); + + bool should_we_process_treetip = true; + local Treelet_by_single_group_locals loc; + local bool* l_should_we_process_treetip = (local bool*)&loc; +#if REFIT_VERBOSE_LOG + if (group_id != 0) return; +#endif + + if (bottoms_cnt > 1) + { +#if REFIT_VERBOSE_LOG + for (; group_id < bottoms_cnt; group_id++) + { + if (get_local_id(0) == 0) { printf("\n ====== treelet %d ====== \n", group_id); } + work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, memory_scope_device); +#endif + bool rootProcThread = refit_treelet_by_single_group( + bbox_scratch, + &loc, + bvh, + pTrltDsc[group_id], + true, + false); + + // we have to make last group that finishes go up and process the treetip + if (rootProcThread) + { + + mem_fence_gpu_invalidate(); + uint finished_cnt = atomic_inc_global((global uint*) & bvh->refitTreeletCnt2); + should_we_process_treetip = finished_cnt + 1 == bottoms_cnt; + + * l_should_we_process_treetip = should_we_process_treetip; + + if (should_we_process_treetip) mem_fence_gpu_invalidate(); + } +#if REFIT_VERBOSE_LOG + } +#endif + work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group); + + should_we_process_treetip = *l_should_we_process_treetip; + } + + if (should_we_process_treetip) + { + //this group will process treetip + if (get_local_id(0) == 0) { bvh->refitTreeletCnt2 = 0; } + if (bottoms_cnt == 1) { bottoms_cnt = 0; } + refit_treelet_by_single_group( + bbox_scratch, + &loc, + bvh, + pTrltDsc[bottoms_cnt], + true, + true); + } +} + +/* + This kernel refit a BVH. The algorithm iterates over all BVH nodes + to find all leaf nodes, which is where refitting starts. For these + leaf nodes bounds get recalculated and then propagates up the tree. + + One kernel instance considers exactly one inner_node startpoint. + not range of inner nodes. 
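+  The dispatch is sized to the inner-node count and each work item handles a
+  single node; work items whose node is not a usable bottom startpoint (that
+  is, refit_bottom reports zero children) fall through without touching the
+  tree.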
+ */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(8, 1, 1))) void kernel +Refit_per_one_startpoint( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, + global struct AABB3f* instance_leaf_aabbs ) +{ + /* here we temporarily store the bounds for the children of a node */ + struct AABB childrenAABB[BVH_NODE_N6]; + + /* get pointer to inner nodes and back pointers */ + global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + /* get the inner node that we will consider as a bottom startpoint */ + const uint numInnerNodes = BVHBase_numNodes(bvh); + const uint innerNodeIdx = (get_group_id(0) + 0) * get_local_size(0) + get_local_id(0); + + if (innerNodeIdx >= numInnerNodes) return; + + global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx]; + uint numChildren = refit_bottom( + bvh, + geosArray, + instance_leaf_aabbs, + curNode, + childrenAABB, + *InnerNode_GetBackPointer(backPointers, innerNodeIdx)); + + if (numChildren != 0) + { + /* update bounds of node */ + QBVHNodeN_setBounds(curNode, childrenAABB, numChildren); + + /* refit upper parts of the BVH */ + /* TODO: this will not gonna work for mixed nodes */ + refit_bottom_up(curNode, bvh, childrenAABB, numChildren); + } +} + +#endif + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel +Refit_indirect_sg( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, + global struct AABB3f* instance_leaf_aabbs) +{ + DO_Refit_per_one_startpoint_sg(bvh, geosArray, instance_leaf_aabbs, 0); + +} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_refit.h new file mode 100644 index 00000000000..522a44b23a7 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_refit.h @@ -0,0 +1,546 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "common.h" +#include "api_interface.h" +#include "instance.h" +#include "GRLGen12.h" +#include "libs/lsc_intrinsics.h" + + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +DO_update_instance_leaves(global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch, + uint id , + global struct GRL_RAYTRACING_AABB* procedural_box +) +{ + + global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray = + (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray; + global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray = + (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr; + + global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh); + + + /* iterate over all children of the instance node and get their bounds */ + + uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex(&leafs[id]); + global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL; + if (dxrInstancesArray != NULL) + instance = &instancesArray[instanceIdx]; + else + instance = instancesPtrArray[instanceIdx]; + + uint mask = GRL_get_InstanceMask(instance); + uint offset = NO_NODE_OFFSET; + + struct AffineSpace3f xfm = AffineSpace3f_load_row_major(instance->Transform); + struct AABB3f bbox; + + if (procedural_box != 0) + { + bbox.lower[0] = procedural_box->MinX; + bbox.lower[1] = procedural_box->MinY; + bbox.lower[2] = procedural_box->MinZ; + bbox.upper[0] = procedural_box->MaxX; + bbox.upper[1] = 
procedural_box->MaxY; + bbox.upper[2] = procedural_box->MaxZ; + } + else + { + global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure; + bbox = instanceBvh->Meta.bounds; + offset = BVH_ROOT_NODE_OFFSET; + } + + + const bool valid_min = isfinite(bbox.lower[0]) && isfinite(bbox.lower[1]) && isfinite(bbox.lower[2]); + const bool valid_max = isfinite(bbox.upper[0]) && isfinite(bbox.upper[1]) && isfinite(bbox.upper[2]); + + if (!valid_min || !valid_max ) + { + bbox.lower[0] = xfm.p.x; + bbox.lower[1] = xfm.p.y; + bbox.lower[2] = xfm.p.z; + bbox.upper[0] = xfm.p.x; + bbox.upper[1] = xfm.p.y; + bbox.upper[2] = xfm.p.z; + offset = NO_NODE_OFFSET; + mask = 0; + } + else + { + bbox = AABB3f_transform(xfm, bbox); // JDB TODO: Use faster abs-matrix method + } + + instance_aabb_scratch[id] = bbox; + + HwInstanceLeaf_Constructor(&leafs[id], instance, instanceIdx, offset, mask); // TODO: No instance opening for refittable BVH +} + +/* + This function starts at some BVH node and refits all nodes upwards + to the root. At some node the algorithm only proceeds upwards if + all children of the current node have already been processed. This + is checked as each time a node is reached an atomic counter is + incremented, which will reach the number of children of the node at + some time. + */ + +GRL_INLINE void refit_bottom_up(global struct QBVHNodeN *qnode_start, // start node to refit (already processed) + global struct BVHBase *bvh, // pointer to BVH + struct AABB *childrenAABB, // temporary data to use + uint numChildrenTotal) +{ + global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + /* compute the index of the start node */ + uint curNodeIndex = qnode_start - nodeData; + + /* the start node got already processed, thus go to its parent node */ + curNodeIndex = *InnerNode_GetBackPointer(backPointers,curNodeIndex) >> 6; + + /* end at root node */ + while (curNodeIndex != 0x03FFFFFF) + { + /* increment refit counter that counts refitted children of current node */ + const uint parentPointer = 1 + atomic_inc_global( (__global uint *) InnerNode_GetBackPointer(backPointers, curNodeIndex)); + + /* if all children got refitted, then continue */ + const uint numChildrenRefitted = (parentPointer >> 0) & 0x7; + numChildrenTotal = (parentPointer >> 3) & 0x7; + if (numChildrenRefitted != numChildrenTotal) + return; + + /* reset refit counter for next refit */ + *InnerNode_GetBackPointer(backPointers, curNodeIndex) &= 0xfffffff8; + + /* get bounds of all children from child nodes directly */ + global struct QBVHNodeN *qnode = nodeData + curNodeIndex; + global struct QBVHNodeN *qnode_child = (global struct QBVHNodeN *)QBVHNodeN_childrenPointer(qnode); + for (uint k = 0; k < numChildrenTotal; k++) + childrenAABB[k] = getAABB_QBVHNodeN(qnode_child + k); + + /* update node bounds of all children */ + QBVHNodeN_setBounds(qnode, childrenAABB, numChildrenTotal); + + write_mem_fence(CLK_GLOBAL_MEM_FENCE); + + /* make parent node the current node */ + curNodeIndex = parentPointer >> 6; + } + + /* update QBVH6 bounds */ + struct AABB bounds; + AABB_init(&bounds); + + for (uint i = 0; i < numChildrenTotal; i++) + AABB_extend(&bounds, &childrenAABB[i]); + + setBVHBaseBounds(bvh, &bounds); +} + + +GRL_INLINE void SUBGROUP_refit_bottom_up( + uniform global struct QBVHNodeN* qnode_start, // start node to refit (already processed) + uniform global struct BVHBase* bvh, // pointer to BVH + varying struct AABB reduce_bounds, + 
uniform uint numChildrenTotal, + varying ushort lane, + varying ushort head_lane) +{ + uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + + /* compute the index of the start node */ + uniform uint curNodeIndex = qnode_start - nodeData; + + /* the start node got already processed, thus go to its parent node */ + uniform curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6; + + varying struct AABB childrenAABB; + + /* end at root node */ + while ( curNodeIndex != 0x03FFFFFF ) + { + mem_fence_gpu_invalidate(); + + /* increment refit counter that counts refitted children of current node */ + uniform uint parentPointer = 1; + if (lane == 0) + { + // acquire fence ensures that all previous writes complete before the atomic starts + parentPointer += atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, curNodeIndex)); + } + + parentPointer = intel_sub_group_shuffle( parentPointer, head_lane ); + + /* if all children got refitted, then continue */ + uniform uint numChildrenRefitted = (parentPointer >> 0) & 0x7; + numChildrenTotal = (parentPointer >> 3) & 0x7; + if ( numChildrenRefitted != numChildrenTotal ) + return; + + /* reset refit counter for next refit */ + if (lane == 0) + { + *InnerNode_GetBackPointer(backPointers, curNodeIndex) = (parentPointer & 0xfffffff8); + } + + /* get bounds of all children from child nodes directly */ + global struct QBVHNodeN* qnode = nodeData + curNodeIndex; + global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode ); + + varying ushort child_idx = (lane < numChildrenTotal) ? lane : 0; + childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx ); + + /* update node bounds of all children */ + reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB ); + reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, head_lane ); + + subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildrenTotal, lane); + + /* update node mask */ + uchar childrenMask = qnode_child[child_idx].instMask; + + qnode->instMask = sub_group_reduce_or_N6(childrenMask); + + /* make parent node the current node */ + curNodeIndex = parentPointer >> 6; + } + + /* update QBVH6 bounds */ + + if( lane == 0 ) + setBVHBaseBounds( bvh, &reduce_bounds ); +} + + +GRL_INLINE void quadCopyVertices( + const struct QuadLeaf* pQuad, + struct QuadLeaf* newQuad) +{ + const uint4* s = (const uint4*) & (pQuad->v[0][0]); + uint4* d = (uint4*) & (newQuad->v[0][0]); + const uint8* s2 = (const uint8*)(s+1); + uint8* d2 = (uint8*)(d+1); + *d = *s; + *d2 = *s2; +} + + +GRL_INLINE void get_updated_quad( + global const struct QuadLeaf* pQuad, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDescs, + struct QuadLeaf* newQuad) +{ + struct QuadLeaf tempQuad; + + // fetch non vtx data; + { + uint4* tempQuad4U = (uint4*)&tempQuad; + global const uint4* pQuad4U = (global const uint4*)pQuad; + *tempQuad4U = *pQuad4U; + } + + /* get the geomID and primID0/1 for both quad triangles */ + const uint geomID = PrimLeaf_GetGeoIndex(&tempQuad.leafDesc); + const uint primID0 = tempQuad.primIndex0; + const uint primID1 = tempQuad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&tempQuad); + ushort fourth_vert = 0; + + if (primID1 != primID0) + { + ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(&tempQuad); + fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert; + fourth_vert = ((packed_indices & 0x30) == 0x30) ? 
2 : fourth_vert; + } + + global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDescs + geomID; + + uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert); + + // read the indices of the 4 verts we want + float3 vtx0, vtx1, vtx2, vtx3; + GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices); + + QuadLeaf_SetVertices(&tempQuad, vtx0, vtx1, vtx2, vtx3); + + *newQuad = tempQuad; +} + +// This calculates children BBs for innerNode having *all* children leafs. +// mixed nodes will be updated by passing through bottom-up thread. +GRL_INLINE uint refit_bottom( global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global struct AABB3f* instance_leaf_aabbs, + global struct QBVHNodeN* curNode, + struct AABB *childrenAABB, + uint backPointer) +{ + uint numChildren = 0; + + /* we start refit at leaf nodes, this case is for quad nodes */ + if (curNode->type == BVH_QUAD_NODE) + { + global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode); + + /* iterate over all quads of the quad node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + for (uint k = 0; k < numChildren; k++) + { + struct QuadLeaf Q; + get_updated_quad(&quads[k], geomDesc, &Q); + quadCopyVertices(&Q, &quads[k]); + childrenAABB[k] = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad + } + } + + /* we start refit at leaf nodes, this case is for procedural nodes */ + else if (curNode->type == BVH_PROCEDURAL_NODE) + { + global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode); + + /* iterate over all children of the procedural node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + for (uint k = 0; k < numChildren; k++) + { + /* extract geomID and primID from leaf */ + const uint startPrim = QBVHNodeN_startPrim(curNode, k); + const uint geomID = ProceduralLeaf_geomIndex(leaf); + const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf! + + /* read bounds from geometry descriptor */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); + childrenAABB[k].lower.x = aabb.MinX; + childrenAABB[k].lower.y = aabb.MinY; + childrenAABB[k].lower.z = aabb.MinZ; + childrenAABB[k].upper.x = aabb.MaxX; + childrenAABB[k].upper.y = aabb.MaxY; + childrenAABB[k].upper.z = aabb.MaxZ; + + /* advance leaf pointer to next child */ + leaf += QBVHNodeN_blockIncr(curNode, k); + } + } + + /* we start refit at leaf nodes, this case is for instance nodes */ + else if (curNode->type == BVH_INSTANCE_NODE) + { + global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode); + global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh ); + + /* iterate over all children of the instance node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + for (uint k = 0; k < numChildren; k++) + { + uint leafindex = (instancesLeaves + k) - leafBase; + childrenAABB[k].lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] ); + childrenAABB[k].upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] ); + } + } + + return numChildren; +} + + + + + +// This calculates children BBs for innerNode having *all* children leafs. +// mixed nodes will be updated by passing through bottom-up thread. 
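/*
   A minimal illustrative sketch (hypothetical helper, not part of this patch):
   refit_bottom above and the subgroup variant that follows both read the child
   count of a node out of its backpointer. The bit layout assumed here matches
   what refit_bottom_up uses: parent index in bits [31:6], child count in bits
   [5:3], and the "children refitted so far" counter in bits [2:0].
*/
GRL_INLINE void BackPointer_decode_sketch( uint backPointer,
                                           uint* parentIndex,
                                           uint* numChildren,
                                           uint* numRefitted )
{
    *parentIndex = backPointer >> 6;          /* 0x03FFFFFF marks the root (no parent)   */
    *numChildren = (backPointer >> 3) & 0x7;  /* same field the refit kernels mask out   */
    *numRefitted = backPointer & 0x7;         /* reset to 0 once a node has been refit   */
}

/* The subgroup variant below reads the same fields, but fetches one child AABB per lane: */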
+GRL_INLINE uint SUBGROUP_refit_bottom( + uniform global struct BVHBase* bvh, + uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + uniform global struct AABB3f* instance_leaf_aabbs, + uniform global struct QBVHNodeN* curNode, + uniform uint backPointer, + varying struct AABB* childrenAABB, + varying uchar* childrenMask, + varying ushort lane, + global uchar* is_procedural_instance + ) +{ + uniform uint numChildren = 0; + bool enable_procedural_instance = (is_procedural_instance != 0); + + /* we start refit at leaf nodes, this case is for quad nodes */ + if (curNode->type == BVH_QUAD_NODE) + { + /* iterate over all quads of the quad node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + + uniform global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode); + + struct QuadLeaf Q; + if (lane < numChildren) + { + get_updated_quad(&quads[lane], geomDesc, &Q); + + *childrenAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad + + quadCopyVertices(&Q, &quads[lane]); + *childrenMask = 0xff; + } + // FIXME: support leaves with more than one quad + } + + /* we start refit at leaf nodes, this case is for procedural nodes */ + else if (curNode->type == BVH_PROCEDURAL_NODE) + { + uniform global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode); + + + + /* iterate over all children of the procedural node and get their bounds */ + numChildren = (backPointer >> 3) & 0x7; + + varying uint incr = (lane < numChildren) ? InternalNode_GetChildBlockIncr((struct InternalNode*)curNode, lane) : 0; + incr = sub_group_scan_exclusive_add(incr); + + if( lane < numChildren ) + { + /* extract geomID and primID from leaf */ + varying uint start_prim = InternalNode_GetChildStartPrim((struct InternalNode*)curNode, lane ); + varying global struct ProceduralLeaf* my_leaf = leaf + incr; + const uint geomID = ProceduralLeaf_geomIndex(my_leaf); + const uint primID = ProceduralLeaf_primIndex(my_leaf, start_prim); + + /* read bounds from geometry descriptor */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); + childrenAABB->lower.x = aabb.MinX; + childrenAABB->lower.y = aabb.MinY; + childrenAABB->lower.z = aabb.MinZ; + childrenAABB->upper.x = aabb.MaxX; + childrenAABB->upper.y = aabb.MaxY; + childrenAABB->upper.z = aabb.MaxZ; + *childrenMask = 0xff; + } + } + + /* we start refit at leaf nodes, this case is for instance nodes */ + else if ( !enable_procedural_instance && curNode->type == BVH_INSTANCE_NODE) + { + uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode); + uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh); + + /* iterate over all children of the instance node and get their bounds and masks */ + numChildren = (backPointer >> 3) & 0x7; + if( lane < numChildren ) + { + uint leafindex = (instancesLeaves + lane) - leafBase; + childrenAABB->lower.xyz = AABB3f_load_lower(&instance_leaf_aabbs[leafindex]); + childrenAABB->upper.xyz = AABB3f_load_upper(&instance_leaf_aabbs[leafindex]); + *childrenMask = HwInstanceLeaf_GetInstanceMask(&leafBase[leafindex]); + } + } + else if (enable_procedural_instance && curNode->type == BVH_INTERNAL_NODE) + { + // Handle procedural-instance leaves + // TODO: Generalize this! 
Should re-write the kernel to work with arbitrary mixed-mode leaves + + numChildren = (backPointer >> 3) & 0x7; + uint childType = BVH_INTERNAL_NODE; + if ( lane < numChildren ) + { + childType = InternalNode_GetChildType( (struct InternalNode*)curNode, lane ); + if (childType != BVH_INTERNAL_NODE) + { + uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer( curNode ); + uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh ); + uint leafindex = (instancesLeaves + lane) - leafBase; + childrenAABB->lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] ); + childrenAABB->upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] ); + *childrenMask = HwInstanceLeaf_GetInstanceMask( &leafBase[leafindex] ); + + // see if the child has flipped from procedural to non-procedural and update the child type field as needed + uint instanceIndex = HwInstanceLeaf_GetInstanceIndex( &leafBase[leafindex] ); + uint newChildType = is_procedural_instance[instanceIndex] ? BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE; + if (newChildType != childType) + { + InternalNode_SetChildType( (struct InternalNode*)curNode, lane, newChildType ); + } + } + } + + + // don't ascend the tree for a true internal node + if (sub_group_all(childType == BVH_INTERNAL_NODE)) + numChildren = 0; + } + + return numChildren; +} + +#define SG_REFIT_WG_SIZE 8 + +void DO_Refit_per_one_startpoint_sg( + global struct BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, + global struct AABB3f* instance_leaf_aabbs, + global uchar* is_procedural_instance ) +{ + /* get pointer to inner nodes and back pointers */ + global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + /* get the inner node that we will consider as a bottom startpoint */ + const uint numInnerNodes = BVHBase_numNodes(bvh); + const uint innerNodeIdx = get_sub_group_global_id(); + + varying ushort lane = get_sub_group_local_id(); + + if (innerNodeIdx >= numInnerNodes) return; + + varying struct AABB childrenAABB; // one child AABB per lane + AABB_init(&childrenAABB); + + varying uchar childrenMask = 0; // one child mask per lane + + global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx]; + uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + uint numChildren = SUBGROUP_refit_bottom( + bvh, + geosArray, + instance_leaf_aabbs, + curNode, + backPointer, + &childrenAABB, + &childrenMask, + lane, + is_procedural_instance + ); + + + if (numChildren != 0) + { + /* update bounds of node */ + struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&childrenAABB); + reduce_bounds = AABB_sub_group_shuffle(&reduce_bounds, 0); + subgroup_QBVHNodeN_setBounds(curNode, reduce_bounds, childrenAABB, numChildren, lane); + + /* update mask of node */ + uchar mask = sub_group_reduce_or_N6(childrenMask); + curNode->instMask = mask; + + /* Leave this fence for now for all threads, if WG size is increased (tried 128) and fence is done + only by the first thread (similar to morton phase1) the machine hangs. 
*/ + mem_fence_gpu_invalidate(); + + /* refit upper parts of the BVH */ + /* TODO: this will not gonna work for mixed nodes */ + SUBGROUP_refit_bottom_up(curNode, bvh, reduce_bounds, numChildren, lane, 0); + } +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl b/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl new file mode 100644 index 00000000000..0a4bd3466af --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl @@ -0,0 +1,1917 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "common.h" +#include "instance.h" + +#define DBG(x) + +#define ENABLE_CHECKS 0 + +#define ENABLE_32BINS_IN_BREADTH_FIRST_PHASE 1 + +/* todo: */ +/* - new cross WG code path for first splits */ +/* - optimize find best child loop sequence */ +/* - subgroup_setQBVHNodeN needs work on 6 slots in parallel */ + +#define DIVIDE_BY_6 1 + +inline uint getNumPrims(struct BuildRecord *buildRecord) +{ + return buildRecord->end - buildRecord->start; +} + +inline void printBuildRecord(struct BuildRecord *record) +{ + printf("centroidBounds\n"); + AABB_print(&record->centroidBounds); + printf("start %d end %d size %d depth %d \n", record->start, record->end, record->end - record->start, getBuildRecursionDepth(record)); +} + +inline void printBinInfo2(struct BinInfo2 *record) +{ + printf("boundsX[%d]\n", BINS * 2); + for (uint b = 0; b < BINS * 2; b++) + { + AABB3f_print(&record->boundsX[b]); + printf("counts.x = %d\n", record->counts[b].x); + } + printf("boundsY[%d]\n", BINS * 2); + for (uint b = 0; b < BINS * 2; b++) + { + AABB3f_print(&record->boundsY[b]); + printf("counts.y = %d\n", record->counts[b].y); + } + printf("boundsZ[%d]\n", BINS * 2); + for (uint b = 0; b < BINS * 2; b++) + { + AABB3f_print(&record->boundsZ[b]); + printf("counts.z = %d\n", record->counts[b].z); + } +} + +inline void initBinMapping(struct BinMapping *binMapping, struct AABB *centBounds, const uint bins) +{ + const float4 eps = 1E-34f; + const float4 diag = max(eps, centBounds->upper - centBounds->lower); + const float4 scale = (float4)(0.99f * (float)bins) / diag; + binMapping->scale = select((float4)(0.0f), scale, diag > eps); + binMapping->ofs = centBounds->lower; +} + +inline void atomicExtendLocalBuildRecord(local struct BuildRecord *buildRecord, global struct AABB *primref) +{ + const float4 centroid2 = primref->lower + primref->upper; + AABB_local_atomic_merge(&buildRecord->centroidBounds, centroid2, centroid2); +} + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + +inline void initBinInfo(struct BinInfo *binInfo) +{ + for (uint i = 0; i < BINS; i++) + { + AABB3f_init(&binInfo->boundsX[i]); + AABB3f_init(&binInfo->boundsY[i]); + AABB3f_init(&binInfo->boundsZ[i]); + binInfo->counts[i] = (uint3)(0); + } +} + +inline void subgroup_initBinInfo(struct BinInfo *binInfo) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + for (uint i = subgroupLocalID; i < BINS; i += subgroup_size) + { + AABB3f_init(&binInfo->boundsX[i]); + AABB3f_init(&binInfo->boundsY[i]); + AABB3f_init(&binInfo->boundsZ[i]); + binInfo->counts[i] = (uint3)(0); + } +} + +inline void parallel_initBinInfo(struct BinInfo *binInfo) +{ + const uint localID = 
get_local_id(0); + if (localID < BINS) + { + AABB3f_init(&binInfo->boundsX[localID]); + AABB3f_init(&binInfo->boundsY[localID]); + AABB3f_init(&binInfo->boundsZ[localID]); + binInfo->counts[localID] = (uint3)(0); + } +} + +inline void atomicUpdateLocalBinInfo(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref) +{ + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); + AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper); + AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper); + AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper); + atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); + atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1); + atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); +} + +inline void atomicUpdateLocalBinInfo_nocheck(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref) +{ + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); + AABB3f_atomic_merge_local_nocheck(&binInfo->boundsX[i.x], lower, upper); + AABB3f_atomic_merge_local_nocheck(&binInfo->boundsY[i.y], lower, upper); + AABB3f_atomic_merge_local_nocheck(&binInfo->boundsZ[i.z], lower, upper); + atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); + atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1); + atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); +} + +inline void updateBins(struct BinMapping *binMapping, struct BinInfo *binInfo, global struct AABB *primref) +{ + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); + AABB3f_extendlu(&binInfo->boundsX[i.x], lower.xyz, upper.xyz); + AABB3f_extendlu(&binInfo->boundsY[i.y], lower.xyz, upper.xyz); + AABB3f_extendlu(&binInfo->boundsZ[i.z], lower.xyz, upper.xyz); + binInfo->counts[i.x].x++; + binInfo->counts[i.y].y++; + binInfo->counts[i.z].z++; +} + +// ===================================================================================================================== +// ===================================================================================================================== +// ===================================================================================================================== + +inline void parallel_initBinInfo2(struct BinInfo2 *binInfo, const uint bins) +{ + const uint localID = get_local_id(0); + if (localID < bins) + { + AABB3f_init(&binInfo->boundsX[localID]); + AABB3f_init(&binInfo->boundsY[localID]); + AABB3f_init(&binInfo->boundsZ[localID]); + binInfo->counts[localID] = (uint3)(0); + } +} + +inline void atomicUpdateLocalBinInfo2(struct BinMapping *binMapping, local struct BinInfo2 *binInfo, global struct AABB *primref) +{ + const float4 lower = primref->lower; + const float4 upper = primref->upper; + const float4 p = lower + upper; + const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); + AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper); + AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper); + AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper); + atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); + atomic_add((local uint 
*)&binInfo->counts[i.y] + 1, 1); + atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); +} + +inline void atomicUpdateGlobalFromLocalBinInfo2(global struct BinInfo2 *dest, local struct BinInfo2 *source, const uint bins) +{ + const uint localID = get_local_id(0); + if (localID < bins) + { + AABB3f_atomic_merge_global_local(&dest->boundsX[localID], &source->boundsX[localID]); + AABB3f_atomic_merge_global_local(&dest->boundsY[localID], &source->boundsY[localID]); + AABB3f_atomic_merge_global_local(&dest->boundsZ[localID], &source->boundsZ[localID]); + atomic_add((global uint *)&dest->counts[localID] + 0, source->counts[localID].x); + atomic_add((global uint *)&dest->counts[localID] + 1, source->counts[localID].y); + atomic_add((global uint *)&dest->counts[localID] + 2, source->counts[localID].z); + } +} + +inline uint subgroup_getMaxAreaChild(struct AABB *childrenAABB, const uint numChildren) +{ + const uint subgroupLocalID = get_sub_group_local_id(); +#if 0 + /*! find best child to split */ + const float area = (subgroupLocalID < numChildren) & (as_uint(childrenAABB[subgroupLocalID].upper.w) > cfg_minLeafSize) ? childrenAABB[subgroupLocalID].lower.w : -(float)INFINITY; + const float maxArea = sub_group_reduce_max(area); + const uint mask = intel_sub_group_ballot(area == maxArea); + const uint bestChild = maxArea != -(float)INFINITY ? ctz(mask) : -1; +#else + float bestArea = -(float)INFINITY; + int bestChild = -1; + for (int i = 0; i < numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (as_uint(childrenAABB[i].upper.w) <= cfg_minLeafSize) + continue; + + /* find child with largest surface area */ + if (childrenAABB[i].lower.w > bestArea) + { + bestChild = i; + bestArea = childrenAABB[i].lower.w; + } + } +#endif + return bestChild; +} + +inline bool AABB_verifyBounds(struct BuildRecord *buildRecord, struct AABB *geometryBounds, struct AABB *primref) +{ + const float4 centroid2 = primref->lower + primref->upper; + + if (centroid2.x < buildRecord->centroidBounds.lower.x) + return false; + if (centroid2.y < buildRecord->centroidBounds.lower.y) + return false; + if (centroid2.z < buildRecord->centroidBounds.lower.z) + return false; + + if (centroid2.x > buildRecord->centroidBounds.upper.x) + return false; + if (centroid2.y > buildRecord->centroidBounds.upper.y) + return false; + if (centroid2.z > buildRecord->centroidBounds.upper.z) + return false; + + if (primref->lower.x < geometryBounds->lower.x) + return false; + if (primref->lower.y < geometryBounds->lower.y) + return false; + if (primref->lower.z < geometryBounds->lower.z) + return false; + + if (primref->upper.x > geometryBounds->upper.x) + return false; + if (primref->upper.y > geometryBounds->upper.y) + return false; + if (primref->upper.z > geometryBounds->upper.z) + return false; + + return true; +} + +/* initialize primref index array */ +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +create_primref_index(global struct Globals *globals, + global struct AABB *primref, + global unsigned int *primref_index) +{ + const uint local_size = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + const uint localID = get_local_id(0); + + const uint startID = (taskID + 0) * globals->numPrimitives / numTasks; + const uint endID = (taskID + 1) * globals->numPrimitives / numTasks; + for (uint primID = startID + localID; primID < endID; primID += local_size) + primref_index[primID] = primID; +} + +// 
========================================================================================================== +// ========================================================================================================== +// ========================================================================================================== + +inline float left_to_right_area16(struct AABB3f *low) +{ + struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low); + return halfArea_AABB3f(&low_prefix); +} + +inline uint left_to_right_counts16(uint low) +{ + return sub_group_scan_exclusive_add(low); +} + +inline float right_to_left_area16(struct AABB3f *low) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + struct AABB3f low_reverse = AABB3f_sub_group_shuffle(low, ID); + struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse); + const float low_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID); + return low_area; +} + +inline uint right_to_left_counts16(uint low) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + const uint low_reverse = sub_group_broadcast(low, ID); + const uint low_prefix = sub_group_scan_inclusive_add(low_reverse); + return sub_group_broadcast(low_prefix, ID); +} + +inline float2 left_to_right_area32(struct AABB3f *low, struct AABB3f *high) +{ + struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low); + struct AABB3f low_reduce = AABB3f_sub_group_reduce(low); + struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max(high); + AABB3f_extend(&high_prefix, &low_reduce); + const float low_area = halfArea_AABB3f(&low_prefix); + const float high_area = halfArea_AABB3f(&high_prefix); + return (float2)(low_area, high_area); +} + +inline uint2 left_to_right_counts32(uint low, uint high) +{ + const uint low_prefix = sub_group_scan_exclusive_add(low); + const uint low_reduce = sub_group_reduce_add(low); + const uint high_prefix = sub_group_scan_exclusive_add(high); + return (uint2)(low_prefix, low_reduce + high_prefix); +} + +inline float2 right_to_left_area32(struct AABB3f *low, struct AABB3f *high) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + struct AABB3f low_reverse = AABB3f_sub_group_shuffle(high, ID); + struct AABB3f high_reverse = AABB3f_sub_group_shuffle(low, ID); + struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse); + struct AABB3f low_reduce = AABB3f_sub_group_reduce(&low_reverse); + struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max(&high_reverse); + AABB3f_extend(&high_prefix, &low_reduce); + const float low_area = sub_group_broadcast(halfArea_AABB3f(&high_prefix), ID); + const float high_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID); + return (float2)(low_area, high_area); +} + +inline uint2 right_to_left_counts32(uint low, uint high) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint ID = subgroup_size - 1 - subgroupLocalID; + const uint low_reverse = sub_group_broadcast(high, ID); + const uint high_reverse = sub_group_broadcast(low, ID); + const uint low_prefix = sub_group_scan_inclusive_add(low_reverse); + const uint 
low_reduce = sub_group_reduce_add(low_reverse); + const uint high_prefix = sub_group_scan_inclusive_add(high_reverse) + low_reduce; + return (uint2)(sub_group_broadcast(high_prefix, ID), sub_group_broadcast(low_prefix, ID)); +} + +inline ulong getBestSplit(float3 sah, uint ID, const float4 scale, const ulong defaultSplit) +{ + ulong splitX = (((ulong)as_uint(sah.x)) << 32) | ((uint)ID << 2) | 0; + ulong splitY = (((ulong)as_uint(sah.y)) << 32) | ((uint)ID << 2) | 1; + ulong splitZ = (((ulong)as_uint(sah.z)) << 32) | ((uint)ID << 2) | 2; + /* ignore zero sized dimensions */ + splitX = select(splitX, defaultSplit, (ulong)(scale.x == 0)); + splitY = select(splitY, defaultSplit, (ulong)(scale.y == 0)); + splitZ = select(splitZ, defaultSplit, (ulong)(scale.z == 0)); + ulong bestSplit = min(min(splitX, splitY), splitZ); + bestSplit = sub_group_reduce_min(bestSplit); + return bestSplit; +} + +inline uint fastDivideBy6_uint(uint v) +{ +#if 1 + const ulong u = (ulong)v >> 1; + return (uint)((u * 0x55555556ul) >> 32); +#else + return v / 6; +#endif +} + +inline uint3 fastDivideBy6_uint3(uint3 v) +{ + return (uint3)(fastDivideBy6_uint(v.x), fastDivideBy6_uint(v.y), fastDivideBy6_uint(v.z)); +} + +inline struct Split reduceBinsAndComputeBestSplit16(struct BinInfo *binInfo, const float4 scale, uint startID, uint endID) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + struct AABB3f boundsX = binInfo->boundsX[subgroupLocalID]; + + const float lr_areaX = left_to_right_area16(&boundsX); + const float rl_areaX = right_to_left_area16(&boundsX); + + struct AABB3f boundsY = binInfo->boundsY[subgroupLocalID]; + + const float lr_areaY = left_to_right_area16(&boundsY); + const float rl_areaY = right_to_left_area16(&boundsY); + + struct AABB3f boundsZ = binInfo->boundsZ[subgroupLocalID]; + + const float lr_areaZ = left_to_right_area16(&boundsZ); + const float rl_areaZ = right_to_left_area16(&boundsZ); + + const uint3 counts = binInfo->counts[subgroupLocalID]; + + const uint lr_countsX = left_to_right_counts16(counts.x); + const uint rl_countsX = right_to_left_counts16(counts.x); + const uint lr_countsY = left_to_right_counts16(counts.y); + const uint rl_countsY = right_to_left_counts16(counts.y); + const uint lr_countsZ = left_to_right_counts16(counts.z); + const uint rl_countsZ = right_to_left_counts16(counts.z); + + const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ); + const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ); + +#if DIVIDE_BY_6 == 0 + const uint blocks_shift = SAH_LOG_BLOCK_SHIFT; + uint3 blocks_add = (uint3)((1 << blocks_shift) - 1); + const uint3 lr_count = ((uint3)(lr_countsX, lr_countsY, lr_countsZ) + blocks_add) >> blocks_shift; + const uint3 rl_count = ((uint3)(rl_countsX, rl_countsY, rl_countsZ) + blocks_add) >> blocks_shift; +#else + const uint3 lr_count = fastDivideBy6_uint3((uint3)(lr_countsX, lr_countsY, lr_countsZ) + BVH_NODE_N6 - 1); + const uint3 rl_count = fastDivideBy6_uint3((uint3)(rl_countsX, rl_countsY, rl_countsZ) + BVH_NODE_N6 - 1); +#endif + float3 sah = fma(lr_area, convert_float3(lr_count), rl_area * convert_float3(rl_count)); + + /* first bin is invalid */ + + sah.x = select((float)(INFINITY), sah.x, subgroupLocalID != 0); + sah.y = select((float)(INFINITY), sah.y, subgroupLocalID != 0); + sah.z = select((float)(INFINITY), sah.z, subgroupLocalID != 0); + + const uint mid = (startID + endID) / 2; + const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0; 
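    /*
       A note on the reduction that follows: getBestSplit() packs each lane's
       candidate into one 64-bit key -- the SAH cost bit-cast to uint in bits
       [63:32], the bin position in bits [31:2] of the low word, and the
       dimension in bits [1:0] -- so a single sub_group_reduce_min() picks the
       cheapest split and breaks ties by position/dimension. Reinterpreting the
       float cost as an unsigned integer preserves its ordering because all
       costs here are non-negative IEEE floats. For example (values assumed,
       not measured), a key of (((ulong)as_uint(4.0f)) << 32) | (5u << 2) | 1
       decodes below to sah = 4.0f, pos = 5, dim = 1 (the Y axis).
    */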
+ + const ulong bestSplit = getBestSplit(sah, subgroupLocalID, scale, defaultSplit); + + struct Split split; + split.sah = as_float((uint)(bestSplit >> 32)); + split.dim = (uint)bestSplit & 3; + split.pos = (uint)bestSplit >> 2; + + return split; +} + +inline struct Split reduceBinsAndComputeBestSplit32(struct BinInfo2 *binInfo, const float4 scale, uint startID, uint endID) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + struct AABB3f boundsX_low = binInfo->boundsX[subgroupLocalID]; + struct AABB3f boundsX_high = binInfo->boundsX[subgroupLocalID + subgroup_size]; + + const float2 lr_areaX = left_to_right_area32(&boundsX_low, &boundsX_high); + const float2 rl_areaX = right_to_left_area32(&boundsX_low, &boundsX_high); + + struct AABB3f boundsY_low = binInfo->boundsY[subgroupLocalID]; + struct AABB3f boundsY_high = binInfo->boundsY[subgroupLocalID + subgroup_size]; + + const float2 lr_areaY = left_to_right_area32(&boundsY_low, &boundsY_high); + const float2 rl_areaY = right_to_left_area32(&boundsY_low, &boundsY_high); + + struct AABB3f boundsZ_low = binInfo->boundsZ[subgroupLocalID]; + struct AABB3f boundsZ_high = binInfo->boundsZ[subgroupLocalID + subgroup_size]; + + const float2 lr_areaZ = left_to_right_area32(&boundsZ_low, &boundsZ_high); + const float2 rl_areaZ = right_to_left_area32(&boundsZ_low, &boundsZ_high); + + const uint3 counts_low = binInfo->counts[subgroupLocalID]; + const uint3 counts_high = binInfo->counts[subgroupLocalID + subgroup_size]; + + const uint2 lr_countsX = left_to_right_counts32(counts_low.x, counts_high.x); + const uint2 rl_countsX = right_to_left_counts32(counts_low.x, counts_high.x); + const uint2 lr_countsY = left_to_right_counts32(counts_low.y, counts_high.y); + const uint2 rl_countsY = right_to_left_counts32(counts_low.y, counts_high.y); + const uint2 lr_countsZ = left_to_right_counts32(counts_low.z, counts_high.z); + const uint2 rl_countsZ = right_to_left_counts32(counts_low.z, counts_high.z); + + const uint blocks_shift = SAH_LOG_BLOCK_SHIFT; + uint3 blocks_add = (uint3)((1 << blocks_shift) - 1); + + /* low part: bins 0..15 */ + const float3 lr_area_low = (float3)(lr_areaX.x, lr_areaY.x, lr_areaZ.x); + const float3 rl_area_low = (float3)(rl_areaX.x, rl_areaY.x, rl_areaZ.x); + +#if DIVIDE_BY_6 == 0 + const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x) + blocks_add) >> blocks_shift; + const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x) + blocks_add) >> blocks_shift; + +#else + //const uint3 lr_count_low = ((uint3)(lr_countsX.x,lr_countsY.x,lr_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6; + //const uint3 rl_count_low = ((uint3)(rl_countsX.x,rl_countsY.x,rl_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6; + + /* skip blocks for breadth-first phase */ + const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x)); + const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x)); + +#endif + + float3 sah_low = fma(lr_area_low, convert_float3(lr_count_low), rl_area_low * convert_float3(rl_count_low)); + + /* first bin is invalid */ + // sah_low.x = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.x; + // sah_low.y = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.y; + // sah_low.z = (subgroupLocalID == 0) ? 
(float)(INFINITY) : sah_low.z; + + sah_low.x = select((float)(INFINITY), sah_low.x, subgroupLocalID != 0); + sah_low.y = select((float)(INFINITY), sah_low.y, subgroupLocalID != 0); + sah_low.z = select((float)(INFINITY), sah_low.z, subgroupLocalID != 0); + + /* high part: bins 16..31 */ + + const float3 lr_area_high = (float3)(lr_areaX.y, lr_areaY.y, lr_areaZ.y); + const float3 rl_area_high = (float3)(rl_areaX.y, rl_areaY.y, rl_areaZ.y); +#if DIVIDE_BY_6 == 0 + const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y) + blocks_add) >> blocks_shift; + const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y) + blocks_add) >> blocks_shift; +#else + //const uint3 lr_count_high = ((uint3)(lr_countsX.y,lr_countsY.y,lr_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6; + //const uint3 rl_count_high = ((uint3)(rl_countsX.y,rl_countsY.y,rl_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6; + + /* skip blocks for breadth-first phase */ + const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y)); + const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y)); + +#endif + const float3 sah_high = fma(lr_area_high, convert_float3(lr_count_high), rl_area_high * convert_float3(rl_count_high)); + + const uint mid = (startID + endID) / 2; + const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0; + + const ulong bestSplit_low = getBestSplit(sah_low, subgroupLocalID, scale, defaultSplit); + const ulong bestSplit_high = getBestSplit(sah_high, subgroupLocalID + subgroup_size, scale, defaultSplit); + const ulong bestSplit = min(bestSplit_low, bestSplit_high); + + struct Split split; + split.sah = as_float((uint)(bestSplit >> 32)); + split.dim = (uint)bestSplit & 3; + split.pos = (uint)bestSplit >> 2; + + return split; +} + +// ===================================================================== + +inline float leafSAH(float geometryArea, uint prims, uint block_shift) +{ + return geometryArea * convert_float((prims + (1 << block_shift) - 1) >> block_shift); +} + +inline bool is_left(struct BinMapping *binMapping, struct Split *split, struct AABB *primref) +{ + const uint dim = split->dim; + const float lower = primref->lower[dim]; + const float upper = primref->upper[dim]; + const float c = lower + upper; + const uint pos = convert_uint_rtz((c - binMapping->ofs[dim]) * binMapping->scale[dim]); + return pos < split->pos; +} + +inline void serial_find_split(global struct AABB *primref, + struct BinMapping *binMapping, + struct BuildRecord *buildRecord, + local struct Split *split, + local struct BinInfo *binInfo, + global uint *primref_index0, + global uint *primref_index1) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint startID = buildRecord->start; + const uint endID = buildRecord->end; + + subgroup_initBinInfo(binInfo); + + for (uint t = startID + subgroupLocalID; t < endID; t += subgroup_size) + { + const uint index = primref_index0[t]; + primref_index1[t] = index; + atomicUpdateLocalBinInfo_nocheck(binMapping, binInfo, &primref[index]); + } +} + +inline void serial_partition_index(global struct AABB *primref, + struct BinMapping *binMapping, + struct BuildRecord *buildRecord, + struct Split *inSplit, + struct BuildRecord *outLeft, + struct BuildRecord *outRight, + struct AABB *outGeometryBoundsLeft, + struct AABB *outGeometryBoundsRight, + global uint *primref_index0, + global uint *primref_index1) +{ + const uint localID = 
get_local_id(0); + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroupID = get_sub_group_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint begin = buildRecord->start; + const uint end = buildRecord->end; + struct Split split = *inSplit; + + struct BuildRecord left; + struct BuildRecord right; + initBuildRecord(&left, begin, end); + initBuildRecord(&right, begin, end); + + struct AABB leftAABB; + struct AABB rightAABB; + AABB_init(&leftAABB); + AABB_init(&rightAABB); + + global uint *l = primref_index0 + begin; + global uint *r = primref_index0 + end; + + /* no valid split, just split in the middle */ + if (split.sah == (float)(INFINITY)) + { + for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint count = sub_group_reduce_add(1); + extendBuildRecord(&left, &primref[index]); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + l[subgroupLocalID] = index; + l += count; + } + + for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint count = sub_group_reduce_add(1); + extendBuildRecord(&right, &primref[index]); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + r -= count; + r[subgroupLocalID] = index; + } + } + else + { + for (uint i = begin + subgroupLocalID; i < end; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; + const uint isRight = 1 - isLeft; + const uint countLeft = sub_group_reduce_add(isLeft); + const uint countRight = sub_group_reduce_add(isRight); + const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); + const uint prefixRight = sub_group_scan_exclusive_add(isRight); + + r -= countRight; + + if (isLeft) + { + extendBuildRecord(&left, &primref[index]); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + l[prefixLeft] = index; + } + else + { + extendBuildRecord(&right, &primref[index]); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + r[prefixRight] = index; + } + l += countLeft; + } + } + + left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); + right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); + leftAABB = AABB_sub_group_reduce(&leftAABB); + rightAABB = AABB_sub_group_reduce(&rightAABB); + + if (subgroupLocalID == 0) + { + uint pos = l - primref_index0; // single first thread needs to compute "pos" + left.end = pos; + right.start = pos; + + leftAABB.lower.w = AABB_halfArea(&leftAABB); + rightAABB.lower.w = AABB_halfArea(&rightAABB); + + leftAABB.upper.w = as_float(getNumPrimsBuildRecord(&left)); + rightAABB.upper.w = as_float(getNumPrimsBuildRecord(&right)); + + *outLeft = left; + *outRight = right; + *outGeometryBoundsLeft = leftAABB; + *outGeometryBoundsRight = rightAABB; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + +#if ENABLE_CHECKS == 1 + if (subgroupLocalID == 0) + { + if (AABB_verify(outLeft)) + { + printf("outLeft:\n"); + printBuildRecord(outLeft); + } + if (AABB_verify(outRight)) + { + printf("outRight:\n"); + printBuildRecord(outRight); + } + if (AABB_verify(outGeometryBoundsLeft)) + { + printf("outGeometryBoundsLeft:\n"); + AABB_print(outGeometryBoundsLeft); + } + if (AABB_verify(outGeometryBoundsRight)) + { + printf("outGeometryBoundsRight:\n"); + AABB_print(outGeometryBoundsRight); + } + + for (uint i = outLeft->start; i < outLeft->end; i++) + { + 
const uint index = primref_index0[i]; + if (split.sah != (float)(INFINITY) && !is_left(binMapping, inSplit, &primref[index])) + printf("check left %d \n", i); + if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index])) + printf("check prim ref bounds left %d \n", i); + } + for (uint i = outRight->start; i < outRight->end; i++) + { + const uint index = primref_index0[i]; + if (split.sah != (float)(INFINITY) && is_left(binMapping, inSplit, &primref[index])) + printf("check right %d \n", i); + if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index])) + printf("check prim ref bounds right %d \n", i); + } + } +#endif +} + +inline uint subgroup_createLeaf_index(global struct BlockAllocator *allocator, + const uint start, + const uint end, + global struct AABB *primref, + uint primID, + global char *bvh_mem, + unsigned leafSize) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + const uint items = end - start; + +#if ENABLE_CHECKS == 1 + if (items > BVH_LEAF_N_MAX) + printf("error items %d \n", items); +#endif + + // JDB TODO: Why was this code commented out?? + //uint offset = (subgroupLocalID == 0) ? alloc_leaf_mem(globals,sizeof(struct Quad)*items) : 0; + //offset = sub_group_broadcast(offset,0); + + //uint offset = globals->leaf_mem_allocator_start + start * leafSize; + uint offset = allocator->start + start * leafSize; + return offset; +} + +inline uint get_qnode_index_for_backptr(void *qnode_base, void *qnode) +{ + size_t offset = ((size_t)qnode - (size_t)qnode_base) / sizeof(struct QBVHNodeN); + uint offset_u = (uint)offset; +#if ENABLE_CHECKS + if ((size_t)((offset_u << 6) >> 6) != offset) + { + printf("get_qnode_index_for_backptr - index out of reach"); + } +#endif + return offset_u; +} + +struct SerialBuildRecurseTemplateConst +{ + unsigned leafSize; + unsigned leafType; + bool allocateBackpointers; +}; + +// ==================================================================================== +// ==================================================================================== +// ==================================================================================== +// ==================================================================================== +// ==================================================================================== + +inline void parallel_find_split(global struct AABB *primref, + local struct BuildRecord *buildRecord, + local struct Split *bestSplit, + local struct BinInfo *binInfo, + global uint *primref_index0, + global uint *primref_index1) +{ + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + const uint subgroupID = get_sub_group_id(); + + const uint startID = buildRecord->start; + const uint endID = buildRecord->end; + + struct BinMapping binMapping; + initBinMapping(&binMapping, &buildRecord->centroidBounds, BINS); + + /* init bininfo */ + parallel_initBinInfo(binInfo); + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + for (uint t = startID + localID; t < endID; t += local_size) + { + const uint index = primref_index0[t]; + primref_index1[t] = index; + atomicUpdateLocalBinInfo(&binMapping, binInfo, &primref[index]); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + /* find best dimension */ + + if (subgroupID == 0) + { + *bestSplit = reduceBinsAndComputeBestSplit16(binInfo, binMapping.scale, startID, endID); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); +} + +inline void parallel_find_split32(local uint *local_sync, + 
global struct AABB *primref, + local struct BuildRecord *buildRecord, + local struct Split *bestSplit, + local struct BinInfo2 *binInfo2, + global uint *primref_index0, + global uint *primref_index1) +{ + + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + const uint subgroupID = get_sub_group_id(); + const uint numSubGroups = get_num_sub_groups(); + const uint subgroupLocalID = get_sub_group_local_id(); + + const uint startID = buildRecord->start; + const uint endID = buildRecord->end; + + struct BinMapping binMapping; + initBinMapping(&binMapping, &buildRecord->centroidBounds, 2 * BINS); + + /* init bininfo */ + parallel_initBinInfo2(binInfo2, 2 * BINS); + + if (localID == 0) + *local_sync = 0; + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + for (uint t = startID + localID; t < endID; t += local_size) + { + const uint index = primref_index0[t]; + primref_index1[t] = index; + atomicUpdateLocalBinInfo2(&binMapping, binInfo2, &primref[index]); + } + + /* find best split position using the last subgroup */ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + uint syncID = subgroupLocalID == 0 ? generic_atomic_add(local_sync, 1) : 0; + syncID = sub_group_broadcast(syncID, 0); + + if (syncID + 1 == numSubGroups) + { + *bestSplit = reduceBinsAndComputeBestSplit32(binInfo2, binMapping.scale, startID, endID); + DBG(if (localID == 0) printSplit(bestSplit)); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); +} + +inline void parallel_partition_index(local uint *local_sync, + global struct AABB *primref, + struct BinMapping *binMapping, + const uint begin, + const uint end, + struct Split *inSplit, + local struct BuildRecord *outLeft, + local struct BuildRecord *outRight, + local struct AABB *outGeometryBoundsLeft, + local struct AABB *outGeometryBoundsRight, + global uint *primref_index0, + global uint *primref_index1, + uint *atomicCountLeft, + uint *atomicCountRight) +{ + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + const uint subgroupID = get_sub_group_id(); + const uint numSubGroups = get_num_sub_groups(); + const uint subgroup_size = get_sub_group_size(); + const uint subgroupLocalID = get_sub_group_local_id(); + + const uint size = end - begin; + struct Split split = *inSplit; + + /* init bin bounds */ + if (localID == 0) + { + initBuildRecord(outLeft, begin, end); + initBuildRecord(outRight, begin, end); + AABB_init(outGeometryBoundsLeft); + AABB_init(outGeometryBoundsRight); + *atomicCountLeft = 0; + *atomicCountRight = 0; + *local_sync = 0; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); // remove ? 
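    /*
       What follows is the cooperative partition step. If no valid split was
       found (split.sah == INFINITY), the first subgroup alone copies the two
       halves of a middle split. Otherwise every subgroup takes a slice of
       [begin,end), classifies each primitive with is_left(), and compacts the
       results with subgroup prefix sums, while atomicCountLeft /
       atomicCountRight hand out contiguous chunks of the output range: left
       primitives are written forward from 'begin', right primitives backward
       from 'end', so the final split position is begin + atomicCountLeft.
       Rough example with an assumed classification of L,R,L,L,R,R,L,R on the
       first eight lanes: those lanes contribute 4 to countLeft, the exclusive
       prefix sum gives the left-going lanes consecutive slots 0..3 inside the
       chunk this subgroup reserved on the left, and the right-going lanes get
       consecutive slots inside the chunk reserved at the shrinking right end.
    */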
+ + struct BuildRecord left; + struct BuildRecord right; + initBuildRecord(&left, begin, end); + initBuildRecord(&right, begin, end); + + struct AABB leftAABB; + struct AABB rightAABB; + AABB_init(&leftAABB); + AABB_init(&rightAABB); + + if (split.sah == (float)(INFINITY)) + { + if (subgroupID == 0) + { + for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size) + { + const uint index = primref_index1[i]; + extendBuildRecord(&left, &primref[index]); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + primref_index0[i] = index; + } + + for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size) + { + const uint index = primref_index1[i]; + extendBuildRecord(&right, &primref[index]); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + primref_index0[i] = index; + } + + left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); + right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); + leftAABB = AABB_sub_group_reduce(&leftAABB); + rightAABB = AABB_sub_group_reduce(&rightAABB); + + if (localID == 0) + { + outLeft->centroidBounds = left.centroidBounds; + outRight->centroidBounds = right.centroidBounds; + + *outGeometryBoundsLeft = leftAABB; + *outGeometryBoundsRight = rightAABB; + + outLeft->end = split.pos; + outRight->start = split.pos; + + outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft); + outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight); + outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft)); + outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight)); + } + } + } + else + { + + const int startID = begin + ((subgroupID + 0) * size / numSubGroups); + const int endID = begin + ((subgroupID + 1) * size / numSubGroups); + + for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; + const uint isRight = 1 - isLeft; + const uint countLeft = sub_group_reduce_add(isLeft); + const uint countRight = sub_group_reduce_add(isRight); + const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); + const uint prefixRight = sub_group_scan_exclusive_add(isRight); + + uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0; + offsetLeft = sub_group_broadcast(offsetLeft, 0); + uint offsetRight = subgroupLocalID == 0 ? 
generic_atomic_add(atomicCountRight, countRight) : 0; + offsetRight = sub_group_broadcast(offsetRight, 0); + + if (isLeft) + { + extendBuildRecord(&left, &primref[index]); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + primref_index0[begin + offsetLeft + prefixLeft] = index; + } + else + { + extendBuildRecord(&right, &primref[index]); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + primref_index0[end - (offsetRight + countRight) + prefixRight] = index; + } + } + left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); + right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); + leftAABB = AABB_sub_group_reduce(&leftAABB); + rightAABB = AABB_sub_group_reduce(&rightAABB); + + AABB_local_atomic_merge(&outLeft->centroidBounds, left.centroidBounds.lower, left.centroidBounds.upper); + AABB_local_atomic_merge(&outRight->centroidBounds, right.centroidBounds.lower, right.centroidBounds.upper); + + AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper); + AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (subgroupLocalID == 0) + { + const uint sync = atomic_add(local_sync, 1); + if (sync + 1 == numSubGroups) + { + uint pos = begin + *atomicCountLeft; // single thread of last subgroup needs to compute "pos" + outLeft->end = pos; + outRight->start = pos; + + outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft); + outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight); + outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft)); + outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight)); + } + } + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + +#if ENABLE_CHECKS == 1 + if (localID == 0) + { + if (outLeft->end <= begin) + printf("pos begin error\n"); + if (outLeft->end > end) + printf("pos end error\n"); + + for (uint i = outLeft->start; i < outLeft->end; i++) + { + const uint index = primref_index0[i]; + //printf("left %d -> %d \n",i,index); + if (!is_left(binMapping, inSplit, &primref[index])) + printf("check left %d \n", i); + if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index])) + printf("check prim ref bounds left %d \n", i); + } + for (uint i = outRight->start; i < outRight->end; i++) + { + const uint index = primref_index0[i]; + //printf("right %d -> %d \n",i,index); + if (is_left(binMapping, inSplit, &primref[index])) + printf("check right %d \n", i); + if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index])) + printf("check prim ref bounds right %d \n", i); + } + } +#endif +} + + +#define ENABLE_LOOP_BREADTH_FIRST 0 +#if ENABLE_LOOP_BREADTH_FIRST +// TBD It might be that layout of this impact perf. 
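/*
   A minimal sketch (hypothetical helper, mirroring the loop in
   parallel_build_breadth_first_loopT below) of how this disabled breadth-first
   path distributes work: a workgroup first drains build records it produced
   itself, kept on a small SLM stack, and only then pulls a fresh record from
   the global counter, terminating once the global queue is exhausted.
*/
inline bool breadth_first_pick_record_sketch( global struct Globals* globals,
                                              local uint* stack,
                                              local uint* stackSize,
                                              uint* recordID )
{
    if (*stackSize == 0)
    {
        /* no locally produced records left -> grab a new one globally */
        *recordID = generic_atomic_add( &globals->counter, 1 );
        return *recordID < globals->numBuildRecords; /* false means: exit the loop */
    }
    *stackSize -= 1;
    *recordID = stack[*stackSize];
    return true;
}

/* The per-workgroup state for that loop lives in the SLM block below
   (presumably the layout the TBD note above refers to). */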
+struct BreadthFirstLoopLocals +{ + struct BuildRecord local_current; +#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 + struct BinInfo binInfo; +#else + struct BinInfo2 binInfo; +#endif + struct Split split; + struct BuildRecord children[BVH_NODE_N + 1]; + struct AABB childrenAABB[BVH_NODE_N + 1]; + uint atomicCountLeft; + uint atomicCountRight; + uint local_sync; + uint recordID; + uint buildRecordIDs[BUILDRECORD_STACK_SIZE]; + uint numBuildRecordIDs; + bool exit; +}; + + +inline void parallel_build_breadth_first_loopT(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + uint subtreeThreshold, + local struct BreadthFirstLoopLocals *L, + struct BreadthFirstTemplateConst T) +{ + const uint global_size = get_global_size(0); + const uint local_size = get_local_size(0); + const uint localID = get_local_id(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + const uint subgroupID = get_sub_group_id(); + const uint subgroupLocalID = get_sub_group_local_id(); + + /* double buffered primref index array */ + global uint *primref_index0 = primref_index; + global uint *primref_index1 = primref_index + globals->numPrimitives; + + global struct BuildRecord *records = getBuildRecords(bvh_mem, globals); + +#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 + const uint bins = BINS; +#else + const uint bins = 2 * BINS; +#endif + + if (localID == 0) + { + L->numBuildRecordIDs = 0; + L->exit = false; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + while (1) + { + if (localID == 0) + { + if (L->numBuildRecordIDs == 0) + { + L->recordID = generic_atomic_add(&globals->counter, 1); + if (L->recordID >= globals->numBuildRecords) + L->exit = true; + } + else + { + L->numBuildRecordIDs--; + L->recordID = L->buildRecordIDs[L->numBuildRecordIDs]; + } + L->local_current = records[L->recordID]; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + /* no more buildrecords available ? */ + + if (L->exit) + break; + + local struct BuildRecord *current = &L->local_current; + const uint items = getNumPrims(current); + const uint depth = getBuildRecursionDepth(current); + + global unsigned int *num_records_output = &globals->numBuildRecords_extended; + + struct QBVHNodeN *qnode = (struct QBVHNodeN *)current->current; + + /* ignore small buildrecords */ + if (items < max(subtreeThreshold, cfg_minLeafSize)) + { + // do nothing + } + else + { + /*! find best split */ +#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 + parallel_find_split(primref, current, &L->split, &L->binInfo, primref_index0, primref_index1); +#else + parallel_find_split32(&L->local_sync, primref, current, &L->split, &L->binInfo, primref_index0, primref_index1); +#endif + uint numChildren = 2; + + /*! find best split */ + struct BinMapping binMapping; + initBinMapping(&binMapping, ¤t->centroidBounds, bins); + + parallel_partition_index(&L->local_sync, primref, &binMapping, current->start, current->end, &L->split, &L->children[0], &L->children[1], &L->childrenAABB[0], &L->childrenAABB[1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight); + + while (numChildren < BVH_NODE_N6) + { + /*! 
find best child to split */ + const uint bestChild = subgroup_getMaxAreaChild(L->childrenAABB, numChildren); + if (bestChild == -1) + break; + + /* perform best found split */ + local struct BuildRecord *brecord = &L->children[bestChild]; + local struct BuildRecord *lrecord = &L->children[numChildren + 0]; + local struct BuildRecord *rrecord = &L->children[numChildren + 1]; + +#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 + parallel_find_split(primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1); +#else + parallel_find_split32(&L->local_sync, primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1); +#endif + + initBinMapping(&binMapping, &brecord->centroidBounds, bins); + + parallel_partition_index(&L->local_sync, primref, &binMapping, brecord->start, brecord->end, &L->split, lrecord, rrecord, &L->childrenAABB[numChildren + 0], &L->childrenAABB[numChildren + 1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight); + + *brecord = *rrecord; + L->childrenAABB[bestChild] = L->childrenAABB[numChildren + 1]; + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + numChildren++; + } + + //sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (localID <= 16 && subgroupID == 0) + { + global struct BVHBase *bvh_base = (global struct BVHBase *)bvh_mem; + global struct QBVHNodeN *nodes_start = BVHBase_nodeData(bvh_base); + global uint *back_pointers = BVHBase_backPointers(bvh_base); + uint qnode_index = 0; + if (T.allocateBackpointers) + { + /* index of internal node, the domain of backpointers map*/ + qnode_index = get_qnode_index_for_backptr(nodes_start, qnode); + // the backpointer is already set, but we need to add/encode the num of children + // todo don't like the need of data read (we should just add), maybe should pass grandpa pointer in record..., or use atomic... + back_pointers[qnode_index] += (numChildren << 3); + } + + /* sort children based on rnage size */ + const uint numPrimsIDs = select((uint)0, (as_uint(L->childrenAABB[subgroupLocalID].upper.w) << 3) | subgroupLocalID, subgroupLocalID < numChildren); + //const uint IDs = sortBVHChildrenIDs(numPrimsIDs) & (BVH_NODE_N-1); + const uint IDs = numPrimsIDs & 7; + const uint pushIDs = convertToPushIndices8(IDs); + + /* alloc #numChildren nodes at once */ + const uint node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren); + + /* update single relative node pointer and type */ + const int offset = encodeOffset(bvh_mem, (global void *)qnode, node_offset) >> 6; + const uint type = BVH_INTERNAL_NODE; + + /* set parent pointer in child build records */ + if (subgroupLocalID < numChildren) + { + setBuildRecursionDepth(&L->children[subgroupLocalID], depth + 1); + global uchar *child_data_ptr = (global uchar *)bvh_mem + node_offset + pushIDs * sizeof(struct QBVHNodeN); + L->children[subgroupLocalID].current = child_data_ptr; + if (T.allocateBackpointers) + { + uint child_index = get_qnode_index_for_backptr(nodes_start, child_data_ptr); + back_pointers[child_index] = qnode_index << 6; + } + } + + /* write out qbvh node */ + subgroup_setQBVHNodeN(offset, type, &L->childrenAABB[IDs], numChildren, qnode); + + /* write out child buildrecords to memory */ + + uint global_records_offset = (subgroupLocalID == 0) ? 
atomic_add(num_records_output, numChildren - 1) : 0; + global_records_offset = sub_group_broadcast(global_records_offset, 0); + + if (localID == 0) + { + records[L->recordID] = L->children[0]; + L->buildRecordIDs[L->numBuildRecordIDs++] = L->recordID; + for (uint i = 1; i < numChildren; i++) + { + const uint ID = globals->numBuildRecords + global_records_offset + i - 1; + records[ID] = L->children[i]; + L->buildRecordIDs[L->numBuildRecordIDs++] = ID; + } + } + } + } + work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + } + + /* last active HW thread ? */ + if (localID == 0) + { + const uint sync = atomic_add(&globals->sync, 1); + if (sync + 1 == numTasks) + { + globals->sync = 0; + /* set final number of buildrecords */ + globals->numBuildRecords += globals->numBuildRecords_extended; + globals->numBuildRecords_extended = 0; + globals->counter = 0; + } + } +} + +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_breadth_first_loop(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + uint subtreeThreshold) +{ + local struct BreadthFirstLoopLocals L; + static const struct BreadthFirstTemplateConst T = { + false // bool allocateBackpointers; + }; + + parallel_build_breadth_first_loopT(globals, + primref, + primref_index, + bvh_mem, + subtreeThreshold, + &L, + T); +} + +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_breadth_first_loop_backpointers(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + uint subtreeThreshold) +{ + local struct BreadthFirstLoopLocals L; + static const struct BreadthFirstTemplateConst T = { + true // bool allocateBackpointers; + }; + + parallel_build_breadth_first_loopT(globals, + primref, + primref_index, + bvh_mem, + subtreeThreshold, + &L, + T); +} +// =================================================== +// =============== experimental code ================= +// =================================================== +#endif + +#define ENABLE_GLOBAL_SPLIT 0 +#if ENABLE_GLOBAL_SPLIT +inline void parallel_partition_segment_index(local uint *local_sync, + global struct AABB *primref, + struct BinMapping *binMapping, + const uint begin, + const uint end, + const uint global_begin, + const uint global_end, + struct Split *inSplit, + local struct AABB *outLeft, + local struct AABB *outRight, + local struct AABB *outGeometryBoundsLeft, + local struct AABB *outGeometryBoundsRight, + global uint *primref_index0, + global uint *primref_index1, + uint *atomicCountLeft, + uint *atomicCountRight) +{ + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + const uint subgroupID = get_sub_group_id(); + const uint numSubGroups = get_num_sub_groups(); + const uint subgroup_size = get_sub_group_size(); + const uint subgroupLocalID = get_sub_group_local_id(); + + const uint size = end - begin; + struct Split split = *inSplit; + + /* init bin bounds */ + if (localID == 0) + { + AABB_init(outLeft); + AABB_init(outRight); + AABB_init(outGeometryBoundsLeft); + AABB_init(outGeometryBoundsRight); + *local_sync = 0; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + struct AABB left; + struct AABB right; + AABB_init(&left); + AABB_init(&right); + + struct AABB leftAABB; + struct AABB rightAABB; + AABB_init(&leftAABB); + 
AABB_init(&rightAABB); + + const int startID = begin + ((subgroupID + 0) * size / numSubGroups); + const int endID = begin + ((subgroupID + 1) * size / numSubGroups); + + for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size) + { + const uint index = primref_index1[i]; + const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; + const uint isRight = 1 - isLeft; + const uint countLeft = sub_group_reduce_add(isLeft); + const uint countRight = sub_group_reduce_add(isRight); + const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); + const uint prefixRight = sub_group_scan_exclusive_add(isRight); + + uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0; + offsetLeft = sub_group_broadcast(offsetLeft, 0); + uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0; + offsetRight = sub_group_broadcast(offsetRight, 0); + + if (isLeft) + { + AABB_extend_point(&left, AABB_centroid2(&primref[index])); + AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); + primref_index0[global_begin + offsetLeft + prefixLeft] = index; + } + else + { + AABB_extend_point(&right, AABB_centroid2(&primref[index])); + AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); + primref_index0[global_end - (offsetRight + countRight) + prefixRight] = index; + } + } + left = AABB_sub_group_reduce(&left); + right = AABB_sub_group_reduce(&right); + leftAABB = AABB_sub_group_reduce(&leftAABB); + rightAABB = AABB_sub_group_reduce(&rightAABB); + + AABB_local_atomic_merge(outLeft, left.lower, left.upper); + AABB_local_atomic_merge(outRight, right.lower, right.upper); + + AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper); + AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper); + + work_group_barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(BINS * 2, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel global_init_split_iteration(global struct Globals *globals, + global struct GlobalBuildRecord *global_record, + global char *bvh_mem, + const uint subTreeThreshold) +{ + const uint localID = get_local_id(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + global struct BuildRecord *records = getBuildRecords(bvh_mem, globals); + + /* for each build record with size > subTreeThreshold initialize a global build record */ + + const uint startID = (taskID + 0) * globals->numBuildRecords / numTasks; + const uint endID = (taskID + 1) * globals->numBuildRecords / numTasks; + + for (uint i = startID; i < endID; i++) + { + global struct BuildRecord *buildRecord = &records[i]; + DBG(if (localID == 0) printf("i %d subTreeThreshold %d size %d \n", i, subTreeThreshold, buildRecord->end - buildRecord->start)); + + if ((buildRecord->end - buildRecord->start) > subTreeThreshold) + { + uint ID = localID == 0 ? 
generic_atomic_add(&globals->numGlobalBuildRecords, 1) : 0; + + ID = work_group_broadcast(ID, 0); + global struct BinInfo2 *binInfo = &global_record[ID].binInfo; + global struct BinMapping *binMapping = &global_record[ID].binMapping; + initBinMapping(binMapping, &buildRecord->centroidBounds, 2 * BINS); + parallel_initBinInfo2(binInfo, 2 * BINS); + if (localID == 0) + { + global_record[ID].range.start = buildRecord->start; + global_record[ID].range.end = buildRecord->end; + global_record[ID].atomicCountLeft = 0; + global_record[ID].atomicCountRight = 0; + global_record[ID].buildRecordID = i; + AABB_init(&global_record[ID].leftCentroid); + AABB_init(&global_record[ID].rightCentroid); + AABB_init(&global_record[ID].leftGeometry); + AABB_init(&global_record[ID].rightGeometry); + } + } + } + DBG( + work_group_barrier(CLK_LOCAL_MEM_FENCE); + if (localID == 0) + printf("globals->numGlobalBuildRecords %d \n", globals->numGlobalBuildRecords);); +} + +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel global_bin_iteration(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + global struct GlobalBuildRecord *global_record) +{ + const uint localID = get_local_id(0); + const uint blockSize = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; + + /* early out */ + if (numGlobalBuildRecords == 0) + return; + + /* double buffered primref index array */ + global uint *primref_index0 = primref_index; + global uint *primref_index1 = primref_index + globals->numPrimitives; + + uint numBlocks = 0; + + /* get total number of blocks, size of block == WG size */ + for (uint i = 0; i < numGlobalBuildRecords; i++) + numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize; + + const uint startBlockID = (taskID + 0) * numBlocks / numTasks; + const uint endBlockID = (taskID + 1) * numBlocks / numTasks; + uint numBlockIDs = endBlockID - startBlockID; + + uint splitRecordID = 0; + uint offset_start = 0; + uint offset_end = 0; + uint cur_blocks = 0; + + for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++) + { + const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; + const uint blocks = (sizeRecord + blockSize - 1) / blockSize; + if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks) + { + const uint preBlocks = startBlockID - blockCounter; + cur_blocks = min(numBlockIDs, blocks - preBlocks); + offset_start = preBlocks * blockSize; + offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord); + break; + } + blockCounter += blocks; + } + + if (localID == 0) + DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); + + local struct BinInfo2 local_binInfo; + parallel_initBinInfo2(&local_binInfo, 2 * BINS); + struct BinMapping binMapping = global_record[splitRecordID].binMapping; + + while (1) + { + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + const uint startID = global_record[splitRecordID].range.start + offset_start; + const uint endID = global_record[splitRecordID].range.start + offset_end; + + if (localID == 0) + DBG(printf("taskID %d startID %d endID %d \n", taskID, 
startID, endID)); + + for (uint i = startID + localID; i < endID; i += blockSize) + { + const uint index = primref_index0[i]; + primref_index1[i] = index; + atomicUpdateLocalBinInfo2(&binMapping, &local_binInfo, &primref[index]); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); //FIXME: remove, do local sync + atomicUpdateGlobalFromLocalBinInfo2(&global_record[splitRecordID].binInfo, &local_binInfo, 2 * BINS); + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + numBlockIDs -= cur_blocks; + if (numBlockIDs == 0) + break; + + splitRecordID++; + parallel_initBinInfo2(&local_binInfo, 2 * BINS); + binMapping = global_record[splitRecordID].binMapping; + + const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; + const uint blocks = (sizeRecord + blockSize - 1) / blockSize; + cur_blocks = min(numBlockIDs, blocks); + offset_start = 0; + offset_end = min(cur_blocks * blockSize, sizeRecord); + + if (localID == 0) + DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); + } +} + +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +global_compute_best_split_iteration(global struct Globals *globals, + global char *bvh_mem, + global struct GlobalBuildRecord *global_record) +{ + const uint localID = get_local_id(0); + const uint blockSize = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; + + /* early out */ + if (numGlobalBuildRecords == 0) + return; + + const uint startRecordID = (taskID + 0) * numGlobalBuildRecords / numTasks; + const uint endRecordID = (taskID + 1) * numGlobalBuildRecords / numTasks; + for (uint i = startRecordID; i < endRecordID; i++) + { + struct Split split = reduceBinsAndComputeBestSplit32(&global_record[i].binInfo, + global_record[i].binMapping.scale, + global_record[i].range.start, + global_record[i].range.end); + if (localID == 0) + { + global_record[i].split = split; + global_record[i].atomicCountLeft = 0; + global_record[i].atomicCountRight = 0; + DBG(printSplit(&global_record[i].split)); + } + } +} + +__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +global_partition_iteration(global struct Globals *globals, + global struct AABB *primref, + global uint *primref_index, + global char *bvh_mem, + global struct GlobalBuildRecord *global_record) +{ + + const uint localID = get_local_id(0); + const uint blockSize = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + + const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; + + /* early out */ + if (numGlobalBuildRecords == 0) + return; + + /* double buffered primref index array */ + global uint *primref_index0 = primref_index; + global uint *primref_index1 = primref_index + globals->numPrimitives; + + uint numBlocks = 0; + + /* get total number of blocks, size of block == WG size */ + for (uint i = 0; i < numGlobalBuildRecords; i++) + numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize; + + const uint startBlockID = (taskID + 0) * numBlocks / numTasks; + const uint endBlockID = (taskID + 1) * numBlocks / numTasks; + uint numBlockIDs = endBlockID - startBlockID; + + 
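    /* Static load balancing, using the same blocking scheme as global_bin_iteration:
     * the concatenated ranges of all global build records are cut into blocks of
     * blockSize primrefs, and each workgroup handles the contiguous block range
     * [startBlockID, endBlockID).  The search loop below locates the record that
     * contains the workgroup's first block and the offset inside that record.
     * For example, with blockSize = 512 and record sizes { 1300, 800 } the
     * per-record block counts are { 3, 2 }; a workgroup whose startBlockID is 3
     * therefore starts in record 1 at offset_start = 0. */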
uint splitRecordID = 0; + uint offset_start = 0; + uint offset_end = 0; + uint cur_blocks = 0; + + for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++) + { + const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; + const uint blocks = (sizeRecord + blockSize - 1) / blockSize; + if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks) + { + const uint preBlocks = startBlockID - blockCounter; + cur_blocks = min(numBlockIDs, blocks - preBlocks); + offset_start = preBlocks * blockSize; + offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord); + break; + } + blockCounter += blocks; + } + + if (localID == 0) + DBG(printf("partition taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); + + local struct AABB centroidAABB[2]; + local struct AABB geometryAABB[2]; + local uint local_sync; + + while (1) + { + + const uint startID = global_record[splitRecordID].range.start + offset_start; + const uint endID = global_record[splitRecordID].range.start + offset_end; + + struct BinMapping binMapping = global_record[splitRecordID].binMapping; + struct Split split = global_record[splitRecordID].split; + + const uint global_start = global_record[splitRecordID].range.start; + const uint global_end = global_record[splitRecordID].range.end; + + if (localID == 0) + DBG(printf("partition taskID %d startID %d endID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, startID, endID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); + + parallel_partition_segment_index(&local_sync, primref, &binMapping, startID, endID, global_start, global_end, &split, ¢roidAABB[0], ¢roidAABB[1], &geometryAABB[0], &geometryAABB[1], primref_index0, primref_index1, &global_record[splitRecordID].atomicCountLeft, &global_record[splitRecordID].atomicCountRight); + + /* update global structures */ + if (localID == 0) + { + AABB_global_atomic_merge(&global_record[splitRecordID].leftCentroid, ¢roidAABB[0]); + AABB_global_atomic_merge(&global_record[splitRecordID].rightCentroid, ¢roidAABB[1]); + AABB_global_atomic_merge(&global_record[splitRecordID].leftGeometry, &geometryAABB[0]); + AABB_global_atomic_merge(&global_record[splitRecordID].rightGeometry, &geometryAABB[1]); + } + + numBlockIDs -= cur_blocks; + if (numBlockIDs == 0) + break; + + splitRecordID++; + + const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; + const uint blocks = (sizeRecord + blockSize - 1) / blockSize; + cur_blocks = min(numBlockIDs, blocks); + offset_start = 0; + offset_end = min(cur_blocks * blockSize, sizeRecord); + } +} + +inline void printBinaryNode(struct AABB *aabb) +{ + printf("lower %f upper %f lower.w %d upper.w %d \n", aabb->lower, aabb->upper, as_uint(aabb->lower.w), as_uint(aabb->upper.w)); +} + +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel global_finalize_iteration(global struct Globals *globals, + global struct GlobalBuildRecord *global_record, + global char *bvh_mem, + global struct AABB *binary_nodes) +{ + const uint localID = get_local_id(0); + const uint localSize = get_local_size(0); + const uint groupID = get_group_id(0); + const uint numGroups = get_num_groups(0); + + global struct 
BuildRecord *records = getBuildRecords(bvh_mem, globals); + + for (uint i = localID; i < globals->numGlobalBuildRecords; i += localSize) + { + const uint buildRecordID = global_record[i].buildRecordID; + const uint binaryNodeID = as_uint(records[buildRecordID].centroidBounds.lower.w); + /* left child buildrecord */ + const uint leftID = buildRecordID; + records[leftID].start = global_record[i].range.start; + records[leftID].end = global_record[i].range.start + global_record[i].atomicCountLeft; + records[leftID].centroidBounds = global_record[i].leftCentroid; + /* right child buildrecord */ + const uint rightID = generic_atomic_add(&globals->numBuildRecords, 1); + records[rightID].start = global_record[i].range.start + global_record[i].atomicCountLeft; + records[rightID].end = global_record[i].range.end; + records[rightID].centroidBounds = global_record[i].rightCentroid; + /* two binary nodes */ + const uint binaryChildID = generic_atomic_add(&globals->numGlobalBinaryNodes, 2); + binary_nodes[binaryNodeID].lower.w = as_float(binaryChildID + 0); + binary_nodes[binaryNodeID].upper.w = as_float(binaryChildID + 1); + binary_nodes[binaryChildID + 0] = global_record[i].leftGeometry; + binary_nodes[binaryChildID + 1] = global_record[i].rightGeometry; + binary_nodes[binaryChildID + 0].lower.w = as_float(leftID); + binary_nodes[binaryChildID + 0].upper.w = as_float(-1); + binary_nodes[binaryChildID + 1].lower.w = as_float(rightID); + binary_nodes[binaryChildID + 1].upper.w = as_float(-1); + records[leftID].centroidBounds.lower.w = as_float(binaryChildID + 0); + records[rightID].centroidBounds.lower.w = as_float(binaryChildID + 1); + } + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (localID == 0) + { + const uint sync = atomic_add(&globals->sync, 1); + if (sync + 1 == numGroups) + { + globals->sync = 0; + DBG(printf("globals->numBuildRecords %d \n", globals->numBuildRecords)); + DBG( + for (uint i = 0; i < globals->numBuildRecords; i++) { + printf("i %d \n", i); + printBuildRecord(&records[i]); + } printf("Binary Tree \n"); + for (uint i = 0; i < globals->numGlobalBinaryNodes; i++) { + printf("i %d \n", i); + printBinaryNode(&binary_nodes[i]); + } + + ); + globals->numGlobalBuildRecords = 0; + } + } +} + +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel global_build_top_level(global struct Globals *globals, + global struct GlobalBuildRecord *global_record, + global char *bvh_mem, + global struct AABB *binary_nodes) +{ +#define MAX_TOP_LEVEL_STACK_DEPTH 32 + struct AABB stack[MAX_TOP_LEVEL_STACK_DEPTH]; + global uchar *stackParentPtrs[MAX_TOP_LEVEL_STACK_DEPTH]; + struct AABB childrenAABB[BVH_NODE_N6]; + float childrenHalfArea[BVH_NODE_N6]; + + /* build records */ + global struct BuildRecord *record = getBuildRecords(bvh_mem, globals); + + struct BVHBase *base = (struct BVHBase *)bvh_mem; + struct QBVHNodeN *qnode_root = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset); + + uint stack_index = 1; + stack[0] = binary_nodes[0]; + stackParentPtrs[0] = (global uchar *)qnode_root; + + while (stack_index != 0) + { + stack_index--; + + childrenAABB[0] = stack[stack_index]; + struct QBVHNodeN *qnode = (struct QBVHNodeN *)stackParentPtrs[stack_index]; + childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]); + + /* buildrecord leaf => set parent pointer and continue*/ + DBG( + printf("stack_index %d \n", stack_index); + printf("as_uint(childrenAABB[0].upper.w) %d \n", as_uint(childrenAABB[0].upper.w));); + + if (as_uint(childrenAABB[0].upper.w) == -1) + { + const uint buildRecordID = 
as_uint(childrenAABB[0].lower.w); + DBG( + printf("leaf buildRecordID %d \n", buildRecordID); + printBuildRecord(&record[buildRecordID]);) + + record[buildRecordID].current = (global uchar *)qnode; + continue; + } + + childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]); + + uint numChildren = 1; + while (numChildren < BVH_NODE_N6) + { + // FIXME + + /*! find best child to split */ + float bestArea = -(float)INFINITY; + int bestChild = -1; + for (int i = 0; i < numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (as_uint(childrenAABB[i].upper.w) == -1) + continue; + + /* find child with largest surface area */ + if (childrenHalfArea[i] > bestArea) + { + bestChild = i; + bestArea = childrenAABB[i].lower.w; + } + } + if (bestChild == -1) + break; + const uint leftID = as_uint(childrenAABB[bestChild].lower.w); + const uint rightID = as_uint(childrenAABB[bestChild].upper.w); + childrenAABB[bestChild] = binary_nodes[leftID]; + childrenAABB[numChildren] = binary_nodes[rightID]; + childrenHalfArea[bestChild] = AABB_halfArea(&childrenAABB[bestChild]); + childrenHalfArea[numChildren] = AABB_halfArea(&childrenAABB[numChildren]); + numChildren++; + } + + const uint child_node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren); + + /* update single relative node pointer */ + const int offset = encodeOffset(bvh_mem, (global void *)qnode, child_node_offset) >> 6; + const uint type = BVH_INTERNAL_NODE; + + setQBVHNodeN(offset, type, childrenAABB, numChildren, qnode); + + DBG( + printQBVHNodeN(qnode); + printf("numChildren %d \n", numChildren); + for (uint i = 0; i < numChildren; i++) + AABB_print(&childrenAABB[i]);); + + /* update parent pointer of build records of all children */ + for (uint ID = 0; ID < numChildren; ID++) + { + stack[stack_index] = childrenAABB[ID]; + stackParentPtrs[stack_index] = (global uchar *)bvh_mem + child_node_offset + ID * sizeof(struct QBVHNodeN); + stack_index++; + } + } +} + +#endif diff --git a/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h new file mode 100644 index 00000000000..b8cf7288f6a --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h @@ -0,0 +1,1507 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "bvh_build_refit.h" +#include "libs/lsc_intrinsics.h" + + +#define REFIT_DEBUG_CHECKS 0 +#define REFIT_VERBOSE_LOG 0 + +#define NUM_STARTPOINTS_IN_SLM (1024) + +GRL_INLINE void storeAABBToL1(struct AABB aabb, struct AABB* ptr) +{ + uint8 val = (uint8)( + as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w), + as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w)); + + store_uint8_L1WB_L3WB((__global uint8*) ptr, 0, val); +} + +GRL_INLINE void storeAABBToL3(struct AABB aabb, struct AABB* ptr) +{ + uint8 val = (uint8)( + as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w), + as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w)); + + store_uint8_L1UC_L3WB((__global uint8*) ptr, 0, val); +} + +typedef struct Treelet_by_single_group_locals +{ + uint startpoints[NUM_STARTPOINTS_IN_SLM]; +} Treelet_by_single_group_locals; + +typedef struct SquashedInputGroupDesc { + qword bvh; + qword scratch; + uint groupInTree; + uint totalNumGroups; //valid only for 0th element in array, otherwise its trash padding +} SquashedInputGroupDesc; 
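// The two store helpers above differ only in cache policy: storeAABBToL1 writes the
// box back through L1 (L1WB_L3WB), which suits boxes that the same workgroup re-reads
// during its bottom-up walk, while storeAABBToL3 bypasses L1 (L1UC_L3WB) so a box
// handed over to another workgroup, such as a treelet root, is picked up from L3.
// A minimal usage sketch; storeReducedBox and its crossGroup flag are hypothetical
// and not part of this header:
GRL_INLINE void storeReducedBox(struct AABB box, struct AABB* slot, bool crossGroup)
{
    if (crossGroup)
        storeAABBToL3(box, slot);   // another workgroup consumes this box via L3
    else
        storeAABBToL1(box, slot);   // this workgroup reads the box again later
}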
+ +// +// +// update primitives +// +// + +typedef struct SquashedInput { + global struct BVHBase* pBvh; + global void* pInput; + global struct AABB* bbox_scratch; +} SquashedInput; + + + +// updates one quad leaf and gets BBOX contatining it +GRL_INLINE void refit_bottom_child_quad( + global struct QuadLeaf* quad, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + struct AABB* childAABB) +{ + struct QuadLeaf Q; + get_updated_quad(quad, geomDesc, &Q); + quadCopyVertices(&Q, quad); + *childAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad +} + +// procedurals will have to go old path at first +#if 0 +// updates one procedural leaf and gets BBOX contatining it +GRL_INLINE void refit_bottom_child_procedural( + global struct ProceduralLeaf** pleaf, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + struct AABB* childAABB) +{ + global struct ProceduralLeaf* leaf = *pleaf; + /* extract geomID and primID from leaf */ + const uint startPrim = QBVHNodeN_startPrim(curNode, child_idx); + const uint geomID = ProceduralLeaf_geomIndex(leaf); + const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf! + + /* read bounds from geometry descriptor */ + struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); + childAABB->lower.x = aabb.MinX; + childAABB->lower.y = aabb.MinY; + childAABB->lower.z = aabb.MinZ; + childAABB->upper.x = aabb.MaxX; + childAABB->upper.y = aabb.MaxY; + childAABB->upper.z = aabb.MaxZ; + + /* advance leaf pointer to next child */ + *pleaf = leaf + QBVHNodeN_blockIncr(curNode, child_idx); +} + + +GRL_INLINE void update_procedural_leafs( + global struct BVHBase* bvh, + global void* input, + global struct AABB* bbox_scratch, + uint id, + uint num_done_by_one_thread) +{ + uint numLeaves = BVHBase_GetNumQuads(bvh); + uint leafsIndexOffset = bvh->proceduralDataStart - BVH_ROOT_NODE_OFFSET / 64; + global ProceduralLeaf* leafs = (global QuadLeaf*)BVHBase_GetProceduralLeaves(bvh); + uint start_leaf = id * num_done_by_one_thread; + uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves); + + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; + + for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++) + { + struct AABB theAABB; + refit_bottom_child_procedural(leafs + leaf_id, geosArray, &theAABB); + theAABB.lower.w = as_float(0xABBADEFF); + theAABB.upper.w = 0x00; + storeAABBToL1(theAABB, &bbox[leafsIndexOffset + leaf_id]); + } +} +#endif + +GRL_INLINE void update_quads( + global struct BVHBase* bvh, + global void* input, + global struct AABB* bbox_scratch, + uint id, + uint num_done_by_one_thread) +{ + uint numLeaves = BVHBase_GetNumQuads(bvh); + uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64; + global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh); + uint start_leaf = id * num_done_by_one_thread; + uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves); + + global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; + + for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++) + { + struct AABB theAABB; + refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB); + theAABB.lower.w = as_float(0xABBADEFF); + theAABB.upper.w = 0x00; + storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////// +// +// core 
bottom-up update functions +// +// + +GRL_INLINE void quantise_bounds( + struct AABB* input_aabb, float3 len, float3 mant, float3 org, int3 exp, + uchar3* lower_uchar, + uchar3* upper_uchar) +{ + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + struct AABB child_aabb = conservativeAABB(input_aabb); // conservative ??? + + float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); + lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); + float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); + upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); + + *lower_uchar = convert_uchar3_rtn(lower); + *upper_uchar = convert_uchar3_rtp(upper); +} + +typedef struct Qbounds_as_DW { + uint32_t xLL; uint32_t xLU; uint32_t xUU; + uint32_t yLL; uint32_t yLU; uint32_t yUU; + uint32_t zLL; uint32_t zLU; uint32_t zUU; +} Qbounds_as_DW; + +GRL_INLINE void encodeQuantisedDataAsDW( + uchar3 lower_uchar, + uchar3 upper_uchar, + uint idx, + Qbounds_as_DW* qbounds) +{ + uint shift_init = idx * 8; + if (idx >= 4) { + uint shift = (shift_init - 32); + qbounds->xLU |= ((uint)lower_uchar.x) << shift; + qbounds->yLU |= ((uint)lower_uchar.y) << shift; + qbounds->zLU |= ((uint)lower_uchar.z) << shift; + } + else { + qbounds->xLL |= ((uint)lower_uchar.x) << shift_init; + qbounds->yLL |= ((uint)lower_uchar.y) << shift_init; + qbounds->zLL |= ((uint)lower_uchar.z) << shift_init; + } + + if (idx < 2) { + uint shift = (shift_init + 16); + qbounds->xLU |= ((uint)upper_uchar.x) << shift; + qbounds->yLU |= ((uint)upper_uchar.y) << shift; + qbounds->zLU |= ((uint)upper_uchar.z) << shift; + } + else { + uint shift = (shift_init - 16); + + qbounds->xUU |= ((uint)upper_uchar.x) << shift; + qbounds->yUU |= ((uint)upper_uchar.y) << shift; + qbounds->zUU |= ((uint)upper_uchar.z) << shift; + } +} + +GRL_INLINE void encodeChildBounds(uchar3 lower_uchar, uchar3 upper_uchar, uint ch, struct InternalNode* qnode) +{ + qnode->lower_x[ch] = lower_uchar.x; qnode->upper_x[ch] = upper_uchar.x; + qnode->lower_y[ch] = lower_uchar.y; qnode->upper_y[ch] = upper_uchar.y; + qnode->lower_z[ch] = lower_uchar.z; qnode->upper_z[ch] = upper_uchar.z; +} + + +GRL_INLINE GRL_OVERLOADABLE void InternalNode_setBounds_skip_prev(struct InternalNode* qbvh_node, uint prevChildIdx, struct AABB* prev_input_aabb, struct AABB* input_aabb, uint childrenIndex, const uint numChildren, struct AABB* aabb_reduced) +{ + + int3 exp; + const float up = 1.0f + ulp; + struct AABB conservative_aabb = conservativeAABB(aabb_reduced); + const float3 len = AABB_size(&conservative_aabb).xyz * up; + const float3 mant = frexp_vec3(len, &exp); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); + + qbvh_node->lower[0] = org.x; qbvh_node->lower[1] = org.y; qbvh_node->lower[2] = org.z; + + qbvh_node->exp_x = exp.x; qbvh_node->exp_y = exp.y; qbvh_node->exp_z = exp.z; + + Qbounds_as_DW qbounds = { 0x0 }; + + + { + uchar3 lower_uchar, upper_uchar; + quantise_bounds(prev_input_aabb, len, mant, org, exp, &lower_uchar, &upper_uchar); + + //encode invalid children. 
its enough to set 0x80 as lower_x bytes + uint shift = numChildren * 8; + uint shift2 = min(shift, 31u); + qbounds.xLL = (0x80808080u << shift2); + uint shift3 = max(shift, 32u) - 32; + qbounds.xLU = (ushort)(((ushort)0x8080) << (ushort)shift3); + + encodeQuantisedDataAsDW(lower_uchar, upper_uchar, prevChildIdx, &qbounds); + //encodeChildBounds(lower_uchar, upper_uchar, prevChildIdx, qbvh_node); + } + + uint ch = prevChildIdx == 0; + while (ch < numChildren) { + uchar3 lower_uchar, upper_uchar; + quantise_bounds(input_aabb + ch, len, mant, org, exp, &lower_uchar, &upper_uchar); + encodeQuantisedDataAsDW(lower_uchar, upper_uchar, ch, &qbounds); + //encodeChildBounds(lower_uchar, upper_uchar, ch, qbvh_node); + ch += 1 + (prevChildIdx == (ch + 1)); + } + Qbounds_as_DW* qbounds_dst = (Qbounds_as_DW*)(&qbvh_node->lower_x[0]); + *qbounds_dst = qbounds; + return; +} + +GRL_INLINE struct AABB refitReduce2Boxes(struct AABB A, struct AABB B) +{ + AABB_extend(&A, &B); + // to make it work for TLAS node masks change to this: + // A.lower.w = as_float(as_uint(A.lower.w) | as_uint(B.lower.w)); + A.lower.w = as_float(0xABBADE00u); + return A; +} + +GRL_INLINE void refitReduceNodePrev( + uint prevIdx, + uint leadChildIdx, + uint numChildren, + struct AABB* globalBox, + struct AABB* reduceBox, + uint depth, + uint NodeIndex) +{ + uint8_t childIgnored = (prevIdx - leadChildIdx); + +# if REFIT_DEBUG_CHECKS + bool err = false; + if ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u) + { + printf("refitReduceNode6 (loc_id %d): prev (used as child %d) not updated! NodeIndex %d, child nodeIdx %d at depth %d\n", + get_local_id(0), + childIgnored, + NodeIndex, + prevIdx, + depth); + err = true; + } + + if ((as_uint(globalBox[NodeIndex].lower.w) & 0xFFFFFF00) == 0xABBADE00u) + { + printf("refitReduceNode6 (loc_id %d): dst node already updated. NodeIndex %d depth %d\n", + get_local_id(0), + NodeIndex, + depth); + } + + bool fail = false; + for (uint k = 0; (k < numChildren) && !err; ++k) { + if (k != childIgnored) { + if ((as_uint(globalBox[leadChildIdx + k].lower.w) & 0xFFFFFF00) != 0xABBADE00u) { + printf("refitReduceNode6 (loc_id %d): child %d not updated! use prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n", + get_local_id(0), + k, + prevIdx - leadChildIdx, + NodeIndex, + leadChildIdx + k, + depth); + fail = true; + } + } + } + err |= fail; +# endif + + // for each child 3 bits contains load index + const uint32_t indicesEncoded = + (1 << 0) + + (2 << 3) + + (3 << 6) + + (4 << 9) + + (5 << 12) + + (0 << 15) + + (1 << 18) + + (2 << 21) + + (3 << 24) + + (4 << 27); + // 1,2,3,4,5 + + + uint32_t indicesEncodedShifted = indicesEncoded >> (childIgnored * 3); + + struct AABB* childAABB = globalBox + leadChildIdx; + struct AABB temp = childAABB[indicesEncodedShifted & 7]; + indicesEncodedShifted >>= 3; + struct AABB* nextChild = childAABB + (indicesEncodedShifted & 7); + struct AABB backlog = temp; + + for (uint child = 2; child < numChildren; child++) + { + temp = *nextChild; + *reduceBox = refitReduce2Boxes(*reduceBox, backlog); + indicesEncodedShifted >>= 3; + nextChild = childAABB + (indicesEncodedShifted & 7); + backlog = temp; + } + + *reduceBox = refitReduce2Boxes(*reduceBox, backlog); + +#if REFIT_DEBUG_CHECKS + for (uint k = 0; (k < numChildren) && !err; ++k) { + if (k != childIgnored) { + if (!AABB_subset(&globalBox[leadChildIdx + k], reduceBox)) { + printf("refitReduceNode6 (loc_id %d): child AABB %d/%d reduction went wrong! 
skipped prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n", + get_local_id(0), + k, numChildren, + prevIdx - leadChildIdx, + NodeIndex, + leadChildIdx + k, + depth); + + err = true; + } + } + } + if (!err && ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)) { + printf("refitReduceNode6: havent set the 0xABBADEXXu marker in result node %d at depth %d!\n", + NodeIndex, + depth); + } +#endif +} + + +GRL_INLINE uint hash_local_id() +{ + return get_sub_group_local_id() * get_num_sub_groups() + get_sub_group_id(); +} + +//=============================================================== +// +// Core update function +// +//=============================================================== +GRL_INLINE bool refit_treelet_by_single_group( + global struct AABB* bbox, + local Treelet_by_single_group_locals* loc, + uniform global BVHBase* pBvh, + uniform RefitTreelet trltDsc, + bool encodeQnodes, + bool isTipTreelet) +{ + BackPointers* backpointers = BVHBase_GetBackPointers(pBvh); + InternalNode* internalNodes = BVHBase_GetInternalNodes(pBvh); + uint local_id = get_local_id(0); + StartPoint* startPoints = BVHBase_GetRefitStartPoints(pBvh) + trltDsc.startpoint_offset; + + // special case for single path treelets, TODO rewrite it as subgroups based + if (trltDsc.numStartpoints == 1) { + if (local_id == 0) { + RefitTreeletTrivial desc = *((RefitTreeletTrivial*)& trltDsc); + uint innerNodeIdx = desc.theOnlyNodeIndex; + uint numChildren = desc.numChildrenOfTheNode; + uint childIndex = desc.childrenOffsetOfTheNode; + uint maxDepth = desc.maxDepth; + + uint prevIdx = childIndex; + struct AABB myBox = bbox[childIndex]; + struct AABB prevAABB; + uint backpointer = maxDepth > 0 ? *InnerNode_GetBackPointer(backpointers, innerNodeIdx) : 0; + InternalNode* curNode = internalNodes + innerNodeIdx; + uint currDepth = 0; + + while (1) + { + prevAABB = myBox; + if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); } + + if (!encodeQnodes) { myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); } + + if (++currDepth > maxDepth) { break; } + + if (encodeQnodes) { + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } +#if !REFIT_DEBUG_CHECKS + else +#endif + { storeAABBToL1(myBox, &bbox[innerNodeIdx]); } + + prevIdx = innerNodeIdx; + innerNodeIdx = BackPointer_GetParentIndex(backpointer); + backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + numChildren = BackPointer_GetNumChildren(backpointer); + curNode = internalNodes + innerNodeIdx; + childIndex = innerNodeIdx + curNode->childOffset; + } + + if (isTipTreelet) { + AABB3f reduced3f = AABB3fFromAABB(myBox); + pBvh->Meta.bounds = reduced3f; + } + else { + storeAABBToL3(myBox, &bbox[innerNodeIdx]); + } + + if (encodeQnodes || isTipTreelet) { + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } + +#if REFIT_VERBOSE_LOG + printf("single node treelet: storing node idx %d \n", innerNodeIdx); +#endif + } + + return local_id == 0; + } + + local uint* loc_startpoints = loc->startpoints; + + +#if REFIT_DEBUG_CHECKS + if ((trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM)) { + if(local_id == 0) printf("out of SLM space, trltDsc.depthSub_NUM_STARTPOINTS_IN_SLM > 0\n"); + return local_id == 0; + } +#endif + + uint SLMedStartpointsOffset = trltDsc.numStartpoints - trltDsc.numNonTrivialStartpoints; + + /*===================================================================== + first phase where we update startpoints nodes only + ----------------------------------------------------------------------*/ + for (uint startpoint_i = local_id; startpoint_i < trltDsc.numStartpoints; startpoint_i += get_local_size(0)) { + uint startpoint = (uint)intel_sub_group_block_read_ui((global uint*)(startPoints + startpoint_i)); + uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint); + uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + if (startpoint_i >= SLMedStartpointsOffset) { + uint idx = startpoint_i - SLMedStartpointsOffset; + loc_startpoints[idx] = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint); + } + + uint numChildren = BackPointer_GetNumChildren(backpointer); + InternalNode* curNode = internalNodes + innerNodeIdx; + uint childIndex = innerNodeIdx + curNode->childOffset; + + uint prevIdx = childIndex; + struct AABB myBox = bbox[childIndex]; + struct AABB prevAABB = myBox; + +# if REFIT_DEBUG_CHECKS + if (numChildren == 0) { + printf("this node has no chidren!\n", 0); + AABB_init(&myBox); + } +# endif + + if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); } + myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); + +#if REFIT_VERBOSE_LOG + printf("init phase: at depth 0 storing node idx %d \n", innerNodeIdx); +#endif + storeAABBToL1(myBox, &bbox[innerNodeIdx]); + + if (encodeQnodes) { + InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } + } + + uniform uint CurrPeeledDepth = 1; + uniform uint numStartpoints = trltDsc.numNonTrivialStartpoints; + uint nextFloorStartpoint = hash_local_id(); + + uint depthOnionEnd = trltDsc.depthLess64; + if (get_local_size(0) == 128) { depthOnionEnd = trltDsc.depthLess128; } + if (get_local_size(0) == 256) { depthOnionEnd = trltDsc.depthLess256; } + + /*===================================================================== + second phase, we update horizontally untill + we reach number of active path below grou size + ----------------------------------------------------------------------*/ + while (CurrPeeledDepth < depthOnionEnd) { + mem_fence_workgroup_default(); + + work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group); + uint start = nextFloorStartpoint; + nextFloorStartpoint = numStartpoints; + + for (uint startpoint_i = start; startpoint_i < numStartpoints; startpoint_i += get_local_size(0)) { + uint startpoint = loc_startpoints[startpoint_i]; + uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint); + uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + + if (StartPoint_GetDepth(startpoint) > CurrPeeledDepth) { + StartPoint newSP = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint); + loc_startpoints[startpoint_i] = newSP; + nextFloorStartpoint = min(nextFloorStartpoint, startpoint_i); + } + + InternalNode* curNode = internalNodes + innerNodeIdx; + uint childIndex = innerNodeIdx + curNode->childOffset; + uint numChildren = BackPointer_GetNumChildren(backpointer); + + uint prevIdx = childIndex; + struct AABB myBox = bbox[childIndex]; + struct AABB prevAABB = myBox; + refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx); + + myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); + +#if REFIT_VERBOSE_LOG + printf("onion: startpoint %d at depth %d storing node idx %d \n", startpoint_i, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx); +#endif + storeAABBToL1(myBox, &bbox[innerNodeIdx]); + if (encodeQnodes) { + InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } + } + CurrPeeledDepth++; + } + + uint startpoint_idx = nextFloorStartpoint; + bool active = startpoint_idx < numStartpoints; + + work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group); + StartPoint startpoint = loc_startpoints[startpoint_idx]; + + struct AABB myBox; + uint prevIdx = 0; + uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint); + + /*===================================================================== + last phase, each thread just continues path to its end + + only thread that computes the longest path leaves prematurely + (thats why while condition isn't <=) the code for finalizing root of treelet + is special and hendled afterwards + + TODO: with proper assigning of paths to lanes we should reach only three + active lanes per physical thread quite soon for this subgroups could be used + ----------------------------------------------------------------------*/ + bool prevActive = active; + while (CurrPeeledDepth < trltDsc.maxDepth) { + uint backpointer; + uint childIndex; + InternalNode* curNode = internalNodes + innerNodeIdx; + if (active) { + childIndex = innerNodeIdx + curNode->childOffset; + backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + } else if(prevActive){ + mem_fence_workgroup_default(); + } + + prevActive = active; + + work_group_barrier(0, memory_scope_work_group); + //printf("Start node %d at depth %d, innerNodeIdx %d dying! \n", StartPoint_GetNodeIdx(startpoint), CurrPeeledDepth, innerNodeIdx); + if (active) { + +#if REFIT_DEBUG_CHECKS + if (CurrPeeledDepth > StartPoint_GetDepth(startpoint)) + { + printf("uppath: startpoint %d at depth %d shouldn't be active!\n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth); + } +#endif + if (prevIdx == 0) { + myBox = bbox[childIndex]; + prevIdx = childIndex; + } + uint numChildren = BackPointer_GetNumChildren(backpointer); + + struct AABB prevAABB = myBox; + refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx); + myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); +#if REFIT_VERBOSE_LOG + printf("uppath: startpoint %d at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx); +#endif + active = CurrPeeledDepth < StartPoint_GetDepth(startpoint); + + if (encodeQnodes) { +#if !REFIT_DEBUG_CHECKS + if (!active) +#endif + { storeAABBToL1(myBox, &bbox[innerNodeIdx]); } + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } else { + storeAABBToL1(myBox, &bbox[innerNodeIdx]); + } + + prevIdx = innerNodeIdx; + innerNodeIdx = BackPointer_GetParentIndex(backpointer); + } + + CurrPeeledDepth++; + } + + { + uint backpointer; + uint childIndex; + InternalNode* curNode = internalNodes + innerNodeIdx; + if (active) { + childIndex = innerNodeIdx + curNode->childOffset; + backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); + } else if(prevActive) { + mem_fence_workgroup_default(); + } + + work_group_barrier(0, memory_scope_work_group); + + /*===================================================================== + final step, is special processing of root, + its different, since its box is transfered cross group (written to L3) + or is root of whole tree and hence fill global box in bvh MD + TODO: this should be done in SG as only one thread is active + ----------------------------------------------------------------------*/ + if (active) { + if (prevIdx == 0) { + myBox = bbox[childIndex]; + prevIdx = childIndex; + } + uint numChildren = BackPointer_GetNumChildren(backpointer); + struct AABB prevAABB = myBox; + refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx); + myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4)); + +#if REFIT_VERBOSE_LOG + printf("root: startpoint %d at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx/*,WeReInSIMD*/); +#endif + if (isTipTreelet) { + AABB3f reduced3f = AABB3fFromAABB(myBox); + pBvh->Meta.bounds = reduced3f; + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } else { + storeAABBToL3(myBox, &bbox[innerNodeIdx]); + if (encodeQnodes) { + InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); + } + } + } + } + + return active; +} + + +////////////////////////////////////////////////////////////////////////////////////// +// +// Internal nodes enocding as a separate dispatch +// +// + +// encode qnodes as a separate pass +GRL_INLINE void post_refit_encode_qnode_tree_per_group( + global struct AABB* bbox_scratch, + global struct BVHBase* bvh) +{ + uint numInnerNodes = BVHBase_GetNumInternalNodes(bvh); + InternalNode* internalNodes = BVHBase_GetInternalNodes(bvh); + + for (uint nodeIdx = get_local_id(0) + 1 /*+1 because node 0 is already updated*/; nodeIdx < numInnerNodes; nodeIdx += get_local_size(0)) + { + struct AABB reduced = bbox_scratch[nodeIdx]; +# if REFIT_DEBUG_CHECKS + if ((as_uint(reduced.lower.w) & 0xFFFFFF00) != 0xABBADE00u) { + printf("qnode enc group: NodeIndex %d not updated! 
\n", nodeIdx); + return; + } + for (uint k = 0; k < (as_uint(reduced.upper.w) & 7); ++k) { + uint childIdx = (as_uint(reduced.upper.w) >> 4) + k; + if ((as_uint(bbox_scratch[childIdx].lower.w) & 0xFFFFFF00) != 0xABBADE00u) { + printf("qnode enc group: child not updated! NodeIndex %d, child nodeIdx %d \n", nodeIdx, childIdx); + return; + } + } +# endif + struct InternalNode* qbvh_node = internalNodes + nodeIdx; + uint childIndex = as_uint(reduced.upper.w) >> 4; + uint numChildren = as_uint(reduced.upper.w) & 7; + struct AABB* children = bbox_scratch + childIndex; + //InternalNode_setBounds(internalNodes + nodeIdx, bbox_scratch + (as_uint(reduced.upper.w) >> 4), as_uint(reduced.upper.w) & 7, &reduced); + InternalNode_setBounds_skip_prev(qbvh_node, 0, children, children, childIndex, numChildren, &reduced); + } +} + +////////////////////////////////////////////////////////////////////////////////////// +// +// Construction of treelets and paths +// +// + +// this is tiny bit tricky, when bottom-up thread haven't yet closed treelet this is number of startpoints that are under the node +// when thread closed treelets it the data is starts to be treelet ID +typedef uint TreeletNodeData; + +typedef struct TreeletsOpenNodeInfo { + // bool isTreeletRoot; // : 1 + short maxDepth; // : 14 + uint numStartpoints;// : 16 +} TreeletsOpenNodeInfo; + +typedef struct TreeletsClosedNodeInfo { + // bool isTreeletRoot; // : 1 + uint treeletId; // : 31 (when treelet is closed) +} TreeletsClosedNodeInfo; + +GRL_INLINE TreeletNodeData ClearTreeletRoot(TreeletNodeData D) +{ + return D & ((1u << 31u) - 1u); +} + +GRL_INLINE uint isTreeletRoot(TreeletNodeData E) +{ + return E >> 31; +} + +GRL_INLINE uint getNumStartpoints(TreeletNodeData E) +{ + return E & ((1 << 16) - 1); +} + +GRL_INLINE uint getMaxDepth(TreeletNodeData E) +{ + return (E >> 16) & ((1 << 14) - 1); +} + +// single startpoint treelet +GRL_INLINE uint isTrivialTreeletRoot(TreeletNodeData E) +{ + return (E >> 31) && (getMaxDepth(E) == 0); +} + +GRL_INLINE TreeletNodeData SetTipStartpoint(TreeletNodeData D) +{ + return ClearTreeletRoot(D) | (1 << 30); +} + +GRL_INLINE TreeletNodeData SetTreeletRoot(TreeletNodeData D) +{ + return D | (1 << 31); +} + +GRL_INLINE TreeletsOpenNodeInfo DecodeOpenInfo(TreeletNodeData E) +{ + TreeletsOpenNodeInfo I; + I.maxDepth = getMaxDepth(E); + I.numStartpoints = getNumStartpoints(E); + return I; +} + +GRL_INLINE TreeletNodeData EncodeOpenInfo(TreeletsOpenNodeInfo I, bool isRoot) +{ + TreeletNodeData D = isRoot ? (1 << 31) : 0; + D |= (I.maxDepth & ((1 << 14) - 1)) << 16; + D |= I.numStartpoints & ((1 << 16) - 1); + return D; +} + +GRL_INLINE TreeletsClosedNodeInfo DecodeClosedInfo(TreeletNodeData E) +{ + TreeletsClosedNodeInfo I; + I.treeletId = E & ((1u << 31u) - 1u); + return I; +} + +GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(TreeletsClosedNodeInfo I) +{ + TreeletNodeData D = (1u << 31u); // closed is always a root! + D |= I.treeletId & ((1u << 31u) - 1u); + return D; +} + +GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(uint treeletId) +{ + TreeletNodeData D = (1 << 31); // closed is always a root! 
+ D |= treeletId & ((1u << 31u) - 1u); + return D; +} + +GRL_INLINE void chk_close_Treelet( + RefitTreelet* TreeletDescsArr, + TreeletNodeData* nodeTreeletDataArr, + uint* StartPointBuffer, + uint* currStartpoint, + TreeletNodeData nodeData, + TreeletsOpenNodeInfo* nodeOpenInfo, + uint nodeIdx, + uint* treeletDescIdx) +{ + if (isTreeletRoot(nodeData)) + { + TreeletNodeData encoded = 0; + if (nodeOpenInfo->numStartpoints == 1) + { + encoded = ClearTreeletRoot(SetTipStartpoint(nodeData)); + } + else + { + RefitTreelet RTdesc; + RTdesc.startpoint_offset = *currStartpoint; + *currStartpoint += nodeOpenInfo->numStartpoints; + RTdesc.numStartpoints = nodeOpenInfo->numStartpoints; + RTdesc.maxDepth = nodeOpenInfo->maxDepth; + TreeletDescsArr[*treeletDescIdx] = RTdesc; + encoded = EncodeClosedInfo(*treeletDescIdx); + *treeletDescIdx = *treeletDescIdx + 1; + TreeletsOpenNodeInfo infoDefault = { 0, 0 }; + *nodeOpenInfo = infoDefault; + } + + nodeTreeletDataArr[nodeIdx] = encoded; + } + // printf("close_Treelet %d, nodeOpenInfo.numStartpoints %d, RTdesc.maxDepth %d, RTdesc.startpoint_offset %d\n", treeletDescIdx, nodeOpenInfo.numStartpoints, RTdesc.maxDepth, RTdesc.startpoint_offset); +} + + +// TreeletNodeData* treelets holds per node property, after running this some of them are marked as treelet root +GRL_INLINE void treelet_bottom_up_mark_treelets( + global struct BVHBase* bvh, + global InternalNode* internalNodes, + global StartPoint* scratch_startpoints, + uint curNodeIndex, + BackPointers* backPointers, + global TreeletNodeData* treelets, + uint refitTreeletsDataStart, + uint* startpointAlloc) +{ + TreeletsOpenNodeInfo currInfo; + currInfo.maxDepth = 0; + currInfo.numStartpoints = 1; + + global RefitTreelet* treeletDescs = (global RefitTreelet*) (((global char*)bvh) + (refitTreeletsDataStart * 64)); + + treelets[curNodeIndex] = EncodeOpenInfo(currInfo, true); + + /* the start node got already processed, thus go to its parent node */ + uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex); + curNodeIndex = parentPointer >> 6; + + bool isInTip = false; + while (curNodeIndex != 0x03FFFFFF) + { + uint numChildrenTotal = 0; + // numChildrenTotal and parentPointer gets updated... + // atomic trickery, on backpointers, only the last one thread enters up + { + /* increment refit counter that counts refitted children of current node */ + global uint* pCurrentBackpointer = (global uint*)InnerNode_GetBackPointer(backPointers, curNodeIndex); + mem_fence_gpu_invalidate(); + parentPointer = 1 + atomic_inc_global(pCurrentBackpointer); + + /* if all children got refitted, then continue */ + const uint numChildrenRefitted = (parentPointer >> 0) & 0x7; + numChildrenTotal = (parentPointer >> 3) & 0x7; + + if (numChildrenRefitted != numChildrenTotal) + return; + + /* reset refit counter for next refit */ + *pCurrentBackpointer = (parentPointer & 0xfffffff8); + } + + /* get children treelets */ + global struct InternalNode* node = internalNodes + curNodeIndex; + uint childrenIndices = curNodeIndex + node->childOffset; + global TreeletNodeData* childrenTreelets = treelets + childrenIndices; + + // yeah, it is possible we are pulling trash here, but we wont use it. 
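        // (any child slots past numChildrenTotal are overwritten with zero just below,
        //  so the over-read values never influence the startpoint/depth reduction)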
+ // this is for the sake of one non control flow spoiled data pull + TreeletNodeData dataCh0 = childrenTreelets[0]; TreeletNodeData dataCh1 = childrenTreelets[1]; + TreeletNodeData dataCh2 = childrenTreelets[2]; TreeletNodeData dataCh3 = childrenTreelets[3]; + TreeletNodeData dataCh4 = childrenTreelets[4]; TreeletNodeData dataCh5 = childrenTreelets[5]; + + // zero out the potential trash + if (numChildrenTotal < 3) dataCh2 = 0; + if (numChildrenTotal < 4) dataCh3 = 0; + if (numChildrenTotal < 5) dataCh4 = 0; + if (numChildrenTotal < 6) dataCh5 = 0; + + TreeletsOpenNodeInfo infoCh0 = DecodeOpenInfo(dataCh0); + TreeletsOpenNodeInfo infoCh1 = DecodeOpenInfo(dataCh1); + TreeletsOpenNodeInfo infoCh2 = DecodeOpenInfo(dataCh2); + TreeletsOpenNodeInfo infoCh3 = DecodeOpenInfo(dataCh3); + TreeletsOpenNodeInfo infoCh4 = DecodeOpenInfo(dataCh4); + TreeletsOpenNodeInfo infoCh5 = DecodeOpenInfo(dataCh5); + + uint numChildrenBeingRoots = isTreeletRoot(dataCh0) + isTreeletRoot(dataCh1) + isTreeletRoot(dataCh2) + isTreeletRoot(dataCh3) + isTreeletRoot(dataCh4) + isTreeletRoot(dataCh5); + // see if we should merge the trees, if not then we should move to tip. + currInfo.numStartpoints = infoCh0.numStartpoints + infoCh1.numStartpoints + infoCh2.numStartpoints + infoCh3.numStartpoints + infoCh4.numStartpoints + infoCh5.numStartpoints; + + bool isTipStartpoint = false; + if (!isInTip) + { + // TODO: threshold could be a dynamic parameter based on the number of actual inner nodes + bool mergeTreelets = ((currInfo.numStartpoints > 0) && (currInfo.numStartpoints < TREELET_NUM_STARTPOINTS)); + bool allChildrenRootsCurrently = numChildrenTotal == numChildrenBeingRoots; + if (mergeTreelets && allChildrenRootsCurrently) + { + childrenTreelets[0] = ClearTreeletRoot(dataCh0); + childrenTreelets[1] = ClearTreeletRoot(dataCh1); // -1 will be recognised then as this is not a treelet root. 
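+                // Slots 0 and 1 are cleared unconditionally (an inner node is
+                // assumed to have at least two children); the remaining slots
+                // only if present.  With every child's root flag cleared, the
+                // current node stays an open treelet root and absorbs the
+                // children's startpoints.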
+ if (numChildrenTotal > 2) childrenTreelets[2] = ClearTreeletRoot(dataCh2); + if (numChildrenTotal > 3) childrenTreelets[3] = ClearTreeletRoot(dataCh3); + if (numChildrenTotal > 4) childrenTreelets[4] = ClearTreeletRoot(dataCh4); + if (numChildrenTotal > 5) childrenTreelets[5] = ClearTreeletRoot(dataCh5); + } + else + { + isInTip = true; + isTipStartpoint = allChildrenRootsCurrently; + } + } + + // close any roots underneath + if (isInTip && numChildrenBeingRoots) + { + uint trivialRoots = isTrivialTreeletRoot(dataCh0) + isTrivialTreeletRoot(dataCh1) + isTrivialTreeletRoot(dataCh2) + + isTrivialTreeletRoot(dataCh3) + isTrivialTreeletRoot(dataCh4) + isTrivialTreeletRoot(dataCh5); + + uint treeletId = 0; + uint bottomStartpointSpace = 0; + + uint startpointsFromTiptree = trivialRoots; + + if (trivialRoots) isTipStartpoint = false; + + if (numChildrenBeingRoots > trivialRoots) + { + startpointsFromTiptree += // startpoint ONLY from tiptree + (1 - isTreeletRoot(dataCh0)) * infoCh0.numStartpoints + + (1 - isTreeletRoot(dataCh1)) * infoCh1.numStartpoints + + (1 - isTreeletRoot(dataCh2)) * infoCh2.numStartpoints + + (1 - isTreeletRoot(dataCh3)) * infoCh3.numStartpoints + + (1 - isTreeletRoot(dataCh4)) * infoCh4.numStartpoints + + (1 - isTreeletRoot(dataCh5)) * infoCh5.numStartpoints; + + treeletId = atomic_add_global((global uint*)BVHBase_GetRefitTreeletCntPtr(bvh), numChildrenBeingRoots - trivialRoots); + bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints - startpointsFromTiptree); + } + + currInfo.numStartpoints = startpointsFromTiptree; + + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh0, &infoCh0, childrenIndices + 0, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh1, &infoCh1, childrenIndices + 1, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh2, &infoCh2, childrenIndices + 2, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh3, &infoCh3, childrenIndices + 3, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh4, &infoCh4, childrenIndices + 4, &treeletId); + chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh5, &infoCh5, childrenIndices + 5, &treeletId); + } + + if (isTipStartpoint) + { + currInfo.maxDepth = 0; + currInfo.numStartpoints = 1; + } + else + { + // reduce max depth and number of startpoint underneath + currInfo.maxDepth = max(max(max(infoCh0.maxDepth, infoCh1.maxDepth), + max(infoCh2.maxDepth, infoCh3.maxDepth)), + max(infoCh4.maxDepth, infoCh5.maxDepth)) + 1; + } + + treelets[curNodeIndex] = EncodeOpenInfo( + currInfo, + !isInTip /*mark marged treelet as an new root iff we are in bottom we */); + + /* make parent node the current node */ + curNodeIndex = parentPointer >> 6; + } + + uint treeletId = *BVHBase_GetRefitTreeletCntPtr(bvh); + + uint bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints); + + treelets[0] = EncodeClosedInfo(treeletId); + RefitTreelet tipTreeletDesc; + tipTreeletDesc.startpoint_offset = bottomStartpointSpace; + tipTreeletDesc.numStartpoints = currInfo.numStartpoints; + tipTreeletDesc.maxDepth = currInfo.maxDepth; + + treeletDescs[treeletId] = tipTreeletDesc; + + uint realNumberOfTreelets = treeletId + 1; + // intentionally we set less by 1, 
because this number is used in num groups for dispatch which is number of bottom treelets + // so substract 1. Except single treelet tree which is should stay 1. + uint numStartingTreelets = (treeletId == 0) ? 1 : treeletId; + + *BVHBase_GetRefitTreeletCntPtr(bvh) = numStartingTreelets; + + uint treeletDescSpaceIn64B = (realNumberOfTreelets * sizeof(RefitTreelet) + 63) >> 6; + uint startpointSpaceIn64B = ((bottomStartpointSpace + currInfo.numStartpoints) * sizeof(StartPoint) + 63) >> 6; + bvh->refitStartPointDataStart = refitTreeletsDataStart + treeletDescSpaceIn64B; + bvh->BVHDataEnd = refitTreeletsDataStart +treeletDescSpaceIn64B + startpointSpaceIn64B; + *startpointAlloc = 0; +} + + +GRL_INLINE void find_refit_treelets( + global struct BVHBase* bvh, + global TreeletNodeData* treelets, + global uint* scratchStartpoints, + global uint* startpointAlloc) +{ + /* get pointer to inner nodes and back pointers */ + uniform global InternalNode* inner_nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh); + + /* construct range of nodes that each work group will process */ + uniform const uint numInnerNodes = BVHBase_numNodes(bvh); + + varying ushort lane = get_sub_group_local_id(); + varying uint global_id = get_local_id(0) + get_group_id(0) * get_local_size(0); + + uint numBackpointers = BVHBase_GetNumInternalNodes(bvh); + + // align to 64B and divide + uint treeletOffsetIn64B = ((numBackpointers * sizeof(uint)) + 63) >> 6; + + uint refitTreeletsDataStart = bvh->backPointerDataStart + treeletOffsetIn64B; + if (global_id == 0) + { + bvh->refitTreeletsDataStart = refitTreeletsDataStart; + } + + global struct InternalNode* curNode = &inner_nodes[global_id]; + + varying ushort has_startpoint = 0; + if (global_id < numInnerNodes) { + if ((curNode->nodeType != BVH_INTERNAL_NODE)) + { + has_startpoint = 1; + } + } + + if (has_startpoint == 0) + return; + + treelet_bottom_up_mark_treelets( + bvh, + inner_nodes, + scratchStartpoints, + global_id, + BVHBase_GetBackPointers(bvh), + treelets, + refitTreeletsDataStart, + startpointAlloc); +} + +GRL_INLINE void assign_refit_startpoints_to_treelets( + global struct BVHBase* bvh, + global TreeletNodeData* treelets, + global uint* scratchStartpoints) +{ + /* get pointer to inner nodes and back pointers */ + uniform global struct InternalNode* inner_nodes = (global struct InternalNode*) BVHBase_GetInternalNodes(bvh); + + /* construct range of nodes that each work group will process */ + uniform const uint numInnerNodes = BVHBase_numNodes(bvh); + + varying ushort lane = get_sub_group_local_id(); + varying uint starPointNode = get_local_id(0) + get_group_id(0) * get_local_size(0); + varying uint curNodeIndex = starPointNode; + global struct InternalNode* curNode = &inner_nodes[curNodeIndex]; + + varying ushort is_startpoint = 0; + + if (curNodeIndex < numInnerNodes) + { + if ((curNode->nodeType != BVH_INTERNAL_NODE)) + { + is_startpoint = 1; + } + } + + if (is_startpoint == 0) + { + return; + } + + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh); + uint numTreelets = *BVHBase_GetRefitTreeletCntPtr(bvh); + if (numTreelets > 1) numTreelets++; + + uint myDepthWhenDead = 0; + uint startpointsBeforeMe = 0; + bool dead = false; + + uint prevNodeIndex = 0x03FFFFFF; + + while (curNodeIndex != 0x03FFFFFF) + { + TreeletNodeData nodeData = treelets[curNodeIndex]; + + uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex); + uint numChildren = 
BackPointer_GetNumChildren(parentPointer); + + // this is counterpart of atomic based entrance decision. + // the alive path is the longest, if two are equal take the one that came through child with smaller index. + if (prevNodeIndex != 0x03FFFFFF) + { + uint leadChildOfCur = curNodeIndex + inner_nodes[curNodeIndex].childOffset; + uint childEnd = numChildren + leadChildOfCur; + + uint longestPath = 0; + uint longestPathChildIdx = leadChildOfCur; + + for (uint child = leadChildOfCur; child < childEnd; child++) + { + TreeletNodeData childData = treelets[child]; + if (!isTreeletRoot(childData)) + { + TreeletsOpenNodeInfo childinfo = DecodeOpenInfo(childData); + if (longestPath <= childinfo.maxDepth) { + longestPathChildIdx = child; + longestPath = childinfo.maxDepth + 1; + } + + if (child < prevNodeIndex) + { + // also count how many startpoints are there before me (used to place startpoint in proper slot) + startpointsBeforeMe += childinfo.numStartpoints; + } + } + } + + if (!dead && prevNodeIndex != longestPathChildIdx) + { + dead = true; + //printf("starPointNode %d dies in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead); + } + + if (!dead) // this "if" is not an "else" to abouve as we might be dead before and comming through the same child index + { + myDepthWhenDead = longestPath; + // it is a startpoint + //printf("starPointNode %d in node %d lives up, its myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead); + } + + if (starPointNode == (uint)-1) { + // we just entered upper treelet as treelet if we are alive, we can be a new startpoint in new treelet + if (dead) + { + //printf("starPointNode %d disappears in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead); + // and we are dead, so we are not a startpoint of tip, + // so we must disappear to not be added as a startpoint. + return; + } + else + { + // it is a startpoint + //printf("starPointNode %d in node %d becoming its new startpoint\n", starPointNode, curNodeIndex); + starPointNode = curNodeIndex; + } + } + } + + if (isTreeletRoot(nodeData)) + { + TreeletsClosedNodeInfo info = DecodeClosedInfo(nodeData); + RefitTreelet treeletDesc = treeletDescs[info.treeletId]; + uint startpointSlot = treeletDesc.startpoint_offset + startpointsBeforeMe; + scratchStartpoints[startpointSlot] = (starPointNode << 6) + (myDepthWhenDead & ((1 << 6) - 1)); + + //printf("Adding to treeletID %d at root %d startpoint %d StartNodeIdx %d, depth %d\n", info.treeletId, curNodeIndex, startpointSlot, starPointNode, myDepthWhenDead); + + if (dead) return; + myDepthWhenDead = 0; + startpointsBeforeMe = 0; + starPointNode = (uint)-1; + } + + /* make parent node the current node */ + prevNodeIndex = curNodeIndex; + curNodeIndex = BackPointer_GetParentIndex(parentPointer); + //if(!dead) + //printf("starPointNode %d move from node %d to %d\n", starPointNode, prevNodeIndex, curNodeIndex); + } +} + +const uint FINALIZE_TREELETS_SLM_DEPTHS_SPACE = 32; + +GRL_INLINE void finalize_treelets_in_groups( + global struct BVHBase* bvh, + global uint* scratchStartpoints, + local uint* depths) +{ + uint numTreeletsExecuted = *BVHBase_GetRefitTreeletCntPtr(bvh); + + uint local_id = get_local_id(0); + + uint numTreelets = (numTreeletsExecuted > 1) ? 
numTreeletsExecuted + 1 : numTreeletsExecuted; + + RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh); + + for (uint treeletId = get_group_id(0); treeletId < numTreelets; treeletId += numTreeletsExecuted) + { + if (treeletId == numTreeletsExecuted && treeletId != 0) { work_group_barrier(CLK_LOCAL_MEM_FENCE); } + + RefitTreelet treeletDesc = treeletDescs[treeletId]; + StartPoint* srcStartpoints = scratchStartpoints + treeletDesc.startpoint_offset; + if (treeletDesc.numStartpoints <= 1) + { + // for smaller latency we store 1 element treelets as RefitTreeletTrivial, + // this happens most of the time for tip treelet + if (local_id == 0) + { + RefitTreeletTrivial tr = { 0, treeletDesc.numStartpoints, 0, treeletDesc.maxDepth, 0 }; + if (treeletDesc.numStartpoints == 1) + { + StartPoint sp = srcStartpoints[0]; + + tr.theOnlyNodeIndex = StartPoint_GetNodeIdx(sp); + uint backpointer = *InnerNode_GetBackPointer(BVHBase_GetBackPointers(bvh), tr.theOnlyNodeIndex); + tr.numChildrenOfTheNode = BackPointer_GetNumChildren(backpointer); + tr.childrenOffsetOfTheNode = BVHBase_GetInternalNodes(bvh)[tr.theOnlyNodeIndex].childOffset + tr.theOnlyNodeIndex; + } + RefitTreeletTrivial* trivial = (RefitTreeletTrivial*)(treeletDescs + treeletId); + *trivial = tr; +#if REFIT_VERBOSE_LOG + printf("treelet trivial %d {\n theOnlyNodeIndex = %d;\n numStartpoints = %d;\n childrenOffsetOfTheNode = %d;\n maxDepth =%d;\n numChildrenOfTheNode = %d;\n}\n", + treeletId, + tr.theOnlyNodeIndex, + tr.numStartpoints, + tr.childrenOffsetOfTheNode, + tr.maxDepth, + tr.numChildrenOfTheNode); +#endif + } + } + else + { +#define SKIP_PATHS_SORTING 0 +#if SKIP_PATHS_SORTING + StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset; + for (uint startpointID = local_id; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0)) + { + dstStartpoints[startpointID] = srcStartpoints[startpointID]; + } +#else + //if (local_id == 0) { printf("treelet %d, numStartpoints = %d\n", treeletId, numStartpoints); } + + if (local_id <= treeletDesc.maxDepth) { + depths[local_id] = 0; + // printf("initializing slm treelet %d, depths[%d] = 0\n", treeletId, local_id); + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + uint loopSize = ((treeletDesc.numStartpoints + (get_sub_group_size() - 1)) / get_sub_group_size()) * get_sub_group_size(); + + // collect histogram of how many paths of given length we have + + // keep count of depth 0 + uint val = 0; + + // optimize: we will load Startpoint only once to + uint S_c[8]; + // optimize: keep accumulated numbers in registers to limit number of atomic ops + uint D_c[8] = { 0 }; + + uint cached_threshold = 8 * get_local_size(0); + cached_threshold = min(cached_threshold, treeletDesc.numStartpoints); + + uint loop_turn = 0; + uint sgid = get_sub_group_local_id(); + + for (uint startpointID = local_id+ cached_threshold; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0)) + { + uint dstSlot = StartPoint_GetDepth(srcStartpoints[startpointID]); + atomic_inc((volatile local uint*) (depths + dstSlot)); + } + + uint HistogramSG = 0; + if (treeletDesc.maxDepth < 8) + { + for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) + { + StartPoint S = srcStartpoints[startpointID]; + S_c[loop_turn++] = S; + uint dstSlot = StartPoint_GetDepth(S); + D_c[dstSlot]++; + } + + for (uint d = 0; d <= treeletDesc.maxDepth; d++) + { + val = sub_group_reduce_add(D_c[d]); + if (sgid == d) + { + 
HistogramSG = val; + } + } + if (sgid <= treeletDesc.maxDepth && HistogramSG != 0) + { + atomic_add((volatile local uint*) (depths + sgid), HistogramSG); + } + } + else + { + for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) + { + StartPoint S = srcStartpoints[startpointID]; + S_c[loop_turn++] = S; + uint dstSlot = StartPoint_GetDepth(S); + atomic_inc((volatile local uint*) (depths + dstSlot)); + } + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + +#if REFIT_VERBOSE_LOG + if (local_id == 0) + { + for (uint d = 0; d <= treeletDesc.maxDepth; d++) + { + printf("treelet %d depths[%d] = %d\n", treeletId, d, depths[d]); + } + } +#endif + + if (treeletDesc.maxDepth < get_sub_group_size()) + { + if (get_sub_group_id() == 0) + { + + uint cntOfDepth = 0; + if (sgid <= treeletDesc.maxDepth) { + cntOfDepth = depths[sgid]; + } + uint pref_sum = sub_group_scan_exclusive_add(cntOfDepth); + depths[sgid] = pref_sum; + + uint numLeft = treeletDesc.numStartpoints - (pref_sum); + uint depthLess64 = (numLeft < 64 ) ? (uint)sgid : (uint)treeletDesc.maxDepth; + uint depthLess128 = (numLeft < 128) ? (uint)sgid : (uint)treeletDesc.maxDepth; + uint depthLess256 = (numLeft < 256) ? (uint)sgid : (uint)treeletDesc.maxDepth; + + // filling data for thread 0 who will save this to mem + treeletDesc.depthLess64 = sub_group_reduce_min(depthLess64); + treeletDesc.depthLess128 = sub_group_reduce_min(depthLess128); + treeletDesc.depthLess256 = sub_group_reduce_min(depthLess256); + treeletDesc.numNonTrivialStartpoints = treeletDesc.numStartpoints - cntOfDepth; + + if (sgid == 0) { + treeletDescs[treeletId] = treeletDesc; +#if REFIT_VERBOSE_LOG + printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; \n maxDepth = %d;\n depthLess64 = %d;\n depthLess128 = %d;\n depthLess256 = %d;\n}\n", + treeletId, + treeletDesc.startpoint_offset, + treeletDesc.numStartpoints, + treeletDesc.numNonTrivialStartpoints, + treeletDesc.maxDepth, + treeletDesc.depthLess64, + treeletDesc.depthLess128, + treeletDesc.depthLess256); +#endif + } + } + } + else if (local_id <= treeletDesc.maxDepth) { + uint thisdepthcount = depths[local_id]; + treeletDesc.depthLess64 = 0; + treeletDesc.depthLess128 = 0; + treeletDesc.depthLess256 = 0; + uint numLeft = treeletDesc.numStartpoints; + uint pref_sum = 0; + + for (uint d = 0; d < local_id; d++) + { + uint depthCnt = depths[d]; + if (numLeft > 64) { treeletDesc.depthLess64 = d + 1; } + if (numLeft > 128) { treeletDesc.depthLess128 = d + 1; } + if (numLeft > 256) { treeletDesc.depthLess256 = d + 1; } + pref_sum += depthCnt; + numLeft -= depthCnt; + if (d == 0) { treeletDesc.numNonTrivialStartpoints = numLeft; } + } + + if (local_id == treeletDesc.maxDepth) + { + treeletDescs[treeletId] = treeletDesc; +#if REFIT_VERBOSE_LOG + printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; maxDepth = %d;\n depthLess64 = %d; depthLess128 = %d; depthLess256 = %d;\n}\n", + treeletId, + treeletDesc.startpoint_offset, + treeletDesc.numStartpoints, + treeletDesc.numNonTrivialStartpoints, + treeletDesc.maxDepth, + treeletDesc.depthLess64, + treeletDesc.depthLess128, + treeletDesc.depthLess256); +#endif + } + } + + StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset; + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + loop_turn = 0; + if (treeletDesc.maxDepth < 8) + { + uint prefixSG = 0; + + // make prefixSG keep interval for paths with sglid 
depth that is separated out for sg. + if (sgid <= treeletDesc.maxDepth && HistogramSG != 0) + { + prefixSG = atomic_add((volatile local uint*) (depths + sgid), HistogramSG); + } + + // from now on all sgs run independently + + // make D_c keep offset interval that is separated out for given lane + for (uint d = 0; d <= treeletDesc.maxDepth; d++) + { + uint thisDPrefixSg = sub_group_broadcast(prefixSG, d); + uint thisLaneCount = D_c[d]; + uint laneOffset = sub_group_scan_exclusive_add(thisLaneCount); + D_c[d] = laneOffset + thisDPrefixSg; + } + + for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) + { + StartPoint S = S_c[loop_turn++]; + uint d = StartPoint_GetDepth(S); + uint dstSlot = D_c[d]++; + dstStartpoints[dstSlot] = S; + } + } + else + { + for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) + { + StartPoint S = S_c[loop_turn++]; + uint d = StartPoint_GetDepth(S); + uint dstSlot = atomic_inc((volatile local uint*) (depths + d)); + dstStartpoints[dstSlot] = S; + } + } + + for (uint srcStartpointID = local_id+ cached_threshold; srcStartpointID < treeletDesc.numStartpoints; srcStartpointID += get_local_size(0)) + { + StartPoint S = srcStartpoints[srcStartpointID]; + uint d = StartPoint_GetDepth(srcStartpoints[srcStartpointID]); + uint dstSlot = atomic_inc((volatile local uint*) (depths+ d)); + dstStartpoints[dstSlot] = S; + } +#endif //skip sorting + } + } +} diff --git a/src/intel/vulkan/grl/gpu/bvh_copy.cl b/src/intel/vulkan/grl/gpu/bvh_copy.cl new file mode 100644 index 00000000000..6e76f195095 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_copy.cl @@ -0,0 +1,763 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "d3d12.h" +#include "common.h" +#include "mem_utils.h" +#include "misc_shared.h" + +#define offsetof(TYPE, ELEMENT) ((size_t)&(((TYPE *)0)->ELEMENT)) + +GRL_INLINE +uint GroupCountForCopySize(uint size) +{ + return (size >> 8) + 4; +} + +GRL_INLINE +uint GroupCountForCopy(BVHBase* base) +{ + return GroupCountForCopySize(base->Meta.allocationSize); +} + +GRL_INLINE void copyInstanceDescs(InstanceDesc* instances, D3D12_RAYTRACING_INSTANCE_DESC* descs, uint64_t numInstances) +{ + for (uint64_t instanceIndex = get_local_id(0); instanceIndex < numInstances; instanceIndex += get_local_size(0)) + { + for (uint row = 0; row < 3; row++) + { + for (uint column = 0; column < 4; column++) + { + D3D12_set_transform(&descs[instanceIndex], row, column, InstanceDesc_get_transform(&instances[instanceIndex], row, column)); + } + } + D3D12_set_instanceID(&descs[instanceIndex], InstanceDesc_get_instanceID(&instances[instanceIndex])); + D3D12_set_InstanceMask(&descs[instanceIndex], InstanceDesc_get_InstanceMask(&instances[instanceIndex])); + D3D12_set_InstanceContributionToHitGroupIndex(&descs[instanceIndex], InstanceDesc_get_InstanceContributionToHitGroupIndex(&instances[instanceIndex])); + D3D12_set_InstanceFlags(&descs[instanceIndex], InstanceDesc_get_InstanceFlags(&instances[instanceIndex])); + D3D12_set_AccelerationStructure(&descs[instanceIndex], InstanceDesc_get_AccelerationStructure(&instances[instanceIndex])); + } +} + +GRL_INLINE void createGeoDescs(GeoMetaData* geoMetaData, D3D12_RAYTRACING_GEOMETRY_DESC* descs, uint64_t numGeos, const uint64_t dataBufferStart) +{ + if (get_local_id(0) == 0) + { + uint64_t previousGeoDataBufferEnd = dataBufferStart; + for (uint64_t geoIndex = 0; geoIndex < 
numGeos; geoIndex += 1) + { + D3D12_set_Type(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Type)); + D3D12_set_Flags(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Flags)); + if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES) + { + // Every triangle is stored separately + uint64_t vertexBufferSize = 9 * sizeof(float) * geoMetaData[geoIndex].PrimitiveCount; + D3D12_set_triangles_Transform(&descs[geoIndex], 0); + D3D12_set_triangles_IndexFormat(&descs[geoIndex], INDEX_FORMAT_NONE); + D3D12_set_triangles_VertexFormat(&descs[geoIndex], VERTEX_FORMAT_R32G32B32_FLOAT); + D3D12_set_triangles_IndexCount(&descs[geoIndex], 0); + D3D12_set_triangles_VertexCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount * 3); + D3D12_set_triangles_IndexBuffer(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd); + D3D12_set_triangles_VertexBuffer_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd); + D3D12_set_triangles_VertexBuffer_StrideInBytes(&descs[geoIndex], 3 * sizeof(float)); + previousGeoDataBufferEnd += vertexBufferSize; + } + else + { + D3D12_set_procedurals_AABBCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount); + D3D12_set_procedurals_AABBs_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd); + D3D12_set_procedurals_AABBs_StrideInBytes(&descs[geoIndex], sizeof(D3D12_RAYTRACING_AABB)); + previousGeoDataBufferEnd += sizeof(D3D12_RAYTRACING_AABB) * geoMetaData[geoIndex].PrimitiveCount; + } + } + } +} + +GRL_INLINE void copyIndiciesAndVerticies(D3D12_RAYTRACING_GEOMETRY_DESC* desc, QuadLeaf* quad) +{ + float* vertices = (float*)D3D12_get_triangles_VertexBuffer_StartAddress(desc); + uint64_t firstTriangleIndex = quad->primIndex0; + uint64_t numTriangles = QuadLeaf_IsSingleTriangle(quad) ? 
1 : 2; + + vertices[firstTriangleIndex * 9] = quad->v[0][0]; + vertices[firstTriangleIndex * 9 + 1] = quad->v[0][1]; + vertices[firstTriangleIndex * 9 + 2] = quad->v[0][2]; + + vertices[firstTriangleIndex * 9 + 3] = quad->v[1][0]; + vertices[firstTriangleIndex * 9 + 4] = quad->v[1][1]; + vertices[firstTriangleIndex * 9 + 5] = quad->v[1][2]; + + vertices[firstTriangleIndex * 9 + 6] = quad->v[2][0]; + vertices[firstTriangleIndex * 9 + 7] = quad->v[2][1]; + vertices[firstTriangleIndex * 9 + 8] = quad->v[2][2]; + + if (numTriangles == 2) + { + uint64_t secondTriangleIndex = firstTriangleIndex + QuadLeaf_GetPrimIndexDelta(quad); + uint32_t packed_indices = QuadLeaf_GetSecondTriangleIndices(quad); + for( size_t i=0; i<3; i++ ) + { + uint32_t idx = packed_indices & 3 ; packed_indices >>= 2; + for( size_t j=0; j<3; j++ ) + vertices[secondTriangleIndex * 9 + i * 3 + j] = quad->v[idx][j]; + } + } +} + +GRL_INLINE +void storeProceduralDesc( + struct AABB procAABB, + uint32_t primId, + D3D12_RAYTRACING_GEOMETRY_DESC* geoDesc) +{ + D3D12_RAYTRACING_AABB* proceduralDescs = (D3D12_RAYTRACING_AABB*)D3D12_get_procedurals_AABBs_StartAddress(geoDesc); + D3D12_set_raytracing_aabb(&proceduralDescs[primId], &procAABB); +} + +GRL_INLINE +void copyDataFromLProcedurals( + BVHBase* base, + D3D12_RAYTRACING_GEOMETRY_DESC* descs) +{ + unsigned numProcedurals = BVHBase_GetNumProcedurals(base); + InternalNode* innerNodes = BVHBase_GetInternalNodes(base); + unsigned numInnerNodes = BVHBase_GetNumInternalNodes(base); + + if (BVHBase_GetNumProcedurals(base) > 0) //< there's no point entering here if there are no procedurals + { + + // iterate on all inner nodes to identify those with procedural children, we have to take aabbs from them + for (uint32_t nodeI = get_local_id(0); nodeI < numInnerNodes; nodeI += get_local_size(0)) + { + InternalNode* innerNode = innerNodes + nodeI; + + if (innerNode->nodeType == NODE_TYPE_PROCEDURAL) + { + float* origin = innerNode->lower; + + global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer((struct QBVHNodeN*)innerNode); + + for (uint k = 0; k < 6; k++) + { + if (InternalNode_IsChildValid(innerNode, k)) + { + struct AABB3f qbounds = { + (float)(innerNode->lower_x[k]), (float)(innerNode->lower_y[k]), (float)(innerNode->lower_z[k]), + (float)(innerNode->upper_x[k]), (float)(innerNode->upper_y[k]), (float)(innerNode->upper_z[k]) }; + + struct AABB dequantizedAABB; + + dequantizedAABB.lower[0] = origin[0] + bitShiftLdexp(qbounds.lower[0], innerNode->exp_x - 8); + dequantizedAABB.lower[1] = origin[1] + bitShiftLdexp(qbounds.lower[1], innerNode->exp_y - 8); + dequantizedAABB.lower[2] = origin[2] + bitShiftLdexp(qbounds.lower[2], innerNode->exp_z - 8); + dequantizedAABB.upper[0] = origin[0] + bitShiftLdexp(qbounds.upper[0], innerNode->exp_x - 8); + dequantizedAABB.upper[1] = origin[1] + bitShiftLdexp(qbounds.upper[1], innerNode->exp_y - 8); + dequantizedAABB.upper[2] = origin[2] + bitShiftLdexp(qbounds.upper[2], innerNode->exp_z - 8); + + dequantizedAABB = conservativeAABB(&dequantizedAABB); + /* extract geomID and primID from leaf */ + const uint startPrim = QBVHNodeN_startPrim((struct QBVHNodeN*) innerNode, k); + const uint geomID = ProceduralLeaf_geomIndex(leaf); + const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf! 
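+                        // The dequantized box (origin + q * 2^(exp - 8), assuming
+                        // bitShiftLdexp(v, e) acts like ldexp) is emitted as a
+                        // D3D12_RAYTRACING_AABB at slot 'primID' of this geometry's
+                        // output buffer via storeProceduralDesc() below.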
+ + storeProceduralDesc(dequantizedAABB, primID, descs + geomID); + } + /* advance leaf pointer to next child */ + leaf += QBVHNodeN_blockIncr((struct QBVHNodeN*)innerNode, k); + } + + } + else if (innerNode->nodeType == NODE_TYPE_MIXED) { ERROR(); } + else {/* do nothing for other internal node types, they can't have procedural child (directly)*/; } + } + } +} + +GRL_INLINE +void copyDataFromQuadLeaves(BVHBase* base, + D3D12_RAYTRACING_GEOMETRY_DESC* descs) +{ + QuadLeaf* quads = BVHBase_GetQuadLeaves(base); + uint64_t numQuads = BVHBase_GetNumQuads(base); + for (uint64_t quadIdx = get_local_id(0); quadIdx < numQuads; quadIdx += get_local_size(0)) + { + uint64_t descIdx = PrimLeaf_GetGeoIndex(&quads[quadIdx].leafDesc); + copyIndiciesAndVerticies(&descs[descIdx], &quads[quadIdx]); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel clone_indirect(global char* dest, + global char* src) +{ + BVHBase* base = (BVHBase*)src; + uint64_t bvhSize = base->Meta.allocationSize; + + uint numGroups = GroupCountForCopy(base); + CopyMemory(dest, src, bvhSize, numGroups); +} + +GRL_INLINE void compactT(global char* dest, global char* src, uint64_t compactedSize, uint skipCopy, uint groupCnt) +{ + global BVHBase* baseSrc = (global BVHBase*)src; + global BVHBase* baseDest = (global BVHBase*)dest; + + uint32_t offset = sizeof(BVHBase); + uint32_t numNodes = BVHBase_GetNumInternalNodes(baseSrc); + uint32_t nodeSize = numNodes * sizeof(InternalNode); + offset += nodeSize; + + int quadChildFix = baseSrc->quadLeafStart; + int procChildFix = baseSrc->proceduralDataStart; + int instChildFix = baseSrc->instanceLeafStart; + + // serialization already copies part of bvh base so skip this part + CopyMemory(dest + skipCopy, src + skipCopy, sizeof(BVHBase) - skipCopy, groupCnt); + baseDest->Meta.allocationSize = compactedSize; + + if (baseSrc->Meta.instanceCount) + { + const uint32_t instLeafsSize = BVHBase_GetNumHWInstanceLeaves(baseSrc) * sizeof(HwInstanceLeaf); + CopyMemory(dest + offset, (global char*)BVHBase_GetHWInstanceLeaves(baseSrc), instLeafsSize, groupCnt); + const uint instanceLeafStart = (uint)(offset / 64); + baseDest->instanceLeafStart = instanceLeafStart; + instChildFix -= instanceLeafStart; + offset += instLeafsSize; + baseDest->instanceLeafEnd = (uint)(offset / 64); + } + if (baseSrc->Meta.geoCount) + { + const uint quadLeafsSize = BVHBase_GetNumQuads(baseSrc) * sizeof(QuadLeaf); + if (quadLeafsSize) + { + CopyMemory(dest + offset, (global char*)BVHBase_GetQuadLeaves(baseSrc), quadLeafsSize, groupCnt); + const uint quadLeafStart = (uint)(offset / 64); + baseDest->quadLeafStart = quadLeafStart; + quadChildFix -= quadLeafStart; + offset += quadLeafsSize; + baseDest->quadLeafCur = (uint)(offset / 64); + } + + const uint procLeafsSize = BVHBase_GetNumProcedurals(baseSrc) * sizeof(ProceduralLeaf); + if (procLeafsSize) + { + CopyMemory(dest + offset, (global char*)BVHBase_GetProceduralLeaves(baseSrc), procLeafsSize, groupCnt); + const uint proceduralDataStart = (uint)(offset / 64); + baseDest->proceduralDataStart = proceduralDataStart; + procChildFix -= proceduralDataStart; + offset += procLeafsSize; + baseDest->proceduralDataCur = (uint)(offset / 64); + } + } + // copy nodes with fixed child offsets + global uint* nodeDest = (global uint*)(dest + sizeof(BVHBase)); + global InternalNode* nodeSrc = (global InternalNode*)BVHBase_GetInternalNodes(baseSrc); + // used in mixed case + 
char* instanceLeavesBegin = (char*)BVHBase_GetHWInstanceLeaves(baseSrc); + char* instanceLeavesEnd = (char*)BVHBase_GetHWInstanceLeaves_End(baseSrc); + uint localId = get_sub_group_local_id(); + for (uint i = get_group_id(0); i < numNodes; i += groupCnt) + { + uint nodePart = CacheLineSubgroupRead((const global char*)&nodeSrc[i]); + char nodeType = as_char4(sub_group_broadcast(nodePart, offsetof(InternalNode, nodeType) / 4))[0]; + if (localId * 4 == offsetof(InternalNode, childOffset)) + { + int childOffset = as_int(nodePart); + if (nodeType == NODE_TYPE_MIXED) + { + char* childPtr = (char*)&nodeSrc[i] + 64 * childOffset; + if (childPtr > instanceLeavesBegin && childPtr < instanceLeavesEnd) + nodePart = as_int(childOffset - instChildFix); + } + else if (nodeType == NODE_TYPE_INSTANCE) + nodePart = as_int(childOffset - instChildFix); + else if (nodeType == NODE_TYPE_QUAD) + nodePart = as_int(childOffset - quadChildFix); + else if (nodeType == NODE_TYPE_PROCEDURAL) + nodePart = as_int(childOffset - procChildFix); + } + nodeDest[i * 16 + localId] = nodePart; + } + + if (baseSrc->Meta.instanceCount) + { + const uint32_t instanceDescSize = baseSrc->Meta.instanceCount * sizeof(InstanceDesc); + CopyMemory(dest + offset, src + baseSrc->Meta.instanceDescsStart, instanceDescSize, groupCnt); + baseDest->Meta.instanceDescsStart = offset; + offset += instanceDescSize; + } + if (baseSrc->Meta.geoCount) + { + const uint32_t geoMetaSize = baseSrc->Meta.geoCount * sizeof(GeoMetaData); + CopyMemory(dest + offset, src + baseSrc->Meta.geoDescsStart, geoMetaSize, groupCnt); + baseDest->Meta.geoDescsStart = offset; + offset += (geoMetaSize + 63) & ~63; // align to 64 + } + + uint backPointerDataStart = offset / 64; + uint refitTreeletsDataStart = backPointerDataStart; + uint refitStartPointDataStart = backPointerDataStart; + uint dataEnd = backPointerDataStart; + uint fatLeafTableStart = dataEnd; + uint fatLeafCount = baseSrc->fatLeafCount; + uint innerTableStart = dataEnd; + uint innerCount = baseSrc->innerCount; + + uint quadLeftoversCountNewAtomicUpdate = baseSrc->quadLeftoversCountNewAtomicUpdate; + uint quadTableSizeNewAtomicUpdate = baseSrc->quadTableSizeNewAtomicUpdate; + uint quadIndicesDataStart = dataEnd; + + if (BVHBase_HasBackPointers(baseSrc)) + { +#if 0 // + const uint oldbackpontersDataStart = baseSrc->backPointerDataStart; + const uint shift = oldbackpontersDataStart - backPointerDataStart; + const uint refitStructsSize = ((BVHBase_GetRefitStructsDataSize(baseSrc)) + 63) & ~63; + + CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), refitStructsSize, groupCnt); + + refitTreeletsDataStart = baseSrc->refitTreeletsDataStart - shift; + refitStartPointDataStart = baseSrc->refitStartPointDataStart - shift; + dataEnd = baseSrc->BVHDataEnd - shift; +#else // compacting version + const uint backpointersSize = ((numNodes*sizeof(uint)) + 63) & ~63; + CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), backpointersSize, groupCnt); + offset += backpointersSize; + + refitTreeletsDataStart = offset / 64; + refitStartPointDataStart = offset / 64; + + // TODO: remove treelets from .... everywhere + const uint treeletExecutedCnt = *BVHBase_GetRefitTreeletCntPtr(baseSrc); + + if (treeletExecutedCnt) + { + const uint treeletCnt = treeletExecutedCnt > 1 ? 
treeletExecutedCnt + 1 : 1; + + refitTreeletsDataStart = offset / 64; + const uint treeletsSize = ((treeletCnt * sizeof(RefitTreelet)) + 63) & ~63; + RefitTreelet* destTreelets = (RefitTreelet*)(dest + offset); + RefitTreelet* srcTreelets = BVHBase_GetRefitTreeletDescs(baseSrc); + + uint numThreads = groupCnt * get_local_size(0); + uint globalID = (get_group_id(0) * get_local_size(0)) + get_local_id(0); + + for (uint i = globalID; i < treeletCnt; i += numThreads) + { + RefitTreelet dsc = srcTreelets[i]; + RefitTreeletTrivial* trivial_dsc = (RefitTreeletTrivial*)&dsc; + if (trivial_dsc->numStartpoints == 1 && trivial_dsc->childrenOffsetOfTheNode > numNodes) { + trivial_dsc->childrenOffsetOfTheNode -= quadChildFix; + } + destTreelets[i] = dsc; + } + + offset += treeletsSize; + + refitStartPointDataStart = offset / 64; + const uint startPointsSize = (BVHBase_GetRefitStartPointsSize(baseSrc) + 63) & ~63; + CopyMemory(dest + offset, (global char*)BVHBase_GetRefitStartPoints(baseSrc), startPointsSize, groupCnt); + offset += startPointsSize; + dataEnd = offset / 64; + } + + uint fatleafEntriesSize = ((fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63); + fatLeafTableStart = offset / 64; + if (fatleafEntriesSize) { + CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), fatleafEntriesSize, groupCnt); + } + offset += fatleafEntriesSize; + + // New atomic update + if(baseSrc->quadIndicesDataStart > baseSrc->backPointerDataStart) + { + uint numQuads = BVHBase_GetNumQuads(baseSrc); + uint quadTableMainBufferSize = (numQuads + 255) & ~255; + uint quadLeftoversSize = (quadLeftoversCountNewAtomicUpdate + 255) & ~255; + uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63); + if (quadTableEntriesSize) { + CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), quadTableEntriesSize, groupCnt); + } + offset += quadTableEntriesSize; + + uint quadIndicesDataSize = ((numQuads * sizeof(QuadDataIndices) + 63) & ~63); + quadIndicesDataStart = offset / 64; + if (quadIndicesDataSize) { + CopyMemory(dest + offset, (global char*)BVHBase_GetQuadDataIndicesTable(baseSrc), quadIndicesDataSize, groupCnt); + } + offset += quadIndicesDataSize; + } + + uint innerEntriesSize = ((innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63); + innerTableStart = offset / 64; + if (innerEntriesSize) { + CopyMemory(dest + offset, (global char*)BVHBase_GetInnerNodeTable(baseSrc), innerEntriesSize, groupCnt); + } + offset += innerEntriesSize; + + dataEnd = offset / 64; +#endif + } + + baseDest->backPointerDataStart = backPointerDataStart; + baseDest->refitTreeletsDataStart = refitTreeletsDataStart; + baseDest->refitStartPointDataStart = refitStartPointDataStart; + baseDest->fatLeafTableStart = fatLeafTableStart ; + baseDest->fatLeafCount = fatLeafCount; + baseDest->innerTableStart = innerTableStart; + baseDest->innerCount = innerCount; + + baseDest->quadLeftoversCountNewAtomicUpdate = quadLeftoversCountNewAtomicUpdate; + baseDest->quadTableSizeNewAtomicUpdate = quadTableSizeNewAtomicUpdate; + baseDest->quadIndicesDataStart = quadIndicesDataStart; + baseDest->BVHDataEnd = dataEnd; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel compact(global char* dest, + global char* src, + uint groupCnt) +{ + uint64_t compactedSize = compute_compacted_size((BVHBase*)src); + compactT(dest, src, compactedSize, 0, groupCnt); +} + +// 
set serialization header along all lanes, each lane will get one dword of header plus 64bit reminding data +GRL_INLINE +unsigned prepare_header( + uint64_t headerSize, + uint64_t instancePtrSize, + uint64_t numInstances, + uint64_t bvhSize, + uint8_t* driverID, + uint64_t reminder) +{ + + unsigned loc_id = get_sub_group_local_id(); + + uint64_t SerializedSizeInBytesIncludingHeader = headerSize + instancePtrSize * numInstances + bvhSize; + uint64_t DeserializedSizeInBytes = bvhSize; + uint64_t InstanceHandleCount = numInstances; + + char bvh_magic_str[] = BVH_MAGIC_MACRO; + uint* bvh_magic_uint = (uint*)bvh_magic_str; + + unsigned headerTempLanePiece; + if (loc_id < 4) { headerTempLanePiece = *((unsigned*)&driverID[4*loc_id]); } + else if (loc_id == 4) { headerTempLanePiece = bvh_magic_uint[0]; } + else if (loc_id == 5) { headerTempLanePiece = bvh_magic_uint[1]; } + else if (loc_id == 6) { headerTempLanePiece = bvh_magic_uint[2]; } + else if (loc_id == 7) { headerTempLanePiece = bvh_magic_uint[3]; } + else if (loc_id == 8) { headerTempLanePiece = (uint)SerializedSizeInBytesIncludingHeader; } + else if (loc_id == 9) { headerTempLanePiece = (uint)(SerializedSizeInBytesIncludingHeader >> 32ul); } + else if (loc_id == 10) { headerTempLanePiece = (uint)DeserializedSizeInBytes; } + else if (loc_id == 11) { headerTempLanePiece = (uint)(DeserializedSizeInBytes >> 32ul); } + else if (loc_id == 12) { headerTempLanePiece = (uint)InstanceHandleCount; } + else if (loc_id == 13) { headerTempLanePiece = (uint)(InstanceHandleCount >> 32ul); } + else if (loc_id == 14) { headerTempLanePiece = (uint)reminder; } + else if (loc_id == 15) { headerTempLanePiece = (uint)(reminder >> 32ul); } + + return headerTempLanePiece; +} + + + + +GRL_INLINE +void serializeT( + global byte_align64B* dest, + global byte_align64B* src, + global uint8_t* driverID, + uint groups_count) +{ + SerializationHeader* header = (SerializationHeader*)dest; + BVHBase* base = (BVHBase*)src; + + const uint headerSize = sizeof(SerializationHeader); + const uint numInstances = base->Meta.instanceCount; + const uint instancePtrSize = sizeof(gpuva_t); + const uint compactedSize = compute_compacted_size(base); + uint local_id = get_sub_group_local_id(); + + // this is not 64byte aligned :( + const uint offsetToBvh = headerSize + instancePtrSize * numInstances; + + global InstanceDesc* src_instances = 0; + + if (numInstances) { + src_instances = (global InstanceDesc*)((uint64_t)base + base->Meta.instanceDescsStart); + } + + // effectively this part should end up as one 64B aligned 64B write + if (get_group_id(0) == groups_count - 1) + { + Block64B headerPlus; + + // we patch the missing piece with instance or bhv beginning (TRICK A and B) + // we assume header is 56B. + global uint64_t* srcPiece = (numInstances != 0) ? 
&src_instances[0].AccelerationStructureGPUVA : (global uint64_t*)src; + + unsigned headerTemp; + + headerTemp = prepare_header( + headerSize, + instancePtrSize, + numInstances, + compactedSize, + driverID, + *srcPiece); + + CacheLineSubgroupWrite((global byte_align64B*)dest, headerTemp); + } + + if (numInstances > 0) + { + uint instancesOffset = headerSize; + uint aligned_instance_ptrs_offset = ((instancesOffset + 63) >> 6) << 6; + uint unaligned_prefixing_instance_cnt = (aligned_instance_ptrs_offset - instancesOffset) >> 3; + unaligned_prefixing_instance_cnt = min(unaligned_prefixing_instance_cnt, numInstances); + + global uint64_t* dst_instances = (global uint64_t*)(dest + instancesOffset); + + // we've copied first instance onto a header, (see TRICK A) + // now we have only instances start at aligned memory + uint numAlignedInstances = numInstances - unaligned_prefixing_instance_cnt; + dst_instances += unaligned_prefixing_instance_cnt; + src_instances += unaligned_prefixing_instance_cnt; + + if (numAlignedInstances) + { + // each 8 instances form a cacheline + uint numCachelines = numAlignedInstances >> 3; //qwords -> 64Bs + // qwords besides multiple of 8; + uint startReminder = numAlignedInstances & ~((1 << 3) - 1); + uint numreminder = numAlignedInstances & ((1 << 3) - 1); + + uint task_id = get_group_id(0); + + while (task_id < numCachelines) + { + uint src_id = task_id * 8 + (local_id >> 1); + uint* src_uncorected = (uint*)& src_instances[src_id].AccelerationStructureGPUVA; + uint* src = ((local_id & 1) != 0) ? src_uncorected + 1 : src_uncorected; + uint data = *src; + + global char* dst = (global byte_align64B*)(dst_instances + (8 * task_id)); + CacheLineSubgroupWrite(dst, data); + task_id += groups_count; + } + + if (task_id == numCachelines && local_id < 8 && numreminder > 0) + { + // this should write full cacheline + + uint index = startReminder + local_id; + // data will be taken from instances for lanes (local_id < numreminder) + // copy srcbvh beginning as uint64_t for remaining lanes (TRICK B) + global uint64_t* srcData = (local_id < numreminder) ? 
+ &src_instances[index].AccelerationStructureGPUVA : + ((global uint64_t*)src) + (local_id - numreminder); + dst_instances[index] = *srcData; + } + } + } + + // the parts above copied unaligned dst beginning of bvh (see TRICK B) + uint32_t unalignedPartCopiedElsewhere = (64u - (offsetToBvh & (64u - 1u)))&(64u - 1u); + + compactT(dest + offsetToBvh, src, compactedSize, unalignedPartCopiedElsewhere, groups_count); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel serialize_indirect( + global char* dest, + global char* src, + global uint8_t* driverID) +{ + BVHBase* base = (BVHBase*)src; + uint groups_count = GroupCountForCopy(base); + serializeT(dest, src, driverID, groups_count); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel serialize_for_input_dump_indirect( + global struct OutputBatchPtrs* batchPtrs, + global dword* dstOffset, + global char* src, + global uint8_t* driverID) +{ + BVHBase* base = (BVHBase*)src; + uint groups_count = GroupCountForCopy(base); + global char* dest = (global char*)(batchPtrs->dataStart + *dstOffset); + dest += (sizeof(OutputData) + 127) & ~127; + serializeT(dest, src, driverID, groups_count); +} + +GRL_INLINE +void deserializeT( + global char* dest, + global char* src, + unsigned groupCnt) +{ + SerializationHeader* header = (SerializationHeader*)src; + + const uint64_t headerSize = sizeof(struct SerializationHeader); + const uint64_t instancePtrSize = sizeof(gpuva_t); + const uint64_t numInstances = header->InstanceHandleCount; + const uint64_t offsetToBvh = headerSize + instancePtrSize * numInstances; + const uint64_t bvhSize = header->DeserializedSizeInBytes; + + if (numInstances) + { + const bool instances_mixed_with_inner_nodes = false; + if (instances_mixed_with_inner_nodes) + { + // not implemented ! + // copy each node with 64byte granularity if node is instance, patch it mid-copy + } + else + { + BVHBase* srcBvhBase = (BVHBase*)(src + offsetToBvh); + + // numHWInstances can be bigger (because of rebraiding) or smaller (because of inactive instances) than + // numInstances (count of pointers and descriptors). 
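+            // instanceLeafStart/End are counted in 64-byte blocks, so the byte
+            // offset is start << 6 and, with each HwInstanceLeaf occupying two
+            // blocks (128 B), the leaf count is (end - start) >> 1.  E.g.
+            // start = 10, end = 14 gives byte offset 640 and 2 HW leaves.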
+ uint offsetToHwInstances = srcBvhBase->instanceLeafStart << 6; + uint numHwInstances = (srcBvhBase->instanceLeafEnd - srcBvhBase->instanceLeafStart) >> 1; + + // + // instances are in separate memory intervals + // copy all the other data simple way + // + uint nodesEnd = srcBvhBase->Meta.instanceDescsStart; + // copy before instance leafs + CopyMemory(dest, (global char*)(src + offsetToBvh), offsetToHwInstances, groupCnt); + + uint offsetPostInstances = srcBvhBase->instanceLeafEnd << 6; + uint instanceDescStart = srcBvhBase->Meta.instanceDescsStart; + uint sizePostInstances = instanceDescStart - offsetPostInstances; + // copy after instance leafs before instance desc + CopyMemory(dest + offsetPostInstances, (global char*)(src + offsetToBvh + offsetPostInstances), sizePostInstances, groupCnt); + + uint instanceDescEnd = instanceDescStart + numInstances * sizeof(InstanceDesc); + uint sizePostInstanceDescs = bvhSize - instanceDescEnd; + // copy after instance desc + CopyMemory(dest + instanceDescEnd, (global char*)(src + offsetToBvh + instanceDescEnd), sizePostInstanceDescs, groupCnt); + + global gpuva_t* newInstancePtrs = (global gpuva_t*)(src + headerSize); + global InstanceDesc* dstDesc = (global InstanceDesc*)(dest + instanceDescStart); + global InstanceDesc* srcDesc = (global InstanceDesc*)(src + offsetToBvh + instanceDescStart); + + // copy and patch instance descriptors + for (uint64_t instanceIndex = get_group_id(0); instanceIndex < numInstances; instanceIndex += groupCnt) + { + InstanceDesc desc = srcDesc[instanceIndex]; + uint64_t newInstancePtr = newInstancePtrs[instanceIndex]; + desc.AccelerationStructureGPUVA = newInstancePtr; // patch it with new ptr; + + dstDesc[instanceIndex] = desc; + } + + // copy and patch hw instance leafs + global HwInstanceLeaf* dstInstleafs = (global HwInstanceLeaf*)(dest + offsetToHwInstances); + global HwInstanceLeaf* srcInstleafs = (global HwInstanceLeaf*)(src + offsetToBvh + offsetToHwInstances); + + for (uint hwLeafIndex = get_group_id(0); hwLeafIndex < numHwInstances; hwLeafIndex += groupCnt) + { + // pull the instance from srcBVH + HwInstanceLeaf tmpInstleaf = srcInstleafs[hwLeafIndex]; + + uint swInstanceIndex = HwInstanceLeaf_GetInstanceIndex(&tmpInstleaf); + uint64_t childBvhPtr = (uint64_t)newInstancePtrs[swInstanceIndex]; + uint64_t originalBvhPtr = (uint64_t)HwInstanceLeaf_GetBVH(&tmpInstleaf); + + HwInstanceLeaf_SetBVH(&tmpInstleaf, childBvhPtr); + uint64_t startNode = HwInstanceLeaf_GetStartNode(&tmpInstleaf); + + if (startNode != 0) { + uint64_t rootNodeOffset = startNode - originalBvhPtr; + HwInstanceLeaf_SetStartNode(&tmpInstleaf, childBvhPtr + rootNodeOffset); + } + + dstInstleafs[hwLeafIndex] = tmpInstleaf; + } + } + } + else + { + CopyMemory(dest, (global char*)(src + offsetToBvh), bvhSize, groupCnt); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel deserialize_indirect( + global char* dest, + global char* src) +{ + SerializationHeader* header = (SerializationHeader*)src; + const uint64_t bvhSize = header->DeserializedSizeInBytes; + unsigned groupCnt = GroupCountForCopySize(bvhSize); + deserializeT(dest, src, groupCnt); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel dxr_decode(global char* dest, + global char* src) +{ + + DecodeHeader* header = (DecodeHeader*)dest; + BVHBase* base = (BVHBase*)src; + + uint32_t numGeos = 
base->Meta.geoCount; + uint32_t numInstances = base->Meta.instanceCount; + + if (numInstances > 0) + { + header->Type = TOP_LEVEL; + header->NumDesc = numInstances; + + D3D12_RAYTRACING_INSTANCE_DESC* instanceDesc = (D3D12_RAYTRACING_INSTANCE_DESC*)(dest + sizeof(DecodeHeader)); + copyInstanceDescs((InstanceDesc*)((uint64_t)base + (uint64_t)base->Meta.instanceDescsStart), + instanceDesc, + numInstances); + } + else if (numGeos > 0) + { + header->Type = BOTTOM_LEVEL; + header->NumDesc = numGeos; + + D3D12_RAYTRACING_GEOMETRY_DESC* geomDescs = (D3D12_RAYTRACING_GEOMETRY_DESC*)(dest + sizeof(DecodeHeader)); + uint64_t data = (uint64_t)geomDescs + sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) * numGeos; + createGeoDescs((GeoMetaData*)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart), + geomDescs, + numGeos, + data); + + work_group_barrier(CLK_GLOBAL_MEM_FENCE); + + copyDataFromQuadLeaves(base, + geomDescs); + + copyDataFromLProcedurals(base, + geomDescs); + } + else + { + header->Type = BOTTOM_LEVEL; + header->NumDesc = 0; + } +} diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.cl b/src/intel/vulkan/grl/gpu/bvh_debug.cl new file mode 100644 index 00000000000..bce75fec3ff --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_debug.cl @@ -0,0 +1,208 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// @file bvh_debug.cl +// +// @brief routines to do basic integrity checks +// +// Notes: +// + +#include "GRLGen12.h" +#include "intrinsics.h" +#include "libs/lsc_intrinsics.h" +#include "GRLGen12IntegrityChecks.h" +#include "api_interface.h" + +#define ERROR_PRINTF 0 +GRL_INLINE bool commit_err( + global uint* some_null, + global BVHBase* bvh, + global ERROR_INFO* err_info_slot, + ERROR_INFO err) +{ + if (err.type != error_t_no_error) { + uint expected = error_t_no_error; + atomic_compare_exchange_global(&err_info_slot->type, &expected, err.type); + if (expected == error_t_no_error) + { + err_info_slot->offset_in_BVH = err.offset_in_BVH; + err_info_slot->when = err.when; + err_info_slot->reserved = 0xAAACCAAA; + mem_fence_evict_to_memory(); +#if ERROR_PRINTF + printf("bvh = 0x%llX, err.type = %X, err.offset_in_BVH = %d\n", bvh, err.type, err.offset_in_BVH); +#else + // This is to trigger PF. Note we have to write directly to memory. + // If write would stay in L3 it won't give a PF untill this will get evicted to mem. 
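+        // The store below bypasses L1/L3 so the fault is raised right away;
+        // the written value encodes the error type in its low bits
+        // (e.g. err.type == 3 stores 0x0EEE0003 to the null pointer).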
+ store_uint_L1UC_L3UC(some_null, 0, 0x0EEE0000 + err.type); +#endif + return true; + } + } + return false; +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel check_tree_topology( + global uint* some_null, + global BVHBase* bvh, + global ERROR_INFO* err, + uint phase) +{ + uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + + if (err->type != error_t_no_error) return; + + uint dummy1, dummy2, dummy3; + ERROR_INFO reterr = check_tree_topology_helper(bvh, globalID, &dummy1, &dummy2, &dummy3, false); + if (reterr.type == error_t_no_error) + { + reterr = check_backpointers(bvh, globalID); + } + if (reterr.type == error_t_no_error) + { + reterr = validate_atomic_update_structs(bvh, globalID); + } + reterr.when = phase; + commit_err(some_null, bvh, err, reterr); +} + +GRL_INLINE bool IsValid48bPtr(qword ptr) +{ + qword CANONIZED_BITS = 0xFFFFul << 48ul; + qword canonized_part = ptr & CANONIZED_BITS; + bool isIt = ptr != 0 && ( + canonized_part == 0 || canonized_part == CANONIZED_BITS); + return isIt; +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel check_geos_before_quad_update( + global BVHBase* bvh, //dest bvh + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global uint* some_null, + global ERROR_INFO* err, + uint phase, + uint numGeos, + uint numThreads) +{ + uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + + if (err->type != error_t_no_error) return; + + // first check sanity of geos + ERROR_INFO geo_insanity_error = { error_t_input_geo_insane, 0 }; + + for (uint ID = globalID; ID < numGeos; ID += numThreads * get_sub_group_size()) + { + bool IsSane = IsValid48bPtr((qword)(qword)geomDesc); + + if (IsSane) { + GRL_RAYTRACING_GEOMETRY_DESC geo = geomDesc[globalID]; + IsSane = geo.Type < NUM_GEOMETRY_TYPES; + if (IsSane) { + if (geo.Type == GEOMETRY_TYPE_TRIANGLES) { + if (geo.Desc.Triangles.IndexFormat >= INDEX_FORMAT_END) { + IsSane = false; + } + else + { + if (geo.Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE && geo.Desc.Triangles.IndexCount > 2) + { + IsSane = (geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END) && + IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) && + IsValid48bPtr((qword)geo.Desc.Triangles.pIndexBuffer); + } + else if (geo.Desc.Triangles.VertexCount > 2) + { + IsSane = + geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END&& + IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) != 0; + } + } + } + } + } + + geo_insanity_error.offset_in_BVH = ID; + geo_insanity_error.when = phase; + if (!IsSane) { + commit_err(some_null, bvh, err, geo_insanity_error); + } + return; + } +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel check_geos_vs_quads( + global BVHBase* bvh, + global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, + global uint* some_null, + global ERROR_INFO* err, + uint phase, + uint numGeos, + uint numThreads) +{ + uint numQuads = BVHBase_GetNumQuads(bvh); + + QuadLeaf* quads = BVHBase_GetQuadLeaves(bvh); + + uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + uint qoffset = bvh->quadLeafStart; + + if (err->type != error_t_no_error) return; + + ERROR_INFO theErr = { error_t_no_error, 0 }; + + for (uint ID = globalID; ID < numQuads; ID += numThreads * get_sub_group_size()) + { + ERROR_INFO quadErr = { error_t_quad_leaf_broken, qoffset + ID, phase }; + + QuadLeaf quad = quads[ID]; + + uint geoIdx = PrimLeaf_GetGeoIndex(&quad.leafDesc); + + if (geoIdx > numGeos) { commit_err(some_null, bvh, err, quadErr); return; } + + uint numPrimsInGeo = 
geomDesc[geoIdx].Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE ? + geomDesc[geoIdx].Desc.Triangles.IndexCount / 3 : + geomDesc[geoIdx].Desc.Triangles.VertexCount / 3; + + if(quad.primIndex0 >= numPrimsInGeo) { + commit_err(some_null, bvh, err, quadErr); + return; + } + + if(!QuadLeaf_IsSingleTriangle(&quad) && + (quad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&quad) >= numPrimsInGeo)) + { + commit_err(some_null, bvh, err, quadErr); + return; + } + } +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel check_instances_linked_bvhs( + global uint* some_null, + global BVHBase* bvh, + global ERROR_INFO* err, + uint phase) +{ + if (err->type != error_t_no_error) return; + + uint instanceLeafStart = bvh->instanceLeafStart; + uint instanceLeafEnd = bvh->instanceLeafEnd; + uint numInstances = (instanceLeafEnd - instanceLeafStart) / 2; + + uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + + ERROR_INFO reterr = check_instances_linked_bvhs_helper(bvh, globalID, /*touchBlas*/true); + reterr.when = phase; + commit_err(some_null, bvh, err, reterr); +} diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.grl b/src/intel/vulkan/grl/gpu/bvh_debug.grl new file mode 100644 index 00000000000..28008ab09ce --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_debug.grl @@ -0,0 +1,107 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module bvh_on_gpu_checks; + +kernel_module debug_kernels ("bvh_debug.cl") +{ + links lsc_intrinsics; + kernel opencl_check_tree_topology < kernelFunction="check_tree_topology">; + kernel opencl_check_instances_linked_bvhs < kernelFunction="check_instances_linked_bvhs">; + kernel opencl_check_geos_before_quad_update < kernelFunction="check_geos_before_quad_update">; + kernel opencl_check_geos_vs_quads < kernelFunction="check_geos_vs_quads">; +} + + +metakernel debug_checks_prepare_const_regs() +{ + define cRoundingSIMD REG4; + define cInit0 REG5; + define cShiftForSIMD REG3; + cRoundingSIMD = (16-1); + cShiftForSIMD = 4; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; +} + +metakernel debug_checks_bvh_topology( + qword some_null_ptr, + qword bvh, + qword bvh_inner_nodes_end, + qword error_struct, + dword when, + dword bvh_inner_nodes_start_value ) +{ + define cRoundingSIMD REG4; + define cShiftForSIMD REG3; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG0 = bvh_inner_nodes_start_value; + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + cRoundingSIMD; + REG2 = REG2 >> cShiftForSIMD; + + DISPATCHDIM_X = REG2.lo; + + dispatch_indirect opencl_check_tree_topology args( + some_null_ptr, + bvh, + error_struct, + when); +} + +metakernel debug_check_instances_linked_bvhs( + qword some_null_ptr, + qword bvh, + qword error_struct, + dword numHWThreads, + dword when) +{ + dispatch opencl_check_instances_linked_bvhs(numHWThreads,1,1) args( + some_null_ptr, + bvh, + error_struct, + when); +} + +metakernel debug_check_geos_before_quad_update( + qword bvh, + qword geos, + qword some_null_ptr, + qword error_struct, + dword when, + dword numGeos, + dword numHWThreads ) +{ + dispatch opencl_check_geos_before_quad_update(numHWThreads,1,1) args( + bvh, + geos, + some_null_ptr, + error_struct, + when, + numGeos, + numHWThreads ); +} + +metakernel debug_check_geos_vs_quads( + qword bvh, + qword geos, + qword some_null_ptr, + qword error_struct, + dword when, + dword numGeos, + dword numHWThreads ) +{ + dispatch opencl_check_geos_vs_quads(numHWThreads,1,1) args( + bvh, + geos, + some_null_ptr, + error_struct, + when, + numGeos, + 
numHWThreads ); +} diff --git a/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl b/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl new file mode 100644 index 00000000000..4fa222b53eb --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl @@ -0,0 +1,97 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "d3d12.h" +#include "common.h" + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel compacted_size(global char *bvh_mem, + global char *postbuild_info) +{ + BVHBase *base = (BVHBase *)bvh_mem; + PostbuildInfoCompactedSize *postbuildInfoCompacted = (PostbuildInfoCompactedSize *)postbuild_info; + + postbuildInfoCompacted->CompactedSizeInBytes = compute_compacted_size(base); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel current_size(global char *bvh_mem, + global char *postbuild_info) +{ + + BVHBase *base = (BVHBase *)bvh_mem; + PostbuildInfoCurrentSize *postbuildInfoCurrent = (PostbuildInfoCurrentSize *)postbuild_info; + + postbuildInfoCurrent->CurrentSizeInBytes = base->Meta.allocationSize; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel serialized_size(global char *bvh_mem, + global char *postbuild_info) +{ + + BVHBase *base = (BVHBase *)bvh_mem; + PostbuildInfoSerializationDesc *postbuildInfoSerialization = (PostbuildInfoSerializationDesc *)postbuild_info; + + uint64_t headerSize = sizeof(SerializationHeader); + uint64_t numInstances = base->Meta.instanceCount; + + postbuildInfoSerialization->SerializedSizeInBytes = sizeof(SerializationHeader) + + numInstances * sizeof(gpuva_t) + + compute_compacted_size(base); + //base->Meta.allocationSize; + postbuildInfoSerialization->NumBottomLevelAccelerationStructurePointers = numInstances; +} + +void countTrianglesAndProcedurals(GeoMetaData *geoMetaData, + uint64_t numGeos, + uint64_t *numTriangles, + uint64_t *numProcedurals) +{ + uint64_t numTrianglesLoc = 0; + uint64_t numProceduralsLoc = 0; + + for (uint64_t geoIndex = get_local_id(0); geoIndex < numGeos; geoIndex += get_local_size(0)) + { + if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES) + { + *numTriangles += geoMetaData[geoIndex].PrimitiveCount; + } + else + { + *numProcedurals += geoMetaData[geoIndex].PrimitiveCount; + } + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel decoded_size(global char *bvh_mem, + global char *postbuild_info) +{ + BVHBase *base = (BVHBase *)bvh_mem; + PostbuildInfoToolsVisualizationDesc *postbuildInfoDecoded = (PostbuildInfoToolsVisualizationDesc *)postbuild_info; + + uint64_t numTriangles = 0; + uint64_t numProcedurals = 0; + countTrianglesAndProcedurals((GeoMetaData *)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart), + base->Meta.geoCount, + &numTriangles, + &numProcedurals); + uint64_t numInstances = base->Meta.instanceCount; + uint64_t numDescs = base->Meta.geoCount; + uint64_t headerSize = sizeof(DecodeHeader); + uint64_t descsSize = numDescs * sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) + + numInstances * sizeof(D3D12_RAYTRACING_INSTANCE_DESC); + + // Each triangle is stored separately - 3 vertices (9 floats) per triangle + uint64_t triangleDataSize = 9 * sizeof(float); + uint64_t proceduralDataSize = sizeof(D3D12_RAYTRACING_AABB); + uint64_t geoDataSize = numTriangles * triangleDataSize + numProcedurals * proceduralDataSize; + + postbuildInfoDecoded->DecodedSizeInBytes = 
headerSize + descsSize + geoDataSize; +} diff --git a/src/intel/vulkan/grl/gpu/bvh_rebraid.cl b/src/intel/vulkan/grl/gpu/bvh_rebraid.cl new file mode 100644 index 00000000000..ab0f891acee --- /dev/null +++ b/src/intel/vulkan/grl/gpu/bvh_rebraid.cl @@ -0,0 +1,1683 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "AABB.h" +#include "GRLGen12.h" +#include "api_interface.h" +#include "common.h" +#include "qbvh6.h" + +#define MAX_SPLITS_PER_INSTANCE 64 +#define NUM_REBRAID_BINS 32 + +#define NUM_CHILDREN 6 +#define MAX_NODE_OFFSET 65535 // can't open nodes whose offsets exceed this + +// OCL/DPC++ *SHOULD* have a uniform keyword... but they dont... so I'm making my own +#define uniform +#define varying + +#define SGPRINT_UNIFORM(fmt,val) {sub_group_barrier(CLK_LOCAL_MEM_FENCE); if( get_sub_group_local_id() == 0 ) { printf(fmt,val); }} + +#define SGPRINT_6x(prefix,fmt,type,val) {\ + type v0 = sub_group_broadcast( val, 0 );\ + type v1 = sub_group_broadcast( val, 1 );\ + type v2 = sub_group_broadcast( val, 2 );\ + type v3 = sub_group_broadcast( val, 3 );\ + type v4 = sub_group_broadcast( val, 4 );\ + type v5 = sub_group_broadcast( val, 5 );\ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ + if( get_sub_group_local_id() == 0 ) { \ + printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \ + v0,v1,v2,v3,v4,v5);}} + + +#define SGPRINT_16x(prefix,fmt,type,val) {\ + type v0 = sub_group_broadcast( val, 0 );\ + type v1 = sub_group_broadcast( val, 1 );\ + type v2 = sub_group_broadcast( val, 2 );\ + type v3 = sub_group_broadcast( val, 3 );\ + type v4 = sub_group_broadcast( val, 4 );\ + type v5 = sub_group_broadcast( val, 5 );\ + type v6 = sub_group_broadcast( val, 6 );\ + type v7 = sub_group_broadcast( val, 7 );\ + type v8 = sub_group_broadcast( val, 8 );\ + type v9 = sub_group_broadcast( val, 9 );\ + type v10 = sub_group_broadcast( val, 10 );\ + type v11 = sub_group_broadcast( val, 11 );\ + type v12 = sub_group_broadcast( val, 12 );\ + type v13 = sub_group_broadcast( val, 13 );\ + type v14 = sub_group_broadcast( val, 14 );\ + type v15 = sub_group_broadcast( val, 15 );\ + sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ + if( get_sub_group_local_id() == 0 ) { \ + printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \ + fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \ + v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}} + +#if 1 +#define GRL_ATOMIC_INC(addr) atomic_add(addr, 1); +#else +#define GRL_ATOMIC_INC(addr) atomic_inc(addr); +#endif + +#if 0 +#define LOOP_TRIPWIRE_INIT uint _loop_trip=0; + +#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name) \ + _loop_trip++;\ + if ( _loop_trip > max_iterations )\ + {\ + printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!!\n" );\ + printf( name"\n");\ + break;\ + } +#else + +#define LOOP_TRIPWIRE_INIT +#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name) + +#endif + + + +typedef struct SGHeap +{ + uint32_t key_value; + bool lane_mask; +} SGHeap; + +GRL_INLINE void SGHeap_init(uniform SGHeap *h) +{ + h->lane_mask = false; + h->key_value = 0xbaadf00d; +} + +GRL_INLINE bool SGHeap_full(uniform SGHeap *h) +{ + return sub_group_all(h->lane_mask); +} +GRL_INLINE bool SGHeap_empty(uniform SGHeap *h) +{ + return sub_group_all(!h->lane_mask); +} + +GRL_INLINE bool SGHeap_get_lane_mask(uniform SGHeap *h) +{ + return h->lane_mask; +} +GRL_INLINE uint16_t SGHeap_get_lane_values(uniform SGHeap *h) +{ + return (h->key_value & 0xffff); +} + +GRL_INLINE ushort isolate_lowest_bit( ushort m ) +{ + return m & ~(m - 1); +} + + +// lane i receives the index 
of the ith set bit in mask. +GRL_INLINE ushort subgroup_bit_rank( uniform ushort mask ) +{ + varying ushort lane = get_sub_group_local_id(); + ushort idx = 16; + for ( uint i = 0; i < NUM_CHILDREN; i++ ) + { + ushort lo = isolate_lowest_bit( mask ); + mask = mask ^ lo; + idx = (lane == i) ? lo : idx; + } + + return ctz( idx ); +} + +// push a set of elements spread across a subgroup. Return mask of elements that were not pushed +GRL_INLINE uint16_t SGHeap_vectorized_push(uniform SGHeap *h, varying uint16_t key, varying uint16_t value, uniform ushort push_mask) +{ + +#if 0 // an attempt to make this algorithm branchless + varying uint key_value = (((uint)key) << 16) | ((uint)value); + uniform ushort free_mask = intel_sub_group_ballot( !h->lane_mask ); + + varying ushort free_slot_idx = subgroup_bit_prefix_exclusive( free_mask ); // for each heap slot, what is its position in a compacted list of free slots (prefix sum) + varying ushort push_idx = subgroup_bit_prefix_exclusive( push_mask ); // for each lane, what is its position in a compacted list of pushing lanes (prefix sum) + + uniform ushort num_pushes = min( popcount( free_mask ), popcount( push_mask ) ); + + varying ushort push_index = subgroup_bit_rank( push_mask ); // lane i gets the index of the i'th set bit in push_mask + + varying uint shuffled = intel_sub_group_shuffle( key_value, intel_sub_group_shuffle( push_index, free_slot_idx ) ); + varying bool pushed = false; + if ( !h->lane_mask && free_slot_idx < num_pushes ) + { + h->lane_mask = true; + h->key_value = shuffled; + pushed = true; + } + + return push_mask & intel_sub_group_ballot( push_idx >= num_pushes ); +#else + + varying uint lane = get_sub_group_local_id(); + + varying uint key_value = (((uint)key) << 16) | ((uint)value); + uniform ushort free_mask = intel_sub_group_ballot(!h->lane_mask); + + // TODO_OPT: Look for some clever way to remove this loop + while (free_mask && push_mask) + { + // insert first active child into first available lane + uniform uint child_id = ctz(push_mask); + uniform uint victim_lane = ctz(free_mask); + uniform uint kv = sub_group_broadcast( key_value, child_id ); + if (victim_lane == lane) + { + h->lane_mask = true; + h->key_value = kv; + } + push_mask ^= (1 << child_id); + free_mask ^= (1 << victim_lane); + } + + return push_mask; + +#endif +} + +// push an item onto a heap that is full except for one slot +GRL_INLINE void SGHeap_push_and_fill(uniform SGHeap *h, uniform uint16_t key, uniform uint16_t value) +{ + uniform uint32_t key_value = (((uint)key) << 16) | value; + if (!h->lane_mask) + { + h->lane_mask = true; + h->key_value = key_value; // only one lane will be active at this point + } +} + +// pop the min item from a full heap +GRL_INLINE void SGHeap_full_pop_min(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out) +{ + varying uint lane = get_sub_group_local_id(); + uniform uint kv = sub_group_reduce_min(h->key_value); + if (h->key_value == kv) + h->lane_mask = false; + + *key_out = (kv >> 16); + *value_out = (kv & 0xffff); +} + +// pop the max item from a heap +GRL_INLINE void SGHeap_pop_max(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out) +{ + uniform uint lane = get_sub_group_local_id(); + uniform uint kv = sub_group_reduce_max(h->lane_mask ? 
h->key_value : 0); + if (h->key_value == kv) + h->lane_mask = false; + + *key_out = (kv >> 16); + *value_out = (kv & 0xffff); +} + +GRL_INLINE void SGHeap_printf( SGHeap* heap ) +{ + uint key = heap->key_value >> 16; + uint value = heap->key_value & 0xffff; + + if ( get_sub_group_local_id() == 0) + printf( "HEAP: \n" ); + SGPRINT_16x( " mask: ", "%6u ", bool, heap->lane_mask ); + SGPRINT_16x( " key : ", "0x%04x ", uint, key ); + SGPRINT_16x( " val : ", "0x%04x ", uint, value ); + +} + +GRL_INLINE float transformed_aabb_halfArea(float3 lower, float3 upper, const float *Transform) +{ + // Compute transformed extent per 'transform_aabb'. Various terms cancel + float3 Extent = upper - lower; + float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]); + float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]); + float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]); + + return (ex * ey) + (ey * ez) + (ex * ez); +} + +GRL_INLINE uint16_t quantize_area(float relative_area) +{ + // clamp relative area at 0.25 (1/4 of root area) + // and apply a non-linear distribution because most things in real scenes are small + relative_area = pow(min(1.0f, relative_area * 4.0f), 0.125f); + return convert_ushort_rtn( relative_area * 65535.0f ); +} + +GRL_INLINE varying uint16_t SUBGROUP_get_child_areas(uniform InternalNode *n, + uniform const float *Transform, + uniform float relative_area_scale) +{ + varying uint16_t area; + varying uint16_t lane = get_sub_group_local_id(); + varying int exp_x = n->exp_x; + varying int exp_y = n->exp_y; + varying int exp_z = n->exp_z; + + { + // decode the AABB positions. Lower in the bottom 6 lanes, upper in the top + uniform uint8_t *px = &n->lower_x[0]; + uniform uint8_t *py = &n->lower_y[0]; + uniform uint8_t *pz = &n->lower_z[0]; + + varying float fx = convert_float(px[lane]); + varying float fy = convert_float(py[lane]); + varying float fz = convert_float(pz[lane]); + fx = n->lower[0] + bitShiftLdexp(fx, exp_x - 8); + fy = n->lower[1] + bitShiftLdexp(fy, exp_y - 8); + fz = n->lower[2] + bitShiftLdexp(fz, exp_z - 8); + + // transform the AABBs to world space + varying float3 lower = (float3)(fx, fy, fz); + varying float3 upper = intel_sub_group_shuffle(lower, lane + 6); + + { + + // TODO_OPT: This is only utilizing 6 lanes. + // We might be able to do better by vectorizing the calculation differently + float a1 = transformed_aabb_halfArea( lower, upper, Transform ); + float a2 = a1 * relative_area_scale; + area = quantize_area( a2 ); + } + } + + return area; +} + + + +GRL_INLINE ushort get_child_area( + InternalNode* n, + ushort child, + const float* Transform, + float relative_area_scale ) +{ + uint16_t area; + uint16_t lane = get_sub_group_local_id(); + int exp_x = n->exp_x; + int exp_y = n->exp_y; + int exp_z = n->exp_z; + + // decode the AABB positions. 
Lower in the bottom 6 lanes, upper in the top + uint8_t* px = &n->lower_x[0]; + uint8_t* py = &n->lower_y[0]; + uint8_t* pz = &n->lower_z[0]; + + float3 lower, upper; + lower.x = convert_float( n->lower_x[child] ); + lower.y = convert_float( n->lower_y[child] ); + lower.z = convert_float( n->lower_z[child] ); + upper.x = convert_float( n->upper_x[child] ); + upper.y = convert_float( n->upper_y[child] ); + upper.z = convert_float( n->upper_z[child] ); + + lower.x = bitShiftLdexp( lower.x, exp_x - 8 ); // NOTE: the node's 'lower' field cancels out, so don't add it + lower.y = bitShiftLdexp( lower.y, exp_y - 8 ); // see transform_aabb_halfArea + lower.z = bitShiftLdexp( lower.z, exp_z - 8 ); + upper.x = bitShiftLdexp( upper.x, exp_x - 8 ); + upper.y = bitShiftLdexp( upper.y, exp_y - 8 ); + upper.z = bitShiftLdexp( upper.z, exp_z - 8 ); + + float a1 = transformed_aabb_halfArea( lower, upper, Transform ); + float a2 = a1 * relative_area_scale; + area = quantize_area( a2 ); + + return area; +} + + +GRL_INLINE varying int SUBGROUP_get_child_offsets(uniform InternalNode *n) +{ + varying uint lane = get_sub_group_local_id(); + varying uint child = (lane < NUM_CHILDREN) ? lane : 0; + + varying uint block_incr = InternalNode_GetChildBlockIncr( n, child ); + + //varying uint prefix = sub_group_scan_exclusive_add( block_incr ); + varying uint prefix; + if ( NUM_CHILDREN == 6 ) + { + prefix = block_incr + intel_sub_group_shuffle_up( 0u, block_incr, 1u ); + prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 2 ); + prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 4 ); + prefix = prefix - block_incr; + } + + return n->childOffset + prefix; +} + + +// compute the maximum number of leaf nodes that will be produced given 'num_splits' node openings +GRL_INLINE uint get_num_nodes(uint num_splits, uint max_children) +{ + // each split consumes one node and replaces it with N nodes + // there is initially one node + // number of nodes is thus: N*s + 1 - s ==> (N-1)*s + 1 + return (max_children - 1) * num_splits + 1; +} + +// compute the number of node openings that can be performed given a fixed extra node budget +GRL_INLINE uint get_num_splits(uint num_nodes, uint max_children) +{ + // inverse of get_num_nodes: x = (n-1)s + 1 + // s = (x-1)/(n-1) + if (num_nodes == 0) + return 0; + + return (num_nodes - 1) / (max_children - 1); +} + +GRL_INLINE uint get_rebraid_bin_index(uint16_t quantized_area, uint NUM_BINS) +{ + // arrange bins in descending order by size + float relative_area = quantized_area * (1.0f/65535.0f); + relative_area = 1.0f - relative_area; // arrange bins largest to smallest + size_t bin = round(relative_area * (NUM_BINS - 1)); + return bin; +} + +GRL_INLINE global InternalNode *get_node(global BVHBase *base, int incr) +{ + global char *ptr = (((global char *)base) + BVH_ROOT_NODE_OFFSET); // NOTE: Assuming this will be hoisted out of inner loops + + return (global InternalNode *)(ptr + incr * 64); +} + +GRL_INLINE bool is_aabb_valid(float3 lower, float3 upper) +{ + return all(isfinite(lower)) && + all(isfinite(upper)) && + all(lower <= upper); +} + +GRL_INLINE bool is_node_openable(InternalNode *n) +{ + // TODO_OPT: Optimize me by fetching dwords instead of looping over bytes + // TODO: OPT: Pre-compute openability and pack into the pad byte next to the nodeType field?? 
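+    // A node counts as openable only if it is itself an internal node and every valid child is also an internal node, so opening it never exposes leaf children directly.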
+ bool openable = n->nodeType == NODE_TYPE_INTERNAL; + if ( openable ) + { + for ( uint i = 0; i < NUM_CHILDREN; i++ ) + { + bool valid = InternalNode_IsChildValid( n, i ); + uint childType = InternalNode_GetChildType( n, i ); + openable = openable & (!valid || (childType == NODE_TYPE_INTERNAL)); + } + } + + return openable; +} + + +GRL_INLINE bool SUBGROUP_can_open_root( + uniform global BVHBase *bvh_base, + uniform const struct GRL_RAYTRACING_INSTANCE_DESC* instance + ) +{ + if (bvh_base == 0 || GRL_get_InstanceMask(instance) == 0) + return false; + + // TODO_OPT: SG-vectorize this AABB test + uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); + uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); + if (!is_aabb_valid(root_lower, root_upper)) + return false; + + uniform global InternalNode *node = get_node(bvh_base, 0); + if ( node->nodeType != NODE_TYPE_INTERNAL ) + return false; + + varying bool openable = true; + varying uint lane = get_sub_group_local_id(); + if (lane < NUM_CHILDREN) + { + varying uint childType = InternalNode_GetChildType(node, lane); + varying bool valid = InternalNode_IsChildValid(node, lane); + openable = childType == NODE_TYPE_INTERNAL || !valid; + } + + return sub_group_all(openable); +} + + + +GRL_INLINE +varying uint2 +SUBGROUP_count_instance_splits(uniform global struct AABB3f *geometry_bounds, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance) +{ + uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure; + if (!SUBGROUP_can_open_root(bvh_base, instance)) + return (uint2)(0, 0); + + uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds); + uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); + uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); + + uniform uint16_t quantized_area = quantize_area(transformed_aabb_halfArea(root_lower, root_upper, instance->Transform) * relative_area_scale); + uniform uint16_t node_offs = 0; + + uniform SGHeap heap; + uniform uint num_splits = 0; + + SGHeap_init(&heap); + varying uint sg_split_counts_hi = 0; // cross-subgroup bin counters + varying uint sg_split_counts_lo = 0; + + uniform global InternalNode* node_array = get_node( bvh_base, 0 ); + + LOOP_TRIPWIRE_INIT; + + while (1) + { + uniform global InternalNode* node = node_array + node_offs; + + // count this split + uniform uint bin = get_rebraid_bin_index(quantized_area, NUM_REBRAID_BINS); + varying uint lane = get_sub_group_local_id(); + + sg_split_counts_hi += ((lane + 16) == bin) ? 1 : 0; + sg_split_counts_lo += (lane == bin) ? 
1 : 0; + + // open this node and push all of its openable children to heap + varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node); + varying bool sg_openable = 0; + if (lane < NUM_CHILDREN & sg_offs <= MAX_NODE_OFFSET ) + if (InternalNode_IsChildValid(node, lane)) + sg_openable = is_node_openable( node_array + sg_offs); + + uniform uint openable_children = intel_sub_group_ballot(sg_openable); + + if ( openable_children ) + { + varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale ); + + if ( !SGHeap_full( &heap ) ) + { + openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children ); + } + + while ( openable_children ) + { + // pop min element + uniform uint16_t min_area; + uniform uint16_t min_offs; + SGHeap_full_pop_min( &heap, &min_area, &min_offs ); + + // eliminate all children smaller than heap minimum + openable_children &= intel_sub_group_ballot( sg_area > min_area ); + + if ( openable_children ) + { + // if any children survived, + // kick out heap minimum and replace with first child.. otherwise we will re-push the minimum + uniform uint child_id = ctz( openable_children ); + openable_children ^= (1 << child_id); + min_area = sub_group_broadcast( sg_area, child_id ); + min_offs = sub_group_broadcast( sg_offs, child_id ); + } + + // re-insert onto heap + SGHeap_push_and_fill( &heap, min_area, min_offs ); + + // repeat until all children are accounted for. It is possible + // for multiple children to fit in the heap, because heap minimum is now changed and we need to recompute it + } + } + + num_splits++; + if (num_splits == MAX_SPLITS_PER_INSTANCE) + break; + + if (SGHeap_empty(&heap)) + break; + + // get next node from heap + SGHeap_pop_max(&heap, &quantized_area, &node_offs); + + LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_count_splits" ); + + } + + return (uint2)(sg_split_counts_lo, sg_split_counts_hi); +} + +typedef struct RebraidBuffers +{ + global uint *bin_split_counts; // [num_bins] + global uint *bin_instance_counts; // [num_bins] + global uint *instance_bin_counts; // num_intances * num_bins +} RebraidBuffers; + +GRL_INLINE RebraidBuffers cast_rebraid_buffers(global uint *scratch, uint instanceID) +{ + RebraidBuffers b; + b.bin_split_counts = scratch; + b.bin_instance_counts = scratch + NUM_REBRAID_BINS; + b.instance_bin_counts = scratch + (2 + instanceID) * NUM_REBRAID_BINS; + return b; +} + +/////////////////////////////////////////////////////////////////////////////////////////// +// Compute AABB +// Dispatch one work item per instance +/////////////////////////////////////////////////////////////////////////////////////////// + +GRL_INLINE void rebraid_compute_AABB( + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance) +{ + // don't open null rtas + global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure; + + struct AABB new_primref; + if (bvh_base != 0) + { + float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); + float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); + const float *Transform = instance->Transform; + + if (is_aabb_valid(root_lower, root_upper)) + { + new_primref = AABBfromAABB3f(transform_aabb(root_lower, root_upper, Transform)); + } + else + { + // degenerate instance which might be updated to be non-degenerate + // use AABB position to guide BVH construction + // + new_primref.lower.x = Transform[3]; + new_primref.lower.y = Transform[7]; + new_primref.lower.z = Transform[11]; + 
new_primref.upper = new_primref.lower; + } + } + else + { + AABB_init(&new_primref); + } + + struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref); + + if (get_sub_group_local_id() == 0) + { + AABB3f_atomic_merge_global_lu(&bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz ); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_computeAABB_DXR_instances( + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances) +{ + const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); + rebraid_compute_AABB(bvh, instances + instanceID); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_computeAABB_DXR_instances_indirect( + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, + global struct IndirectBuildRangeInfo const * const indirect_data) +{ + const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); + instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instances) + indirect_data->primitiveOffset); + rebraid_compute_AABB(bvh, instances + instanceID); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_computeAABB_DXR_instances_pointers( + global struct BVHBase* bvh, + global void *instances_in) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); + rebraid_compute_AABB(bvh, instances[instanceID]); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_computeAABB_DXR_instances_pointers_indirect( + global struct BVHBase* bvh, + global void *instances_in, + global struct IndirectBuildRangeInfo const * const indirect_data) +{ + instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset; + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; + + const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); + rebraid_compute_AABB(bvh, instances[instanceID]); +} + +/////////////////////////////////////////////////////////////////////////////////////////// +// Init scratch: Dispatch one work group +/////////////////////////////////////////////////////////////////////////////////////////// + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(64, 1, 1))) void kernel rebraid_init_scratch(global uint *scratch) +{ + scratch[get_local_id(0) + get_group_id(0)*get_local_size(0)] = 0; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel rebraid_chase_instance_pointers(global struct GRL_RAYTRACING_INSTANCE_DESC *instances_out, + global void *instance_buff) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC **instances_in = + (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instance_buff; + + instances_out[get_local_id(0)] = *instances_in[get_local_id(0)]; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel rebraid_chase_instance_pointers_indirect( + global struct 
GRL_RAYTRACING_INSTANCE_DESC* instances_out, + global void* instance_buff, + global struct IndirectBuildRangeInfo const* const indirect_data) +{ + instance_buff = ((global char*)instance_buff) + indirect_data->primitiveOffset; + global const struct GRL_RAYTRACING_INSTANCE_DESC** + instances_in = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instance_buff; + + instances_out[get_local_id(0)] = *instances_in[get_local_id(0)]; +} + +/////////////////////////////////////////////////////////////////////////////////////////// +// Count splits +/////////////////////////////////////////////////////////////////////////////////////////// + +GRL_INLINE void DEBUG_SUBGROUP_print_split_counts( uniform uint instanceID, varying uint split_counts_lo, varying uint split_counts_hi ) +{ + uniform uint vals[32] = { + sub_group_broadcast( split_counts_lo, 0 ), sub_group_broadcast( split_counts_lo, 1 ), + sub_group_broadcast( split_counts_lo, 2 ), sub_group_broadcast( split_counts_lo, 3 ), + sub_group_broadcast( split_counts_lo, 4 ), sub_group_broadcast( split_counts_lo, 5 ), + sub_group_broadcast( split_counts_lo, 6 ), sub_group_broadcast( split_counts_lo, 7 ), + sub_group_broadcast( split_counts_lo, 8 ), sub_group_broadcast( split_counts_lo, 9 ), + sub_group_broadcast( split_counts_lo, 10 ), sub_group_broadcast( split_counts_lo, 11 ), + sub_group_broadcast( split_counts_lo, 12 ), sub_group_broadcast( split_counts_lo, 13 ), + sub_group_broadcast( split_counts_lo, 14 ), sub_group_broadcast( split_counts_lo, 15 ), + + sub_group_broadcast( split_counts_hi, 0 ), sub_group_broadcast( split_counts_hi, 1 ), + sub_group_broadcast( split_counts_hi, 2 ), sub_group_broadcast( split_counts_hi, 3 ), + sub_group_broadcast( split_counts_hi, 4 ), sub_group_broadcast( split_counts_hi, 5 ), + sub_group_broadcast( split_counts_hi, 6 ), sub_group_broadcast( split_counts_hi, 7 ), + sub_group_broadcast( split_counts_hi, 8 ), sub_group_broadcast( split_counts_hi, 9 ), + sub_group_broadcast( split_counts_hi, 10 ), sub_group_broadcast( split_counts_hi, 11 ), + sub_group_broadcast( split_counts_hi, 12 ), sub_group_broadcast( split_counts_hi, 13 ), + sub_group_broadcast( split_counts_hi, 14 ), sub_group_broadcast( split_counts_hi, 15 ) + }; + + if ( get_sub_group_local_id() == 0 ) + { + printf( + "Instance: %4u " + "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u " + "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u \n" + , + instanceID, + vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7], + vals[8], vals[9], vals[10], vals[11], vals[12], vals[13], vals[14], vals[15], + vals[16], vals[17], vals[18], vals[19], vals[20], vals[21], vals[22], vals[23], + vals[24], vals[25], vals[26], vals[27], vals[28], vals[29], vals[30], vals[31] + ); + } +} + +GRL_INLINE void do_rebraid_count_splits_SG( + uniform global struct BVHBase* bvh, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, + uniform global uint *rebraid_scratch) +{ + uniform const uint instanceID = get_sub_group_global_id(); + uniform RebraidBuffers buffers = cast_rebraid_buffers(rebraid_scratch,instanceID); + + varying uint lane = get_sub_group_local_id(); + varying uint2 splits = SUBGROUP_count_instance_splits(&bvh->Meta.bounds, instances + instanceID); + varying uint split_counts_lo = splits.x; + varying uint split_counts_hi = splits.y; + + // write this instance's per-bin counts + global uint* counts = buffers.instance_bin_counts; + intel_sub_group_block_write2( counts, splits ); + + // update the per-bin 
split and instance counters + if (split_counts_lo > 0) + { + atomic_add(&buffers.bin_split_counts[lane], split_counts_lo); + GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane]); + } + if (split_counts_hi > 0) + { + atomic_add(&buffers.bin_split_counts[lane + 16], split_counts_hi); + GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane + 16]); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_count_splits_SG( + uniform global struct BVHBase* bvh, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, + uniform global uint *rebraid_scratch) +{ + do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +rebraid_count_splits_SG_indirect( + uniform global struct BVHBase* bvh, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, + uniform global uint *rebraid_scratch, + global struct IndirectBuildRangeInfo const * const indirect_data) +{ + instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instances) + indirect_data->primitiveOffset); + do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch); +} + + +#define HEAP_SIZE 16 +#define COUNT_SPLITS_WG_SIZE 16 + +struct SLMHeapNode +{ + short offs; + ushort area; +}; + +struct SLMHeap +{ + struct SLMHeapNode nodes[HEAP_SIZE]; + ushort size; + ushort min_key; +}; + +GRL_INLINE bool SLMHeapNode_Greater( struct SLMHeapNode a, struct SLMHeapNode b ) +{ + return a.area > b.area; +} + +GRL_INLINE ushort SLMHeapNode_UnpackKey( struct SLMHeapNode a ) +{ + return a.area; +} + +GRL_INLINE void SLMHeapNode_Unpack( struct SLMHeapNode a, ushort* area_out, short* offs_out ) +{ + *area_out = a.area; + *offs_out = a.offs; +} + +GRL_INLINE struct SLMHeapNode SLMHeapNode_Pack( ushort area, short offs ) +{ + struct SLMHeapNode n; + n.offs = offs; + n.area = area; + return n; +} + + +GRL_INLINE void SLMHeap_Init( struct SLMHeap* heap ) +{ + heap->size = 0; + heap->min_key = 0xffff; +} + +GRL_INLINE bool SLMHeap_empty( struct SLMHeap* heap ) +{ + return heap->size == 0; +} + +GRL_INLINE bool SLMHeap_full( struct SLMHeap* heap ) +{ + return heap->size == HEAP_SIZE; +} + + +GRL_INLINE void SLMHeap_push( struct SLMHeap* heap, ushort area, short offs ) +{ + ushort insert_pos; + if ( SLMHeap_full( heap ) ) + { + ushort current_min_key = heap->min_key; + if ( area <= current_min_key ) + return; // don't push stuff that's smaller than the current minimum + + // search for the minimum element + // The heap is laid out in level order, so it is sufficient to search only the last half + ushort last_leaf = HEAP_SIZE - 1; + ushort first_leaf = (last_leaf / 2) + 1; + + // as we search, keep track of what the new min-key will be so we can cull future pushes + ushort new_min_key = area; + ushort min_pos = 0; + + do + { + ushort idx = first_leaf++; + + ushort current_key = SLMHeapNode_UnpackKey( heap->nodes[idx] ); + bool found_min_pos = (min_pos == 0) && (current_key == current_min_key); + + if ( found_min_pos ) + min_pos = idx; + else + new_min_key = min( current_key, new_min_key ); + + } while ( first_leaf != last_leaf ); + + heap->min_key = new_min_key; + insert_pos = min_pos; + } + else + { + insert_pos = heap->size++; + heap->min_key = min( area, heap->min_key ); + } + + heap->nodes[insert_pos] = SLMHeapNode_Pack( area, offs ); + + // heap-up + while ( insert_pos 
) + { + ushort parent = insert_pos / 2; + + struct SLMHeapNode parent_node = heap->nodes[parent]; + struct SLMHeapNode current_node = heap->nodes[insert_pos]; + if ( SLMHeapNode_Greater( parent_node, current_node ) ) + break; + + heap->nodes[insert_pos] = parent_node; + heap->nodes[parent] = current_node; + insert_pos = parent; + } + +} + +bool SLMHeap_pop_max( struct SLMHeap* heap, ushort* area_out, short* offs_out ) +{ + if ( SLMHeap_empty( heap ) ) + return false; + + SLMHeapNode_Unpack( heap->nodes[0], area_out, offs_out ); + + // heap down + ushort size = heap->size; + ushort idx = 0; + do + { + ushort left = 2 * idx + 1; + ushort right = 2 * idx + 2; + if ( left >= size ) + break; + + if ( right >= size ) + { + heap->nodes[idx] = heap->nodes[left]; + break; + } + + struct SLMHeapNode left_node = heap->nodes[left]; + struct SLMHeapNode right_node = heap->nodes[right]; + bool go_left = SLMHeapNode_Greater( left_node, right_node ); + heap->nodes[idx] = go_left ? left_node : right_node; + idx = go_left ? left : right; + + } while ( 1 ); + + heap->size = size - 1; + return true; +} + +void SLMHeap_Print( struct SLMHeap* heap ) +{ + printf( " size=%u min=%u {", heap->size, heap->min_key ); + for ( uint i = 0; i < heap->size; i++ ) + printf( "%04x:%04x", heap->nodes[i].area, heap->nodes[i].offs ); +} + + +GRL_INLINE bool can_open_root( + global struct BVHBase* bvh_base, + const struct GRL_RAYTRACING_INSTANCE_DESC* instance + ) +{ + float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds ); + float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds ); + if ( !is_aabb_valid( root_lower, root_upper ) || GRL_get_InstanceMask(instance) == 0 ) + return false; + + global InternalNode* node = get_node( bvh_base, 0 ); + if ( node->nodeType != NODE_TYPE_INTERNAL ) + return false; + + return is_node_openable( node ); +} + + +GRL_INLINE void count_instance_splits( + global struct AABB3f* geometry_bounds, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, + local ushort* bin_split_counts, + local struct SLMHeap* heap +) +{ + global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure; + + SLMHeap_Init( heap ); + + float relative_area_scale = 1.0f / AABB3f_halfArea( geometry_bounds ); + float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds ); + float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds ); + + ushort quantized_area = quantize_area( transformed_aabb_halfArea( root_lower, root_upper, instance->Transform ) * relative_area_scale ); + short node_offs = 0; + ushort num_splits = 0; + + global InternalNode* node_array = get_node( bvh_base, 0 ); + + while ( 1 ) + { + global InternalNode* node = node_array + node_offs; + + // count this split + uint bin = get_rebraid_bin_index( quantized_area, NUM_REBRAID_BINS ); + bin_split_counts[bin]++; + + // open this node and push children to heap + + // TODO_OPT: Restructure this control flow to prevent differnet lanes from skipping different loop iterations and diverging + // TODO_OPT: Precompute openability masks in BLAS nodes at build time... 
one bit for self and N bits for each child + int offs = node->childOffset; + for ( ushort i = 0; i < NUM_CHILDREN; i++ ) + { + if ( InternalNode_IsChildValid( node, i ) ) + { + if ( offs >= SHRT_MIN && offs <= SHRT_MAX ) + { + if ( is_node_openable( node_array + offs ) ) + { + ushort area = get_child_area( node, i, instance->Transform, relative_area_scale ); + SLMHeap_push( heap, area, (short)offs ); + } + } + } + offs += InternalNode_GetChildBlockIncr( node, i ); + } + + num_splits++; + if ( num_splits == MAX_SPLITS_PER_INSTANCE ) + break; + + if ( !SLMHeap_pop_max( heap, &quantized_area, &node_offs ) ) + break; + } + +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( COUNT_SPLITS_WG_SIZE, 1, 1 )) ) +void kernel +rebraid_count_splits( + global struct BVHBase* bvh_base, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + global uint* rebraid_scratch, + uint num_instances + ) +{ + local struct SLMHeap heap[COUNT_SPLITS_WG_SIZE]; + local ushort split_counts[COUNT_SPLITS_WG_SIZE][NUM_REBRAID_BINS]; + + // initialize stuff + // TODO_OPT: transpose this and subgroup-vectorize it so that + // block-writes can be used + for ( uint i = 0; i < NUM_REBRAID_BINS; i++ ) + split_counts[get_local_id( 0 )][i] = 0; + + + // count splits for this thread's instance + uniform uint base_instance = get_group_id( 0 ) * get_local_size( 0 ); + uint instanceID = base_instance + get_local_id( 0 ); + + if ( instanceID < num_instances ) + { + global BVHBase* bvh_base = (global BVHBase*)instances[instanceID].AccelerationStructure; + if ( can_open_root( bvh_base, &instances[instanceID] ) ) + { + count_instance_splits( &bvh_base->Meta.bounds, + &instances[instanceID], + &split_counts[get_local_id( 0 )][0], + &heap[get_local_id(0)] ); + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID ); + + + // reduce bins + for ( uint bin = get_local_id( 0 ); bin < NUM_REBRAID_BINS; bin += get_local_size( 0 ) ) + { + // TODO_OPT: There's probably a better way to arrange this computation + uint bin_split_count = 0; + uint bin_instance_count = 0; + for ( uint i = 0; i < COUNT_SPLITS_WG_SIZE; i++ ) + { + uint s = split_counts[i][bin]; + bin_split_count += s; + bin_instance_count += (s > 0) ? 
1 : 0; + } + + if ( bin_split_count > 0 ) + { + atomic_add( &buffers.bin_split_counts[bin], bin_split_count ); + atomic_add( &buffers.bin_instance_counts[bin], bin_instance_count ); + } + } + + // write out bin counts for each instance + for ( uniform uint i = get_sub_group_id(); i < COUNT_SPLITS_WG_SIZE; i += get_num_sub_groups() ) + { + uniform uint iid = base_instance + i; + if ( iid > num_instances ) + break; + + global uint* instance_bin_counts = cast_rebraid_buffers( rebraid_scratch, iid ).instance_bin_counts; + + for ( uniform ushort j = 0; j < NUM_REBRAID_BINS; j += get_sub_group_size() ) + { + uint count = split_counts[i][j + get_sub_group_local_id() ]; + intel_sub_group_block_write( instance_bin_counts + j, count ); + } + } + +} + + + + +/////////////////////////////////////////////////////////////////////////////////////////// +// Build PrimRefs +/////////////////////////////////////////////////////////////////////////////////////////// + +GRL_INLINE uint get_instance_split_count(RebraidBuffers buffers, uint instanceID, uint available_splits) +{ + global uint* instance_desired_split_count = buffers.instance_bin_counts; + global uint *bin_split_counts = buffers.bin_split_counts; + global uint *bin_instance_counts = buffers.bin_instance_counts; + + uint total_splits = 0; + uint remaining_available_splits = available_splits; + uint max_bin = 0; + uint desired_splits_this_bin = 0; + uint instance_splits = 0; + + do + { + // stop when we reach a level where we can't satisfy the demand + desired_splits_this_bin = instance_desired_split_count[max_bin]; + uint total_bin_splits = bin_split_counts[max_bin]; + + if (total_bin_splits > remaining_available_splits) + break; + + // we have enough budget to give all instances everything they want at this level, so do it + remaining_available_splits -= total_bin_splits; + instance_splits += desired_splits_this_bin; + desired_splits_this_bin = 0; + max_bin++; + + } while (max_bin < NUM_REBRAID_BINS); + + if (max_bin < NUM_REBRAID_BINS) + { + // we have more split demand than we have splits available. The current bin is the last one that gets any splits + // distribute the leftovers as evenly as possible to instances that want them + if (desired_splits_this_bin > 0) + { + // this instance wants splits. how many does it want? + uint desired_total = instance_splits + desired_splits_this_bin; + + // distribute to all instances as many as possible + uint count = bin_instance_counts[max_bin]; + uint whole = remaining_available_splits / count; + remaining_available_splits -= whole * count; + + // distribute remainder to lower numbered instances + size_t partial = (instanceID < remaining_available_splits) ? 1 : 0; + + // give the instance its share. 
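+            // every splitting instance in this bin receives 'whole' extra splits; instances whose ID falls below the leftover count receive one more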
+ instance_splits += whole + partial; + instance_splits = min(instance_splits, desired_total); // don't give it more than it needs + } + } + + return instance_splits; +} + +GRL_INLINE void build_unopened_primref( + struct AABB3f* centroid_bounds, + global __const BVHBase *bvh_base, + global volatile uint *primref_counter, + global struct AABB *primref_buffer, + global __const float *Transform, + uint instanceID, + float matOverhead, + ushort instanceMask) +{ + float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); + float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); + + struct AABB primRef; + AABB_init( &primRef ); + + uint bvhoffset = (uint)BVH_ROOT_NODE_OFFSET; + if (is_aabb_valid(root_lower, root_upper) && instanceMask != 0) + { + primRef = AABBfromAABB3f(compute_xfm_bbox(Transform, BVHBase_GetRootNode(bvh_base), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &bvh_base->Meta.bounds, matOverhead)); + } + else + { + primRef.lower.x = Transform[3]; + primRef.lower.y = Transform[7]; + primRef.lower.z = Transform[11]; + primRef.upper.xyz = primRef.lower.xyz; + + instanceMask = 0; + bvhoffset = NO_NODE_OFFSET; + } + + primRef.lower.w = as_float(instanceID | (instanceMask << 24)); + primRef.upper.w = as_float(bvhoffset); + + float3 centroid = primRef.lower.xyz + primRef.upper.xyz; + centroid_bounds->lower[0] = centroid.x; + centroid_bounds->upper[0] = centroid.x; + centroid_bounds->lower[1] = centroid.y; + centroid_bounds->upper[1] = centroid.y; + centroid_bounds->lower[2] = centroid.z; + centroid_bounds->upper[2] = centroid.z; + + uint place = GRL_ATOMIC_INC(primref_counter); + primref_buffer[place] = primRef; +} + +GRL_INLINE void build_opened_primrefs( + varying bool lane_mask, + varying uint offset, + varying InternalNode* node, + varying struct AABB3f* centroid_bounds, + uniform global BVHBase *bvh_base, + uniform volatile global uint *primref_counter, + uniform global struct AABB *primref_buffer, + uniform uint instanceID, + uniform const float *Transform, + uniform float matOverhead, + varying ushort instanceMask) +{ + // TODO_OPT: This function is often called with <= 6 active lanes + // If lanes are sparse, consider jumping to a sub-group vectorized variant... 
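+    // Each lane with its mask bit set atomically claims a primref slot, transforms the opened node's box to world space,
+    // packs (instanceID | instanceMask << 24) into lower.w and the node's byte offset from the start of the BVH into upper.w,
+    // and extends this lane's running centroid bounds.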
+ + if (lane_mask) + { + varying uint place = GRL_ATOMIC_INC(primref_counter); + + struct AABB box = AABBfromAABB3f(compute_xfm_bbox(Transform, node, XFM_BOX_NOT_REFINED_CLIPPED, &bvh_base->Meta.bounds, matOverhead)); + + box.lower.w = as_float(instanceID | (instanceMask << 24)); + box.upper.w = as_float(offset * 64 + (uint)BVH_ROOT_NODE_OFFSET); + primref_buffer[place] = box; + + AABB3f_extend_point( centroid_bounds, box.lower.xyz + box.upper.xyz ); + } +} + + +GRL_INLINE void SUBGROUP_open_nodes( + uniform global struct AABB3f *geometry_bounds, + uniform uint split_limit, + uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance, + uniform uint instanceID, + uniform volatile global uint *primref_counter, + uniform global struct AABB *primref_buffer, + varying struct AABB3f* centroid_bounds, + float transformOverhead) +{ + uniform SGHeap heap; + SGHeap_init(&heap); + + uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds); + uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure; + + uniform uint16_t node_offs = 0; + varying uint lane = get_sub_group_local_id(); + + uniform InternalNode* node_array = get_node( bvh_base, 0 ); + + LOOP_TRIPWIRE_INIT; + + while ( 1 ) + { + uniform InternalNode *node = node_array + node_offs; + + varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node); + varying bool sg_valid = false; + varying bool sg_openable = false; + if (lane < NUM_CHILDREN) + { + sg_valid = InternalNode_IsChildValid(node, lane); + if (sg_valid && (sg_offs <= MAX_NODE_OFFSET)) + { + sg_openable = is_node_openable( node_array + sg_offs); + } + } + + uniform uint16_t valid_children = intel_sub_group_ballot(sg_valid); + uniform uint16_t openable_children = intel_sub_group_ballot(sg_openable); + uniform uint16_t unopenable_children = valid_children & (~openable_children); + + if ( openable_children ) + { + varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale ); + + // try to push all openable children to the heap + if ( !SGHeap_full( &heap ) ) + { + openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children ); + } + + // we have more openable children than will fit in the heap + // process these one by one. + // TODO: Try re-writing with sub_group_any() and see if compiler does a better job + while ( openable_children ) + { + // pop min element + uniform uint16_t min_area; + uniform uint16_t min_offs; + SGHeap_full_pop_min( &heap, &min_area, &min_offs ); + + // eliminate all children smaller than heap minimum. 
+ // mark eliminated children as unopenable + varying uint culled_children = openable_children & intel_sub_group_ballot( sg_area <= min_area ); + unopenable_children ^= culled_children; + openable_children &= ~culled_children; + + if ( openable_children ) + { + // if any children survived the purge + // find the first such child and swap its offset for the one from the heap + // + uniform uint child_id = ctz( openable_children ); + uniform uint16_t old_min_offs = min_offs; + min_area = sub_group_broadcast( sg_area, child_id ); + min_offs = sub_group_broadcast( sg_offs, child_id ); + + if ( lane == child_id ) + sg_offs = old_min_offs; + + openable_children ^= (1 << child_id); + unopenable_children ^= (1 << child_id); + } + + SGHeap_push_and_fill( &heap, min_area, min_offs ); + + } + } + + if (unopenable_children) + { + varying bool sg_create_primref = ((1 << lane) & unopenable_children); + build_opened_primrefs(sg_create_primref, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance)); + } + + --split_limit; + if (split_limit == 0) + { + // split limit exceeded + // create primrefs for all remaining openable nodes in heap + varying bool sg_mask = SGHeap_get_lane_mask(&heap); + sg_offs = SGHeap_get_lane_values(&heap); + build_opened_primrefs(sg_mask, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance)); + + break; + } + + + // NOTE: the heap should never be empty. If it is, the instance was given too many splits. + + // get next node from heap + uint16_t quantized_area; + SGHeap_pop_max(&heap, &quantized_area, &node_offs); + + LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_build_primrefs" ); + + } +} + + +#define OPEN_QUEUE_SIZE 256 +#define OPEN_QUEUE_NUM_SGS 16 + +typedef struct OpenQueueEntry +{ + uint instanceID; + ushort num_splits; +} OpenQueueEntry; + +typedef struct OpenQueue +{ + uint num_produced; + uint num_consumed; + OpenQueueEntry Q[OPEN_QUEUE_SIZE]; +} OpenQueue; + +uniform uint SUBGROUP_GetNextQueueEntry( local OpenQueue* queue ) +{ + uint next = 0; + if ( get_sub_group_local_id() == 0 ) + next = GRL_ATOMIC_INC( &queue->num_consumed ); + return sub_group_broadcast( next, 0 ); +} + + +GRL_INLINE void do_rebraid_build_primrefs( + local struct AABB3f* SLM_CentroidBounds, + local OpenQueue* SLM_Q, + global struct Globals* globals, + global struct BVHBase* base, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, + global uint* rebraid_scratch, + global struct AABB* primref_buffer, + uint extra_primref_count, + uint num_instances) +{ + varying uint instanceID = get_sub_group_size() * get_sub_group_global_id() + get_sub_group_local_id(); + + uniform volatile global uint* primref_counter = &globals->numPrimitives; + uniform RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID ); + uniform uint available_splits = get_num_splits( extra_primref_count, NUM_CHILDREN ); + + + + varying struct AABB3f centroidBounds; + AABB3f_init( ¢roidBounds ); + + if ( get_local_id( 0 ) == 0 ) + { + SLM_Q->num_produced = 0; + SLM_Q->num_consumed = 0; + AABB3f_init( SLM_CentroidBounds ); + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // assign splits to unopened instances. 
Build primrefs for unsplit instances in vectorized form + varying uint num_splits = 0; + if ( instanceID < num_instances ) + { + num_splits = get_instance_split_count( buffers, instanceID, available_splits ); + if ( num_splits == 0 ) + { + varying global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID; + varying global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure; + if ( bvh_base != 0 ) + { + build_unopened_primref( ¢roidBounds, bvh_base, primref_counter, primref_buffer, instance->Transform, instanceID, 0.0f, GRL_get_InstanceMask(instance)); + } + } + else + { + // defer opened instances + uint place = GRL_ATOMIC_INC( &SLM_Q->num_produced ); + SLM_Q->Q[place].instanceID = instanceID; + SLM_Q->Q[place].num_splits = (ushort)num_splits; + } + } + + barrier( CLK_LOCAL_MEM_FENCE ); + + // if there were opened instances, process them, one per subgroup + uniform uint num_produced = SLM_Q->num_produced; + uniform uint next = SUBGROUP_GetNextQueueEntry( SLM_Q ); + + while ( next < num_produced ) + { + uniform uint instanceID = SLM_Q->Q[next].instanceID; + uniform uint num_splits = SLM_Q->Q[next].num_splits; + + uniform global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID; + + float transformOverhead = +#if FINE_TRANSFORM_NODE_BOX + transformation_bbox_surf_overhead(instance->Transform); +#else + 0.0f; +#endif + + SUBGROUP_open_nodes( + &base->Meta.bounds, + num_splits, + instance, + instanceID, + primref_counter, + primref_buffer, + ¢roidBounds, + transformOverhead); + + next = SUBGROUP_GetNextQueueEntry( SLM_Q ); + } + + // reduce the centroid bounds AABB + struct AABB3f reduced = AABB3f_sub_group_reduce( ¢roidBounds ); + if ( get_sub_group_local_id() == 0 ) + AABB3f_atomic_merge_localBB_nocheck( SLM_CentroidBounds, &reduced ); + + barrier( CLK_LOCAL_MEM_FENCE ); + + if( get_local_id(0) == 0 ) + { + atomic_min( (global float*) (&globals->centroidBounds.lower) + 0, SLM_CentroidBounds->lower[0] ); + atomic_min( (global float*) (&globals->centroidBounds.lower) + 1, SLM_CentroidBounds->lower[1] ); + atomic_min( (global float*) (&globals->centroidBounds.lower) + 2, SLM_CentroidBounds->lower[2] ); + atomic_max( (global float*) (&globals->centroidBounds.upper) + 0, SLM_CentroidBounds->upper[0] ); + atomic_max( (global float*) (&globals->centroidBounds.upper) + 1, SLM_CentroidBounds->upper[1] ); + atomic_max( (global float*) (&globals->centroidBounds.upper) + 2, SLM_CentroidBounds->upper[2] ); + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +void kernel rebraid_build_primrefs( + global struct Globals* globals, + global struct BVHBase* base, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, + global uint* rebraid_scratch, + global struct AABB* primref_buffer, + uint extra_primref_count, + uint num_instances) +{ + local struct AABB3f SLM_CentroidBounds; + local OpenQueue SLM_Q; + do_rebraid_build_primrefs( + &SLM_CentroidBounds, + &SLM_Q, + globals, + base, + instance_buffer, + rebraid_scratch, + primref_buffer, + extra_primref_count, + num_instances); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) +void kernel rebraid_build_primrefs_indirect( + global struct Globals* globals, + global struct BVHBase* base, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, + 
global uint* rebraid_scratch, + global struct AABB* primref_buffer, + global struct IndirectBuildRangeInfo const * const indirect_data, + uint extra_primref_count ) +{ + local struct AABB3f SLM_CentroidBounds; + local OpenQueue SLM_Q; + + instance_buffer = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instance_buffer) + indirect_data->primitiveOffset); + + do_rebraid_build_primrefs( + &SLM_CentroidBounds, + &SLM_Q, + globals, + base, + instance_buffer, + rebraid_scratch, + primref_buffer, + extra_primref_count, + indirect_data->primitiveCount); +} + + +/////////////////////////////////////////////////////////////////////////////////////////// +// Misc +/////////////////////////////////////////////////////////////////////////////////////////// + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +ISA_TEST(global InternalNode *n, global uint *out, global float *xform, float scale) +{ + + out[get_sub_group_local_id()] = InternalNode_IsChildValid(n, get_sub_group_local_id()); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( (reqd_work_group_size( 1, 1, 1 )) ) void kernel +DEBUG_PRINT( + global struct Globals* globals, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, + global uint* rebraid_scratch, + global struct AABB* primref_buffer, + dword num_extra, + dword input_instances ) +{ +#if 0 + // validate primrefs + if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) + { + uint refs = globals->numPrimitives; + for ( uint i = 0; i < refs; i++ ) + { + if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) || + any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) || + any( isnan(primref_buffer[i].lower.xyz) ) || + any( isnan(primref_buffer[i].upper.xyz) ) ) + { + struct AABB box = primref_buffer[i]; + printf( "BAD BOX: %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ), + box.lower.x, box.lower.y, box.lower.z, + box.upper.x, box.upper.y, box.upper.z, + as_uint( box.lower.w ) ); + } + + const uint instIndex = PRIMREF_instanceID(&primref_buffer[i]); // TODO: Refactor me. 
We should not be using struct AABB for primRefs + const uint rootByteOffset = as_uint( primref_buffer[i].upper.w ); // It should be struct PrimRef + if ( instIndex >= input_instances ) + printf( "BAD INSTANCE INDEX: %u", i ); + else + { + global struct BVHBase* blas = (global struct BVHBase*)instance_buffer[instIndex].AccelerationStructure; + if ( blas ) + { + struct InternalNode* start = BVHBase_GetInternalNodes( blas ); + struct InternalNode* end = BVHBase_GetInternalNodesEnd( blas ); + + InternalNode* entryPoint = (struct InternalNode*)((char*)instance_buffer[instIndex].AccelerationStructure + rootByteOffset); + if ( entryPoint < start || entryPoint >= end ) + printf( "BAD ENTRYPOINT: %u\n", i ); + if ( (rootByteOffset & 63) != 0 ) + printf( "MISALIGNED ENTRYPOInt: %u\n", i ); + + } + } + } + } +#endif +#if 0 + if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) + printf( "REBRAIDED: %u\n", globals->numPrimitives ); + + // print instance bin information + if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) + { + printf( "REBRAIDED: %u\n", globals->numPrimitives ); + for( uint i=0; i<231; i++ ) + { + RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch,i ); + printf( " ID:%4u ", i ); + for ( uint j = 0; j < NUM_REBRAID_BINS; j++ ) + { + global uint* count = buffers.instance_bin_counts; + printf( " %2u ", count[j] ); + } + printf( "\n" ); + } + } +#endif +#if 0 + if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) + { + printf( "Instances: %u\n", globals->numPrimitives ); + + for ( uint i = 0; i < globals->numPrimitives; i++ ) + { + if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) || + any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) ) + { + struct AABB box = primref_buffer[i]; + printf( " %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ), + box.lower.x, box.lower.y, box.lower.z, + box.upper.x, box.upper.y, box.upper.z, + as_uint( box.lower.w ) ); + } + + } + } +#endif +} + diff --git a/src/intel/vulkan/grl/gpu/common.h b/src/intel/vulkan/grl/gpu/common.h new file mode 100644 index 00000000000..5fa0e117ae4 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/common.h @@ -0,0 +1,429 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "shared.h" +#include "intrinsics.h" +#include "AABB.h" +#include "AABB3f.h" +#include "qbvh6.h" + +/* ====== BVH_BUILDER config ====== */ + +__constant const float cfg_intCost = 4.0f; +__constant const float cfg_travCost = 1.0f; +__constant const uint cfg_minLeafSize = BVH_LEAF_N_MIN; +__constant const uint cfg_maxLeafSize = BVH_LEAF_N_MAX; +__constant const uint cfg_maxDepth = BUILDRECORD_STACK_SIZE; + +#define ENABLE_CONVERSION_CHECKS 0 + +#ifdef ENABLE_BIG_REG_ANNOTATION +#define GRL_ANNOTATE_BIG_REG_REQ __attribute__((annotate("num-thread-per-eu 4"))) +#else +#define GRL_ANNOTATE_BIG_REG_REQ +#endif + +#ifdef ENABLE_IGC_DO_NOT_SPILL +#define GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((annotate("igc-do-not-spill"))) +#else +#define GRL_ANNOTATE_IGC_DO_NOT_SPILL +#endif + +#define ERROR() + +/* =================================================================================================================================================== */ +/* =================================================================================================================================================== */ +/* 
=================================================================================================================================================== */ +/* =================================================================================================================================================== */ + +GRL_INLINE unsigned int getNumLeafPrims(unsigned int offset) +{ + return (offset & 0x7) - 3; +} + +GRL_INLINE unsigned int getLeafOffset(unsigned int offset) +{ + return offset & (~0x7); +} + +GRL_INLINE float4 triangleNormal(const float4 v0, const float4 v1, const float4 v2) +{ + const float4 a = v1 - v0; + const float4 b = v2 - v0; + return cross(a, b); +} + +GRL_INLINE float areaTriangle(const float4 v0, const float4 v1, const float4 v2) +{ + const float4 normal = triangleNormal(v0, v1, v2); + return length((float3)(normal.x, normal.y, normal.z)) * 0.5f; +} + +GRL_INLINE float det2(const float2 a, const float2 b) +{ + return a.x * b.y - a.y * b.x; +} + +GRL_INLINE float areaProjectedTriangle(const float4 v0, const float4 v1, const float4 v2) +{ + const float xy = 0.5f * fabs(det2(v1.xy - v0.xy, v2.xy - v0.xy)); + const float yz = 0.5f * fabs(det2(v1.yz - v0.yz, v2.yz - v0.yz)); + const float zx = 0.5f * fabs(det2(v1.zx - v0.zx, v2.zx - v0.zx)); + return xy + yz + zx; +} + +typedef struct Block64B { + char data[64]; +} Block64B __attribute__((aligned(64))); + +typedef char byte_align64B __attribute__((aligned(64))); + +/* ====================================================================== */ +/* ============================== GLOBALS =============================== */ +/* ====================================================================== */ + +GRL_INLINE bool Globals_OnFinish(global struct Globals *globals) +{ + /* last active HW thread ? 
*/ + if (get_local_id(0) == 0) + { + const uint sync = atomic_add(&globals->sync, 1); + if (sync + 1 == get_num_groups(0)) + { + globals->sync = 0; + return true; + } + } + return false; +} + +GRL_INLINE uint BlockAllocator_BytesUsed(struct BlockAllocator *p) +{ + return p->cur - p->start; +}; + +GRL_INLINE uint BlockAllocator_Alloc(__global struct BlockAllocator *p, const uint size) +{ + return atomic_add(&p->cur, size); +} + +GRL_INLINE uint BlockAllocator_Alloc_Single(__global struct BlockAllocator *p, const uint size) +{ + uint offset = 0; + if (get_sub_group_local_id() == 0) + offset = atomic_add(&p->cur, size); + return sub_group_broadcast(offset, 0); +} + +// node allocation returns an offset from beginning of BVH to allocated node +// in multiples of 64B +GRL_INLINE uint allocate_inner_nodes(global struct BVHBase* base, uint num_nodes ) +{ + return atomic_add_global( &base->nodeDataCur, num_nodes ); +} +GRL_INLINE uint allocate_procedural_leaves(global struct BVHBase* base, uint num_nodes) +{ + return atomic_add_global(&base->proceduralDataCur, num_nodes); +} + +GRL_INLINE uint allocate_quad_leaves(global struct BVHBase* base, uint num_nodes) +{ + return atomic_add_global(&base->quadLeafCur, num_nodes); +} + +#if 0 +GRL_INLINE uint alloc_node_mem(global struct Globals *globals, const uint size) +{ + const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ + return BlockAllocator_Alloc(&globals->node_mem_allocator, aligned_size); +} + +GRL_INLINE uint alloc_single_node_mem(global struct Globals *globals, const uint size) +{ + const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ + return BlockAllocator_Alloc_Single(&globals->node_mem_allocator, aligned_size); +} + +GRL_INLINE uint alloc_quad_leaf_mem(global struct Globals *globals, const uint size) +{ + const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ + return BlockAllocator_Alloc(&globals->quad_mem_allocator, aligned_size); +} + +GRL_INLINE uint alloc_procedural_leaf_mem(global struct Globals *globals, const uint size) +{ + const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ + return BlockAllocator_Alloc(&globals->procedural_mem_allocator, aligned_size); +} +#endif + +GRL_INLINE global struct BuildRecord *getBuildRecords(char *bvh_mem, struct Globals *globals) +{ + return (global struct BuildRecord *)(bvh_mem + globals->build_record_start); +} + +/* ======================================================================= */ +/* ============================== TRIANGLE =============================== */ +/* ======================================================================= */ + +/*GRL_INLINE void printTriangle(struct Triangle *t) +{ + printf("vtx[0] %d vtx[1] %d vtx[2] %d primID %d geomID %d \n",t->vtx[0],t->vtx[1],t->vtx[2],t->primID,t->geomID); + }*/ + +/* ==================================================================== */ +/* ============================== SPLIT =============================== */ +/* ==================================================================== */ + +GRL_INLINE void printSplit(struct Split *split) +{ + printf("split sah %f dim %d pos %d \n", split->sah, split->dim, split->pos); +} + +/* ========================================================================== */ +/* ============================== BUILDRECORD =============================== */ +/* ========================================================================== */ + +GRL_INLINE void initBuildRecord(struct 
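Globals_OnFinish() above implements the usual "last work-group finishes the job" termination pattern: every work-group atomically increments globals->sync, and only work-item 0 of the group that observes the final count resets the counter and receives true. A minimal usage sketch (illustrative only; out_flag is a hypothetical output and not part of this patch):

GRL_INLINE void example_finish_epilogue( global struct Globals* globals,
                                         global uint* out_flag )
{
    // true exactly once per dispatch, on work-item 0 of the last group to arrive
    if ( Globals_OnFinish( globals ) )
        *out_flag = 1;
}
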
BuildRecord *buildRecord, uint start, uint end) +{ + AABB_init(&buildRecord->centroidBounds); + buildRecord->start = start; + buildRecord->end = end; +} + +GRL_INLINE void extendBuildRecord(struct BuildRecord *buildRecord, struct AABB *primref) +{ + AABB_extend_point(&buildRecord->centroidBounds, AABB_centroid2(primref)); +} + +GRL_INLINE uint getBuildRecursionDepth(struct BuildRecord *buildRecord) +{ + return as_uint(buildRecord->centroidBounds.upper.w); +} + +GRL_INLINE void setBuildRecursionDepth(struct BuildRecord *buildRecord, uint depth) +{ + buildRecord->centroidBounds.upper.w = as_float(depth); +} + +GRL_INLINE uint getNumPrimsBuildRecord(struct BuildRecord *buildRecord) +{ + return buildRecord->end - buildRecord->start; +} + +/* ========================================================================== */ +/* =================== BinaryMortonCodeHierarchy ============================= */ +/* ========================================================================== */ + +GRL_INLINE void BinaryMortonCodeHierarchy_init(struct BinaryMortonCodeHierarchy *record, uint start, uint end) +{ + record->range.start = start; + record->range.end = end; + record->leftChild = -1; + record->rightChild = -1; +// record->flag = 0; +} + +GRL_INLINE uint BinaryMortonCodeHierarchy_getNumPrimitives(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID) +{ + /* leaf case */ + if (nodeID & (uint)(1 << 31)) + return 1; + + /* inner node case*/ + else + return nodes[nodeID].range.end - nodes[nodeID].range.start + 1; +} + +GRL_INLINE struct BinaryMortonCodeHierarchy BinaryMortonCodeHierarchy_getEntry(global struct BinaryMortonCodeHierarchy* nodes, uint nodeID) +{ + struct BinaryMortonCodeHierarchy entry; + + if (nodeID & (uint)(1 << 31)) { + /* leaf case */ + uint rangeStart = nodeID ^ (uint)(1 << 31); + BinaryMortonCodeHierarchy_init(&entry, rangeStart, rangeStart); + } + else { + /* inner node case*/ + entry = nodes[nodeID]; + } + + return entry; +} + +GRL_INLINE uint BinaryMortonCodeHierarchy_getRangeStart(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID) +{ + /* leaf case */ + if (nodeID & (uint)(1 << 31)) + return nodeID ^ (uint)(1 << 31); + + /* inner node case*/ + else + return nodes[nodeID].range.start; +} + +/* ==================================================================== */ +/* ============================== RANGE =============================== */ +/* ==================================================================== */ + +GRL_INLINE void printRange(struct Range *range) +{ + printf("start %d end %d \n", range->start, range->end); +} + +GRL_INLINE bool equalRange(struct Range *range0, struct Range *range1) +{ + if (range0->start == range1->start && + range0->end == range1->end) + return true; + return false; +} + +GRL_INLINE uint getSizeRange(struct Range *range) +{ + return range->end - range->start; +} + +/* ==================================================================== */ +/* ========================= ProceduralLeaf =========================== */ +/* ==================================================================== */ + +#if 0 +struct ProceduralLeaf +{ + uint shaderIndex_geomMask; + uint geomIndex_flags; + uint N_last; + uint primIndex[13]; +}; +#endif + +GRL_INLINE uint ProceduralLeaf_geomIndex(global struct ProceduralLeaf *This) +{ + return This->leafDesc.geomIndex_flags & 0x1FFFFFFF; +} + +GRL_INLINE uint ProceduralLeaf_primIndex(global struct ProceduralLeaf *This, uint i) +{ + //assert(i < N); + return This->_primIndex[i]; +} + +/* 
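The BinaryMortonCodeHierarchy helpers above encode leaves directly in the node ID: if bit 31 is set, the remaining 31 bits hold the start of a single-primitive range, otherwise the ID indexes an inner node. A small encode/decode sketch of that convention (illustrative only, not part of this patch):

GRL_INLINE uint MortonHierarchy_encodeLeafID( uint rangeStart )
{
    // leaf IDs carry the range start in the low 31 bits, with bit 31 set
    return rangeStart | (uint)(1 << 31);
}

GRL_INLINE bool MortonHierarchy_isLeafID( uint nodeID )
{
    return ( nodeID & (uint)(1 << 31) ) != 0;
}

GRL_INLINE uint MortonHierarchy_leafRangeStart( uint nodeID )
{
    // mirrors the leaf case of BinaryMortonCodeHierarchy_getRangeStart()
    return nodeID ^ (uint)(1 << 31);
}
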
==================================================================== */ +/* =========================== TrianglePair =========================== */ +/* ==================================================================== */ + +struct TrianglePair +{ + uint4 a; // indices of the 4 verts to store in the quad + uint3 lb; // index of the second triangle's verts in 'a' +}; + +GRL_INLINE struct TrianglePair TrianglePair_Constructor(uint3 tri0, uint primID0, uint3 tri1, uint primID1) +{ + struct TrianglePair q; + q.a.x = tri0.x; + q.a.y = tri0.y; + q.a.z = tri0.z; + q.a.w = tri0.z; + + uint3 b; + b.x = tri1.x; + b.y = tri1.y; + b.z = tri1.z; + + q.lb = (uint3)(3); + + q.lb.x = (b.x == q.a.x) ? 0 : q.lb.x; + q.lb.y = (b.y == q.a.x) ? 0 : q.lb.y; + q.lb.z = (b.z == q.a.x) ? 0 : q.lb.z; + + q.lb.x = (b.x == q.a.y) ? 1 : q.lb.x; + q.lb.y = (b.y == q.a.y) ? 1 : q.lb.y; + q.lb.z = (b.z == q.a.y) ? 1 : q.lb.z; + + q.lb.x = (b.x == q.a.z) ? 2 : q.lb.x; + q.lb.y = (b.y == q.a.z) ? 2 : q.lb.y; + q.lb.z = (b.z == q.a.z) ? 2 : q.lb.z; + + q.lb.x = (primID0 != primID1) ? q.lb.x : 0; + q.lb.y = (primID0 != primID1) ? q.lb.y : 0; + q.lb.z = (primID0 != primID1) ? q.lb.z : 0; + + q.a.w = (q.lb.x == 3) ? b.x : q.a.w; + q.a.w = (q.lb.y == 3) ? b.y : q.a.w; + q.a.w = (q.lb.z == 3) ? b.z : q.a.w; + + return q; +} + +GRL_INLINE float InstanceDesc_get_transform(const InstanceDesc *d, const uint32_t row, const uint32_t column) +{ + return d->Transform[row][column]; +} + +GRL_INLINE uint32_t InstanceDesc_get_instanceID(const InstanceDesc *d) +{ + return d->InstanceIDAndMask & (0x00FFFFFF); +} + +GRL_INLINE uint32_t InstanceDesc_get_InstanceMask(const InstanceDesc *d) +{ + return d->InstanceIDAndMask >> 24; +} + +GRL_INLINE uint32_t InstanceDesc_get_InstanceContributionToHitGroupIndex(const InstanceDesc *d) +{ + return d->InstanceContributionToHitGroupIndexAndFlags & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t InstanceDesc_get_InstanceFlags(const InstanceDesc *d) +{ + return d->InstanceContributionToHitGroupIndexAndFlags >> 24; +} + +GRL_INLINE gpuva_t InstanceDesc_get_AccelerationStructure(const InstanceDesc *d) +{ + return d->AccelerationStructureGPUVA; +} + +GRL_INLINE void InstanceDesc_set_transform(InstanceDesc *d, const uint32_t row, const uint32_t column, float value) +{ + d->Transform[row][column] = value; +} + +GRL_INLINE void InstanceDesc_set_instanceID(InstanceDesc *d, const uint32_t id) +{ + d->InstanceIDAndMask &= 255 << 24; + d->InstanceIDAndMask |= id & ((1 << 24) - 1); +} + +GRL_INLINE void InstanceDesc_set_InstanceMask(InstanceDesc *d, const uint32_t mask) +{ + d->InstanceIDAndMask &= ((1 << 24) - 1); + d->InstanceIDAndMask |= mask << 24; +} + +GRL_INLINE void InstanceDesc_set_InstanceContributionToHitGroupIndex(InstanceDesc *d, const uint32_t contribution) +{ + d->InstanceContributionToHitGroupIndexAndFlags &= 255 << 24; + d->InstanceContributionToHitGroupIndexAndFlags |= contribution & ((1 << 24) - 1); +} + +GRL_INLINE void InstanceDesc_set_InstanceFlags(InstanceDesc *d, const uint32_t flags) +{ + d->InstanceContributionToHitGroupIndexAndFlags &= ((1 << 24) - 1); + d->InstanceContributionToHitGroupIndexAndFlags |= flags << 24; +} + +GRL_INLINE void InstanceDesc_set_AccelerationStructure(InstanceDesc *d, gpuva_t address) +{ + d->AccelerationStructureGPUVA = address; +} diff --git a/src/intel/vulkan/grl/gpu/copy.grl b/src/intel/vulkan/grl/gpu/copy.grl new file mode 100644 index 00000000000..1bb500a4ea0 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/copy.grl @@ -0,0 +1,129 @@ +// +// Copyright (C) 2009-2021 
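TrianglePair_Constructor() above builds a quad by keeping tri0's three vertices, mapping each vertex of tri1 onto a matching slot of 'a', and, for the usual case of two triangles sharing an edge, placing tri1's one unshared vertex in a.w (slot 3). A worked example (illustrative only; the vertex indices are made up and TrianglePair_example is not part of this patch):

GRL_INLINE struct TrianglePair TrianglePair_example( void )
{
    // two triangles sharing the edge (11,12)
    const uint3 tri0 = (uint3)(10, 11, 12);
    const uint3 tri1 = (uint3)(12, 11, 20);

    struct TrianglePair p = TrianglePair_Constructor( tri0, 0, tri1, 1 );

    // expected result:
    //   p.a  == (10, 11, 12, 20)  -- the quad's four unique vertex indices
    //   p.lb == ( 2,  1,  3)      -- tri1 = (a[2], a[1], a[3]); slot 3 marks
    //                                the vertex used only by the second triangle
    return p;
}
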
Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module copy; // In copy we assume output data structure to be DXR compatible + +kernel clone_indirect < source="bvh_copy.cl", kernelFunction="clone_indirect" > +kernel compact < source="bvh_copy.cl", kernelFunction="compact" > +kernel serialize_indirect < source="bvh_copy.cl", kernelFunction="serialize_indirect" > +kernel serialize_for_input_dump_indirect < source="bvh_copy.cl", kernelFunction="serialize_for_input_dump_indirect" > +kernel deserialize_indirect < source="bvh_copy.cl", kernelFunction="deserialize_indirect" > +kernel dxr_decode < source="bvh_copy.cl", kernelFunction="dxr_decode" > + +metakernel clone_indirect( + qword dest, + qword src, + qword srcBVHsizedwordAddr) +{ +// this has to be compatible with in kernel GroupCountForCopy(...) + define byteSize REG0; + define numGroupsRqd REG1; + define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255; + define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8; + define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4; + byteSize = load_dword(srcBVHsizedwordAddr); + numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; + numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; + + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect clone_indirect args( + dest, + src); +} + +metakernel compact( + qword dest, + qword src) +{ + dispatch compact(32,1,1) args( + dest, + src, + 32); +} + +metakernel serialize_indirect( + qword dest, + qword src, + qword driverID, + qword srcBVHsizedwordAddr) +{ + define byteSize REG0; + define numGroupsRqd REG1; + define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255; + define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8; + define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4; + byteSize = load_dword(srcBVHsizedwordAddr); + numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; + numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect serialize_indirect args( + dest, + src, + driverID); +} + +metakernel serialize_for_input_dump_indirect( + qword batchPtrs, + qword dstOffset, + qword src, + qword driverID, + qword srcBVHsizedwordAddr) +{ + define byteSize REG0; + define numGroupsRqd REG1; + define BYTE_PER_GROUP_CHUNK_SHIFT REG2; BYTE_PER_GROUP_CHUNK_SHIFT = 8; + define REMINDER_NUM_GROUPS REG3; REMINDER_NUM_GROUPS = 4; + byteSize = load_dword(srcBVHsizedwordAddr); + numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; + numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect serialize_for_input_dump_indirect args( + batchPtrs, + dstOffset, + src, + driverID); +} + +metakernel deserialize_indirect( + qword dest, + qword src, + qword srcBVHsizedwordAddr) +{ + define byteSize REG0; + define numGroupsRqd REG1; + define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255; + define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8; + define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4; + byteSize = load_dword(srcBVHsizedwordAddr); + numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; + numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect deserialize_indirect args( + dest, + src); +} + +metakernel dxr_decode( 
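Each *_indirect metakernel above sizes its dispatch the same way: it loads the source BVH size in bytes, takes one work-group per 256-byte chunk (BYTE_PER_GROUP_CHUNK_SHIFT = 8), and adds REMINDER_NUM_GROUPS = 4 groups of slack for the tail. The same arithmetic written out as a plain helper (illustrative only, not part of this patch; the in-kernel GroupCountForCopy() referenced in the comment is the authoritative counterpart):

GRL_INLINE uint copy_group_count_sketch( uint byteSize )
{
    const uint BYTE_PER_GROUP_CHUNK_SHIFT = 8;  // 256 bytes handled per work-group
    const uint REMINDER_NUM_GROUPS        = 4;  // fixed slack for the remainder
    return ( byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT ) + REMINDER_NUM_GROUPS;
}
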
+ qword dest, + qword src) +{ + dispatch dxr_decode(1,1,1) args( + dest, + src); +} diff --git a/src/intel/vulkan/grl/gpu/d3d12.h b/src/intel/vulkan/grl/gpu/d3d12.h new file mode 100644 index 00000000000..32a7654eac5 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/d3d12.h @@ -0,0 +1,525 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once +#include "GRLStructs.h" +#include "shared.h" + +typedef global void *D3D12_GPU_VIRTUAL_ADDRESS; +typedef void *ID3D12StateObjectPrototype; + +enum DXGI_FORMAT +{ + DXGI_FORMAT_UNKNOWN, + DXGI_FORMAT_R32G32B32A32_TYPELESS, + DXGI_FORMAT_R32G32B32A32_FLOAT, + DXGI_FORMAT_R32G32B32A32_UINT, + DXGI_FORMAT_R32G32B32A32_SINT, + DXGI_FORMAT_R32G32B32_TYPELESS, + DXGI_FORMAT_R32G32B32_FLOAT, + DXGI_FORMAT_R32G32B32_UINT, + DXGI_FORMAT_R32G32B32_SINT, + DXGI_FORMAT_R16G16B16A16_TYPELESS, + DXGI_FORMAT_R16G16B16A16_FLOAT, + DXGI_FORMAT_R16G16B16A16_UNORM, + DXGI_FORMAT_R16G16B16A16_UINT, + DXGI_FORMAT_R16G16B16A16_SNORM, + DXGI_FORMAT_R16G16B16A16_SINT, + DXGI_FORMAT_R32G32_TYPELESS, + DXGI_FORMAT_R32G32_FLOAT, + DXGI_FORMAT_R32G32_UINT, + DXGI_FORMAT_R32G32_SINT, + DXGI_FORMAT_R32G8X24_TYPELESS, + DXGI_FORMAT_D32_FLOAT_S8X24_UINT, + DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS, + DXGI_FORMAT_X32_TYPELESS_G8X24_UINT, + DXGI_FORMAT_R10G10B10A2_TYPELESS, + DXGI_FORMAT_R10G10B10A2_UNORM, + DXGI_FORMAT_R10G10B10A2_UINT, + DXGI_FORMAT_R11G11B10_FLOAT, + DXGI_FORMAT_R8G8B8A8_TYPELESS, + DXGI_FORMAT_R8G8B8A8_UNORM, + DXGI_FORMAT_R8G8B8A8_UNORM_SRGB, + DXGI_FORMAT_R8G8B8A8_UINT, + DXGI_FORMAT_R8G8B8A8_SNORM, + DXGI_FORMAT_R8G8B8A8_SINT, + DXGI_FORMAT_R16G16_TYPELESS, + DXGI_FORMAT_R16G16_FLOAT, + DXGI_FORMAT_R16G16_UNORM, + DXGI_FORMAT_R16G16_UINT, + DXGI_FORMAT_R16G16_SNORM, + DXGI_FORMAT_R16G16_SINT, + DXGI_FORMAT_R32_TYPELESS, + DXGI_FORMAT_D32_FLOAT, + DXGI_FORMAT_R32_FLOAT, + DXGI_FORMAT_R32_UINT, + DXGI_FORMAT_R32_SINT, + DXGI_FORMAT_R24G8_TYPELESS, + DXGI_FORMAT_D24_UNORM_S8_UINT, + DXGI_FORMAT_R24_UNORM_X8_TYPELESS, + DXGI_FORMAT_X24_TYPELESS_G8_UINT, + DXGI_FORMAT_R8G8_TYPELESS, + DXGI_FORMAT_R8G8_UNORM, + DXGI_FORMAT_R8G8_UINT, + DXGI_FORMAT_R8G8_SNORM, + DXGI_FORMAT_R8G8_SINT, + DXGI_FORMAT_R16_TYPELESS, + DXGI_FORMAT_R16_FLOAT, + DXGI_FORMAT_D16_UNORM, + DXGI_FORMAT_R16_UNORM, + DXGI_FORMAT_R16_UINT, + DXGI_FORMAT_R16_SNORM, + DXGI_FORMAT_R16_SINT, + DXGI_FORMAT_R8_TYPELESS, + DXGI_FORMAT_R8_UNORM, + DXGI_FORMAT_R8_UINT, + DXGI_FORMAT_R8_SNORM, + DXGI_FORMAT_R8_SINT, + DXGI_FORMAT_A8_UNORM, + DXGI_FORMAT_R1_UNORM, + DXGI_FORMAT_R9G9B9E5_SHAREDEXP, + DXGI_FORMAT_R8G8_B8G8_UNORM, + DXGI_FORMAT_G8R8_G8B8_UNORM, + DXGI_FORMAT_BC1_TYPELESS, + DXGI_FORMAT_BC1_UNORM, + DXGI_FORMAT_BC1_UNORM_SRGB, + DXGI_FORMAT_BC2_TYPELESS, + DXGI_FORMAT_BC2_UNORM, + DXGI_FORMAT_BC2_UNORM_SRGB, + DXGI_FORMAT_BC3_TYPELESS, + DXGI_FORMAT_BC3_UNORM, + DXGI_FORMAT_BC3_UNORM_SRGB, + DXGI_FORMAT_BC4_TYPELESS, + DXGI_FORMAT_BC4_UNORM, + DXGI_FORMAT_BC4_SNORM, + DXGI_FORMAT_BC5_TYPELESS, + DXGI_FORMAT_BC5_UNORM, + DXGI_FORMAT_BC5_SNORM, + DXGI_FORMAT_B5G6R5_UNORM, + DXGI_FORMAT_B5G5R5A1_UNORM, + DXGI_FORMAT_B8G8R8A8_UNORM, + DXGI_FORMAT_B8G8R8X8_UNORM, + DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM, + DXGI_FORMAT_B8G8R8A8_TYPELESS, + DXGI_FORMAT_B8G8R8A8_UNORM_SRGB, + DXGI_FORMAT_B8G8R8X8_TYPELESS, + DXGI_FORMAT_B8G8R8X8_UNORM_SRGB, + DXGI_FORMAT_BC6H_TYPELESS, + DXGI_FORMAT_BC6H_UF16, + DXGI_FORMAT_BC6H_SF16, + DXGI_FORMAT_BC7_TYPELESS, + DXGI_FORMAT_BC7_UNORM, + DXGI_FORMAT_BC7_UNORM_SRGB, + DXGI_FORMAT_AYUV, + DXGI_FORMAT_Y410, + 
DXGI_FORMAT_Y416, + DXGI_FORMAT_NV12, + DXGI_FORMAT_P010, + DXGI_FORMAT_P016, + DXGI_FORMAT_420_OPAQUE, + DXGI_FORMAT_YUY2, + DXGI_FORMAT_Y210, + DXGI_FORMAT_Y216, + DXGI_FORMAT_NV11, + DXGI_FORMAT_AI44, + DXGI_FORMAT_IA44, + DXGI_FORMAT_P8, + DXGI_FORMAT_A8P8, + DXGI_FORMAT_B4G4R4A4_UNORM, + DXGI_FORMAT_P208, + DXGI_FORMAT_V208, + DXGI_FORMAT_V408, + DXGI_FORMAT_FORCE_UINT +}; + +typedef enum D3D12_RAYTRACING_GEOMETRY_FLAGS +{ + D3D12_RAYTRACING_GEOMETRY_FLAG_NONE = 0, + D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE = 0x1, + D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2 +} D3D12_RAYTRACING_GEOMETRY_FLAGS; + +typedef enum D3D12_RAYTRACING_GEOMETRY_TYPE +{ + D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES = 0, + D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS = (D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES + 1) +} D3D12_RAYTRACING_GEOMETRY_TYPE; + +typedef enum D3D12_RAYTRACING_INSTANCE_FLAGS +{ + D3D12_RAYTRACING_INSTANCE_FLAG_NONE = 0, + D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1, + D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2, + D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_OPAQUE = 0x4, + D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8 +} D3D12_RAYTRACING_INSTANCE_FLAGS; + +typedef struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE +{ + D3D12_GPU_VIRTUAL_ADDRESS StartAddress; + unsigned long StrideInBytes; +} D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE; + +typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE +{ + D3D12_GPU_VIRTUAL_ADDRESS StartAddress; + unsigned long SizeInBytes; +} D3D12_GPU_VIRTUAL_ADDRESSRANGE; + +typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE +{ + D3D12_GPU_VIRTUAL_ADDRESS StartAddress; + unsigned long SizeInBytes; + unsigned long StrideInBytes; +} D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE; + +typedef struct D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC +{ + D3D12_GPU_VIRTUAL_ADDRESS Transform; + enum DXGI_FORMAT IndexFormat; + enum DXGI_FORMAT VertexFormat; + unsigned int IndexCount; + unsigned int VertexCount; + D3D12_GPU_VIRTUAL_ADDRESS IndexBuffer; + struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE VertexBuffer; +} D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC; + +typedef struct D3D12_RAYTRACING_AABB +{ + float MinX; + float MinY; + float MinZ; + float MaxX; + float MaxY; + float MaxZ; +} D3D12_RAYTRACING_AABB; + +GRL_INLINE void D3D12_set_raytracing_aabb(D3D12_RAYTRACING_AABB* dest, struct AABB* source) +{ + dest->MinX = source->lower.x; + dest->MinY = source->lower.y; + dest->MinZ = source->lower.z; + dest->MaxX = source->upper.x; + dest->MaxY = source->upper.y; + dest->MaxZ = source->upper.z; +} + +typedef struct D3D12_RAYTRACING_GEOMETRY_AABBS_DESC +{ + unsigned long AABBCount; + D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE AABBs; +} D3D12_RAYTRACING_GEOMETRY_AABBS_DESC; + +typedef struct D3D12_RAYTRACING_GEOMETRY_DESC +{ + D3D12_RAYTRACING_GEOMETRY_TYPE Type; + D3D12_RAYTRACING_GEOMETRY_FLAGS Flags; + //unsigned int ShaderIndex : 24; // extension + //unsigned int Mask : 8; // extension + //unsigned int ShaderIndex_Mask; // extension + union { + D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC Triangles; + D3D12_RAYTRACING_GEOMETRY_AABBS_DESC AABBs; + }; +} D3D12_RAYTRACING_GEOMETRY_DESC; + +GRL_INLINE void D3D12_set_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_TYPE type) +{ + geomDesc->Type = type; +} + +GRL_INLINE D3D12_RAYTRACING_GEOMETRY_TYPE D3D12_get_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Type; +} + +GRL_INLINE void D3D12_set_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, 
D3D12_RAYTRACING_GEOMETRY_FLAGS flags) +{ + geomDesc->Flags = flags; +} + +GRL_INLINE D3D12_RAYTRACING_GEOMETRY_FLAGS D3D12_get_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Flags; +} + +GRL_INLINE void D3D12_set_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS transform) +{ + geomDesc->Triangles.Transform = transform; +} + +GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.Transform; +} + +GRL_INLINE void D3D12_set_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, IndexFormat format) +{ + switch (format) + { + case INDEX_FORMAT_NONE: + geomDesc->Triangles.IndexFormat = DXGI_FORMAT_UNKNOWN; + break; + case INDEX_FORMAT_R16_UINT: + geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R16_UINT; + break; + case INDEX_FORMAT_R32_UINT: + geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R32_UINT; + break; + } +} + +GRL_INLINE IndexFormat D3D12_get_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + switch (geomDesc->Triangles.IndexFormat) + { + case DXGI_FORMAT_R16_UINT: + return INDEX_FORMAT_R16_UINT; + case DXGI_FORMAT_R32_UINT: + return INDEX_FORMAT_R32_UINT; + case DXGI_FORMAT_UNKNOWN: + default: + return INDEX_FORMAT_NONE; + } +} + +GRL_INLINE void D3D12_set_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, VertexFormat format) +{ + switch (format) + { + case VERTEX_FORMAT_R32G32_FLOAT: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32_FLOAT; + break; + case VERTEX_FORMAT_R32G32B32_FLOAT: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32B32_FLOAT; + break; + case VERTEX_FORMAT_R16G16_FLOAT: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_FLOAT; + break; + case VERTEX_FORMAT_R16G16B16A16_FLOAT: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + case VERTEX_FORMAT_R16G16_SNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_SNORM; + break; + case VERTEX_FORMAT_R16G16B16A16_SNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_SNORM; + break; + case VERTEX_FORMAT_R16G16B16A16_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_UNORM; + break; + case VERTEX_FORMAT_R16G16_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_UNORM; + break; + case VERTEX_FORMAT_R10G10B10A2_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R10G10B10A2_UNORM; + break; + case VERTEX_FORMAT_R8G8B8A8_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_UNORM; + break; + case VERTEX_FORMAT_R8G8_UNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_UNORM; + break; + case VERTEX_FORMAT_R8G8B8A8_SNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_SNORM; + break; + case VERTEX_FORMAT_R8G8_SNORM: + geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_SNORM; + break; + } +} + +GRL_INLINE VertexFormat D3D12_get_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + switch(geomDesc->Triangles.VertexFormat) + { + case DXGI_FORMAT_R32G32_FLOAT: + return VERTEX_FORMAT_R32G32_FLOAT; + case DXGI_FORMAT_R32G32B32_FLOAT: + return VERTEX_FORMAT_R32G32B32_FLOAT; + case DXGI_FORMAT_R16G16_FLOAT: + return VERTEX_FORMAT_R16G16_FLOAT; + case DXGI_FORMAT_R16G16B16A16_FLOAT: + return VERTEX_FORMAT_R16G16B16A16_FLOAT; + case DXGI_FORMAT_R16G16_SNORM: + return VERTEX_FORMAT_R16G16_SNORM; + case DXGI_FORMAT_R16G16B16A16_SNORM: + return VERTEX_FORMAT_R16G16B16A16_SNORM; + case DXGI_FORMAT_R16G16B16A16_UNORM: + 
return VERTEX_FORMAT_R16G16B16A16_UNORM; + case DXGI_FORMAT_R16G16_UNORM: + return VERTEX_FORMAT_R16G16_UNORM; + case DXGI_FORMAT_R10G10B10A2_UNORM: + return VERTEX_FORMAT_R10G10B10A2_UNORM; + case DXGI_FORMAT_R8G8B8A8_UNORM: + return VERTEX_FORMAT_R8G8B8A8_UNORM; + case DXGI_FORMAT_R8G8_UNORM: + return VERTEX_FORMAT_R8G8_UNORM; + case DXGI_FORMAT_R8G8B8A8_SNORM: + return VERTEX_FORMAT_R8G8B8A8_SNORM; + case DXGI_FORMAT_R8G8_SNORM: + return VERTEX_FORMAT_R8G8_SNORM; + default: + return VERTEX_FORMAT_R32G32_FLOAT; + } +} + +GRL_INLINE void D3D12_set_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count) +{ + geomDesc->Triangles.IndexCount = count; +} + +GRL_INLINE unsigned int D3D12_get_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.IndexCount; +} + +GRL_INLINE void D3D12_set_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count) +{ + geomDesc->Triangles.VertexCount = count; +} + +GRL_INLINE unsigned int D3D12_get_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.VertexCount; +} + +GRL_INLINE void D3D12_set_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS buffer) +{ + geomDesc->Triangles.IndexBuffer = buffer; +} + +GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.IndexBuffer; +} + +GRL_INLINE void D3D12_set_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address) +{ + geomDesc->Triangles.VertexBuffer.StartAddress = address; +} + +GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.VertexBuffer.StartAddress; +} + +GRL_INLINE void D3D12_set_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride) +{ + geomDesc->Triangles.VertexBuffer.StrideInBytes = stride; +} + +GRL_INLINE unsigned long D3D12_get_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->Triangles.VertexBuffer.StrideInBytes; +} + +GRL_INLINE void D3D12_set_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long count) +{ + geomDesc->AABBs.AABBCount = count; +} + +GRL_INLINE unsigned long D3D12_get_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->AABBs.AABBCount; +} + +GRL_INLINE void D3D12_set_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address) +{ + geomDesc->AABBs.AABBs.StartAddress = address; +} + +GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->AABBs.AABBs.StartAddress; +} + +GRL_INLINE void D3D12_set_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride) +{ + geomDesc->AABBs.AABBs.StrideInBytes = stride; +} + +GRL_INLINE unsigned long D3D12_get_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) +{ + return geomDesc->AABBs.AABBs.StrideInBytes; +} + +typedef struct D3D12_RAYTRACING_INSTANCE_DESC +{ + float Transform[12]; + // unsigned int InstanceID : 24; + // unsigned int InstanceMask : 8; + uint32_t DW0; + // unsigned int InstanceContributionToHitGroupIndex : 24; + // unsigned int Flags : 8; + uint32_t DW1; + global char 
*AccelerationStructure; +} D3D12_RAYTRACING_INSTANCE_DESC; + +GRL_INLINE float D3D12_get_transform(const D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column) +{ + return d->Transform[row * 4 + column]; +} + +GRL_INLINE uint32_t D3D12_get_instanceID(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return d->DW0 & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t D3D12_get_InstanceMask(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return d->DW0 >> 24; +} + +GRL_INLINE uint32_t D3D12_get_InstanceContributionToHitGroupIndex(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return d->DW1 & ((1 << 24) - 1); +} + +GRL_INLINE uint32_t D3D12_get_InstanceFlags(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return d->DW1 >> 24; +} + +GRL_INLINE gpuva_t D3D12_get_AccelerationStructure(const D3D12_RAYTRACING_INSTANCE_DESC *d) +{ + return (gpuva_t)d->AccelerationStructure; +} + +GRL_INLINE void D3D12_set_transform(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column, float value) +{ + d->Transform[row * 4 + column] = value; +} + +GRL_INLINE void D3D12_set_instanceID(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t id) +{ + d->DW0 &= 255 << 24; + d->DW0 |= id & ((1 << 24) - 1); +} + +GRL_INLINE void D3D12_set_InstanceMask(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t mask) +{ + d->DW0 &= ((1 << 24) - 1); + d->DW0 |= mask << 24; +} + +GRL_INLINE void D3D12_set_InstanceContributionToHitGroupIndex(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t contribution) +{ + d->DW1 &= 255 << 24; + d->DW1 |= contribution & ((1 << 24) - 1); +} + +GRL_INLINE void D3D12_set_InstanceFlags(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t flags) +{ + d->DW1 &= ((1 << 24) - 1); + d->DW1 |= flags << 24; +} + +GRL_INLINE void D3D12_set_AccelerationStructure(D3D12_RAYTRACING_INSTANCE_DESC *d, gpuva_t address) +{ + d->AccelerationStructure = (global char*)address; +} diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl new file mode 100644 index 00000000000..d37adbbbb2b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl @@ -0,0 +1,59 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" + +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel CopyGeom( + global struct Geo *src, + global struct Geo *dst, + global float4 *vec, + global ushort *indices, + dword step) +{ + src = src + get_group_id(0); + dst = dst + get_group_id(0); + dst->Flags = src->Flags; + dst->Type = src->Type; + if (src->Type == GEOMETRY_TYPE_PROCEDURAL) + { + dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride; + dst->Desc.Procedural.AABBCount = src->Desc.Procedural.AABBCount; + dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride; + } + else + { + dst->Desc.Triangles.pTransformBuffer = src->Desc.Triangles.pTransformBuffer; + if (step == 0) + return; + dst->Desc.Triangles.IndexCount = src->Desc.Triangles.IndexCount; + if (step == 1) + return; + dst->Desc.Triangles.VertexCount = src->Desc.Triangles.VertexCount; + if (step == 2) + return; + dst->Desc.Triangles.IndexFormat = src->Desc.Triangles.IndexFormat; + if (step == 3) + return; + dst->Desc.Triangles.pIndexBuffer = src->Desc.Triangles.pIndexBuffer; + if (step == 4) + return; + dst->Desc.Triangles.pVertexBuffer = src->Desc.Triangles.pVertexBuffer; + if (step == 5) + return; + dst->Desc.Triangles.VertexBufferByteStride = 
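The accessors above pack two 24-bit/8-bit pairs: DW0 holds InstanceID in the low 24 bits and InstanceMask in the high 8 bits, and DW1 holds InstanceContributionToHitGroupIndex and Flags in the same layout. A small round-trip sketch (illustrative only, not part of this patch):

GRL_INLINE uint32_t D3D12_pack_id_and_mask_example( uint32_t id, uint32_t mask )
{
    D3D12_RAYTRACING_INSTANCE_DESC d;
    d.DW0 = 0;
    D3D12_set_instanceID( &d, id );      // writes the low 24 bits
    D3D12_set_InstanceMask( &d, mask );  // writes the high 8 bits
    // afterwards: D3D12_get_instanceID(&d)   == (id & 0xFFFFFF)
    //             D3D12_get_InstanceMask(&d) == (mask & 0xFF)
    return d.DW0;
}
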
src->Desc.Triangles.VertexBufferByteStride; + + dst->Desc.Triangles.VertexFormat = src->Desc.Triangles.VertexFormat; + + for (uint t = 0; t * 3 < dst->Desc.Triangles.IndexCount; t++) + { + uint3 tri = GRL_load_triangle(src, t); + vec[t * 3] = GRL_load_vertex(src, tri[0]); + vec[t * 3 + 1] = GRL_load_vertex(src, tri[1]); + vec[t * 3 + 2] = GRL_load_vertex(src, tri[2]); + } + } +} diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl new file mode 100644 index 00000000000..3779439c54b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl @@ -0,0 +1,27 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module api_interface_verify; + +kernel copy_geom < source="grl_api_interface_verify.cl", kernelFunction="CopyGeom" > + +metakernel ifc0_copy( + qword src, + qword dst, + qword vec, + qword srcIndices, + dword numGroups, + dword step) +{ + dispatch copy_geom(numGroups,1,1) args( + src, + dst, + vec, + srcIndices, + step + ); +} diff --git a/src/intel/vulkan/grl/gpu/input_dump.cl b/src/intel/vulkan/grl/gpu/input_dump.cl new file mode 100644 index 00000000000..f668f053f1f --- /dev/null +++ b/src/intel/vulkan/grl/gpu/input_dump.cl @@ -0,0 +1,723 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" +#include "d3d12.h" +#include "mem_utils.h" +#include "misc_shared.h" + +/// Align value to 128 +/// +/// @param value vale to align +/// @return aligned value +GRL_INLINE ulong AlignTo128(ulong value) { return ((value + 127) / 128) * 128; } + +GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs) { + return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch))); +} + +/// Finds max used byte in vertex buffer +/// +/// @param indexBuffPtr pointer to index buffer +/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers +/// @param IndexCount number of indices in index buffer +/// @param IndexFormat index format +/// @param VertexCount number of vertices in vertex buffer +/// @param VertexBufferByteStride vertex buffer byte stride +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel find_max_used_byte_in_buff( + global void* indexBuffPtr, + global uint* vertexBufferUsedByteEnd, + dword IndexCount, + dword IndexFormat, + dword VertexCount, + qword VertexBufferByteStride) +{ + local uint sgMax[16]; + uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0); + + if (IndexFormat != INDEX_FORMAT_NONE) + { + uint endByte = 0; + if (glob_id < IndexCount) + { + if (IndexFormat == INDEX_FORMAT_R16_UINT) + { + global ushort* indexBuffPtrShort = (global ushort*) indexBuffPtr; + endByte = indexBuffPtrShort[glob_id]; + } + else + { + global uint* indexBuffPtrUint = (global uint*) indexBuffPtr; + endByte = indexBuffPtrUint[glob_id]; + } + } + + endByte = sub_group_reduce_max(endByte); + + if (get_sub_group_local_id() == 0) { sgMax[get_sub_group_id()] = endByte; } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_sub_group_id() == 0) + { + endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]); + if (get_sub_group_local_id() == 0) + { + endByte = min(endByte, VertexCount); + if (endByte < VertexCount && IndexCount != 0) + ++endByte; + endByte *= (dword)VertexBufferByteStride; + atomic_max(vertexBufferUsedByteEnd, endByte); + } + } + } + else if 
(glob_id == 0) + { + uint endByte = VertexCount * VertexBufferByteStride; + atomic_max(vertexBufferUsedByteEnd, endByte); + } +} + +/// Allocates buffer for vertices +/// +/// @param batchPtrs batch pointers struct +/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers +/// @param vertexBufferOffset pointer to offsets to vertex buffers +/// @param numVertexBuffers number of vertex buffers +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel allocate_linear_offsets_for_vertex_buffers( + global InputBatchPtrs* batchPtrs, + global uint* vertexBufferUsedByteEnd, + global uint* vertexBufferOffset, + dword numVertexBuffers) +{ + uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id(); + + if (glob_id < numVertexBuffers) + { + uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]); + uint position = atomic_add_global( &batchPtrs->vertexBuffersSize, numBytes); + vertexBufferOffset[glob_id] = position; + } +} + +/// Sets the dst data space for input dump of this batch +/// +/// @param inputDumpMainBuffer pointer to main dump buffer +/// @param batchPtrs batch pointers struct +/// @param nonVertexSize size of non vertex data +/// @param batchIdPtr pointer to batch id +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel allocate_data_space_for_inputs( + global DebugBufferHeader* inputDumpMainBuffer, + global InputBatchPtrs* batchPtrs, + uint nonVertexSize, + global qword* batchIdPtr) +{ + if (get_sub_group_local_id() == 0) + { + uint vertexBufferSize = batchPtrs->vertexBuffersSize; + uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize; + + if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2)) + { + inputDumpMainBuffer->overflow = 1; + batchPtrs->dumpDst = 0; + batchPtrs->globalDumpBuffer = 0; + batchPtrs->nonVertexDataStart = 0; + batchPtrs->totalSize = 0; + return; + } + + dword prevHead = inputDumpMainBuffer->gpuHead; + dword newHead; + bool circled; + + do + { + circled = false; + newHead = prevHead + sizeOfThisBatch; + dword bufferBegin = prevHead; + if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize) + { + circled = true; + newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch; + bufferBegin = inputDumpMainBuffer->headStart; + } + dword bufferEnd = newHead + sizeof(InputBatch); + + uint tail; + uint tail2 = 7; + bool wait; + do + { + wait = true; + tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0); + + // dead code, workaround so IGC won't move tail load out of loop + if (tail > inputDumpMainBuffer->totalSize) + { + store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2); + tail2 = tail; + } + + if( prevHead >= tail ) + { + //colision example: + // ----------T=======H------------ + // -------B=====E----------------- + // + if((bufferEnd < tail) || (bufferBegin >= prevHead)) + { + wait = false; + } + } + else + { + //colision example: + // ==========H-------T============ + // B==============E--------------- + // caution: we will never have H circled completely so that H == T + if((bufferEnd < tail) && (bufferBegin >= prevHead)) + { + wait = false; + } + } + } while (wait); + } while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead)); + + if (circled) + { + global InputBatch* endBufferOp = (global 
InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead); + endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER; + prevHead = inputDumpMainBuffer->headStart; + } + + global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead; + batchPtrs->dumpDst = (qword)thisBatchDump; + batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer; + batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize); + batchPtrs->totalSize = sizeOfThisBatch; + + global InputBatch* batchOp = (global InputBatch*) thisBatchDump; + batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH; + batchOp->header.opHeader.endOfData = sizeOfThisBatch; + batchOp->vertexBufferDataSize = vertexBufferSize; + batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize; + batchOp->batchId = *batchIdPtr; + } +} + +/// Sets the dst data space for output dump of this batch +/// +/// @param outputDumpMainBuffer pointer to main dump buffer +/// @param batchPtrs batch pointers struct +/// @param batchIdPtr pointer to batch id +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel allocate_data_space_for_outputs( + global DebugBufferHeader* outputDumpMainBuffer, + global OutputBatchPtrs* batchPtrs, + global qword* batchIdPtr) +{ + if (get_sub_group_local_id() == 0) + { + uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize; + + if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2)) + { + outputDumpMainBuffer->overflow = 1; + batchPtrs->dumpDst = 0; + batchPtrs->dataStart = 0; + batchPtrs->totalSize = 0; + return; + } + + dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead)); + dword newHead; + bool circled; + + do + { + //mem_fence_gpu_invalidate(); + //prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead)); + circled = false; + newHead = prevHead + sizeOfThisBatch; + dword bufferBegin = prevHead; + if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize) + { + circled = true; + newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch; + bufferBegin = outputDumpMainBuffer->headStart; + } + dword bufferEnd = newHead + sizeof(OutputBatch); + + uint tail; + uint tail2 = 7; + bool wait; + do + { + wait = true; + tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0); + + // dead code, workaround so IGC won't move tail load out of loop + if (tail > outputDumpMainBuffer->totalSize) + { + store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2); + tail2 = tail; + } + + if( prevHead >= tail ) + { + //colision example: + // ----------T=======H------------ + // -------B=====E----------------- + // + if((bufferEnd < tail) || (bufferBegin >= prevHead)) + { + wait = false; + } + } + else + { + //colision example: + // ==========H-------T============ + // B==============E--------------- + // caution: we will never have H circled completely so that H == T + if((bufferEnd < tail) && (bufferBegin >= prevHead)) + { + wait = false; + } + } + } while (wait); + } while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead)); + + if (circled) + { + global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead); + endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER; + prevHead = outputDumpMainBuffer->headStart; + } + + 
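allocate_data_space_for_inputs() above (and the near-identical allocate_data_space_for_outputs()) reserves a region [bufferBegin, bufferEnd) of the circular dump buffer by advancing gpuHead with a compare-exchange, but only after spinning until that region can no longer overlap unread data between the consumer's tail and the current head -- the two collision diagrams in the comments. The predicate being waited on boils down to the sketch below (illustrative only, not part of this patch; prevHead is the head value the compare-exchange will try to advance):

GRL_INLINE bool dump_region_is_free( uint tail, uint prevHead,
                                     uint bufferBegin, uint bufferEnd )
{
    if ( prevHead >= tail )
    {
        // unread data occupies [tail, prevHead): the new region must end
        // before the tail or begin at/after the old head
        return ( bufferEnd < tail ) || ( bufferBegin >= prevHead );
    }
    else
    {
        // unread data wraps past the end of the buffer; only [prevHead, tail)
        // is free, so both checks must hold
        return ( bufferEnd < tail ) && ( bufferBegin >= prevHead );
    }
}
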
global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead; + batchPtrs->dumpDst = (qword)thisBatchDump; + batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch))); + batchPtrs->totalSize = sizeOfThisBatch; + + global OutputBatch* batchOp = (global OutputBatch*) thisBatchDump; + batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH; + batchOp->header.opHeader.endOfData = sizeOfThisBatch; + batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch)); + batchOp->batchId = *batchIdPtr; + } +} + +/// Calculates sum of output sizes +/// +/// @param pbi pointer to post build infos +/// @param destOffset offset in dest buffer +/// @param numOutputs number of outputs +/// @param batchPtrs batch pointers struct +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel calc_outputs_data_size( + global PostbuildInfoSerializationDesc* pbi, + global dword* destOffsets, + qword numOutputs, + global OutputBatchPtrs* batchPtrs) +{ + uint offset = 0; + for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH) + { + uint size = 0; + if (i < numOutputs) + { + size = AlignTo128(pbi[i].SerializedSizeInBytes); + size += AlignTo128(sizeof(OutputData)); + destOffsets[i] = offset + sub_group_scan_exclusive_add(size); + } + offset += sub_group_reduce_add(size); + } + if (get_sub_group_local_id() == 0) + batchPtrs->dataSize = offset; +} + +/// Adds output data operation to batch +/// +/// @param batchPtrs batch pointers struct +/// @param destOffset offset in dest buffer +/// @param src pointer to source bvh +/// @param pbi pointer to post build info +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel write_output_data_op( + global OutputBatchPtrs* batchPtrs, + global dword* destOffset, + qword src, + global PostbuildInfoSerializationDesc* pbi) +{ + if (batchPtrs->dataStart == 0) + return; + + global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset); + out->header.operationType = OUTPUT_DUMP_OP_DATA; + out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes); + out->srcBvhPtr = src; +} + +/// Writes indices and transform or procedurals data +/// +/// @param batchPtrs batch pointers struct +/// @param srcDesc description of source geometry +/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer +/// @param dstDescOffset offset to dest geo desc +/// @param dstDataOffset offset to dest geo data +/// @param numThreads number of threads +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel write_geo_data( + global InputBatchPtrs* batchPtrs, + global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc, + global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers, + global uint* pVertexBufferSize, + qword dstDescOffset, + qword dstDataOffset, + dword numThreads) +{ + if (batchPtrs->dumpDst == 0) return; + + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + + GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc; + + global char* dstDataPtr = (global char*)( + batchPtrs->nonVertexDataStart + dstDataOffset); + + global char* srcDataPtr; + global char* dstTransform; + uint bytesToCopy = 0; + + if (geoDescToStore.Type == 
GEOMETRY_TYPE_TRIANGLES) + { + uint sizeOfMatrix = 0; + + if (geoDescToStore.Desc.Triangles.pTransformBuffer) + { + sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float)); + if (glob_id < 12) + { + global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer; + global float* matrixDst = (global float*)dstDataPtr; + matrixDst[glob_id] = matrixSrc[glob_id]; + if (glob_id == 0) + { + geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer; + } + } + } + + dstDataPtr += sizeOfMatrix; + srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer; + + bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount); + + if (bytesToCopy && (glob_id == 0)) + { + qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers); + // for this we remember offset relative to global debug buffer + geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer; + geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer; + geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride; + } + else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE && geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0) + { + if (geoDescToStore.Desc.Triangles.pVertexBuffer) + { + qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers); + // for this we remember offset relative to global debug buffer + geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer; + } + } + else if (glob_id == 0) + { + geoDescToStore.Desc.Triangles.IndexCount = 0; + geoDescToStore.Desc.Triangles.VertexCount = 0; + geoDescToStore.Desc.Triangles.pVertexBuffer = 0; + geoDescToStore.Desc.Triangles.pIndexBuffer = 0; + } + } + else + { + srcDataPtr = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA; + bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount); + if (glob_id == 0) + { + geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer; + } + } + + if (bytesToCopy) + { + CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads); + } + + if (glob_id == 0) + { + global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr = (global GRL_RAYTRACING_GEOMETRY_DESC*)( + batchPtrs->nonVertexDataStart + dstDescOffset); + *dstDescPtr = geoDescToStore; + } +} + +/// Adds build operation to batch +/// +/// @param batchPtrs batch pointers struct +/// @param buildOpOffset offset in dst buffer +/// @param srcBvh address of src bvh (in case of update) +/// @param dstBvhAddr address of dest bvh buffer +/// @param offsetToEnd offset to end of this operation +/// @param flags build flags +/// @param numGeometries number of geometries in build +/// @param numInstances number of instances in build +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel write_input_build_op( + global InputBatchPtrs* batchPtrs, + qword buildOpOffset, + qword srcBvh, + qword dstBvhAddr, + dword offsetToEnd, + dword flags, + dword numGeometries, + dword numInstances, + dword instArrayOfPtrs) +{ + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + if (batchPtrs->dumpDst == 0 || 
glob_id != 0) return; + + global InputBuild* buildOp = (global InputBuild*)( + batchPtrs->nonVertexDataStart + buildOpOffset); + buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD; + buildOp->header.endOfData = offsetToEnd; + buildOp->dstBvhPtr = dstBvhAddr; + buildOp->srcBvhPtr = srcBvh; + buildOp->flags = flags; + buildOp->numGeos = numGeometries; + buildOp->numInstances = numInstances; + buildOp->instArrayOfPtrs = instArrayOfPtrs; +} + +/// Copies instance description +/// +/// @param batchPtrs batch pointers struct +/// @param instanceDescArr inst desc source +/// @param offset ptr to offset in dst buffer +/// @param numInstances number of instances to copy +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +copy_instance_descriptors_array( + global InputBatchPtrs* batchPtrs, + global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr, + qword offset, + dword numInstances) +{ + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + if (batchPtrs->dumpDst == 0) return; + + global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC* )( + batchPtrs->nonVertexDataStart + offset); + + if (glob_id < numInstances) + { + dst[glob_id] = instanceDescArr[glob_id]; + } +} + +/// Copies instance description, array of pointers version +/// +/// @param batchPtrs batch pointers struct +/// @param pInstanceDescPtrsArr inst desc source +/// @param offset ptr to offset in dst buffer +/// @param numInstances number of instances to copy +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +copy_instance_descriptors_array_of_ptrs( + global InputBatchPtrs* batchPtrs, + global qword* pInstanceDescPtrsArr, + qword offset, + dword numInstances) +{ + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + if (batchPtrs->dumpDst == 0) return; + + // save gpuva of instance descs for debug + global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset); + + global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)( + batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset); + global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr = (global GRL_RAYTRACING_INSTANCE_DESC **)pInstanceDescPtrsArr; + + if (glob_id < numInstances) + { + gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id]; + dst[glob_id] = *(instanceDescPtrsArr[glob_id]); + } +} + +/// Adds copy operation to batch +/// +/// @param batchPtrs batch pointers struct +/// @param offset ptr to offset in dst buffer +/// @param src copy source pointer +/// @param dst copy destination pointer +/// @param copyOpType copy type +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel insert_copy_op( + global InputBatchPtrs* batchPtrs, + qword offset, + global void* src, + global void* dst, + uint copyOpType) +{ + uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); + if (batchPtrs->dumpDst == 0 || glob_id != 0) return; + + global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset); + + copyOp->header.operationType = copyOpType; + copyOp->header.endOfData = AlignTo128(sizeof(InputCopy)); + copyOp->srcBvhPtr = (qword)src; + copyOp->dstBvhPtr = (qword)dst; +} + +/// Copies vertex 
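copy_instance_descriptors_array_of_ptrs() above dumps the array-of-pointers input in two parts: a 128-byte-aligned table of the original instance-descriptor GPU addresses, immediately followed by the dereferenced descriptors themselves. A small layout sketch (illustrative only, not part of this patch; the out_* pointers are hypothetical):

GRL_INLINE void instance_dump_layout_example( qword nonVertexDataStart,
                                              qword offset,
                                              dword numInstances,
                                              qword* out_gpuvaTable,
                                              qword* out_inlinedDescs )
{
    // where the saved GPU addresses of the instance descriptors begin
    *out_gpuvaTable = nonVertexDataStart + offset;
    // the copied GRL_RAYTRACING_INSTANCE_DESC structs follow the aligned table
    *out_inlinedDescs = nonVertexDataStart + offset
                      + AlignTo128( numInstances * sizeof(qword) );
}
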
buffer +/// +/// @param batchPtrs batch pointers struct +/// @param src input buffer +/// @param offset ptr to offset in dst buffer +/// @param size ptr to number of bytes to copy +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_vertex_data( + global InputBatchPtrs* batchPtrs, + global const char* src, + global const uint* offset, + global const uint* size) +{ + if (batchPtrs->dumpDst == 0) return; + + global char *dst = (global char *)(GetVertexBuffersStart(batchPtrs) + *offset); + uint numGroups = (*size >> 6) + 1; + CopyMemory(dst, src, *size, numGroups); +} + +/// Generate unique batch id +/// +/// @param batchIds array of unique batch ids +/// @param index index of batch id to generate +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel generate_unique_batch_id(global unsigned long *batchIds, unsigned int index) { + global unsigned int *counterPtrs = (global unsigned int *)batchIds; + atomic_add(&counterPtrs[index * 2 + 1], 1); + batchIds[index] |= (unsigned long)index; +} + +/// Sets batch as ready to read and moves cpuHead forward, inputs case +/// +/// @param batchPtrs batch pointers struct +/// @param dumpMainBuffer pointer to main dump buffer +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel finish_batch_dump_inputs( + global InputBatchPtrs* batchPtrs, + global DebugBufferHeader* dumpMainBuffer) +{ + if (batchPtrs->dumpDst == 0) + return; + + global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst; + + dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer); + + dword seven = 7; + while (true) + { + dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0); + if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop + { + store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven); + currentHead = seven; + } + + if (currentHead == myDstOffset) + { + mem_fence_evict_to_memory(); + dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData; + break; + } + else if (myDstOffset == dumpMainBuffer->headStart) + { + global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead); + if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER) + { + mem_fence_evict_to_memory(); + dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData; + break; + } + } + } +} + +/// Sets batch as ready to read and moves cpuHead forward, outputs case +/// +/// @param batchPtrs batch pointers struct +/// @param dumpMainBuffer pointer to main dump buffer +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel finish_batch_dump_outputs( + global OutputBatchPtrs* batchPtrs, + global DebugBufferHeader* dumpMainBuffer) +{ + if (batchPtrs->dumpDst == 0) + return; + + global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst; + + dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer); + + dword seven = 7; + while (true) + { + dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0); + if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop + { + store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven); + currentHead = seven; + } + + if (currentHead == myDstOffset) + { + mem_fence_evict_to_memory(); + dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData; + 
break; + } + else if (myDstOffset == dumpMainBuffer->headStart) + { + global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead); + if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER) + { + mem_fence_evict_to_memory(); + dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData; + break; + } + } + } +} diff --git a/src/intel/vulkan/grl/gpu/input_dump.grl b/src/intel/vulkan/grl/gpu/input_dump.grl new file mode 100644 index 00000000000..7cc6e60a95d --- /dev/null +++ b/src/intel/vulkan/grl/gpu/input_dump.grl @@ -0,0 +1,252 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module input_dump; + +kernel_module input_dumper("input_dump.cl") +{ + links lsc_intrinsics; + + kernel opencl_kernel_find_max_used_byte_in_buff < kernelFunction="find_max_used_byte_in_buff" >; + kernel opencl_kernel_allocate_linear_offsets_for_vertex_buffers < kernelFunction="allocate_linear_offsets_for_vertex_buffers" >; + kernel opencl_kernel_allocate_data_space_for_inputs < kernelFunction="allocate_data_space_for_inputs" >; + kernel opencl_kernel_allocate_data_space_for_outputs < kernelFunction="allocate_data_space_for_outputs" >; + kernel opencl_kernel_calc_outputs_data_size < kernelFunction="calc_outputs_data_size" >; + kernel opencl_kernel_write_output_data_op < kernelFunction="write_output_data_op" >; + kernel opencl_kernel_write_geo_data < kernelFunction="write_geo_data" >; + kernel opencl_kernel_write_input_build_op < kernelFunction="write_input_build_op" >; + kernel opencl_kernel_copy_instance_descriptors_array < kernelFunction="copy_instance_descriptors_array" >; + kernel opencl_kernel_copy_instance_descriptors_array_of_ptrs < kernelFunction="copy_instance_descriptors_array_of_ptrs" >; + kernel opencl_kernel_insert_copy_op < kernelFunction="insert_copy_op" >; + kernel opencl_kernel_copy_vertex_data < kernelFunction="copy_vertex_data" >; + kernel opencl_kernel_generate_unique_batch_id < kernelFunction="generate_unique_batch_id" >; + kernel opencl_kernel_finish_batch_dump_inputs < kernelFunction="finish_batch_dump_inputs" >; + kernel opencl_kernel_finish_batch_dump_outputs < kernelFunction="finish_batch_dump_outputs" >; +} + + +metakernel find_max_used_byte_in_buff( + qword indexBuffPtr, + qword vertexBufferUsedByteEnd, + dword IndexCount, + dword IndexFormat, + dword VertexCount, + qword VertexBufferByteStride, + dword numPhysThreads) +{ + dispatch opencl_kernel_find_max_used_byte_in_buff(numPhysThreads, 1, 1) args( + indexBuffPtr, + vertexBufferUsedByteEnd, + IndexCount, + IndexFormat, + VertexCount, + VertexBufferByteStride); +} + +metakernel allocate_linear_offsets_for_vertex_buffers( + qword batchPtrs, + qword m_VertexBufferUsedByteEnd, + qword m_VertexBufferOffset, + dword numVertexBuffers, + dword numPhysThreads) +{ + dispatch opencl_kernel_allocate_linear_offsets_for_vertex_buffers(numPhysThreads, 1, 1) args( + batchPtrs, + m_VertexBufferUsedByteEnd, + m_VertexBufferOffset, + numVertexBuffers); +} + +metakernel allocate_data_space_for_inputs( + qword inputDumpMainBuffer, + qword batchPtrs, + dword nonVertexSize, + qword batchIdPtr) +{ + dispatch opencl_kernel_allocate_data_space_for_inputs(1, 1, 1) args( + inputDumpMainBuffer, + batchPtrs, + nonVertexSize, + batchIdPtr); +} + +metakernel allocate_data_space_for_outputs( + qword inputDumpMainBuffer, + qword batchPtrs, + qword batchIdPtr) +{ + dispatch opencl_kernel_allocate_data_space_for_outputs(1, 1, 1) args( 
+ inputDumpMainBuffer, + batchPtrs, + batchIdPtr); +} + +metakernel calc_outputs_data_size( + qword pbi, + qword destOffsets, + qword numOutputs, + qword batchPtrs) +{ + dispatch opencl_kernel_calc_outputs_data_size(1, 1, 1) args( + pbi, + destOffsets, + numOutputs, + batchPtrs); +} + +metakernel write_output_data_op( + qword batchPtrs, + qword destOffset, + qword src, + qword pbi) +{ + dispatch opencl_kernel_write_output_data_op(1, 1, 1) args( + batchPtrs, + destOffset, + src, + pbi); +} + +metakernel write_geo_data( + qword batchPtrs, + qword srcDesc, + qword pVertexBufferOffsetInLinearisedUniqueVertexBuffers, + qword pVertexBufferSize, + qword dstDescOffset, + qword dstDataOffset, + dword numThreads) +{ + dispatch opencl_kernel_write_geo_data(numThreads, 1, 1) args( + batchPtrs, + srcDesc, + pVertexBufferOffsetInLinearisedUniqueVertexBuffers, + pVertexBufferSize, + dstDescOffset, + dstDataOffset, + numThreads); +} + +metakernel write_input_build_op( + qword batchPtrs, + qword buildOpOffset, + qword srcBvh, + qword dstBvhAddr, + dword offsetToEnd, + dword flags, + dword numGeometries, + dword numInstances, + dword instArrayOfPtrs) + +{ + dispatch opencl_kernel_write_input_build_op(1, 1, 1) args( + batchPtrs, + buildOpOffset, + srcBvh, + dstBvhAddr, + offsetToEnd, + flags, + numGeometries, + numInstances, + instArrayOfPtrs); +} + +metakernel copy_instance_descriptors_array( + qword batchPtrs, + qword instanceDescArr, + qword offset, + dword numInstances, + dword numPhysThreads) +{ + dispatch opencl_kernel_copy_instance_descriptors_array(numPhysThreads, 1, 1) args( + batchPtrs, + instanceDescArr, + offset, + numInstances); +} + +metakernel copy_instance_descriptors_array_of_ptrs( + qword batchPtrs, + qword instanceDescArrPtrs, + qword offset, + dword numInstances, + dword numPhysThreads) +{ + dispatch opencl_kernel_copy_instance_descriptors_array_of_ptrs(numPhysThreads, 1, 1) args( + batchPtrs, + instanceDescArrPtrs, + offset, + numInstances); +} + +metakernel insert_copy_op( + qword batchPtrs, + qword offset, + qword src, + qword dst, + dword type) +{ + dispatch opencl_kernel_insert_copy_op(1, 1, 1) args( + batchPtrs, + offset, + src, + dst, + type); +} + +metakernel copy_vertex_data( + qword desc, + qword src, + qword offset, + qword size) +{ + define byteSize REG0; + define numGroupsRqd REG1; + define shift REG2; + define minimum REG3; + + shift = 6; + minimum = 1; + byteSize = load_dword(size); + numGroupsRqd = byteSize >> shift; + numGroupsRqd = numGroupsRqd + minimum; + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_vertex_data args( + desc, + src, + offset, + size); +} + +metakernel generate_unique_batch_id( + qword batchIds, + dword batchIndex) +{ + dispatch opencl_kernel_generate_unique_batch_id(1, 1, 1) args( + batchIds, + batchIndex); +} + +metakernel finish_batch_dump_inputs( + qword batchPtrs, + qword dumpMainBuffer) +{ + dispatch opencl_kernel_finish_batch_dump_inputs(1, 1, 1) args( + batchPtrs, + dumpMainBuffer); +} + +metakernel finish_batch_dump_outputs( + qword batchPtrs, + qword dumpMainBuffer) +{ + dispatch opencl_kernel_finish_batch_dump_outputs(1, 1, 1) args( + batchPtrs, + dumpMainBuffer); +} diff --git a/src/intel/vulkan/grl/gpu/instance.h b/src/intel/vulkan/grl/gpu/instance.h new file mode 100644 index 00000000000..e463a01dc90 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/instance.h @@ -0,0 +1,183 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + 
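+// Helpers for encoding a hardware instance leaf (HwInstanceLeaf) from a
+// GRL_RAYTRACING_INSTANCE_DESC. Part0 packs the fields consumed during traversal
+// (hit-group/shader index, geometry mask, instance flags, root node pointer,
+// world-to-object rotation), while part1 packs the BLAS pointer, the instance
+// ID/index and the object-to-world rotation.
+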
+#pragma once + +#include "shared.h" +#include "affinespace.h" +#include "api_interface.h" +#include "qbvh6.h" +#include "libs/lsc_intrinsics.h" + +GRL_INLINE uint32_t HwInstanceLeafPart1_getInstanceIndex(struct HwInstanceLeaf *I) +{ + return I->part1.instanceIndex; +} + +GRL_INLINE void encodeDW0_HwInstanceLeafPart0( + uint32_t shaderIndex, + uint32_t geomMask, + uint4 *dst) +{ + (*dst).x = (shaderIndex & ((1 << 24) - 1)) | + (geomMask << 24); +} + +GRL_INLINE void encodeDW1_HwInstanceLeafPart0( + uint32_t instanceContributionToHitGroupIndex, + uint32_t notProcedural, + uint32_t geomFlags, + uint4* dst) +{ + (*dst).y = (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) | + ((notProcedural & 1) << (24 + 5)) | + ((geomFlags & 3) << (24 + 5 + 1)); +} + +GRL_INLINE void encodeDW2DW3_HwInstanceLeafPart0( + uint64_t rootNodePtr, + uint32_t instFlags, + uint4* dst) +{ + uint64_t flags = instFlags; + uint DW2 = (uint)rootNodePtr; + uint DW3 = ((uint)(rootNodePtr >> 32ul) & 0xffff); + DW3 |= flags << 16ull; + (*dst).z = DW2; + (*dst).w = DW3; +} + +GRL_INLINE void HwInstanceLeafPart0_setDW0(struct HwInstanceLeaf *I, + uint32_t shaderIndex, + uint32_t geomMask) +{ + I->part0.DW0 = + (shaderIndex & ((1 << 24) - 1)) | + (geomMask << 24); +} + +GRL_INLINE void HwInstanceLeafPart0_setDW1(struct HwInstanceLeaf *I, + uint32_t instanceContributionToHitGroupIndex, + uint32_t notProcedural, + uint32_t geomFlags) +{ + I->part0.DW1 = + (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) | + ((notProcedural & 1) << (24 + 5)) | + ((geomFlags & 3) << (24 + 5 + 1)); +} + +GRL_INLINE void HwInstanceLeafPart1_setDW0DW1(struct HwInstanceLeaf *I, + global char *pBvhPtr) +{ + I->part1.DW0_DW1 = ((uint64_t)pBvhPtr) & (((uint64_t)1 << 48) - 1); +} + +GRL_INLINE void HwInstanceLeafPart0_setDW2DW3(struct HwInstanceLeaf *I, + uint64_t rootNodePtr, + uint32_t instFlags) +{ + uint64_t flags = instFlags; + flags = flags << 48ull; + uint64_t ptr = rootNodePtr & 0x0000ffffffffffff; + I->part0.DW2_DW3 = ptr + flags; +} + +GRL_INLINE void HwInstanceLeaf_Constructor(global struct HwInstanceLeaf* leaf, + global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc, + uint instanceIndex, + uint rootNodeByteOffset, + uint instanceMask) +{ + global uint4* InstanceLeaf_4DWparts = (global uint4*) (leaf); + + struct AffineSpace3f obj2world = AffineSpace3f_load_row_major(instDesc->Transform); + + qword accStructPtr = (qword)instDesc->AccelerationStructure; + uint4 p1_DW0_3 = (uint4)( + (uint)accStructPtr, + (uint)(accStructPtr >> (uint64_t)32), + GRL_get_instanceID(instDesc), + instanceIndex); + + struct AffineSpace3f world2obj = AffineSpace3f_invert(obj2world); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 0 /*part1 + 0DW*/, p1_DW0_3); + + uint4 p1_DW4_7 = (uint4)( + as_uint(obj2world.l.vx.x), + as_uint(obj2world.l.vx.y), + as_uint(obj2world.l.vx.z), + as_uint(obj2world.l.vy.x)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 1 /*part1 + 4DW*/, p1_DW4_7); + + uint4 p1_DW8_11 = (uint4)( + as_uint(obj2world.l.vy.y), + as_uint(obj2world.l.vy.z), + as_uint(obj2world.l.vz.x), + as_uint(obj2world.l.vz.y)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 2 /*part1 + 8DW*/, p1_DW8_11); + + + uint4 p1_DW12_15 = (uint4)( + as_uint(obj2world.l.vz.z), + as_uint(world2obj.p.x), + as_uint(world2obj.p.y), + as_uint(world2obj.p.z)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 3 /*part1 + 12DW*/, p1_DW12_15); + + + uint hit_group_index = GRL_get_InstanceContributionToHitGroupIndex(instDesc); + global struct BVHBase* bvh = 
(global struct BVHBase*)instDesc->AccelerationStructure; + + uint4 p0_DW0_3; + + encodeDW0_HwInstanceLeafPart0( + hit_group_index, + instanceMask, + &p0_DW0_3); + + encodeDW1_HwInstanceLeafPart0( + hit_group_index, // for HW instance leaf, this field is used to offset the hit-group index + 1, // disable opaque culling.. Necessary for SW instancing.. don't-care for HW instancing + 0, + &p0_DW0_3); + + encodeDW2DW3_HwInstanceLeafPart0( + rootNodeByteOffset == NO_NODE_OFFSET ? 0 : ((uint64_t)bvh) + rootNodeByteOffset, // offset NO_NODE_OFFSET is for degenerated instance, put null as root pointer + GRL_get_InstanceFlags(instDesc), + &p0_DW0_3); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 0 /*part0 + 0DW*/, p0_DW0_3); + + uint4 p0_DW4_7 = (uint4)( + as_uint(world2obj.l.vx.x), + as_uint(world2obj.l.vx.y), + as_uint(world2obj.l.vx.z), + as_uint(world2obj.l.vy.x)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 1 /*part0 + 4DW*/, p0_DW4_7); + + uint4 p0_DW8_11 = (uint4)( + as_uint(world2obj.l.vy.y), + as_uint(world2obj.l.vy.z), + as_uint(world2obj.l.vz.x), + as_uint(world2obj.l.vz.y)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 2 /*part0 + 8DW*/, p0_DW8_11); + + uint4 p0_DW12_15 = (uint4)( + as_uint(world2obj.l.vz.z), + as_uint(obj2world.p.x), + as_uint(obj2world.p.y), + as_uint(obj2world.p.z)); + + store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 3 /*part0 + 12DW*/, p0_DW12_15); +} diff --git a/src/intel/vulkan/grl/gpu/intrinsics.h b/src/intel/vulkan/grl/gpu/intrinsics.h new file mode 100644 index 00000000000..0dff3147d8a --- /dev/null +++ b/src/intel/vulkan/grl/gpu/intrinsics.h @@ -0,0 +1,581 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +// TODO: AABB_work_group_reduce is super slow, remove !!! + +#pragma cl_intel_subgroups : enable +#pragma cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + + +uint intel_sub_group_ballot(bool valid); + +// atom_min +float __attribute__((overloadable)) atom_min(volatile __global float *p, float val); +float __attribute__((overloadable)) atom_min(volatile __local float *p, float val); +float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val); +float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val); +// atom_max +float __attribute__((overloadable)) atom_max(volatile __global float *p, float val); +float __attribute__((overloadable)) atom_max(volatile __local float *p, float val); +float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val); +float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val); +// atom_cmpxchg +float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val); +float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val); +float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val); +float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val); + + + +inline uint subgroup_single_atomic_add(global uint *p, uint val) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const int v = subgroupLocalID == 0 ? 
atomic_add(p, val) : 0; + return sub_group_broadcast(v, 0); +} + +inline float halfarea(const float3 d) +{ + return fma(d.x, (d.y + d.z), d.y * d.z); +} + +inline float area(const float3 d) +{ + return halfarea(d) * 2.0f; +} + +inline uint maxDim(const float3 a) +{ + const float3 b = fabs(a); + const bool b_x_y = b.x > b.y; + const float cur_max = b_x_y ? b.x : b.y; + const uint cur_idx = b_x_y ? 0 : 1; + const bool b_x_y_z = b.z > cur_max; + return b_x_y_z ? 2 : cur_idx; +} + +inline uint3 sortByMaxDim(const float3 a) +{ + const uint kz = maxDim(a); + const uint _kx = (kz + 1) % 3; + const uint _ky = (_kx + 1) % 3; + const bool kz_pos = a[kz] >= 0.0f; + const uint kx = kz_pos ? _ky : _kx; + const uint ky = kz_pos ? _kx : _ky; + return (uint3)(kx, ky, kz); +} + +inline uint4 sort4_ascending(const uint4 dist) +{ + const uint a0 = dist.s0; + const uint a1 = dist.s1; + const uint a2 = dist.s2; + const uint a3 = dist.s3; + const uint b0 = min(a0, a2); + const uint b1 = min(a1, a3); + const uint b2 = max(a0, a2); + const uint b3 = max(a1, a3); + const uint c0 = min(b0, b1); + const uint c1 = max(b0, b1); + const uint c2 = min(b2, b3); + const uint c3 = max(b2, b3); + const uint d0 = c0; + const uint d1 = min(c1, c2); + const uint d2 = max(c1, c2); + const uint d3 = c3; + return (uint4)(d0, d1, d2, d3); +} + +__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6}; +__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4}; +__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6}; +__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0}; +__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5}; +__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6}; +__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6}; + +__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1}; +__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1}; +__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1}; + +__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1}; + +inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask) +{ + const uint a1 = intel_sub_group_shuffle(a0, shuffleMask); + const uint a_min = min(a0, a1); + const uint a_max = max(a0, a1); + return select(a_max, a_min, selectMask); +} + +inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask) +{ + const uint a1 = intel_sub_group_shuffle(a0, shuffleMask); + const uint a_min = min(a0, a1); + const uint a_max = max(a0, a1); + return select(a_min, a_max, selectMask); +} + +inline uint sort8_descending(const uint aa) +{ + const unsigned int slotID = get_sub_group_local_id() % 8; + const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]); + const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]); + const uint dd = compare_exchange_descending(cc, shuffleC[slotID], selAA[slotID]); + const uint ee = compare_exchange_descending(dd, shuffleD[slotID], selF0[slotID]); + const uint ff = compare_exchange_descending(ee, shuffleE[slotID], selCC[slotID]); + const uint gg = compare_exchange_descending(ff, shuffleF[slotID], selAA[slotID]); + return gg; +} + +inline uint sort8_ascending(const uint aa) +{ + const unsigned int slotID = get_sub_group_local_id() % 8; + const uint bb = compare_exchange_ascending(aa, shuffleA[slotID], selAA[slotID]); + const uint cc = compare_exchange_ascending(bb, shuffleB[slotID], selCC[slotID]); + const uint dd = compare_exchange_ascending(cc, shuffleC[slotID], 
selAA[slotID]); + const uint ee = compare_exchange_ascending(dd, shuffleD[slotID], selF0[slotID]); + const uint ff = compare_exchange_ascending(ee, shuffleE[slotID], selCC[slotID]); + const uint gg = compare_exchange_ascending(ff, shuffleF[slotID], selAA[slotID]); + return gg; +} + +inline uint sort4_descending(const uint aa) +{ + const unsigned int slotID = get_sub_group_local_id() % 8; + const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]); + const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]); + const uint dd = compare_exchange_descending(cc, shuffleG[slotID], selGG[slotID]); + return dd; +} + +inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask) +{ + const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask); + const ulong a_min = min(a0, a1); + const ulong a_max = max(a0, a1); + return select(a_max, a_min, (ulong)selectMask); +} + +inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask) +{ + const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask); + const ulong a_min = min(a0, a1); + const ulong a_max = max(a0, a1); + return select(a_min, a_max, (ulong)selectMask); +} + +inline ulong sort8_ascending_ulong(const ulong aa) +{ + const unsigned int slotID = get_sub_group_local_id() % 8; + const ulong bb = compare_exchange_ascending_ulong(aa, shuffleA[slotID], selAA[slotID]); + const ulong cc = compare_exchange_ascending_ulong(bb, shuffleB[slotID], selCC[slotID]); + const ulong dd = compare_exchange_ascending_ulong(cc, shuffleC[slotID], selAA[slotID]); + const ulong ee = compare_exchange_ascending_ulong(dd, shuffleD[slotID], selF0[slotID]); + const ulong ff = compare_exchange_ascending_ulong(ee, shuffleE[slotID], selCC[slotID]); + const ulong gg = compare_exchange_ascending_ulong(ff, shuffleF[slotID], selAA[slotID]); + return gg; +} + +inline uint bitInterleave3D(const uint4 in) +{ + uint x = in.x, y = in.y, z = in.z; + x = (x | (x << 16)) & 0x030000FF; + x = (x | (x << 8)) & 0x0300F00F; + x = (x | (x << 4)) & 0x030C30C3; + x = (x | (x << 2)) & 0x09249249; + + y = (y | (y << 16)) & 0x030000FF; + y = (y | (y << 8)) & 0x0300F00F; + y = (y | (y << 4)) & 0x030C30C3; + y = (y | (y << 2)) & 0x09249249; + + z = (z | (z << 16)) & 0x030000FF; + z = (z | (z << 8)) & 0x0300F00F; + z = (z | (z << 4)) & 0x030C30C3; + z = (z | (z << 2)) & 0x09249249; + + return x | (y << 1) | (z << 2); +} + +inline uint bitInterleave4D(const uint4 in) +{ + uint x = in.x, y = in.y, z = in.z, w = in.w; + + x = x & 0x000000ff; + x = (x ^ (x << 16)) & 0x00c0003f; + x = (x ^ (x << 8)) & 0x00c03807; + x = (x ^ (x << 4)) & 0x08530853; + x = (x ^ (x << 2)) & 0x09090909; + x = (x ^ (x << 1)) & 0x11111111; + + y = y & 0x000000ff; + y = (y ^ (y << 16)) & 0x00c0003f; + y = (y ^ (y << 8)) & 0x00c03807; + y = (y ^ (y << 4)) & 0x08530853; + y = (y ^ (y << 2)) & 0x09090909; + y = (y ^ (y << 1)) & 0x11111111; + + z = z & 0x000000ff; + z = (z ^ (z << 16)) & 0x00c0003f; + z = (z ^ (z << 8)) & 0x00c03807; + z = (z ^ (z << 4)) & 0x08530853; + z = (z ^ (z << 2)) & 0x09090909; + z = (z ^ (z << 1)) & 0x11111111; + + w = w & 0x000000ff; + w = (w ^ (w << 16)) & 0x00c0003f; + w = (w ^ (w << 8)) & 0x00c03807; + w = (w ^ (w << 4)) & 0x08530853; + w = (w ^ (w << 2)) & 0x09090909; + w = (w ^ (w << 1)) & 0x11111111; + + return (x | (y << 1) | (z << 2) | (w << 3)); +} + +inline ulong ulong_bitInterleave4D(const uint4 in) +{ + ulong x = in.x, y = in.y, z = in.z, w = in.w; + + x = x & 
0x0000ffff; + x = (x ^ (x << 32)) & 0x0000f800000007ff; + x = (x ^ (x << 16)) & 0x0000f80007c0003f; + x = (x ^ (x << 8)) & 0x00c0380700c03807; + x = (x ^ (x << 4)) & 0x0843084308430843; + x = (x ^ (x << 2)) & 0x0909090909090909; + x = (x ^ (x << 1)) & 0x1111111111111111; + + y = y & 0x0000ffff; + y = (y ^ (y << 32)) & 0x0000f800000007ff; + y = (y ^ (y << 16)) & 0x0000f80007c0003f; + y = (y ^ (y << 8)) & 0x00c0380700c03807; + y = (y ^ (y << 4)) & 0x0843084308430843; + y = (y ^ (y << 2)) & 0x0909090909090909; + y = (y ^ (y << 1)) & 0x1111111111111111; + + z = z & 0x0000ffff; + z = (z ^ (z << 32)) & 0x0000f800000007ff; + z = (z ^ (z << 16)) & 0x0000f80007c0003f; + z = (z ^ (z << 8)) & 0x00c0380700c03807; + z = (z ^ (z << 4)) & 0x0843084308430843; + z = (z ^ (z << 2)) & 0x0909090909090909; + z = (z ^ (z << 1)) & 0x1111111111111111; + + w = w & 0x0000ffff; + w = (w ^ (w << 32)) & 0x0000f800000007ff; + w = (w ^ (w << 16)) & 0x0000f80007c0003f; + w = (w ^ (w << 8)) & 0x00c0380700c03807; + w = (w ^ (w << 4)) & 0x0843084308430843; + w = (w ^ (w << 2)) & 0x0909090909090909; + w = (w ^ (w << 1)) & 0x1111111111111111; + + return (x | (y << 1) | (z << 2) | (w << 3)); +} + +inline uint bitCompact(uint x) +{ + x &= 0x09249249; + x = (x ^ (x >> 2)) & 0x030c30c3; + x = (x ^ (x >> 4)) & 0x0300f00f; + x = (x ^ (x >> 8)) & 0xff0000ff; + x = (x ^ (x >> 16)) & 0x000003ff; + return x; +} + +// inverse of bitInterleave3D: recovers the three 10-bit coordinates +inline uint3 bitCompact3D(const uint in) +{ + const uint x = bitCompact(in >> 0); + const uint y = bitCompact(in >> 1); + const uint z = bitCompact(in >> 2); + return (uint3)(x, y, z); +} + +inline uint convertToPushIndices8(uint ID) +{ + const unsigned int slotID = get_sub_group_local_id(); + uint index = 0; + for (uint i = 0; i < 8; i++) + { + const uint mask = intel_sub_group_ballot(ID == i); + const uint new_index = ctz(mask); + index = i == slotID ? new_index : index; + } + return index; +} + +inline uint convertToPushIndices16(uint ID) +{ + const unsigned int slotID = get_sub_group_local_id(); + uint index = 0; + for (uint i = 0; i < 16; i++) + { + const uint mask = intel_sub_group_ballot(ID == i); + const uint new_index = ctz(mask); + index = i == slotID ?
new_index : index; + } + return index; +} + +#define FLOAT_EXPONENT_MASK (0x7F800000) // used to be EXPONENT_MASK +#define FLOAT_MANTISSA_MASK (0x007FFFFF) // used to be MANTISSA_MASK +#define FLOAT_NEG_ONE_EXP_MASK (0x3F000000) +#define FLOAT_BIAS (127) +#define FLOAT_MANTISSA_BITS (23) + +inline float3 frexp_vec3(float3 len, int3* exp) +{ + float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK)); + mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f))); + mant = copysign(mant, len); + *exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1)); + return mant; +} + + +#ifndef uniform +#define uniform +#endif + +#ifndef varying +#define varying +#endif + +uint get_sub_group_global_id() +{ + return get_sub_group_id() + get_num_sub_groups() * get_group_id( 0 ); +} + +// each lane contains the number of 1 bits below the corresponding position in 'mask' +uint subgroup_bit_prefix_exclusive(uniform uint mask) +{ + varying ushort lane = get_sub_group_local_id(); + varying uint lane_mask = (1 << lane) - 1; + varying uint m = mask & lane_mask; + return popcount(m); +} + +uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx ) +{ + varying uint lane_mask = (1 << lane_idx) - 1; + varying uint m = mask & lane_mask; + return popcount(m); +} + + +uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx) +{ + return (uint3)(sub_group_broadcast(v.x,idx), + sub_group_broadcast(v.y,idx), + sub_group_broadcast(v.z,idx)); +} + +float3 sub_group_broadcast_float3(float3 v, uniform ushort idx) +{ + return (float3)(sub_group_broadcast(v.x, idx), + sub_group_broadcast(v.y, idx), + sub_group_broadcast(v.z, idx)); +} + +float3 sub_group_reduce_min_float3(float3 v) +{ + return (float3)(sub_group_reduce_min(v.x), + sub_group_reduce_min(v.y), + sub_group_reduce_min(v.z) ); +} +float3 sub_group_reduce_max_float3(float3 v) +{ + return (float3)(sub_group_reduce_max(v.x), + sub_group_reduce_max(v.y), + sub_group_reduce_max(v.z)); +} + +float3 sub_group_shuffle_float3(float3 v, uniform ushort idx) +{ + return (float3)(intel_sub_group_shuffle(v.x, idx), + intel_sub_group_shuffle(v.y, idx), + intel_sub_group_shuffle(v.z, idx)); +} +uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx) +{ + return (uint3)( intel_sub_group_shuffle(v.x, idx), + intel_sub_group_shuffle(v.y, idx), + intel_sub_group_shuffle(v.z, idx)); +} + + +inline uchar sub_group_reduce_or_N6(uchar val) +{ + val = val | intel_sub_group_shuffle_down(val, val, 4); + val = val | intel_sub_group_shuffle_down(val, val, 2); + val = val | intel_sub_group_shuffle_down(val, val, 1); + return sub_group_broadcast(val, 0); +} + +inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val) +{ + uint SIMD8_id = get_sub_group_local_id() / 8; + val = val | intel_sub_group_shuffle_down(val, val, 4); + val = val | intel_sub_group_shuffle_down(val, val, 2); + val = val | intel_sub_group_shuffle_down(val, val, 1); + + return intel_sub_group_shuffle(val, SIMD8_id * 8); +} + + +inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p ) +{ + return atomic_fetch_add_explicit( (volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group ); +} + +inline __attribute__((overloadable)) int atomic_inc_local(local int* p) +{ + return atomic_fetch_add_explicit( (volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group); +} + +inline __attribute__((overloadable)) uint 
atomic_dec_local(local uint* p) +{ + return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group); +} + +inline __attribute__((overloadable)) int atomic_dec_local(local int* p) +{ + return atomic_fetch_sub_explicit((volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group); +} + +inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n) +{ + return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n ) +{ + return atomic_fetch_sub_explicit( (volatile local atomic_int*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_add_local( local uint* p, uint n ) +{ + return atomic_fetch_add_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_xor_local(local uint* p, uint n) +{ + return atomic_fetch_xor_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_or_local(local uint* p, uint n) +{ + return atomic_fetch_or_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_min_local(local uint* p, uint n) +{ + return atomic_fetch_min_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + +inline uint atomic_max_local(local uint* p, uint n) +{ + return atomic_fetch_max_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); +} + + + + +inline uint atomic_inc_global( global uint* p ) +{ + return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device); +} + +inline uint atomic_dec_global(global uint* p) +{ + return atomic_fetch_sub_explicit( (volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device); +} + +inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired) +{ + return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*) p, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device); +} + +inline uint atomic_add_global( global uint* p, uint n ) +{ + return atomic_fetch_add_explicit( (volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device); +} + +inline uint atomic_sub_global(global uint* p, uint n) +{ + return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device); +} + +inline uint atomic_or_global(global uint* p, uint n) +{ + return atomic_fetch_or_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device); +} + + +inline uint atomic_inc_global_acquire(global uint* p) +{ + return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_acquire, memory_scope_device); +} + + +inline uint atomic_inc_global_release(global uint* p) +{ + return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device); +} +inline uint atomic_dec_global_release(global uint* p) +{ + return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device); +} + +inline uint generic_atomic_add(uint* p, uint val) +{ + if (to_global(p) != NULL) + return atomic_add_global(to_global(p), val); + if 
(to_local(p) != NULL) + return atomic_add_local(to_local(p), val); + return 0; +} + +inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n ) +{ + n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) ); + n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) ); + n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) ); + return sub_group_broadcast( n, 0 ); +} + +inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n ) +{ + n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) ); + n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) ); + n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) ); + return sub_group_broadcast( n, 0 ); +} + +inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n) +{ + n = max(n, intel_sub_group_shuffle_down(n, n, 4)); + n = max(n, intel_sub_group_shuffle_down(n, n, 2)); + n = max(n, intel_sub_group_shuffle_down(n, n, 1)); + return intel_sub_group_shuffle(n, (get_sub_group_local_id() / 8) * 8);//sub_group_broadcast(n, 0); +} + +inline uint generic_atomic_inc(uint* p) +{ + if (to_global(p) != NULL) + return atomic_inc_global(to_global(p)); + if (to_local(p) != NULL) + return atomic_inc(to_local(p)); + return 0; +} + + +// Built-in GRL function which, if called in a kernel body, will force the kernel +// to be compiled to the minimum SIMD width supported by the platform +void GRL_UseMinimumSIMDWidth(); \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/libs/libraries.grl b/src/intel/vulkan/grl/gpu/libs/libraries.grl new file mode 100644 index 00000000000..1d6c0d2c6c5 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/libs/libraries.grl @@ -0,0 +1,13 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +library lsc_intrinsics +{ + default "lsc_intrinsics.cl" ; + fallback "lsc_intrinsics_fallback.cl"; +} + diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl new file mode 100644 index 00000000000..03a76ba36f1 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl @@ -0,0 +1,1033 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// LSC Cache options +// Load message caching control +enum LSC_LDCC { + LSC_LDCC_DEFAULT, + LSC_LDCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached + LSC_LDCC_L1UC_L3C, // Override to L1 uncached and L3 cached + LSC_LDCC_L1C_L3UC, // Override to L1 cached and L3 uncached + LSC_LDCC_L1C_L3C, // Override to L1 cached and L3 cached + LSC_LDCC_L1S_L3UC, // Override to L1 streaming load and L3 uncached + LSC_LDCC_L1S_L3C, // Override to L1 streaming load and L3 cached + LSC_LDCC_L1IAR_L3C, // Override to L1 invalidate-after-read, and L3 cached +}; + +// Store message caching control (also used for atomics) +enum LSC_STCC { + LSC_STCC_DEFAULT, + LSC_STCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached + LSC_STCC_L1UC_L3WB, // Override to L1 uncached and L3 written back + LSC_STCC_L1WT_L3UC, // Override to L1 written through and L3 uncached + LSC_STCC_L1WT_L3WB, // Override to L1 written through and L3 written back + LSC_STCC_L1S_L3UC, // Override to L1 streaming and L3 uncached + LSC_STCC_L1S_L3WB, // Override to L1 streaming and L3 written back + LSC_STCC_L1WB_L3WB, // Override to L1 written through and L3 written back +}; + +// LSC Loads + +// Global address space +uint __builtin_IB_lsc_load_global_uchar_to_uint (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32 
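+// (The D8U32/D16U32 forms load a byte or a word and zero-extend it into a 32-bit
+// value; the cacheOpt argument selects the L1/L3 behaviour from LSC_LDCC above.)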
+uint __builtin_IB_lsc_load_global_ushort_to_uint(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32 +uint __builtin_IB_lsc_load_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1 +uint2 __builtin_IB_lsc_load_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2 +uint3 __builtin_IB_lsc_load_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3 +uint4 __builtin_IB_lsc_load_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4 +uint8 __builtin_IB_lsc_load_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8 +ulong __builtin_IB_lsc_load_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1 +ulong2 __builtin_IB_lsc_load_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2 +ulong3 __builtin_IB_lsc_load_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3 +ulong4 __builtin_IB_lsc_load_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4 +ulong8 __builtin_IB_lsc_load_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8 + +// Local address space +uint __builtin_IB_lsc_load_local_uchar_to_uint( const __local uchar *base, int immElemOff); //D8U32 +uint __builtin_IB_lsc_load_local_ushort_to_uint(const __local ushort *base, int immElemOff); //D16U32 +uint __builtin_IB_lsc_load_local_uint (const __local uint *base, int immElemOff); //D32V1 +uint2 __builtin_IB_lsc_load_local_uint2 (const __local uint2 *base, int immElemOff); //D32V2 +uint3 __builtin_IB_lsc_load_local_uint3 (const __local uint3 *base, int immElemOff); //D32V3 +uint4 __builtin_IB_lsc_load_local_uint4 (const __local uint4 *base, int immElemOff); //D32V4 +uint8 __builtin_IB_lsc_load_local_uint8 (const __local uint8 *base, int immElemOff); //D32V8 +ulong __builtin_IB_lsc_load_local_ulong (const __local ulong *base, int immElemOff); //D64V1 +ulong2 __builtin_IB_lsc_load_local_ulong2(const __local ulong2 *base, int immElemOff); //D64V2 +ulong3 __builtin_IB_lsc_load_local_ulong3(const __local ulong3 *base, int immElemOff); //D64V3 +ulong4 __builtin_IB_lsc_load_local_ulong4(const __local ulong4 *base, int immElemOff); //D64V4 +ulong8 __builtin_IB_lsc_load_local_ulong8(const __local ulong8 *base, int immElemOff); //D64V8 + +// LSC Stores + +// Global address space +void __builtin_IB_lsc_store_global_uchar_from_uint (__global uchar *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D8U32 +void __builtin_IB_lsc_store_global_ushort_from_uint(__global ushort *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D16U32 +void __builtin_IB_lsc_store_global_uint (__global uint *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D32V1 +void __builtin_IB_lsc_store_global_uint2 (__global uint2 *base, int immElemOff, uint2 val, enum LSC_STCC cacheOpt); //D32V2 +void __builtin_IB_lsc_store_global_uint3 (__global uint3 *base, int immElemOff, uint3 val, enum LSC_STCC cacheOpt); //D32V3 +void __builtin_IB_lsc_store_global_uint4 (__global uint4 *base, int immElemOff, uint4 val, enum LSC_STCC cacheOpt); //D32V4 +void __builtin_IB_lsc_store_global_uint8 (__global uint8 *base, int immElemOff, uint8 val, enum LSC_STCC cacheOpt); //D32V8 +void __builtin_IB_lsc_store_global_ulong (__global ulong *base, int immElemOff, ulong val, enum LSC_STCC 
cacheOpt); //D64V1 +void __builtin_IB_lsc_store_global_ulong2(__global ulong2 *base, int immElemOff, ulong2 val, enum LSC_STCC cacheOpt); //D64V2 +void __builtin_IB_lsc_store_global_ulong3(__global ulong3 *base, int immElemOff, ulong3 val, enum LSC_STCC cacheOpt); //D64V3 +void __builtin_IB_lsc_store_global_ulong4(__global ulong4 *base, int immElemOff, ulong4 val, enum LSC_STCC cacheOpt); //D64V4 +void __builtin_IB_lsc_store_global_ulong8(__global ulong8 *base, int immElemOff, ulong8 val, enum LSC_STCC cacheOpt); //D64V8 + +// Local address space +void __builtin_IB_lsc_store_local_uchar_from_uint (__local uchar *base, int immElemOff, uint val); //D8U32 +void __builtin_IB_lsc_store_local_ushort_from_uint(__local ushort *base, int immElemOff, uint val); //D16U32 +void __builtin_IB_lsc_store_local_uint (__local uint *base, int immElemOff, uint val); //D32V1 +void __builtin_IB_lsc_store_local_uint2 (__local uint2 *base, int immElemOff, uint2 val); //D32V2 +void __builtin_IB_lsc_store_local_uint3 (__local uint3 *base, int immElemOff, uint3 val); //D32V3 +void __builtin_IB_lsc_store_local_uint4 (__local uint4 *base, int immElemOff, uint4 val); //D32V4 +void __builtin_IB_lsc_store_local_uint8 (__local uint8 *base, int immElemOff, uint8 val); //D32V8 +void __builtin_IB_lsc_store_local_ulong (__local ulong *base, int immElemOff, ulong val); //D64V1 +void __builtin_IB_lsc_store_local_ulong2(__local ulong2 *base, int immElemOff, ulong2 val); //D64V2 +void __builtin_IB_lsc_store_local_ulong3(__local ulong3 *base, int immElemOff, ulong3 val); //D64V3 +void __builtin_IB_lsc_store_local_ulong4(__local ulong4 *base, int immElemOff, ulong4 val); //D64V4 +void __builtin_IB_lsc_store_local_ulong8(__local ulong8 *base, int immElemOff, ulong8 val); //D64V8 + +// LSC prefetching + +// LSC Pre-Fetch Load functions with CacheControls +// Global address space +void __builtin_IB_lsc_prefetch_global_uchar (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32 +void __builtin_IB_lsc_prefetch_global_ushort(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32 +void __builtin_IB_lsc_prefetch_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1 +void __builtin_IB_lsc_prefetch_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2 +void __builtin_IB_lsc_prefetch_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3 +void __builtin_IB_lsc_prefetch_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4 +void __builtin_IB_lsc_prefetch_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8 +void __builtin_IB_lsc_prefetch_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1 +void __builtin_IB_lsc_prefetch_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2 +void __builtin_IB_lsc_prefetch_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3 +void __builtin_IB_lsc_prefetch_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4 +void __builtin_IB_lsc_prefetch_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8 + +// LSC Fence support + +// FS - Fence Scope +enum LSC_FS { + LSC_FS_THREAD_GROUP, + LSC_FS_LOCAL, + LSC_FS_TILE, + LSC_FS_GPU, + LSC_FS_GPUs, + LSC_FS_SYSTEM_RELEASE, + LSC_FS_SYSTEM_ACQUIRE +}; + +// FT 
- Fence Type +enum LSC_FT { + LSC_FT_DEFAULT, + LSC_FT_EVICT, + LSC_FT_INVALIDATE, + LSC_FT_DISCARD, + LSC_FT_CLEAN, + LSC_FT_L3 +}; + +// LSC Fence functions +void __builtin_IB_lsc_fence_global_untyped(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGM +void __builtin_IB_lsc_fence_global_untyped_cross_tile(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGML +void __builtin_IB_lsc_fence_global_typed(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - TGM +void __builtin_IB_lsc_fence_local(); // Mem Port - SLM + +// Exported functions + +// LSC Loads +// uchar +uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3C); +} + +uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3C); +} + +uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset) +{ + return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ushort +uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3C); +} + +uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3C); +} + +uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset) +{ + return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint +uint load_uint_L1UC_L3UC(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint load_uint_L1UC_L3C(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint load_uint_L1C_L3UC(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint load_uint_L1C_L3C(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3C); +} + +uint load_uint_L1S_L3UC(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint load_uint_L1S_L3C(global uint* it, int offset) 
+{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3C); +} + +uint load_uint_L1IAR_L3C(global uint* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint2 +uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint2 load_uint2_L1UC_L3C(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint2 load_uint2_L1C_L3UC(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint2 load_uint2_L1C_L3C(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3C); +} + +uint2 load_uint2_L1S_L3UC(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint2 load_uint2_L1S_L3C(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3C); +} + +uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint3 +uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint3 load_uint3_L1UC_L3C(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint3 load_uint3_L1C_L3UC(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint3 load_uint3_L1C_L3C(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3C); +} + +uint3 load_uint3_L1S_L3UC(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint3 load_uint3_L1S_L3C(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3C); +} + +uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint4 +uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint4 load_uint4_L1UC_L3C(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint4 load_uint4_L1C_L3UC(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint4 load_uint4_L1C_L3C(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3C); +} + +uint4 load_uint4_L1S_L3UC(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint4 load_uint4_L1S_L3C(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3C); +} + +uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// uint8 +uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3UC); +} + +uint8 load_uint8_L1UC_L3C(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3C); +} + +uint8 load_uint8_L1C_L3UC(global uint8* it, int 
offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3UC); +} + +uint8 load_uint8_L1C_L3C(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3C); +} + +uint8 load_uint8_L1S_L3UC(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3UC); +} + +uint8 load_uint8_L1S_L3C(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3C); +} + +uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset) +{ + return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong +ulong load_ulong_L1UC_L3UC(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong load_ulong_L1UC_L3C(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong load_ulong_L1C_L3UC(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong load_ulong_L1C_L3C(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong load_ulong_L1S_L3UC(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong load_ulong_L1S_L3C(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3C); +} + +ulong load_ulong_L1IAR_L3C(global ulong* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong2 +ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3C); +} + +ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong3 +ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3C); +} 
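+
+// Callers pick the wrapper that matches the caching behaviour they need; e.g. the
+// batch-dump kernels poll the DebugBufferHeader cpuHead field with
+// load_uint_L1UC_L3C() so each loop iteration re-reads memory rather than a
+// stale L1 line.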
+ +ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong4 +ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3C); +} + +ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// ulong8 +ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3UC); +} + +ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3C); +} + +ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3UC); +} + +ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3C); +} + +ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3UC); +} + +ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3C); +} + +ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset) +{ + return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1IAR_L3C); +} + +// LSC Stores +// uchar +void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ushort +void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value) +{ + 
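+    // D16U32 store: writes the low 16 bits of 'value' at element offset 'offset',
+    // with both L1 and L3 uncached.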
__builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint +void store_uint_L1UC_L3UC(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint_L1UC_L3WB(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint_L1WT_L3UC(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint_L1WT_L3WB(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint_L1S_L3UC(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint_L1S_L3WB(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint_L1WB_L3WB(global uint* it, int offset, uint value) +{ + __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint2 +void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value) +{ + __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint3 +void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint3_L1UC_L3WB(global uint3* 
it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value) +{ + __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint4 +void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value) +{ + __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// uint8 +void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value) +{ + __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong +void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, 
value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value) +{ + __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong2 +void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value) +{ + __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong3 +void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value) +{ + __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong4 +void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, 
LSC_STCC_L1WT_L3WB); +} + +void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value) +{ + __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// ulong8 +void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3UC); +} + +void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3WB); +} + +void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3UC); +} + +void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3WB); +} + +void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3UC); +} + +void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3WB); +} + +void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value) +{ + __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WB_L3WB); +} + +// LSC Fence support +void mem_fence_gpu_default() +{ + __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_DEFAULT); +} + +void mem_fence_workgroup_default() +{ + __builtin_IB_lsc_fence_global_untyped(LSC_FS_THREAD_GROUP, LSC_FT_DEFAULT); +} + +void mem_fence_gpu_invalidate() +{ + // NOTE: 'FS_TILE' is used here to avoid DG2 HW bug where L3 is needlessly flushed on a 'GPU' scope fence + __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_INVALIDATE); +} + +void mem_fence_gpu_evict() +{ + __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_EVICT); +} + +void mem_fence_evict_to_memory() +{ + __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_EVICT); + __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_L3); +} diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h new file mode 100644 index 00000000000..a12dac00e77 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h @@ -0,0 +1,207 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// LSC Loads +uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset); +uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset); +uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset); +uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset); +uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset); +uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset); +uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset); + +uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset); +uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset); +uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset); +uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset); +uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset); +uint 
load_ushort_to_uint_L1S_L3C(global ushort* it, int offset); +uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset); + +uint load_uint_L1UC_L3UC(global uint* it, int offset); +uint load_uint_L1UC_L3C(global uint* it, int offset); +uint load_uint_L1C_L3UC(global uint* it, int offset); +uint load_uint_L1C_L3C(global uint* it, int offset); +uint load_uint_L1S_L3UC(global uint* it, int offset); +uint load_uint_L1S_L3C(global uint* it, int offset); +uint load_uint_L1IAR_L3C(global uint* it, int offset); + +uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset); +uint2 load_uint2_L1UC_L3C(global uint2* it, int offset); +uint2 load_uint2_L1C_L3UC(global uint2* it, int offset); +uint2 load_uint2_L1C_L3C(global uint2* it, int offset); +uint2 load_uint2_L1S_L3UC(global uint2* it, int offset); +uint2 load_uint2_L1S_L3C(global uint2* it, int offset); +uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset); + +uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset); +uint3 load_uint3_L1UC_L3C(global uint3* it, int offset); +uint3 load_uint3_L1C_L3UC(global uint3* it, int offset); +uint3 load_uint3_L1C_L3C(global uint3* it, int offset); +uint3 load_uint3_L1S_L3UC(global uint3* it, int offset); +uint3 load_uint3_L1S_L3C(global uint3* it, int offset); +uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset); + +uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset); +uint4 load_uint4_L1UC_L3C(global uint4* it, int offset); +uint4 load_uint4_L1C_L3UC(global uint4* it, int offset); +uint4 load_uint4_L1C_L3C(global uint4* it, int offset); +uint4 load_uint4_L1S_L3UC(global uint4* it, int offset); +uint4 load_uint4_L1S_L3C(global uint4* it, int offset); +uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset); + +uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset); +uint8 load_uint8_L1UC_L3C(global uint8* it, int offset); +uint8 load_uint8_L1C_L3UC(global uint8* it, int offset); +uint8 load_uint8_L1C_L3C(global uint8* it, int offset); +uint8 load_uint8_L1S_L3UC(global uint8* it, int offset); +uint8 load_uint8_L1S_L3C(global uint8* it, int offset); +uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset); + +ulong load_ulong_L1UC_L3UC(global ulong* it, int offset); +ulong load_ulong_L1UC_L3C(global ulong* it, int offset); +ulong load_ulong_L1C_L3UC(global ulong* it, int offset); +ulong load_ulong_L1C_L3C(global ulong* it, int offset); +ulong load_ulong_L1S_L3UC(global ulong* it, int offset); +ulong load_ulong_L1S_L3C(global ulong* it, int offset); +ulong load_ulong_L1IAR_L3C(global ulong* it, int offset); + +ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset); +ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset); +ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset); +ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset); +ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset); +ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset); +ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset); + +ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset); +ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset); +ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset); +ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset); +ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset); +ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset); +ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset); + +ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset); +ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset); +ulong4 
load_ulong4_L1C_L3UC(global ulong4* it, int offset); +ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset); +ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset); +ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset); +ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset); + +ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset); +ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset); +ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset); +ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset); +ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset); +ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset); +ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset); + +// LSC Stores +void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value); +void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value); + +void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value); +void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value); + +void store_uint_L1UC_L3UC(global uint* it, int offset, uint value); +void store_uint_L1UC_L3WB(global uint* it, int offset, uint value); +void store_uint_L1WT_L3UC(global uint* it, int offset, uint value); +void store_uint_L1WT_L3WB(global uint* it, int offset, uint value); +void store_uint_L1S_L3UC(global uint* it, int offset, uint value); +void store_uint_L1S_L3WB(global uint* it, int offset, uint value); +void store_uint_L1WB_L3WB(global uint* it, int offset, uint value); + +void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value); +void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value); +void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value); +void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value); +void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value); +void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value); +void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value); + +void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value); +void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value); +void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value); +void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value); +void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value); +void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value); +void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value); + +void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value); +void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value); +void 
store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value); +void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value); +void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value); +void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value); +void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value); + +void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value); +void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value); +void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value); +void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value); +void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value); +void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value); +void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value); + +void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value); +void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value); +void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value); +void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value); +void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value); +void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value); +void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value); + +void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value); +void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value); + +void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value); +void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value); + +void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value); +void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value); + +void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value); +void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value); + +// LSC Fence support +void mem_fence_gpu_default(); +void mem_fence_workgroup_default(); 
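+
+// Illustrative usage sketch (hypothetical producer/consumer kernels, not part of this header;
+// the fence semantics are inferred from the function names, so treat this as a sketch rather
+// than a guaranteed protocol). Only functions declared in this header are used:
+//
+//   void kernel producer(global uint* payload, global uint* flag)
+//   {
+//       store_uint_L1WB_L3WB(payload, 0, 42u);  // cached write of the payload
+//       mem_fence_gpu_evict();                  // push dirty lines out before publishing
+//       store_uint_L1UC_L3UC(flag, 0, 1u);      // uncached publish of the ready flag
+//   }
+//
+//   void kernel consumer(global uint* payload, global uint* flag)
+//   {
+//       if (load_uint_L1UC_L3UC(flag, 0) == 1u)
+//       {
+//           mem_fence_gpu_invalidate();         // drop possibly stale cached copies
+//           uint v = load_uint_L1C_L3C(payload, 0);
+//           (void)v;                            // use the payload here
+//       }
+//   }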
+void mem_fence_gpu_invalidate(); +void mem_fence_gpu_evict(); +void mem_fence_evict_to_memory(); diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl new file mode 100644 index 00000000000..2217618c7c5 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl @@ -0,0 +1,898 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// LSC Loads +// uchar +uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset) +{ + return (uint)(it[offset]); +} + +// ushort +uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset) +{ + return (uint)(it[offset]); +} + +// uint +uint load_uint_L1UC_L3UC(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1UC_L3C(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1C_L3UC(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1C_L3C(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1S_L3UC(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1S_L3C(global uint* it, int offset) +{ + return it[offset]; +} + +uint load_uint_L1IAR_L3C(global uint* it, int offset) +{ + return it[offset]; +} + +// uint2 +uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1UC_L3C(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1C_L3UC(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1C_L3C(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1S_L3UC(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1S_L3C(global uint2* it, int offset) +{ + return it[offset]; +} + +uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset) +{ + return it[offset]; +} + +// uint3 +uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1UC_L3C(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1C_L3UC(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1C_L3C(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1S_L3UC(global uint3* it, int offset) +{ + 
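+    // In this fallback library the cache-control suffixes are ignored: every variant is a
+    // plain global-memory access, so all _L1*_L3* flavors of a given load or store behave
+    // identically and the suffix is only a performance hint on hardware paths that honor it.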
return it[offset]; +} + +uint3 load_uint3_L1S_L3C(global uint3* it, int offset) +{ + return it[offset]; +} + +uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset) +{ + return it[offset]; +} + +// uint4 +uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1UC_L3C(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1C_L3UC(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1C_L3C(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1S_L3UC(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1S_L3C(global uint4* it, int offset) +{ + return it[offset]; +} + +uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset) +{ + return it[offset]; +} + +// uint8 +uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1UC_L3C(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1C_L3UC(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1C_L3C(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1S_L3UC(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1S_L3C(global uint8* it, int offset) +{ + return it[offset]; +} + +uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset) +{ + return it[offset]; +} + +// ulong +ulong load_ulong_L1UC_L3UC(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1UC_L3C(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1C_L3UC(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1C_L3C(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1S_L3UC(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1S_L3C(global ulong* it, int offset) +{ + return it[offset]; +} + +ulong load_ulong_L1IAR_L3C(global ulong* it, int offset) +{ + return it[offset]; +} + +// ulong2 +ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset) +{ + return it[offset]; +} + +ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset) +{ + return it[offset]; +} + +// ulong3 +ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset) +{ + return it[offset]; +} + +ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset) +{ + return it[offset]; +} + +// ulong4 +ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset) +{ + 
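+    // Note that 'offset' is an element index, not a byte offset: in this fallback it indexes
+    // it[offset], i.e. it is scaled by sizeof(ulong4) == 32 bytes.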
return it[offset]; +} + +ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset) +{ + return it[offset]; +} + +ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset) +{ + return it[offset]; +} + +// ulong8 +ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset) +{ + return it[offset]; +} + +ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset) +{ + return it[offset]; +} + +// LSC Stores +// uchar +void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value) +{ + it[offset] = (uchar)(value); +} + +// ushort +void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value) +{ + it[offset] = (ushort)(value); +} + +// uint +void store_uint_L1UC_L3UC(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1UC_L3WB(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1WT_L3UC(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1WT_L3WB(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1S_L3UC(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1S_L3WB(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +void store_uint_L1WB_L3WB(global uint* it, int offset, uint value) +{ + it[offset] = value; +} + +// uint2 +void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void 
store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value) +{ + it[offset] = value; +} + +// uint3 +void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value) +{ + it[offset] = value; +} + +// uint4 +void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value) +{ + it[offset] = value; +} + +// uint8 +void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value) +{ + it[offset] = value; +} + +// ulong +void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value) +{ + it[offset] = value; +} + +// ulong2 +void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value) +{ + 
it[offset] = value; +} + +void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value) +{ + it[offset] = value; +} + +// ulong3 +void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value) +{ + it[offset] = value; +} + +// ulong4 +void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value) +{ + it[offset] = value; +} + +// ulong8 +void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value) +{ + it[offset] = value; +} + +// LSC Fence support +void mem_fence_gpu_default() +{ + write_mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +void mem_fence_workgroup_default() +{ + write_mem_fence( CLK_GLOBAL_MEM_FENCE ); +} + +void mem_fence_gpu_invalidate() +{ + read_mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +void mem_fence_gpu_evict() +{ + read_mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +void mem_fence_evict_to_memory() +{ + mem_fence(CLK_GLOBAL_MEM_FENCE); +} diff --git a/src/intel/vulkan/grl/gpu/mem_utils.h b/src/intel/vulkan/grl/gpu/mem_utils.h new file mode 100644 index 00000000000..b57a25279fd --- /dev/null +++ b/src/intel/vulkan/grl/gpu/mem_utils.h @@ -0,0 +1,161 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// 
SPDX-License-Identifier: MIT
+//
+//
+
+#include "shared.h"
+
+/// Write cache line to global memory
+/// Assumes subgroup_size is 16
+///
+/// @param dst 64-byte aligned output pointer
+/// @param val value to write
+GRL_INLINE void CacheLineSubgroupWrite(global char* dst, uint val)
+{
+    global uint* addrAligned = (global uint*)(global uint16*)dst;
+    intel_sub_group_block_write(addrAligned, val);
+}
+
+/// Read cache line from global memory
+/// Assumes subgroup_size is 16
+///
+/// @param src 64-byte aligned input pointer
+/// @return uint read from memory
+GRL_INLINE uint CacheLineSubgroupRead(const global char* src)
+{
+    const global uint* addrAligned = (const global uint*)(global uint16*)src;
+    return intel_sub_group_block_read(addrAligned);
+}
+
+/// Copy cache line
+/// Assumes subgroup_size is 16
+///
+/// @param dst 64-byte aligned output pointer
+/// @param src input pointer
+GRL_INLINE void CopyCacheLine(global char* dst, const global char* src)
+{
+    global const uint* usrc = (global const uint*) (src);
+
+    uint data = intel_sub_group_block_read(usrc);
+    CacheLineSubgroupWrite(dst, data);
+}
+
+/// Fast memory copy
+///
+/// @param dst output pointer
+/// @param src input pointer
+/// @param size number of bytes to copy
+/// @param numGroups number of groups that execute this function
+GRL_INLINE void CopyMemory(global char* dst, const global char* src, uint size, uint numGroups)
+{
+    const uint CACHELINE_SIZE = 64;
+
+    uint globalID = get_local_size(0) * get_group_id(0) + get_local_id(0);
+
+    // This part copies the bulk of the buffer, starting from dst aligned up to the next
+    // cache line boundary, one DWORD per work item per iteration.
+    // It also copies the trailing remainder bytes that do not form a full DWORD.
+    {
+        uint alignAdd = ((uint)(uint64_t)dst) & (CACHELINE_SIZE - 1);
+        alignAdd = (CACHELINE_SIZE - alignAdd) & (CACHELINE_SIZE - 1);
+
+        if (size > alignAdd)
+        {
+            uint alignedBytesCount = size - alignAdd;
+            uint alignedDWsCount = alignedBytesCount >> 2;
+            global uint* dstAlignedPart = (global uint*)(dst + alignAdd);
+            global uint* srcAlignedPart = (global uint*)(src + alignAdd);
+
+            for (uint id = globalID; id < alignedDWsCount; id += get_local_size(0) * numGroups)
+            {
+                dstAlignedPart[id] = srcAlignedPart[id];
+            }
+
+            if (globalID < alignedBytesCount - (alignedDWsCount << 2))
+            {
+                global uint8_t* dstByteRem = (global uint8_t*)(dstAlignedPart + alignedDWsCount);
+                global uint8_t* srcByteRem = (global uint8_t*)(srcAlignedPart + alignedDWsCount);
+                dstByteRem[globalID] = srcByteRem[globalID];
+            }
+        }
+    }
+
+    // Copy the head of dst, i.e. the bytes below the first cache-line-aligned address
+    // (first up to DWORD alignment, then whole DWORDs).
+    {
+        uint misalignmentBytesSize = (4 - (((uint)dst) & /*bytes in DW*/3)) & 3;
+        if (misalignmentBytesSize)
+        {
+            if (globalID < misalignmentBytesSize)
+            {
+                dst[globalID] = src[globalID];
+            }
+            dst += misalignmentBytesSize;
+            src += misalignmentBytesSize;
+        }
+
+        uint misalignmentDWSize = (CACHELINE_SIZE - (((uint)dst) & (CACHELINE_SIZE - 1))) & (CACHELINE_SIZE - 1);
+        if (misalignmentDWSize)
+        {
+            if (globalID < (misalignmentDWSize >> 2))
+            {
+                ((global uint*)dst)[globalID] = ((global uint*)src)[globalID];
+            }
+        }
+    }
+}
+
+#define CACHELINE_SIZE 64
+#define CACHELINE_PER_BLOCK 4
+#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
+
+GRL_INLINE
+global const char *getInstanceDataToCopy(global const char *array, global const uint64_t *arrayOfPtrs, const uint byteOffset)
+{
+    if (array != NULL)
+    {
+        return array + byteOffset;
+    }
+    else
+    {
+        return (global char *)arrayOfPtrs[byteOffset >> 6];
+    }
+}
+
+// assumed:
+// dst is always 64-byte aligned
+// size is always a multiple of 64 bytes
(size of InstanceDesc is always 64 bytes) +GRL_INLINE +void copyInstances(global char *dst, global const char *array, global const uint64_t *arrayOfPtrs, const uint64_t size, const uint numGroups) +{ + uint taskId = get_group_id(0); + + uint blockedSize = (size) & (~(BLOCK_SIZE - 1)); + + uint cachelinedTailOffset = blockedSize; + uint cachelinedTailSize = (size - cachelinedTailOffset) & (~(CACHELINE_SIZE - 1)); + + uint tailCacheLines = cachelinedTailSize >> 6; // divide by CACHELINE_SIZE + uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups - 1))); + if (reversedTaskId < tailCacheLines) + { + uint byteOffset = cachelinedTailOffset + (reversedTaskId * CACHELINE_SIZE); + global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset); + CopyCacheLine(dst + byteOffset, src); + } + + uint numBlocks = blockedSize >> 8; + while (taskId < numBlocks) + { + uint byteOffset = (taskId * BLOCK_SIZE); + + for (uint cl = 0; cl < CACHELINE_PER_BLOCK; cl++) + { + global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset); + CopyCacheLine(dst + byteOffset, src); + byteOffset += CACHELINE_SIZE; + } + + taskId += numGroups; + } +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/misc.cl b/src/intel/vulkan/grl/gpu/misc.cl new file mode 100644 index 00000000000..d32c8267b73 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/misc.cl @@ -0,0 +1,367 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "api_interface.h" +#include "common.h" +#include "instance.h" +#include "misc_shared.h" +#include "mem_utils.h" + +#define DBG(x) +#define ENABLE_CHECKS 0 + +#define CACHELINE_SIZE 64 +#define CACHELINE_PER_BLOCK 4 +#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK; + +GRL_INLINE +uint32_t getGeomDescPrimitiveCountAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index) +{ + return (uint32_t)GRL_get_primitive_count(&geomDesc[index]); +} + +GRL_INLINE +uint32_t getGeomDescTypeAndFlagsAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index) +{ + return (uint32_t)GRL_get_Type(&geomDesc[index]) | + (((uint32_t)GRL_get_Flags(&geomDesc[index])) << 16); +} + +GRL_INLINE +uint64_t getGeomDescAsUint64t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index) +{ + return (uint64_t)getGeomDescPrimitiveCountAsUint32t(geomDesc, index) | + (((uint64_t)getGeomDescTypeAndFlagsAsUint32t(geomDesc, index)) << 32); +} + +// assummed: +// dst is always 64 bytes alligned +GRL_INLINE +void copyGeoMetaData(global char* dst, global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t size, uint numGroups) +{ + uint taskId = get_group_id(0); + uint localId = get_sub_group_local_id(); + + uint cachelinedSize = (size) & (~(CACHELINE_SIZE-1)); + + uint reminderOffset = cachelinedSize; + uint reminderQWSize = (size - reminderOffset) >> 3; + + uint tailCacheLines = cachelinedSize >> 6; // divide by CACHELINE_SIZE + uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups-1))); + if (reversedTaskId == tailCacheLines && localId < reminderQWSize) + { + uint reminderOffsetQW = reminderOffset >> 3; + global uint64_t* dstQW = (global uint64_t*)(dst); + dstQW[localId + reminderOffsetQW] = getGeomDescAsUint64t(geomDesc, localId + reminderOffsetQW); + } + + uint numCacheLines = cachelinedSize >> 6; + while (taskId < numCacheLines) + { + uint byteOffset = taskId * CACHELINE_SIZE; + uint geoIdFromOffset = (byteOffset >> 3) + (localId >> 1); + + uint32_t data = 0; + if (localId & 1) + { + 
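+            // Lane layout (SIMD16): lanes 2k and 2k+1 together produce the 8-byte meta-data
+            // record for one geometry; the odd lane supplies the high dword (type | flags << 16)
+            // and the even lane the low dword (primitive count), so the block write below emits
+            // eight packed records per 64-byte cache line.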
data = getGeomDescTypeAndFlagsAsUint32t(geomDesc, geoIdFromOffset); + } + else + { + data = getGeomDescPrimitiveCountAsUint32t(geomDesc, geoIdFromOffset); + } + CacheLineSubgroupWrite(dst + byteOffset, data); + + taskId += numGroups; + } +} + +GRL_INLINE +uint groupCountForInstancesCopySize(uint size) +{ + return (size >> 8) + 3; +} + +GRL_INLINE +uint groupCountForGeoMetaDataCopySize(uint size) +{ + return (size >> 6) + 1; +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instances(global char* dest, global char* instancesArray, uint64_t size) +{ + // global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instances_indirect(global char* dest, global char* instancesArray, global const struct IndirectBuildRangeInfo* const indirect_data) +{ + uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); + instancesArray += indirect_data->primitiveOffset; + uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (tid == 0) + { + struct BVHBase* bvh = (struct BVHBase*)dest; + bvh->Meta.instanceCount = indirect_data->primitiveCount; + } + copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instance_ptrs(global char* dest, global uint64_t* arrayOfPtrs, uint64_t size) +{ + //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instance_ptrs_indirect(global char* dest, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data) +{ + uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); + arrayOfPtrs += indirect_data->primitiveOffset; + uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (tid == 0) + { + struct BVHBase* bvh = (struct BVHBase*)dest; + bvh->Meta.instanceCount = indirect_data->primitiveCount; + } + copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instances_base_ptr(global BVHBase* bvh, global char* instancesArray, uint64_t size) +{ + global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instances_base_ptr_indirect(global BVHBase* bvh, global char* instancesArray, global struct IndirectBuildRangeInfo const * const indirect_data) +{ 
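+    // Sizing sketch: size = primitiveCount * sizeof(InstanceDesc) = primitiveCount * 64 bytes,
+    // so groupCountForInstancesCopySize(size) = (size >> 8) + 3 matches the (count >> 2) + 3
+    // group count computed by the companion metakernel. E.g. 1000 instances -> 64000 bytes ->
+    // (64000 >> 8) + 3 = 253 groups.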
+ global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); + instancesArray += indirect_data->primitiveOffset; + copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instance_ptrs_base_ptr(global BVHBase* bvh, global uint64_t* arrayOfPtrs, uint64_t size) +{ + global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_instance_ptrs_base_ptr_indirect(global BVHBase* bvh, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data) +{ + global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart); + uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); + arrayOfPtrs += indirect_data->primitiveOffset; + copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel copy_geo_meta_data(global char* dest, global char* src, uint64_t size) +{ + //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.geoDescsStart); + global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc = (global GRL_RAYTRACING_GEOMETRY_DESC *)((unsigned long)src); + copyGeoMetaData(dest, geomDesc, size, groupCountForGeoMetaDataCopySize(size)); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__( ( reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 ) ) ) +__attribute__( ( intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH ) ) ) +void kernel copy_geo_descs_indirect_build(global char* dest, global char* src, global struct IndirectBuildRangeInfo const * const indirect_data, uint numGeometries) +{ + uint32_t gid = get_local_id(0) + get_group_id(0) * get_local_size(0); + if (gid < numGeometries) { + global GRL_RAYTRACING_GEOMETRY_DESC* dstDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(dest); + global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(src); + + GRL_RAYTRACING_GEOMETRY_DESC geo = srcDesc[gid]; + + uint primitiveCount = indirect_data[gid].primitiveCount; + uint primitiveOffset = indirect_data[gid].primitiveOffset; + uint firstVertex = indirect_data[gid].firstVertex; + uint transformOffset = indirect_data[gid].transformOffset; + + if (srcDesc[gid].Type == GEOMETRY_TYPE_TRIANGLES) + { + if (geo.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE) + { + geo.Desc.Triangles.VertexCount = primitiveCount * 3; + geo.Desc.Triangles.pVertexBuffer += primitiveOffset + + firstVertex * geo.Desc.Triangles.VertexBufferByteStride; + } + else + { + geo.Desc.Triangles.IndexCount = primitiveCount * 3; + geo.Desc.Triangles.pIndexBuffer += primitiveOffset; + geo.Desc.Triangles.pVertexBuffer += firstVertex * geo.Desc.Triangles.VertexBufferByteStride; + } + if (geo.Desc.Triangles.pTransformBuffer) { + geo.Desc.Triangles.pTransformBuffer += transformOffset; + } + } else { + // GEOMETRY_TYPE_PROCEDURAL + geo.Desc.Procedural.AABBCount = primitiveCount; + 
geo.Desc.Procedural.pAABBs_GPUVA += primitiveOffset; + } + + dstDesc[gid] = geo; + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel batched_init_globals(global struct BatchedInitGlobalsData *data) +{ + uint groupID = get_group_id(0); + + struct BatchedInitGlobalsData entry = data[groupID]; + + global struct Globals* globals = (global struct Globals*)entry.p_build_globals; + global char *bvh_mem = (global char*)entry.p_bvh_buffer; + uint numPrimitives = entry.numPrimitives; + uint numGeometries = entry.numGeometries; + uint numInstances = entry.numInstances; + uint instance_descs_start = entry.instance_descs_start; + uint geo_meta_data_start = entry.geo_meta_data_start; + uint node_data_start = entry.node_data_start; + uint quad_data_start = entry.leaf_data_start; + uint instance_data_start = entry.leaf_data_start; + uint procedural_data_start = entry.procedural_data_start; + uint back_pointer_start = entry.back_pointer_start; + uint build_record_start = entry.leaf_data_start; + uint totalBytes = entry.sizeTotal; + uint leafPrimType = entry.leafType; + uint leafSize = entry.leafSize; + + uint root_node_offset = node_data_start; + struct BVHBase *base = (struct BVHBase *)bvh_mem; + + base->Meta.instanceCount = numInstances; + base->Meta.geoCount = numGeometries; + base->Meta.instanceDescsStart = instance_descs_start; + base->Meta.geoDescsStart = geo_meta_data_start; + base->Meta.allocationSize = totalBytes; + // This doesnt work correctly + //ERROR_INFO initErr = { 0, 0, 0, 0xAAABBAAA }; + //base->Meta.errors = initErr; + base->Meta.errors.type = 0; + base->Meta.errors.offset_in_BVH = 0; //in 64B units + base->Meta.errors.when = 0; + base->Meta.errors.reserved = 0xAAABBAAA; + + base->nodeDataCur = node_data_start / 64; + base->quadLeafStart = quad_data_start / 64; + base->quadLeafCur = quad_data_start / 64; + base->instanceLeafStart = instance_data_start / 64; + base->instanceLeafEnd = instance_data_start / 64; + base->proceduralDataStart = procedural_data_start / 64; + base->proceduralDataCur = procedural_data_start / 64; + base->backPointerDataStart = back_pointer_start / 64; + base->refitTreeletsDataStart = totalBytes / 64; + base->refitStartPointDataStart = totalBytes / 64; + base->BVHDataEnd = totalBytes / 64; + base->refitTreeletCnt = 0; + base->refitTreeletCnt2 = 0; + base->rootNodeOffset = root_node_offset; + + base->fatLeafCount = 0; + base->fatLeafTableStart = entry.fatleaf_table_start / 64; + base->innerCount = 0; + base->innerTableStart = entry.innernode_table_start / 64; + base->quadLeftoversCountNewAtomicUpdate = 0; + base->quadTableSizeNewAtomicUpdate = 0; + base->quadIndicesDataStart = entry.quad_indices_data_start / 64; + + if (back_pointer_start != totalBytes) + { + BackPointers* back_pointers = BVHBase_GetBackPointers(base); + uint root_node_idx = root_node_offset - node_data_start; + global uint *root_node_backpointer = (global uint *)InnerNode_GetBackPointer(back_pointers,root_node_idx); + *root_node_backpointer = ((uint)-1) << 6; + } + + AABB3f_init(&base->Meta.bounds); + AABB_init(&globals->centroidBounds); + + globals->build_record_start = build_record_start; + + globals->numBuildRecords = 0; + globals->numBuildRecords_extended = 0; + globals->numPrimitives = numPrimitives; + globals->numSplittedPrimitives = 0; + globals->sync = 0; + globals->probThreshold = 0.0f; + globals->leafPrimType = leafPrimType; + globals->leafSize = leafSize; +} + + + +// This is temporary WA for mock in DXR +GRL_ANNOTATE_IGC_DO_NOT_SPILL 
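+// copy_mock below is a plain byte copy in a grid-stride loop: each work item starts at its
+// global id and strides by the total number of launched work items, so any dispatch size
+// yields a correct (if unoptimized) copy.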
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel copy_mock(global char *dest, + global char *src, + uint32_t size) +{ + uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0); + uint32_t globalSize = get_num_groups(0) * get_local_size(0); + for (uint32_t i = globalId; i < size; i += globalSize) + { + dest[i] = src[i]; + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(32, 1, 1))) +void kernel mem_set(global char *dest, + dword byte, + dword size) +{ + uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0); + if (globalId < size) + { + dest[globalId] = (char)byte; + } +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(32, 1, 1))) +void kernel mem_set_size_ptr(global char *dest, + dword byte, + global qword* sizePtr) +{ + uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0); + if (globalId < *sizePtr) + { + dest[globalId] = (char)byte; + } +} diff --git a/src/intel/vulkan/grl/gpu/misc.grl b/src/intel/vulkan/grl/gpu/misc.grl new file mode 100644 index 00000000000..cb98534afb4 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/misc.grl @@ -0,0 +1,278 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module misc; + +kernel_module misc("misc.cl") +{ + kernel opencl_kernel_batched_init_globals < kernelFunction="batched_init_globals" >; + kernel opencl_kernel_copy_instances < kernelFunction="copy_instances" >; + kernel opencl_kernel_copy_instances_indirect < kernelFunction="copy_instances_indirect" >; + kernel opencl_kernel_copy_instance_ptrs < kernelFunction="copy_instance_ptrs" >; + kernel opencl_kernel_copy_instance_ptrs_indirect < kernelFunction="copy_instance_ptrs_indirect" >; + kernel opencl_kernel_copy_instances_base_ptr < kernelFunction="copy_instances_base_ptr" >; + kernel opencl_kernel_copy_instances_base_ptr_indirect < kernelFunction="copy_instances_base_ptr_indirect" >; + kernel opencl_kernel_copy_instance_ptrs_base_ptr < kernelFunction="copy_instance_ptrs_base_ptr" >; + kernel opencl_kernel_copy_instance_ptrs_base_ptr_indirect < kernelFunction="copy_instance_ptrs_base_ptr_indirect" >; + kernel opencl_kernel_copy_geo_meta_data < kernelFunction="copy_geo_meta_data" >; + kernel opencl_kernel_copy_geo_descs_indirect_build < source="misc.cl", kernelFunction="copy_geo_descs_indirect_build" >; + kernel opencl_kernel_copy_mock < kernelFunction="copy_mock" >; + kernel opencl_kernel_memset < kernelFunction="mem_set" >; + kernel opencl_kernel_memset_size_ptr < kernelFunction="mem_set_size_ptr" >; +} + +import struct MKBuilderState "structs.grl"; +import struct MKSizeEstimate "structs.grl"; + + +metakernel batched_init_globals( + qword p_data, + dword numWgs) +{ + dispatch opencl_kernel_batched_init_globals(numWgs,1,1) args(p_data); +} + +metakernel copy_instances( + qword bvh_buffer, + qword instanceDescsBuffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_instances (numThreads, 1, 1) args( + bvh_buffer, + instanceDescsBuffer, + totalSizeToCopy); +} + +metakernel +copy_instances_indirect( qword bvh_buffer, qword instanceDescsBuffer, qword indirectBuildRangeInfo ) +{ + + define num_groups REG0; + define C_2 REG2; + define C_3 REG3; + + C_2 = 2; + C_3 = 3; + + // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions + // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 + num_groups = load_dword( indirectBuildRangeInfo ); + num_groups = 
num_groups >> C_2; + num_groups = num_groups + C_3; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_instances_indirect args( + bvh_buffer, + instanceDescsBuffer, + indirectBuildRangeInfo); +} + +metakernel copy_instance_ptrs( + qword bvh_buffer, + qword instanceDescPtrsBuffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_instance_ptrs (numThreads, 1, 1) args( + bvh_buffer, + instanceDescPtrsBuffer, + totalSizeToCopy); +} + +metakernel copy_instance_ptrs_indirect( + qword bvh_buffer, + qword instanceDescPtrsBuffer, + qword indirectBuildRangeInfo) +{ + define num_groups REG0; + define C_2 REG2; + define C_3 REG3; + + C_2 = 2; + C_3 = 3; + + // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions + // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 + num_groups = load_dword( indirectBuildRangeInfo ); + num_groups = num_groups >> C_2; + num_groups = num_groups + C_3; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_instance_ptrs_indirect args( + bvh_buffer, + instanceDescPtrsBuffer, + indirectBuildRangeInfo); +} + +metakernel copy_instances_base_ptr( + qword bvh_buffer, + qword instanceDescsBuffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_instances_base_ptr (numThreads, 1, 1) args( + bvh_buffer, + instanceDescsBuffer, + totalSizeToCopy); +} + +metakernel copy_instances_base_ptr_indirect( + qword bvh_buffer, + qword instanceDescsBuffer, + qword indirectBuildRangeInfo) +{ + define num_groups REG0; + define C_2 REG2; + define C_3 REG3; + + C_2 = 2; + C_3 = 3; + + // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions + // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 + num_groups = load_dword( indirectBuildRangeInfo ); + num_groups = num_groups >> C_2; + num_groups = num_groups + C_3; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_instances_base_ptr_indirect args( + bvh_buffer, + instanceDescsBuffer, + indirectBuildRangeInfo); +} + +metakernel copy_instance_ptrs_base_ptr( + qword bvh_buffer, + qword instanceDescPtrsBuffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_instance_ptrs_base_ptr (numThreads, 1, 1) args( + bvh_buffer, + instanceDescPtrsBuffer, + totalSizeToCopy); +} + +metakernel copy_instance_ptrs_base_ptr_indirect( + qword bvh_buffer, + qword instanceDescPtrsBuffer, + qword indirectBuildRangeInfo) +{ + define num_groups REG0; + define C_2 REG2; + define C_3 REG3; + + C_2 = 2; + C_3 = 3; + + // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions + // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 + num_groups = load_dword( indirectBuildRangeInfo ); + num_groups = num_groups >> C_2; + num_groups = num_groups + C_3; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_copy_instance_ptrs_base_ptr_indirect args( + bvh_buffer, + instanceDescPtrsBuffer, + indirectBuildRangeInfo); +} + +metakernel copy_geo_descs( + qword private_dest, + qword transient_src, + qword indirectBuildRangeInfo, + dword numGeometries) +{ + + define num_groups (numGeometries + 16 - 1) / 16; + dispatch opencl_kernel_copy_geo_descs_indirect_build(num_groups, 1, 1) args( + private_dest, + transient_src, + 
indirectBuildRangeInfo, + numGeometries); +} + +metakernel copy_geo_meta_data( + qword bvh_buffer, + qword geomdesc_buffer, + qword totalSizeToCopy, + dword numThreads) +{ + dispatch opencl_kernel_copy_geo_meta_data (numThreads, 1, 1) args( + bvh_buffer, + geomdesc_buffer, + totalSizeToCopy); +} + + +const COPY_MOCK_GROUP_SIZE = 16; + +metakernel copy_mock( + qword dest, + qword src, + dword size) +{ + define num_groups (size + COPY_MOCK_GROUP_SIZE - 1) / COPY_MOCK_GROUP_SIZE; + dispatch opencl_kernel_copy_mock(num_groups, 1, 1) args( + dest, + src, + size); +} + +metakernel memset( + qword dest, + dword byte, + dword size) +{ + define num_groups (size + 32 - 1) / 32; + dispatch opencl_kernel_memset(num_groups, 1, 1) args( + dest, + byte, + size); +} + +metakernel memset_size_ptr( + qword dest, + dword byte, + qword sizePtr) +{ + define byteSize REG0; + define C_32 REG1; C_32 = 32; + define C_1 REG2; C_1 = 1; + define C_4 REG3; C_4 = 4; + define numGroupsRqd REG4; + + byteSize = load_dword(sizePtr); + + numGroupsRqd = byteSize + C_32; + numGroupsRqd = numGroupsRqd - C_1; + numGroupsRqd = numGroupsRqd >> C_4; + numGroupsRqd = numGroupsRqd >> C_1; + + DISPATCHDIM_X = numGroupsRqd.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_kernel_memset_size_ptr args( + dest, + byte, + sizePtr); +} diff --git a/src/intel/vulkan/grl/gpu/misc_legacy.cl b/src/intel/vulkan/grl/gpu/misc_legacy.cl new file mode 100644 index 00000000000..a464e89537c --- /dev/null +++ b/src/intel/vulkan/grl/gpu/misc_legacy.cl @@ -0,0 +1,386 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "input_client_structs.h" +#include "common.h" +#include "instance.h" + +#define DBG(x) +#define ENABLE_CHECKS 0 + +/* + + This kernel implements a exclusive scan addition operation. The + implementation currently only uses one DSS. + + */ +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_scan_exclusive_add(global uint *input, + global uint *output, + const uint N) +{ + const uint j = get_local_id(0); + const uint J = get_local_size(0); + const uint BLOCKSIZE = (N + J - 1) / J; + const uint start = min((j + 0) * BLOCKSIZE, N); + const uint end = min((j + 1) * BLOCKSIZE, N); + + uint base = 0; + for (uint i = start; i < end; i++) + base += input[i]; + + base = work_group_scan_exclusive_add(base); + + uint accu = 0; + for (uint i = start; i < end; i++) + { + output[i] = base + accu; + accu += input[i]; + } +} + +/* + + This kernel implements a exclusive scan addition operation that can use the entire GPU. 
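+
+   For example, an exclusive scan over the input [3, 1, 4, 1, 5] produces
+   [0, 3, 4, 8, 9]: output[i] is the sum of all inputs before position i.
+   phase0 reduces each work group's slice of the input into prefix_sums[groupID];
+   phase1 then sums the prefix_sums entries of all preceding groups to form a
+   global base and adds it while performing the scan over its own slice.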
+ + */ +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_scan_exclusive_add_phase0(global uint *input, + global uint *output, + global uint *prefix_sums, + const uint N) +{ + const uint local_size = get_local_size(0); + const uint numTasks = get_num_groups(0); + const uint groupID = get_group_id(0); + const uint localID = get_local_id(0); + const uint global_startID = (groupID + 0) * N / numTasks; + const uint global_endID = (groupID + 1) * N / numTasks; + + uint base = 0; + for (uint i = global_startID + localID; i < global_endID; i += local_size) + base += input[i]; + + base = work_group_reduce_add(base); + + if (localID == 0) + { + prefix_sums[groupID] = base; + printf("%d -> %d \n", groupID, base); + } +} + +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_scan_exclusive_add_phase1(global uint *input, + global uint *output, + global uint *prefix_sums, + const uint N) +{ + const uint local_size = get_local_size(0); + const uint numTasks = get_num_groups(0); + const uint groupID = get_group_id(0); + const uint localID = get_local_id(0); + const uint global_startID = (groupID + 0) * N / numTasks; + const uint global_endID = (groupID + 1) * N / numTasks; + const uint local_range = global_endID - global_startID; + + uint global_base = 0; + for (uint i = 0; i < groupID; i++) + global_base += prefix_sums[i]; + + const uint j = get_local_id(0); + const uint J = get_local_size(0); + const uint BLOCKSIZE = (local_range + J - 1) / J; + const uint startID = (j + 0) * local_range / J + global_startID; + const uint endID = (j + 1) * local_range / J + global_startID; + + uint base = 0; + for (uint i = startID; i < endID; i++) + base += input[i]; + + base = work_group_scan_exclusive_add(base); + + uint accu = 0; + for (uint i = startID; i < endID; i++) + { + output[i] = global_base + base + accu; + accu += input[i]; + } +} + +/* ========================================================================= */ +/* ============================== STATISTICS =============================== */ +/* ========================================================================= */ + +/* ====== STATS config ====== */ + +#define ENABLE_STAT_CHECKS 1 +#define DBG_STATS(x) + +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +printBVHStatistics(global struct Globals *globals, + global char *bvh_mem, + global struct StatStackEntry *global_stack0, + global struct StatStackEntry *global_stack1, + const uint presplit) +{ + const uint globalID = get_global_id(0); + const uint localID = get_local_id(0); + const uint local_size = get_local_size(0); + + struct BVHBase *base = (struct BVHBase *)bvh_mem; + const uint root = base->rootNodeOffset; + + local uint stack_items[2]; + local uint iterations; + + struct AABB root_aabb = getAABB_QBVHNodeN((global struct QBVHNodeN *)(bvh_mem + root)); + root_aabb = conservativeAABB(&root_aabb); + const float root_area = AABB_halfArea(&root_aabb); + + global struct QBVHNodeN *root_node = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset); + + if (root_node->type != BVH_INTERNAL_NODE) + { + const uint numChildren = getNumChildren_QBVHNodeN(root_node); + const uint current = root; + for (uint i = 0; i < numChildren; i++) + { + struct AABB aabb = extractAABB_QBVHNodeN(root_node, i); + const float area = AABB_halfArea(&aabb); + + global_stack0[i].node = current + root_node->offset * 64 + i * 
sizeof(struct Quad); + global_stack0[i].type = root_node->type; + global_stack0[i].area = area; + global_stack0[i].aabb = aabb; + global_stack0[i].depth = 0; + } + stack_items[0] = numChildren; + stack_items[1] = 0; + } + else + { + global_stack0[0].node = root; + global_stack0[0].type = root_node->type; + global_stack0[0].area = root_area; + global_stack0[0].aabb = root_aabb; + global_stack0[0].depth = 1; + stack_items[0] = 1; + stack_items[1] = 0; + } + + const uint maxInnerNodeOffset = globals->node_mem_allocator.cur; + const uint maxLeafNodeOffset = globals->quad_mem_allocator.cur; + + DBG_STATS(if (localID == 0) printf("diff %d \n", (globals->node_mem_allocator_cur - globals->node_mem_allocator_start) / 64)); + + iterations = 0; + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + float sah_nodes = 0.0f; + float sah_leaves = 0.0f; + uint leaves = 0; + uint inner_nodes = 0; + uint max_depth = 0; + uint leaf_items = 0; + uint inner_nodes_valid_children = 0; + + while (1) + { + work_group_barrier(CLK_GLOBAL_MEM_FENCE); + const uint buffer_index = (iterations % 2) == 0 ? 0 : 1; + global struct StatStackEntry *input_global_stack = buffer_index == 0 ? global_stack0 : global_stack1; + global struct StatStackEntry *output_global_stack = buffer_index == 0 ? global_stack1 : global_stack0; + + const uint local_stack_items = stack_items[buffer_index]; + stack_items[1 - buffer_index] = 0; + + DBG_STATS(if (globalID == 0) printf("iterations %d local_stack_items %d \n", iterations, local_stack_items)); + + if (local_stack_items == 0) + break; + //if (iterations == 5) break; + + work_group_barrier(CLK_GLOBAL_MEM_FENCE); + + if (globalID == 0) + iterations++; + + for (uint sindex = localID; sindex < local_stack_items; sindex += local_size) + { + + uint current = input_global_stack[sindex].node; + uint type = input_global_stack[sindex].type; + float current_area = input_global_stack[sindex].area; + struct AABB current_aabb = input_global_stack[sindex].aabb; + uint current_depth = input_global_stack[sindex].depth; + + //printf("localID %d sindex %d current %d type %d local_stack_items %d \n",localID,sindex,current,type,local_stack_items); + + max_depth = max(max_depth, current_depth); + + if (type == BVH_QUAD_NODE) + { + unsigned int prims = 1; //getNumLeafPrims(current); + if (prims > BVH_LEAF_N_MAX) + printf("too many items in leaf %d \n", prims); + unsigned int prims_offset = current; //getLeafOffset(current); + //printf("prims_offset %d \n",prims_offset); + + leaf_items += prims; + sah_leaves += current_area; + leaves++; +#if ENABLE_STAT_CHECKS == 1 + struct AABB leafAABB; + AABB_init(&leafAABB); + + global struct Quad *quads = (global struct Quad *)(bvh_mem + prims_offset); + //printf("prims_offset %d \n",prims_offset); + + for (uint i = 0; i < prims; i++) + { + struct AABB quadAABB = getAABB_Quad(&quads[i]); + AABB_extend(&leafAABB, &quadAABB); + } + + if (!presplit && !AABB_subset(&leafAABB, ¤t_aabb)) + { + printf("leaf error: current %d depth %d \n", current, current_depth); + AABB_print(¤t_aabb); + printf("leaf bounds: \n"); + AABB_print(&leafAABB); + } +#endif + } + else if (type == BVH_INTERNAL_NODE) + { + inner_nodes++; + sah_nodes += current_area; + global struct QBVHNodeN *nodeN = (global struct QBVHNodeN *)(bvh_mem + current); + + uint children = 0; + for (uint i = 0; i < BVH_NODE_N6; i++) + { + if (nodeN->qbounds.lower_x[i] > nodeN->qbounds.upper_x[i]) + break; + children++; + } + //printf("children %d \n",children); + +#if ENABLE_STAT_CHECKS == 1 + if (children > BVH_NODE_N6 || children == 
0) + { + printf("#children not in valid range: %d offset %d localID %d \n", children, current, localID); + printQBVHNodeN(nodeN); + } + + if (nodeN->offset > globals->totalAllocatedMem || (int)nodeN->offset < 0) + { + printf("offset error %d \n", nodeN->offset); + } +#endif + + uint children_offset = atomic_add(&stack_items[1 - buffer_index], children); + + for (uint i = 0; i < children; i++) + { + inner_nodes_valid_children++; + + struct AABB aabb = extractAABB_QBVHNodeN(nodeN, i); + const float area = AABB_halfArea(&aabb); + + aabb = conservativeAABB(&aabb); + +#if 0 // ENABLE_STAT_CHECKS == 1 // FIXME: not clear whether parent child property still holds !!!! + + // if (aabb.lower.x == (float)(INFINITY)) + // { + // printf("aabb inf error %d current %d nodeN %d \n",i, current, children); + // break; + // } + + + if (!presplit && !AABB_subset(&aabb,¤t_aabb)) + { + printf("Parent: current %d depth %d children %d \n",current, current_depth, children); + AABB_print(¤t_aabb); + printf("Child %d: \n",i); + AABB_print(&aabb); + } +#endif + + uint dest_index = children_offset + i; + if (nodeN->type == BVH_QUAD_NODE) + { + output_global_stack[dest_index].node = current + nodeN->offset * 64 + i * sizeof(struct Quad); + if (output_global_stack[dest_index].node >= maxLeafNodeOffset) + { + printf("stack leaf offset error %d %d current %d %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64); + } + } + else if (nodeN->type == BVH_INTERNAL_NODE) + { + output_global_stack[dest_index].node = (current + nodeN->offset * 64 + i * sizeof(struct QBVHNodeN)); + if (output_global_stack[dest_index].node >= maxInnerNodeOffset) + { + printf("stack inner node offset error %d %d current %d %d maxInnerNodeOffset %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64, maxInnerNodeOffset); + } + } + + output_global_stack[dest_index].type = nodeN->type; + output_global_stack[dest_index].area = area; + output_global_stack[dest_index].aabb = aabb; + output_global_stack[dest_index].depth = current_depth + 1; + //printf("global_stack[dest_index].node %d global_stack[dest_index].type %d \n",global_stack[dest_index].node,global_stack[dest_index].type); + } + } + } + } + + sah_nodes = work_group_reduce_add(sah_nodes); + sah_leaves = work_group_reduce_add(sah_leaves); + leaves = work_group_reduce_add(leaves); + inner_nodes = work_group_reduce_add(inner_nodes); + max_depth = work_group_reduce_max(max_depth); + leaf_items = work_group_reduce_add(leaf_items); + inner_nodes_valid_children = work_group_reduce_add(inner_nodes_valid_children); + + if (globalID == 0) + { + /* + sah_nodes *= 1.0f / root_area; + sah_leaves *= 1.0f / root_area; + float sah = sah_nodes + sah_leaves; + + const uint globalLeafMemAllocatorOffset = globals->quad_mem_allocator.start; + const uint totalAllocatedMem = globals->totalAllocatedMem; + + printf("BVH_NODE_N6 %d BVH_LEAF_N_MIN %d BVH_LEAF_N_MAX %d \n",BVH_NODE_N6,BVH_LEAF_N_MIN,BVH_LEAF_N_MAX); + float node_util = 100.0f * (float)inner_nodes_valid_children / (inner_nodes * BVH_NODE_N6); + float leaf_util = 100.0f * (float)leaf_items / (leaves); + printf("allocators: node %d -> %d ; leaf %d -> %d \n",globals->node_mem_allocator_cur,globals->node_mem_allocator_start,globals->leaf_mem_allocator_cur,globals->leaf_mem_allocator_start); + printf("inner nodes %d leaves %d sah %f sah_node %f sah_leaves %f max_depth %d leaf_items %d node util %f leaf util %f (%f) 
\n",inner_nodes,leaves,sah,sah_nodes,sah_leaves,max_depth,leaf_items,node_util,leaf_util,(float)leaf_items / leaves); + uint node_mem = globals->node_mem_allocator_cur; + uint max_node_mem = globalLeafMemAllocatorOffset; + float node_mem_ratio = 100.0f * (float)node_mem / max_node_mem; + + uint leaf_mem = globals->leaf_mem_allocator.cur - globalLeafMemAllocatorOffset; + uint max_leaf_mem = totalAllocatedMem - globalLeafMemAllocatorOffset; + float leaf_mem_ratio = 100.0f * (float)leaf_mem / max_leaf_mem; + + uint total_mem = node_mem + leaf_mem; + float total_mem_ratio = 100.0f * (float)total_mem / totalAllocatedMem; + + printf("used node memory %d (%f) / used leaf memory %d (%f) / total memory used %d (%f) / total memory allocated %d \n",node_mem, node_mem_ratio, leaf_mem, leaf_mem_ratio, total_mem, total_mem_ratio, totalAllocatedMem); + */ + } +} diff --git a/src/intel/vulkan/grl/gpu/misc_shared.h b/src/intel/vulkan/grl/gpu/misc_shared.h new file mode 100644 index 00000000000..218f2fa4291 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/misc_shared.h @@ -0,0 +1,196 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file contains structure definitions shared by GRL OCL kernels and host code +// + +#pragma once + +#include "GRLGen12.h" + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(MISC) + +struct BatchedInitGlobalsData +{ + qword p_build_globals; + qword p_bvh_buffer; + dword numPrimitives; + dword numGeometries; + dword numInstances; + dword instance_descs_start; + dword geo_meta_data_start; + dword node_data_start; + dword leaf_data_start; + dword procedural_data_start; + dword back_pointer_start; + dword sizeTotal; + dword leafType; + dword leafSize; + dword fatleaf_table_start; + dword innernode_table_start; + dword quad_indices_data_start; +}; + +/// Header of debug buffer +/// +/// Header is placed at the begining of debug buffer. +/// After header there is circullar buffer space +typedef struct DebugBufferHeader +{ + /// Offset to begin of buffer (after header) + dword headStart; + /// Offset to free memory in buffer (used by gpu) + dword gpuHead; + /// Offset to end of data in buffer that is ready to read (read on cpu, set on gpu, might be behind gpuHeader) + dword cpuHead; + /// Flag for buffer overflow + dword overflow; + /// Total size of buffer + dword totalSize; + /// Padding needed because otherwise GPU overrides tail with cacheline flush + dword pad[11]; + /// Offset to begin of data in buffer + dword tail; +} DebugBufferHeader; + +enum InputDumpOperationType +{ + INPUT_DUMP_OP_NOP, + INPUT_DUMP_OP_BATCH, + INPUT_DUMP_OP_BUILD, + INPUT_DUMP_OP_UPDATE, + INPUT_DUMP_OP_CLONE, + INPUT_DUMP_OP_COMPACT, + INPUT_DUMP_OP_SERIALIZE, + INPUT_DUMP_OP_DESERIALIZE, + INPUT_DUMP_OP_END_BUFFER +}; + +// each operation starts with the same header structure and looks like this + +// some defined struct { <-----------------start +// OpHeader +// .... struct type specific data +// } +// ... 
auxilary data of variable len +// <-------------------------------------- end - indicated by endOfData +typedef struct OpHeader +{ + dword operationType; + dword endOfData; // offset to end of this primitive +} OpHeader; + +// header for batch operations +typedef struct BatchOpHeader +{ + OpHeader opHeader; +} BatchOpHeader; + +// interpretation for operationType INPUT_DUMP_OP_BATCH +typedef struct InputBatch +{ + BatchOpHeader header; + qword batchId; + dword vertexBufferDataSize; + dword firstContainedOpOffset; + + // layout of batch is as below, each line is 128B aligned: + + // + // InputBatch <-------------------------------- start + // optional: batchVertexData + // InputBuildDesc/InputCopy <------------------ start + firstContainedOpOffset + // optional: extra data of above token + // InputBuildDesc/InputCopy + // optional: extra data of above token + // ... + // InputBuildDesc/InputCopy + // optional: extra data of above token + // <-------------------------------------------- end = start + endOfData +} InputBatch; + +// for operationType: +// INPUT_DUMP_OP_BUILD, +// INPUT_DUMP_OP_UPDATE, +// followed by auxilary data of variable len +typedef struct InputBuild +{ + OpHeader header; + qword srcBvhPtr; + qword dstBvhPtr; + dword flags; + dword numGeos; + dword numInstances; + dword instArrayOfPtrs; +} InputBuild; + +// for operationType: +// INPUT_DUMP_OP_CLONE, +// INPUT_DUMP_OP_COMPACT, +// INPUT_DUMP_OP_SERIALIZE, +// +// Not for INPUT_DUMP_OP_DESERIALIZE! +typedef struct InputCopy +{ + OpHeader header; + qword srcBvhPtr; + qword dstBvhPtr; +} InputCopy; + +// for INPUT_DUMP_OP_DESERIALIZE +// decode for debug tools follows this format +typedef struct InputDeserialize +{ + OpHeader header; + qword dstBvhPtr; +} InputDeserialize; + +typedef struct InputBatchPtrs +{ + qword dumpDst; + qword globalDumpBuffer; + qword nonVertexDataStart; + dword vertexBuffersSize; + dword totalSize; +} InputBatchPtrs; + +enum OutputDumpOperationType +{ + OUTPUT_DUMP_OP_NOP, + OUTPUT_DUMP_OP_BATCH, + OUTPUT_DUMP_OP_DATA, + OUTPUT_DUMP_OP_END_BUFFER +}; + +// interpretation for operationType OUTPUT_DUMP_OP_BATCH +typedef struct OutputBatch { + BatchOpHeader header; + qword batchId; + dword firstContainedOpOffset; +} OutputBatch; + +// interpretation for operationType OUTPUT_DUMP_OP_DATA +typedef struct OutputData +{ + OpHeader header; + qword srcBvhPtr; +} OutputData; + +typedef struct OutputBatchPtrs +{ + qword dumpDst; + qword dataStart; + dword dataSize; + dword totalSize; +} OutputBatchPtrs; + +GRL_NAMESPACE_END(MISC) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/morton/morton_common.h b/src/intel/vulkan/grl/gpu/morton/morton_common.h new file mode 100644 index 00000000000..2beb7a1aff3 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/morton_common.h @@ -0,0 +1,245 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "common.h" + +#define MORTON_DEBUG_CHECKS 0 +#define MORTON_VERBOSE_LOG 0 + +GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift ) +{ +#if 0 // turn off, because current hierarchy build requires full sort + // Difference between max iterations needed for LSB sorting and + // number of iterations needed for LSB sorting without primIDs + // This indicates how many of first iterations would be skipped in LSB + return 8 - (8 - (shift >> 3)); +#else + return 0; +#endif +} + +typedef struct BuildRecordLocalMortonFlattener +{ + unsigned int leftChild; // global + unsigned int 
rightChild; // global + unsigned int rangeStart; // global + unsigned int local_parent_index__numItems; +} BuildRecordLocalMortonFlattener; + +// TODO: Currently sizeof UPerNodeData is 32, AABB struct allocates more data than needed and can be reduced +typedef union UPerNodeData { + float4 four_DWs; + BuildRecordLocalMortonFlattener buildRecord; + MortonFlattenedBoxlessNode boxlessNode; + struct AABB box; +} UPerNodeData; + +GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn) +{ + return bn.childOffset_type >> 6; +} + +GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn) +{ + return bn.childOffset_type & ((1<<6) -1); +} + +GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane) +{ + short lane_used = index % get_sub_group_size(); + short shift = (index / get_sub_group_size()) * get_sub_group_size(); + if (lane_used == lane) { + *arr |= (val << shift); + } +} + +GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane) +{ + short r = 0; + short lane_used = index % get_sub_group_size(); + short shift = (index / get_sub_group_size()) * get_sub_group_size(); + r = arr >> shift; + r = sub_group_broadcast(r, lane_used); + return r; +} + +GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst) +{ + if (lane < count) + { + dst[lane]=(ushort)(arr & 0xFFFF); + short hi_idx = lane + get_sub_group_size(); + if (hi_idx < count) { + dst[hi_idx] = (ushort)(arr >> 16); + } + } +} + + +GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane) +{ + if (lane < count) + { + *arr = src[lane]; + short hi_idx = lane + get_sub_group_size(); + if (hi_idx < count) { + *arr |= ((uint)(src[hi_idx])) << 16u; + } + } +} + +GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane) +{ + short lane_used = index % get_sub_group_size(); + short shift = (index / get_sub_group_size()) * get_sub_group_size(); + if (lane_used == lane) { + uint rem_val = (*arr) & (0xFFFF0000 >> shift); //calculate the ramaining other half in the uint + *arr = (val << shift) | rem_val; + } +} + +GRL_INLINE void SUBGROUP_refit_bottom_up_local( + uniform struct QBVHNodeN* globalNodeData, + uniform struct BackPointers* backPointers, + uniform uint treeletRootGlobalIndex, + uniform uint globalBaseForInternalNodes, + varying ushort lane, + uniform local union UPerNodeData* local_nodes, + varying uint sg_bu_startpoints, + uniform uint sg_bu_startpoints_cnt) +{ + if(sg_bu_startpoints_cnt == 0) + return; + + const uint head_lane = 0; + uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane); + + uniform uint prev_loc_index = 0; + uniform struct AABB child_aabb; // this carries reduced aabb between loop turns + + uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer; + + while (curNodeIndex != 0) + { + uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode); + uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode); + varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane; + + uint numChildren = BackPointer_GetNumChildren(backpointer); + if (child_loc_idx != prev_loc_index && + lane < numChildren) + { + child_aabb = local_nodes[child_loc_idx].box; + } + else if (lane >= numChildren) { + AABB_init(&child_aabb); + child_aabb.lower.w = as_float(0u); + } + + // TODO: perNode data could hold 7 dwords per node 
instead of 8 as long as we keep it in SLM + struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb); + reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 ); + + uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w)); + reduced_bounds.lower.w = as_float((uint)instMask); + uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0); + local uint* pbox = (local uint*)(local_nodes+ curNodeIndex); + if (lane < 8) + { + pbox[lane] = reduce_bounds_lane; + } + + uint global_node_idx = globalBaseForInternalNodes + curNodeIndex; + /* get bounds of all children from child nodes directly */ + struct QBVHNodeN* qnode = globalNodeData + global_node_idx; + subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false); + child_aabb = reduced_bounds; + uint parentIndex = BackPointer_GetParentIndex(backpointer); + + write_mem_fence(CLK_LOCAL_MEM_FENCE); + + if (lane == 0) + { + backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer)); + uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex; + uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3); + + /* set global back pointer */ + *InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer; + +#if MORTON_VERBOSE_LOG + printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n", + global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx); +#endif + } + + backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane); + prev_loc_index = curNodeIndex; + curNodeIndex = parentIndex; + + /* if all children got refitted, then continue */ + uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7; + uniform uint numChildrenTotal = (backpointer >> 3) & 0x7; + if (numChildrenRefitted != numChildrenTotal) + { + if(sg_bu_startpoints_cnt) + { + curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane); + backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer; + } + else + return; + } + } + + // process root of the treelet + { + +#if MORTON_DEBUG_CHECKS + if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n"); +#endif + + uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode); + varying uint child_loc_idx = lead_child_loc_offset + 0 + lane; + uint numChildren = BackPointer_GetNumChildren(backpointer); + + if (child_loc_idx != prev_loc_index && + lane < numChildren) + { + child_aabb = local_nodes[child_loc_idx].box; + } + else if (lane >= numChildren) { + AABB_init(&child_aabb); + child_aabb.lower.w = as_float(0u); + } + + // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM + uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w)); + uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode); + uint global_node_idx = treeletRootGlobalIndex; + uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset; + + /* get bounds of all children from child nodes directly */ + struct QBVHNodeN* qnode = globalNodeData + global_node_idx; + + subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, 
instMask, qnode, false); + + /* reset refit counter for next refit */ + if (lane == 0) + { + /* set global back pointer */ + *InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u); + + // TODO: Move AABBs to separate buffer, but for now communicate bottom-tip boxes through qnodes + +#if MORTON_VERBOSE_LOG + printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n", + curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt); +#endif + } + } +} diff --git a/src/intel/vulkan/grl/gpu/morton/phase0.cl b/src/intel/vulkan/grl/gpu/morton/phase0.cl new file mode 100644 index 00000000000..2fa91c214e1 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/phase0.cl @@ -0,0 +1,400 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "libs/lsc_intrinsics.h" +#include "morton/morton_common.h" + +GRL_INLINE void SUBGROUP_create_node_phase0( + uniform global struct Globals* globals, + uniform global struct BinaryMortonCodeHierarchy* bnodes, + uniform global char* bvh_mem, + uniform global uint *global_refit_startpoints, + uniform uint rID, + uniform local uint* local_numRecords, + uniform local uint* local_QNodeOffset, + uniform global struct BuildRecordMorton* records, + uniform struct BuildRecordMorton current, + uniform local uint* local_startpoints_num) +{ + uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + uniform const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; + uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + + varying ushort lane = get_sub_group_local_id(); + + /* initialize child array */ + uniform uint numChildren = 2; + varying struct BuildRecordMorton sg_children; + sg_children.items = 0; + sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild; + + if ( lane < numChildren ) + sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID ); + + /* fill QBVH6 node with up to 6 children */ + while ( numChildren < BVH_NODE_N6 ) + { + varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize; + if ( sub_group_all( sg_is_leaf ) ) + break; + + uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items ); + uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) ); + uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild ); + + varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild; + + if ( lane == numChildren || lane == bestChild ) + { + sg_children.nodeID = nodeID; + sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID ); + } + + numChildren++; + } + + const uint current_index = current.current_index; + struct QBVHNodeN* qnode = nodeData + current_index; + SUBGROUP_QBVHNodeN_setChildIncr1( qnode ); + + uniform uint global_offset; + uniform uint child_node_offset; + + // Check if all children will be roots for the local subgtrees in phase1. If so we keep the node ids to be later + // used in global refit after phase1 + varying uchar is_children_root = (lane < numChildren) ? 
(sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0; + uniform uchar children_roots_num = sub_group_reduce_add(is_children_root); + + if ( lane == 0 ) + { + child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren); + + /* create node, but to not set bounds yet as these get calculated during refit */ + QBVH6Node_set_type( qnode, BVH_INTERNAL_NODE ); + QBVH6Node_set_offset( qnode, (global struct QBVHNodeN*)(bvh_mem + child_node_offset) ); + /* set back pointers */ + uint backpointer = (current.parent_index << 6) | (numChildren << 3); + + global_offset = atomic_add_local( local_numRecords, numChildren - 1 ); + +#if MORTON_VERBOSE_LOG + printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n", + rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren); +#endif + + if(children_roots_num == numChildren) + { + uint startpoints_offset = atomic_inc_local( local_startpoints_num ); + global_refit_startpoints[startpoints_offset] = current_index; + } + else + { + backpointer += children_roots_num; + } + + *InnerNode_GetBackPointer(backPointers, current_index) = backpointer; + } + + child_node_offset = sub_group_broadcast( child_node_offset, 0 ); + global_offset = sub_group_broadcast( global_offset, 0 ); + + uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset); + + sg_children.current_index = childNodes - nodeData + lane; + sg_children.parent_index = current_index; + + if ( lane < numChildren ) + { + uint write_position = (lane == 0) ? rID : global_offset + lane - 1; + records[write_position] = sg_children; + } +} + + +GRL_INLINE void SUBGROUP_create_node_phase0_local_sync( + uniform global struct Globals* globals, + uniform global struct BinaryMortonCodeHierarchy* bnodes, + uniform global char* bvh_mem, + uniform uint rID, + uniform local uint* local_numRecords, + uniform local uint* local_QNodeOffset, + uniform global struct BuildRecordMorton* records, + uniform struct BuildRecordMorton current, + uniform local uint* local_p0_total, + uniform global struct MortonFlattenedBoxlessNode *boxless_nodes, + uniform uint nodeDataStart) +{ + uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + uniform const uint rootNodeOffset = bvh->rootNodeOffset; + uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + + varying ushort lane = get_sub_group_local_id(); + + /* initialize child array */ + uniform uint numChildren = 2; + varying struct BuildRecordMorton sg_children; + sg_children.items = 0; + sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild; + + if ( lane < numChildren ) + sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID ); + + /* fill QBVH6 node with up to 6 children */ + while ( numChildren < BVH_NODE_N6 ) + { + varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize; + if ( sub_group_all( sg_is_leaf ) ) + break; + + uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items ); + uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) ); + uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild ); + + varying uint nodeID = (lane == bestChild) ? 
bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild; + + if ( lane == numChildren || lane == bestChild ) + { + sg_children.nodeID = nodeID; + sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID ); + } + + numChildren++; + } + + const uint current_index = current.current_index; + uniform uint global_offset; + uniform uint child_node_offset; + + // Check if all children will be roots for the local subgtrees in phase1. If so we keep the node ids to be later + // used in global refit after phase1 + varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0; + uniform uchar rootMask = sub_group_reduce_or_N6(is_children_root << lane); + uniform uchar children_roots_num = sub_group_reduce_add(is_children_root); + + if ( lane == 0 ) + { + child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren); + + /* Do not create qnodes here */ + uint backpointer = (current.parent_index << 6) | (numChildren << 3); + + global_offset = atomic_add_local( local_numRecords, numChildren - 1 ); + +#if MORTON_VERBOSE_LOG + printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, nodeDataStart: %d\n", + rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren, nodeDataStart); +#endif + + MortonFlattenedBoxlessNode flattened_node; + + if(children_roots_num != numChildren) + backpointer += children_roots_num; + + flattened_node.binary_hierarchy_index = (current_index << 6) | rootMask; + + uint loc_id = atomic_inc_local( local_p0_total ); + + flattened_node.childOffset_type = ((((child_node_offset - nodeDataStart * 64) / 64) - current_index) << 6) | BVH_INTERNAL_NODE; + flattened_node.backPointer = backpointer; + + //TODO: change this writes to L1WB or streaming + boxless_nodes[loc_id] = flattened_node; + + *InnerNode_GetBackPointer(backPointers, current_index) = backpointer; + } + + child_node_offset = sub_group_broadcast( child_node_offset, 0 ); + global_offset = sub_group_broadcast( global_offset, 0 ); + + uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset); + + sg_children.current_index = childNodes - nodeData + lane; + sg_children.parent_index = current_index; + + if ( lane < numChildren ) + { + uint write_position = (lane == 0) ? rID : global_offset + lane - 1; + records[write_position] = sg_children; + } +} + +/* + + In this phase a single large work group performs the construction of + the top of the BVH and creates a build record array. + + Two varians of this kernel: + 1. Refit with global synchronization - Used for big bvh, where number of allocated nodes will not fit + in SLM in phase2. Phase0 creates qnodes in bvh, and provides startpoints for bottom up phase + that is executed after phase1. This refit uses global synchronizations and mem_fence_gpu_invalidate + that is not effective. + 2. Refit with local synchronization - Flattened boxless nodes are passed via global memory, along with + number of created nodes. Phase0 does not create qnodes in bvh, it is done in phase2 during refit. + In phase2, flattened boxless nodes are moved to SLM, along with bounding boxes from phase1. + Refit is performed only with local synchronization. 
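+
+   For reference, the node records exchanged between the phases are bit-packed
+   dwords (see morton_common.h and the phase0 kernels below):
+
+     backPointer      = (parentIndex << 6) | (numChildren << 3) | numChildrenRefitted
+     childOffset_type = (childOffset << 6) | nodeType
+
+   e.g. MortonFlattenedBoxlessNode_GetType() masks with ((1 << 6) - 1),
+   MortonFlattenedBoxlessNode_GetChildOffset() shifts right by 6, and the refit
+   code reads the refitted-children counter from the low 3 bits of backPointer.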
+ +*/ + +__attribute__((reqd_work_group_size(512, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +parallel_build_phase0(global struct Globals *globals, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem, + global uint *global_refit_startpoints) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); + + /* a queue of build records in global memory */ + global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); + local uint local_numRecords; + local uint local_QNodeOffset; + local uint local_startpoints_num; + + /* initialize first build record */ + if (get_local_id(0) == 0) + { + /* allocate root node */ + uint root_node_offset = 64*bvh->nodeDataCur; + global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset); + + //assert(root_node_offset == 0); + records[0].nodeID = globals->binary_hierarchy_root; + records[0].items = globals->numPrimitives; + records[0].current_index = rootNode - nodeData; + records[0].parent_index = -1; + + local_numRecords = 1; + local_QNodeOffset = root_node_offset + 64; + local_startpoints_num = 0; + + mem_fence_workgroup_default(); + } + + uint num_records = 1; + + /* terminate when all subtrees are under size threshold */ + while(true) + { + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + /* all work items in the work group pick a subtree to build */ + for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() ) + { + /* small subtrees will get built in next phase */ + if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives + continue; + + /* create QBVH node */ + SUBGROUP_create_node_phase0(globals, bnodes, bvh_mem, global_refit_startpoints, ID, &local_numRecords, &local_QNodeOffset, + records, records[ID], &local_startpoints_num); + } + + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + mem_fence_workgroup_default(); + uint old_num_records = num_records; + num_records = local_numRecords; + if( old_num_records == num_records ) + break; + + } + + /* remember number of build records for next phase */ + if (get_local_id( 0 ) == 0) + { + globals->numBuildRecords = local_numRecords; + globals->p0_created_num = local_startpoints_num; + bvh->nodeDataCur = local_QNodeOffset / 64; + +#if MORTON_VERBOSE_LOG + printf("PHASE_0: allocated %d nodes. 
globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num); +#endif + } +} + +__attribute__((reqd_work_group_size(512, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +parallel_build_phase0_local_sync(global struct Globals *globals, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem, + global struct MortonFlattenedBoxlessNode *boxless_nodes) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); + uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64; + + /* a queue of build records in global memory */ + global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); + local uint local_numRecords; + local uint local_QNodeOffset; + local uint local_p0_total; + + /* initialize first build record */ + if (get_local_id(0) == 0) + { + /* allocate root node */ + uint root_node_offset = 64*bvh->nodeDataCur; + global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset); + + //assert(root_node_offset == 0); + records[0].nodeID = globals->binary_hierarchy_root; + records[0].items = globals->numPrimitives; + records[0].current_index = rootNode - nodeData; + records[0].parent_index = -1; + + local_numRecords = 1; + local_QNodeOffset = root_node_offset + 64; + local_p0_total = 0; + + mem_fence_workgroup_default(); + } + + uint num_records = 1; + + /* terminate when all subtrees are under size threshold */ + while(true) + { + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + /* all work items in the work group pick a subtree to build */ + for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() ) + { + /* small subtrees will get built in next phase */ + if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives + continue; + + /* create QBVH node */ + SUBGROUP_create_node_phase0_local_sync(globals, bnodes, bvh_mem, ID, &local_numRecords, &local_QNodeOffset, records, + records[ID], &local_p0_total, boxless_nodes, nodeDataStart); + } + + mem_fence_workgroup_default(); + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + uint old_num_records = num_records; + num_records = local_numRecords; + if( old_num_records == num_records ) + break; + + } + + /* remember number of build records for next phase */ + if (get_local_id( 0 ) == 0) + { + globals->numBuildRecords = local_numRecords; + bvh->nodeDataCur = local_QNodeOffset / 64; + + globals->p0_allocated_num = BVHBase_numNodes(bvh); + globals->p0_created_num = local_p0_total; + +#if MORTON_VERBOSE_LOG + printf("PHASE_0_LOCAL_SYNC: allocated %d nodes. 
globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->global_refit_startpoints); +#endif + } +} diff --git a/src/intel/vulkan/grl/gpu/morton/phase1.cl b/src/intel/vulkan/grl/gpu/morton/phase1.cl new file mode 100644 index 00000000000..6a1dd2aa44b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/phase1.cl @@ -0,0 +1,785 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "libs/lsc_intrinsics.h" +#include "morton/morton_common.h" + +// caution rec.local_parent_index__numItems needs to have high 16bits filled afterwards; +BuildRecordLocalMortonFlattener TranslateToLocalRecord(struct BinaryMortonCodeHierarchy srcRec) +{ + BuildRecordLocalMortonFlattener rec; + rec.leftChild = srcRec.leftChild; + rec.rightChild = srcRec.rightChild; + rec.rangeStart = srcRec.range.start; + rec.local_parent_index__numItems = (srcRec.range.end - srcRec.range.start) + 1; + return rec; +} + +GRL_INLINE BuildRecordLocalMortonFlattener MortonFlattenedBoxlessNode_reinterpret_as_BR(MortonFlattenedBoxlessNode boxless) +{ + BuildRecordLocalMortonFlattener rec; + rec.leftChild = boxless.binary_hierarchy_index; + rec.rightChild = boxless.childOffset_type; + rec.rangeStart = boxless.backPointer; + rec.local_parent_index__numItems = 0; + return rec; +} + +GRL_INLINE void SUBGROUP_create_boxless_node_phase1( + uniform global struct Globals* globals, + uniform global struct BinaryMortonCodeHierarchy* bnodes, + uniform global char* bvh_mem, + uniform BuildRecordLocalMortonFlattener currentRecord, + uniform uint currQnodeLocalId, //local index for flattened qnoode, don't mix this with nodeIndex that is in morton build record + uniform local uint* local_numRecords, + uniform uint tictoc, + uniform uint* sg_bu_startpoint_arr, + uniform uint* sg_bu_startpoint_cnt, + uniform uint parentOfRoot, + uniform bool processRoot, + uniform UPerNodeData* nodeData) +{ + varying ushort lane = get_sub_group_local_id(); + + /* initialize child array */ + uniform uint numChildren = 2; + varying struct BuildRecordLocalMortonFlattener sg_children; + sg_children.local_parent_index__numItems = 0; + + uint binary_hierarchy_child_idx = (lane == 0) ? currentRecord.leftChild : currentRecord.rightChild; + if (lane >= numChildren) binary_hierarchy_child_idx = 1 << 31; + + sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, binary_hierarchy_child_idx)); + + /* fill QBVH6 node with up to 6 children */ + while (numChildren < BVH_NODE_N6) + { + // we dont have to do "local_parent_index__numItems & 0xFFFF" because local_parent_index part is 0 here at this point + uint childNumItems = sg_children.local_parent_index__numItems; + varying bool sg_is_leaf = childNumItems <= cfg_minLeafSize; + if (sub_group_all(sg_is_leaf)) { break; } + + uniform uint bestItems = sub_group_reduce_max_N6(childNumItems); + uniform ushort bestChild = ctz(intel_sub_group_ballot(childNumItems == bestItems)); + varying uint leftOfBest = sg_children.leftChild; // val important only for (lane == bestChild), not valid for other lanes + uniform uint rightOfBest = sub_group_broadcast(sg_children.rightChild, bestChild); + + varying uint nodeID = (lane == bestChild) ? 
leftOfBest : rightOfBest; + + if (lane == numChildren || lane == bestChild) + { + sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, nodeID)); + } + + numChildren++; + } + + uniform uint global_offset; + uniform uint child_node_index; + + bool isFatleafChild = (sg_children.local_parent_index__numItems <= cfg_minLeafSize) && (lane < numChildren); + uint numFatleafChildren = popcount(intel_sub_group_ballot(isFatleafChild)); + + if (lane <= numChildren) { + uint writeIDX = 0; + + if (lane == numChildren) + { + /* create nodes in local structure, to be used later in the bottom up to create nodes in actual bvh */ + MortonFlattenedBoxlessNode flattened_node; + uint parentIDX; + + if (processRoot) + { + *local_numRecords = numChildren + 1; + child_node_index = 1; + writeIDX = 0; + flattened_node.binary_hierarchy_index = 0xFFFFFFFF; + flattened_node.childOffset_type = (1 << 6) | BVH_INTERNAL_NODE; + parentIDX = parentOfRoot; + } + else + { + uint shift = (16 * tictoc); + uint mask = 0xFFFF; + uint atomicAddVal = numChildren << shift; + child_node_index = atomic_add_local(local_numRecords, atomicAddVal); + sub_group_barrier(0); + writeIDX = currQnodeLocalId; + parentIDX = currentRecord.local_parent_index__numItems >> 16; + flattened_node.binary_hierarchy_index = 0xFFFFFFFF; + sub_group_barrier(0); + child_node_index = (child_node_index >> 16) + (child_node_index & mask); + flattened_node.childOffset_type = ((child_node_index - currQnodeLocalId) << 6) | BVH_INTERNAL_NODE; + } + +#if MORTON_VERBOSE_LOG + printf("wg %d: SUBGROUP_create_boxless_node_phase1: writeIDX %d, child_node_index %d - %d\n", get_group_id(0), writeIDX, child_node_index, child_node_index + numChildren); +#endif + flattened_node.backPointer = (parentIDX << 6) | (numChildren << 3) | numFatleafChildren; + sg_children = MortonFlattenedBoxlessNode_reinterpret_as_BR(flattened_node); + } + + child_node_index = sub_group_broadcast(child_node_index, numChildren); + + if (lane != numChildren) + { + writeIDX = child_node_index + lane; + sg_children.local_parent_index__numItems |= currQnodeLocalId << 16; + } + + nodeData[writeIDX].buildRecord = sg_children; + } + + if (numFatleafChildren == numChildren) { + uint arridx = *sg_bu_startpoint_cnt; + // GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane) + set_2xSG_arr_first_write(arridx, sg_bu_startpoint_arr, (ushort)currQnodeLocalId, lane); + *sg_bu_startpoint_cnt = arridx + 1; + } +} + +// TODO_OPT: Consider having phase 0 bucket the build records by number of primitives, and dispatch different variants +// of this kernel with different WG sizes. 
There are many records produced that generate only 1 or 2 subtrees, so 8 SGs is +// probably often wasted +GRL_INLINE void phase1_process_fatleaf( + uint globalBaseForInternalNodes, // for root node this is indexOfRoot + uint globalParent , // for root this should be parentOfRoot + bool isInstancePrimLeafType, // + uint leafPrimType, // + uint leafStride, // + global struct QBVHNodeN* nodeData, // per group + uint nodeDataStart, // + struct AABB* primref, // + BackPointers* backPointers, // + global struct MortonCodePrimitive* mc,// + uint nodesToLeafsGap, // + local union UPerNodeData* perNodeData,// + bool processRoot, // + short localNodeId, // + BuildRecordLocalMortonFlattener fatleafRecord, // per node + uint primID ) // +{ + uint lane = get_sub_group_local_id(); + uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF); + uniform uint mcID = fatleafRecord.rangeStart; + uint pseudolane = lane < numChildren ? lane : 0; + varying struct AABB sg_bounds = primref[primID]; + + uint local_parent_idx = (fatleafRecord.local_parent_index__numItems >> 16); + uint globalNodeId = globalBaseForInternalNodes + localNodeId; + uniform global struct QBVHNodeN* qnode = nodeData + globalNodeId; + + uint children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId; + + { + /* For all primitives in a fat leaf we store a back + * pointer. This way we can modify the fat leaf node at leaf construction time. */ + uint back_pointer = globalNodeId + nodeDataStart; + /* Store back pointer and primID inside morton code array to + * be later used by leaf creation. */ + mc[mcID + pseudolane].index_code = ((ulong)back_pointer) << 32 | (ulong)primID; + } + + struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&sg_bounds); + reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 ); + + uint8_t instMask; + if (isInstancePrimLeafType) + { + instMask = lane < numChildren ? PRIMREF_instanceMask(&sg_bounds) : 0; + subgroup_setInstanceQBVHNodeN(children_offset, &sg_bounds, numChildren, qnode, instMask); + instMask = sub_group_reduce_or_N6(instMask); + } + else + { + instMask = 0xFF; + subgroup_setQBVHNodeN_setFields_reduced_bounds(children_offset, leafPrimType, &sg_bounds, numChildren, instMask, qnode, false, reduce_bounds); + } + + reduce_bounds.lower.w = as_float((uint)instMask); + uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduce_bounds, 0); + local uint* boxUint = (local uint*)(perNodeData + localNodeId); + if (get_sub_group_size() == 8 || lane < 8) + { + boxUint[lane] = reduce_bounds_lane; + uint globalParentIdx; + if (processRoot) { + // for root, treeletRootGlobalIndex is index of rootsParent in global space + globalParentIdx = globalParent; + } + else { + // for non root, raw_parent_idx is in local space + globalParentIdx = (local_parent_idx > 0) ? 
(globalBaseForInternalNodes + local_parent_idx) : globalParent; + } + if (lane == 0) { + *InnerNode_GetBackPointer(backPointers, globalNodeId) = (globalParentIdx << 6) | (numChildren << 3); + } + } +} + +GRL_INLINE void perform_phase1(global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem, + local union UPerNodeData* perNodeData, + local uint* local_records_head, + local uint* local_globalOffsetForNodes, + BuildRecordLocalMortonFlattener rootRecord, + uint treeletRootGlobalIndex, + uint parentOfRootIndex, + const uint leafPrimType, + bool isInstancePrimLeafType) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + varying ushort lane = get_sub_group_local_id(); + + // array that will keep 2x8 shorts indices + varying uint sg_fatleaf_array = 0x0; + uniform uint8_t sg_fatleaf_cnt = 0; + /* terminate when all subtrees are leaves */ + + uint subgroupId = get_sub_group_id(); + uint ID = subgroupId; + + uint sg_bu_startpoints = 0; + uniform uint sg_bu_startpoints_cnt = 0; + const uint shift_mask = globals->shift_mask; + + const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64; + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + global struct QBVHNodeN* nodeData = BVHBase_nodeData(bvh); + + uint* pLeafStart = (!isInstancePrimLeafType) ? &bvh->quadLeafStart : &bvh->instanceLeafStart; + uint leafStart = *pLeafStart; + uint leafStride = (!isInstancePrimLeafType) ? 1 : (sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode)); + uint nodesToLeafsGap = leafStart - nodeDataStart; + + if (ID == 0) + { + BuildRecordLocalMortonFlattener current = rootRecord; + + if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6) + { + *local_records_head = 1; +#if MORTON_DEBUG_CHECKS + if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n"); +#endif + BuildRecordLocalMortonFlattener fatleafRecord = current; + uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF); + uint pseudolane = lane < numChildren ? 
lane : 0; + uniform const uint mcID = fatleafRecord.rangeStart; + varying uint primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask); + + phase1_process_fatleaf( + treeletRootGlobalIndex, parentOfRootIndex, isInstancePrimLeafType, leafPrimType, leafStride, + nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData, + true, 0, fatleafRecord, primID); + } + else + { +#if MORTON_VERBOSE_LOG + if (get_local_id(0) == 0) { printf("wg %d perform_phase1: starting collapsing subtree with root at node %d \n", get_group_id(0), rootIndex); } +#endif + //printf("local_records_head = %d\n", *local_records_head); + SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, 0, &sg_bu_startpoints, &sg_bu_startpoints_cnt, parentOfRootIndex, true, perNodeData); + *local_globalOffsetForNodes = treeletRootGlobalIndex; + } + + ID += get_num_sub_groups(); + } + + uniform uint priv_records_tail = 1; + + /* wait for all work items to have updated local_records array */ + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + uniform uint priv_records_head = *local_records_head & 0xFFFF; + treeletRootGlobalIndex = *local_globalOffsetForNodes; // propagated from subgroup 1 + uniform uint priv_records_tail_prev = priv_records_tail; + uniform uint other_records_head = priv_records_head; + + uint ticToc = 1; + + if (priv_records_head == priv_records_tail) + { + return; + } + else + { + do + { + for (; ID < priv_records_head; ID += get_num_sub_groups()) + { + BuildRecordLocalMortonFlattener current = (perNodeData[ID].buildRecord); + + if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6) + { + set_2xSG_arr_first_write(sg_fatleaf_cnt++, &sg_fatleaf_array, ID, lane); +#if MORTON_VERBOSE_LOG + if (lane == 0)printf("wg %d, sg %d, perform_phase1: node ID %d is fatleaf \n", get_group_id(0), get_sub_group_id(), ID); +#endif +#if MORTON_DEBUG_CHECKS + if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n"); +#endif + } + else + { + SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, ticToc, &sg_bu_startpoints, &sg_bu_startpoints_cnt, 0, 0, perNodeData); + } + } + + priv_records_tail = priv_records_head; + /* wait for all work items to have updated local_records array */ + work_group_barrier(CLK_LOCAL_MEM_FENCE); + { + uint records_as_in_mem = *local_records_head; + priv_records_head = (records_as_in_mem >> (16 * ticToc)) & 0xFFFF; + uint other_records_head_temp = priv_records_head; + priv_records_head += other_records_head; + other_records_head = other_records_head_temp; + ticToc = ticToc ^ 1; +#if MORTON_VERBOSE_LOG + if(get_local_id(0) == 0)printf("wg %d, perform_phase1: priv_records_tail %d, priv_records_head %d, records_as_in_mem %x\n", get_group_id(0), get_sub_group_id(), priv_records_tail, priv_records_head, records_as_in_mem); +#endif + } + } while (priv_records_tail != priv_records_head); // get out of the loop if the tail reached the head + } + + bool atomicNodeAllocation = treeletRootGlobalIndex > 0; + bool atomicNodeAllocationProduce = (get_sub_group_id() + lane == 0) && atomicNodeAllocation; + uint singleTreeletBumpBVHnodeCnt = (!atomicNodeAllocation && (get_sub_group_id() + lane == 0)) ? 
nodeDataStart + priv_records_tail : 0; + + uniform uint globalBaseForInternalNodes = 0; + + // we distinguish multi treelet from single treelets here by looking on our treeletRootGlobalIndex + // if treelets root is whole tree root (treeletRootGlobalIndex==0) then we are the only treelet so + // there's no need to synchronize multiple treelets nodes allocations with atomics. + if (atomicNodeAllocationProduce) + { + *local_globalOffsetForNodes = allocate_inner_nodes(bvh, priv_records_tail - 1); + } + + // because, root is allocated elsewhere, and first node placed in global mem is node with local index 1 + // mapping local to global: + // local space global space + // [0] - treelet root [treeletRootGlobalIndex] + // ... possibly very long distance ... + // [1] - first non root [globalBaseForInternalNodes + 1] - this index is returned by atomic allocator above + // [2] - first [globalBaseForInternalNodes + 2] + // ... + // [numToAllocate] - last node [globalBaseForInternalNodes + 3] + if (atomicNodeAllocation) + { + work_group_barrier(CLK_LOCAL_MEM_FENCE); + globalBaseForInternalNodes = *local_globalOffsetForNodes -(nodeDataStart+1); + } + +#if MORTON_VERBOSE_LOG + if (get_local_id(0) == 0) { printf("wg %d perform_phase1: globalBaseForInternalNodes %d, num local nodes %d\n", get_group_id(0), globalBaseForInternalNodes, priv_records_tail - 1); } +#endif + + if (sg_fatleaf_cnt) + { + short localNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane); + //if (localNodeId >= MORTON_BUILDER_SUBTREE_THRESHOLD * 2) continue; + //if(local_startpoints_cnt > 1) return; + BuildRecordLocalMortonFlattener fatleafRecord = perNodeData[localNodeId].buildRecord; + + varying uint primID; + { + uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF); + uint pseudolane = lane < numChildren ? lane : 0; + uniform const uint mcID = fatleafRecord.rangeStart; + primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask); + } + + // process fatleafs, and store their boxes to SLM + // also put startpoints for bottom up + //uint fatleaf_cnt = *local_startpoints_cnt; + while (sg_fatleaf_cnt-- > 1) + { + short nextLocalNodeId = get_from_2xSG_arr(sg_fatleaf_cnt-1, sg_fatleaf_array, lane); + BuildRecordLocalMortonFlattener nextfatleafRecord = perNodeData[nextLocalNodeId].buildRecord; + varying uint nextPrimId; + + { + uint numChildren = (nextfatleafRecord.local_parent_index__numItems & 0xFFFF); + uint pseudolane = lane < numChildren ? lane : 0; + uniform const uint mcID = nextfatleafRecord.rangeStart; + nextPrimId = (uint)(mc[mcID + pseudolane].index_code & shift_mask); + } + + phase1_process_fatleaf( + globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride, + nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData, + false, localNodeId, fatleafRecord, primID); + + fatleafRecord = nextfatleafRecord; + localNodeId = nextLocalNodeId; + primID = nextPrimId; + } + + phase1_process_fatleaf( + globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride, + nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData, + false, localNodeId, fatleafRecord, primID); + } + +#if 0 + // put collected bottom-up startpoints to wg shared array to later distribute the work evenly accross the groups. 
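+    // In this disabled path, each subgroup's lane 0 would reserve a contiguous slot range with
+    // atomic_add_local and broadcast the base offset to its other lanes before unpacking its
+    // collected startpoints. For example, with 10 collected startpoints and 4 subgroups, the
+    // redistribution in the second disabled block below gives base chunks of 10 / 4 = 2 plus a
+    // remainder of 2, so subgroups 0..3 read 3, 3, 2 and 2 startpoints at offsets 0, 3, 6 and 8.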
+ { + ushort myStartpointWriteSite = 0; + + if (lane == 0) + { + myStartpointWriteSite = atomic_add_local((local uint*)local_startpoints_cnt, (ushort)sg_bu_startpoints_cnt); + } + myStartpointWriteSite = sub_group_broadcast(myStartpointWriteSite, 0); + + unpack_from_2xSG_arr(sg_bu_startpoints_cnt, sg_bu_startpoints, lane, local_startpoints_arr + myStartpointWriteSite); + } +#endif + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + // distribute bottom-up startpoints +#if 0 + { + short sp_count_to_divide = (*local_startpoints_cnt); + + //calculate the chunk for each sg. + sg_bu_startpoints_cnt = sp_count_to_divide / get_num_sub_groups(); + uint sg_bu_startpoints_cnt_reminder = sp_count_to_divide % get_num_sub_groups(); + + uint myReadSite = get_sub_group_id() * sg_bu_startpoints_cnt; + if (get_sub_group_id() < sg_bu_startpoints_cnt_reminder) { + //from the reminder elements if sg idx is < sg_bu_startpoints_cnt_reminder then sg gets one extra idx + // and all sgs before it also have one extra + myReadSite += get_sub_group_id(); + sg_bu_startpoints_cnt++; + } + else + { + // all reminder elements are consummed by previous sgs + myReadSite += sg_bu_startpoints_cnt_reminder; + } + + pack_from_2xSG_arr(local_startpoints_arr + myReadSite, sg_bu_startpoints_cnt, &sg_bu_startpoints, lane); + } +#endif + + SUBGROUP_refit_bottom_up_local(nodeData, backPointers, treeletRootGlobalIndex, globalBaseForInternalNodes, lane, perNodeData, sg_bu_startpoints, sg_bu_startpoints_cnt); + + if (singleTreeletBumpBVHnodeCnt) + { + bvh->nodeDataCur = singleTreeletBumpBVHnodeCnt; + } +} + +GRL_INLINE void update_empty_blas(global struct BVHBase* bvh, uint leafPrimType) +{ + if (get_sub_group_id() == 0 ) + { + global struct QBVHNodeN* qnode = BVHBase_nodeData(bvh); + BackPointers* backPointers = BVHBase_GetBackPointers(bvh); + + //set required fields to mark that blas is empty + uint k = (get_sub_group_local_id() < BVH_NODE_N6) ? get_sub_group_local_id() : 0; + qnode->type = leafPrimType; + qnode->instMask = 0; + qnode->qbounds.lower_x[k] = 0x80; + qnode->qbounds.upper_x[k] = 0; + + *InnerNode_GetBackPointer(backPointers, 0) = (((uint)-1) << 6); + } +} + +/* + + POSTSORT PHASE1: + Two kernels here, selected by MORTON_BUILDER_SUBTREE_THRESHOLD. + 1. parallel_build_phase1_Indirect_SG - record[0] is set to the subtree tip + 2. 
parallel_build_phase1_Indirect_global_root - record[0] is set to the bvh root (no phase2 needed afterwards) + +*/ + +__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_phase1_Indirect_SG( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + const uint leafPrimType = globals->leafPrimType; + + //special case for empty blas + if(globals->numPrimitives == 0) + { + bvh->nodeDataCur = BVH_ROOT_NODE_OFFSET / 64 + 1; + update_empty_blas(bvh, leafPrimType); + return; + } + + local union UPerNodeData perNodeData[(MORTON_BUILDER_SUBTREE_THRESHOLD * 2) -1]; + local uint local_records_head; + // Two separate SLM variables for local_globalOffsetForNodes to remove one of the barriers + local uint local_globalOffsetForNodes, local_globalOffsetForNodes2; + + uint rootIndex = 0; + uint parentOfRoot = 0; + BuildRecordLocalMortonFlattener rootBuildRecord; + + /* add start build record to local stack */ + if (get_sub_group_id() == 0 ) + { + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64 * bvh->quadLeafStart); + uint recordID = get_group_id(0); + struct BuildRecordMorton mortonGlobalRecord = records[recordID]; + + rootBuildRecord = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, mortonGlobalRecord.nodeID)); + + parentOfRoot = mortonGlobalRecord.parent_index; + rootIndex = mortonGlobalRecord.current_index; + +#if MORTON_VERBOSE_LOG + printf("P1_STARTPOINTS: current_index: %d, buildRecord.numItems: %d, buildRecord.binary_hierarchy_index: %d, buildRecord.local_parent_index: %d\n", + local_globalOffsetForNodes, buildRecord.numItems, buildRecord.binary_hierarchy_index, buildRecord.local_parent_index); +#endif + } + + if (leafPrimType == NODE_TYPE_INSTANCE) + { + perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, + &local_records_head, &local_globalOffsetForNodes, + rootBuildRecord, rootIndex, parentOfRoot, NODE_TYPE_INSTANCE, true); + } + else + { + perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, + &local_records_head, &local_globalOffsetForNodes, + rootBuildRecord, rootIndex, parentOfRoot, leafPrimType, false); + } + +} + +__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_phase1_Indirect_global_root( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + const uint leafPrimType = globals->leafPrimType; + const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64; + + bvh->nodeDataCur = nodeDataStart + 1; + + //special case for empty blas + if(globals->numPrimitives == 0) + { + update_empty_blas(bvh, leafPrimType); + return; + } + + local union UPerNodeData perNodeData[MORTON_BUILDER_SUBTREE_THRESHOLD * 2 - 1]; + local uint local_records_head; + local uint local_globalOffsetForNodes; + + BuildRecordLocalMortonFlattener rootBuildRecord; + + if (get_sub_group_id() == 0 ) + { + struct BinaryMortonCodeHierarchy binaryNode = BinaryMortonCodeHierarchy_getEntry(bnodes, globals->binary_hierarchy_root); + + rootBuildRecord = TranslateToLocalRecord(binaryNode); + + local_globalOffsetForNodes = 0; + } 
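+    // The duplicated calls below mirror parallel_build_phase1_Indirect_SG; presumably the branch
+    // exists so that leafPrimType / isInstancePrimLeafType are compile-time constants inside
+    // perform_phase1 (e.g. the instance path uses a leaf stride of
+    // sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode) instead of 1).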
+ + if (leafPrimType == NODE_TYPE_INSTANCE) + { + perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, + &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, NODE_TYPE_INSTANCE, true); + } + else + { + perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, + &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, leafPrimType, false); + + } +} + +#if 0 +GRL_INLINE void +DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem, + uint startID, uint endID, + local uint* local_numRecords, + local uint* local_numRecordsOld, + local struct BuildRecordMorton* local_records +) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64); + + /* iterate over all subtrees this workgroup should build */ + for ( uint recordID = startID; recordID < endID; recordID++ ) + { + /* add start build record to local stack */ + if ( get_local_id( 0 ) == 0 ) + { + local_records[0] = records[recordID]; + *local_numRecords = 1; + *local_numRecordsOld = 0; + } + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + /* terminate when all subtrees are leaves */ + while ( *local_numRecords != *local_numRecordsOld ) + { + /* remember the old number of build records to detect later + * whether we are done */ + if ( get_local_id( 0 ) == 0 ) + { + *local_numRecordsOld = *local_numRecords; + } + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + /* all work items in the sub group pick a subtree to build */ + for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) ) + { + /* ignore small subtrees */ + if ( local_records[ID].items <= BVH_NODE_N6 ) + continue; + + /* create QBVH node */ + create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] ); + } + + /* wait for all work items to have updated local_records array */ + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + } + + const uint shift_mask = globals->shift_mask; + const uint leafPrimType = globals->leafPrimType; + const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; + BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + + /* create all fat leaf nodes and initiate refit */ + for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) ) + { + struct BuildRecordMorton current = local_records[ID]; + const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID ); + + global struct QBVHNodeN* qnode = nodeData + current.current_index; + + /* get bounds of all children of the fat leaf node */ + struct AABB bounds[BVH_NODE_N6]; + for ( uint i = 0; i < current.items; i++ ) + { + /* get primID and bounds of primitive */ + const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask); + bounds[i] = primref[primID]; + + /* For all primitives in a fat leaf we store a back + * pointer. This way we can modify the fat leaf node at leaf construction time. */ + const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem; + + /* Store back pointer and primID inside morton code array to + * be later used by leaf creation. 
*/ + mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID; + } + + /* update fat leaf node */ + QBVHNodeN_setType( qnode, leafPrimType ); + global void* offset; + if ( leafPrimType != BVH_INSTANCE_NODE ) + { + offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad ); + QBVHNodeN_setChildIncr1( qnode ); + } + else + { + offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf ); + QBVHNodeN_setChildIncr2( qnode ); + } + QBVH6Node_set_offset( qnode, offset ); + QBVHNodeN_setBounds( qnode, bounds, current.items ); + + /* set back pointers for fat leaf nodes */ + *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3); + + /* bottom up refit */ + refit_bottom_up( qnode, bvh, bounds, current.items ); + } + } +} + +/* + + This phase takes the build records calculated in phase0 as input and + finished the BVH construction for all these subtrees. + +*/ +__attribute__((reqd_work_group_size(8, 1, 1))) +old_parallel_build_phase1(global struct Globals *globals, + global struct MortonCodePrimitive *mc, + global struct AABB *primref, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); + + /* a queue of build records */ + local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; + local uint local_numRecords; + local uint local_numRecordsOld; + + /* construct range of build records that each sub group will process */ + const uint numRecords = globals->numBuildRecords; + const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0); + + DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); + +} + +__attribute__( (reqd_work_group_size( 8, 1, 1 )) ) +old_parallel_build_phase1_Indirect( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem ) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart); + + /* a queue of build records */ + local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; + local uint local_numRecords; + local uint local_numRecordsOld; + + /* construct range of build records that each sub group will process */ + const uint numRecords = globals->numBuildRecords; + uint startID = get_group_id( 0 ); + uint endID = startID + 1; + + DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); + +} +#endif diff --git a/src/intel/vulkan/grl/gpu/morton/phase2.cl b/src/intel/vulkan/grl/gpu/morton/phase2.cl new file mode 100644 index 00000000000..e82d22aaacf --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/phase2.cl @@ -0,0 +1,314 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "bvh_build_refit.h" +#include "libs/lsc_intrinsics.h" +#include "morton/morton_common.h" + +/* + + POSTSORT PHASE2: + Two kernels here, selected by 
MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD whish is set to very big value. + 1. parallel_build_phase2_refit - performs refit using global synchronization and mem_fence_gpu_invalidate. + This kernel should be used only for very big bvh, it is faster than non-SLM fallback + in parallel_build_phase2_refit_local. + 2. parallel_build_phase2_refit_local - should be used for most of the cases, we usually fit into SLM with the number of + nodes allocated in phase0, but there is also non-SLM fallback there, as the + decision on which kernel to run is based on the nodes estimates on the host + side. + +*/ + + +GRL_INLINE void refit_bottom_up_global_sync( + global char* bvh_mem, + global uint* global_refit_startpoints, + uniform uint nodeId, + uniform ushort lane) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + + BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + + // Get the node idx that was put here in phase1 + const uint innerNodeIdx = global_refit_startpoints[nodeId]; + + // Get the qnode and backpointer + uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx; + uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); + + varying struct AABB childrenAABB; // one child AABB per lane + AABB_init(&childrenAABB); + + uniform uint numChildren = (backPointer >> 3) & 0x7; + if(numChildren == 0) return; + + global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode ); + varying ushort child_idx = (lane < numChildren) ? lane : 0; + childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx ); + +#if MORTON_VERBOSE_LOG + if(lane == 0) + printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx); +#endif + + struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB ); + reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 ); + + subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane); + + uint children_mask = qnode_child[child_idx].instMask; + qnode->instMask = sub_group_reduce_or_N6(children_mask); + + SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 ); +} + +__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel +parallel_build_phase2_refit( global char* bvh_mem, + global uint* global_refit_startpoints ) +{ + refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0)); +} + + +GRL_INLINE void SUBGROUP_refit_bottom_up_global( + uniform global struct QBVHNodeN* globalNodeData, + uniform struct BackPointers* backPointers, + varying ushort lane, + varying uint curNodeIndex) +{ + uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex); + + const uint head_lane = 0; + uniform struct AABB child_aabb; // this carries reduced aabb between loop turns + + while (curNodeIndex != 0) + { + global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex; + global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode ); + uint numChildren = BackPointer_GetNumChildren(backpointer); + + varying ushort child_idx = (lane < numChildren) ? 
lane : 0; + child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx ); + + struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb); + reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane); + + /* get bounds of all children from child nodes directly */ + subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane); + + uchar childrenMask = qnode_child[child_idx].instMask; + qnode->instMask = sub_group_reduce_or_N6(childrenMask); + + uint parentIndex = BackPointer_GetParentIndex(backpointer); + + mem_fence_gpu_invalidate(); + + if (lane == 0) + { + backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex)); + + uint globalBackpointer = (parentIndex << 6) | (numChildren << 3); + + /* set global back pointer */ + *InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer; + +#if MORTON_VERBOSE_LOG + printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n", + curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x); +#endif + } + + backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane); + curNodeIndex = parentIndex; + + /* if all children got refitted, then continue */ + uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7; + uniform uint numChildrenTotal = (backpointer >> 3) & 0x7; + + if (numChildrenRefitted != numChildrenTotal) + return; + } + + // process root of the treelet + { + +#if MORTON_DEBUG_CHECKS + if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n"); +#endif + + global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData ); + uint numChildren = BackPointer_GetNumChildren(backpointer); + + varying ushort child_idx = (lane < numChildren) ? 
lane : 0; + child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx ); + + struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb); + reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane); + + /* get bounds of all children from child nodes directly */ + subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane); + + uchar childrenMask = qnode_child[child_idx].instMask; + globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask); + + /* reset refit counter for next refit */ + if (lane == 0) + { + /* set global back pointer */ + *InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u); + +#if MORTON_VERBOSE_LOG + printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n", + curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt); +#endif + } + } +} + + +// TODO: Check why 512 wg size has worse performance than 256 +__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +parallel_build_phase2_refit_local( global struct Globals* globals, + global char* bvh_mem, + global struct MortonFlattenedBoxlessNode *boxless_nodes) +{ + // Number of nodes created in P0, to be refitted in this stage + uint p0_created_num = globals->p0_created_num; + + // Return immediately if host executed this kernel but there is nothing to do + if(p0_created_num == 0) + return; + + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); + global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + varying ushort lane = get_sub_group_local_id(); + + // Hardcode SLM to max here as we do not know upfront how much mem will be needed + local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */ + + // Number of allocated nodes in phase0 (p0_created_num + children) + uint p0_allocated_num = globals->p0_allocated_num; + + // array that will keep 2x8 shorts indices + varying uint sg_fatleaf_array = 0x0; + uniform uint8_t sg_bu_startpoints_cnt = 0; + + // Determine if we can fit into SLM with all the nodes allocated in phase0, + // There are two paths here: + // 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse bottom up local, + // which does refit nad creates qnodes in bvh + // 2. If not fit into SLM, first create qnodes in bvh, and perform bottom up refit with global atomics synchronization. + // It is not performant to do so, keep it as a guardrail here. On the host side we do fallback + // to the old refit separated path, with wg_size 8 with better EU reuse. 
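+    // The unpacking used in both paths below implies the phase0 packing
+    //     binary_hierarchy_index = (node_id << 6) | children_root_mask,
+    // with one flag bit per child in the low 6 bits. As a made-up example,
+    // (37 << 6) | 0x5 = 2373 decodes to node 37 whose children 0 and 2 are phase1 subtree roots.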
+ if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM) + { + for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() ) + { + MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID]; + uint current_id = boxless_node.binary_hierarchy_index >> 6; + + // Put the mask for the children that are subtree roots in the binary_hierarchy_index that is unused + uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F); + + if(lane == 0) + perNodeData[current_id].boxlessNode = boxless_node; + + // When no children are subtree roots, we are done and skip to the next iteration + if(children_root_mask == 0x0) + { + continue; + } + // When all children are subtree roots, put them to sg_fatleaf_array + else if(children_root_mask == 0x3F) + { + set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane); + } + + uniform global struct QBVHNodeN* qnode = nodeData + current_id; + + uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7; + uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node); + varying ushort child_idx = (lane < numChildren) ? lane : 0; + + varying struct AABB childrenAABB; // one child AABB per lane + AABB_init(&childrenAABB); + + uint lead_child_global_id = current_id + lead_child_offset; + + uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id; + childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx ); + + // Get only AABBs of children that are p1 subtree roots + bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx); + if(lane_active) + { + uint child_global_id = lead_child_global_id + child_idx; + perNodeData[child_global_id].box = childrenAABB; + perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask); + } + +#if MORTON_VERBOSE_LOG + if(lane == 0) + printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset); +#endif + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt); + } + else + { + for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() ) + { + MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID]; + uint current_id = boxless_node.binary_hierarchy_index >> 6; + + // Put the mask for the children that are subtree roots in the binary_hierarchy_index that is unused + uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F); + uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7; + + uniform global struct QBVHNodeN* qnode = nodeData + current_id; + uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node); + uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node); + + SUBGROUP_QBVHNodeN_setChildIncr1( qnode ); + if(lane == 0) + { + QBVH6Node_set_type( qnode, nodeType ); + qnode->offset = lead_child_offset; + } + + // When no children are subtree roots, we are done and skip to the next iteration + if(children_root_mask == 0x0) + { + continue; + } + // When all children are subtree roots, put them to sg_fatleaf_array + else if(children_root_mask == 0x3F) + { + set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane); + } + +#if MORTON_VERBOSE_LOG + if(lane == 
0) + printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, child_idx: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, boxless_node.backPointer >> 6, reduce_bounds.lower.x, qnode->offset); +#endif + } + + while (sg_bu_startpoints_cnt > 0) + { + uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane); + + SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex); + } + } +} diff --git a/src/intel/vulkan/grl/gpu/morton/post_sort.cl b/src/intel/vulkan/grl/gpu/morton/post_sort.cl new file mode 100644 index 00000000000..c13762438a3 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/post_sort.cl @@ -0,0 +1,521 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "libs/lsc_intrinsics.h" +#include "morton/morton_common.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////////// +/* + + This kernel constructs a binary hierarchy in bottom up fashion from + the morton codes. + +*/ +//////////////////////////////////////////////////////////////////////////////////////////////////////// + +int Delta(global struct MortonCodePrimitive* mc, const uint64_t key0, const uint i1 ) +{ + const uint64_t key1 = mc[i1].index_code; + return clz(key0 ^ key1); +} + +int sign( int d ) +{ + return (d > 0) ? 1 : -1; +} + +__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) ) +void kernel build_bottom_up_indirect( global struct Globals* globals, + global struct BinaryMortonCodeHierarchy* bnodes, + global struct MortonCodePrimitive* mc ) +{ + /* construct range of primitives that each work group will process */ + const uint numPrimitives = globals->numPrimitives; + + uint i = get_group_id( 0 ) * get_local_size(0) + get_local_id( 0 ); + + if (i == 0) + { + globals->binary_hierarchy_root = 0; + if (numPrimitives == 1) + { + // special kludge for 1-prim tree. Make sure the one leaf node is initialized + bnodes[i].range.start = 0; + bnodes[i].range.end = 0; + bnodes[i].leftChild = -1; + bnodes[i].rightChild = -1; + } + + // store pointer to the binary hierarchy in the globals struct. + // This will be used + globals->binary_hierarchy_buffer = (gpuva_t) bnodes; + } + + uint num_inner_nodes = numPrimitives-1; + if ( i < num_inner_nodes ) + { + // + // direction is 1 if this morton code is the node's first key, -1 if it's the last + // By construction every internal node is either the start or the end of a given key range + // direction should be towards the neighbor with the most bits in common + + uint64_t ki = mc[i].index_code; + + int direction, delta_min; + uint lmax; + if( i == 0 ) + { + direction = 1; + delta_min = -1; + lmax = numPrimitives; + } + else + { + direction = sign( Delta( mc, ki, i + 1 ) - Delta( mc, ki, i - 1 ) ); + delta_min = Delta( mc, ki, i - direction ); + + // find upper bound for length of this node's key range + lmax = 8; + while ( (i+lmax*direction) < numPrimitives && Delta( mc, ki, i+lmax*direction ) > delta_min) + lmax = lmax * 2; + } + + // clamp max length so that the binary searches are fully in-bounds + uint maxLen = (direction>0) ? 
(numPrimitives - i) : (i+1); + lmax = min(lmax, maxLen); + + // find end of range using binary search + uint length = 0; + uint end = lmax-1; + while (length != end) + { + uint mid = length + ((end-length)/2) + ((end-length)%2); + bool bigger = Delta( mc, ki, i+mid*direction) > delta_min; + length = bigger ? mid : length; + end = bigger ? end : mid-1; + } + uint j = i + length*direction ; + + // find split position using binary search + uint split = 0; + end = length-1; + int delta_node = Delta(mc, ki, j); + while (split != end) + { + uint mid = split + ((end-split)/2) + ((end-split)%2); + bool bigger = Delta( mc, ki, i+mid*direction) > delta_node; + split = bigger ? mid : split; + end = bigger ? end : mid-1; + } + split = i + split*direction + min(direction,0); + + uint left = split; + uint right = split+1; + + // mark leaves + if( min(i,j) == split ) + left = left | (1<<31); + if( max(i,j) == split+1 ) + right = right | (1<<31); + + bnodes[i].range.start = min(i,j); + bnodes[i].range.end = max(i,j); + bnodes[i].leftChild = left; + bnodes[i].rightChild = right; + } +} + + + + + +#if 0 +__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) ) +void kernel build_bottom_up_indirect( global struct Globals* globals, + global struct BinaryMortonCodeHierarchy* bnodes, + global struct MortonCodePrimitive* mc ) +{ + /* construct range of primitives that each work group will process */ + const uint numPrimitives = globals->numPrimitives; + + // RangeFactor determines the distance between adjacent nodeIds in work group. + // The aim of the nodes distribution within work group, for rangeFactor > 1 + // is to be sure that half of the work groups will entirelly be dropped off + // at the bottom layer of the graph. This way the EUs can be reused faster. + // The factor needs to be smaller than MAX_HW_SIMD_WIDTH + const uint rangeFactor = 2; + + const uint numGroups = ((numPrimitives + MAX_HW_SIMD_WIDTH - 1) / MAX_HW_SIMD_WIDTH); + const uint globalId = get_group_id( 0 ) * MAX_HW_SIMD_WIDTH + get_local_id( 0 ); + const uint numPrimitivesAlignedToWGSize = MAX_HW_SIMD_WIDTH * numGroups; + const uint groupsRange = numPrimitivesAlignedToWGSize / rangeFactor; + + /* iterate over all primitives the work group should process */ + const uint i = (globalId * rangeFactor) % numPrimitivesAlignedToWGSize + globalId / groupsRange; + + if ( i < numPrimitives ) + { + uint node = i | ((uint)1 << 31); + uint start = i; + uint end = i; + + /* bottom up */ + while ( true ) + { + /* goto parent node and link parent node to current node */ + node = updateParent( bnodes, mc, node, start, end, numPrimitives - 1 ); + + /* do not continue if we reached this node the first time */ + if ( node == -1 ) + break; + + mem_fence_gpu_invalidate(); + + /* update range */ + start = bnodes[node].range.start; + end = bnodes[node].range.end; + + /* stop when we reached the root node */ + if ( start == 0 && end == numPrimitives - 1 ) + { + globals->binary_hierarchy_root = node; + break; + } + } + } +} + +#endif + +/* + + This function builds one QBVH6 node by opening the provided binary + BVH nodes until the QBVH node is full. 
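+
+ As a sketch of the opening order (with made-up item counts): starting from the two binary
+ children {9, 5}, the largest splittable child is opened at each step, e.g.
+ {9, 5} -> {4, 5, 5} -> {4, 2, 5, 3} -> ..., until either BVH_NODE_N6 children exist or every
+ remaining child holds at most cfg_minLeafSize primitives and cannot be split further.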
+ + */ + +GRL_INLINE void create_node(global struct Globals *globals, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem, + uint rID, + local uint *local_numRecords, + local uint *local_QNodeOffset, + struct BuildRecordMorton *records, + struct BuildRecordMorton *current) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; + global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); + BackPointers *backPointers = BVHBase_GetBackPointers(bvh); + + /* initialize child array */ + uint numChildren = 2; + struct BuildRecordMorton children[BVH_NODE_N6]; + children[0].nodeID = bnodes[current->nodeID].leftChild; + children[0].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[0].nodeID); + children[1].nodeID = bnodes[current->nodeID].rightChild; + children[1].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[1].nodeID); + + /* fill QBVH6 node with up to 6 children */ + while (numChildren < BVH_NODE_N6) + { + /*! find best child to split */ + uint bestItems = 0; + int bestChild = -1; + for (int i = 0; i < numChildren; i++) + { + const uint items = children[i].items; + + /* ignore leaves as they cannot get split */ + if (items <= cfg_minLeafSize) + continue; + + /* find child with largest number of items */ + if (items > bestItems) + { + bestItems = items; + bestChild = i; + } + } + if (bestChild == -1) + break; + + /* perform best found split */ + const uint bestNodeID = children[bestChild].nodeID; + struct BuildRecordMorton *lrecord = &children[bestChild]; + struct BuildRecordMorton *rrecord = &children[numChildren]; + lrecord->nodeID = bnodes[bestNodeID].leftChild; + lrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, lrecord->nodeID); + rrecord->nodeID = bnodes[bestNodeID].rightChild; + rrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, rrecord->nodeID); + numChildren++; + } + + /* allocate memory for all children */ + const uint child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren); + global struct QBVHNodeN *childNodes = (global struct QBVHNodeN *)(bvh_mem + child_node_offset); + + /* create node, but to not set bounds yet as these get calculated during refit */ + const uint current_index = current->current_index; + struct QBVHNodeN *qnode = nodeData + current_index; + QBVH6Node_set_type(qnode, BVH_INTERNAL_NODE); + QBVHNodeN_setChildIncr1(qnode); + QBVH6Node_set_offset(qnode, childNodes); + + /* set back pointers */ + *InnerNode_GetBackPointer(backPointers, current_index) = (current->parent_index << 6) | (numChildren << 3); + + /* update parent pointer of build records of all children */ + for (uint ID = 0; ID < numChildren; ID++) + { + children[ID].current_index = childNodes - nodeData + ID; + children[ID].parent_index = current_index; + } + + /* write out child build records */ + const uint global_offset = atomic_add_local(local_numRecords, numChildren - 1); + records[rID] = children[0]; + + for (uint i = 1; i < numChildren; i++) + records[global_offset + i - 1] = children[i]; + + mem_fence_workgroup_default(); + +} + +#if 0 +/* This function calculates the similarity between two morton + * codes. It essentially counts how many bits of the morton codes are + * equal starting at the top. The more bits are equal, the similar the + * codes, and the closer the primitives are located spatially. 
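+
+ For example, for the 64-bit keys 0x0123456789ABCDEF and 0x0123456789ABCDE0 the XOR is 0xF and
+ clz of that is 60, i.e. the two codes agree in their top 60 bits, so the corresponding
+ primitives are treated as spatially very close.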
*/ + +GRL_INLINE uint64_t delta(global struct MortonCodePrimitive *mc, + const uint id) +{ + const uint64_t key0 = mc[id + 0].index_code; + const uint64_t key1 = mc[id + 1].index_code; + return clz(key0 ^ key1); +} + + + +/* This function checks for a range [left,right] of morton codes, if + * it is spatially closer to the left or to the right nodes. */ + +GRL_INLINE bool merge_to_right(global struct MortonCodePrimitive *mc, + const uint left, + const uint right, + const uint last) +{ + /* merge to right if we are at the left end of the array */ + if (left == 0) + return true; + + /* merge to left if we are at the right end of the array */ + if (right == last) + return false; + + /* otherwise merge to the side where the morton code sequence has + * the largest number of equal bits from the top */ + return delta(mc, right) > delta(mc, left - 1); +} + +GRL_INLINE uint updateParent(global struct BinaryMortonCodeHierarchy *bnodes, + global struct MortonCodePrimitive *mc, + const uint nodeID, + const uint left, + const uint right, + const uint last) +{ + uint parent; + + /* check if we should merge this node to the left or right */ + if (merge_to_right(mc, left, right, last)) + { + parent = right; + bnodes[parent].leftChild = nodeID; + bnodes[parent].range.start = left; + } + else + { + parent = left - 1; + bnodes[parent].rightChild = nodeID; + bnodes[parent].range.end = right; + } + + mem_fence_gpu_default(); + + /* stop ascending the tree if we reached this node the first time */ + const bool first = atomic_inc_global((global uint *)&bnodes[parent].flag) == 0; + return first ? -1 : parent; +} + +GRL_INLINE void +DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem, + uint startID, uint endID, + local uint* local_numRecords, + local uint* local_numRecordsOld, + local struct BuildRecordMorton* local_records +) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64); + + /* iterate over all subtrees this workgroup should build */ + for ( uint recordID = startID; recordID < endID; recordID++ ) + { + /* add start build record to local stack */ + if ( get_local_id( 0 ) == 0 ) + { + local_records[0] = records[recordID]; + *local_numRecords = 1; + *local_numRecordsOld = 0; + } + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + /* terminate when all subtrees are leaves */ + while ( *local_numRecords != *local_numRecordsOld ) + { + /* remember the old number of build records to detect later + * whether we are done */ + if ( get_local_id( 0 ) == 0 ) + { + *local_numRecordsOld = *local_numRecords; + } + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + + /* all work items in the sub group pick a subtree to build */ + for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) ) + { + /* ignore small subtrees */ + if ( local_records[ID].items <= BVH_NODE_N6 ) + continue; + + /* create QBVH node */ + create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] ); + } + + /* wait for all work items to have updated local_records array */ + work_group_barrier( CLK_LOCAL_MEM_FENCE ); + } + + const uint shift_mask = globals->shift_mask; + const uint leafPrimType = globals->leafPrimType; + const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; + BackPointers* backPointers = 
BVHBase_GetBackPointers( bvh ); + global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); + + /* create all fat leaf nodes and initiate refit */ + for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) ) + { + struct BuildRecordMorton current = local_records[ID]; + const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID ); + + global struct QBVHNodeN* qnode = nodeData + current.current_index; + + /* get bounds of all children of the fat leaf node */ + struct AABB bounds[BVH_NODE_N6]; + for ( uint i = 0; i < current.items; i++ ) + { + /* get primID and bounds of primitive */ + const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask); + bounds[i] = primref[primID]; + + /* For all primitives in a fat leaf we store a back + * pointer. This way we can modify the fat leaf node at leaf construction time. */ + const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem; + + /* Store back pointer and primID inside morton code array to + * be later used by leaf creation. */ + mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID; + } + + /* update fat leaf node */ + QBVHNodeN_setType( qnode, leafPrimType ); + global void* offset; + if ( leafPrimType != BVH_INSTANCE_NODE ) + { + offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad ); + QBVHNodeN_setChildIncr1( qnode ); + } + else + { + offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf ); + QBVHNodeN_setChildIncr2( qnode ); + } + QBVH6Node_set_offset( qnode, offset ); + QBVHNodeN_setBounds( qnode, bounds, current.items ); + + /* set back pointers for fat leaf nodes */ + *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3); + + /* bottom up refit */ + refit_bottom_up( qnode, bvh, bounds, current.items ); + } + } +} + +/* + + This phase takes the build records calculated in phase0 as input and + finished the BVH construction for all these subtrees. 
+ +*/ +__attribute__((reqd_work_group_size(8, 1, 1))) +old_parallel_build_phase1(global struct Globals *globals, + global struct MortonCodePrimitive *mc, + global struct AABB *primref, + global struct BinaryMortonCodeHierarchy *bnodes, + global char *bvh_mem) +{ + global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; + global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); + + /* a queue of build records */ + local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; + local uint local_numRecords; + local uint local_numRecordsOld; + + /* construct range of build records that each sub group will process */ + const uint numRecords = globals->numBuildRecords; + const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0); + const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0); + + DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); + +} + +__attribute__( (reqd_work_group_size( 8, 1, 1 )) ) +old_parallel_build_phase1_Indirect( global struct Globals* globals, + global struct MortonCodePrimitive* mc, + global struct AABB* primref, + global struct BinaryMortonCodeHierarchy* bnodes, + global char* bvh_mem ) +{ + global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; + global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart); + + /* a queue of build records */ + local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; + local uint local_numRecords; + local uint local_numRecordsOld; + + /* construct range of build records that each sub group will process */ + const uint numRecords = globals->numBuildRecords; + uint startID = get_group_id( 0 ); + uint endID = startID + 1; + + DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); + +} +#endif diff --git a/src/intel/vulkan/grl/gpu/morton/pre_sort.cl b/src/intel/vulkan/grl/gpu/morton/pre_sort.cl new file mode 100644 index 00000000000..099f926e194 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton/pre_sort.cl @@ -0,0 +1,117 @@ +// +// Copyright (C) 2009-2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "morton/morton_common.h" + +GRL_INLINE uint get_morton_shift( uint numPrimitives ) +{ + return 32 - clz( numPrimitives ); +} + +GRL_INLINE uint get_morton_shift_mask( uint numPrimitives ) +{ + uint shift = get_morton_shift( numPrimitives ); + uint mask =(uint)(((ulong)1 << shift)); + return mask - 1; // separated due to problems in DX +} + +__attribute__((reqd_work_group_size(1, 1, 1))) void kernel init( global struct Globals *globals ) +{ + /* variable shift for putting morton code + index to 64 bit */ + const uint shift = 32 - clz(globals->numPrimitives); + globals->shift = shift; + globals->shift_mask = (uint)(((ulong)1 << shift)); + globals->shift_mask -= 1; // separated due to problems in DX + globals->binary_hierarchy_root = 0; + globals->morton_sort_in_flight = 0; + globals->sort_iterations = get_morton_sort_lsb_req_iterations(shift); +} + +/* + + This kernel create a morton code array containing a morton code and + index into the primref array. + + The code uses the maximal number of bits for the morton code, such + that the morton code and index can still both get stored in 64 bits. 
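+
+ For example, with numPrimitives = 1000 the shift is 32 - clz(1000) = 10, so the primitive
+ index occupies the low 10 bits of index_code; the 4D grid computed below then gets
+ (64 - 10) / 4 = 13 bits per coordinate (8192 cells), and the interleaved 52-bit code is
+ stored in the bits above the index.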
+ + The algorithm first maps the centroids of the primitives and their + bounding box diagonal into a 4D grid, and then interleaves all 4 + grid coordinates to construct the to morton code. + + */ + +__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) +__attribute__( (intel_reqd_sub_group_size( 16 )) ) void kernel +create_morton_codes_indirect( global struct Globals* globals, + global struct BVHBase* bvh, + global struct AABB* primref, + global struct MortonCodePrimitive* morton_codes, + global struct MortonCodePrimitive* morton_codes_tmp, + uint use_new_morton_sort) +{ + /* construct range of morton codes each work group should create */ + const uint numPrimitives = globals->numPrimitives; + const uint startID = get_group_id( 0 ) * get_local_size( 0 ); + const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives); + + /* get lower and upper bounds of geometry and length of scene diagonal */ + const float3 lower = globals->centroidBounds.lower.xyz; + const float3 upper = globals->centroidBounds.upper.xyz; + const float diag = length( AABB3f_size( &bvh->Meta.bounds ).xyz ); + + /* calculates the 4D grid */ + const uint shift = get_morton_shift( numPrimitives ); + const uint grid_size = 1 << (64 - shift) / 4; + const float4 grid_base = (float4)(lower, 0.0f); + const float4 grid_extend = (float4)(upper - lower, diag); + const float4 grid_scale = select( (grid_size * 0.99f) / grid_extend, 0.0f, grid_extend == 0.0f ); // FIXME: 0.99f!!!!! + + const uint req_iterations = get_morton_sort_lsb_req_iterations(shift); + + /* each work group iterates over its range of morton codes to create */ + uint primID = startID + get_local_id( 0 ); + if( primID < endID ) + { + /* calculate position inside 4D grid */ + float4 centroid2 = AABB_centroid2( &primref[primID] ); + centroid2.w = length( AABB_size( &primref[primID] ).xyz ); + const uint4 gridpos = convert_uint4_rtz( (centroid2 - grid_base) * grid_scale ); + + /* calculate and store morton code */ + const ulong code = ulong_bitInterleave4D( gridpos ); + const ulong index_code = ((ulong)code << shift) | (ulong)primID; + + // It is required for morton code to be in morton_codes buffer after LSB sort finishes. + // If there would be odd iteration number needed for sorting, it is needed + // to skip some iterations of sorting. For odd number of iteration start with morton_codes_tmp buffer + if(req_iterations & 1 && !use_new_morton_sort) + morton_codes_tmp[primID].index_code = index_code; + else + morton_codes[primID].index_code = index_code; + } +} + +/* + + Initialization of the binary morton code hierarchy. 
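+
+ Note: the NewMorton_pre_sort metakernel currently leaves this dispatch commented out
+ ("new bottom-up kernel does not need this"), since build_bottom_up_indirect fills in
+ range, leftChild and rightChild of every internal node on its own.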
+ + */ + +__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) void kernel init_bottom_up_indirect( global struct Globals* globals, + global struct BinaryMortonCodeHierarchy* bnodes ) +{ + /* construct range each work group will process */ + const uint numPrimitives = globals->numPrimitives; + const uint startID = get_group_id( 0 ) * get_local_size(0); + const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives); + + /* each workgroup iterates over its range to initialize the binary BVH */ + uint i = startID + get_local_id( 0 ); + if( i < endID ) + BinaryMortonCodeHierarchy_init( &bnodes[i], 0, numPrimitives - 1 ); +} diff --git a/src/intel/vulkan/grl/gpu/morton_builder.grl b/src/intel/vulkan/grl/gpu/morton_builder.grl new file mode 100644 index 00000000000..f221fd39fed --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_builder.grl @@ -0,0 +1,335 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module morton_builder; + +kernel_module morton_kernels ("morton/pre_sort.cl") +{ + kernel opencl_build_kernel_init < kernelFunction="init" >; + kernel opencl_build_morton_kernel_create_morton_codes_indirect < kernelFunction="create_morton_codes_indirect" >; + kernel opencl_build_morton_kernel_init_bottom_up_indirect < kernelFunction="init_bottom_up_indirect" >; +} + +kernel_module morton_kernels ("morton/post_sort.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_morton_kernel_build_bottom_up_indirect < kernelFunction="build_bottom_up_indirect" >; +} + +kernel_module morton_kernels ("morton/phase0.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_morton_kernel_parallel_build_phase0 < kernelFunction="parallel_build_phase0" >; + kernel opencl_build_morton_kernel_parallel_build_phase0_local_sync < kernelFunction="parallel_build_phase0_local_sync" >; +} + +kernel_module morton_kernels ("morton/phase1.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_morton_kernel_parallel_build_phase1_Indirect < kernelFunction="parallel_build_phase1_Indirect_SG" >; + kernel opencl_build_morton_kernel_parallel_build_phase1_root < kernelFunction="parallel_build_phase1_Indirect_global_root" >; +} + +kernel_module morton_kernels ("morton/phase2.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_morton_kernel_parallel_build_phase2_refit < kernelFunction="parallel_build_phase2_refit" >; + kernel opencl_build_morton_kernel_parallel_build_phase2_refit_local < kernelFunction="parallel_build_phase2_refit_local" >; +} + +import struct MKBuilderState "structs.grl"; + +/* +metakernel begin( + MKBuilderState state, + qword morton_code_buffer, + dword primLeafType, + dword numHwThreads) +{ + dispatch opencl_build_kernel_init(1, 1, 1) args( + state.build_globals + ); + + control(wait_idle); + + + dispatch opencl_build_morton_kernel_create_morton_codes(numHwThreads, 1, 1) args( + state.build_globals, + state.bvh_buffer, + state.build_primref_buffer, + morton_code_buffer); + + control(wait_idle); + +} + +metakernel build_bottom_up( + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer, + dword numHwThreads) +{ + dispatch opencl_build_morton_kernel_init_bottom_up(numHwThreads, 1, 1) args( + state.build_globals, + buildrecords_bottom_up); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_build_bottom_up(numHwThreads, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + morton_code_buffer); + + control(wait_idle); + +} + + +metakernel parallel_build( + MKBuilderState state, + qword 
buildrecords_bottom_up, + qword morton_code_buffer, + dword numHwThreads) +{ + dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + state.bvh_buffer); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_parallel_build_phase1(numHwThreads, 1, 1) args( + state.build_globals, + morton_code_buffer, + state.build_primref_buffer, + buildrecords_bottom_up, + state.bvh_buffer); + + control(wait_idle); + +} + +*/ + +metakernel NewMorton_pre_sort( + qword num_primrefs_counter, + MKBuilderState state, + qword morton_code_buffer, + qword morton_code_buffer_tmp, + qword buildrecords_bottom_up, + dword use_new_morton_sort) +{ + + + { + REG1 = 15; + REG2 = 4; + REG0 = load_dword( num_primrefs_counter ); + + REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals + REG1 = ~REG1; + REG0 = REG0 & REG1; + REG0 = REG0 >> REG2; + } + + dispatch opencl_build_kernel_init(1, 1, 1) args( state.build_globals ); + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + /* + // new bottom-up kernel does not need this + dispatch_indirect opencl_build_morton_kernel_init_bottom_up_indirect args( + state.build_globals, + buildrecords_bottom_up); + */ + dispatch_indirect opencl_build_morton_kernel_create_morton_codes_indirect args( + state.build_globals, + state.bvh_buffer, + state.build_primref_buffer, + morton_code_buffer, + morton_code_buffer_tmp, + use_new_morton_sort); + + +} + + + +metakernel NewMorton_post_sort( + qword num_primrefs_counter, + qword num_buildrecords_counter, + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer ) +{ + + { + REG1 = 15; + REG2 = 4; + REG0 = load_dword( num_primrefs_counter ); + + REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals + REG1 = ~REG1; + REG0 = REG0 & REG1; + REG0 = REG0 >> REG2; + } + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args( + state.build_globals, + buildrecords_bottom_up, + morton_code_buffer); + + + /* + dispatch opencl_build_morton_kernel_build_bottom_up(16, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + morton_code_buffer); + */ + + control(wait_idle); + + dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + state.bvh_buffer); + + control(wait_idle); + + DISPATCHDIM_X = load_dword( num_buildrecords_counter ); + + dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args( + state.build_globals, + morton_code_buffer, + state.build_primref_buffer, + buildrecords_bottom_up, + state.bvh_buffer); + + control(wait_idle); + +} + +metakernel NewMorton_bottom_up( + qword num_primrefs_counter, + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer ) +{ + + { + REG1 = 15; + REG2 = 4; + REG0 = load_dword( num_primrefs_counter ); + + REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals + REG1 = ~REG1; + REG0 = REG0 & REG1; + REG0 = REG0 >> REG2; + } + + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args( + state.build_globals, + buildrecords_bottom_up, + morton_code_buffer); +} + + +metakernel NewMorton_phase0( + MKBuilderState state, + qword buildrecords_bottom_up, + 
qword morton_p0_refit_startpoints) +{ + + dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + state.bvh_buffer, + morton_p0_refit_startpoints); +} + +metakernel NewMorton_phase0_local_sync( + MKBuilderState state, + qword buildrecords_bottom_up, + qword p0_boxless_nodes) +{ + + dispatch opencl_build_morton_kernel_parallel_build_phase0_local_sync(1, 1, 1) args( + state.build_globals, + buildrecords_bottom_up, + state.bvh_buffer, + p0_boxless_nodes); +} + + +metakernel NewMorton_phase1( + qword num_buildrecords_counter, + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer) +{ + + DISPATCHDIM_X = load_dword( num_buildrecords_counter ); + + dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args( + state.build_globals, + morton_code_buffer, + state.build_primref_buffer, + buildrecords_bottom_up, + state.bvh_buffer); +} + +metakernel NewMorton_phase1_root( + qword num_buildrecords_counter, + MKBuilderState state, + qword buildrecords_bottom_up, + qword morton_code_buffer) +{ + dispatch opencl_build_morton_kernel_parallel_build_phase1_root(1, 1, 1) args( + state.build_globals, + morton_code_buffer, + state.build_primref_buffer, + buildrecords_bottom_up, + state.bvh_buffer); +} + +metakernel NewMorton_phase2( + qword num_leaves_counter, + MKBuilderState state, + qword bottom_node_ids ) +{ + + DISPATCHDIM_X = load_dword( num_leaves_counter ); + + dispatch_indirect opencl_build_morton_kernel_parallel_build_phase2_refit args( + state.bvh_buffer, + bottom_node_ids); +} + +metakernel NewMorton_phase2_local( + MKBuilderState state, + qword p0_boxless_nodes) +{ + + dispatch opencl_build_morton_kernel_parallel_build_phase2_refit_local(1, 1, 1) args( + state.build_globals, + state.bvh_buffer, + p0_boxless_nodes); +} diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl new file mode 100644 index 00000000000..075d44a51ba --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl @@ -0,0 +1,9 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// just inlines the kernels that are there in the header +#include "morton_msb_radix_bitonic_sort.h" \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h new file mode 100644 index 00000000000..4fb6c21b014 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h @@ -0,0 +1,924 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "common.h" +#include "morton_msb_radix_bitonic_sort_shared.h" + +#include "libs/lsc_intrinsics.h" + +/////////////////////////////////////////////////////////////////////////////// +// +// Configuration switches +// +/////////////////////////////////////////////////////////////////////////////// + +#define DEBUG 0 +#define MERGE_BLS_WITHIN_SG 0 + +/////////////////////////////////////////////////////////////////////////////// + + +#if DEBUG +#define DEBUG_CODE(A) A +#else +#define DEBUG_CODE(A) +#endif + +#define BOTTOM_LEVEL_SORT_WG_SIZE 512 + +// this kernel is only used to put into metakernel for debug to print that the code reached that place +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel debug_print_kernel(uint variable) +{ + 
if(get_local_id(0) == 0) + printf("I'm here! %d\n", variable); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(1, 1, 1))) +void kernel check_bls_sort(global struct Globals* globals, global ulong* input) +{ + uint prims_num = globals->numPrimitives; + + printf("in check_bls_sort kernel. Values count:: %d\n", prims_num); + + ulong left = input[0]; + ulong right; + for (int i = 0; i < prims_num - 1; i++) + { + right = input[i + 1]; + printf("sorted val: %llu\n", left); + if (left > right) + { + printf("element %d is bigger than %d: %llu > %llu\n", i, i+1, left, right); + } + left = right; + } +} + +inline uint wg_scan_inclusive_add_opt(local uint* tmp, uint val, uint SG_SIZE, uint WG_SIZE) +{ + const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE; + const uint sg_local_id = get_local_id(0) % SG_SIZE; + const uint NUM_HW_THREADS_IN_WG = WG_SIZE / SG_SIZE; + + uint acc = sub_group_scan_inclusive_add(val); + if (NUM_HW_THREADS_IN_WG == 1) + { + return acc; + } + tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc, SG_SIZE - 1); + barrier(CLK_LOCAL_MEM_FENCE); + + uint loaded_val = sg_local_id < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0; + uint wgs_acc = sub_group_scan_exclusive_add(loaded_val); + uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id); + // for > 256 workitems in SIMD16 we won't fit in 16 workitems per subgroup, so we need additional iteration + // same for > 64 workitems and more in SIMD8 + uint num_iterations = (NUM_HW_THREADS_IN_WG + SG_SIZE - 1) / SG_SIZE; + for (int i = 1; i < num_iterations; i++) + { + // need to add tmp[] because of "exclusive" scan, so last element misses it + uint prev_max_sum = sub_group_broadcast(wgs_acc, SG_SIZE - 1) + tmp[(i * SG_SIZE) - 1]; + loaded_val = (sg_local_id + i * SG_SIZE) < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0; + wgs_acc = sub_group_scan_exclusive_add(loaded_val); + wgs_acc += prev_max_sum; + uint new_acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id % SG_SIZE); + if (hw_thread_in_wg_id >= i * SG_SIZE) + acc_for_this_hw_thread = new_acc_for_this_hw_thread; + } + return acc + acc_for_this_hw_thread; +} + +struct MSBDispatchArgs +{ + global struct MSBRadixContext* context; + uint num_of_wgs; // this is the number of workgroups that was dispatched for this context + ulong* wg_key_start; // this is where keys to process start for current workgroup + ulong* wg_key_end; + uint shift_bit; +}; + + + + +struct MSBDispatchArgs get_msb_dispatch_args(global struct VContextScheduler* scheduler) +{ + global struct MSBDispatchQueue* queue = &scheduler->msb_queue; + + uint group = get_group_id(0); + struct MSBDispatchRecord record; + + // TODO_OPT: Load this entire prefix array into SLM instead of searching.. 
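+    // Each MSB context owns one dispatch record, and record i asked for
+    // records[i].wgs_to_dispatch work groups, so the flat group id is mapped onto a
+    // (context, group-within-record) pair by walking the running total of requested
+    // work groups below. Illustrative example (hypothetical numbers): with
+    // wgs_to_dispatch = {3, 5, 2}, groups 0..2 land in context 0, groups 3..7 in
+    // context 1 (local ids 0..4), and groups 8..9 in context 2.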
+ // Or use sub-group ops + uint i = 0; + while (i < queue->num_records) + { + uint n = queue->records[i].wgs_to_dispatch; + + if (group < n) + { + record = queue->records[i]; + break; + } + + group -= n; + i++; + } + + uint context_id = i; + global struct MSBRadixContext* context = &scheduler->contexts[context_id]; + + // moving to ulongs to avoid uint overflow + ulong group_id_in_dispatch = group; + ulong start_offset = context->start_offset; + ulong num_keys = context->num_keys; + ulong wgs_to_dispatch = record.wgs_to_dispatch; + + struct MSBDispatchArgs args; + args.context = context; + args.num_of_wgs = record.wgs_to_dispatch; + args.wg_key_start = context->keys_in + start_offset + (group_id_in_dispatch * num_keys / wgs_to_dispatch); + args.wg_key_end = context->keys_in + start_offset + ((group_id_in_dispatch+1) * num_keys / wgs_to_dispatch); + args.shift_bit = MSB_SHIFT_BYTE_START_OFFSET - context->iteration * MSB_BITS_PER_ITERATION; + return args; +} + + + + +void BLSDispatchQueue_push(global struct BLSDispatchQueue* queue, struct BLSDispatchRecord* record) +{ + uint new_idx = atomic_inc_global(&queue->num_records); + queue->records[new_idx] = *record; + DEBUG_CODE(printf("adding bls of size: %d\n", record->count)); +} + + + + +void DO_CountSort(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output) +{ + uint tid = get_local_id(0); + + global ulong* in = ((global ulong*)(dispatchRecord.keys_in)) + dispatchRecord.start_offset; + + ulong a = tid < dispatchRecord.count ? in[tid] : ULONG_MAX; + + SLM_shared[tid] = a; + + uint counter = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + ulong curr = SLM_shared[get_sub_group_local_id()]; + + for (uint i = 16; i < dispatchRecord.count; i += 16) + { + ulong next = SLM_shared[i + get_sub_group_local_id()]; + + for (uint j = 0; j < 16; j++) + { + // some older drivers have bug when shuffling ulong so we process by shuffling 2x uint + uint2 curr_as_uint2 = as_uint2(curr); + uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j)); + ulong c = as_ulong(sg_curr_as_uint2); + if (c < a) + counter++; + } + + curr = next; + } + + + // last iter + for (uint j = 0; j < 16; j++) + { + // some older drivers have bug when shuffling ulong so we process by shuffling 2x uint + uint2 curr_as_uint2 = as_uint2(curr); + uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j)); + ulong c = as_ulong(sg_curr_as_uint2); + if (c < a) + counter++; + } + + // save elements to its sorted positions + if (tid < dispatchRecord.count) + output[dispatchRecord.start_offset + counter] = a; +} + +void DO_Bitonic(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output) +{ + uint lid = get_local_id(0); + uint elements_to_sort = BOTTOM_LEVEL_SORT_THRESHOLD; + while ((elements_to_sort >> 1) >= dispatchRecord.count && elements_to_sort >> 1 >= BOTTOM_LEVEL_SORT_WG_SIZE) + { + elements_to_sort >>= 1; + } + + for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++) + { + uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE; + + if (tid >= dispatchRecord.count) + SLM_shared[tid] = ULONG_MAX; + else + SLM_shared[tid] = ((global ulong*)(dispatchRecord.keys_in))[dispatchRecord.start_offset + tid]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint k_iterations = elements_to_sort; + while(k_iterations >> 1 >= dispatchRecord.count && k_iterations != 0) + { + k_iterations >>= 1; + } + + for (unsigned int k = 2; k <= 
k_iterations; k *= 2) + { + for (unsigned int j = k / 2; j > 0; j /= 2) + { + // this loop is needed when we can't create big enough workgroup so we need to process multiple times + for (uint i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++) + { + uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE; + unsigned int ixj = tid ^ j; + if (ixj > tid) + { + if ((tid & k) == 0) + { + if (SLM_shared[tid] > SLM_shared[ixj]) + { + ulong tmp = SLM_shared[tid]; + SLM_shared[tid] = SLM_shared[ixj]; + SLM_shared[ixj] = tmp; + } + } + else + { + if (SLM_shared[tid] < SLM_shared[ixj]) + { + ulong tmp = SLM_shared[tid]; + SLM_shared[tid] = SLM_shared[ixj]; + SLM_shared[ixj] = tmp; + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++) + { + uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE; + + if (tid < dispatchRecord.count) + output[dispatchRecord.start_offset + tid] = SLM_shared[tid]; + } +} + + + + +void DO_Create_Separate_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input) +{ + uint lid = get_local_id(0); + + uint start = context->start[lid]; + uint count = context->count[lid]; + uint start_offset = context->start_offset + start; + + struct BLSDispatchRecord record; + record.start_offset = start_offset; + record.count = count; + record.keys_in = context->keys_out; + + if (count == 0) // we don't have elements so don't do anything + { + } + else if (count == 1) // single element so just write it out + { + input[start_offset] = ((global ulong*)record.keys_in)[start_offset]; + } + else if (count <= BOTTOM_LEVEL_SORT_THRESHOLD) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } +} + + + + +// We try to merge small BLS into larger one within the sub_group +void DO_Create_SG_Merged_BLS_Work_Parallel(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input) +{ + uint lid = get_local_id(0); + uint sid = get_sub_group_local_id(); + + uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 
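+    // create_msb_work flags bins whose element count still exceeds
+    // BOTTOM_LEVEL_SORT_THRESHOLD: such a bin cannot be finished by a single
+    // bottom-level-sort work group and has to go through another MSB radix pass on
+    // the next byte. Smaller bins become BLS records instead (single-element bins
+    // are simply written straight to the output).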
1 : 0; + + uint start = context->start[lid]; + uint count = context->count[lid]; + uint ctx_start_offset = context->start_offset; + + if (sid == 0 || create_msb_work) // these SIMD lanes are the begining of merged BLS + { + struct BLSDispatchRecord record; + if (create_msb_work) + { + record.start_offset = ctx_start_offset + start + count; + record.count = 0; + } + else // SIMD lane 0 case + { + record.start_offset = ctx_start_offset + start; + record.count = count; + } + + record.keys_in = context->keys_out; + + uint loop_idx = 1; + while (sid + loop_idx < 16) // loop over subgroup + { + uint _create_msb_work = intel_sub_group_shuffle_down(create_msb_work, 0u, loop_idx); + uint _count = intel_sub_group_shuffle_down(count, 0u, loop_idx); + uint _start = intel_sub_group_shuffle_down(start, 0u, loop_idx); + + if (_create_msb_work) // found out next MSB work, so range of merges ends + break; + + // need to push record since nothing more will fit + if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD) + { + if (record.count == 1) + { + input[record.start_offset] = record.keys_in[record.start_offset]; + } + else if (record.count > 1) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } + record.start_offset = ctx_start_offset + _start; + record.count = _count; + } + else + { + record.count += _count; + } + loop_idx++; + } + // if we have any elements left, then schedule them + if (record.count == 1) // only one element, so just write it out + { + input[record.start_offset] = record.keys_in[record.start_offset]; + } + else if (record.count > 1) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } + } +} + + + + +// We try to merge small BLS into larger one within the sub_group +void DO_Create_SG_Merged_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input) +{ + uint lid = get_local_id(0); + uint sid = get_sub_group_local_id(); + + uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 
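+    // Merging strategy: a new BLS record is started at lane 0 and after every bin
+    // that needs MSB work; consecutive small bins are then accumulated into the open
+    // record until an oversized bin breaks the chain or adding the next bin would
+    // exceed BOTTOM_LEVEL_SORT_MERGING_THRESHOLD, at which point the record is
+    // pushed and a new one begins. The variant above walks the lanes with
+    // shuffle_down; this one lets lane 0 do the walk via broadcasts.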
1 : 0; + + uint start = context->start[lid]; + uint count = context->count[lid]; + uint ctx_start_offset = context->start_offset; + + if (sid == 0) + { + struct BLSDispatchRecord record; + record.start_offset = ctx_start_offset + start; + record.count = 0; + record.keys_in = context->keys_out; + + for (int i = 0; i < 16; i++) + { + uint _create_msb_work = sub_group_broadcast(create_msb_work, i); + uint _count = sub_group_broadcast(count, i); + uint _start = sub_group_broadcast(start, i); + if (_create_msb_work) + { + if (record.count == 1) // only one element, so just write it out + { + input[record.start_offset] = record.keys_in[record.start_offset]; + } + else if (record.count > 1) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } + record.start_offset = ctx_start_offset + _start + _count; + record.count = 0; + continue; + } + // need to push record since nothing more will fit + if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + record.start_offset = ctx_start_offset + _start; + record.count = _count; + } + else + { + record.count += _count; + } + } + // if we have any elements left, then schedule them + if (record.count == 1) // only one element, so just write it out + { + input[record.start_offset] = record.keys_in[record.start_offset]; + } + else if (record.count > 1) + { + BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); + } + } +} + + + + +void DO_Create_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input, local uint* slm_for_wg_scan, uint sg_size, uint wg_size) +{ + uint lid = get_local_id(0); + + uint iteration = context->iteration + 1; + uint start = context->start[lid]; + uint count = context->count[lid]; + uint start_offset = context->start_offset + start; + + uint create_msb_work = count > BOTTOM_LEVEL_SORT_THRESHOLD ? 
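+    // Bins that are still too large for a bottom-level sort become new entries on
+    // the global MSB stack. The work-group scan below compacts the flags: the last
+    // lane reserves the total number of new entries with a single atomic, and each
+    // flagged lane stores its entry at base + inclusive_scan - 1.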
1 : 0; + +#if MERGE_BLS_WITHIN_SG + DO_Create_SG_Merged_BLS_Work_Parallel(scheduler, context, input); +#else + DO_Create_Separate_BLS_Work(scheduler, context, input); +#endif + + uint new_entry_id = wg_scan_inclusive_add_opt(slm_for_wg_scan, create_msb_work, sg_size, wg_size);//work_group_scan_inclusive_add(create_msb_work); + uint stack_begin_entry; + // last workitem in wg contains number of all new entries + if (lid == (MSB_RADIX_NUM_BINS - 1)) + { + stack_begin_entry = atomic_add_global(&scheduler->msb_stack.num_entries, new_entry_id); + } + stack_begin_entry = work_group_broadcast(stack_begin_entry, (MSB_RADIX_NUM_BINS - 1)); + new_entry_id += stack_begin_entry -1; + + + if (create_msb_work) + { + scheduler->msb_stack.entries[new_entry_id].start_offset = start_offset; + scheduler->msb_stack.entries[new_entry_id].count = count; + scheduler->msb_stack.entries[new_entry_id].iteration = iteration; + } + + if (lid == 0) { + DEBUG_CODE(printf("num of new bls: %d\n", scheduler->next_bls_queue->num_records)); + } +} + + +struct BatchedBLSDispatchEntry +{ + ///////////////////////////////////////////////////////////// + // State data used for communication with command streamer + // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl' + ///////////////////////////////////////////////////////////// + qword p_data_buffer; + qword num_elements; // number of elements in p_data_buffer +}; + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_batched_BLS_dispatch(global struct BatchedBLSDispatchEntry* bls_dispatches) +{ + uint dispatch_id = get_group_id(0); + uint lid = get_local_id(0); + + local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD]; + + struct BatchedBLSDispatchEntry dispatchArgs = bls_dispatches[dispatch_id]; + struct BLSDispatchRecord dispatchRecord; + dispatchRecord.start_offset = 0; + dispatchRecord.count = dispatchArgs.num_elements; + dispatchRecord.keys_in = (ulong*)dispatchArgs.p_data_buffer; + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", dispatchRecord.count)); + + if(dispatchRecord.count > 1) + DO_Bitonic(dispatchRecord, SLM_shared, (global ulong*)dispatchRecord.keys_in); +} + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_bottom_level_single_wg(global struct Globals* globals, global ulong* input, global ulong* output) +{ + uint lid = get_local_id(0); + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", globals->numPrimitives)); + + local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD]; + + struct BLSDispatchRecord dispatchRecord; + dispatchRecord.start_offset = 0; + dispatchRecord.count = globals->numPrimitives; + dispatchRecord.keys_in = (ulong*)input; + + //TODO: count or bitonic here? 
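+    // Two bottom-level sorts are available. DO_Bitonic is the classic
+    // compare-exchange network (O(log^2 n) rounds with a work-group barrier after
+    // each) and is what the batched BLS dispatch above uses. DO_CountSort ranks each
+    // key by counting the keys smaller than it while streaming SLM through sub-group
+    // broadcasts, so its inner loop needs no barriers, but it relies on the keys
+    // being unique (the low bits of the code carry the primitive index).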
+ //DO_Bitonic(dispatchRecord, SLM_shared, output); + DO_CountSort(dispatchRecord, SLM_shared, output); +} + + + + +// This kernel initializes first context to start up the whole execution +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_msb_begin( + global struct Globals* globals, + global struct VContextScheduler* scheduler, + global ulong* buf0, + global ulong* buf1) +{ + uint lid = get_local_id(0); + uint gid = get_group_id(0); + + DEBUG_CODE(if (lid == 0)printf("running sort_morton_codes_msb_begin\n")); + + scheduler->contexts[gid].count[lid] = 0; + + if (gid == 0 && lid == 0) + { + global struct MSBRadixContext* context = &scheduler->contexts[lid]; + const uint num_prims = globals->numPrimitives; + + scheduler->bls_queue0.num_records = 0; + scheduler->bls_queue1.num_records = 0; + + scheduler->curr_bls_queue = &scheduler->bls_queue1; + scheduler->next_bls_queue = &scheduler->bls_queue0; + + context->start_offset = 0; + context->num_wgs_in_flight = 0; + context->num_keys = num_prims; + context->iteration = 0; + context->keys_in = buf0; + context->keys_out = buf1; + + uint msb_wgs_to_dispatch = (num_prims + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD; + scheduler->msb_queue.records[0].wgs_to_dispatch = msb_wgs_to_dispatch; + + scheduler->num_wgs_msb = msb_wgs_to_dispatch; + scheduler->num_wgs_bls = 0; + scheduler->msb_stack.num_entries = 0; + scheduler->msb_queue.num_records = 1; + } +} + + + + +__attribute__((reqd_work_group_size(MSB_RADIX_NUM_VCONTEXTS, 1, 1))) +kernel void +scheduler(global struct VContextScheduler* scheduler, global ulong* buf0, global ulong* buf1) +{ + uint lid = get_local_id(0); + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler\n")); + + uint context_idx = lid; + + const uint num_of_stack_entries = scheduler->msb_stack.num_entries; + + uint msb_wgs_to_dispatch = 0; + if (lid < num_of_stack_entries) + { + struct MSBStackEntry entry = scheduler->msb_stack.entries[(num_of_stack_entries-1) - lid]; + global struct MSBRadixContext* context = &scheduler->contexts[lid]; + context->start_offset = entry.start_offset; + context->num_wgs_in_flight = 0; + context->num_keys = entry.count; + context->iteration = entry.iteration; + context->keys_in = entry.iteration % 2 == 0 ? buf0 : buf1; + context->keys_out = entry.iteration % 2 == 0 ? 
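+            // buf0/buf1 are used in ping-pong fashion: even MSB iterations read buf0
+            // and scatter into buf1, odd iterations go the other way. The bottom-level
+            // stage always writes its range straight into the buffer it is given as
+            // output (see BLSDispatchRecord), so a range needs no extra copy no matter
+            // how many MSB levels it has been through.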
buf1 : buf0; + + msb_wgs_to_dispatch = (entry.count + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD; + scheduler->msb_queue.records[lid].wgs_to_dispatch = msb_wgs_to_dispatch; + } + + msb_wgs_to_dispatch = work_group_reduce_add(msb_wgs_to_dispatch);// TODO: if compiler implementation is slow, then consider to manually write it + + if (lid == 0) + { + // swap queue for next iteration + struct BLSDispatchQueue* tmp = scheduler->curr_bls_queue; + scheduler->curr_bls_queue = scheduler->next_bls_queue; + scheduler->next_bls_queue = tmp; + + scheduler->next_bls_queue->num_records = 0; + + scheduler->num_wgs_bls = scheduler->curr_bls_queue->num_records; + scheduler->num_wgs_msb = msb_wgs_to_dispatch; + + if (num_of_stack_entries < MSB_RADIX_NUM_VCONTEXTS) + { + scheduler->msb_queue.num_records = num_of_stack_entries; + scheduler->msb_stack.num_entries = 0; + } + else + { + scheduler->msb_queue.num_records = MSB_RADIX_NUM_VCONTEXTS; + scheduler->msb_stack.num_entries -= MSB_RADIX_NUM_VCONTEXTS; + } + } + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler finished, to spawn %d MSB wgs in %d contexts and %d BLS wgs, MSB records on stack %d\n", + scheduler->num_wgs_msb, scheduler->msb_queue.num_records, scheduler->num_wgs_bls, scheduler->msb_stack.num_entries)); +} + + + + +// this is the lowest sub-task, which should end return sorted codes +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_bottom_level( global struct VContextScheduler* scheduler, global ulong* output) +{ + uint lid = get_local_id(0); + + DEBUG_CODE(if (get_group_id(0) == 0 && lid == 0) printf("running sort_morton_codes_bottom_level\n")); + + local struct BLSDispatchRecord l_dispatchRecord; + if (lid == 0) + { + uint record_idx = get_group_id(0); + l_dispatchRecord = scheduler->curr_bls_queue->records[record_idx]; + //l_dispatchRecord = BLSDispatchQueue_pop((global struct BLSDispatchQueue*)scheduler->curr_bls_queue); + atomic_dec_global(&scheduler->num_wgs_bls); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + struct BLSDispatchRecord dispatchRecord = l_dispatchRecord; + + local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD]; + + // right now use only bitonic sort + // TODO: maybe implement something else + if (1) + { + //DO_Bitonic(dispatchRecord, SLM_shared, output); + DO_CountSort(dispatchRecord, SLM_shared, output); + } +} + + + + +#define MSB_COUNT_WG_SIZE MSB_RADIX_NUM_BINS +#define MSB_COUNT_SG_SIZE 16 + +// count how many elements per buckets we have +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MSB_COUNT_WG_SIZE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MSB_COUNT_SG_SIZE))) +void kernel sort_morton_codes_msb_count_items( global struct VContextScheduler* scheduler) +{ + uint lid = get_local_id(0); + uint lsz = MSB_RADIX_NUM_BINS; + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_count_items\n")); + + local uint bucket_count[MSB_RADIX_NUM_BINS]; + local uint finish_count; + bucket_count[lid] = 0; + if (lid == 0) + { + finish_count = 0; + } + + struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler); + + global struct MSBRadixContext* context = dispatchArgs.context; + + global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid; + global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end; + uint shift_bit = dispatchArgs.shift_bit; + uchar shift_byte = shift_bit / 8; // so we count 
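+    // the byte index of the current radix digit: iteration 0 looks at bits 56..63,
+    // i.e. byte 7 of the little-endian 64-bit key, iteration 1 at byte 6, and so on,
+    // so each digit can be read with a single uchar load in the loop below.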
how many uchars to shift + barrier(CLK_LOCAL_MEM_FENCE); + + global uchar* ks = (global uchar*)key_start; + ks += shift_byte; + global uchar* ke = (global uchar*)key_end; + ke += shift_byte; + + // double buffering on value loading + if (ks < ke) + { + uchar bucket_id = *ks; + ks += lsz * sizeof(ulong); + + for (global uchar* k = ks; k < ke; k += lsz * sizeof(ulong)) + { + uchar next_bucket_id = *k; + atomic_inc_local(&bucket_count[bucket_id]); + bucket_id = next_bucket_id; + } + + atomic_inc_local(&bucket_count[bucket_id]); + + } + + barrier(CLK_LOCAL_MEM_FENCE); + + //update global counters for context + uint count = bucket_count[lid]; + if (count > 0) + atomic_add_global(&context->count[lid], bucket_count[lid]); + + mem_fence_gpu_invalidate(); + work_group_barrier(0); + + bool final_wg = true; + // count WGs which have reached the end + if (dispatchArgs.num_of_wgs > 1) + { + if (lid == 0) + finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1; + + barrier(CLK_LOCAL_MEM_FENCE); + + final_wg = finish_count == dispatchArgs.num_of_wgs; + } + + local uint partial_dispatches[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE]; + // if this is last wg for current dispatch, update context + if (final_wg) + { + // code below does work_group_scan_exclusive_add(context->count[lid]); + { + uint lane_val = context->count[lid]; + uint sg_result = sub_group_scan_inclusive_add(lane_val); + + partial_dispatches[get_sub_group_id()] = sub_group_broadcast(sg_result, MSB_COUNT_SG_SIZE - 1); + barrier(CLK_LOCAL_MEM_FENCE); + + uint slm_result = sub_group_scan_exclusive_add(partial_dispatches[get_sub_group_local_id()]); + slm_result = sub_group_broadcast(slm_result, get_sub_group_id()); + uint result = slm_result + sg_result - lane_val; + context->start[lid] = result;//work_group_scan_exclusive_add(context->count[lid]); + } + + context->count[lid] = 0; + if(lid == 0) + context->num_wgs_in_flight = 0; + } +} + + + + +// sort elements into appropriate buckets +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) +void kernel sort_morton_codes_msb_bin_items( + global struct VContextScheduler* scheduler, global ulong* input) +{ + uint lid = get_local_id(0); + uint lsz = get_local_size(0); + + DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_bin_items\n")); + + local uint finish_count; + if (lid == 0) + { + finish_count = 0; + } + + struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler); + global struct MSBRadixContext* context = dispatchArgs.context; + + global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid; + global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end; + uint shift_bit = dispatchArgs.shift_bit; + + barrier(CLK_LOCAL_MEM_FENCE); + + global ulong* sorted_keys = (global ulong*)context->keys_out + context->start_offset; + +#if MSB_RADIX_NUM_BINS == MSB_WG_SORT_ELEMENTS_THRESHOLD // special case meaning that we process exactly 1 element per workitem + // here we'll do local counting, then move to global + + local uint slm_counters[MSB_RADIX_NUM_BINS]; + slm_counters[lid] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint place_in_slm_bucket; + uint bucket_id; + ulong val; + + bool active_lane = key_start < key_end; + + if (active_lane) + { + val = *key_start; + + bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1); + place_in_slm_bucket = atomic_inc_local(&slm_counters[bucket_id]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // override slm_counters 
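+    // Each non-empty bin issues one global atomic for the whole work group: the old
+    // value returned by atomic_add_global is this group's base offset within the bin,
+    // and the per-item SLM rank recorded above is added on top, so the final position
+    // is start[bin] + group_base + local_rank with one atomic per bin rather than one
+    // per key.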
with global counters - we don't need to override counters with 0 elements since we won't use them anyway + if (slm_counters[lid]) + slm_counters[lid] = atomic_add_global(&context->count[lid], slm_counters[lid]); + + barrier(CLK_LOCAL_MEM_FENCE); + + uint id_in_bucket = slm_counters[bucket_id] + place_in_slm_bucket;//atomic_inc_global(&context->count[bucket_id]); + + if (active_lane) + sorted_keys[context->start[bucket_id] + id_in_bucket] = val; +#else + // double buffering on value loading + if (key_start < key_end) + { + ulong val = *key_start; + key_start += lsz; + + for (global ulong* k = key_start; k < key_end; k += lsz) + { + ulong next_val = *k; + uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1); + uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]); + + //printf("dec: %llu, val: %llX bucket_id: %X", *k, *k, bucket_id); + sorted_keys[context->start[bucket_id] + id_in_bucket] = val; + + val = next_val; + } + + uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1); + uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]); + + sorted_keys[context->start[bucket_id] + id_in_bucket] = val; + } +#endif + + // make sure all groups's "counters" and "starts" are visible to final workgroup + mem_fence_gpu_invalidate(); + work_group_barrier(0); + + bool final_wg = true; + // count WGs which have reached the end + if (dispatchArgs.num_of_wgs > 1) + { + if (lid == 0) + finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1; + + barrier(CLK_LOCAL_MEM_FENCE); + + final_wg = finish_count == dispatchArgs.num_of_wgs; + } + + local uint slm_for_wg_funcs[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE]; + // if this is last wg for current dispatch, then prepare sub-tasks + if (final_wg) + { + DO_Create_Work(scheduler, context, input, slm_for_wg_funcs, 16, MSB_RADIX_NUM_BINS); + + // clear context's counters for future execution + context->count[lid] = 0; + } + +} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h new file mode 100644 index 00000000000..c2ab0d4a2c9 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h @@ -0,0 +1,135 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file contains structure definitions shared by GRL OCL kernels and host code +// + +#pragma once + +#include "GRLGen12.h" + +// NOTE: +// MSB(Most significant byte) - here I refer to it as a part of sorting that does MSB Radix sort, which can spawn additional work +// BLS(Bottom level sort) - here I refer to it as a last part of sorting a particular range(currently Bitonic), which cannot spawn additional work +// + +#define MSB_RADIX_NUM_BINS 256 +#define MSB_BITS_PER_ITERATION 8 // how many bits are sorted per iteration +#define MSB_SHIFT_BYTE_START_OFFSET 56 // start offset for byte shifting, first iteration will start from here + +#define MSB_RADIX_NUM_VCONTEXTS 8 // NOTE: mkulikow: maybe expand/shrink? 
More means more MSB processed in parallel but more memory used + +#define MSB_STACK_ENTRIES_NUM (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS * 7) // first level doesn't get spawned, so 7 iterations must fit here, +// since at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS we need 7 of these + +#define MSB_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS) // one per context + +#define BLS_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS) // each context can spawn MSB_RADIX_NUM_BINS, +// so at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS + +#define MSB_WG_SORT_ELEMENTS_THRESHOLD 256 // This tells us how many elements at max we can process in a single workgroup. + // If a single MSB entry needs more, then it will spawn more WGs + // after updating this also needs to update msb_radix_bitonic_sort.grl's computation of initial workgroups num + +#define BOTTOM_LEVEL_SORT_THRESHOLD 512 // TODO: is 4096 best value? ON skl gives best performance +// Right now we use 256 workitems in simd16 which give us 16 hw threads, assuming 2KB per thread, we have 32KB SLM to play with. +// Since we use ulong(8bytes) we can store 4096 elements +// This also tells us that if number of elements to sort is less than this, we don't need to allocate scheduler +// Need to keep in sync with the GRL const BOTTOM_LEVEL_SORT_THRESHOLD + +#define BOTTOM_LEVEL_SORT_MERGING_THRESHOLD 512 // This is the amount till which we'll merge small BLS'es produced by MSB into a single bigger BLS + +GRL_NAMESPACE_BEGIN(GRL) + + + + +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(MORTON_MSB_RADIX_BITONIC_SORT) + +struct MSBStackEntry +{ + uint start_offset; + uint count; + uint iteration; +}; + +struct MSBStack +{ + dword num_entries; + struct MSBStackEntry entries[MSB_STACK_ENTRIES_NUM]; +}; + +struct MSBRadixContext +{ + uint start[MSB_RADIX_NUM_BINS]; + uint count[MSB_RADIX_NUM_BINS]; + uint num_wgs_in_flight; // this is used to identify which msb wg is last + uint num_keys; // number of keys to process + uint iteration; + ulong* keys_in; + ulong* keys_out; + + uint start_offset; //offset from the beginning of the buffer +}; + +struct MSBDispatchRecord +{ + uint wgs_to_dispatch; // amount of workgroups to dispatch for this current record +}; + +struct MSBDispatchQueue +{ + dword num_records; + struct MSBDispatchRecord records[MSB_RADIX_NUM_VCONTEXTS]; // each context have its own record +}; + +// BLS(Bottom Level Sort) - last stage of sorting which will not spawn any new tasks +struct BLSDispatchRecord +{ + uint start_offset; // offset from the beginning of the buffer + uint count; + ulong* keys_in; // we don't need keys_out since we will write always to the same output buffer +}; + +struct BLSDispatchQueue +{ + dword num_records; + struct BLSDispatchRecord records[BLS_DISPATCH_QUEUE_NUM_RECORDS]; +}; + +struct VContextScheduler +{ + ///////////////////////////////////////////////////////////// + // State data used for communication with command streamer + // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl' + ///////////////////////////////////////////////////////////// + + dword num_wgs_msb; // number of MSB workgroups being processed by current iteration + dword num_wgs_bls; // number of BLS workgroups being processed by current iteration + + dword scheduler_postsync; + dword _pad1; + + ///////////////////////////////////////////////////////////// + + struct MSBDispatchQueue msb_queue; + struct BLSDispatchQueue 
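+    // Two BLS queues are kept so records produced while binning one MSB level do not
+    // mix with the ones being consumed: the scheduler swaps curr_bls_queue and
+    // next_bls_queue every round and clears the new "next" queue, so bottom-level
+    // work groups drain one queue while the next level's MSB kernels push into the
+    // other.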
bls_queue0; + struct BLSDispatchQueue bls_queue1; + + struct BLSDispatchQueue* curr_bls_queue; + struct BLSDispatchQueue* next_bls_queue; + + struct MSBStack msb_stack; + + struct MSBRadixContext contexts[MSB_RADIX_NUM_VCONTEXTS]; +}; + +GRL_NAMESPACE_END(MORTON_MSB_RADIX_BITONIC_SORT) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.cl b/src/intel/vulkan/grl/gpu/morton_radix_sort.cl new file mode 100644 index 00000000000..e123b2f46d3 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_radix_sort.cl @@ -0,0 +1,9 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// just inlines the kernels that are there in the header +#include "morton_radix_sort.h" diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.h b/src/intel/vulkan/grl/gpu/morton_radix_sort.h new file mode 100644 index 00000000000..d58ec829883 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/morton_radix_sort.h @@ -0,0 +1,855 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "common.h" +#include "libs/lsc_intrinsics.h" + +/* ============================================================================= */ +/* ============================== LSB RADIX SORT =============================== */ +/* ============================================================================= */ + +#define RADIX_BINS 256 +#define SCATTER_WG_SIZE 512 +#define MORTON_LSB_SORT_NO_SHIFT_THRESHOLD 0xFFFFFFFF // turn off, because current hierarchy build requires full sort + +uint2 get_thread_range( uint numItems, uint numGroups, uint taskID ) +{ + uint items_per_group = (numItems / numGroups); + uint remainder = numItems - (items_per_group * numGroups); + uint startID = taskID * items_per_group + min(taskID, remainder); + uint endID = startID + items_per_group + ((taskID < remainder) ? 
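+    // Groups with taskID < remainder take one extra item so the split stays balanced.
+    // Illustrative example: numItems = 10, numGroups = 4 yields the ranges
+    // [0,3), [3,6), [6,8) and [8,10).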
1 : 0); + + return (uint2)(startID,endID); +} + +GRL_INLINE void sort_morton_codes_bin_items_taskID_func(global struct Globals* globals, + global uint* global_histogram, + global uchar* input, + local uint* histogram, + uint iteration, + uint numGroups, + uint numItems, + bool shift_primID, + uint taskID, + uint startID, + uint endID) +{ + const uint shift = globals->shift; + + for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0)) + histogram[i] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (shift_primID) + { + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + // Read input as ulong to make bitshift, so the bits representing primID are not being + // taken into account during sorting, which would result in smaller sort loops for + // cases where morton shift are bigger than 8 bits + ulong* ptr_ul = (ulong*)&input[8 * i]; + ulong code = *ptr_ul; + uchar* ptr = (uchar*)&code; + code >>= shift; + + uchar bin = ptr[iteration]; + atomic_inc_local(&histogram[bin]); + } + } + else + { + for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) + { + uchar bin = input[8 * i + iteration]; + atomic_inc_local(&histogram[bin]); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0)) + global_histogram[RADIX_BINS * taskID + i] = histogram[i]; +} + +GRL_INLINE void sort_morton_codes_bin_items_func(global struct Globals* globals, + global uint* global_histogram, + global uint* wg_flags, + global uchar* input, + local uint* histogram, + uint iteration, + uint numGroups, + uint numItems, + bool shift_primID, + bool update_wg_flags) +{ + if (shift_primID) + { + // This check is present in other LSB sort functions as well, its purpose is + // to skip first n iterations where n is the difference between max iterations + // and actually needed iterations to sort without primIDs + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + // iteration needs to be adjusted to reflect the skipped cycles + iteration -= req_iterations; + } + + const uint taskID = get_group_id(0); + + if (taskID == 0 && update_wg_flags) + { + for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0)) + wg_flags[i] = 0; + } + + uint2 ids = get_thread_range(numItems, numGroups, taskID); + uint startID = ids.x; + uint endID = ids.y; + + sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, shift_primID, + taskID, startID, endID); +} + +__attribute__((reqd_work_group_size(512, 1, 1))) +void kernel +sort_morton_codes_bin_items( + global struct Globals* globals, + global uint* global_histogram, + global uint* wg_flags, + global uchar* input, + uint iteration, + uint numGroups, + uint update_wg_flags +) +{ + local uint histogram[RADIX_BINS]; + const uint numItems = globals->numPrimitives; + if(numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, false, update_wg_flags); + else + sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, true, update_wg_flags); +} + + +GRL_INLINE void sort_morton_codes_reduce_bins_func(global struct Globals* globals, + global uint* global_histogram, + local uint* partials, + uint numTasks, + uint iteration, + bool shift_primID) +{ + const uint localID = get_local_id(0); + + if (shift_primID) 
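+    // When the keys are pre-shifted so the primID bits are ignored (shift_primID),
+    // fewer radix passes are needed and the first sort_iterations dispatches are
+    // skipped; this kernel bails out for them just like the binning and scatter
+    // kernels, making the whole iteration a no-op. With
+    // MORTON_LSB_SORT_NO_SHIFT_THRESHOLD set to 0xFFFFFFFF this path is currently
+    // disabled.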
+ { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + } + + uint t = 0; + for (uint j = 0; j < numTasks; j++) + { + const uint count = load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + localID], 0); + store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + localID], 0, t); + t += count; + } + + // each lane now contains the number of elements in the corresponding bin + // prefix sum this for use in the subsequent scattering pass. + uint global_count = t; + + partials[get_sub_group_id()] = sub_group_reduce_add(global_count); + + barrier(CLK_LOCAL_MEM_FENCE); + + uint lane = get_sub_group_local_id(); + uint p = partials[lane]; + p = (lane < get_sub_group_id()) ? p : 0; + + global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count); + + store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numTasks + localID], 0, global_count); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(256, 1, 1))) +void kernel +sort_morton_codes_reduce_bins(global struct Globals* globals, + uint numTasks, + global uint* global_histogram, + uint iteration) +{ + local uint partials[RADIX_BINS]; + const uint numItems = globals->numPrimitives; + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, false); + else + sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, true); +} + + +#if 1 +GRL_INLINE void sort_morton_codes_scatter_items_func( + global struct Globals* globals, + global uint* global_histogram, + global ulong* input, + global ulong* output, + local uint* local_offset, + local uint* flags, + uint iteration, + uint numGroups, + uint numItems, + bool shift_primID, + bool update_morton_sort_in_flight) +{ + const uint gID = get_local_id(0) + get_group_id(0) * get_local_size(0); + + const uint global_shift = globals->shift; + const uint localID = get_local_id(0); + const uint taskID = get_group_id(0); + + if (gID == 0 && update_morton_sort_in_flight) + globals->morton_sort_in_flight = 0; + + uint2 ids = get_thread_range(numItems, numGroups, taskID); + uint startID = ids.x; + uint endID = ids.y; + + if (shift_primID) + { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + iteration -= req_iterations; + } + + const uint shift = 8 * iteration; + + // load the global bin counts, and add each bin's global prefix + // to the local prefix + { + uint global_prefix = 0, local_prefix = 0; + if (localID < RADIX_BINS) + { + local_prefix = global_histogram[RADIX_BINS * taskID + localID]; + global_prefix = global_histogram[RADIX_BINS * numGroups + localID]; + local_offset[localID] = global_prefix + local_prefix; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + + // move elements in WG-sized chunks. 
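+    // Stability comes from per-bin bit vectors instead of atomics on the output
+    // index: every item sets its own bit in its bin's SCATTER_WG_SIZE-wide mask, its
+    // rank within the chunk is the popcount of the lower bits of that mask, and the
+    // output slot is local_offset[bin] + rank; the item with the highest rank then
+    // bumps local_offset[bin] by the bin's count for the next chunk.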
The elements need to be moved sequentially (can't use atomics) + // because relative order has to be preserved for LSB radix sort to work + + // For each bin, a bit vector indicating which elements are in the bin + for (uint block_base = startID; block_base < endID; block_base += get_local_size(0)) + { + // initialize bit vectors + for (uint i = 4 * localID; i < RADIX_BINS * SCATTER_WG_SIZE / 32; i += 4 * get_local_size(0)) + { + flags[i + 0] = 0; + flags[i + 1] = 0; + flags[i + 2] = 0; + flags[i + 3] = 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // read sort key, determine which bin it goes into, scatter into the bit vector + // and pre-load the local offset + uint ID = localID + block_base; + ulong key = 0; + uint bin_offset = 0; + uint bin = 0; + uint bin_word = localID / 32; + uint bin_bit = 1 << (localID % 32); + + if (ID < endID) + { + key = input[ID]; + + if (shift_primID) + bin = ((key >> global_shift) >> shift) & (RADIX_BINS - 1); + else + bin = (key >> shift) & (RADIX_BINS - 1); + + atomic_add_local(&flags[(SCATTER_WG_SIZE / 32) * bin + bin_word], bin_bit); + bin_offset = local_offset[bin]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (ID < endID) + { + // each key reads the bit-vectors for its bin, + // - Computes local prefix sum to determine its output location + // - Computes number of items added to its bin (last thread adjusts bin position) + uint prefix = 0; + uint count = 0; + for (uint i = 0; i < (SCATTER_WG_SIZE / 32); i++) + { + uint bits = flags[(SCATTER_WG_SIZE / 32) * bin + i]; + uint bc = popcount(bits); + uint pc = popcount(bits & (bin_bit - 1)); + prefix += (i < bin_word) ? bc : 0; + prefix += (i == bin_word) ? pc : 0; + + count += bc; + } + + // store the key in its proper place.. + output[prefix + bin_offset] = key; + + // last item for each bin adjusts local offset for next outer loop iteration + if (prefix == count - 1) + local_offset[bin] += count; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + } + + /* uint local_offset[RADIX_BINS]; */ + /* uint offset_global = 0; */ + /* for (int i=0;i> shift) & (RADIX_BINS-1); */ + /* const uint offset = local_offset[bin]; */ + /* output[offset] = input[ID]; */ + /* local_offset[bin]++; */ + /* } */ +} + +#else + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +sort_morton_codes_scatter_items( + global struct Globals* globals, + uint shift, + global uint* global_histogram, + global char* input0, + global char* input1, + unsigned int input0_offset, + unsigned int input1_offset, + uint iteration) +{ + const uint numItems = globals->numPrimitives; + const uint local_size = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + const uint localID = get_local_id(0); + const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint startID = (taskID + 0) * numItems / numTasks; + const uint endID = (taskID + 1) * numItems / numTasks; + + global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset); + global ulong* output = (global ulong*)((iteration % 2) == 0 ? 
input1 + input1_offset : input0 + input0_offset); + + local uint local_offset[RADIX_BINS]; + uint off = 0; + for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) + { + const uint count = global_histogram[RADIX_BINS * numTasks + i]; + const uint offset_task = global_histogram[RADIX_BINS * taskID + i]; + const uint sum = sub_group_reduce_add(count); + const uint prefix_sum = sub_group_scan_exclusive_add(count); + local_offset[i] = off + offset_task + prefix_sum; + off += sum; + } + + for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) + { + const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1); + const uint offset = atomic_add_local(&local_offset[bin], 1); + output[offset] = input[ID]; + } + + /* uint local_offset[RADIX_BINS]; */ + /* uint offset_global = 0; */ + /* for (int i=0;i> shift) & (RADIX_BINS-1); */ + /* const uint offset = local_offset[bin]; */ + /* output[offset] = input[ID]; */ + /* local_offset[bin]++; */ + /* } */ +} +#endif + +#if 1 +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(SCATTER_WG_SIZE, 1, 1))) +void kernel +sort_morton_codes_scatter_items( + global struct Globals *globals, + global uint *global_histogram, + global ulong *input, + global ulong *output, + uint iteration, + uint numGroups, + uint update_morton_sort_in_flight) +{ + local uint local_offset[RADIX_BINS]; + local uint flags[RADIX_BINS*SCATTER_WG_SIZE/32]; + const uint numItems = globals->numPrimitives; + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset, + flags, iteration, numGroups, numItems, false, update_morton_sort_in_flight); + else + sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset, + flags, iteration, numGroups, numItems, true, update_morton_sort_in_flight); +} + +#else + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +sort_morton_codes_scatter_items( + global struct Globals *globals, + uint shift, + global uint *global_histogram, + global char *input0, + global char *input1, + unsigned int input0_offset, + unsigned int input1_offset, + uint iteration) +{ + const uint numItems = globals->numPrimitives; + const uint local_size = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + const uint localID = get_local_id(0); + const uint globalID = get_local_id(0) + get_group_id(0)*get_local_size(0); + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint startID = (taskID + 0) * numItems / numTasks; + const uint endID = (taskID + 1) * numItems / numTasks; + + global ulong *input = (global ulong *)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset); + global ulong *output = (global ulong *)((iteration % 2) == 0 ? 
input1 + input1_offset : input0 + input0_offset); + + local uint local_offset[RADIX_BINS]; + uint off = 0; + for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) + { + const uint count = global_histogram[RADIX_BINS * numTasks + i]; + const uint offset_task = global_histogram[RADIX_BINS * taskID + i]; + const uint sum = sub_group_reduce_add(count); + const uint prefix_sum = sub_group_scan_exclusive_add(count); + local_offset[i] = off + offset_task + prefix_sum; + off += sum; + } + + for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) + { + const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1); + const uint offset = atomic_add_local(&local_offset[bin], 1); + output[offset] = input[ID]; + } + + /* uint local_offset[RADIX_BINS]; */ + /* uint offset_global = 0; */ + /* for (int i=0;i> shift) & (RADIX_BINS-1); */ + /* const uint offset = local_offset[bin]; */ + /* output[offset] = input[ID]; */ + /* local_offset[bin]++; */ + /* } */ +} +#endif + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(512, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) +void kernel +sort_morton_codes_merged( + global struct Globals* globals, + global uint* global_histogram, + global uchar* input, + uint iteration, + uint numGroups +) +{ + const uint numItems = globals->numPrimitives; + const uint taskID = get_group_id(0); + const uint loc_id = get_local_id(0); + const uint lane = get_sub_group_local_id(); + + uint2 ids = get_thread_range(numItems, numGroups, taskID); + uint startID = ids.x; + uint endID = ids.y; + + local uint histogram[RADIX_BINS]; + local uint hist_tmp[RADIX_BINS]; + + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + { + sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, false, + taskID, startID, endID); + } + else + { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + iteration -= req_iterations; + + sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, true, + taskID, startID, endID); + } + + uint last_group = 0; + if (loc_id == 0) + last_group = atomic_inc_global(&globals->morton_sort_in_flight); + + write_mem_fence(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + + last_group = work_group_broadcast(last_group, 0); + + bool isLastGroup = (loc_id < RADIX_BINS) && (last_group == numGroups - 1); + + uint global_count = 0; + + if (isLastGroup) + { + for (uint j = 0; j < numGroups; j++) + { + const uint count = (j == taskID) ? histogram[loc_id] : load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + loc_id], 0); + store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + loc_id], 0, global_count); + global_count += count; + } + + hist_tmp[get_sub_group_id()] = (get_sub_group_id() < MAX_HW_SIMD_WIDTH) ? sub_group_reduce_add(global_count) : 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (isLastGroup) + { + uint p = hist_tmp[lane]; + p = (lane < get_sub_group_id()) ? 
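+            // keep only the totals of the sub-groups that precede this one, so the
+            // reduce below yields the exclusive prefix over sub-group totals; adding
+            // the in-sub-group exclusive scan then gives each bin its global base
+            // offset, mirroring sort_morton_codes_reduce_bins above.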
p : 0; + + global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count); + + store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numGroups + loc_id], 0, global_count); + } +} + +#if 0 +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +__attribute__((intel_reqd_sub_group_size(16))) void kernel +sort_morton_codes_bin_items( + global struct Globals* globals, + uint shift, + global uint* global_histogram, + global char* input0, + global char* input1, + unsigned int input0_offset, + unsigned int input1_offset, + uint iteration) +{ + const uint numItems = globals->numPrimitives; + const uint local_size = get_local_size(0); + const uint taskID = get_group_id(0); + const uint numTasks = get_num_groups(0); + const uint localID = get_local_id(0); + const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); + const uint subgroupLocalID = get_sub_group_local_id(); + const uint subgroup_size = get_sub_group_size(); + + const uint startID = (taskID + 0) * numItems / numTasks; + const uint endID = (taskID + 1) * numItems / numTasks; + + global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset); + +#if 1 + local uint histogram[RADIX_BINS]; + for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) + histogram[i] = 0; + + for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) + { + const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1); + atomic_add(&histogram[bin], 1); + } + + for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) + global_histogram[RADIX_BINS * taskID + i] = histogram[i]; + +#else + uint histogram[RADIX_BINS]; + for (int i = 0; i < RADIX_BINS; i++) + histogram[i] = 0; + + for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) + { + const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1); + histogram[bin]++; + } + + for (uint i = 0; i < RADIX_BINS; i++) + { + const uint reduced_counter = sub_group_reduce_add(histogram[i]); + global_histogram[RADIX_BINS * taskID + i] = reduced_counter; + } +#endif +} + +#endif + +#define WG_SIZE_WIDE 256 +#define SG_SIZE_SCAN 16 + +// Fast implementation of work_group_scan_exclusive using SLM for WG size 256 and SG size 16 +GRL_INLINE uint work_group_scan_exclusive_add_opt(local uint* tmp, uint val) +{ + const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE_SCAN; + const uint sg_local_id = get_local_id(0) % SG_SIZE_SCAN; + const uint NUM_HW_THREADS_IN_WG = WG_SIZE_WIDE / SG_SIZE_SCAN; + + uint acc = sub_group_scan_exclusive_add(val); + uint acc2 = acc + val; + + tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc2, SG_SIZE_SCAN - 1); + barrier(CLK_LOCAL_MEM_FENCE); + uint loaded_val = tmp[sg_local_id]; + uint wgs_acc = sub_group_scan_exclusive_add(loaded_val); + uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id); + return acc + acc_for_this_hw_thread; +} + +// Wide reduce algorithm is divided into 2 kernels: +// 1. First, partial exclusive add scans are made within each work group using SLM. +// Then, The last work group for each histogram bin perform exclusive add scan along the bins using separate histgram_partials buffer. +// Last work group is determined using global atomics on wg_flags buffer. +// 2. Second kernel globally adds the values from histgram_partials to the histogram buffer where partial sums are. 
+// Then, last work group performs one more work_group scan and add so the histogram buffer values are adjusted with the global ones. +GRL_INLINE void sort_morton_codes_reduce_bins_wide_partial_sum_func( + global struct Globals* globals, + global uint* global_histogram, + global uint* global_histogram_partials, + global uint* wg_flags, + local uint* exclusive_scan_tmp, + uint numTasks, + uint numGroups, + uint iteration, + bool shift_primID) +{ + if (shift_primID) + { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + iteration -= req_iterations; + } + + const uint groupID = get_group_id(0) % RADIX_BINS; + const uint scanGroupID = get_group_id(0) / RADIX_BINS; + uint localID = get_local_id(0); + uint globalID = localID + (scanGroupID * WG_SIZE_WIDE); + const uint lastGroup = (numGroups / WG_SIZE_WIDE); + const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1; + + uint temp = 0; + uint last_count = 0; + if (globalID < numTasks) + { + temp = global_histogram[RADIX_BINS * globalID + groupID]; + + // Store the last value of the work group, it is either last element of histogram or last item in work group + if (globalID == endID) + last_count = temp; + } + + uint val = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp); + + if (globalID <= numTasks) + { + global_histogram[RADIX_BINS * globalID + groupID] = val; + + // Store the block sum value to separate buffer + if (globalID == endID) + global_histogram_partials[scanGroupID * WG_SIZE_WIDE + groupID] = val + last_count; + } + + // Make sure that global_histogram_partials is updated in all work groups + write_mem_fence(CLK_GLOBAL_MEM_FENCE); + barrier(0); + + // Now, wait for the last group for each histogram bin, so we know that + // all work groups already updated the global_histogram_partials buffer + uint last_group = 0; + if (localID == 0) + last_group = atomic_inc_global(&wg_flags[groupID]); + + last_group = work_group_broadcast(last_group, 0); + bool isLastGroup = (last_group == lastGroup - 1); + + // Each of the last groups computes the scan exclusive add for each partial sum we have + if (isLastGroup) + { + uint temp1 = 0; + if (localID < lastGroup) + temp1 = global_histogram_partials[localID * WG_SIZE_WIDE + groupID]; + + uint val2 = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp1); + + if (localID < lastGroup) + global_histogram_partials[localID * WG_SIZE_WIDE + groupID] = val2; + } +} + +GRL_INLINE void sort_morton_codes_reduce_bins_wide_add_reduce_func( + global struct Globals* globals, + global uint* global_histogram, + global uint* global_histogram_partials, + local uint* partials, + uint numTasks, + uint numGroups, + uint iteration, + bool shift_primID) +{ + if (shift_primID) + { + const uint req_iterations = globals->sort_iterations; + if (iteration < req_iterations) + return; + + iteration -= req_iterations; + } + + const uint groupID = get_group_id(0) % RADIX_BINS; + const uint scanGroupID = get_group_id(0) / RADIX_BINS; + const uint lastGroup = (numGroups / WG_SIZE_WIDE); + uint localID = get_local_id(0); + uint globalID = localID + (scanGroupID * WG_SIZE_WIDE); + const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1; + + // Add the global sums to the partials, skip the firsy scanGroupID as the first add + // value is 0 in case of exclusive add scans + if (scanGroupID > 0 && globalID <= numTasks) + { + uint add_val = global_histogram_partials[scanGroupID * RADIX_BINS + groupID]; + 
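+        // Illustrative example (hypothetical sizes): with numTasks = 1024 task
+        // histograms and WG_SIZE_WIDE = 256, each bin is covered by four scan blocks.
+        // The partial-sum kernel turned every block into block-local exclusive offsets
+        // and scanned the four block totals in global_histogram_partials; here every
+        // task in blocks 1..3 adds its block's base so the offsets become global
+        // across all 1024 tasks, before the elected last work group builds the
+        // cross-bin prefix.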
atomic_add_global(&global_histogram[globalID * RADIX_BINS + groupID], add_val); + } + + // Wait for the last group + uint last_group = 0; + if (localID == 0) + last_group = atomic_inc_global(&globals->morton_sort_in_flight); + + last_group = work_group_broadcast(last_group, 0); + bool isLastGroup = (last_group == numGroups - 1); + + // Do the exclusive scan within all bins with global data now + if (isLastGroup) + { + mem_fence_gpu_invalidate(); + + uint global_count = global_histogram[numTasks * RADIX_BINS + localID]; + + partials[get_sub_group_id()] = sub_group_reduce_add(global_count); + + barrier(CLK_LOCAL_MEM_FENCE); + + uint lane = get_sub_group_local_id(); + uint p = partials[lane]; + p = (lane < get_sub_group_id()) ? p : 0; + + global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count); + + store_uint_L1WB_L3WB(&global_histogram[numTasks * RADIX_BINS + localID], 0, global_count); + } +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN))) +void kernel +sort_morton_codes_reduce_bins_wide_partial_sum( + global struct Globals* globals, + uint numTasks, + uint numGroups, + global uint* global_histogram, + global uint* global_histogram_partials, + global uint* wg_flags, + uint iteration) +{ + local uint exclusive_scan_tmp[WG_SIZE_WIDE / SG_SIZE_SCAN]; + + const uint numItems = globals->numPrimitives; + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, false); + else + sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, true); +} + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1))) +__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN))) +void kernel +sort_morton_codes_reduce_bins_wide_add_reduce( + global struct Globals* globals, + uint numTasks, + uint numGroups, + global uint* global_histogram, + global uint* global_histogram_partials, + uint iteration) +{ + local uint partials[RADIX_BINS]; + + const uint numItems = globals->numPrimitives; + if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) + sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, false); + else + sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, true); +} diff --git a/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl b/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl new file mode 100644 index 00000000000..dee315adcda --- /dev/null +++ b/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl @@ -0,0 +1,297 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module msb_radix_bitonic_sort; + +kernel_module msb_radix_sort ("morton_msb_radix_bitonic_sort.cl") +{ + links lsc_intrinsics; + + kernel opencl_debug_print < kernelFunction="debug_print_kernel">; + kernel opencl_check_bls < kernelFunction="check_bls_sort">; + + kernel opencl_bottom_level_sort_single_wg < kernelFunction="sort_morton_codes_bottom_level_single_wg">; + + kernel opencl_build_morton_kernel_sort_msb_init < kernelFunction="sort_morton_codes_msb_begin">; + + kernel 
opencl_build_morton_kernel_sort_msb_scheduler < kernelFunction="scheduler">; + + kernel opencl_build_morton_kernel_sort_bottom_level < kernelFunction="sort_morton_codes_bottom_level">; + + kernel opencl_build_morton_kernel_sort_msb_count_items < kernelFunction="sort_morton_codes_msb_count_items">; + kernel opencl_build_morton_kernel_sort_msb_bin_items < kernelFunction="sort_morton_codes_msb_bin_items">; + + kernel opencl_build_morton_kernel_sort_batched_bls_dispatch < kernelFunction="sort_morton_codes_batched_BLS_dispatch">; +} + + +const MSB_RADIX_NUM_VCONTEXTS = 8; +const BOTTOM_LEVEL_SORT_THRESHOLD = 512; + +struct MSBRadixScheduler +{ + dword num_wgs_msb; + dword num_wgs_bls; + + dword scheduler_postsync; + dword _pad1; +}; + +struct MSBRadixArgs +{ + qword p_scheduler; + qword p_num_primitives; +}; + + + + +struct BatchedBLSDispatchEntry +{ + qword p_data_buffer; + qword num_elements; // number of elements in p_data_buffer +}; + + + + +metakernel add_bls_dispatch_init(qword p_storage) +{ + define REG_numWgs REG14; + define REG_p_storage REG15; + + REG_numWgs = 0; + REG_p_storage = p_storage; +} + + + + +// basically this code does: +// bls_args_for_dispatches[dispatchID] = { bls_new_pointer, numPrimitives }; +// dispatchId++; +// +metakernel add_bls_dispatch( + qword p_data, + qword p_num_primitives +) +{ + define C_1 REG0; + define C_8 REG1; + + define C_MIN_PRIMREFS REG2; + + define REG_p_data REG3; + define REG_num_prims REG4; + define REG_no_dispatch REG5; + + define REG_numWgs REG14; + define REG_p_storage REG15; + + C_MIN_PRIMREFS = 2; + + REG_num_prims = 0; + REG_num_prims.lo = load_dword(p_num_primitives); + + REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS; + + goto l_finish if(REG_no_dispatch.lo); + + C_1 = 1; + C_8 = 8; + + // pseudocode: BatchedBLSDispatchEntry.p_data_buffer = p_data + REG_p_data = p_data; + store_qword( REG_p_storage, REG_p_data ); // store the data pointer + + REG_p_storage = REG_p_storage + C_8; // point to next member in BatchedBLSDispatchEntry struct + + // pseudocode: BatchedBLSDispatchEntry.num_elements = *p_num_primitives + store_qword( REG_p_storage, REG_num_prims ); + + REG_p_storage = REG_p_storage + C_8; // point to next BatchedBLSDispatchEntry instance + + REG_numWgs = REG_numWgs + C_1; + +l_finish: + +} + + + + +metakernel batched_bls_dispatch( + qword private_mem +) +{ + define REG_numWgs REG14; + + DISPATCHDIM_X = REG_numWgs; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect opencl_build_morton_kernel_sort_batched_bls_dispatch args(private_mem); +} + + + + +metakernel sort_bottom_level( + qword build_globals, + qword input, + qword p_num_primitives) +{ + define REG_num_prims REG0; + define C_MIN_PRIMREFS REG1; + define REG_no_dispatch REG2; + + REG_num_prims = load_dword( p_num_primitives ); + + C_MIN_PRIMREFS = 2; + + REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS; + + goto l_finish if(REG_no_dispatch.lo); + + dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input); + +l_finish: + +} + + + + +metakernel sort( + qword build_globals, + qword input, + qword tmp, + MSBRadixArgs sort_args) +{ + define REG_num_prims REG0; + { + define C_MIN_PRIMREFS REG1; + define C_MAX_PRIMREFS REG2; + define REG_no_dispatch REG3; + define REG_dispatch_single_wg REG4; + + REG_num_prims = load_dword( sort_args.p_num_primitives ); + C_MIN_PRIMREFS = 2; + C_MAX_PRIMREFS = BOTTOM_LEVEL_SORT_THRESHOLD; + + REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS; + REG_dispatch_single_wg = REG_num_prims < C_MAX_PRIMREFS; + + 
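        // pseudocode for the dispatch selection below:
        //   if (num_prims < 2)                                 -> nothing to sort, finish
        //   else if (num_prims < BOTTOM_LEVEL_SORT_THRESHOLD)  -> single-WG bottom-level sort
        //   else                                               -> full MSB radix sort pipeline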
goto l_sort_finish if(REG_no_dispatch.lo); + goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo); + goto l_full_sort; + } + +l_dispatch_single_wg: + + { + dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input); + goto l_sort_finish; + } + +l_full_sort: + + define p_scheduler sort_args.p_scheduler; + define p_scheduler_postsync (sort_args.p_scheduler + offsetof(MSBRadixScheduler.scheduler_postsync) ); + define p_num_wgs_bls (sort_args.p_scheduler + offsetof(MSBRadixScheduler.num_wgs_bls) ); + + define REG_scheduler_postsync REG3; + REG_scheduler_postsync = p_scheduler_postsync; + + define C_0 REG4; + define C_8 REG5; + define C_255 REG6; + C_0 = 0; + C_8 = 8; + C_255 = 255; + + store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore + + REG_num_prims = REG_num_prims + C_255; + REG_num_prims = REG_num_prims >> C_8; + + DISPATCHDIM_X = REG_num_prims.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + control( cs_store_fence ); // commit the semaphore write + + // initialize the whole execution + dispatch opencl_build_morton_kernel_sort_msb_init (MSB_RADIX_NUM_VCONTEXTS, 1, 1) args(build_globals, sort_args.p_scheduler, input, tmp) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on count_items kernel + semaphore_wait while( *p_scheduler_postsync != 1 ); + + dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler) + postsync store_dword( p_scheduler_postsync, 2 ); + + // wait on count_items kernel + semaphore_wait while( *p_scheduler_postsync != 2 ); + + dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input) + postsync store_dword( p_scheduler_postsync, 0 ); + + define C_MASK_HI REG4; + C_MASK_HI = 0x00000000ffffffff; + + l_build_loop: + { + semaphore_wait while( *p_scheduler_postsync != 0 ); + { + dispatch opencl_build_morton_kernel_sort_msb_scheduler(1,1,1) args( sort_args.p_scheduler, input, tmp ) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on scheduler kernel + semaphore_wait while( *p_scheduler_postsync != 1 ); + } + + // load and process the scheduler results + define REG_wg_counts REG0; + define REG_num_msb_wgs REG0.lo; + define REG_num_bls_wgs REG0.hi; + define REG_p_scheduler REG1; + define REG_no_msb_wgs REG2; + { + REG_p_scheduler = p_scheduler; + REG_wg_counts = load_qword( REG_p_scheduler ); + + REG_no_msb_wgs = REG_wg_counts & C_MASK_HI; + REG_no_msb_wgs = REG_no_msb_wgs == 0; + } + + // dispatch new bls WGs + DISPATCHDIM_X = REG_num_bls_wgs; + dispatch_indirect opencl_build_morton_kernel_sort_bottom_level args( p_scheduler, input ); + + // jump out if there are no msb WGs + goto l_sort_finish if (REG_no_msb_wgs); + + DISPATCHDIM_X = REG_num_msb_wgs; + dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler) + postsync store_dword( p_scheduler_postsync, 2 ); + + // wait on count_items kernel + semaphore_wait while( *p_scheduler_postsync != 2 ); + + dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input) + postsync store_dword( p_scheduler_postsync, 0 ); + + // wait till all BLS finished launching + semaphore_wait while( *p_num_wgs_bls != 0 ); + + goto l_build_loop; + } + +l_sort_finish: + +} diff --git a/src/intel/vulkan/grl/gpu/new_sah_builder.grl b/src/intel/vulkan/grl/gpu/new_sah_builder.grl new file mode 100644 index 00000000000..d0a9694acc2 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/new_sah_builder.grl @@ -0,0 +1,665 @@ +// +// 
Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module new_sah_builder; + +kernel_module bfs_kernels ("bvh_build_BFS.cl") +{ + links lsc_intrinsics; + + kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial < kernelFunction="BFS_pass1_initial" > ; + kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed < kernelFunction="BFS_pass1_indexed" > ; + kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial < kernelFunction="BFS_pass2_initial" > ; + kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed < kernelFunction="BFS_pass2_indexed" > ; + + kernel opencl_build_kernel_BinnedSAH_DFS < kernelFunction="DFS" >; + // kernel opencl_build_kernel_BinnedSAH_BuildQNodes < kernelFunction="build_qnodes" >; + kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff < kernelFunction="build_qnodes_pc_kickoff" >; + kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify < kernelFunction="build_qnodes_pc_amplify" >; + kernel opencl_build_kernel_BinnedSAH_begin < kernelFunction = "begin" >; + kernel opencl_build_kernel_BinnedSAH_scheduler < kernelFunction = "scheduler" >; + + kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch < kernelFunction="BFS_pass1_initial_batchable" >; + kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch < kernelFunction="BFS_pass1_indexed_batchable" >; + kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch < kernelFunction="BFS_pass2_initial_batchable" >; + kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch < kernelFunction="BFS_pass2_indexed_batchable" >; + + kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >; + kernel opencl_build_kernel_BinnedSAH_begin_batched < kernelFunction="begin_batchable" >; + + kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched < kernelFunction="build_qnodes_init_scheduler_batched" >; + kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched < kernelFunction="build_qnodes_begin_batchable" >; + kernel opencl_build_kernel_BinnedSAH_qnode_scheduler < kernelFunction="build_qnodes_scheduler" >; + kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch < kernelFunction="build_qnodes_pc_amplify_batched" >; + + kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >; + +} + +kernel opencl_build_kernel_DFS_single_wg < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" > +kernel opencl_build_kernel_DFS_trivial < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial" > +kernel opencl_build_kernel_DFS_single_wg_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" > +kernel opencl_build_kernel_DFS_trivial_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable" > + +kernel single_pass_binsah < source="bvh_build_DFS.cl", kernelFunction="DFS" > + + +const DFS_MIN_PRIMREFS = 6; +const DFS_MAX_PRIMREFS = 256; +const BFS_WG_SIZE_SHIFT = 9; + + + +struct Scheduler +{ + dword num_bfs_wgs; + dword num_dfs_wgs; + + dword scheduler_postsync; + dword _pad1; + + dword num_trivial_builds; + dword num_single_builds; + + dword batched_build_wg_count; + dword batched_build_loop_mask; + +}; + + +struct SAHBuildArgs +{ + qword p_num_primitives; + qword p_qnode_child_buffer; + qword p_scheduler; + qword p_sah_globals; + qword p_globals; + qword p_primref_buffer; + qword p_primref_index_buffers; + qword p_bvh_base; + qword p_bvh2; + qword p_root_buffer_counters; + dword sah_build_flags; + 
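    // leaf sizing parameters; forwarded to opencl_build_kernel_BinnedSAH_begin by the
    // new_sah_build metakernel below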
dword leaf_size; + dword leaf_type; + dword max_internal_nodes; +}; + + +metakernel single_pass_binsah( + qword build_globals, + qword bvh_buffer, + qword build_primref_buffer, + qword build_primref_index_buffers, + dword alloc_backpointers ) +{ + + dispatch single_pass_binsah(1, 1, 1) args( + build_globals, + bvh_buffer, + build_primref_buffer, + build_primref_index_buffers, + alloc_backpointers + ); + +} + + + +metakernel new_sah_build( SAHBuildArgs build_args ) +{ + define REG_num_prims REG0; + + { + define C_MIN_PRIMREFS REG1; + define C_MAX_PRIMREFS REG2; + define REG_dispatch_trivial REG3; + define REG_dispatch_single_wg REG4; + + REG_num_prims = load_dword( build_args.p_num_primitives ); + C_MIN_PRIMREFS = DFS_MIN_PRIMREFS; + C_MAX_PRIMREFS = DFS_MAX_PRIMREFS; + + REG_dispatch_trivial = REG_num_prims <= C_MIN_PRIMREFS; + REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS; + + goto l_dispatch_trivial if(REG_dispatch_trivial.lo); + goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo); + goto l_full_build; + } + +l_dispatch_trivial: + { + dispatch opencl_build_kernel_DFS_trivial (1,1,1) + args( build_args.p_globals, + build_args.p_bvh_base, + build_args.p_primref_buffer, + build_args.p_primref_index_buffers, + build_args.sah_build_flags + ); + + control( wait_idle ); + goto l_done; + } + +l_dispatch_single_wg: + { + dispatch opencl_build_kernel_DFS_single_wg (1,1,1) + args( build_args.p_globals, + build_args.p_bvh_base, + build_args.p_primref_buffer, + build_args.p_primref_index_buffers, + build_args.sah_build_flags + ); + + control( wait_idle ); + goto l_done; + } + + +l_full_build: + + + { + define p_scheduler build_args.p_scheduler; + define p_num_dfs_wgs build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs); + define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) ); + define C_0 REG1; + define C_8 REG2; + C_8 = 8; + C_0 = 0; + + + // + // Init pass + // + store_dword( p_scheduler_postsync, C_0.lo ); + + // compute number of BFS WGs from prim-count + // NOTE: This code uses a hardcoded WG size of 512 for BFS + // If the BFS wg size ever changes, it needs to be touched + // This is necessary because DG2 shifter only supports POW2 shifts + { + define REG_scheduler_postsync REG3; + define C_511 REG4; + define C_1 REG5; + + REG_scheduler_postsync = p_scheduler_postsync; + C_511 = 511; + C_1 = 1; + + store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore + + REG_num_prims = REG_num_prims + C_511; + REG_num_prims = REG_num_prims >> C_8; + REG_num_prims = REG_num_prims >> C_1; + + DISPATCHDIM_X = REG_num_prims.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + control( cs_store_fence ); // commit the semaphore write + + // launch scheduler init kernel + dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1) + args( + build_args.p_scheduler, + build_args.leaf_size, + build_args.leaf_type, + build_args.p_primref_index_buffers, + build_args.p_primref_buffer, + build_args.p_bvh2, + build_args.p_bvh_base, + build_args.p_globals, + build_args.p_sah_globals, + build_args.p_qnode_child_buffer, + build_args.sah_build_flags + ) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on init kernel + semaphore_wait while( *p_scheduler_postsync != 1 ); + + // launch BFS1 pass1 + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial + args( build_args.p_scheduler, + build_args.p_sah_globals) + postsync store_dword( p_scheduler_postsync, 0 ); + + // wait on BFS pass1 + semaphore_wait while( *p_scheduler_postsync != 
0 ); + + // launch BFS pass2 + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial + args( build_args.p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 1 ); + } + + // after BFS pass 2 we drop into a scheduling loop + + l_build_loop: + { + semaphore_wait while( *p_scheduler_postsync != 1 ); + + { + dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1) + args( build_args.p_scheduler, build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 0 ); + + // wait on the scheduler + semaphore_wait while( *p_scheduler_postsync != 0 ); + } + + // load and process the scheduler results + define REG_wg_counts REG0; + define REG_num_bfs_wgs REG0.lo; + define REG_num_dfs_wgs REG0.hi; + define REG_loop_break REG1; + define REG_p_scheduler REG2; + { + REG_p_scheduler = p_scheduler; + REG_wg_counts = load_qword( REG_p_scheduler ); + + define C_MASK_LO REG3 ; + C_MASK_LO = 0xffffffff; + + REG_loop_break = REG_wg_counts & C_MASK_LO; + REG_loop_break = REG_loop_break == 0; + } + + // dispatch new DFS WGs + DISPATCHDIM_X = REG_num_dfs_wgs; + dispatch_indirect opencl_build_kernel_BinnedSAH_DFS + args( p_scheduler, + build_args.p_sah_globals ); + + // jump out if there are no bfs WGs + goto l_build_qnodes if (REG_loop_break); + + // dispatch new BFS1 WGs + DISPATCHDIM_X = REG_num_bfs_wgs; + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed + args( p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 2 ); + + semaphore_wait while( *p_scheduler_postsync != 2 ); + + // dispatch new BFS2 WGs + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed + args( p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 1 ); + + //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore + + // wait until all upcoming DFS WGs have finished launching + // so that the scheduler can refill the launch array + // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely) + semaphore_wait while( *p_num_dfs_wgs != 0 ); + + + goto l_build_loop; + } + } + +l_build_qnodes: + + control( wait_idle ); + + // P/C qnode build + + dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1) + args( build_args.p_sah_globals, + build_args.p_qnode_child_buffer, + build_args.sah_build_flags ); + + { + define p_pc_counters ( build_args.p_root_buffer_counters ); + + define REG_addr REG0; + define REG_produced REG1; + define REG_consumed REG2; + define REG_have_work REG3; + define REG_wg_count REG4; + define C_8 REG5; + define C_16 REG6; + define C_1 REG7; + C_1 = 1; + C_8 = 8; + C_16 = 16; + REG_addr = build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address + + REG_consumed = 0; + + l_qnode_loop: + + control( wait_idle ); // wait for previous pass + + // load counters and compute number of wgs to respawn + REG_produced = load_qword( REG_addr ); REG_addr = REG_addr + C_8; + REG_wg_count = REG_produced - REG_consumed; + REG_have_work = REG_wg_count > 0; + + goto l_done if not(REG_have_work.lo); + + // save REG_consumed as a starting position in p_qnode_child_buffer + store_qword(REG_addr, REG_consumed); REG_addr = REG_addr + C_8; + + // save REG_produced as ending position in p_qnode_child_buffer + store_qword(REG_addr, REG_produced); REG_addr = 
REG_addr - C_16; + + REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration + + // calculate amount of workgroups to schedule + REG_wg_count = REG_wg_count + C_1; + REG_wg_count = REG_wg_count >> C_1; + + DISPATCHDIM_X = REG_wg_count.lo; + + control( cs_store_fence ); // commit the stores + + dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify + args( build_args.p_sah_globals, + build_args.p_qnode_child_buffer, + build_args.sah_build_flags); + + goto l_qnode_loop; + } + +l_done: +} + + + + + + + + + +struct SAHBuildArgsBatchable +{ + qword p_globals_ptrs; + qword p_scheduler; + qword p_buffers_info; + qword p_sah_globals; + + dword num_max_qnode_global_root_buffer_entries; + dword num_builds; + +}; + + +metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args ) +{ + define p_scheduler build_args.p_scheduler; + define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) ); + define p_num_dfs_wgs (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs)); + + // initialize scheduler semaphore + REG0.lo = 0; + store_dword( p_scheduler_postsync, REG0.lo ); + + + // dispatch categorization pass + dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1) + args( + build_args.p_scheduler, + build_args.p_globals_ptrs, + build_args.p_buffers_info, + build_args.p_sah_globals, + build_args.num_builds + ) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on the categorization pass + semaphore_wait while( *p_scheduler_postsync != 1 ); + + + // dispatch the trivial and single-WG passes + { + REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) ); + DISPATCHDIM_X = REG0.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + // dispatch trivial builds + + dispatch_indirect opencl_build_kernel_DFS_trivial_batch + args( build_args.p_sah_globals ); + + control( wait_idle ); + + // dispatch single-wg builds + + DISPATCHDIM_X = REG0.hi; + dispatch_indirect opencl_build_kernel_DFS_single_wg_batch + args( build_args.p_sah_globals, build_args.p_scheduler ); + } + + // compute the number of builds not covered by the trivial passes + // skip the builder loop if all builds are satisfied by trivial passes + { + REG1 = REG0.lo; + REG2 = REG0.hi; + REG3 = build_args.num_builds; + REG5 = REG2 + REG1; + REG5 = REG3 - REG5; + REG4 = REG5 == 0 ; + + goto l_done if (REG4.lo); + } + + // REG5 (number of non-trivial builds) will be used to launch build_qnodes kernel after the build loop + define REG_num_nontrivial REG5; + +l_build_outer_loop: + { + + // configure the scheduler to initiate a new block of builds + + dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1) + args( build_args.p_scheduler, build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 0 ); + + // wait on init kernel + semaphore_wait while( *p_scheduler_postsync != 0 ); + + + // read results produced by scheduler init kernel + // lo == BFS wg count. 
hi == all ones if we need to loop again + // + REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count); + REG4 = load_qword( REG0 ); + + // launch BFS1 pass1 + DISPATCHDIM_X = REG4.lo; + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch + args( build_args.p_scheduler, + build_args.p_sah_globals) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on BFS pass1 + semaphore_wait while( *p_scheduler_postsync != 1 ); + + // launch BFS pass2 + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch + args( build_args.p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 0 ); + + l_build_loop: + { + semaphore_wait while( *p_scheduler_postsync != 0 ); + + { + dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1) + args( build_args.p_scheduler, build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 1 ); + + // wait on the scheduler + semaphore_wait while( *p_scheduler_postsync != 1 ); + } + + // load and process the scheduler results + define REG_wg_counts REG0; + define REG_num_bfs_wgs REG0.lo; + define REG_num_dfs_wgs REG0.hi; + define REG_loop_break REG1; + define REG_p_scheduler REG2; + { + REG_p_scheduler = p_scheduler; + REG_wg_counts = load_qword( REG_p_scheduler ); + + define C_MASK_LO REG3 ; + C_MASK_LO = 0xffffffff; + + REG_loop_break = REG_wg_counts & C_MASK_LO; + REG_loop_break = REG_loop_break == 0; + } + + // dispatch new DFS WGs + DISPATCHDIM_X = REG_num_dfs_wgs; + dispatch_indirect opencl_build_kernel_BinnedSAH_DFS + args( p_scheduler, + build_args.p_sah_globals ); + + // jump out if there are no bfs WGs + goto l_continue_outer_loop if (REG_loop_break); + + // dispatch new BFS1 WGs + DISPATCHDIM_X = REG_num_bfs_wgs; + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch + args( p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 2 ); + + semaphore_wait while( *p_scheduler_postsync != 2 ); + + // dispatch new BFS2 WGs + dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch + args( p_scheduler, + build_args.p_sah_globals ) + postsync store_dword( p_scheduler_postsync, 0 ); + + //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore + + // wait until all upcoming DFS WGs have finished launching + // so that the scheduler can refill the launch array + // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely) + semaphore_wait while( *p_num_dfs_wgs != 0 ); + + goto l_build_loop; + } + + + l_continue_outer_loop: + + + goto l_build_outer_loop if(REG4.hi); + + } + +//////// +// +// Qnode build phase +// +//////// + + // Wait for all outstanding DFS dispatches to complete, then build the QNodes + control( wait_idle ); + + define REG_wg_counts REG1; + define REG_p_scheduler REG2; + define REG_have_work REG3; + define REG_GRB_NUM_MAX_ENTRIES REG4; + + // init scheduler for qnode phase + dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1) + args( build_args.p_scheduler, + build_args.num_builds, + build_args.num_max_qnode_global_root_buffer_entries); + + REG_p_scheduler = p_scheduler; + + control( wait_idle ); + + REG_wg_counts = load_qword( REG_p_scheduler ); + + DISPATCHDIM_X = REG_wg_counts.lo; + + // configure the scheduler to initiate a new block of builds + dispatch_indirect 
opencl_build_kernel_BinnedSAH_qnode_begin_batched + args( build_args.p_scheduler, + build_args.p_sah_globals); + + // read results produced by init scheduler kernel + // lo == num of builds processed. hi == num of maximum global root buffer entries + // + REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count); + REG5 = load_qword( REG0 ); + + REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi; + REG_GRB_NUM_MAX_ENTRIES.hi = 0; + +l_qnode_loop: + { + control( wait_idle ); // wait for previous pass + + dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1) args( build_args.p_scheduler ); + + control( wait_idle ); + + REG_wg_counts = load_qword( REG_p_scheduler ); + REG_have_work = REG_wg_counts > 0; + + goto l_done if not(REG_have_work.lo); + + DISPATCHDIM_X = REG_wg_counts.lo; + + dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch + args( build_args.p_sah_globals, + build_args.p_scheduler ); + + control( wait_idle ); + + REG_wg_counts = load_qword( REG_p_scheduler ); // reload values + REG_wg_counts.lo = REG_wg_counts.hi; + REG_wg_counts.hi = 0; + + REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES; + + goto l_qnode_loop if not(REG_have_work.lo); + + DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled + + dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched + args( build_args.p_sah_globals, + build_args.p_scheduler ); + + goto l_qnode_loop; + } + +//////// +// +// Old implementation - TODO: maybe add switch between two implementations? +// +//////// + // Wait for all outstanding DFS dispatches to complete, then build the QNodes + //DISPATCHDIM_X = REG5.lo; + + //dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes + // args( build_args.p_sah_globals, build_args.p_scheduler ); + + +l_done: + + control( wait_idle ); + +} diff --git a/src/intel/vulkan/grl/gpu/postbuild_info.grl b/src/intel/vulkan/grl/gpu/postbuild_info.grl new file mode 100644 index 00000000000..3039e533a9b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/postbuild_info.grl @@ -0,0 +1,49 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module postbuild_info; // In postbuild we assume output data structure to be DXR compatible + +kernel compacted_size < source="bvh_postbuild_info.cl", kernelFunction="compacted_size" > +kernel current_size < source="bvh_postbuild_info.cl", kernelFunction="current_size" > +kernel serialized_size < source="bvh_postbuild_info.cl", kernelFunction="serialized_size" > +kernel decoded_size < source="bvh_postbuild_info.cl", kernelFunction="decoded_size" > + +metakernel compacted_size( + qword bvh, + qword postbuildInfo) +{ + dispatch compacted_size(1,1,1) args( + bvh, + postbuildInfo); +} + +metakernel current_size( + qword bvh, + qword postbuildInfo) +{ + dispatch current_size(1,1,1) args( + bvh, + postbuildInfo); +} + +metakernel serialized_size( + qword bvh, + qword postbuildInfo) +{ + dispatch serialized_size(1,1,1) args( + bvh, + postbuildInfo); +} + +metakernel decoded_size( + qword bvh, + qword postbuildInfo) +{ + dispatch decoded_size(1,1,1) args( + bvh, + postbuildInfo); +} diff --git a/src/intel/vulkan/grl/gpu/presplit.grl b/src/intel/vulkan/grl/gpu/presplit.grl new file mode 100644 index 00000000000..d0f6e53fbb1 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/presplit.grl @@ -0,0 +1,62 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module presplit; + +kernel_module presplit_kernels 
("bvh_build_presplit.cl") +{ + links lsc_intrinsics; + + kernel opencl_kernel_compute_num_presplits < kernelFunction="compute_num_presplits" >; + kernel opencl_kernel_priority_sum < kernelFunction="priority_sum" >; + kernel opencl_kernel_perform_presplits < kernelFunction="perform_presplits" >; +} + +import struct MKBuilderState "structs.grl"; +import struct MKSizeEstimate "structs.grl"; + + +metakernel compute_num_presplits( + MKBuilderState state, + qword presplit_buffer, + dword numHwThreads ) +{ + dispatch opencl_kernel_compute_num_presplits ( numHwThreads, 1, 1 ) args( + state.build_globals, + state.bvh_buffer, + state.build_primref_buffer, + presplit_buffer, + state.geomDesc_buffer ); +} + + +metakernel priority_sum( + MKBuilderState state, + MKSizeEstimate estimate, + qword presplit_buffer ) +{ + dispatch opencl_kernel_priority_sum ( 1, 1, 1 ) args( + state.build_globals, + presplit_buffer, + estimate.numPrimitivesToSplit / 2 ); +} + +metakernel perform_presplits( + MKBuilderState state, + MKSizeEstimate estimate, + qword presplit_buffer, + dword numHwThreads ) +{ + dispatch opencl_kernel_perform_presplits ( numHwThreads, 1, 1 ) args( + state.build_globals, + state.bvh_buffer, + state.build_primref_buffer, + presplit_buffer, + state.bvh_buffer, + state.geomDesc_buffer, + estimate.numPrimitivesToSplit / 2 ); +} diff --git a/src/intel/vulkan/grl/gpu/qbvh6.h b/src/intel/vulkan/grl/gpu/qbvh6.h new file mode 100644 index 00000000000..22260d07f41 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/qbvh6.h @@ -0,0 +1,933 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "GRLGen12.h" + +#include "shared.h" +#include "quad.h" + +/* ====== GENERAL BVH config ====== */ + +#define BVH_NODE_N6 6 +#define BVH_NODE_N 8 +#define BVH_NODE_N_LOG 3 + +#define SAH_LOG_BLOCK_SHIFT 2 +#define BVH_LEAF_N_MIN BVH_NODE_N6 +#define BVH_LEAF_N_MAX BVH_NODE_N6 + +#define BVH_NODE_DEFAULT_MASK 0xff +#define BVH_NODE_DEGENERATED_MASK 0x00 + +/* ====== QUANTIZATION config ====== */ + +#define QUANT_BITS 8 +#define QUANT_MIN 0 +#define QUANT_MAX 255 +#define QUANT_MAX_MANT (255.0f / 256.0f) + +#define NO_NODE_OFFSET 0 + +/* ======================================================================= */ +/* ============================== BVH BASE =============================== */ +/* ======================================================================= */ + +GRL_INLINE void setBVHBaseBounds(struct BVHBase *base, struct AABB *aabb) +{ + base->Meta.bounds.lower[0] = aabb->lower.x; + base->Meta.bounds.lower[1] = aabb->lower.y; + base->Meta.bounds.lower[2] = aabb->lower.z; + + base->Meta.bounds.upper[0] = aabb->upper.x; + base->Meta.bounds.upper[1] = aabb->upper.y; + base->Meta.bounds.upper[2] = aabb->upper.z; +} + +GRL_INLINE global struct QBVHNodeN *BVHBase_nodeData(struct BVHBase *bvh) +{ + return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET); +} + +GRL_INLINE global struct QBVHNodeN *BVHBase_rootNode(struct BVHBase *bvh) +{ + return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET); +} + +GRL_INLINE global struct Quad *BVHBase_quadLeaves(struct BVHBase *bvh) +{ + return (global struct Quad *)((void *)bvh + 64 * (ulong)bvh->quadLeafStart); +} + +GRL_INLINE uint64_t BVHBase_numNodes(struct BVHBase *bvh) +{ + return bvh->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64; +} + +GRL_INLINE uint64_t BVHBase_numQuads(struct BVHBase *bvh) +{ + return bvh->quadLeafCur - bvh->quadLeafStart; +} + +GRL_INLINE uint64_t 
BVHBase_numProcedurals(struct BVHBase *bvh) +{ + return bvh->proceduralDataCur - bvh->proceduralDataStart; +} + +GRL_INLINE uint64_t BVHBase_numInstances(struct BVHBase *bvh) +{ + return bvh->instanceLeafEnd - bvh->instanceLeafStart; +} + +/* =================================================================== */ +/* ============================== QBVH =============================== */ +/* =================================================================== */ + +__constant const float ulp = FLT_EPSILON; + +GRL_INLINE struct AABB conservativeAABB(struct AABB *aabb) +{ + struct AABB box; + const float4 v4 = max(fabs(aabb->lower), fabs(aabb->upper)); + const float v = ulp * max(v4.x, max(v4.y, v4.z)); + box.lower = aabb->lower - (float4)v; + box.upper = aabb->upper + (float4)v; + return box; +} + +GRL_INLINE struct AABB3f conservativeAABB3f(struct AABB3f* aabb3d) +{ + struct AABB aabb4d = AABBfromAABB3f(*aabb3d); + struct AABB box = conservativeAABB(&aabb4d); + return AABB3fFromAABB(box); +} + +struct QBVH_AABB +{ + uchar lower_x[BVH_NODE_N6]; + uchar upper_x[BVH_NODE_N6]; + uchar lower_y[BVH_NODE_N6]; + uchar upper_y[BVH_NODE_N6]; + uchar lower_z[BVH_NODE_N6]; + uchar upper_z[BVH_NODE_N6]; +}; + +struct QBVHNodeN +{ + float lower[3]; + int offset; + // 16 bytes + uchar type; + uchar pad; + // 18 bytes + char exp[3]; + uchar instMask; + // 22 bytes + uchar childData[6]; + // 28 bytes + struct QBVH_AABB qbounds; // + 36 bytes + // 64 bytes +}; + +GRL_INLINE uint QBVHNodeN_blockIncr(struct QBVHNodeN *This, uint childID) +{ + return This->childData[childID] & 0x3; +} + +GRL_INLINE uint QBVHNodeN_startPrim(struct QBVHNodeN *This, uint childID) +{ + return (This->childData[childID] >> 2) & 0xF; +} + +GRL_INLINE void initQBVHNodeN(struct QBVHNodeN *qnode) +{ + uint *ptr = (uint *)qnode; + for (uint i = 0; i < 16; i++) + ptr[i] = 0; +} + +GRL_INLINE struct AABB extractAABB_QBVHNodeN(struct QBVHNodeN *qnode, uint i) +{ + struct AABB aabb; + const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f); + const int4 lower_i = (int4)(qnode->qbounds.lower_x[i], qnode->qbounds.lower_y[i], qnode->qbounds.lower_z[i], 0); + const int4 upper_i = (int4)(qnode->qbounds.upper_x[i], qnode->qbounds.upper_y[i], qnode->qbounds.upper_z[i], 0); + const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f); + aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8); + aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8); + return aabb; +} + +GRL_INLINE struct AABB getAABB_QBVHNodeN(struct QBVHNodeN *qnode) +{ + struct AABB aabb; +#if 0 + AABB_init(&aabb); + for (uint i = 0; i < BVH_NODE_N6; i++) + { + struct AABB v = extractAABB_QBVHNodeN(qnode, i); + AABB_extend(&aabb, &v); + } +#else + uint lower_x = qnode->qbounds.lower_x[0]; + uint lower_y = qnode->qbounds.lower_y[0]; + uint lower_z = qnode->qbounds.lower_z[0]; + + uint upper_x = qnode->qbounds.upper_x[0]; + uint upper_y = qnode->qbounds.upper_y[0]; + uint upper_z = qnode->qbounds.upper_z[0]; + + for (uint i = 1; i < BVH_NODE_N6; i++) + { + uint lx = qnode->qbounds.lower_x[i]; + uint ly = qnode->qbounds.lower_y[i]; + uint lz = qnode->qbounds.lower_z[i]; + + uint ux = qnode->qbounds.upper_x[i]; + uint uy = qnode->qbounds.upper_y[i]; + uint uz = qnode->qbounds.upper_z[i]; + + bool valid = lx <= ux; + if (valid) + { + lower_x = min(lower_x, lx); + lower_y = min(lower_y, ly); + lower_z = min(lower_z, lz); + + upper_x = max(upper_x, ux); + upper_y = max(upper_y, uy); + upper_z = 
max(upper_z, uz); + } + } + + const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f); + const int4 lower_i = (int4)(lower_x, lower_y, lower_z, 0); + const int4 upper_i = (int4)(upper_x, upper_y, upper_z, 0); + const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f); + aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8); + aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8); +#endif + return aabb; +} + +GRL_INLINE struct AABB3f InternalNode_getAABB3f(struct InternalNode* node) +{ + return AABB3fFromAABB(getAABB_QBVHNodeN((struct QBVHNodeN*)node)); +} + +GRL_INLINE uint getNumChildren_QBVHNodeN(struct QBVHNodeN *qnode) +{ + uint children = 0; + for (uint i = 0; i < BVH_NODE_N6; i++) + { + uint lx = qnode->qbounds.lower_x[i]; + uint ux = qnode->qbounds.upper_x[i]; + bool valid = lx <= ux; + if (valid) + children++; + } + return children; +} + +GRL_INLINE long extractQBVHNodeN_offset(struct QBVHNodeN *qnode) +{ + return ((long)qnode->offset) << 6; +} + +GRL_INLINE void *QBVHNodeN_childrenPointer(struct QBVHNodeN *qnode) +{ + const int offset = qnode->offset; + return (void *)(qnode + offset); +} + +GRL_INLINE void subgroup_setQBVHNodeN_setFields_reduced_bounds(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, struct AABB reduced_aabb) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + const uint k = subgroupLocalID; + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + struct AABB aabb = reduced_aabb; // needs to execute with full subgroup width + aabb = AABB_sub_group_broadcast(&aabb, 0); + + if (subgroupLocalID < BVH_NODE_N6) + { + struct AABB conservative_aabb = conservativeAABB(&aabb); + const float3 len = AABB_size(&conservative_aabb).xyz * up; + int3 exp; + const float3 mant = frexp_vec3(len, &exp); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); + + qbvh_node->offset = offset; + qbvh_node->type = type; + + qbvh_node->lower[0] = org.x; + qbvh_node->lower[1] = org.y; + qbvh_node->lower[2] = org.z; + + qbvh_node->exp[0] = exp.x; + qbvh_node->exp[1] = exp.y; + qbvh_node->exp[2] = exp.z; + + qbvh_node->instMask = mask; + + uchar3 lower_uchar = (uchar3)(0x80); + uchar3 upper_uchar = (uchar3)(0); + + if (subgroupLocalID < numChildren) + { + struct AABB child_aabb = conservativeAABB(input_aabb); + + float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); + lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); + float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); + upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); + + lower_uchar = convert_uchar3_rtn(lower); + upper_uchar = convert_uchar3_rtp(upper); + + if (degenerated) + { + lower_uchar = upper_uchar = 0; + } + } + + qbvh_node->qbounds.lower_x[k] = lower_uchar.x; + qbvh_node->qbounds.lower_y[k] = lower_uchar.y; + qbvh_node->qbounds.lower_z[k] = lower_uchar.z; + qbvh_node->qbounds.upper_x[k] = upper_uchar.x; + qbvh_node->qbounds.upper_y[k] = upper_uchar.y; + qbvh_node->qbounds.upper_z[k] = upper_uchar.z; + + qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 
2 : 1; + +#if ENABLE_CONVERSION_CHECKS == 1 + + if (!(exp.x >= -128 && exp.x <= 127)) + printf("exp_x error \n"); + if (!(exp.y >= -128 && exp.y <= 127)) + printf("exp_y error \n"); + if (!(exp.z >= -128 && exp.z <= 127)) + printf("exp_z error \n"); + + struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k); + if (!AABB_subset(&child_aabb, &child_qaabb)) + { + uint3 lower_i = convert_uint3(lower_uchar); + uint3 upper_i = convert_uint3(upper_uchar); + + printf("\n ERROR %d\n", k); + printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i); + printf("%i uncompressed \n", k); + AABB_print(&child_aabb); + printf("%i compressed \n", k); + AABB_print(&child_qaabb); + + printf("%i uncompressed (as int) \n", k); + AABB_printasInt(&child_aabb); + printf("%i compressed (as int) \n", k); + AABB_printasInt(&child_qaabb); + + int4 e0 = child_aabb.lower < child_qaabb.lower; + int4 e1 = child_aabb.upper > child_qaabb.upper; + printf("e0 %d e1 %d \n", e0, e1); + } +#endif + } +} + +GRL_INLINE void subgroup_setQBVHNodeN_setFields(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated) +{ + struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); + subgroup_setQBVHNodeN_setFields_reduced_bounds(offset, type, input_aabb, numChildren, mask, qbvh_node, degenerated, aabb); +} + +GRL_INLINE void subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, bool active_lane) +{ + const uint lane = get_sub_group_local_id() % 8; + const uint node_in_sg = get_sub_group_local_id() / 8; + const uint k = lane; + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); // needs to execute with full subgroup width + aabb = AABB_sub_group_shuffle(&aabb, node_in_sg * 8); + + if (lane < BVH_NODE_N6 && active_lane) + { + struct AABB conservative_aabb = conservativeAABB(&aabb); + const float3 len = AABB_size(&conservative_aabb).xyz * up; + int3 exp; + const float3 mant = frexp_vec3(len, &exp); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > (float3)QUANT_MAX_MANT ? 
(int3)1 : (int3)0); + + qbvh_node->offset = offset; + qbvh_node->type = type; + + qbvh_node->lower[0] = org.x; + qbvh_node->lower[1] = org.y; + qbvh_node->lower[2] = org.z; + + qbvh_node->exp[0] = exp.x; + qbvh_node->exp[1] = exp.y; + qbvh_node->exp[2] = exp.z; + + qbvh_node->instMask = mask; + + uchar3 lower_uchar = (uchar3)(0x80); + uchar3 upper_uchar = (uchar3)(0); + + if (lane < numChildren) + { + struct AABB child_aabb = conservativeAABB(input_aabb); + + float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); + lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); + float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); + upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); + + lower_uchar = convert_uchar3_rtn(lower); + upper_uchar = convert_uchar3_rtp(upper); + + if (degenerated) + { + lower_uchar = upper_uchar = 0; + } + } + + qbvh_node->qbounds.lower_x[k] = lower_uchar.x; + qbvh_node->qbounds.lower_y[k] = lower_uchar.y; + qbvh_node->qbounds.lower_z[k] = lower_uchar.z; + qbvh_node->qbounds.upper_x[k] = upper_uchar.x; + qbvh_node->qbounds.upper_y[k] = upper_uchar.y; + qbvh_node->qbounds.upper_z[k] = upper_uchar.z; + + qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1; + +#if ENABLE_CONVERSION_CHECKS == 1 + + if (!(exp.x >= -128 && exp.x <= 127)) + printf("exp_x error \n"); + if (!(exp.y >= -128 && exp.y <= 127)) + printf("exp_y error \n"); + if (!(exp.z >= -128 && exp.z <= 127)) + printf("exp_z error \n"); + + struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k); + if (!AABB_subset(&child_aabb, &child_qaabb)) + { + uint3 lower_i = convert_uint3(lower_uchar); + uint3 upper_i = convert_uint3(upper_uchar); + + printf("\n ERROR %d\n", k); + printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i); + printf("%i uncompressed \n", k); + AABB_print(&child_aabb); + printf("%i compressed \n", k); + AABB_print(&child_qaabb); + + printf("%i uncompressed (as int) \n", k); + AABB_printasInt(&child_aabb); + printf("%i compressed (as int) \n", k); + AABB_printasInt(&child_qaabb); + + int4 e0 = child_aabb.lower < child_qaabb.lower; + int4 e1 = child_aabb.upper > child_qaabb.upper; + printf("e0 %d e1 %d \n", e0, e1); + } +#endif + } +} + +GRL_INLINE void subgroup_setInstanceQBVHNodeN(const int offset, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node, const uint instMask) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + + // for degenerated (or inactive) instance ignore this box in exp, origin calculation and make its box be a point in the node origin. + // if it becomes non_degenerated on update, tree topology will be equivalent to what it would be if we would account this degenerated node here. 
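    // The node mask becomes the OR of all child instance masks; a child contributes
    // its box to the common quantization frame only if it is non-degenerated, or if
    // every child is degenerated (in which case all boxes collapse to the origin).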
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK); + + struct AABB aabb; + AABB_init(&aabb); + + // if every child is degenerated (or inactive) instance, we need to init aabb with origin point + uchar commonMask = sub_group_reduce_or_N6(instMask); + if (subgroupLocalID < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK)) + aabb = *input_aabb; + + subgroup_setQBVHNodeN_setFields(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated); +} + + +// return true if is degenerated +GRL_INLINE bool subgroup_setInstanceBox_2xSIMD8_in_SIMD16(struct AABB* input_aabb, const uint numChildren, uchar* mask, const uint instMask, bool active_lane) +{ + const uint lane = get_sub_group_local_id() % 8; + + // for degenerated (or inactive) instance ignore this box in exp, origin calculation and make its box be a point in the node origin. + // if it becomes non_degenerated on update, tree topology will be equivalent to what it would be if we would account this degenerated node here. + bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK); + + // if every child is degenerated (or inactive) instance, we need to init aabb with origin point + uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask); + if (active_lane) + *mask = commonMask; + + if (active_lane && (degenerated && commonMask != BVH_NODE_DEGENERATED_MASK)) + AABB_init(input_aabb); + + return active_lane ? degenerated : false; +} + +GRL_INLINE void subgroup_setInstanceQBVHNodeN_x2(const int offset, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, const uint instMask, bool active_lane) +{ + const uint lane = get_sub_group_local_id() % 8; + + // for degenerated (or inactive) instance ignore this box in exp, origin calculation and make its box be a point in the node origin. + // if it becomes non_degenerated on update, tree topology will be equivalent to what it would be if we would account this degenerated node here. 
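    // _x2 variant: one SIMD16 subgroup emits two QBVH6 nodes at once; lanes 0-7
    // handle the first node and lanes 8-15 the second, hence lane = local_id % 8.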
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK); + + struct AABB aabb; + AABB_init(&aabb); + + // if every child is degenerated (or inactive) instance, we need to init aabb with origin point + uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask); + if (lane < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK)) + aabb = *input_aabb; + + subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated, active_lane); +} + + +GRL_INLINE void subgroup_setQBVHNodeN(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, uint mask) +{ + const uint subgroupLocalID = get_sub_group_local_id(); + + struct AABB aabb; + AABB_init(&aabb); + + if (subgroupLocalID < numChildren) + aabb = *input_aabb; + + subgroup_setQBVHNodeN_setFields(offset, type, &aabb, numChildren, mask, qbvh_node, false); +} + + +GRL_INLINE void subgroup_setQBVHNodeN_x2(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, bool active_lane) +{ + const uint lane = get_sub_group_local_id() % 8; + + struct AABB aabb; + AABB_init(&aabb); + + if (lane < numChildren) + aabb = *input_aabb; + + subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, type, &aabb, numChildren, BVH_NODE_DEFAULT_MASK, qbvh_node, false, active_lane); +} + + +GRL_INLINE void subgroup_QBVHNodeN_setBounds( uniform struct QBVHNodeN* qbvh_node, + uniform struct AABB reduced_bounds, + varying struct AABB input_aabb, + uniform uint numChildren, + varying ushort lane ) +{ + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + int3 exp; + + struct AABB conservative_aabb = conservativeAABB( &reduced_bounds); + const float3 len = AABB_size( &conservative_aabb ).xyz * up; + const float3 mant = frexp_vec3( len, &exp ); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > ( float3 )QUANT_MAX_MANT ? (int3)1 : (int3)0); + + qbvh_node->lower[0] = org.x; + qbvh_node->lower[1] = org.y; + qbvh_node->lower[2] = org.z; + + qbvh_node->exp[0] = exp.x; + qbvh_node->exp[1] = exp.y; + qbvh_node->exp[2] = exp.z; + + qbvh_node->instMask = 0xff; + + uchar3 lower_uchar = 0x80; + uchar3 upper_uchar = 0; + + if ( lane < BVH_NODE_N6 ) + { + ushort k = lane; + if( lane < numChildren ) + { + struct AABB child_aabb = conservativeAABB( &input_aabb ); // conservative ??? 
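            // Quantize against the node frame: child bounds are taken relative to org
            // and scaled by 2^(8 - exp) per axis, then floored (lower) / ceiled (upper)
            // into [QUANT_MIN, QUANT_MAX], so the dequantized 8-bit box always encloses
            // the original child box. Illustrative numbers: a node extent of 3.0 gives
            // frexp mant 0.75, exp 2, so one grid step is 2^(2-8) = 1/64 and the grid
            // spans 255/64 ~ 3.98 units.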
+ + float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) ); + lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) ); + float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) ); + upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) ); + + lower_uchar = convert_uchar3_rtn( lower ); + upper_uchar = convert_uchar3_rtp( upper ); + } + + qbvh_node->qbounds.lower_x[k] = lower_uchar.x; + qbvh_node->qbounds.lower_y[k] = lower_uchar.y; + qbvh_node->qbounds.lower_z[k] = lower_uchar.z; + qbvh_node->qbounds.upper_x[k] = upper_uchar.x; + qbvh_node->qbounds.upper_y[k] = upper_uchar.y; + qbvh_node->qbounds.upper_z[k] = upper_uchar.z; + } + +} + +GRL_INLINE void QBVHNodeN_setBounds(struct QBVHNodeN *qbvh_node, struct AABB *input_aabb, const uint numChildren) +{ + const float up = 1.0f + ulp; + const float down = 1.0f - ulp; + + int3 exp; + struct AABB aabb; + AABB_init(&aabb); + for (uint i = 0; i < numChildren; i++) + AABB_extend(&aabb, &input_aabb[i]); + + struct AABB conservative_aabb = conservativeAABB(&aabb); + const float3 len = AABB_size(&conservative_aabb).xyz * up; + const float3 mant = frexp_vec3(len, &exp); + const float3 org = conservative_aabb.lower.xyz; + + exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); + + qbvh_node->lower[0] = org.x; + qbvh_node->lower[1] = org.y; + qbvh_node->lower[2] = org.z; + + qbvh_node->exp[0] = exp.x; + qbvh_node->exp[1] = exp.y; + qbvh_node->exp[2] = exp.z; + + qbvh_node->instMask = 0xff; + + for (uint k = 0; k < numChildren; k++) + { + struct AABB child_aabb = conservativeAABB(&input_aabb[k]); // conservative ??? + + float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); + lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); + float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); + upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); + + uchar3 lower_uchar = convert_uchar3_rtn(lower); + uchar3 upper_uchar = convert_uchar3_rtp(upper); + + qbvh_node->qbounds.lower_x[k] = lower_uchar.x; + qbvh_node->qbounds.lower_y[k] = lower_uchar.y; + qbvh_node->qbounds.lower_z[k] = lower_uchar.z; + qbvh_node->qbounds.upper_x[k] = upper_uchar.x; + qbvh_node->qbounds.upper_y[k] = upper_uchar.y; + qbvh_node->qbounds.upper_z[k] = upper_uchar.z; + +#if ENABLE_CONVERSION_CHECKS == 1 + if (!(exp.x >= -128 && exp.x <= 127)) + printf("exp_x error \n"); + if (!(exp.y >= -128 && exp.y <= 127)) + printf("exp_y error \n"); + if (!(exp.z >= -128 && exp.z <= 127)) + printf("exp_z error \n"); + + struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k); + if (!AABB_subset(&child_aabb, &child_qaabb)) + { + uint3 lower_i = convert_uint3(lower_uchar); + uint3 upper_i = convert_uint3(upper_uchar); + + printf("\n ERROR %d\n", k); + printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i); + printf("%i uncompressed \n", k); + AABB_print(&child_aabb); + printf("%i compressed \n", k); + AABB_print(&child_qaabb); + + printf("%i uncompressed (as int) \n", k); + AABB_printasInt(&child_aabb); + printf("%i compressed (as int) \n", k); + AABB_printasInt(&child_qaabb); + + int4 e0 = child_aabb.lower < child_qaabb.lower; + int4 e1 = child_aabb.upper > child_qaabb.upper; + printf("e0 %d e1 %d \n", e0, e1); + } +#endif + } + for (uint k = numChildren; k < BVH_NODE_N6; k++) + { + qbvh_node->qbounds.lower_x[k] = 0x80; + qbvh_node->qbounds.lower_y[k] = 0x80; + qbvh_node->qbounds.lower_z[k] = 0x80; + 
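        // lower was just set to 0x80 and upper is set to 0 below, so lower > upper,
        // which marks the slot as unused; getNumChildren_QBVHNodeN and getAABB_QBVHNodeN
        // only accept a child when lower_x <= upper_x.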
qbvh_node->qbounds.upper_x[k] = 0; + qbvh_node->qbounds.upper_y[k] = 0; + qbvh_node->qbounds.upper_z[k] = 0; + } +} + +GRL_INLINE void QBVHNodeN_setChildren(struct QBVHNodeN *qbvh_node, const int offset, const uint numChildren) +{ + qbvh_node->offset = offset; + for (uint k = 0; k < BVH_NODE_N6; k++) + qbvh_node->childData[k] = 1; +} + +GRL_INLINE void QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node) +{ + for (uint k = 0; k < BVH_NODE_N6; k++) + qbvh_node->childData[k] = 1; +} + +GRL_INLINE void SUBGROUP_QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node) +{ + if( get_sub_group_local_id() < BVH_NODE_N6 ) + qbvh_node->childData[get_sub_group_local_id()] = 1; +} + + +GRL_INLINE void QBVHNodeN_setChildIncr2(struct QBVHNodeN *qbvh_node) +{ + for (uint k = 0; k < BVH_NODE_N6; k++) + qbvh_node->childData[k] = 2; +} + +GRL_INLINE void QBVHNodeN_setType(struct QBVHNodeN *qbvh_node, const uint type) +{ + qbvh_node->type = type; +} + +GRL_INLINE void setQBVHNodeN(const int offset, const uint type, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node) +{ + QBVHNodeN_setType(qbvh_node, type); + QBVHNodeN_setChildren(qbvh_node, offset, numChildren); + QBVHNodeN_setBounds(qbvh_node, input_aabb, numChildren); +} + +GRL_INLINE void printQBVHNodeN(struct QBVHNodeN *qnode) +{ + printf(" offset %d type %d \n", qnode->offset, (int)qnode->type); + printf(" lower %f %f %f \n", qnode->lower[0], qnode->lower[1], qnode->lower[2]); + printf(" exp %d %d %d \n", (int)qnode->exp[0], (int)qnode->exp[1], (int)qnode->exp[2]); + printf(" instMask %d \n", qnode->instMask); + + struct AABB aabb0 = extractAABB_QBVHNodeN(qnode, 0); + struct AABB aabb1 = extractAABB_QBVHNodeN(qnode, 1); + struct AABB aabb2 = extractAABB_QBVHNodeN(qnode, 2); + struct AABB aabb3 = extractAABB_QBVHNodeN(qnode, 3); + struct AABB aabb4 = extractAABB_QBVHNodeN(qnode, 4); + struct AABB aabb5 = extractAABB_QBVHNodeN(qnode, 5); + + printf(" lower_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_x[0], qnode->qbounds.lower_x[1], qnode->qbounds.lower_x[2], qnode->qbounds.lower_x[3], qnode->qbounds.lower_x[4], qnode->qbounds.lower_x[5], aabb0.lower.x, aabb1.lower.x, aabb2.lower.x, aabb3.lower.x, aabb4.lower.x, aabb5.lower.x); + printf(" upper_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_x[0], qnode->qbounds.upper_x[1], qnode->qbounds.upper_x[2], qnode->qbounds.upper_x[3], qnode->qbounds.upper_x[4], qnode->qbounds.upper_x[5], aabb0.upper.x, aabb1.upper.x, aabb2.upper.x, aabb3.upper.x, aabb4.upper.x, aabb5.upper.x); + + printf(" lower_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_y[0], qnode->qbounds.lower_y[1], qnode->qbounds.lower_y[2], qnode->qbounds.lower_y[3], qnode->qbounds.lower_y[4], qnode->qbounds.lower_y[5], aabb0.lower.y, aabb1.lower.y, aabb2.lower.y, aabb3.lower.y, aabb4.lower.y, aabb5.lower.y); + printf(" upper_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_y[0], qnode->qbounds.upper_y[1], qnode->qbounds.upper_y[2], qnode->qbounds.upper_y[3], qnode->qbounds.upper_y[4], qnode->qbounds.upper_y[5], aabb0.upper.y, aabb1.upper.y, aabb2.upper.y, aabb3.upper.y, aabb4.upper.y, aabb5.upper.y); + + printf(" lower_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_z[0], qnode->qbounds.lower_z[1], qnode->qbounds.lower_z[2], qnode->qbounds.lower_z[3], qnode->qbounds.lower_z[4], qnode->qbounds.lower_z[5], aabb0.lower.z, aabb1.lower.z, aabb2.lower.z, aabb3.lower.z, aabb4.lower.z, aabb5.lower.z); + printf(" upper_z %d %d %d %d %d %d %f %f %f %f %f %f\n", 
qnode->qbounds.upper_z[0], qnode->qbounds.upper_z[1], qnode->qbounds.upper_z[2], qnode->qbounds.upper_z[3], qnode->qbounds.upper_z[4], qnode->qbounds.upper_z[5], aabb0.upper.z, aabb1.upper.z, aabb2.upper.z, aabb3.upper.z, aabb4.upper.z, aabb5.upper.z); +} + +GRL_INLINE int encodeOffset(global char *bvh_mem, global void *parent, int global_child_offset) +{ + long global_parent_offset = (long)parent - (long)bvh_mem; + global_parent_offset = global_parent_offset & (~(64 - 1)); // FIXME: (sw) this should not be necessary? + int relative_offset = global_child_offset - global_parent_offset; // FIXME: this limits BVH size to 4GB + //if ((int)relative_offset <= 0) printf("relative offset <= 0 %d global_child_offset %d global_parent_offset %d \n", relative_offset,global_child_offset,global_parent_offset); + return relative_offset; +} + +GRL_INLINE void QBVH6Node_set_offset(struct QBVHNodeN *qnode, void *children) +{ + int ofs = (struct QBVHNodeN *)children - qnode; + qnode->offset = ofs; +} + +GRL_INLINE void QBVH6Node_set_type(struct QBVHNodeN *qnode, uint type) +{ + qnode->type = type; +} + +GRL_INLINE uint sortBVHChildrenIDs(uint input) +{ +#if BVH_NODE_N == 8 + return sort8_descending(input); +#else + return sort4_descending(input); +#endif +} + +enum XFM_BOX_OPTION { + XFM_BOX_NO_CLIP = 0, + XFM_BOX_NOT_REFINED_CLIPPED = 1, //<upper); + AABB3f_trim_upper(&child_bounds1, clipBox->upper); + AABB3f_trim_upper(&child_bounds2, clipBox->upper); + AABB3f_trim_upper(&child_bounds3, clipBox->upper); + AABB3f_trim_upper(&child_bounds4, clipBox->upper); + AABB3f_trim_upper(&child_bounds5, clipBox->upper); + } + + child_bounds0 = transform_aabb(child_bounds0, xfm); + child_bounds1 = transform_aabb(child_bounds1, xfm); + child_bounds2 = transform_aabb(child_bounds2, xfm); + child_bounds3 = transform_aabb(child_bounds3, xfm); + child_bounds4 = transform_aabb(child_bounds4, xfm); + child_bounds5 = transform_aabb(child_bounds5, xfm); + + AABB3f_extend(&child_bounds0, &child_bounds1); + AABB3f_extend(&child_bounds2, &child_bounds3); + AABB3f_extend(&child_bounds4, &child_bounds5); + AABB3f_extend(&child_bounds0, &child_bounds2); + AABB3f_extend(&child_bounds0, &child_bounds4); + + return child_bounds0; + } +#endif + +#if DEB_PRINTFS + printf("0"); +#endif + + struct AABB3f child_bounds; + + if (clipOpt != XFM_BOX_NOT_REFINED_TAKE_CLIPBOX) + { + // XFM_BOX_NOT_REFINED_CLIPPED || XFM_BOX_NO_CLIP + child_bounds = InternalNode_getAABB3f(pnode); + if (clipOpt != XFM_BOX_NO_CLIP) + { + AABB3f_intersect(&child_bounds, *clipBox); + } + } + else + { + //XFM_BOX_NOT_REFINED_TAKE_CLIPBOX + child_bounds = *clipBox; + } + + child_bounds = transform_aabb(child_bounds, xfm); + //child_bounds = conservativeAABB3f(&child_bounds); + return child_bounds; +} + +GRL_INLINE AABB3f GRL_OVERLOADABLE compute_xfm_bbox(struct AffineSpace3f xfm, InternalNode* pnode, bool clip, AABB3f* clipBox, float matOverhead) +{ + float transform[12]; + load_row_major_from_AffineSpace3f(xfm, transform); + return compute_xfm_bbox(transform, pnode, clip, clipBox, matOverhead); +} + +GRL_INLINE uint64_t compute_refit_structs_compacted_size(BVHBase* base) +{ + uint dataSize = 0; + + if (BVHBase_HasBackPointers(base)) + { + const uint fatleafEntrySize = (base->fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63; + const uint innerEntrySize = (base->innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63; + + // New atomic update + if(base->quadIndicesDataStart > base->backPointerDataStart) + { + uint numQuads = BVHBase_GetNumQuads(base); + + const uint 
quadTableMainBufferSize = (numQuads + 255) & ~255; + const uint quadLeftoversSize = (base->quadLeftoversCountNewAtomicUpdate + 255) & ~255; + const uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63); + + const uint quadIndicesDataSize = (numQuads * sizeof(QuadDataIndices) + 63) & ~63; + + dataSize += quadTableEntriesSize + quadIndicesDataSize; + } + + dataSize += + ((BVHBase_GetNumInternalNodes(base) * sizeof(uint) + 63) & ~63) + + fatleafEntrySize + innerEntrySize; + } + + return (uint64_t)dataSize; +} + +GRL_INLINE uint64_t compute_compacted_size(BVHBase* base) +{ + uint64_t size = sizeof(BVHBase); + size += BVHBase_GetNumHWInstanceLeaves(base) * sizeof(HwInstanceLeaf); + size += BVHBase_GetNumProcedurals(base) * sizeof(ProceduralLeaf); + size += BVHBase_GetNumQuads(base) * sizeof(QuadLeaf); + size += compute_refit_structs_compacted_size(base); + size += BVHBase_GetNumInternalNodes(base) * sizeof(InternalNode); + size += sizeof(InstanceDesc) * base->Meta.instanceCount; + size += (sizeof(GeoMetaData) * base->Meta.geoCount + 63) & ~63; // align to 64 + size = (size + 63) & ~63; + + return size; +} diff --git a/src/intel/vulkan/grl/gpu/quad.h b/src/intel/vulkan/grl/gpu/quad.h new file mode 100644 index 00000000000..cc1b7d470f8 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/quad.h @@ -0,0 +1,127 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "shared.h" +#include "intrinsics.h" +#include "AABB.h" +#include "AABB3f.h" + +// JDB TODO: Use corresponding GRL structures!!! + +struct Quad +{ + unsigned int shaderIndex; // note: also mask + unsigned int geomIndex; // note: also geom flags in upper 2 bits + unsigned int primIndex0; + unsigned int primIndex1Delta; + float v[4][3]; +}; + +GRL_INLINE unsigned int Quad_getGeomIndex(global struct Quad *quad) +{ + return quad->geomIndex; +} + +GRL_INLINE unsigned int Quad_getPrimIndex0(global struct Quad *quad) +{ + return quad->primIndex0; +} + +GRL_INLINE unsigned int Quad_getPrimIndex1(global struct Quad *quad) +{ + return quad->primIndex0 + (quad->primIndex1Delta & 0xFFFF); +} + +GRL_INLINE float3 load_float3(float *p) +{ + return (float3)(p[0], p[1], p[2]); +} + +GRL_INLINE float3 load_perm_float3(float *p, const uint3 perm) +{ + return (float3)(p[perm.x], p[perm.y], p[perm.z]); +} + +GRL_INLINE float2 load_perm_float2(float *p, const uint2 perm) +{ + return (float2)(p[perm.x], p[perm.y]); +} + +GRL_INLINE float load_perm_float(float *p, const uint perm) +{ + return p[perm]; +} + +GRL_INLINE struct AABB getAABB_Quad(struct Quad *q) +{ + struct AABB aabb; + const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3]))); + const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3]))); + aabb.lower = (float4)(lower, 0.0f); + aabb.upper = (float4)(upper, 0.0f); + return aabb; +} + +GRL_INLINE void Quad_ExtendAABB(struct Quad* q, struct AABB* box) +{ + struct AABB aabb; + const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3]))); + const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3]))); + aabb.lower = (float4)(lower, 0.0f); + aabb.upper = (float4)(upper, 0.0f); + AABB_extend(box, &aabb); +} + +GRL_INLINE float4 getCentroid2_Quad(struct Quad *q) +{ + struct AABB aabb = 
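/* note: the "2" suffix means the result is lower+upper, i.e. twice the centroid */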
getAABB_Quad(q); + return aabb.lower + aabb.upper; +} + +GRL_INLINE void setQuad(struct Quad *quad, const float4 v0, const float4 v1, const float4 v2, const float4 v3, + const uchar j0, const uchar j1, const uchar j2, + const uint geomID, const uint primID0, const uint primID1, const uint geomMask, const uint geomFlags ) +{ + quad->v[0][0] = v0.x; + quad->v[0][1] = v0.y; + quad->v[0][2] = v0.z; + quad->v[1][0] = v1.x; + quad->v[1][1] = v1.y; + quad->v[1][2] = v1.z; + quad->v[2][0] = v2.x; + quad->v[2][1] = v2.y; + quad->v[2][2] = v2.z; + quad->v[3][0] = v3.x; + quad->v[3][1] = v3.y; + quad->v[3][2] = v3.z; + + quad->shaderIndex = (geomMask << 24) | geomID; + quad->geomIndex = geomID | (geomFlags << 30); + quad->primIndex0 = primID0; + const uint delta = primID1 - primID0; + const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4)); + quad->primIndex1Delta = delta | (j << 16) | (1 << 22); // single prim in leaf + +} + +GRL_INLINE void setQuadVertices(struct Quad *quad, const float3 v0, const float3 v1, const float3 v2, const float3 v3) +{ + quad->v[0][0] = v0.x; + quad->v[0][1] = v0.y; + quad->v[0][2] = v0.z; + quad->v[1][0] = v1.x; + quad->v[1][1] = v1.y; + quad->v[1][2] = v1.z; + quad->v[2][0] = v2.x; + quad->v[2][1] = v2.y; + quad->v[2][2] = v2.z; + quad->v[3][0] = v3.x; + quad->v[3][1] = v3.y; + quad->v[3][2] = v3.z; +} diff --git a/src/intel/vulkan/grl/gpu/radix_sort.grl b/src/intel/vulkan/grl/gpu/radix_sort.grl new file mode 100644 index 00000000000..df932057a10 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/radix_sort.grl @@ -0,0 +1,163 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module radix_sort; + +kernel_module radix_kernels ("morton_radix_sort.cl") +{ + links lsc_intrinsics; + kernel opencl_build_morton_kernel_sort_bin_items < kernelFunction="sort_morton_codes_bin_items">; + kernel opencl_build_morton_kernel_sort_reduce_bins < kernelFunction="sort_morton_codes_reduce_bins">; + kernel opencl_build_morton_kernel_sort_scatter_items < kernelFunction="sort_morton_codes_scatter_items">; + + kernel opencl_build_morton_codes_sort_merged < kernelFunction="sort_morton_codes_merged">; + + kernel opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum < kernelFunction="sort_morton_codes_reduce_bins_wide_partial_sum">; + kernel opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce < kernelFunction="sort_morton_codes_reduce_bins_wide_add_reduce">; +} + +metakernel sort( + qword build_globals, + dword shift, + qword global_histogram, + qword input0, + qword input1, + dword input0_offset, + dword input1_offset, + dword iteration, + dword threads) +{ + dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args( + build_globals, + shift, + global_histogram, + input0, + input1, + input0_offset, + input1_offset, + iteration); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args( + threads, + global_histogram); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_sort_scatter_items (threads, 1, 1) args( + build_globals, + shift, + global_histogram, + input0, + input1, + input0_offset, + input1_offset, + iteration); + + control(wait_idle); + +} + +metakernel sort_bin_items( + qword build_globals, + qword global_histogram, + qword wg_flags, + qword input0, + dword iteration, + dword threads, + dword update_wg_flags + ) +{ + dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args( + build_globals, + global_histogram, + wg_flags, + input0, + iteration, + 
threads, + update_wg_flags + ); +} + +metakernel sort_reduce_bins( + qword build_globals, + qword global_histogram, + dword threads, + dword iteration) +{ + dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args( + build_globals, + threads, + global_histogram, + iteration); +} + +metakernel sort_scatter_items( + qword build_globals, + qword global_histogram, + qword input0, + qword input1, + dword iteration, + dword threads, + dword update_morton_sort_in_flight ) +{ + dispatch opencl_build_morton_kernel_sort_scatter_items( threads, 1, 1 ) args( + build_globals, + global_histogram, + input0, + input1, + iteration, + threads, + update_morton_sort_in_flight + ); +} + +metakernel sort_bin_items_merged( + qword build_globals, + qword global_histogram, + qword input0, + dword iteration, + dword threads) +{ + dispatch opencl_build_morton_codes_sort_merged (threads, 1, 1) args( + build_globals, + global_histogram, + input0, + iteration, + threads + ); +} + +metakernel sort_reduce_bins_wide( + qword build_globals, + qword global_histogram, + qword global_histogram_tmp, + qword wg_flags, + dword threads, + dword threads_groups, + dword iteration) +{ + dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum(threads_groups, 1, 1) args( + build_globals, + threads, + threads_groups, + global_histogram, + global_histogram_tmp, + wg_flags, + iteration); + + control(wait_idle); + + dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce(threads_groups, 1, 1) args( + build_globals, + threads, + threads_groups, + global_histogram, + global_histogram_tmp, + iteration); +} diff --git a/src/intel/vulkan/grl/gpu/rebraid.grl b/src/intel/vulkan/grl/gpu/rebraid.grl new file mode 100644 index 00000000000..5aa809637a3 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/rebraid.grl @@ -0,0 +1,167 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module rebraid; + +kernel init_scratch < source="bvh_rebraid.cl", kernelFunction="rebraid_init_scratch" > +kernel chase_instance_ptrs < source="bvh_rebraid.cl", kernelFunction="rebraid_chase_instance_pointers" > +kernel calc_aabb < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances" > +kernel calc_aabb_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_indirect" > +kernel calc_aabb_ptr < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers" > +kernel calc_aabb_ptr_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers_indirect" > +kernel count_splits < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits" > +kernel count_splits_SG < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG" > +kernel count_splits_SG_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG_indirect" > +kernel build_primrefs < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs" > +kernel build_primrefs_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs_indirect" > + +//kernel ISA_TEST < source="bvh_rebraid.cl", kernelFunction="ISA_TEST" > +//kernel DEBUG_PRINT < source="bvh_rebraid.cl", kernelFunction="DEBUG_PRINT" > + + +const PRIMREF_GROUP_SIZE = 256; + +const COUNT_SPLITS_GROUP_SIZE = 16; + +struct MKRebraidArgs +{ + qword bvh_buffer; + qword primref_buffer; + qword global_buffer; + qword instances_buffer; + qword rebraid_scratch; + qword flat_instances_buffer; + dword num_instances; + dword num_extra_primrefs; 
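+    // note: num_extra_primrefs is passed through unchanged to the build_primrefs kernels below; it presumably reserves additional primref slots beyond the per-instance ones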
+}; + +metakernel rebraid( + MKRebraidArgs Args + ) +{ + dispatch init_scratch(1,1,1) args( Args.rebraid_scratch ); + dispatch calc_aabb(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer ); + control( wait_idle ); + + //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE); + //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.num_instances ); + + dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch ); + control( wait_idle ); + + define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE); + + dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances ); + control( wait_idle ); + + //dispatch DEBUG_PRINT(1,1,1) args( Args.global_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances ); +} + +metakernel rebraid_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo) +{ + + dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch); + + define num_groups REG0; + num_groups = load_dword(indirectBuildRangeInfo); + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect calc_aabb_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo); + control(wait_idle); + + dispatch_indirect count_splits_SG_indirect + args(Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo); + + define groupsize_1 REG1; // groupsize - 1 + define C_8 REG2; + + groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1 + C_8 = 8; // log_2(PRIMREF_GROUP_SIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE; + DISPATCHDIM_X = num_groups.lo; + + control(wait_idle); + + dispatch_indirect build_primrefs_indirect args( + Args.global_buffer, + Args.bvh_buffer, + Args.instances_buffer, + Args.rebraid_scratch, + Args.primref_buffer, + indirectBuildRangeInfo, + Args.num_extra_primrefs); + control(wait_idle); +} + +metakernel rebraid_ptrs( + MKRebraidArgs Args + ) +{ + dispatch init_scratch(1,1,1) args( Args.rebraid_scratch ); + dispatch chase_instance_ptrs( Args.num_instances, 1, 1) args( Args.instances_buffer, Args.flat_instances_buffer ); + dispatch calc_aabb_ptr(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer ); + control( wait_idle ); + + //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE); + //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch ); + + dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch ); + control( wait_idle ); + + define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE); + + + dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances ); + control( wait_idle ); + +} + +metakernel rebraid_ptrs_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo) +{ + dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch); + + define num_groups REG0; + num_groups = 
load_dword(indirectBuildRangeInfo); + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect chase_instance_ptrs + args(Args.instances_buffer, Args.flat_instances_buffer, indirectBuildRangeInfo); + dispatch_indirect calc_aabb_ptr_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo); + control(wait_idle); + + dispatch_indirect count_splits_SG_indirect + args(Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo); + + define groupsize_1 REG1; // groupsize - 1 + define C_8 REG2; + + groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1 + C_8 = 8; // log_2(PRIMREF_GROUP_SIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE; + DISPATCHDIM_X = num_groups.lo; + + control(wait_idle); + + dispatch_indirect build_primrefs_indirect args( + Args.global_buffer, + Args.bvh_buffer, + Args.flat_instances_buffer, + Args.rebraid_scratch, + Args.primref_buffer, + Args.num_extra_primrefs, + indirectBuildRangeInfo, + Args.num_instances); + control(wait_idle); +} diff --git a/src/intel/vulkan/grl/gpu/shared.h b/src/intel/vulkan/grl/gpu/shared.h new file mode 100644 index 00000000000..0d42d98a1d4 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/shared.h @@ -0,0 +1,182 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "GRLGen12.h" +#pragma once + +#define sizeof_Quad 64 +#define sizeof_Procedural 64 +#define sizeof_PrimRef 32 +#define sizeof_PresplitItem 8 +#define sizeof_HwInstanceLeaf 128 +#define MORTON_BUILDER_SUBTREE_THRESHOLD 256 +#define MORTON_BUILDER_P2_ELEMENTS_IN_SLM 16 * 1024 / 32 +// Temporarily disable localized phase2 due to issues in ELG presi +// This implementation would be replaced with bottom_up + bounding box approach without the need for phase2 refit +#define MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD /*100000*/ 0 + +#define BVH_QUAD_NODE 4 +#define BVH_INSTANCE_NODE 1 +#define BVH_INTERNAL_NODE 0 +#define BVH_PROCEDURAL_NODE 3 +#define BUILDRECORD_STACK_SIZE 48 +#define BINS 16 + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(GPUBVHBuilder) + +struct AABB +{ + float4 lower; + float4 upper; +}; + +typedef struct BlockAllocator +{ + unsigned int start; + unsigned int cur; +} BlockAllocator; + +struct Globals +{ + struct AABB centroidBounds; + + unsigned int build_record_start; + unsigned int numPrimitives; + unsigned int leafPrimType; + unsigned int leafSize; + + unsigned int numSplittedPrimitives; + unsigned int numBuildRecords; + + // spatial split sate + unsigned int numOriginalPrimitives; + float presplitPrioritySum; + float probThreshold; + + // binned-sah bfs state + unsigned int counter; + unsigned int numBuildRecords_extended; + + // sync variable used for global-sync on work groups + unsigned int sync; + + + /* morton code builder state */ + unsigned int shift; // used by adaptive mc-builder + unsigned int shift_mask; // used by adaptive mc-builder + unsigned int binary_hierarchy_root; + unsigned int p0_allocated_num; + unsigned int p0_created_num; + unsigned int morton_sort_in_flight; + unsigned int sort_iterations; + + gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. 
Stashed here as a debug aid +}; + +struct Range +{ + unsigned int start, end; +}; + +struct Triangle +{ + unsigned int vtx[3]; + //unsigned int primID; + //unsigned int geomID; +}; + +struct MortonCodePrimitive +{ + uint64_t index_code; // 64bit code + index combo +}; + +struct BuildRecord +{ + struct AABB centroidBounds; + unsigned int start, end; + __global void *current; +}; + +struct BinaryMortonCodeHierarchy +{ + struct Range range; + unsigned int leftChild; + unsigned int rightChild; + // unsigned int flag; +}; + +typedef struct MortonFlattenedBoxlessNode { + uint binary_hierarchy_index; // only needed when type != BVH_INTERNAL_NODE + uint childOffset_type; // childOffset : 26, type : 6 + uint backPointer; // same usage as in bvh +} MortonFlattenedBoxlessNode; + +struct StatStackEntry +{ + struct AABB aabb; + unsigned int node; + unsigned int type; + unsigned int depth; + float area; +}; + +struct BuildRecordMorton +{ + unsigned int nodeID; + unsigned int items; + unsigned int current_index; + unsigned int parent_index; +}; + +struct Split +{ + float sah; + int dim; + int pos; +}; + +struct BinMapping +{ + float4 ofs, scale; +}; + +struct BinInfo +{ + struct AABB3f boundsX[BINS]; + struct AABB3f boundsY[BINS]; + struct AABB3f boundsZ[BINS]; + uint3 counts[BINS]; +}; + +struct BinInfo2 +{ + struct AABB3f boundsX[BINS * 2]; + struct AABB3f boundsY[BINS * 2]; + struct AABB3f boundsZ[BINS * 2]; + uint3 counts[BINS * 2]; +}; + +struct GlobalBuildRecord +{ + struct BinInfo2 binInfo; + struct BinMapping binMapping; + struct Split split; + struct Range range; + struct AABB leftCentroid; + struct AABB rightCentroid; + struct AABB leftGeometry; + struct AABB rightGeometry; + unsigned int atomicCountLeft; + unsigned int atomicCountRight; + unsigned int buildRecordID; +}; + +GRL_NAMESPACE_END(GPUBVHBuilder) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/structs.grl b/src/intel/vulkan/grl/gpu/structs.grl new file mode 100644 index 00000000000..f15b1d2346b --- /dev/null +++ b/src/intel/vulkan/grl/gpu/structs.grl @@ -0,0 +1,38 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module structs; + +struct MKBuilderState { + qword geomDesc_buffer; + qword build_primref_buffer; + qword build_globals; + qword bvh_buffer; + dword leaf_type; + dword leaf_size; +}; + +struct MKSizeEstimate { + dword numTriangles; + dword numProcedurals; + dword numPrimitives; + dword numMeshes; + dword numBuildPrimitives; + dword numPrimitivesToSplit; + dword instance_descs_start; + dword geo_meta_data_start; + dword node_data_start; + dword leaf_data_start; + dword procedural_data_start; + dword back_pointer_start; + dword sizeTotal; + dword updateScratchSizeTotal; + dword fatleaf_table_start; + dword innernode_table_start; + dword max_fatleaves; + dword quad_indices_data_start; +}; diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.cl b/src/intel/vulkan/grl/gpu/traversal_shader.cl new file mode 100644 index 00000000000..ee5d2afcc75 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/traversal_shader.cl @@ -0,0 +1,277 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#include "instance.h" +#include "api_interface.h" + +#include "bvh_build_primref.h" +#include "bvh_build_refit.h" + +/* + Create primrefs from array of instance descriptors. 
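+    One work-item handles one instance descriptor; when pIsProcedural[i] is nonzero, a pointer to the caller-supplied AABB at pAABBs + i * aabb_stride is forwarded to primrefs_from_instances for that instance.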
+ */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +TS_primrefs_from_instances( + global struct Globals* globals, + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + uint numInstances, + global struct AABB* primrefs, + global uchar* pAABBs, + global uchar* pIsProcedural, + dword aabb_stride, + uint allowUpdate + ) +{ + const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < numInstances) + { + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; + + global struct GRL_RAYTRACING_AABB* procedural_bb = 0; + if ( pIsProcedural[instanceIndex] ) + { + procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); + } + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + procedural_bb, + allowUpdate); + } +} + +/* + Create primrefs from array of instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel +TS_primrefs_from_instances_indirect( + global struct Globals* globals, + global struct BVHBase* bvh, + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, + uint numInstances, + global struct AABB* primrefs, + global uchar* pAABBs, + global uchar* pIsProcedural, + dword aabb_stride, + uint allowUpdate, + global struct IndirectBuildRangeInfo* indirect_data + ) +{ + const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < indirect_data->primitiveCount) + { + instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) + (((global char*)instances) + indirect_data->primitiveOffset); + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; + + global struct GRL_RAYTRACING_AABB* procedural_bb = 0; + if ( pIsProcedural[instanceIndex] ) + { + procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); + } + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + procedural_bb, + allowUpdate); + } +} + +/* + Create primrefs from array of pointers to instance descriptors. 
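+    Same as the variants above, except that instances_in holds an array of pointers to GRL_RAYTRACING_INSTANCE_DESC (one pointer per instance) rather than a flat array of descriptors.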
+ */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel +TS_primrefs_from_instances_pointers(global struct Globals* globals, + global struct BVHBase* bvh, + global void* instances_in, + uint numInstances, + global struct AABB* primrefs, + global uchar* pAABBs, + global uchar* pIsProcedural, + dword aabb_stride, + uint allowUpdate + ) +{ + global const struct GRL_RAYTRACING_INSTANCE_DESC** instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in; + + const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < numInstances) + { + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; + + global struct GRL_RAYTRACING_AABB* procedural_bb = 0; + if (pIsProcedural[instanceIndex]) + { + procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); + } + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + procedural_bb, + allowUpdate); + } +} + +/* + Create primrefs from array of pointers to instance descriptors. + */ + GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) +void kernel +TS_primrefs_from_instances_pointers_indirect(global struct Globals* globals, + global struct BVHBase* bvh, + global void* instances_in, + global struct AABB* primrefs, + global uchar* pAABBs, + global uchar* pIsProcedural, + dword aabb_stride, + uint allowUpdate, + global struct IndirectBuildRangeInfo* indirect_data + ) +{ + const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; + if (instanceIndex < indirect_data->primitiveCount) + { + instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset; + global const struct GRL_RAYTRACING_INSTANCE_DESC** instances = + (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in; + global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; + + global struct GRL_RAYTRACING_AABB* procedural_bb = 0; + if (pIsProcedural[instanceIndex]) + { + procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); + } + + primrefs_from_instances( + globals, + bvh, + instance, + instanceIndex, + primrefs, + procedural_bb, + allowUpdate); + } +} + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +TS_update_instance_leaves(global struct BVHBase* bvh, + uint64_t dxrInstancesArray, + uint64_t dxrInstancesPtr, + global struct AABB3f* instance_aabb_scratch, + global uchar* aabbs, + global uchar* is_procedural, + dword aabb_stride +) +{ + uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh); + uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); + if (id >= num_leaves) + return; + + struct HwInstanceLeaf* leaves = BVHBase_GetHWInstanceLeaves(bvh); + uint idx = HwInstanceLeaf_GetInstanceIndex(&leaves[id]); + + global GRL_RAYTRACING_AABB* procedural_box = 0; + if (is_procedural[idx]) + { + procedural_box = (global GRL_RAYTRACING_AABB*)(aabbs + (aabb_stride * idx)); + } + + DO_update_instance_leaves( + bvh, + dxrInstancesArray, + dxrInstancesPtr, + instance_aabb_scratch, + id, + procedural_box); +} + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(16, 1, 1))) +void kernel +TS_fixup_leaves( global struct BVHBase* bvh, + global uchar* primref_index, + global PrimRef* primrefs, + uint 
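/* stride, in bytes, between consecutive primref_index entries */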
stride ) + +{ + uint num_inners = BVHBase_GetNumInternalNodes(bvh); + uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); + + // assign 8 lanes to each inner node, 6 of which will do useful work + uint node_id = id / 8; + uint child_id = id % 8; + + bool node_valid = (node_id < num_inners); + + if (node_valid ) + { + global InternalNode* nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh); + global InternalNode* my_node = nodes + node_id; + + if (my_node->nodeType == BVH_INSTANCE_NODE) + { + bool child_valid = (child_id < 6) && InternalNode_IsChildValid(my_node, child_id); + if (child_valid) + { + global HwInstanceLeaf* leaves = (global HwInstanceLeaf*)InternalNode_GetChildren(my_node); + uint leafIndex = (leaves - BVHBase_GetHWInstanceLeaves(bvh)) + child_id; + + const uint primrefID = *(uint*)(primref_index + leafIndex * stride); + + uint type = PRIMREF_isProceduralInstance(&primrefs[primrefID]) ? + BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE; + + InternalNode_SetChildType(my_node, child_id, type); + } + + if (child_id == 0) + my_node->nodeType = BVH_INTERNAL_NODE; + } + } +} + + + + + +GRL_ANNOTATE_IGC_DO_NOT_SPILL +__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel +TS_Refit_per_one_startpoint_sg( + global struct BVHBase* bvh, + global struct AABB3f* instance_leaf_aabbs, + global uchar* procedural_instance_enable_buffer ) +{ + DO_Refit_per_one_startpoint_sg(bvh, (global GRL_RAYTRACING_GEOMETRY_DESC*) bvh, instance_leaf_aabbs, procedural_instance_enable_buffer ); + +} diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.grl b/src/intel/vulkan/grl/gpu/traversal_shader.grl new file mode 100644 index 00000000000..3820996c348 --- /dev/null +++ b/src/intel/vulkan/grl/gpu/traversal_shader.grl @@ -0,0 +1,244 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +module traversal_shader; + +kernel_module morton_kernels ("traversal_shader.cl") +{ + links lsc_intrinsics; + + kernel TS_primrefs_from_instances < kernelFunction = "TS_primrefs_from_instances" >; + kernel TS_primrefs_from_instances_indirect < kernelFunction = "TS_primrefs_from_instances_indirect" >; + kernel TS_primrefs_from_instances_ptrs < kernelFunction = "TS_primrefs_from_instances_pointers" >; + kernel TS_primrefs_from_instances_ptrs_indirect < kernelFunction = "TS_primrefs_from_instances_pointers_indirect" >; + kernel TS_update_instance_leaves < kernelFunction = "TS_update_instance_leaves" >; + kernel TS_Refit_per_one_startpoint_sg < kernelFunction = "TS_Refit_per_one_startpoint_sg" >; + kernel TS_fixup_leaves < kernelFunction = "TS_fixup_leaves" >; +} + +struct MKTSBuildArgs +{ + qword build_globals; + qword bvh_buffer; + qword instance_descs; + qword build_primref_buffer; + qword aabb_buffer; + qword is_procedural_buffer; + qword leaf_creation_index_buffer; + dword aabb_stride; + dword num_instances; + dword leaf_creation_index_stride; +}; + +const BUILD_PRIMREFS_GROUPSIZE = 16; + + +metakernel TS_build_primrefs( MKTSBuildArgs build_state, dword allowUpdate ) +{ + define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE); + dispatch TS_primrefs_from_instances(num_groups, 1, 1) args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.instance_descs, + build_state.num_instances, + build_state.build_primref_buffer, + build_state.aabb_buffer, + build_state.is_procedural_buffer, + build_state.aabb_stride, + allowUpdate + ); + +} + +metakernel 
TS_build_primrefs_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1 + C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect TS_primrefs_from_instances_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.instance_descs, + build_state.build_primref_buffer, + build_state.aabb_buffer, + build_state.is_procedural_buffer, + build_state.aabb_stride, + allowUpdate, + indirectBuildRangeInfo + ); + +} + +metakernel TS_build_primrefs_array_of_pointers( MKTSBuildArgs build_state, dword allowUpdate ) +{ + define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE); + dispatch TS_primrefs_from_instances_ptrs(num_groups, 1, 1) args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.instance_descs, + build_state.num_instances, + build_state.build_primref_buffer, + build_state.aabb_buffer, + build_state.is_procedural_buffer, + build_state.aabb_stride, + allowUpdate + ); +} + +metakernel +TS_build_primrefs_array_of_pointers_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1 + C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE) + + num_groups = num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect TS_primrefs_from_instances_ptrs_indirect args( + build_state.build_globals, + build_state.bvh_buffer, + build_state.instance_descs, + build_state.build_primref_buffer, + build_state.aabb_buffer, + build_state.is_procedural_buffer, + build_state.aabb_stride, + allowUpdate, + indirectBuildRangeInfo + ); +} + + + + +const UPDATE_INSTANCE_LEAVES_GROUPSIZE = 16; + +struct MKTSUpdateArgs +{ + qword bvh_buffer; + qword instance_descs; + qword instance_descs_ptrs; + qword aabb_buffer; + qword is_procedural_buffer; + qword refit_scratch; + dword aabb_stride; + dword num_instances; +}; + +metakernel TS_update_instance_leaves( MKTSUpdateArgs update_state ) +{ + define num_groups((update_state.num_instances + UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1) / UPDATE_INSTANCE_LEAVES_GROUPSIZE); + dispatch TS_update_instance_leaves(num_groups, 1, 1) args( + update_state.bvh_buffer, + update_state.instance_descs, + update_state.instance_descs_ptrs, + update_state.refit_scratch, + update_state.aabb_buffer, + update_state.is_procedural_buffer, + update_state.aabb_stride + ); +} + +metakernel TS_update_instance_leaves_indirect( MKTSUpdateArgs update_state, qword indirectBuildRangeInfo ) +{ + define num_groups REG0; + define groupsize_1 REG1; // groupsize - 1 + define C_4 REG2; + + // init with primitiveCount + num_groups = load_dword(indirectBuildRangeInfo); + groupsize_1 = 15; // UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1 + C_4 = 4; // log_2(UPDATE_INSTANCE_LEAVES_GROUPSIZE) + + num_groups = 
num_groups + groupsize_1; + num_groups = num_groups >> C_4; // num_groups / UPDATE_INSTANCE_LEAVES_GROUPSIZE; + + DISPATCHDIM_X = num_groups.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + // need to add indirect offset? + dispatch_indirect TS_update_instance_leaves args( + update_state.bvh_buffer, + update_state.instance_descs, + update_state.instance_descs_ptrs, + update_state.refit_scratch, + update_state.aabb_buffer, + update_state.is_procedural_buffer, + update_state.aabb_stride + ); +} + +metakernel TS_refit(MKTSUpdateArgs update_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end ) +{ + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect TS_Refit_per_one_startpoint_sg + args( + update_state.bvh_buffer, + update_state.refit_scratch, + update_state.is_procedural_buffer + ); +} + + +const FIXUP_LEAVES_NODES_PER_GROUP = 2; + +metakernel TS_fixup_leaves(MKTSBuildArgs build_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end ) +{ + define ONE REG3; + + ONE = 1; + REG0 = bvh_inner_nodes_start_value; + REG1.lo = load_dword(bvh_inner_nodes_end); + REG1.hi = 0; + REG2 = REG1 - REG0; + REG2 = REG2 + ONE; + REG2 = REG2 >> ONE; + + DISPATCHDIM_X = REG2.lo; + DISPATCHDIM_Y = 1; + DISPATCHDIM_Z = 1; + + dispatch_indirect TS_fixup_leaves + args( + build_state.bvh_buffer, + build_state.leaf_creation_index_buffer, + build_state.build_primref_buffer, + build_state.leaf_creation_index_stride + ); + +} diff --git a/src/intel/vulkan/grl/grl_cl_kernel_gen.py b/src/intel/vulkan/grl/grl_cl_kernel_gen.py new file mode 100644 index 00000000000..18b3a41a420 --- /dev/null +++ b/src/intel/vulkan/grl/grl_cl_kernel_gen.py @@ -0,0 +1,212 @@ +COPYRIGHT = """\ +/* + * Copyright 2021 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +""" + +import argparse +import os + +from grl_parser import parse_grl_file +from mako.template import Template + +TEMPLATE_H = Template(COPYRIGHT + """ +/* This file generated from ${filename}, don't edit directly. 
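+   It enumerates the GRL OpenCL kernels and declares the genX() accessors that map an enum grl_cl_kernel id to its SHA-1 and its brw_kernel data.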
*/ + +#ifndef GRL_CL_KERNEL_H +#define GRL_CL_KERNEL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "genxml/gen_macros.h" +#include "compiler/brw_kernel.h" + +enum grl_cl_kernel { +% for k in kernels: + GRL_CL_KERNEL_${k.upper()}, +% endfor +}; + +const char *genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id); + +void genX(grl_get_cl_kernel)(struct brw_kernel *kernel, enum grl_cl_kernel id); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* INTEL_GRL_H */ +""", output_encoding='utf-8') + +TEMPLATE_C = Template(COPYRIGHT + """ +/* This file generated from ${filename}, don't edit directly. */ + +#include "grl_cl_kernel.h" + +% for k in kernels: +#include "${prefix}_${k}.h" +% endfor + +const char * +genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id) +{ + switch (id) { +% for k in kernels: + case GRL_CL_KERNEL_${k.upper()}: return ${prefix}_${k}_sha1; +% endfor + default: + unreachable("Invalid GRL kernel enum"); + } +}; + +void +${prefix}_grl_get_cl_kernel(struct brw_kernel *kernel, enum grl_cl_kernel id) +{ + switch (id) { +% for k in kernels: + case GRL_CL_KERNEL_${k.upper()}: + *kernel = ${prefix}_${k}; + break; +% endfor + default: + unreachable("Invalid GRL kernel enum"); + } +} +""", output_encoding='utf-8') + +def get_libraries_files(kernel_module): + lib_files = [] + for item in kernel_module[3]: + if item[0] != 'library': + continue + default_file = None + fallback_file = None + path_directory = None + for props in item[2]: + if props[0] == 'fallback': + fallback_file = props[1] + elif props[0] == 'default': + default_file = props[1] + elif props[0] == 'path': + path_directory = props[1] + assert path_directory + assert default_file or fallback_file + if fallback_file: + lib_files.append(os.path.join(path_directory, fallback_file)) + else: + lib_files.append(os.path.join(path_directory, default_file)) + return lib_files + +def add_kernels(kernels, cl_file, entrypoint, libs): + assert cl_file.endswith('.cl') + for lib_file in libs: + assert lib_file.endswith('.cl') + kernels.append((cl_file, entrypoint, ','.join(libs))) + +def get_kernels(grl_nodes): + kernels = [] + for item in grl_nodes: + assert isinstance(item, tuple) + if item[0] == 'kernel': + ann = item[2] + add_kernels(kernels, ann['source'], ann['kernelFunction'], []) + elif item[0] == 'kernel-module': + cl_file = item[2] + libfiles = get_libraries_files(item) + for kernel_def in item[3]: + if kernel_def[0] == 'kernel': + ann = kernel_def[2] + add_kernels(kernels, cl_file, ann['kernelFunction'], libfiles) + return kernels + +def parse_libraries(filenames): + libraries = {} + for fname in filenames: + lib_package = parse_grl_file(fname, []) + for lib in lib_package: + assert lib[0] == 'library' + # Add the directory of the library so that CL files can be found. 
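+            # (lib is the parsed ('library', <name>, <properties>) tuple; the added 'path'
+            # property is what get_libraries_files() uses to resolve relative .cl filenames)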
+ lib[2].append(('path', os.path.dirname(fname))) + libraries[lib[1]] = lib + return libraries + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--out-c', help='Output C file') + parser.add_argument('--out-h', help='Output H file') + parser.add_argument('--ls-kernels', action='store_const', const=True, + help='List all openCL kernels') + parser.add_argument('--prefix', help='Prefix') + parser.add_argument('--library', dest='libraries', action='append', + default=[], help='Libraries to include') + parser.add_argument('files', type=str, nargs='*', help='GRL files') + args = parser.parse_args() + + libraries = parse_libraries(args.libraries) + + kernels = [] + for fname in args.files: + kernels += get_kernels(parse_grl_file(fname, libraries)) + + # Make the list of kernels unique and sorted + kernels = sorted(list(set(kernels))) + + if args.ls_kernels: + for cl_file, entrypoint, libs in kernels: + if not os.path.isabs(cl_file): + cl_file = os.path.join(os.path.dirname(fname), cl_file) + print('{}:{}:{}'.format(cl_file, entrypoint, libs)) + + kernel_c_names = [] + for cl_file, entrypoint, libs in kernels: + cl_file = os.path.splitext(cl_file)[0] + cl_file_name = cl_file.replace('/', '_') + kernel_c_names.append('_'.join([cl_file_name, entrypoint])) + + try: + if args.out_h: + with open(args.out_h, 'wb') as f: + f.write(TEMPLATE_H.render(kernels=kernel_c_names, + filename=os.path.basename(__file__))) + + if args.out_c: + with open(args.out_c, 'wb') as f: + f.write(TEMPLATE_C.render(kernels=kernel_c_names, + prefix=args.prefix, + filename=os.path.basename(__file__))) + except Exception: + # In the event there's an error, this imports some helpers from mako + # to print a useful stack trace and prints it, then exits with + # status 1, if python is run with debug; otherwise it just raises + # the exception + if __debug__: + import sys + from mako import exceptions + sys.stderr.write(exceptions.text_error_template().render() + '\n') + sys.exit(1) + raise + +if __name__ == '__main__': + main() diff --git a/src/intel/vulkan/grl/include/AABB3f.h b/src/intel/vulkan/grl/include/AABB3f.h new file mode 100644 index 00000000000..a3412332c77 --- /dev/null +++ b/src/intel/vulkan/grl/include/AABB3f.h @@ -0,0 +1,459 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "GRLRTASCommon.h" + +#include "affinespace.h" + +#ifndef __OPENCL_VERSION__ +# include "stdio.h" //for printf +#endif + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) + +GRL_INLINE void AABB3f_init(struct AABB3f *aabb) +{ + aabb->lower[0] = (float)(INFINITY); + aabb->lower[1] = (float)(INFINITY); + aabb->lower[2] = (float)(INFINITY); + + aabb->upper[0] = -(float)(INFINITY); + aabb->upper[1] = -(float)(INFINITY); + aabb->upper[2] = -(float)(INFINITY); +} + +GRL_INLINE float3 AABB3f_load_lower( const struct AABB3f* aabb ) +{ + float3 v = { aabb->lower[0], aabb->lower[1], aabb->lower[2] }; + return v; +} +GRL_INLINE float3 AABB3f_load_upper( const struct AABB3f* aabb ) +{ + float3 v = { aabb->upper[0], aabb->upper[1], aabb->upper[2] }; + return v; +} + +GRL_INLINE void AABB3f_extend(struct AABB3f *aabb, const struct AABB3f *v) +{ + aabb->lower[0] = fmin(aabb->lower[0], v->lower[0]); + aabb->lower[1] = fmin(aabb->lower[1], v->lower[1]); + aabb->lower[2] = fmin(aabb->lower[2], v->lower[2]); + aabb->upper[0] = fmax(aabb->upper[0], v->upper[0]); + aabb->upper[1] = fmax(aabb->upper[1], v->upper[1]); + aabb->upper[2] = fmax(aabb->upper[2], v->upper[2]); 
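+    // note: an empty box from AABB3f_init() (+inf lower, -inf upper) is the identity element for this min/max merge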
+} + +GRL_INLINE void AABB3f_intersect(struct AABB3f* aabb, struct AABB3f inters) +{ + aabb->upper[0] = fmin(inters.upper[0],aabb->upper[0]); + aabb->upper[1] = fmin(inters.upper[1],aabb->upper[1]); + aabb->upper[2] = fmin(inters.upper[2],aabb->upper[2]); + aabb->lower[0] = fmax(inters.lower[0],aabb->lower[0]); + aabb->lower[1] = fmax(inters.lower[1],aabb->lower[1]); + aabb->lower[2] = fmax(inters.lower[2],aabb->lower[2]); +} + +GRL_INLINE void AABB3f_trim_upper(struct AABB3f* aabb, const float* upper) +{ + aabb->upper[0] = fmin(upper[0], aabb->upper[0]); + aabb->upper[1] = fmin(upper[1], aabb->upper[1]); + aabb->upper[2] = fmin(upper[2], aabb->upper[2]); +} + +GRL_INLINE void AABB3f_set( struct AABB3f* aabb, float3 lower, float3 upper ) +{ + aabb->lower[0] = lower.x ; + aabb->lower[1] = lower.y ; + aabb->lower[2] = lower.z ; + aabb->upper[0] = upper.x ; + aabb->upper[1] = upper.y ; + aabb->upper[2] = upper.z ; +} + +inline void AABB3f_extend_point(struct AABB3f *aabb, const float3 p) +{ + aabb->lower[0] = fmin(aabb->lower[0], p.x); + aabb->lower[1] = fmin(aabb->lower[1], p.y); + aabb->lower[2] = fmin(aabb->lower[2], p.z); + aabb->upper[0] = fmax(aabb->upper[0], p.x); + aabb->upper[1] = fmax(aabb->upper[1], p.y); + aabb->upper[2] = fmax(aabb->upper[2], p.z); +} + +GRL_INLINE void AABB3f_extendlu(struct AABB3f *aabb, const float3 lower, const float3 upper) +{ + aabb->lower[0] = fmin(aabb->lower[0], lower.x); + aabb->lower[1] = fmin(aabb->lower[1], lower.y); + aabb->lower[2] = fmin(aabb->lower[2], lower.z); + aabb->upper[0] = fmax(aabb->upper[0], upper.x); + aabb->upper[1] = fmax(aabb->upper[1], upper.y); + aabb->upper[2] = fmax(aabb->upper[2], upper.z); +} + +GRL_INLINE float3 AABB3f_size(struct AABB3f* aabb) +{ + return AABB3f_load_upper(aabb) - AABB3f_load_lower(aabb); +} + +GRL_INLINE float AABB3f_halfArea(struct AABB3f *aabb) +{ + const float3 d = AABB3f_load_upper( aabb ) - AABB3f_load_lower( aabb ); + return d.x* (d.y + d.z) + (d.y * d.z); +} + +GRL_INLINE float halfArea_AABB3f(struct AABB3f *aabb) // TODO: Remove me +{ + const float3 d = { aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2] }; + return fma(d.x, (d.y + d.z), d.y * d.z); +} + +GRL_INLINE void AABB3f_set_lower(struct AABB3f* aabb, float3 lower) +{ + aabb->lower[0] = lower.x; + aabb->lower[1] = lower.y; + aabb->lower[2] = lower.z; +} + +GRL_INLINE void AABB3f_set_upper(struct AABB3f* aabb, float3 upper) +{ + aabb->upper[0] = upper.x; + aabb->upper[1] = upper.y; + aabb->upper[2] = upper.z; +} + +GRL_INLINE float3 conservativeExtent(float3 extent) +{ + const float v = FLT_EPSILON * fmax(extent.x, fmax(extent.y, extent.z)); + float3 v3 = { v,v,v }; + extent = extent + v3; + return extent; +} + +inline struct AABB3f GRL_OVERLOADABLE transform_aabb(float3 lower, float3 upper, const float* Transform) +{ +#if 1 + // We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area + // New AABB is center +- Extent. 
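+    // Concretely, with c = (lower + upper)/2, e = (upper - lower)/2 and row i of the transform (m_i0 m_i1 m_i2 | t_i):
+    //     c'_i = m_i0*c.x + m_i1*c.y + m_i2*c.z + t_i
+    //     e'_i = |m_i0|*e.x + |m_i1|*e.y + |m_i2|*e.z
+    // giving the transformed box [c' - e', c' + e']; the code below additionally inflates e slightly via conservativeExtent() for robustness.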
+ // + // For derivation see: + // https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/ + // + + float3 Center = (upper + lower) * 0.5f; + float3 Extent = (conservativeExtent(upper) - lower) * 0.5f; + + float cx = Center.x * Transform[0] + Center.y * Transform[1] + Center.z * Transform[2] + Transform[3]; + float cy = Center.x * Transform[4] + Center.y * Transform[5] + Center.z * Transform[6] + Transform[7]; + float cz = Center.x * Transform[8] + Center.y * Transform[9] + Center.z * Transform[10] + Transform[11]; + float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]); + float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]); + float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]); + + Center.x = cx; Center.y = cy; Center.z = cz; + Extent.x = ex; Extent.y = ey; Extent.z = ez; + + struct AABB3f box; + AABB3f_set_lower(&box, Center - Extent); + AABB3f_set_upper(&box, Center + Extent); + return box; +#else + struct AffineSpace3f xfm = AffineSpace3f_load_row_major(Transform); + + float3 plll = { lower.x, lower.y, lower.z }; + float3 pllu = { lower.x, lower.y, upper.z }; + float3 plul = { lower.x, upper.y, lower.z }; + float3 pluu = { lower.x, upper.y, upper.z }; + float3 pull = { upper.x, lower.y, lower.z }; + float3 pulu = { upper.x, lower.y, upper.z }; + float3 puul = { upper.x, upper.y, lower.z }; + float3 puuu = { upper.x, upper.y, upper.z }; + plll = xfmPoint(xfm, plll) ; + pllu = xfmPoint(xfm, pllu) ; + plul = xfmPoint(xfm, plul) ; + pluu = xfmPoint(xfm, pluu) ; + pull = xfmPoint(xfm, pull) ; + pulu = xfmPoint(xfm, pulu) ; + puul = xfmPoint(xfm, puul) ; + puuu = xfmPoint(xfm, puuu) ; + + float3 p1_min = fmin(plll, pull); + float3 p2_min = fmin(pllu, pulu); + float3 p3_min = fmin(plul, puul); + float3 p4_min = fmin(pluu, puuu); + float3 p1_max = fmax(plll, pull); + float3 p2_max = fmax(pllu, pulu); + float3 p3_max = fmax(plul, puul); + float3 p4_max = fmax(pluu, puuu); + p1_min = fmin(p1_min, p3_min); + p2_min = fmin(p2_min, p4_min); + p1_max = fmax(p1_max, p3_max); + p2_max = fmax(p2_max, p4_max); + p1_min = fmin(p1_min, p2_min); + p1_max = fmax(p1_max, p2_max); + + AABB3f out = { + {p1_min.x,p1_min.y,p1_min.z}, + {p1_max.x,p1_max.y,p1_max.z} + }; + return out; +#endif +} + +GRL_INLINE struct AABB3f GRL_OVERLOADABLE transform_aabb(struct AABB3f box, const float* Transform) +{ + float3 lower = { box.lower[0], box.lower[1], box.lower[2] }; + float3 upper = { box.upper[0], box.upper[1], box.upper[2] }; + return transform_aabb(lower, upper, Transform); +} + +GRL_INLINE struct AABB3f AABB3f_transform(struct AffineSpace3f xfm, struct AABB3f in) +{ + struct AABB3f out; + float rmTransform[12]; + load_row_major_from_AffineSpace3f(xfm, rmTransform); + out = transform_aabb(in, rmTransform); + + return out; +} + +GRL_INLINE bool AABB3f_isIn(struct AABB3f bigger, float3 contained) +{ + bool iscontained = + contained.x >= bigger.lower[0] && + contained.y >= bigger.lower[1] && + contained.z >= bigger.lower[2] && + contained.x <= bigger.upper[0] && + contained.y <= bigger.upper[1] && + contained.z <= bigger.upper[2]; + + return iscontained; +} + +GRL_INLINE bool AABB3f_isSubset(struct AABB3f bigger, struct AABB3f contained) +{ + bool iscontained = + contained.lower[0] >= bigger.lower[0] && + contained.lower[1] >= bigger.lower[1] && + contained.lower[2] >= bigger.lower[2] && + contained.upper[0] <= bigger.upper[0] && + contained.upper[1] <= 
bigger.upper[1] && + contained.upper[2] <= bigger.upper[2]; + + return iscontained; +} + +GRL_INLINE bool AABB3f_is_degenerate(struct AABB3f* box ) +{ + return box->lower[0] > box->upper[0] || + box->lower[1] > box->upper[1] || + box->lower[2] > box->upper[2]; +} + +GRL_INLINE void AABB3f_print(struct AABB3f *aabb) +{ + printf("AABB {\n"); + printf(" lower = %f, %f, %f\n", aabb->lower[0], aabb->lower[1], aabb->lower[2]); + printf(" upper = %f, %f, %f\n", aabb->upper[0], aabb->upper[1], aabb->upper[2]); + printf("}\n"); +} + + + +#ifdef __OPENCL_VERSION__ +GRL_INLINE struct AABB3f AABB3f_sub_group_shuffle(struct AABB3f *aabb, const uint slotID) +{ + struct AABB3f bounds; + bounds.lower[0] = intel_sub_group_shuffle(aabb->lower[0], slotID); + bounds.lower[1] = intel_sub_group_shuffle(aabb->lower[1], slotID); + bounds.lower[2] = intel_sub_group_shuffle(aabb->lower[2], slotID); + bounds.upper[0] = intel_sub_group_shuffle(aabb->upper[0], slotID); + bounds.upper[1] = intel_sub_group_shuffle(aabb->upper[1], slotID); + bounds.upper[2] = intel_sub_group_shuffle(aabb->upper[2], slotID); + return bounds; +} + +GRL_INLINE struct AABB3f AABB3f_sub_group_reduce(struct AABB3f *aabb) +{ + struct AABB3f bounds; + bounds.lower[0] = sub_group_reduce_min(aabb->lower[0]); + bounds.lower[1] = sub_group_reduce_min(aabb->lower[1]); + bounds.lower[2] = sub_group_reduce_min(aabb->lower[2]); + bounds.upper[0] = sub_group_reduce_max(aabb->upper[0]); + bounds.upper[1] = sub_group_reduce_max(aabb->upper[1]); + bounds.upper[2] = sub_group_reduce_max(aabb->upper[2]); + return bounds; +} + +GRL_INLINE struct AABB3f AABB3f_sub_group_scan_exclusive_min_max(struct AABB3f *aabb) +{ + struct AABB3f bounds; + bounds.lower[0] = sub_group_scan_exclusive_min(aabb->lower[0]); + bounds.lower[1] = sub_group_scan_exclusive_min(aabb->lower[1]); + bounds.lower[2] = sub_group_scan_exclusive_min(aabb->lower[2]); + bounds.upper[0] = sub_group_scan_exclusive_max(aabb->upper[0]); + bounds.upper[1] = sub_group_scan_exclusive_max(aabb->upper[1]); + bounds.upper[2] = sub_group_scan_exclusive_max(aabb->upper[2]); + return bounds; +} + +GRL_INLINE struct AABB3f AABB3f_sub_group_scan_inclusive_min_max(struct AABB3f *aabb) +{ + struct AABB3f bounds; + bounds.lower[0] = sub_group_scan_inclusive_min(aabb->lower[0]); + bounds.lower[1] = sub_group_scan_inclusive_min(aabb->lower[1]); + bounds.lower[2] = sub_group_scan_inclusive_min(aabb->lower[2]); + bounds.upper[0] = sub_group_scan_inclusive_max(aabb->upper[0]); + bounds.upper[1] = sub_group_scan_inclusive_max(aabb->upper[1]); + bounds.upper[2] = sub_group_scan_inclusive_max(aabb->upper[2]); + return bounds; +} + +GRL_INLINE void AABB3f_atomic_merge_local_nocheck(local struct AABB3f *aabb, const float4 lower, const float4 upper) +{ + atomic_min((local float *)&aabb->lower + 0, lower.x); + atomic_min((local float *)&aabb->lower + 1, lower.y); + atomic_min((local float *)&aabb->lower + 2, lower.z); + atomic_max((local float *)&aabb->upper + 0, upper.x); + atomic_max((local float *)&aabb->upper + 1, upper.y); + atomic_max((local float *)&aabb->upper + 2, upper.z); +} + + +GRL_INLINE void AABB3f_atomic_merge_global_lu( global struct AABB3f* aabb, const float3 lower, const float3 upper ) +{ + atomic_min( (global float*) & aabb->lower + 0, lower.x ); + atomic_min( (global float*) & aabb->lower + 1, lower.y ); + atomic_min( (global float*) & aabb->lower + 2, lower.z ); + atomic_max( (global float*) & aabb->upper + 0, upper.x ); + atomic_max( (global float*) & aabb->upper + 1, upper.y ); + atomic_max( (global 
float*) & aabb->upper + 2, upper.z ); +} + +GRL_INLINE void AABB3f_atomic_merge_local_lu( local struct AABB3f* aabb, const float3 lower, const float3 upper ) +{ + atomic_min( (local float*) & aabb->lower + 0, lower.x ); + atomic_min( (local float*) & aabb->lower + 1, lower.y ); + atomic_min( (local float*) & aabb->lower + 2, lower.z ); + atomic_max( (local float*) & aabb->upper + 0, upper.x ); + atomic_max( (local float*) & aabb->upper + 1, upper.y ); + atomic_max( (local float*) & aabb->upper + 2, upper.z ); +} + +GRL_INLINE void Uniform_AABB3f_atomic_merge_local_sub_group_lu(uniform local struct AABB3f* aabb, const float3 lower, const float3 upper) +{ + float lx = sub_group_reduce_min(lower.x); + float ly = sub_group_reduce_min(lower.y); + float lz = sub_group_reduce_min(lower.z); + + float ux = sub_group_reduce_max(upper.x); + float uy = sub_group_reduce_max(upper.y); + float uz = sub_group_reduce_max(upper.z); + + if (get_sub_group_local_id() == 0) + { + atomic_min((local float*) & aabb->lower + 0, lx); + atomic_min((local float*) & aabb->lower + 1, ly); + atomic_min((local float*) & aabb->lower + 2, lz); + atomic_max((local float*) & aabb->upper + 0, ux); + atomic_max((local float*) & aabb->upper + 1, uy); + atomic_max((local float*) & aabb->upper + 2, uz); + } +} + +GRL_INLINE void AABB3f_atomic_merge_global_sub_group_lu(uniform global struct AABB3f* aabb, const float3 lower, const float3 upper) +{ + uint lane = get_sub_group_local_id(); + float l[3]; + l[0] = sub_group_reduce_min(lower.x); + l[1] = sub_group_reduce_min(lower.y); + l[2] = sub_group_reduce_min(lower.z); + float u[3]; + u[0] = sub_group_reduce_max(upper.x); + u[1] = sub_group_reduce_max(upper.y); + u[2] = sub_group_reduce_max(upper.z); + + if (lane < 3) + { + atomic_min((global float*)&aabb->lower + lane, l[lane]); + atomic_max((global float*)&aabb->upper + lane, u[lane]); + } +} + +GRL_INLINE void AABB3f_atomic_merge_global( global struct AABB3f* aabb, struct AABB3f* other ) +{ + float3 lower = AABB3f_load_lower( other ); + float3 upper = AABB3f_load_upper( other ); + atomic_min( (global float*) & aabb->lower + 0, lower.x ); + atomic_min( (global float*) & aabb->lower + 1, lower.y ); + atomic_min( (global float*) & aabb->lower + 2, lower.z ); + atomic_max( (global float*) & aabb->upper + 0, upper.x ); + atomic_max( (global float*) & aabb->upper + 1, upper.y ); + atomic_max( (global float*) & aabb->upper + 2, upper.z ); +} + +GRL_INLINE void AABB3f_atomic_merge_localBB_nocheck( local struct AABB3f* aabb, struct AABB3f* bb ) +{ + atomic_min( (local float*) & aabb->lower + 0, bb->lower[0] ); + atomic_min( (local float*) & aabb->lower + 1, bb->lower[1] ); + atomic_min( (local float*) & aabb->lower + 2, bb->lower[2] ); + atomic_max( (local float*) & aabb->upper + 0, bb->upper[0] ); + atomic_max( (local float*) & aabb->upper + 1, bb->upper[1] ); + atomic_max( (local float*) & aabb->upper + 2, bb->upper[2] ); +} + +GRL_INLINE void AABB3f_atomic_merge_local(local struct AABB3f *aabb, const float4 lower, const float4 upper) +{ + if (lower.x < aabb->lower[0]) + atomic_min((local float *)&aabb->lower + 0, lower.x); + if (lower.y < aabb->lower[1]) + atomic_min((local float *)&aabb->lower + 1, lower.y); + if (lower.z < aabb->lower[2]) + atomic_min((local float *)&aabb->lower + 2, lower.z); + if (upper.x > aabb->upper[0]) + atomic_max((local float *)&aabb->upper + 0, upper.x); + if (upper.y > aabb->upper[1]) + atomic_max((local float *)&aabb->upper + 1, upper.y); + if (upper.z > aabb->upper[2]) + atomic_max((local float 
*)&aabb->upper + 2, upper.z); +} + +GRL_INLINE void AABB3f_atomic_merge_global_local(global struct AABB3f *dest, local struct AABB3f *source) +{ + float3 l = AABB3f_load_lower(source); + float3 u = AABB3f_load_upper(source); + atomic_min((global float *)&dest->lower + 0, l.x ); + atomic_min((global float *)&dest->lower + 1, l.y ); + atomic_min((global float *)&dest->lower + 2, l.z ); + atomic_max((global float *)&dest->upper + 0, u.x ); + atomic_max((global float *)&dest->upper + 1, u.y ); + atomic_max((global float *)&dest->upper + 2, u.z ); +} + + +struct AABB3f AABB3f_construct( float3 min, float3 max ) +{ + struct AABB3f bb; + bb.lower[0] = min.x; bb.lower[1] = min.y; bb.lower[2] = min.z; + bb.upper[0] = max.x; bb.upper[1] = max.y; bb.upper[2] = max.z; + return bb; +} + +struct AABB3f AABB3f_select( struct AABB3f left, struct AABB3f right, int3 cond ) +{ + float3 l = select( AABB3f_load_lower(&left), AABB3f_load_lower(&right), cond ); + float3 u = select( AABB3f_load_upper(&left), AABB3f_load_upper(&right), cond ); + return AABB3f_construct( l, u ); +} + +#endif + +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) + diff --git a/src/intel/vulkan/grl/include/GRLGen12.h b/src/intel/vulkan/grl/include/GRLGen12.h new file mode 100644 index 00000000000..20849599e91 --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLGen12.h @@ -0,0 +1,691 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file is to contain structure definitions related to the Gen12 QBVH6 acceleration structures +// +// + +//******************************************************************************************** +// WARNING!!!!! +// This file is shared by OpenCL and C++ source code and must be compatible. +// There should only be C structure definitions and trivial GRL_INLINE functions here +// +//******************************************************************************************** + +#pragma once + +#include "GRLRTASCommon.h" +#include "GRLUtilities.h" + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) +GRL_NAMESPACE_BEGIN(GEN12) + + enum_uint8(NodeType) + { + NODE_TYPE_MIXED = 0x0, // identifies a mixed internal node where each child can have a different type + NODE_TYPE_INTERNAL = 0x0, // internal BVH node with 6 children + NODE_TYPE_INSTANCE = 0x1, // instance leaf + NODE_TYPE_PROCEDURAL = 0x3, // procedural leaf + NODE_TYPE_QUAD = 0x4, // quad leaf + NODE_TYPE_INVALID = 0x7 // indicates invalid node + }; + + + typedef enum PrimLeafType + { + TYPE_NONE = 0, + + TYPE_QUAD = 0, + + /* For a node type of NODE_TYPE_PROCEDURAL we support enabling + * and disabling the opaque/non_opaque culling. */ + + TYPE_OPACITY_CULLING_ENABLED = 0, + TYPE_OPACITY_CULLING_DISABLED = 1 + } PrimLeafType; + + #define BVH_MAGIC_MACRO "GEN12_RTAS_005" // If serialization-breaking or algorithm-breaking changes are made, increment the digits at the end + static const char BVH_MAGIC[16] = BVH_MAGIC_MACRO; + + typedef struct BVHBase + { + // TODO: Implement the "copy-first-node" trick... 
duplicate root node here + + uint64_t rootNodeOffset; + + uint32_t reserved; + + uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64 + uint32_t quadLeafStart; + uint32_t quadLeafCur; + uint32_t proceduralDataStart; + uint32_t proceduralDataCur; + uint32_t instanceLeafStart; + uint32_t instanceLeafEnd; + uint32_t backPointerDataStart; // + uint32_t refitTreeletsDataStart; // refit structs + uint32_t refitStartPointDataStart; // + uint32_t BVHDataEnd; + + // number of bottom treelets + // if 1, then the bottom treelet is also tip treelet + uint32_t refitTreeletCnt; + uint32_t refitTreeletCnt2; // always 0, used for atomic updates + // data layout: + // @backPointerDataStart + // 'backpointer' - a dword per inner node. + // The bits are used as follows: + // 2:0 --> Used as a refit counter during BVH refitting. MBZ + // 5:3 --> Number of children + // 31:6 --> Index of the parent node in the internal node array + // The root node has a parent index of all ones + // @refitTreeletsDataStart + // RefitTreelet[], the last treelet is for top treelet all previous are for bottom + // @refitStartPointDataStart + // for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space + // @backPointerDataEnd + + uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves" + uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children) + uint32_t fatLeafTableStart; + uint32_t innerTableStart; + + uint32_t quadLeftoversCountNewAtomicUpdate; // number of quad leftovers for new atomic update + uint32_t quadTableSizeNewAtomicUpdate; // size of quad Table including leftovers, padded to 256 + uint32_t quadIndicesDataStart; + + uint32_t _pad[9]; + + struct RTASMetaData Meta; + + } BVHBase; + + GRL_INLINE struct GeoMetaData* BVHBase_GetGeoMetaData(BVHBase* base) + { + return (struct GeoMetaData*)(((char*)base) + base->Meta.geoDescsStart); + } + +#ifdef __OPENCL_VERSION__ +#define BVH_ROOT_NODE_OFFSET sizeof(BVHBase) +#else +#define BVH_ROOT_NODE_OFFSET sizeof(GRL::RTAS::GEN12::BVHBase) +#endif + +GRL_STATIC_ASSERT( sizeof(BVHBase) == BVH_ROOT_NODE_OFFSET, "Wrong size!"); +GRL_STATIC_ASSERT( (sizeof(BVHBase) % 64) == 0 , "Misaligned size!"); + + typedef struct BackPointers { + } BackPointers; + + // threshold for size of bottom treelets, note usually treelets will be 2-3x smaller than that number + // means that no bottom treelet has more paths than this number + #define TREELET_NUM_STARTPOINTS 1536 + + // threshold under which only one treelet will be created + #define SINGLE_TREELET_THRESHOLD 3072 + + typedef struct LeafTableEntry { + + uint backpointer; + uint inner_node_index; + uint leaf_index; + } LeafTableEntry; + + typedef struct InnerNodeTableEntry { + + uint node_index_and_numchildren; // numchildren in 3 lsbs + uint first_child; + + } InnerNodeTableEntry; + + typedef struct QuadDataIndices + { + uint header_data[4]; + uint vert_idx[4]; + } QuadDataIndices; + + typedef struct RefitTreelet { + uint32_t startpoint_offset; + uint32_t numStartpoints; + uint32_t numNonTrivialStartpoints; + uint8_t maxDepth; + uint8_t depthLess64; // depth from bottom at which there are less 64 paths + uint8_t depthLess128;// depth from bottom at which there are less 128 paths + uint8_t depthLess256;// depth from bottom at which there are less 256 paths + } RefitTreelet; + + // if RefitTreelet has number of startpoints == 1 + // it should be reinterpreted as: + typedef struct 
RefitTreeletTrivial { + uint32_t theOnlyNodeIndex; + uint32_t numStartpoints; // have to be 1 or 0 + int32_t childrenOffsetOfTheNode; // 0th node based + uint8_t maxDepth; + uint8_t numChildrenOfTheNode; + } RefitTreeletTrivial; + + // 5:0 - depth after you die + // 31:6 - Index of the inner node + typedef uint32_t StartPoint; + + struct HwInstanceLeaf; + struct QuadLeaf; + struct ProceduralLeaf; + struct InternalNode; + + typedef struct HwInstanceLeaf HwInstanceLeaf; + typedef struct InternalNode InternalNode; + typedef struct QuadLeaf QuadLeaf; + typedef struct ProceduralLeaf ProceduralLeaf; + + GRL_INLINE uint32_t BackPointer_GetParentIndex( uint32_t bp ) + { + return bp >> 6; + } + GRL_INLINE uint32_t BackPointer_GetNumChildren( uint32_t bp ) + { + return (bp >> 3) & (7); + } + GRL_INLINE uint32_t BackPointer_GetRefitCount( uint32_t bp ) + { + return bp & 7; + } + GRL_INLINE bool BackPointer_IsRoot( uint32_t bp ) + { + return (bp >> 6) == 0x03FFFFFF; + } + + GRL_INLINE InternalNode* BVHBase_GetRootNode( const BVHBase* p ) + { + return (InternalNode*)( ((char*)p) + BVH_ROOT_NODE_OFFSET); + } + + GRL_INLINE AABB3f BVHBase_GetRootAABB(const BVHBase* p) + { + return p->Meta.bounds; + } + + GRL_INLINE InternalNode* BVHBase_GetInternalNodes(const BVHBase* p) + { + return (InternalNode*)(((char*)p) + BVH_ROOT_NODE_OFFSET); + } + GRL_INLINE InternalNode* BVHBase_GetInternalNodesEnd(const BVHBase* p) + { + return (InternalNode*)(((char*)p) + (size_t)(64u * p->nodeDataCur)); + } + GRL_INLINE uint32_t BVHBase_GetNumInternalNodes(const BVHBase* p) + { + return p->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64; + } + + + GRL_INLINE QuadLeaf* BVHBase_GetQuadLeaves(const BVHBase* p) + { + return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafStart)); + } + GRL_INLINE const QuadLeaf* BVHBase_GetQuadLeaves_End(const BVHBase* p) + { + return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafCur)); + } + + GRL_INLINE const ProceduralLeaf* BVHBase_GetProceduralLeaves_End(const BVHBase* p) + { + return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataCur)); + } + + GRL_INLINE ProceduralLeaf* BVHBase_GetProceduralLeaves(const BVHBase* p) + { + return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataStart)); + } + + GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves(const BVHBase* p ) + { + char* pRTASBits = (char*)p; + return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafStart)); + } + + GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves_End(const BVHBase* p ) + { + char* pRTASBits = (char*) p; + return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafEnd)); + } + + GRL_INLINE uint BVHBase_GetNumHWInstanceLeaves( const BVHBase* p ) + { + return (p->instanceLeafEnd - p->instanceLeafStart) / 2; + } + + GRL_INLINE uint* BVHBase_GetRefitStartPoints(const BVHBase* p) + { + return (uint32_t*)(((char*)p) + (size_t)(64u * p->refitStartPointDataStart)); + } + + GRL_INLINE uint BVHBase_GetRefitStartPointsSize(const BVHBase* p) + { + return 64u * (p->fatLeafTableStart - p->refitStartPointDataStart); + } + + GRL_INLINE uint StartPoint_GetDepth(StartPoint s) + { + return s & ((1 << 6) - 1); + } + + GRL_INLINE uint StartPoint_GetNodeIdx(StartPoint s) + { + return s >> 6; + } + + GRL_INLINE RefitTreelet* BVHBase_GetRefitTreeletDescs(const BVHBase* p) + { + return (RefitTreelet*)(((char*)p) + (size_t)(64u * p->refitTreeletsDataStart)); + } + + // this is treelet count as should be executed, ie. num of bottom treelets if there are top and bottoms. 
+ // to get real number of all treelets including tip, the formula is + // actualNumTreelets = refitTreeletCnt > 1 ? refitTreeletCnt + 1 : 1; + GRL_INLINE uint32_t* BVHBase_GetRefitTreeletCntPtr(BVHBase* p) + { + return &p->refitTreeletCnt; + } + + GRL_INLINE uint32_t BVHBase_GetRefitTreeletCnt(const BVHBase* p) + { + return p->refitTreeletCnt; + } + + GRL_INLINE uint32_t BVHBase_IsSingleTreelet(const BVHBase* p) + { + return p->refitTreeletCnt == 1; + } + + GRL_INLINE BackPointers* BVHBase_GetBackPointers(const BVHBase* p) + { + return (BackPointers*)(((char*)p) + (size_t)(64u * p->backPointerDataStart)); + } + + + GRL_INLINE LeafTableEntry* BVHBase_GetFatLeafTable(const BVHBase* p) + { + return (LeafTableEntry*)(((char*)p) + (size_t)(64u * p->fatLeafTableStart)); + } + GRL_INLINE InnerNodeTableEntry* BVHBase_GetInnerNodeTable(const BVHBase* p) + { + return (InnerNodeTableEntry*)(((char*)p) + (size_t)(64u * p->innerTableStart)); + } + GRL_INLINE QuadDataIndices* BVHBase_GetQuadDataIndicesTable(const BVHBase* p) + { + return (QuadDataIndices*)(((char*)p) + (size_t)(64u * p->quadIndicesDataStart)); + } + + GRL_INLINE unsigned* InnerNode_GetBackPointer( + BackPointers* backpointersStruct, + uint32_t inodeOffset /*in 64B units, from the earliest Inner node*/) + { + uint* backpointersArray = (uint*)backpointersStruct; + // BACKPOINTER_LAYOUT + uint new_index = inodeOffset; //<-layout canonical + //uint new_index = inodeOffset*16; //<-layout scattered + // uint new_index = (inodeOffset & (~0xFFFF)) | (((inodeOffset & 0xFF) << 8) | ((inodeOffset & 0xFF00) >> 8)); //<-layout hashed + + return backpointersArray + new_index; + } + + GRL_INLINE uint32_t BVHBase_GetRefitStructsDataSize(const BVHBase* p) + { + return 64u * (p->BVHDataEnd - p->backPointerDataStart); + } + + GRL_INLINE uint32_t BVHBase_GetBackpointersDataSize(const BVHBase* p) + { + return 64u * (p->refitTreeletsDataStart - p->backPointerDataStart); + } + + GRL_INLINE uint32_t* BVHBase_GetBVHDataEnd( const BVHBase* p ) + { + return (uint32_t*)(((char*)p) + (size_t)(64u * p->BVHDataEnd)); + } + + GRL_INLINE bool BVHBase_HasBackPointers( const BVHBase* p ) + { + return p->refitTreeletsDataStart > p->backPointerDataStart; + } + + GRL_INLINE const size_t BVHBase_GetNumQuads(const BVHBase* p) + { + return p->quadLeafCur - p->quadLeafStart; + } + + GRL_INLINE const size_t BVHBase_GetNumProcedurals(const BVHBase* p) + { + return p->proceduralDataCur - p->proceduralDataStart; + } + + GRL_INLINE const size_t BVHBase_GetNumInstances(const BVHBase* p) + { + return (p->instanceLeafEnd - p->instanceLeafStart) / 2; + } + + GRL_INLINE const size_t BVHBase_totalBytes(const BVHBase* p) + { + return p->BVHDataEnd * 64u; + } + + + + struct HwInstanceLeaf + { + /* first 64 bytes accessed during traversal */ + struct Part0 + { + //uint32_t shaderIndex : 24; + //uint32_t geomMask : 8; + uint32_t DW0; + + // uint32_t instanceContributionToHitGroupIndex : 24; + // uint32_t pad0 : 8 + // + // NOTE: Traversal shaders are implemented by aliasing instance leaves as procedural and sending them through the procedural path + // For a procedural instance, bit 29 should be set to 1, to disable "opaque culling" + // and bits 30 and 31 must be zero. 
See also the definition of the 'PrimLeafDesc' structure + uint32_t DW1; + + // uint64_t rootNodePtr : 48; + // uint64_t instFlags : 8; + // uint64_t pad1 : 8; + uint64_t DW2_DW3; + + // Vec3f world2obj_vx; // 1st row of Worl2Obj transform + float world2obj_vx_x; + float world2obj_vx_y; + float world2obj_vx_z; + + // Vec3f world2obj_vy; // 2nd row of Worl2Obj transform + float world2obj_vy_x; + float world2obj_vy_y; + float world2obj_vy_z; + + // Vec3f world2obj_vz; // 3rd row of Worl2Obj transform + float world2obj_vz_x; + float world2obj_vz_y; + float world2obj_vz_z; + + // Vec3f obj2world_p; // translation of Obj2World transform (on purpose in fist 64 bytes) + float obj2world_p_x; + float obj2world_p_y; + float obj2world_p_z; + } part0; + + /* second 64 bytes accessed during shading */ + // NOTE: Everything in this block is under SW control + struct Part1 + { + // uint64_t bvhPtr : 48; + // uint64_t pad : 16; + uint64_t DW0_DW1; + + uint32_t instanceID; + uint32_t instanceIndex; + + // Vec3f world2obj_vx; // 1st row of Worl2Obj transform + float obj2world_vx_x; + float obj2world_vx_y; + float obj2world_vx_z; + + // Vec3f world2obj_vy; // 2nd row of Worl2Obj transform + float obj2world_vy_x; + float obj2world_vy_y; + float obj2world_vy_z; + + // Vec3f world2obj_vz; // 3rd row of Worl2Obj transform + float obj2world_vz_x; + float obj2world_vz_y; + float obj2world_vz_z; + + // Vec3f obj2world_p; // translation of Obj2World transform (on purpose in fist 64 bytes) + float world2obj_p_x; + float world2obj_p_y; + float world2obj_p_z; + } part1; + }; + + __constant const uint64_t c_one = 1ul; + + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceMask( const HwInstanceLeaf* p ) + { + return p->part0.DW0 >> 24; + } + + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceContributionToHitGroupIndex( const HwInstanceLeaf* p ) + { + return p->part0.DW1 & 0x00ffffff; + } + + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceFlags( const HwInstanceLeaf* p ) + { + return (p->part0.DW2_DW3 >> 48) & 0xff; + } + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceID( const HwInstanceLeaf* p ) + { + return p->part1.instanceID; + } + + GRL_INLINE gpuva_t HwInstanceLeaf_GetBVH( const HwInstanceLeaf* p ) { return p->part1.DW0_DW1 & ((c_one << 48) - 1); } + GRL_INLINE gpuva_t HwInstanceLeaf_GetStartNode( const HwInstanceLeaf* p ) { return p->part0.DW2_DW3 & ((c_one << 48) - 1); } + GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceIndex( const HwInstanceLeaf* p ) { return p->part1.instanceIndex; } + + GRL_INLINE void HwInstanceLeaf_GetTransform(struct HwInstanceLeaf* p, float* transform) + { + transform[0] = p->part1.obj2world_vx_x; + transform[1] = p->part1.obj2world_vy_x; + transform[2] = p->part1.obj2world_vz_x; + transform[3] = p->part0.obj2world_p_x; + transform[4] = p->part1.obj2world_vx_y; + transform[5] = p->part1.obj2world_vy_y; + transform[6] = p->part1.obj2world_vz_y; + transform[7] = p->part0.obj2world_p_y; + transform[8] = p->part1.obj2world_vx_z; + transform[9] = p->part1.obj2world_vy_z; + transform[10] = p->part1.obj2world_vz_z; + transform[11] = p->part0.obj2world_p_z; + } + + GRL_INLINE void HwInstanceLeaf_SetBVH( HwInstanceLeaf* p, gpuva_t b ) { + uint64_t mask = ((c_one << 48) - 1); + uint64_t v = p->part1.DW0_DW1; + v = (b & mask) | (v & ~mask); + p->part1.DW0_DW1 = v; + } + GRL_INLINE void HwInstanceLeaf_SetStartNode( HwInstanceLeaf* p, gpuva_t b ) { + uint64_t mask = ((c_one << 48) - 1); + uint64_t v = p->part0.DW2_DW3; + v = (b & mask) | (v & ~mask); + p->part0.DW2_DW3 = v; + } + GRL_INLINE void 
HwInstanceLeaf_SetStartNodeAndInstanceFlags( HwInstanceLeaf* p, + gpuva_t root, + uint8_t flags ) { + uint64_t mask = ((1ull << 48) - 1); + uint64_t v = (root & mask) | ((uint64_t)(flags)<<48); + p->part1.DW0_DW1 = v; + } + + struct InternalNode + { + float lower[3]; // world space origin of quantization grid + int32_t childOffset; // offset to all children in 64B multiples + + uint8_t nodeType; // the type of the node + uint8_t pad; // unused byte + + int8_t exp_x; // 2^exp_x is the size of the grid in x dimension + int8_t exp_y; // 2^exp_y is the size of the grid in y dimension + int8_t exp_z; // 2^exp_z is the size of the grid in z dimension + uint8_t nodeMask; // mask used for ray filtering + + struct ChildData + { + //uint8_t blockIncr : 2; // size of child in 64 byte blocks. Must be ==2 for instance leaves, <=2 for quad leaves. + //uint8_t startPrim : 4; // start primitive in fat leaf mode or child type in mixed mode + //uint8_t pad : 2; // unused bits + uint8_t bits; + } childData[6]; + + uint8_t lower_x[6]; // the quantized lower bounds in x-dimension + uint8_t upper_x[6]; // the quantized upper bounds in x-dimension + uint8_t lower_y[6]; // the quantized lower bounds in y-dimension + uint8_t upper_y[6]; // the quantized upper bounds in y-dimension + uint8_t lower_z[6]; // the quantized lower bounds in z-dimension + uint8_t upper_z[6]; // the quantized upper bounds in z-dimension + }; + + GRL_INLINE uint InternalNode_GetChildBlockIncr( const InternalNode* p, uint idx ) + { + return p->childData[idx].bits & 3; + } + GRL_INLINE uint InternalNode_GetChildStartPrim( const InternalNode* p, uint idx ) + { + return (p->childData[idx].bits>>2) & 0xf; + } + + GRL_INLINE uint8_t InternalNode_GetChildType( const InternalNode* p, uint idx ) + { + return (p->childData[idx].bits >> 2) & 0xF; + } + + GRL_INLINE void InternalNode_SetChildType( InternalNode* p, uint idx, uint type ) + { + uint bits = p->childData[idx].bits; + const uint mask = (0xF << 2); + bits = ((type << 2) & mask) | (bits & ~mask); + p->childData[idx].bits = (uint8_t)bits; + } + + GRL_INLINE bool InternalNode_IsChildValid( const InternalNode* p, size_t child ) + { + bool lower = p->lower_x[child] & 0x80; // invalid nodes are indicated by setting lower_msb = 1 and upper_msb=0 + bool upper = p->upper_x[child] & 0x80; + return !lower || upper; + } + + GRL_INLINE AABB3f InternalNode_GetChildAABB(const InternalNode* node, size_t i) + { + float4 lower, upper; + const float4 base = { node->lower[0], node->lower[1], node->lower[2], 0.0f }; + const int4 lower_i = { node->lower_x[i], node->lower_y[i], node->lower_z[i], 0 }; + const int4 upper_i = { node->upper_x[i], node->upper_y[i], node->upper_z[i], 0 }; + const int4 exp_i = { node->exp_x, node->exp_y, node->exp_z, 0 }; + lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8); + upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8); + AABB3f aabb3f = { + { lower.x, lower.y, lower.z }, + { upper.x, upper.y, upper.z } }; + return aabb3f; + } + + GRL_INLINE void* InternalNode_GetChildren( InternalNode* node) + { + return (void*)(((char*)node) + node->childOffset * 64); + } + + typedef struct PrimLeafDesc + { + //uint32_t shaderIndex : 24; // shader index used for shader record calculations + //uint32_t geomMask : 8; // geometry mask used for ray masking + uint32_t shaderIndex_geomMask; + + //uint32_t geomIndex : 29; // the geometry index specifies the n'th geometry of the scene + //PrimLeafType type : 1; // see above + //GeometryFlags geomFlags : 2; // 
geometry flags of this geometry + uint32_t geomIndex_flags; + } PrimLeafDesc; + + GRL_INLINE uint32_t PrimLeaf_GetShaderIndex( const PrimLeafDesc* p ) + { + return p->shaderIndex_geomMask & ((1 << 24) - 1); + } + GRL_INLINE uint32_t PrimLeaf_GetGeoIndex( const PrimLeafDesc* p ) + { + return p->geomIndex_flags & ((1<<29)-1); + } + GRL_INLINE uint32_t PrimLeaf_GetGeomFlags( const PrimLeafDesc* p ) + { + return (p->geomIndex_flags >> 30); + } + GRL_INLINE uint32_t PrimLeaf_GetType(const PrimLeafDesc* p) + { + return (p->geomIndex_flags >> 29) & 1; + } + + struct QuadLeaf + { + PrimLeafDesc leafDesc; + + uint32_t primIndex0; + + //uint32_t primIndex1Delta : 16; + //uint32_t j0 : 2; + //uint32_t j1 : 2; + //uint32_t j2 : 2; + //uint32_t last : 1; // last quad in list + //uint32_t pad : 9; + uint32_t DW1; + + float v[4][3]; + }; + + GRL_INLINE uint32_t QuadLeaf_GetPrimIndexDelta( const QuadLeaf* p ) + { + return p->DW1 & 0x0000ffff; + } + GRL_INLINE uint32_t QuadLeaf_GetPrimIndex0( const QuadLeaf* p ) + { + return p->primIndex0; + } + GRL_INLINE uint32_t QuadLeaf_GetPrimIndex1( const QuadLeaf* p ) + { + return p->primIndex0 + QuadLeaf_GetPrimIndexDelta(p); + } + GRL_INLINE bool QuadLeaf_IsSingleTriangle( const QuadLeaf* p ) + { + return QuadLeaf_GetPrimIndexDelta(p) == 0; + } + GRL_INLINE uint32_t QuadLeaf_GetSecondTriangleIndices( const QuadLeaf* p ) + { + return (p->DW1>>16) & 0x3f; + } + + GRL_INLINE void QuadLeaf_SetVertices( QuadLeaf* quad, float3 v0, float3 v1, float3 v2, float3 v3 ) + { + quad->v[0][0] = v0.x; + quad->v[0][1] = v0.y; + quad->v[0][2] = v0.z; + quad->v[1][0] = v1.x; + quad->v[1][1] = v1.y; + quad->v[1][2] = v1.z; + quad->v[2][0] = v2.x; + quad->v[2][1] = v2.y; + quad->v[2][2] = v2.z; + quad->v[3][0] = v3.x; + quad->v[3][1] = v3.y; + quad->v[3][2] = v3.z; + } + + + struct ProceduralLeaf { + PrimLeafDesc leafDesc; + + // Number of primitives + "last" bits. + // The meaning of this section is SW-defined and flexible + uint32_t DW1 ; + uint32_t _primIndex[13]; + } ; + +GRL_NAMESPACE_END(Gen12) +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/include/GRLIntTypes.h b/src/intel/vulkan/grl/include/GRLIntTypes.h new file mode 100644 index 00000000000..573dbbc7481 --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLIntTypes.h @@ -0,0 +1,152 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +//******************************************************************************************** +// WARNING!!!!! 
+// +// This file is shared by OpenCL and C++ source code and must be a pure C header +// There should only be C structure definitions and trivial inline functions here +// +//******************************************************************************************** + +#pragma once + +#include "GRLOCLCompatibility.h" + +GRL_NAMESPACE_BEGIN(GRL) + + typedef uint32_t dword; + typedef uint64_t qword; + typedef qword gpuva_t; + + + enum_uint8( InstanceFlags ) + { + INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1, + INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2, + INSTANCE_FLAG_FORCE_OPAQUE = 0x4, + INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8, + }; + + enum_uint8( GeometryFlags ) + { + GEOMETRY_FLAG_NONE = 0x0, + GEOMETRY_FLAG_OPAQUE = 0x1, + GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2, + }; + + enum_uint8( GeometryType ) + { + GEOMETRY_TYPE_TRIANGLES = 0, + GEOMETRY_TYPE_PROCEDURAL = 1, + NUM_GEOMETRY_TYPES = 2 + }; + + // NOTE: Does NOT match DXR + enum_uint8( IndexFormat ) + { + INDEX_FORMAT_NONE = 0, // INDEX_FORMAT_NONE Indicates non-indexed geometry + INDEX_FORMAT_R16_UINT = 2, + INDEX_FORMAT_R32_UINT = 4, + INDEX_FORMAT_END = INDEX_FORMAT_R32_UINT + 1 + }; + + // NOTE: Does NOT match DXR + enum_uint8( VertexFormat ) + { + VERTEX_FORMAT_R32G32_FLOAT = 0, + VERTEX_FORMAT_R32G32B32_FLOAT = 1, + VERTEX_FORMAT_R16G16_FLOAT = 2, + VERTEX_FORMAT_R16G16B16A16_FLOAT = 3, + VERTEX_FORMAT_R16G16_SNORM = 4, + VERTEX_FORMAT_R16G16B16A16_SNORM = 5, + VERTEX_FORMAT_R16G16B16A16_UNORM = 6, + VERTEX_FORMAT_R16G16_UNORM = 7, + VERTEX_FORMAT_R10G10B10A2_UNORM = 8, + VERTEX_FORMAT_R8G8B8A8_UNORM = 9, + VERTEX_FORMAT_R8G8_UNORM = 10, + VERTEX_FORMAT_R8G8B8A8_SNORM = 11, + VERTEX_FORMAT_R8G8_SNORM = 12, + VERTEX_FORMAT_END = VERTEX_FORMAT_R8G8_SNORM + 1 + }; + + + + enum_uint32(RTASFlags) + { + // These flags match DXR + BUILD_FLAG_ALLOW_UPDATE = 1<<0, + BUILD_FLAG_ALLOW_COMPACTION = 1<<1, + BUILD_FLAG_PREFER_FAST_TRACE = 1<<2, + BUILD_FLAG_PREFER_FAST_BUILD = 1<<3, + BUILD_FLAG_MINIMIZE_MEMORY = 1<<4, + BUILD_FLAG_PERFORM_UPDATE = 1<<5, + + // internal flags start here + BUILD_FLAG_DISALLOW_REBRAID = 1<<16, + + BUILD_FLAG_ALL = 0x0001003f + }; + + enum_uint8(BVHType) + { + BVH_TYPE_NONE, // This is a sentinel for drivers to use when compiling out GRL on non-RT devices + BVH_TYPE_GEN12, + }; + + enum_uint8(PostBuildInfoType) + { + PBI_CURRENT_SIZE, + PBI_COMPACTED_SIZE, + PBI_DXR_TOOLS_VISUALIZATION_DESC, + PBI_DXR_SERIALIZATION_DESC, + }; + + enum_uint32(HazardTypes) + { + HAZARD_RTAS_READ = 1 << 0, + HAZARD_RTAS_WRITE = 1 << 1, + HAZARD_READ = 1 << 2, + HAZARD_WRITE = 1 << 3, + HAZARD_ALL = 0xf + }; + + enum_uint32(RaytracingAccelerationStructureType) + { + TOP_LEVEL = 0x0, + BOTTOM_LEVEL = 0x1, + }; + + typedef struct PostbuildInfoCurrentSize + { + uint64_t CurrentSizeInBytes; + } PostbuildInfoCurrentSize; + + typedef struct PostbuildInfoCompactedSize + { + uint64_t CompactedSizeInBytes; + } PostbuildInfoCompactedSize; + + typedef struct PostbuildInfoToolsVisualizationDesc + { + uint64_t DecodedSizeInBytes; + } PostbuildInfoToolsVisualizationDesc; + + typedef struct PostbuildInfoSerializationDesc + { + uint64_t SerializedSizeInBytes; + uint64_t NumBottomLevelAccelerationStructurePointers; + } PostbuildInfoSerializationDesc; + + typedef struct DecodeHeader + { + RaytracingAccelerationStructureType Type; + uint32_t NumDesc; + } DecodeHeader; + + +GRL_NAMESPACE_END(GRL) \ No newline at end of file diff --git a/src/intel/vulkan/grl/include/GRLOCLCompatibility.h 
b/src/intel/vulkan/grl/include/GRLOCLCompatibility.h new file mode 100644 index 00000000000..dd9ff2c271a --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLOCLCompatibility.h @@ -0,0 +1,205 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#ifdef __OPENCL_VERSION__ + +typedef uchar uint8_t; +typedef ushort uint16_t; +typedef uint uint32_t; +typedef ulong uint64_t; +typedef char int8_t; +typedef short int16_t; +typedef int int32_t; +typedef long int64_t; + +#else + +#include + +typedef uint8_t uchar; +typedef uint16_t ushort; +typedef uint32_t uint; +typedef uint64_t ulong; + +#define __constant +#define __global + +typedef struct uint2 +{ +#ifdef __cplusplus + uint2() {}; + uint2( uint ix, uint iy ) : x( ix ), y( iy ) {}; +#endif + uint x; + uint y; +} uint2; + +typedef struct uint3 +{ +#ifdef __cplusplus + uint3() {}; + uint3( uint ix, uint iy, uint iz ) : x( ix ), y( iy ), z( iz ) {}; +#endif + uint x; + uint y; + uint z; +} uint3; + +typedef struct int3 +{ + int32_t x; + int32_t y; + int32_t z; + +#ifdef __cplusplus + int3() {}; + int3(int32_t ix, int32_t iy, int32_t iz) : x(ix), y(iy), z(iz) {}; + + int3 operator+(const int32_t i) const { return int3(this->x + i, this->y + i, this->z + i); } + int3 operator<<(const int32_t i) const { return int3(this->x << i, this->y << i, this->z << i); } +#endif +} int3; + +typedef struct int4 +{ + int32_t x; + int32_t y; + int32_t z; + int32_t w; + +#ifdef __cplusplus + int4() {}; + int4(int32_t ix, int32_t iy, int32_t iz, int32_t iw) : x(ix), y(iy), z(iz), w(iw) {}; + + int4 operator+(const int32_t i) const { return int4(this->x + i, this->y + i, this->z + i, this->w + i); } + int4 operator-(const int32_t i) const { return int4(this->x - i, this->y - i, this->z - i, this->w - i); } + int4 operator<<(const int32_t i) const { return int4(this->x << i, this->y << i, this->z << i, this->w << i); } +#endif +} int4; + +typedef struct float3 +{ + float x; + float y; + float z; + +#ifdef __cplusplus + float3(){}; + float3( float ix, float iy, float iz ) : x(ix), y(iy), z(iz){}; + + float3 operator+( const float3& f3 ) { return float3( this->x + f3.x, this->y + f3.y, this->z + f3.z ); } + float3 operator*( const float& f ) { return float3( this->x * f, this->y * f, this->z * f ); } + float3 operator*( const float3& f3 ) const { return float3(this->x * f3.x, this->y * f3.y, this->z * f3.z); } + float3 operator-() { return float3(-this->x, -this->y, -this->z); } + float3 operator-( const float3& f3) { return float3(this->x - f3.x, this->y - f3.y, this->z - f3.z); } +#endif +} float3; + +typedef struct float4 +{ + float x; + float y; + float z; + float w; + +#ifdef __cplusplus + float4() {}; + float4( float ix, float iy, float iz, float iw ) : x( ix ), y( iy ), z( iz ), w( iw ) {}; + + float4 operator+(const float4& f4) const { return float4(this->x + f4.x, this->y + f4.y, this->z + f4.z, this->w + f4.w); } + float4 operator*(const float4& f4) const { return float4(this->x * f4.x, this->y * f4.y, this->z * f4.z, this->w * f4.w); } +#endif +} float4; + +#endif /* ! 
__OPENCL_VERSION__ */ + + +#ifndef __cplusplus + +#define GRL_NAMESPACE_BEGIN(x) +#define GRL_NAMESPACE_END(x) +#define GRL_OVERLOADABLE __attribute((overloadable)) +#define GRL_INLINE __attribute__((always_inline)) inline static + +# define enum_uint8(name) \ + typedef uint8_t name; \ + enum name##_uint32 +# define enum_uint16(name) \ + typedef uint16_t name; \ + enum name##_uint32 +# define enum_uint32(name) \ + typedef uint32_t name; \ + enum name##_uint32 + +#define OCL_BYTE_ALIGN(n) __attribute__ ((aligned (n))) +#define GRL_STATIC_ASSERT(condition,desc) + +#else /* C++ */ +#ifdef __OPENCL_VERSION__ +#error "OpenCL C++ not supported by this header" +#endif + +#define GRL_NAMESPACE_BEGIN(x) namespace x { +#define GRL_NAMESPACE_END(x) } +#define GRL_OVERLOADABLE +#define GRL_INLINE inline + +#define enum_uint8(N) enum N : uint8_t +#define enum_uint16(N) enum N : uint16_t +#define enum_uint32(N) enum N : uint32_t + +#define OCL_BYTE_ALIGN(n) +#define GRL_STATIC_ASSERT(condition,desc) static_assert( condition, desc ) + +#include + +inline float3 fmin(float3 a, float3 b) +{ + float3 o = { std::fmin(a.x, b.x), std::fmin(a.y, b.y), std::fmin(a.z, b.z) }; + return o; +} + +inline float3 fmax(float3 a, float3 b) +{ + float3 o = { std::fmax(a.x, b.x), std::fmax(a.y, b.y), std::fmax(a.z, b.z) }; + return o; +} + +inline float3 operator/(const float3& f3, const float& f) { return float3(f3.x / f, f3.y / f, f3.z / f); } + +inline float dot(const float3& a, const float3& b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +inline float as_float(uint32_t i) +{ + return *reinterpret_cast(&i); +} + +inline float3 as_float3(int3 i3) +{ + return *reinterpret_cast(&i3); +} + +inline float4 as_float4(int4 i4) +{ + return *reinterpret_cast(&i4); +} + +inline float4 convert_float4_rtn(int4 i4) +{ + return float4(static_cast(i4.x), static_cast(i4.y), static_cast(i4.z), static_cast(i4.w)); +} + +inline float4 convert_float4_rtp(int4 i4) +{ + return convert_float4_rtn(i4); +} + +#endif diff --git a/src/intel/vulkan/grl/include/GRLRTASCommon.h b/src/intel/vulkan/grl/include/GRLRTASCommon.h new file mode 100644 index 00000000000..1f2cda2ea0b --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLRTASCommon.h @@ -0,0 +1,142 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +// +// This file is to contain structure definitions for RTAS-related meta-deta. +// The structures here should be generic enough to apply to any acceleration structure. +// If we ever move to KD-Trees or Octrees, this file should not need to change. +// + +//******************************************************************************************** +// WARNING!!!!! +// +// This file is shared by OpenCL and C++ source code and must be a pure C header +// There should only be C structure definitions and trivial inline functions here +// +//******************************************************************************************** + + +#pragma once +#include "GRLIntTypes.h" + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(RTAS) + + typedef struct SerializationIdentifier + { + uint8_t Bytes[16]; + } SerializationIdentifier; + + GRL_STATIC_ASSERT(sizeof(SerializationIdentifier) == 16, "Wrong size!"); + + + // Header structure for RTAS serialization. + // This structure is binary-compatible with the DXR and Vulkan API definitions + typedef struct SerializationHeader + { + SerializationIdentifier DriverID; // DXR 'DriverOpaqueGUID'. 
Vulkan: 'driverUUID' + SerializationIdentifier GRLID; // DXR 'DriverOpaqueVersioningData'. Vulkan: 'accelerationStructureUUID' + + uint64_t SerializedSizeInBytesIncludingHeader; + uint64_t DeserializedSizeInBytes; + uint64_t InstanceHandleCount; + } SerializationHeader; + + GRL_STATIC_ASSERT(sizeof(SerializationHeader) == 56, "Wrong size!"); + + // This structure is binary-compatible with DXR and Vulkan 'InstanceDesc' structures + typedef struct InstanceDesc { + float Transform[3][4]; + uint32_t InstanceIDAndMask; // mask in 8 msbs + uint32_t InstanceContributionToHitGroupIndexAndFlags; // flags in 8 msbs + gpuva_t AccelerationStructureGPUVA; // NOTE: In GRL this is always a VA. Vulkan CPU builds use handles here, and these may need to be translated + } InstanceDesc; + GRL_STATIC_ASSERT(sizeof(InstanceDesc) == 64, "Wrong size!"); + + typedef struct GeoMetaData{ + uint32_t PrimitiveCount; + uint16_t Type; + uint16_t Flags; + } GeoMetaData; + GRL_STATIC_ASSERT(sizeof(GeoMetaData) == 8, "Wrong size!"); + + typedef struct AABB3f { + float lower[3]; + float upper[3]; + } AABB3f; + GRL_STATIC_ASSERT(sizeof(AABB3f) == 24, "Wrong size!"); + + enum_uint32(error_t_) { + error_t_no_error = 0x0, + error_t_internal_node_child_OOB = 0x1, + error_t_leaf_node_child_OOB = 0x2, + error_t_unrecognised_node_t = 0x4, + error_t_mixed_node_unsupported = 0x8, + error_t_instance_pointers_inconsistent = 0x10, + error_t_instance_pointed_root_not_internal = 0x20, + error_t_leaf_node_instance_child_missed_by_64B = 0x40, + error_t_internal_node_child_cycle = 0x80, + error_t_input_geo_insane = 0x100, + error_t_quad_leaf_broken = 0x200, + error_t_backpointer_not_reset = 0x400, + error_t_backpointer_wrong_children_num = 0x500, + error_t_backpointer_inconsitent_parent_child = 0x600, + error_t_backpointer_root_not_root_error = 0x700, + error_t_backpointer_OOB = 0x800, + error_t_backpointers_buffer_too_small = 0x900, + error_t_atomic_update_struct_fatleaf_count_oob = 0x1000, // for this and following: + error_t_atomic_update_struct_fatleaf_node_idx_oob = 0x2000, // offset_in_BVH is just index in fatleaf or inner node arrays + error_t_atomic_update_struct_fatleaf_backpointer_mismatch = 0x3000, + error_t_atomic_update_struct_fatleaf_num_children_error = 0x4000, + error_t_atomic_update_struct_fatleaf_children_non_leaf = 0x5000, + error_t_atomic_update_struct_inner_count_oob = 0x6000, + error_t_atomic_update_struct_inner_node_idx_oob = 0x7000, + error_t_atomic_update_struct_inner_node_child_idx_error = 0x8000, + error_t_atomic_update_struct_inner_num_children_error = 0x9000, + error_t_atomic_update_struct_inner_children_non_internal = 0xA000, + error_t_unknown = 1u << 31, + }; + + enum_uint32(error_phase_t) { + error_phase_t_unknown = 0, + error_phase_t_post_build_Morton = 1, + error_phase_t_post_build_Trivial = 2, + error_phase_t_post_build_NewSAH = 3, + error_phase_t_post_update = 4, + error_phase_t_pre_update = 5, + error_phase_t_post_copy_op = 6, + }; + + typedef struct ERROR_INFO { + error_t_ type; + uint offset_in_BVH; //in 64B units + error_phase_t when; + uint reserved; + } ERROR_INFO; + + // Meta-data common to all acceleration structures, which is needed to implement required functionality + // All RTAS structures must contain a struct of this type named 'Meta' + typedef struct RTASMetaData { + struct AABB3f bounds; + + uint32_t instanceDescsStart; // byte offset to array of original instance_descs used for build. 
Required for DXR visualization and serialization + uint32_t instanceCount; + + uint32_t geoDescsStart; // byte offset to array of 'GeoMetaData' matching input geos. Required for DXR visualization + uint32_t geoCount; + + uint64_t allocationSize; // Size of the memory allocation containing this RTAS + // This is the size given to the app in the prebuild info when the RTAS was first created + // If RTAS was compacted, this will be the compacted size + + ERROR_INFO errors; // only used in debug mode + } RTASMetaData; + + GRL_STATIC_ASSERT( sizeof(RTASMetaData) == 64, "Wrong size!"); + +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/include/GRLStructs.h b/src/intel/vulkan/grl/include/GRLStructs.h new file mode 100644 index 00000000000..c8af8313ffc --- /dev/null +++ b/src/intel/vulkan/grl/include/GRLStructs.h @@ -0,0 +1,60 @@ +// +// Copyright (C) 2009-2021 Intel Corporation +// +// SPDX-License-Identifier: MIT +// +// + +#pragma once + +#include "GRLIntTypes.h" + +GRL_NAMESPACE_BEGIN(GRL) +GRL_NAMESPACE_BEGIN(_INTERNAL) + + struct GeometryTriangles + { + gpuva_t pTransformBuffer; + gpuva_t pIndexBuffer; + gpuva_t pVertexBuffer; + qword VertexBufferByteStride; + dword IndexCount; + dword VertexCount; + IndexFormat IndexFormat; + VertexFormat VertexFormat; + }; + + struct GeometryProcedural + { + gpuva_t pAABBs_GPUVA; ///, <0,1,0>, <0,0,1> + float obx = Transform[0] * Transform[0] + Transform[4] * Transform[4] + Transform[8] * Transform[8]; + float oby = Transform[1] * Transform[1] + Transform[5] * Transform[5] + Transform[9] * Transform[9]; + float obz = Transform[2] * Transform[2] + Transform[6] * Transform[6] + Transform[10] * Transform[10]; + + float obb_sq_half_surf = obx * oby + oby * obz + obz * obx; + + return obb_sq_half_surf / aabb_sq_half_surf; + + // ex = 2.0 + // ey = 2.0 + // ez = 2.0 + // ex = 4.0 + // ey = 4.0 + // ez = 4.0 + // aabb_half_surf = 16+16 *2.0 + 2.0*2.0+ 2.0*2.0; = 12; + // aabb_sq_half_surf = 144; + // + // obx = 4.0; + // oby = 4.0; + // obz = 4.0; + // obb_sq_half_surf = 16 + 16+ 16; + // obb_sq_half_surf = 16.0 *3 = 48 +} + +GRL_INLINE void load_row_major_from_AffineSpace3f(struct AffineSpace3f in, float* out) +{ + out[0] = in.l.vx.x; + out[4] = in.l.vx.y; + out[8] = in.l.vx.z; + out[1] = in.l.vy.x; + out[5] = in.l.vy.y; + out[9] = in.l.vy.z; + out[2] = in.l.vz.x; + out[6] = in.l.vz.y; + out[10] = in.l.vz.z; + + out[3] = in.p.x; + out[7] = in.p.y; + out[11] = in.p.z; +} + +GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct AffineSpace3f xfm, float3 p) +{ + return xfmPoint(xfm.l, p) + xfm.p; +} + +/* compute inverse matrix */ +GRL_INLINE struct AffineSpace3f AffineSpace3f_invert(struct AffineSpace3f in) +{ + const struct LinearSpace3f il = LinearSpace3f_invert(in.l); + float3 ip = -xfmPoint(il, in.p); + return AffineSpace3f_Constructor(il, ip); +} + +GRL_NAMESPACE_END(RTAS) +GRL_NAMESPACE_END(GRL)
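
The affine-transform helpers above pair naturally with the AABB3f type from GRLRTASCommon.h. As an illustration only (this helper is not part of the patch and the name AABB3f_transform_corners is hypothetical), here is a conservative box transform built solely from xfmPoint() and per-component min/max, in the style the headers already use:

GRL_INLINE struct AABB3f AABB3f_transform_corners( struct AffineSpace3f xfm, struct AABB3f box )
{
    struct AABB3f out;
    for (uint i = 0; i < 3; i++) { out.lower[i] = INFINITY; out.upper[i] = -INFINITY; }

    for (uint corner = 0; corner < 8; corner++)
    {
        // pick one of the 8 box corners, one bit per axis
        float3 p = { (corner & 1) ? box.upper[0] : box.lower[0],
                     (corner & 2) ? box.upper[1] : box.lower[1],
                     (corner & 4) ? box.upper[2] : box.lower[2] };

        float3 t = xfmPoint( xfm, p ); // linear part + translation, as defined above

        float tv[3] = { t.x, t.y, t.z };
        for (uint i = 0; i < 3; i++)
        {
            if (tv[i] < out.lower[i]) out.lower[i] = tv[i];
            if (tv[i] > out.upper[i]) out.upper[i] = tv[i];
        }
    }
    return out;
}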
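
The backpointer layout documented in BVHBase (bits 2:0 refit counter, bits 5:3 child count, bits 31:6 parent node index) ships only with decode accessors. A hypothetical encoder, shown purely to make the bit layout concrete (BackPointer_Pack is not a function in this patch):

// Pack a backpointer dword per the documented layout. The refit counter
// (bits 2:0) is MBZ at build time, so it is left at zero here.
GRL_INLINE uint32_t BackPointer_Pack( uint32_t parentIndex, uint32_t numChildren )
{
    return (parentIndex << 6) | (numChildren << 3);
}

// Round-trip against the accessors defined in GRLGen12.h:
//   uint32_t bp = BackPointer_Pack( 42, 6 );
//   BackPointer_GetParentIndex( bp ) == 42
//   BackPointer_GetNumChildren( bp ) == 6
//   BackPointer_GetRefitCount( bp )  == 0
// Packing with parentIndex == 0x03FFFFFF marks the root, which is exactly
// what BackPointer_IsRoot() tests for.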
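
Because child boxes are stored quantized against the parent node's grid, a debug or validation pass can recover a node's bound by unioning its dequantized children. The sketch below is an assumption about how such a pass might look (InternalNode_ComputeBounds is not part of the patch); it uses only InternalNode_IsChildValid() and InternalNode_GetChildAABB() as defined above:

GRL_INLINE struct AABB3f InternalNode_ComputeBounds( const InternalNode* node )
{
    struct AABB3f box;
    for (uint i = 0; i < 3; i++) { box.lower[i] = INFINITY; box.upper[i] = -INFINITY; }

    for (uint c = 0; c < 6; c++)
    {
        if (!InternalNode_IsChildValid( node, c ))
            continue; // skipped children are encoded with lower_msb=1, upper_msb=0

        struct AABB3f child = InternalNode_GetChildAABB( node, c );
        for (uint i = 0; i < 3; i++)
        {
            if (child.lower[i] < box.lower[i]) box.lower[i] = child.lower[i];
            if (child.upper[i] > box.upper[i]) box.upper[i] = child.upper[i];
        }
    }
    return box;
}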
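
QuadLeaf::DW1 carries the second-triangle description; the patch documents the bit layout but provides only getters. A hypothetical packer that is consistent with QuadLeaf_GetPrimIndexDelta() and QuadLeaf_GetSecondTriangleIndices() (QuadLeaf_PackDW1 is illustrative, not part of the patch):

// DW1 layout as documented above: primIndex1Delta in bits 15:0, the second
// triangle's vertex selectors j0/j1/j2 in bits 21:16, "last quad" flag in bit 22.
GRL_INLINE uint32_t QuadLeaf_PackDW1( uint32_t primIndex1Delta,
                                      uint32_t j0, uint32_t j1, uint32_t j2,
                                      bool last )
{
    return (primIndex1Delta & 0xffff)
         | ((j0 & 3) << 16)
         | ((j1 & 3) << 18)
         | ((j2 & 3) << 20)
         | ((last ? 1u : 0u) << 22);
}
// QuadLeaf_GetPrimIndexDelta() recovers bits 15:0, and
// QuadLeaf_GetSecondTriangleIndices() recovers the six j0/j1/j2 bits.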
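
Finally, the sub-group atomic merge helpers defined earlier in this patch are meant to be called uniformly by an entire sub-group: the helper reduces the six bound components across the sub-group and only the first lanes issue atomics. A minimal device-side sketch (OpenCL C); the kernel name, its arguments, and the assumption that sceneBounds is pre-initialized to an empty box are all hypothetical, not part of the patch:

__kernel void reduce_scene_bounds( global struct AABB3f* primBounds,
                                   uint numPrims,
                                   global struct AABB3f* sceneBounds )
{
    uint i = get_global_id( 0 );

    // Out-of-range work items contribute an empty box instead of returning
    // early, so the call below stays sub-group uniform.
    float3 lower = (float3)(  INFINITY,  INFINITY,  INFINITY );
    float3 upper = (float3)( -INFINITY, -INFINITY, -INFINITY );
    if (i < numPrims)
    {
        struct AABB3f b = primBounds[i];
        lower = AABB3f_load_lower( &b );
        upper = AABB3f_load_upper( &b );
    }

    // Six sub-group reductions, then only a few lanes touch global memory.
    AABB3f_atomic_merge_global_sub_group_lu( sceneBounds, lower, upper );
}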