From f3ddfd81b4deaa8033d598527e0cbc255e60addc Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 21 Jan 2021 02:18:32 -0600 Subject: [PATCH] anv: Build BVHs on the GPU with GRL Reviewed-by: Lionel Landwerlin Acked-by: Caio Oliveira Part-of: --- src/intel/vulkan/anv_device.c | 15 +- src/intel/vulkan/anv_pipeline.c | 2 + src/intel/vulkan/anv_private.h | 10 + .../vulkan/genX_acceleration_structure.c | 1280 +++++++++++++++++ src/intel/vulkan/genX_cmd_buffer.c | 2 +- src/intel/vulkan/genX_query.c | 85 +- src/intel/vulkan/genX_state.c | 8 + src/intel/vulkan/grl/genX_grl.h | 2 + src/intel/vulkan/grl/genX_grl_uuid.cpp | 39 + src/intel/vulkan/grl/grl_structs.h | 479 ++++++ src/intel/vulkan/grl/meson.build | 24 +- src/intel/vulkan/meson.build | 7 +- 12 files changed, 1934 insertions(+), 19 deletions(-) create mode 100644 src/intel/vulkan/genX_acceleration_structure.c create mode 100644 src/intel/vulkan/grl/genX_grl_uuid.cpp create mode 100644 src/intel/vulkan/grl/grl_structs.h diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 36070e97441..44d7d4d4c7d 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -191,6 +191,7 @@ get_device_extensions(const struct anv_physical_device *device, *ext = (struct vk_device_extension_table) { .KHR_8bit_storage = true, .KHR_16bit_storage = true, + .KHR_acceleration_structure = device->info.has_ray_tracing, .KHR_bind_memory2 = true, .KHR_buffer_device_address = true, .KHR_copy_commands2 = true, @@ -1343,11 +1344,12 @@ void anv_GetPhysicalDeviceFeatures2( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR: { VkPhysicalDeviceAccelerationStructureFeaturesKHR *features = (void *)ext; - features->accelerationStructure = false; - features->accelerationStructureCaptureReplay = false; - features->accelerationStructureIndirectBuild = false; + features->accelerationStructure = pdevice->info.has_ray_tracing; + features->accelerationStructureCaptureReplay = false; /* 
TODO */ + features->accelerationStructureIndirectBuild = false; /* TODO */ features->accelerationStructureHostCommands = false; - features->descriptorBindingAccelerationStructureUpdateAfterBind = true; + features->descriptorBindingAccelerationStructureUpdateAfterBind = + pdevice->info.has_ray_tracing; break; } @@ -3393,6 +3395,11 @@ VkResult anv_CreateDevice( "Anv") + 8, 8), }; + device->rt_uuid_addr = anv_address_add(device->workaround_address, 8); + memcpy(device->rt_uuid_addr.bo->map + device->rt_uuid_addr.offset, + physical_device->rt_uuid, + sizeof(physical_device->rt_uuid)); + device->debug_frame_desc = intel_debug_get_identifier_block(device->workaround_bo->map, device->workaround_bo->size, diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index e179a89490c..df23f1a3eaa 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -2866,6 +2866,8 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline, VkResult anv_device_init_rt_shaders(struct anv_device *device) { + device->bvh_build_method = ANV_BVH_BUILD_METHOD_NEW_SAH; + if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline) return VK_SUCCESS; diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 938532c534d..6c7407742d5 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -975,6 +975,7 @@ struct anv_physical_device { uint8_t pipeline_cache_uuid[VK_UUID_SIZE]; uint8_t driver_uuid[VK_UUID_SIZE]; uint8_t device_uuid[VK_UUID_SIZE]; + uint8_t rt_uuid[VK_UUID_SIZE]; struct vk_sync_type sync_syncobj_type; struct vk_sync_timeline_type sync_timeline_type; @@ -1076,6 +1077,11 @@ anv_device_upload_nir(struct anv_device *device, const struct nir_shader *nir, unsigned char sha1_key[20]); +enum anv_rt_bvh_build_method { + ANV_BVH_BUILD_METHOD_TRIVIAL, + ANV_BVH_BUILD_METHOD_NEW_SAH, +}; + struct anv_device { struct vk_device vk; @@ -1146,6 +1152,7 @@ struct anv_device { struct 
anv_scratch_pool scratch_pool; struct anv_bo *rt_scratch_bos[16]; struct anv_bo *btd_fifo_bo; + struct anv_address rt_uuid_addr; /** Shadow ray query BO * @@ -1165,6 +1172,8 @@ struct anv_device { struct anv_shader_bin *rt_trampoline; struct anv_shader_bin *rt_trivial_return; + enum anv_rt_bvh_build_method bvh_build_method; + pthread_mutex_t mutex; pthread_cond_t queue_submit; @@ -2087,6 +2096,7 @@ anv_pipe_flush_bits_for_access_flags(struct anv_device *device, switch ((VkAccessFlags2)BITFIELD64_BIT(b)) { case VK_ACCESS_2_SHADER_WRITE_BIT: case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT: + case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR: /* We're transitioning a buffer that was previously used as write * destination through the data port. To make its content available * to future operations, flush the hdc pipeline. diff --git a/src/intel/vulkan/genX_acceleration_structure.c b/src/intel/vulkan/genX_acceleration_structure.c new file mode 100644 index 00000000000..d0382ccd88a --- /dev/null +++ b/src/intel/vulkan/genX_acceleration_structure.c @@ -0,0 +1,1280 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +#include + +#include "util/debug.h" +#include "util/half_float.h" +#include "util/u_atomic.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +#if GFX_VERx10 >= 125 + +#include "grl/grl_structs.h" + +/* Wait for the previous dispatches to finish and flush their data port + * writes. + */ +#define ANV_GRL_FLUSH_FLAGS (ANV_PIPE_END_OF_PIPE_SYNC_BIT | \ + ANV_PIPE_DATA_CACHE_FLUSH_BIT | \ + ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) + +static const VkAccelerationStructureGeometryKHR * +get_geometry(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo, + uint32_t index) +{ + return pInfo->pGeometries ? &pInfo->pGeometries[index] : + pInfo->ppGeometries[index]; +} + +static size_t align_transient_size(size_t bytes) +{ + return ALIGN(bytes, 64); +} + +static size_t align_private_size(size_t bytes) +{ + return ALIGN(bytes, 64); +} + +static size_t get_scheduler_size(size_t num_builds) +{ + size_t scheduler_size = sizeof(union SchedulerUnion); + /* add more memory for qnode creation stage if needed */ + if (num_builds > QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) { + scheduler_size += (num_builds - QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) * 2 * + sizeof(struct QNodeGlobalRootBufferEntry); + } + + return align_private_size(scheduler_size); +} + +static size_t +get_batched_binnedsah_transient_mem_size(size_t num_builds) +{ + if (num_builds == 0) + return 0; + return num_builds * (sizeof(struct SAHBuildBuffersInfo) + sizeof(gpuva_t)); +} + +static size_t +get_batched_binnedsah_private_mem_size(size_t num_builds) +{ + if (num_builds == 0) + return 0; + + size_t globals_size = align_private_size(num_builds * sizeof(struct SAHBuildGlobals)); + return 
globals_size + get_scheduler_size(num_builds); +} + +static uint32_t +estimate_qbvh6_nodes(const uint32_t N) +{ + const uint32_t W = 6; + const uint32_t N0 = N / 2 + N % 2; // lowest level with 2 leaves per QBVH6 node + const uint32_t N1 = N0 / W + (N0 % W ? 1 : 0); // filled level + const uint32_t N2 = N0 / W + (N1 % W ? 1 : 0); // filled level + const uint32_t N3 = N0 / W + (N2 % W ? 1 : 0); // filled level + const uint32_t N4 = N3; // overestimate remaining nodes + return N0 + N1 + N2 + N3 + N4; +} + +/* Estimates the worst case number of QBVH6 nodes for a top-down BVH + * build that guarantees to produce subtree with N >= K primitives + * from which a single QBVH6 node is created. + */ +static uint32_t +estimate_qbvh6_nodes_minK(const uint32_t N, uint32_t K) +{ + const uint32_t N0 = N / K + (N % K ? 1 : 0); // lowest level of nodes with K leaves minimally + return N0 + estimate_qbvh6_nodes(N0); +} + +static size_t +estimate_qbvh6_fatleafs(const size_t P) +{ + return P; +} + +static size_t +estimate_qbvh6_nodes_worstcase(const size_t P) +{ + const size_t F = estimate_qbvh6_fatleafs(P); + + // worst-case each inner node having 5 fat-leaf children. 
+ // number of inner nodes is F/5 and number of fat-leaves is F + return F + ceil(F/5.0); +} + +#define sizeof_PrimRef 32 +#define sizeof_HwInstanceLeaf (GENX(RT_BVH_INSTANCE_LEAF_length) * 4) +#define sizeof_InternalNode (GENX(RT_BVH_INTERNAL_NODE_length) * 4) +#define sizeof_Procedural (GENX(RT_BVH_PROCEDURAL_LEAF_length) * 4) +#define sizeof_Quad (GENX(RT_BVH_QUAD_LEAF_length) * 4) + +static struct MKSizeEstimate +get_gpu_size_estimate(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo, + const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos, + const uint32_t *pMaxPrimitiveCounts) +{ + uint32_t num_triangles = 0, num_aabbs = 0, num_instances = 0; + for (unsigned g = 0; g < pInfo->geometryCount; g++) { + const VkAccelerationStructureGeometryKHR *pGeometry = + get_geometry(pInfo, g); + uint32_t prim_count = pBuildRangeInfos != NULL ? + pBuildRangeInfos[g].primitiveCount : pMaxPrimitiveCounts[g]; + + switch (pGeometry->geometryType) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + num_triangles += prim_count; + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + num_aabbs += prim_count; + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + num_instances += prim_count; + break; + default: + unreachable("Unsupported geometry type"); + } + } + const uint32_t num_primitives = num_triangles + num_aabbs + num_instances; + + struct MKSizeEstimate est = {}; + + uint64_t size = sizeof(BVHBase); + size = align_u64(size, 64); + + /* Must immediately follow BVHBase because we use fixed offset to nodes. 
*/ + est.node_data_start = size; + + switch (pInfo->type) { + case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: { + assert(num_triangles == 0 && num_aabbs == 0); + + est.numPrimitives = num_instances; + est.numPrimitivesToSplit = 0; + est.numBuildPrimitives = est.numPrimitives + est.numPrimitivesToSplit; + + est.min_primitives = est.numPrimitives; + est.max_primitives = est.numPrimitives + est.numPrimitivesToSplit; + + unsigned int sizeInnerNodes = + (unsigned int) estimate_qbvh6_nodes_worstcase(est.numBuildPrimitives) * + sizeof_InternalNode; + if (sizeInnerNodes == 0) + sizeInnerNodes = sizeof_InternalNode; + + est.max_inner_nodes = sizeInnerNodes / sizeof_InternalNode; + + size += sizeInnerNodes; + STATIC_ASSERT(sizeof_InternalNode % 64 == 0); + + est.leaf_data_start = size; + size += est.numBuildPrimitives * sizeof_HwInstanceLeaf; + STATIC_ASSERT(sizeof_HwInstanceLeaf % 64 == 0); + + est.leaf_data_size = est.numBuildPrimitives * sizeof_HwInstanceLeaf; + + break; + } + + case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: { + assert(num_instances == 0); + + /* RT: TODO */ + const float split_factor = 0.0f; + uint32_t num_prims_to_split = 0; + if (false) + num_prims_to_split = num_triangles + (double)split_factor; + + const uint32_t num_build_triangles = num_triangles + num_prims_to_split; + const uint32_t num_build_primitives = num_build_triangles + num_aabbs; + + est.numPrimitives = num_primitives; + est.numTriangles = num_triangles; + est.numProcedurals = num_aabbs; + est.numMeshes = pInfo->geometryCount; + est.numBuildPrimitives = num_build_primitives; + est.numPrimitivesToSplit = num_prims_to_split; + est.max_instance_leafs = 0; + + est.min_primitives = (size_t)(num_build_triangles * 0.5f + num_aabbs); + est.max_primitives = num_build_triangles + num_aabbs; + + size_t nodeBytes = 0; + nodeBytes += estimate_qbvh6_nodes_worstcase(num_build_triangles) * sizeof_InternalNode; + nodeBytes += estimate_qbvh6_nodes_worstcase(num_aabbs) * sizeof_InternalNode; + if 
(nodeBytes == 0) // for case with 0 primitives + nodeBytes = sizeof_InternalNode; + nodeBytes = MAX2(nodeBytes, 8 * (size_t)num_build_primitives); // for primref_index0/1 buffers + + est.max_inner_nodes = nodeBytes / sizeof_InternalNode; + + size += nodeBytes; + STATIC_ASSERT(sizeof_InternalNode % 64 == 0); + + est.leaf_data_start = size; + size += num_build_triangles * sizeof_Quad; + STATIC_ASSERT(sizeof_Quad % 64 == 0); + + est.procedural_data_start = size; + size += num_aabbs * sizeof_Procedural; + STATIC_ASSERT(sizeof_Procedural % 64 == 0); + + est.leaf_data_size = num_build_triangles * sizeof_Quad + + num_aabbs * sizeof_Procedural; + + if (num_build_primitives == 0) + size += MAX2(sizeof_Quad, sizeof_Procedural); + break; + } + + default: + unreachable("Unsupported acceleration structure type"); + } + + size = align_u64(size, 64); + est.instance_descs_start = size; + size += sizeof(struct InstanceDesc) * num_instances; + + est.geo_meta_data_start = size; + size += sizeof(struct GeoMetaData) * pInfo->geometryCount; + size = align_u64(size, 64); + + assert(size == align_u64(size, 64)); + est.back_pointer_start = size; + + const bool alloc_backpointers = false; /* RT TODO */ + if (alloc_backpointers) { + size += est.max_inner_nodes * sizeof(uint32_t); + size = align_u64(size, 64); + } + + assert(size < UINT32_MAX); + est.sizeTotal = align_u64(size, 64); + + return est; +} + +struct scratch_layout { + gpuva_t base; + uint32_t total_size; + + gpuva_t primrefs; + gpuva_t globals; + gpuva_t leaf_index_buffers; + uint32_t leaf_index_buffer_stride; + + /* new_sah */ + gpuva_t qnode_buffer; + gpuva_t bvh2_buffer; +}; + +static size_t +get_bvh2_size(uint32_t num_primitivies) +{ + if (num_primitivies == 0) + return 0; + return sizeof(struct BVH2) + + (2 * num_primitivies - 1) * sizeof(struct BVH2Node); +} + +static struct scratch_layout +get_gpu_scratch_layout(struct anv_address base, + struct MKSizeEstimate est, + enum anv_rt_bvh_build_method build_method) +{ + struct 
scratch_layout scratch = { + .base = anv_address_physical(base), + }; + gpuva_t current = anv_address_physical(base); + + scratch.globals = intel_canonical_address(current); + current += sizeof(struct Globals); + + scratch.primrefs = intel_canonical_address(current); + current += est.numBuildPrimitives * sizeof_PrimRef; + + scratch.leaf_index_buffers = intel_canonical_address(current); + current += est.numBuildPrimitives * sizeof(uint32_t) * 2; + scratch.leaf_index_buffer_stride = sizeof(uint32_t); + + switch (build_method) { + case ANV_BVH_BUILD_METHOD_TRIVIAL: + break; + + case ANV_BVH_BUILD_METHOD_NEW_SAH: { + size_t bvh2_size = get_bvh2_size(est.numBuildPrimitives); + if (est.leaf_data_size < bvh2_size) { + scratch.bvh2_buffer = intel_canonical_address(current); + current += bvh2_size; + } + + scratch.qnode_buffer = intel_canonical_address(current); + current += 2 * sizeof(dword) * est.max_inner_nodes; + break; + } + + default: + unreachable("invalid build"); + } + + assert((current - scratch.base) < UINT32_MAX); + scratch.total_size = current - scratch.base; + + return scratch; +} + +static void +anv_get_gpu_acceleration_structure_size( + UNUSED struct anv_device *device, + VkAccelerationStructureBuildTypeKHR buildType, + const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo, + const uint32_t* pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo) +{ + + struct MKSizeEstimate est = get_gpu_size_estimate(pBuildInfo, NULL, + pMaxPrimitiveCounts); + struct scratch_layout scratch = get_gpu_scratch_layout(ANV_NULL_ADDRESS, est, + device->bvh_build_method); + + pSizeInfo->accelerationStructureSize = est.sizeTotal; + pSizeInfo->buildScratchSize = scratch.total_size; + pSizeInfo->updateScratchSize = scratch.total_size; /* TODO */ +} + +void +genX(GetAccelerationStructureBuildSizesKHR)( + VkDevice _device, + VkAccelerationStructureBuildTypeKHR buildType, + const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo, + const uint32_t* 
pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + assert(pSizeInfo->sType == + VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR); + + VkAccelerationStructureBuildSizesInfoKHR gpu_size_info; + anv_get_gpu_acceleration_structure_size(device, buildType, pBuildInfo, + pMaxPrimitiveCounts, + &gpu_size_info); + + pSizeInfo->accelerationStructureSize = + gpu_size_info.accelerationStructureSize; + pSizeInfo->buildScratchSize = gpu_size_info.buildScratchSize; + pSizeInfo->updateScratchSize = gpu_size_info.updateScratchSize; +} + +VkResult +genX(CreateAccelerationStructureKHR)( + VkDevice _device, + const VkAccelerationStructureCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkAccelerationStructureKHR* pAccelerationStructure) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer); + struct anv_acceleration_structure *accel; + + accel = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*accel), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (accel == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + vk_object_base_init(&device->vk, &accel->base, + VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR); + + accel->size = pCreateInfo->size; + accel->address = anv_address_add(buffer->address, pCreateInfo->offset); + + *pAccelerationStructure = anv_acceleration_structure_to_handle(accel); + + return VK_SUCCESS; +} + +void +genX(DestroyAccelerationStructureKHR)( + VkDevice _device, + VkAccelerationStructureKHR accelerationStructure, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_acceleration_structure, accel, accelerationStructure); + + if (!accel) + return; + + vk_object_base_finish(&accel->base); + vk_free2(&device->vk.alloc, pAllocator, accel); +} + +VkDeviceAddress +genX(GetAccelerationStructureDeviceAddressKHR)( + VkDevice device, + 
const VkAccelerationStructureDeviceAddressInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_acceleration_structure, accel, + pInfo->accelerationStructure); + + assert(!anv_address_is_null(accel->address)); + + return anv_address_physical(accel->address); +} + +void +genX(GetDeviceAccelerationStructureCompatibilityKHR)( + VkDevice _device, + const VkAccelerationStructureVersionInfoKHR* pVersionInfo, + VkAccelerationStructureCompatibilityKHR* pCompatibility) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (memcmp(pVersionInfo->pVersionData, + device->physical->rt_uuid, + sizeof(device->physical->rt_uuid)) == 0) { + *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR; + } else { + *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR; + } +} + +static inline uint8_t +vk_to_grl_GeometryFlags(VkGeometryFlagsKHR flags) +{ + uint8_t grl_flags = GEOMETRY_FLAG_NONE; + unsigned mask = flags; + while (mask) { + int i = u_bit_scan(&mask); + switch ((VkGeometryFlagBitsKHR)(1u << i)) { + case VK_GEOMETRY_OPAQUE_BIT_KHR: + grl_flags |= GEOMETRY_FLAG_OPAQUE; + break; + case VK_GEOMETRY_NO_DUPLICATE_ANY_HIT_INVOCATION_BIT_KHR: + grl_flags |= GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION; + break; + default: + unreachable("Unsupported acceleration structure build flag"); + } + } + return grl_flags; +} + +static inline IndexFormat +vk_to_grl_IndexFormat(VkIndexType type) +{ + switch (type) { + case VK_INDEX_TYPE_NONE_KHR: return INDEX_FORMAT_NONE; + case VK_INDEX_TYPE_UINT8_EXT: unreachable("No UINT8 support yet"); + case VK_INDEX_TYPE_UINT16: return INDEX_FORMAT_R16_UINT; + case VK_INDEX_TYPE_UINT32: return INDEX_FORMAT_R32_UINT; + default: + unreachable("Unsupported index type"); + } +} + +static inline VertexFormat +vk_to_grl_VertexFormat(VkFormat format) +{ + switch (format) { + case VK_FORMAT_R32G32_SFLOAT: return VERTEX_FORMAT_R32G32_FLOAT; + case VK_FORMAT_R32G32B32_SFLOAT: return VERTEX_FORMAT_R32G32B32_FLOAT; + case 
VK_FORMAT_R16G16_SFLOAT: return VERTEX_FORMAT_R16G16_FLOAT; + case VK_FORMAT_R16G16B16A16_SFLOAT: return VERTEX_FORMAT_R16G16B16A16_FLOAT; + case VK_FORMAT_R16G16_SNORM: return VERTEX_FORMAT_R16G16_SNORM; + case VK_FORMAT_R16G16B16A16_SNORM: return VERTEX_FORMAT_R16G16B16A16_SNORM; + case VK_FORMAT_R16G16B16A16_UNORM: return VERTEX_FORMAT_R16G16B16A16_UNORM; + case VK_FORMAT_R16G16_UNORM: return VERTEX_FORMAT_R16G16_UNORM; + /* case VK_FORMAT_R10G10B10A2_UNORM: return VERTEX_FORMAT_R10G10B10A2_UNORM; */ + case VK_FORMAT_R8G8B8A8_UNORM: return VERTEX_FORMAT_R8G8B8A8_UNORM; + case VK_FORMAT_R8G8_UNORM: return VERTEX_FORMAT_R8G8_UNORM; + case VK_FORMAT_R8G8B8A8_SNORM: return VERTEX_FORMAT_R8G8B8A8_SNORM; + case VK_FORMAT_R8G8_SNORM: return VERTEX_FORMAT_R8G8_SNORM; + default: + unreachable("Unsupported vertex format"); + } +} + +static struct Geo +vk_to_grl_Geo(const VkAccelerationStructureGeometryKHR *pGeometry, + uint32_t prim_count) +{ + struct Geo geo = { + .Flags = vk_to_grl_GeometryFlags(pGeometry->flags), + }; + + switch (pGeometry->geometryType) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { + const VkAccelerationStructureGeometryTrianglesDataKHR *vk_tri = + &pGeometry->geometry.triangles; + + geo.Type = GEOMETRY_TYPE_TRIANGLES; + + geo.Desc.Triangles.pTransformBuffer = + vk_tri->transformData.deviceAddress; + geo.Desc.Triangles.pIndexBuffer = + vk_tri->indexData.deviceAddress; + geo.Desc.Triangles.pVertexBuffer = + vk_tri->vertexData.deviceAddress; + geo.Desc.Triangles.VertexBufferByteStride = vk_tri->vertexStride; + + if (vk_tri->indexType == VK_INDEX_TYPE_NONE_KHR) { + geo.Desc.Triangles.IndexCount = 0; + geo.Desc.Triangles.VertexCount = prim_count * 3; + geo.Desc.Triangles.IndexFormat = INDEX_FORMAT_NONE; + } else { + geo.Desc.Triangles.IndexCount = prim_count * 3; + geo.Desc.Triangles.VertexCount = vk_tri->maxVertex; + geo.Desc.Triangles.IndexFormat = + vk_to_grl_IndexFormat(vk_tri->indexType); + } + geo.Desc.Triangles.VertexFormat = + 
vk_to_grl_VertexFormat(vk_tri->vertexFormat); + break; + } + + case VK_GEOMETRY_TYPE_AABBS_KHR: { + const VkAccelerationStructureGeometryAabbsDataKHR *vk_aabbs = + &pGeometry->geometry.aabbs; + geo.Type = GEOMETRY_TYPE_PROCEDURAL; + geo.Desc.Procedural.pAABBs_GPUVA = vk_aabbs->data.deviceAddress; + geo.Desc.Procedural.AABBByteStride = vk_aabbs->stride; + geo.Desc.Procedural.AABBCount = prim_count; + break; + } + + default: + unreachable("Invalid geometry type"); + } + + return geo; +} + +#include "grl/grl_metakernel_copy.h" +#include "grl/grl_metakernel_misc.h" +#include "grl/grl_metakernel_build_primref.h" +#include "grl/grl_metakernel_new_sah_builder.h" +#include "grl/grl_metakernel_build_leaf.h" + +struct build_state { + enum anv_rt_bvh_build_method build_method; + + struct MKSizeEstimate estimate; + struct scratch_layout scratch; + struct MKBuilderState state; + + struct anv_address bvh_addr; + + size_t geom_size_prefix_sum_buffer; + size_t transient_size; + + uint32_t leaf_type; + uint32_t leaf_size; + + uint32_t num_geometries; + uint32_t num_instances; + + uint64_t instances_addr; + bool array_of_instances_ptr; + + const VkAccelerationStructureGeometryKHR *vk_geoms; +}; + +static void +get_binnedsah_scratch_buffers(struct build_state *bs, + uint64_t *p_qnode_buffer, + uint64_t *p_primref_indices, + uint64_t *p_bvh2) +{ + if (bs->estimate.numBuildPrimitives == 0) + { + *p_bvh2 = 0; + *p_qnode_buffer = 0; + *p_primref_indices = 0; + return; + } + + size_t bvh2_size = get_bvh2_size(bs->estimate.numBuildPrimitives); + if (bs->estimate.leaf_data_size < bvh2_size) { + assert(bs->scratch.bvh2_buffer != 0); + *p_bvh2 = bs->scratch.bvh2_buffer; + } else { + *p_bvh2 = intel_canonical_address(bs->state.bvh_buffer + + bs->estimate.leaf_data_start); + } + + assert(bs->scratch.qnode_buffer != 0); + *p_qnode_buffer = bs->scratch.qnode_buffer; + + assert(bs->scratch.leaf_index_buffers != 0); + *p_primref_indices = bs->scratch.leaf_index_buffers; +} + +static void 
+write_memory(struct anv_cmd_alloc alloc, size_t offset, const void *data, size_t data_len) +{ + assert((offset + data_len) < alloc.size); + memcpy(alloc.map + offset, data, data_len); +} + +static void +cmd_build_acceleration_structures( + struct anv_cmd_buffer *cmd_buffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + const VkDeviceAddress *pIndirectDeviceAddresses, + const uint32_t *pIndirectStrides, + const uint32_t *const *ppMaxPrimitiveCounts) +{ + struct anv_device *device = cmd_buffer->device; + VK_MULTIALLOC(ma); + + struct build_state *builds; + vk_multialloc_add(&ma, &builds, struct build_state, infoCount); + + if (!vk_multialloc_zalloc(&ma, + &cmd_buffer->device->vk.alloc, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); + return; + } + + /* TODO: Indirect */ + assert(ppBuildRangeInfos != NULL); + + size_t transient_mem_init_globals_size = 0; + size_t transient_mem_init_globals_offset = 0; + + size_t transient_total = 0; + + size_t private_mem_total = 0; + + size_t num_trivial_builds = 0; + size_t num_new_sah_builds = 0; + + /* Prepare a bunch of data for the kernels we have to run. */ + for (uint32_t i = 0; i < infoCount; i++) { + struct build_state *bs = &builds[i]; + + const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i]; + struct anv_address scratch_addr = + anv_address_from_u64(pInfo->scratchData.deviceAddress); + + const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos = + ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL; + const uint32_t *pMaxPrimitiveCounts = + ppMaxPrimitiveCounts ? 
ppMaxPrimitiveCounts[i] : NULL; + + ANV_FROM_HANDLE(anv_acceleration_structure, dst_accel, + pInfo->dstAccelerationStructure); + + bs->build_method = device->bvh_build_method; + + bs->bvh_addr = dst_accel->address; + + bs->estimate = get_gpu_size_estimate(pInfo, pBuildRangeInfos, + pMaxPrimitiveCounts); + bs->scratch = get_gpu_scratch_layout(scratch_addr, bs->estimate, + bs->build_method); + + uint32_t leaf_size, leaf_type; + + switch (pInfo->type) { + case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: { + assert(pInfo->geometryCount == 1); + + const VkAccelerationStructureGeometryKHR *pGeometry = + get_geometry(pInfo, 0); + assert(pGeometry->geometryType == VK_GEOMETRY_TYPE_INSTANCES_KHR); + + const VkAccelerationStructureGeometryInstancesDataKHR *instances = + &pGeometry->geometry.instances; + + bs->num_instances = pBuildRangeInfos[0].primitiveCount; + bs->instances_addr = instances->data.deviceAddress; + bs->array_of_instances_ptr = instances->arrayOfPointers; + leaf_type = NODE_TYPE_INSTANCE; + leaf_size = GENX(RT_BVH_INSTANCE_LEAF_length) * 4; + break; + } + + case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: { + bs->num_geometries = pInfo->geometryCount; + leaf_type = NODE_TYPE_QUAD; + leaf_size = GENX(RT_BVH_QUAD_LEAF_length) * 4; + break; + } + + default: + unreachable("Unsupported acceleration structure type"); + } + + size_t geom_struct_size = bs->num_geometries * sizeof(struct Geo); + size_t geom_prefix_sum_size = ALIGN(sizeof(uint32_t) * (bs->num_geometries + 1), 64); + + bs->transient_size = geom_prefix_sum_size + geom_struct_size; + + bs->geom_size_prefix_sum_buffer = transient_total + 0; + + bs->state = (struct MKBuilderState) { + .geomDesc_buffer = bs->geom_size_prefix_sum_buffer + + geom_prefix_sum_size, + .build_primref_buffer = bs->scratch.primrefs, + .build_globals = bs->scratch.globals, + .bvh_buffer = anv_address_physical(bs->bvh_addr), + .leaf_type = leaf_type, + .leaf_size = leaf_size, + }; + + transient_total += bs->transient_size; + + 
switch (device->bvh_build_method) { + case ANV_BVH_BUILD_METHOD_TRIVIAL: + num_trivial_builds++; + break; + case ANV_BVH_BUILD_METHOD_NEW_SAH: + num_new_sah_builds++; + break; + default: + unreachable("invalid BVH build method"); + } + + transient_mem_init_globals_size += sizeof(struct BatchedInitGlobalsData); + } + + transient_total = align_transient_size(transient_total); + transient_mem_init_globals_offset = transient_total; + transient_total += align_transient_size(transient_mem_init_globals_size); + + size_t transient_mem_binnedsah_size = 0; + size_t transient_mem_binnedsah_offset = 0; + size_t private_mem_binnedsah_size = 0; + size_t private_mem_binnedsah_offset = 0; + + transient_mem_binnedsah_size = get_batched_binnedsah_transient_mem_size(num_new_sah_builds); + transient_mem_binnedsah_offset = transient_total; + transient_total += align_transient_size(transient_mem_binnedsah_size); + + private_mem_binnedsah_size = get_batched_binnedsah_private_mem_size(num_new_sah_builds); + private_mem_binnedsah_offset = private_mem_total; + private_mem_total += align_private_size(private_mem_binnedsah_size); + + /* Allocate required memory */ + struct anv_cmd_alloc private_mem_alloc = + anv_cmd_buffer_alloc_space(cmd_buffer, private_mem_total, 64); + if (private_mem_total > 0 && anv_cmd_alloc_is_empty(private_mem_alloc)) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto error; + } + struct anv_cmd_alloc transient_mem_alloc = + anv_cmd_buffer_alloc_space(cmd_buffer, transient_total, 64); + if (transient_total > 0 && anv_cmd_alloc_is_empty(transient_mem_alloc)) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto error; + } + + uint64_t private_base = anv_address_physical(private_mem_alloc.address); + uint64_t transient_base = anv_address_physical(transient_mem_alloc.address); + + /* Prepare transient memory */ + for (uint32_t i = 0; i < infoCount; i++) { + struct build_state *bs = &builds[i]; + + const 
VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i]; + + const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos = + ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL; + + struct Geo *geos = transient_mem_alloc.map + bs->state.geomDesc_buffer; + uint32_t *prefixes = transient_mem_alloc.map + bs->geom_size_prefix_sum_buffer; + uint32_t prefix_sum = 0; + for (unsigned g = 0; g < bs->num_geometries; g++) { + const VkAccelerationStructureGeometryKHR *pGeometry = get_geometry(pInfo, g); + uint32_t prim_count = pBuildRangeInfos[g].primitiveCount; + geos[g] = vk_to_grl_Geo(pGeometry, prim_count); + + prefixes[g] = prefix_sum; + prefix_sum += prim_count; + } + + prefixes[bs->num_geometries] = prefix_sum; + + bs->geom_size_prefix_sum_buffer = + intel_canonical_address(bs->geom_size_prefix_sum_buffer + + transient_base); + bs->state.geomDesc_buffer = + intel_canonical_address(bs->state.geomDesc_buffer + + transient_base); + + struct BatchedInitGlobalsData data = { + .p_build_globals = bs->scratch.globals, + .p_bvh_buffer = anv_address_physical(bs->bvh_addr), + + .numPrimitives = 0, + .numGeometries = bs->num_geometries, + .numInstances = bs->num_instances, + + .instance_descs_start = bs->estimate.instance_descs_start, + .geo_meta_data_start = bs->estimate.geo_meta_data_start, + .node_data_start = bs->estimate.node_data_start, + .leaf_data_start = bs->estimate.leaf_data_start, + .procedural_data_start = bs->estimate.procedural_data_start, + .back_pointer_start = bs->estimate.back_pointer_start, + .sizeTotal = bs->estimate.sizeTotal, + + .leafType = bs->state.leaf_type, + .leafSize = bs->state.leaf_size, + }; + + write_memory(transient_mem_alloc, + transient_mem_init_globals_offset + i * sizeof(data), + &data, sizeof(data)); + } + + /* Round 1 : init_globals kernel */ + genX(grl_misc_batched_init_globals)( + cmd_buffer, + intel_canonical_address(transient_base + + transient_mem_init_globals_offset), + infoCount); + + cmd_buffer->state.pending_pipe_bits |= 
ANV_GRL_FLUSH_FLAGS; + + /* Round 2 : Copy instance/geometry data from the application provided + * buffers into the acceleration structures. + */ + for (uint32_t i = 0; i < infoCount; i++) { + struct build_state *bs = &builds[i]; + + /* Metadata */ + if (bs->num_instances) { + assert(bs->num_geometries == 0); + + const uint64_t copy_size = bs->num_instances * sizeof(InstanceDesc); + /* This must be calculated in same way as + * groupCountForGeoMetaDataCopySize + */ + const uint32_t num_threads = (copy_size >> 8) + 3; + + if (bs->array_of_instances_ptr) { + genX(grl_misc_copy_instance_ptrs)( + cmd_buffer, + anv_address_physical(anv_address_add(bs->bvh_addr, + bs->estimate.instance_descs_start)), + bs->instances_addr, + copy_size, num_threads); + } else { + genX(grl_misc_copy_instances)( + cmd_buffer, + anv_address_physical(anv_address_add(bs->bvh_addr, + bs->estimate.instance_descs_start)), + bs->instances_addr, + copy_size, num_threads); + } + } + + if (bs->num_geometries) { + assert(bs->num_instances == 0); + const uint64_t copy_size = bs->num_geometries * sizeof(struct GeoMetaData); + + /* This must be calculated in same way as + * groupCountForGeoMetaDataCopySize + */ + const uint32_t num_threads = (copy_size >> 6) + 1; + + genX(grl_misc_copy_geo_meta_data)( + cmd_buffer, + anv_address_physical(anv_address_add(bs->bvh_addr, + bs->estimate.geo_meta_data_start)), + bs->state.geomDesc_buffer, + copy_size, + num_threads); + } + + /* Primrefs */ + if (bs->num_instances) { + if (bs->array_of_instances_ptr) { + genX(grl_build_primref_buildPrimirefsFromInstancesArrOfPtrs)( + cmd_buffer, + bs->instances_addr, + PREFIX_MK_SIZE(grl_build_primref, bs->estimate), + PREFIX_MK_STATE(grl_build_primref, bs->state), + false /* allowUpdate */); + } else { + genX(grl_build_primref_buildPrimirefsFromInstances)( + cmd_buffer, + bs->instances_addr, + PREFIX_MK_SIZE(grl_build_primref, bs->estimate), + PREFIX_MK_STATE(grl_build_primref, bs->state), + false /* allowUpdate */); + } + } + 
+ if (bs->num_geometries) { + const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i]; + const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos = + ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL; + + assert(pInfo->geometryCount == bs->num_geometries); + for (unsigned g = 0; g < pInfo->geometryCount; g++) { + const VkAccelerationStructureGeometryKHR *pGeometry = + get_geometry(pInfo, g); + + switch (pGeometry->geometryType) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + genX(grl_build_primref_primrefs_from_tris)( + cmd_buffer, + PREFIX_MK_STATE(grl_build_primref, bs->state), + PREFIX_MK_SIZE(grl_build_primref, bs->estimate), + bs->state.geomDesc_buffer + g * sizeof(struct Geo), + g, + vk_to_grl_GeometryFlags(pGeometry->flags), + /* TODO: Indirect */ + pBuildRangeInfos[g].primitiveCount); + break; + + case VK_GEOMETRY_TYPE_AABBS_KHR: + genX(grl_build_primref_primrefs_from_proc)( + cmd_buffer, + PREFIX_MK_STATE(grl_build_primref, bs->state), + PREFIX_MK_SIZE(grl_build_primref, bs->estimate), + bs->state.geomDesc_buffer + g * sizeof(struct Geo), + g, + vk_to_grl_GeometryFlags(pGeometry->flags), + /* TODO: Indirect */ + pBuildRangeInfos[g].primitiveCount); + break; + + default: + unreachable("Invalid geometry type"); + } + } + } + } + + cmd_buffer->state.pending_pipe_bits |= ANV_GRL_FLUSH_FLAGS; + + /* Dispatch trivial builds */ + if (num_trivial_builds) { + for (uint32_t i = 0; i < infoCount; i++) { + struct build_state *bs = &builds[i]; + + if (bs->build_method != ANV_BVH_BUILD_METHOD_TRIVIAL) + continue; + + genX(grl_new_sah_builder_single_pass_binsah)( + cmd_buffer, + bs->scratch.globals, + bs->state.bvh_buffer, + bs->state.build_primref_buffer, + bs->scratch.leaf_index_buffers, + false /* alloc_backpointers */); + } + } + + /* Dispatch new SAH builds */ + if (num_new_sah_builds) { + size_t global_ptrs_offset = transient_mem_binnedsah_offset; + size_t buffers_info_offset = transient_mem_binnedsah_offset + sizeof(gpuva_t) * num_new_sah_builds; + 
+ size_t scheduler_offset = private_mem_binnedsah_offset; + size_t sah_globals_offset = private_mem_binnedsah_offset + get_scheduler_size(num_new_sah_builds); + + struct SAHBuildArgsBatchable args = { + .num_builds = infoCount, + .p_globals_ptrs = intel_canonical_address(transient_base + global_ptrs_offset), + .p_buffers_info = intel_canonical_address(transient_base + buffers_info_offset), + .p_scheduler = intel_canonical_address(private_base + scheduler_offset), + .p_sah_globals = intel_canonical_address(private_base + sah_globals_offset), + .num_max_qnode_global_root_buffer_entries = MAX2(num_new_sah_builds, QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM), + }; + + for (uint32_t i = 0; i < infoCount; i++) { + struct build_state *bs = &builds[i]; + + if (bs->build_method != ANV_BVH_BUILD_METHOD_NEW_SAH) + continue; + + uint64_t p_build_primref_index_buffers; + uint64_t p_bvh2; + uint64_t p_qnode_child_buffer; + + get_binnedsah_scratch_buffers(bs, + &p_qnode_child_buffer, + &p_build_primref_index_buffers, + &p_bvh2); + + struct SAHBuildBuffersInfo buffers = { + .p_primref_index_buffers = bs->scratch.leaf_index_buffers, + .p_bvh_base = bs->state.bvh_buffer, + .p_primrefs_buffer = bs->state.build_primref_buffer, + .p_bvh2 = p_bvh2, + .p_qnode_root_buffer = p_qnode_child_buffer, + .sah_globals_flags = 0, + }; + + write_memory(transient_mem_alloc, buffers_info_offset, &buffers, sizeof(buffers)); + buffers_info_offset += sizeof(buffers); + + write_memory(transient_mem_alloc, global_ptrs_offset, &bs->state.build_globals, + sizeof(bs->state.build_globals)); + global_ptrs_offset += sizeof(bs->state.build_globals); + } + + genX(grl_new_sah_builder_new_sah_build_batchable)( + cmd_buffer, PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(grl_new_sah_builder, args)); + } + + if (num_new_sah_builds == 0) + cmd_buffer->state.pending_pipe_bits |= ANV_GRL_FLUSH_FLAGS; + + /* Finally write the leaves. 
*/ + for (uint32_t i = 0; i < infoCount; i++) { + struct build_state *bs = &builds[i]; + + if (bs->num_instances) { + assert(bs->num_geometries == 0); + if (bs->array_of_instances_ptr) { + genX(grl_leaf_builder_buildLeafDXR_instances_pointers)(cmd_buffer, + PREFIX_MK_STATE(grl_leaf_builder, bs->state), + bs->scratch.leaf_index_buffers, + bs->instances_addr, + bs->scratch.leaf_index_buffer_stride, + 0 /* offset */, + bs->estimate.numBuildPrimitives); + } else { + genX(grl_leaf_builder_buildLeafDXR_instances)(cmd_buffer, + PREFIX_MK_STATE(grl_leaf_builder, bs->state), + bs->scratch.leaf_index_buffers, + bs->instances_addr, + bs->scratch.leaf_index_buffer_stride, + 0 /* offset */, + bs->estimate.numBuildPrimitives); + } + } + + if (bs->num_geometries) { + assert(bs->num_instances == 0); + const uint64_t p_numPrimitives = + bs->state.build_globals + offsetof(struct Globals, numPrimitives); + + assert(bs->estimate.numProcedurals == 0 || + bs->estimate.numTriangles == 0); + if (bs->estimate.numProcedurals) { + genX(grl_leaf_builder_buildLeafDXR_procedurals)( + cmd_buffer, + PREFIX_MK_STATE(grl_leaf_builder, bs->state), + bs->scratch.leaf_index_buffers, + bs->scratch.leaf_index_buffer_stride, + 0 /* offset */, + p_numPrimitives); + } else { + genX(grl_leaf_builder_buildLeafDXR_quads)( + cmd_buffer, + PREFIX_MK_STATE(grl_leaf_builder, bs->state), + bs->scratch.leaf_index_buffers, + bs->scratch.leaf_index_buffer_stride, + 0 /* offset */, + p_numPrimitives, + false /* allow_updates */); + } + } + } + + cmd_buffer->state.pending_pipe_bits |= ANV_GRL_FLUSH_FLAGS; + + error: + vk_free(&cmd_buffer->device->vk.alloc, builds); +} + +void +genX(CmdBuildAccelerationStructuresKHR)( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + if (anv_batch_has_error(&cmd_buffer->batch)) + 
return; + + cmd_build_acceleration_structures(cmd_buffer, infoCount, pInfos, + ppBuildRangeInfos, NULL, NULL, NULL); +} + +void +genX(CmdBuildAccelerationStructuresIndirectKHR)( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts) +{ + unreachable("Unimplemented"); +} + +void +genX(CmdCopyAccelerationStructureKHR)( + VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_acceleration_structure, src_accel, pInfo->src); + ANV_FROM_HANDLE(anv_acceleration_structure, dst_accel, pInfo->dst); + + assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR || + pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR); + + if (pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR) { + struct anv_address src_size_addr = anv_address_add( + src_accel->address, + offsetof(struct BVHBase, Meta.allocationSize)); + genX(grl_copy_clone_indirect)(cmd_buffer, + anv_address_physical(dst_accel->address), + anv_address_physical(src_accel->address), + anv_address_physical(src_size_addr)); + } else { + genX(grl_copy_compact)(cmd_buffer, + anv_address_physical(dst_accel->address), + anv_address_physical(src_accel->address)); + } + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; +} + +void +genX(CmdCopyAccelerationStructureToMemoryKHR)( + VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_acceleration_structure, src_accel, pInfo->src); + struct anv_device *device = cmd_buffer->device; + struct anv_address src_size_addr = anv_address_add( + src_accel->address, + offsetof(struct BVHBase, Meta.allocationSize)); + + 
assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR); + + genX(grl_copy_serialize_indirect)(cmd_buffer, + pInfo->dst.deviceAddress, + anv_address_physical(src_accel->address), + anv_address_physical(device->rt_uuid_addr), + anv_address_physical(src_size_addr)); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; +} + +void +genX(CmdCopyMemoryToAccelerationStructureKHR)( + VkCommandBuffer commandBuffer, + const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_acceleration_structure, dst_accel, pInfo->dst); + + assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR); + + uint64_t src_size_addr = pInfo->src.deviceAddress + + offsetof(struct SerializationHeader, DeserializedSizeInBytes); + genX(grl_copy_deserialize_indirect)(cmd_buffer, + anv_address_physical(dst_accel->address), + pInfo->src.deviceAddress, + src_size_addr); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; +} + +/* TODO: Host commands */ + +VkResult +genX(BuildAccelerationStructuresKHR)( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +genX(CopyAccelerationStructureKHR)( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyAccelerationStructureInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +genX(CopyAccelerationStructureToMemoryKHR)( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) +{ + 
ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +genX(CopyMemoryToAccelerationStructureKHR)( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +genX(WriteAccelerationStructuresPropertiesKHR)( + VkDevice _device, + uint32_t accelerationStructureCount, + const VkAccelerationStructureKHR* pAccelerationStructures, + VkQueryType queryType, + size_t dataSize, + void* pData, + size_t stride) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +#endif /* GFX_VERx10 >= 125 */ diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 51f1f1b34d5..565a80e0d3e 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -5345,7 +5345,7 @@ genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer, mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); struct anv_address sysvals_addr = { - .bo = cmd_buffer->device->general_state_pool.block_pool.bo, + .bo = NULL, /* General state buffer is always 0. */ .offset = indirect_data.offset, }; diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 0541b52cc5c..0f1940a5043 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -159,6 +159,12 @@ VkResult genX(CreateQueryPool)( /* Query has two values: begin and end. 
*/ uint64s_per_slot = 1 + 2; break; +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: + uint64s_per_slot = 1 + 1 /* availability + size */; + break; +#endif default: assert(!"Invalid query type"); } @@ -435,13 +441,18 @@ VkResult genX(GetQueryPoolResults)( ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - assert(pool->type == VK_QUERY_TYPE_OCCLUSION || - pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || - pool->type == VK_QUERY_TYPE_TIMESTAMP || - pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || - pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || - pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL || - pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); + assert( +#if GFX_VERx10 >= 125 + pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR || + pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR || +#endif + pool->type == VK_QUERY_TYPE_OCCLUSION || + pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || + pool->type == VK_QUERY_TYPE_TIMESTAMP || + pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || + pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || + pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL || + pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); if (vk_device_is_lost(&device->vk)) return VK_ERROR_DEVICE_LOST; @@ -533,6 +544,10 @@ VkResult genX(GetQueryPoolResults)( break; } +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: +#endif case VK_QUERY_TYPE_TIMESTAMP: { uint64_t *slot = query_slot(pool, firstQuery + i); if (write_results) @@ -716,6 +731,10 @@ void genX(CmdResetQueryPool)( switch (pool->type) { case VK_QUERY_TYPE_OCCLUSION: +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case 
VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: +#endif for (uint32_t i = 0; i < queryCount; i++) { emit_query_pc_availability(cmd_buffer, anv_query_address(pool, firstQuery + i), @@ -1466,6 +1485,10 @@ void genX(CmdCopyQueryPoolResults)( break; case VK_QUERY_TYPE_TIMESTAMP: +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: +#endif result = mi_mem64(anv_address_add(query_addr, 8)); gpu_write_query_result(&b, dest_addr, flags, idx++, result); break; @@ -1486,3 +1509,51 @@ void genX(CmdCopyQueryPoolResults)( dest_addr = anv_address_add(dest_addr, destStride); } } + +#if GFX_VERx10 >= 125 + +#include "grl/include/GRLRTASCommon.h" +#include "grl/grl_metakernel_postbuild_info.h" + +void +genX(CmdWriteAccelerationStructuresPropertiesKHR)( + VkCommandBuffer commandBuffer, + uint32_t accelerationStructureCount, + const VkAccelerationStructureKHR* pAccelerationStructures, + VkQueryType queryType, + VkQueryPool queryPool, + uint32_t firstQuery) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + + assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR || + queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < accelerationStructureCount; i++) { + ANV_FROM_HANDLE(anv_acceleration_structure, accel, pAccelerationStructures[i]); + struct anv_address query_addr = + anv_address_add(anv_query_address(pool, firstQuery + i), 8); + + if (queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR) { + genX(grl_postbuild_info_compacted_size)(cmd_buffer, + anv_address_physical(accel->address), + anv_address_physical(query_addr)); + } else { + assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR); + 
genX(grl_postbuild_info_serialized_size)(cmd_buffer, + anv_address_physical(accel->address), + anv_address_physical(query_addr)); + } + } + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + for (uint32_t i = 0; i < accelerationStructureCount; i++) + emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true); +} +#endif diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c index e45ed6a4f51..378dcd35dee 100644 --- a/src/intel/vulkan/genX_state.c +++ b/src/intel/vulkan/genX_state.c @@ -36,6 +36,11 @@ #include "genxml/genX_pack.h" #include "vk_standard_sample_locations.h" + +#if GFX_VERx10 >= 125 +#include "grl/genX_grl.h" +#endif + #include "vk_util.h" static void @@ -466,6 +471,9 @@ void genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice) { assert(pdevice->info.verx10 == GFX_VERx10); +#if GFX_VERx10 >= 125 + genX(grl_load_rt_uuid)(pdevice->rt_uuid); +#endif } VkResult diff --git a/src/intel/vulkan/grl/genX_grl.h b/src/intel/vulkan/grl/genX_grl.h index 82eaec5fa75..6617e210bae 100644 --- a/src/intel/vulkan/grl/genX_grl.h +++ b/src/intel/vulkan/grl/genX_grl.h @@ -39,6 +39,8 @@ genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer, uint32_t arg_count, const struct anv_kernel_arg *args); +void +genX(grl_load_rt_uuid)(uint8_t *out_uuid); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/intel/vulkan/grl/genX_grl_uuid.cpp b/src/intel/vulkan/grl/genX_grl_uuid.cpp new file mode 100644 index 00000000000..9f43358923f --- /dev/null +++ b/src/intel/vulkan/grl/genX_grl_uuid.cpp @@ -0,0 +1,39 @@ +/* + * Copyright © 2021 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, 
sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <assert.h> +#include <string.h> + +#include "include/GRLGen12.h" + +#include "vulkan/vulkan_core.h" + +extern "C" void +gfx125_grl_load_rt_uuid(uint8_t *out_uuid); + +extern "C" void +gfx125_grl_load_rt_uuid(uint8_t *out_uuid) +{ + assert(sizeof(GRL::RTAS::GEN12::BVH_MAGIC) == VK_UUID_SIZE); + memcpy(out_uuid, GRL::RTAS::GEN12::BVH_MAGIC, VK_UUID_SIZE); +} diff --git a/src/intel/vulkan/grl/grl_structs.h b/src/intel/vulkan/grl/grl_structs.h new file mode 100644 index 00000000000..ed721afa6a2 --- /dev/null +++ b/src/intel/vulkan/grl/grl_structs.h @@ -0,0 +1,479 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be 
included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * This file contains a redefinition of structures defined in the GRL library. + * We need to have those structures defined to allocate & prepare data for + * the OpenCL kernels building acceleration structures. Unfortunately because + * of C++ & OpenCL assumptions in GRL, it's no possible to just include GRL + * header files directly so we have to redefine stuff here. + */ + +#ifndef GRL_STRUCTS_H +#define GRL_STRUCTS_H + +#include "GRLStructs.h" +#include "GRLRTASCommon.h" + +struct MKBuilderState { + qword geomDesc_buffer; + qword build_primref_buffer; + qword build_globals; + qword bvh_buffer; + dword leaf_type; + dword leaf_size; +}; + +#define PREFIX_MK_STATE(prefix, obj) \ + (struct prefix##_MKBuilderState) { \ + .geomDesc_buffer = (obj).geomDesc_buffer, \ + .build_primref_buffer = (obj).build_primref_buffer, \ + .build_globals = (obj).build_globals, \ + .bvh_buffer = (obj).bvh_buffer, \ + .leaf_type = (obj).leaf_type, \ + .leaf_size = (obj).leaf_size, \ + } + +struct MKSizeEstimate { + dword numTriangles; + dword numProcedurals; + dword numPrimitives; + dword numMeshes; + dword numBuildPrimitives; + dword numPrimitivesToSplit; + dword instance_descs_start; + dword geo_meta_data_start; + dword node_data_start; + dword leaf_data_start; + dword procedural_data_start; + dword back_pointer_start; + dword sizeTotal; + dword updateScratchSizeTotal; + dword fatleaf_table_start; + dword 
innernode_table_start; + dword max_fatleaves; + + size_t max_instance_leafs; + size_t max_inner_nodes; + size_t leaf_data_size; + size_t min_primitives; + size_t max_primitives; +}; + +#define PREFIX_MK_SIZE(prefix, obj) \ + (struct prefix##_MKSizeEstimate) { \ + .numTriangles = (obj).numTriangles, \ + .numProcedurals = (obj).numProcedurals, \ + .numPrimitives = (obj).numPrimitives, \ + .numMeshes = (obj).numMeshes, \ + .numBuildPrimitives = (obj).numBuildPrimitives, \ + .numPrimitivesToSplit = (obj).numPrimitivesToSplit, \ + .instance_descs_start = (obj).instance_descs_start, \ + .geo_meta_data_start = (obj).geo_meta_data_start, \ + .node_data_start = (obj).node_data_start, \ + .leaf_data_start = (obj).leaf_data_start, \ + .procedural_data_start = (obj).procedural_data_start, \ + .back_pointer_start = (obj).back_pointer_start, \ + .sizeTotal = (obj).sizeTotal, \ + .updateScratchSizeTotal = (obj).updateScratchSizeTotal, \ + .fatleaf_table_start = (obj).fatleaf_table_start, \ + .innernode_table_start = (obj).innernode_table_start, \ + .max_fatleaves = (obj).max_fatleaves, \ + } + +typedef struct AABB { + float lower[4]; + float upper[4]; +} AABB; + +struct Globals +{ + struct AABB centroidBounds; + + unsigned int build_record_start; + unsigned int numPrimitives; + unsigned int leafPrimType; + unsigned int leafSize; + + unsigned int numSplittedPrimitives; + unsigned int numBuildRecords; + + // spatial split sate + unsigned int numOriginalPrimitives; + float presplitPrioritySum; + float probThreshold; + + // binned-sah bfs state + unsigned int counter; + unsigned int numBuildRecords_extended; + + // sync variable used for global-sync on work groups + unsigned int sync; + + + /* morton code builder state */ + unsigned int shift; // used by adaptive mc-builder + unsigned int shift_mask; // used by adaptive mc-builder + unsigned int binary_hierarchy_root; + unsigned int p0_allocated_num; + unsigned int p0_created_num; + unsigned int morton_sort_in_flight; + unsigned int 
sort_iterations; + + gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid +}; + +typedef struct BVHBase +{ + // TODO: Implement the "copy-first-node" trick... duplicate root node here + + uint64_t rootNodeOffset; + + uint32_t reserved; + + uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64 + uint32_t quadLeafStart; + uint32_t quadLeafCur; + uint32_t proceduralDataStart; + uint32_t proceduralDataCur; + uint32_t instanceLeafStart; + uint32_t instanceLeafEnd; + uint32_t backPointerDataStart; // + uint32_t refitTreeletsDataStart; // refit structs + uint32_t refitStartPointDataStart; // + uint32_t BVHDataEnd; + + // number of bottom treelets + // if 1, then the bottom treelet is also tip treelet + uint32_t refitTreeletCnt; + uint32_t refitTreeletCnt2; // always 0, used for atomic updates + // data layout: + // @backPointerDataStart + // 'backpointer' - a dword per inner node. + // The bits are used as follows: + // 2:0 --> Used as a refit counter during BVH refitting. 
MBZ + // 5:3 --> Number of children + // 31:6 --> Index of the parent node in the internal node array + // The root node has a parent index of all ones + // @refitTreeletsDataStart + // RefitTreelet[], the last treelet is for top treelet all previous are for bottom + // @refitStartPointDataStart + // for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space + // @backPointerDataEnd + + uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves" + uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children) + uint32_t fatLeafTableStart; + uint32_t innerTableStart; + + uint32_t _pad[12]; + + struct RTASMetaData Meta; +} BVHBase; + + +struct BatchedInitGlobalsData +{ + qword p_build_globals; + qword p_bvh_buffer; + dword numPrimitives; + dword numGeometries; + dword numInstances; + dword instance_descs_start; + dword geo_meta_data_start; + dword node_data_start; + dword leaf_data_start; + dword procedural_data_start; + dword back_pointer_start; + dword sizeTotal; + dword leafType; + dword leafSize; + dword fatleaf_table_start; + dword innernode_table_start; +}; + + +#define BFS_NUM_BINS 16 +#define BFS_NUM_VCONTEXTS 256 +#define BFS_MAX_DEPTH 32 + +#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384 + +struct BFS_Split +{ + float sah; + int dim; + int pos; +}; + +struct BFS_BinInfo +{ + float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6] + // The 6 are lower(xyz) and -upper(xyz) + // bins use negated-max so that we can use vectorized mins instead of min/max pairs + uint counts[3 * BFS_NUM_BINS]; +}; + +struct SAHBuildGlobals +{ + qword p_primref_index_buffers; + qword p_primrefs_buffer; + qword p_bvh2; + qword p_globals; // TODO: deprecate this + qword p_bvh_base; + gpuva_t p_qnode_root_buffer; + + dword flags; // bit 1 is 'alloc_backpointers'. 
bit 2 is 'need_masks' + dword num_primrefs; + dword leaf_size; + dword leaf_type; + + dword root_buffer_num_produced; + dword root_buffer_num_produced_hi; + dword root_buffer_num_consumed; + dword root_buffer_num_consumed_hi; + dword root_buffer_num_to_consume; + dword root_buffer_num_to_consume_hi; +}; + +typedef union LRBounds +{ + struct + { + struct AABB3f left_centroid_bounds; + struct AABB3f left_geom_bounds; + struct AABB3f right_centroid_bounds; + struct AABB3f right_geom_bounds; + } boxes; + struct + { + float Array[24]; + } scalars; +} LRBounds; + + +struct VContext +{ + uint dispatch_primref_begin; // range of primrefs for this task + uint dispatch_primref_end; + uint bvh2_root; // BVH2 root node for this task + uint tree_depth; // depth of this node in the tree + uint num_left; // primref counts + uint num_right; + uint lr_mask; // lower 8b : left mask. upper 8b : right mask + uint batch_index; + + // pass1 global working state and output + struct BFS_Split split; + struct BFS_BinInfo global_bin_info; + + // pass2 global working state and output + LRBounds lr_bounds; +}; + + + +struct BFSDispatchRecord +{ + ushort batch_index; + ushort context_id; +}; + + +struct BFSDispatchQueue +{ + uint num_dispatches; + uint wg_count[BFS_NUM_VCONTEXTS]; + struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS]; +}; + +struct BFS1SpillStackEntry +{ + uint primref_begin; + uint primref_end; + uint bvh2_root; + ushort tree_depth; + ushort batch_index; +}; + +struct BFS1SpillStack +{ + uint size; + struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH]; +}; + +struct QNodeGlobalRootBufferEntry +{ + uint bvh2_node; + uint qnode; + uint build_idx; + uint _pad; +}; + +struct QNodeGlobalRootBuffer +{ + uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM + struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2]; +}; + +struct DFSDispatchRecord +{ + uint 
primref_base; + uint bvh2_base; + uint batch_index; + ushort num_primrefs; + ushort tree_depth; +}; + + +struct DFSDispatchQueue +{ + struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2]; +}; + +#define VCONTEXT_STATE_EXECUTING 0 +#define VCONTEXT_STATE_UNALLOCATED 1 + +union SchedulerUnion +{ + struct VContextScheduler + { + ///////////////////////////////////////////////////////////// + // State data used for communication with command streamer + // NOTE: This part must match definition in 'new_sah_builder.grl' + ///////////////////////////////////////////////////////////// + + dword num_bfs_wgs; + dword num_dfs_wgs; + + dword scheduler_postsync; + dword _pad1; + + dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). + dword num_single_builds; // number of single-wg builds (#primrefs < threshold) + + dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass + dword batched_build_loop_mask; // value is 0 if #builds <= #contexts. else 1 command streamer uses this as a loop condition + + ///////////////////////////////////////////////////////////// + + dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer + dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer + + dword vcontext_state[BFS_NUM_VCONTEXTS]; + + struct BFSDispatchQueue bfs_queue; + struct DFSDispatchQueue dfs_queue; + + struct VContext contexts[BFS_NUM_VCONTEXTS]; + + struct BFS1SpillStack bfs2_spill_stack; + } vContextScheduler; + + struct QnodeScheduler + { + dword num_qnode_grb_curr_entries; + dword num_qnode_grb_new_entries; + + dword scheduler_postsync; + dword _pad1; + + dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). 
+ dword num_single_builds; // number of single-wg builds (#primrefs < threshold) + + dword batched_builds_to_process; + dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer + + ///////////////////////////////////////////////////////////// + + dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer + dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer + + struct QNodeGlobalRootBuffer qnode_global_root_buffer; + } qnodeScheduler; +}; + + +struct BVH2Node +{ + struct AABB3f box; + uint meta_u; // leaf: primref start. inner: offset from node to its first child + uint meta_ss; + //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes + //uchar is_inner; // 1 if inner, 0 if leaf + //uchar mask; +}; + +struct BVH2 +{ + uint num_nodes; + uint _pad[7]; // align to 32B +}; + +struct BatchedBLSDispatchEntry +{ + ///////////////////////////////////////////////////////////// + // State data used for communication with command streamer + // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl' + ///////////////////////////////////////////////////////////// + qword p_data_buffer; + qword num_elements; // number of elements in p_data_buffer +}; + +struct SAHBuildArgsBatchable +{ + qword p_globals_ptrs; + qword p_scheduler; + qword p_buffers_info; + qword p_sah_globals; + + dword num_max_qnode_global_root_buffer_entries; + dword num_builds; +}; + +#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj) \ + (struct prefix##_SAHBuildArgsBatchable) { \ + .p_globals_ptrs = (obj).p_globals_ptrs, \ + .p_scheduler = (obj).p_scheduler, \ + .p_buffers_info = (obj).p_buffers_info, \ + .p_sah_globals = (obj).p_sah_globals, \ + .num_max_qnode_global_root_buffer_entries = \ + (obj).num_max_qnode_global_root_buffer_entries, \ + .num_builds = (obj).num_builds, \ + } + + +struct SAHBuildBuffersInfo +{ + gpuva_t p_globals; + 
gpuva_t p_primref_index_buffers; + gpuva_t p_primrefs_buffer; + gpuva_t p_bvh2; + gpuva_t p_bvh_base; + gpuva_t p_qnode_root_buffer; + dword sah_globals_flags; + dword _pad; + gpuva_t _pad2; +}; + +#endif /* GRL_STRUCTS_H */ diff --git a/src/intel/vulkan/grl/meson.build b/src/intel/vulkan/grl/meson.build index 99ca3febbe0..1230ba82935 100644 --- a/src/intel/vulkan/grl/meson.build +++ b/src/intel/vulkan/grl/meson.build @@ -49,6 +49,7 @@ endforeach grl_genX_files = [ 'genX_grl_dispatch.c', + 'genX_grl_uuid.cpp', ] grl_lib_args = [] @@ -151,7 +152,7 @@ foreach t : [['125', 'gfx125', 'dg2']] grl_genX_libs += static_library( 'grl_@0@'.format(genX_prefix), [grl_cl_kernel_h, grl_compiled_cl_kernels, grl_cl_kernel_c, - grl_genX_files, grl_metakernel_c], + grl_genX_files, grl_metakernel_c, grl_metakernel_h], include_directories : [ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler, inc_intel, inc_anv, @@ -160,6 +161,10 @@ foreach t : [['125', 'gfx125', 'dg2']] no_override_init_args, c_sse2_args, '-DGFX_VERx10=@0@'.format(verX10), ], + cpp_args : [ + no_override_init_args, c_sse2_args, + '-DGFX_VERx10=@0@'.format(verX10), + ], dependencies : [ dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers, idep_vulkan_runtime_headers, @@ -168,6 +173,13 @@ foreach t : [['125', 'gfx125', 'dg2']] ) endforeach +libgrl_deps = [ + dep_valgrind, + idep_nir_headers, + idep_vulkan_util_headers, + idep_vulkan_wsi_headers, +] + libgrl = static_library( 'grl', [grl_cl_kernel_h], @@ -176,8 +188,12 @@ libgrl = static_library( inc_compiler, ], link_whole : [grl_genX_libs], - dependencies : [ - dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers - ], + dependencies : libgrl_deps, install : true, ) +idep_grl = declare_dependency( + link_with : libgrl, + dependencies : libgrl_deps, + sources : grl_metakernel_h, + include_directories : include_directories('include', 'gpu'), +) diff --git a/src/intel/vulkan/meson.build 
b/src/intel/vulkan/meson.build index 90a2dda0b5c..f2ac6634eff 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -73,6 +73,7 @@ endif libanv_per_hw_ver_libs = [] anv_per_hw_ver_files = files( + 'genX_acceleration_structure.c', 'genX_blorp_exec.c', 'genX_cmd_buffer.c', 'genX_gpu_memcpy.c', @@ -100,12 +101,12 @@ foreach g : [['90', ['gfx8_cmd_buffer.c']], dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml, idep_vulkan_util_headers, idep_vulkan_wsi_headers, idep_vulkan_runtime_headers, idep_intel_driver_ds_headers, + idep_grl, ], ) endforeach libanv_files = files( - 'anv_acceleration_structure.c', 'anv_allocator.c', 'anv_android.h', 'anv_batch_chain.c', @@ -194,7 +195,7 @@ libvulkan_intel = shared_library( include_directories : [ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, ], - link_whole : [libanv_common, libanv_per_hw_ver_libs], + link_whole : [libanv_common, libanv_per_hw_ver_libs, libgrl], link_with : [ libintel_compiler, libintel_dev, libisl, libblorp, libintel_perf, ], @@ -232,7 +233,7 @@ if with_tests ], link_whole : libanv_common, link_with : [ - libanv_per_hw_ver_libs, libintel_compiler, libintel_common, libintel_dev, + libanv_per_hw_ver_libs, libgrl, libintel_compiler, libintel_common, libintel_dev, libisl, libblorp, libintel_perf, ], dependencies : [