mesa/src/intel/vulkan/bvh/copy.comp
Sagar Ghuge 6aa3b70382 anv: Mark RootNodeOffset at 256B always
This commit changes the BVH layout a little so that we can load the BVH
offset as a constant rather than reading it from memory.

We have to force the instance leaves pointer at the end which gets used
in copy.comp shader.

Totals:
Instrs: 54798 -> 54728 (-0.13%)
Send messages: 3854 -> 3847 (-0.18%)
Cycle count: 1915106 -> 1913954 (-0.06%); split: -0.07%, +0.01%
Non SSA regs after NIR: 18594 -> 18575 (-0.10%)

Totals from 7 (7.37% of 95) affected shaders:
Instrs: 5532 -> 5462 (-1.27%)
Send messages: 367 -> 360 (-1.91%)
Cycle count: 132592 -> 131440 (-0.87%); split: -1.01%, +0.14%
Non SSA regs after NIR: 1989 -> 1970 (-0.96%)

PERCENTAGE DELTAS Shaders   Instrs  Send messages Cycle count Non SSA regs after NIR
q2rtx-rt-pipeline 95        -0.13%      -0.18%       -0.06%           -0.10%
--------------------------------------------------------------------------------------
All affected      7         -1.27%      -1.91%       -0.87%           -0.96%
--------------------------------------------------------------------------------------
Total             95        -0.13%      -0.18%       -0.06%           -0.10%

Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39106>
2026-01-22 23:20:04 +00:00

140 lines
5.8 KiB
Text

/* Copyright © 2022 Bas Nieuwenhuizen
* Copyright © 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#version 460
/* Each workgroup is 128 invocations wide. main() multiplies
 * gl_NumWorkGroups.x by the same literal 128 to obtain the total number of
 * invocations, so the two values must be kept in sync.
 */
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
/* copy_args and the anv and vk header types used below are not declared in this
 * file; presumably they come from this header.
 */
#include "anv_build_interface.h"
/* Push constants for the copy. main() reads args.src_addr, args.dst_addr,
 * args.mode, args.driver_uuid and args.accel_struct_compat.
 */
layout(push_constant) uniform CONSTS {
copy_args args;
};
// Layout of serialized data
/**************************************|
| vk_accel_struct_serialization_header |
|--------------------------------------|
| For a TLAS, all handles to the BLAS |
| within this TLAS. |
| For a BLAS, nothing. |
|--------------------------------------|
| Driver-specific part. |
| For Intel, this starts with |
| anv_accel_struct_header as drawn |
| in anv_bvh.h |
|**************************************/
/*
* Explanation of BLAS handles:
* According to the spec of vkCmdCopyAccelerationStructureToMemoryKHR,
* for a TLAS, the handles of all BLAS/instances within this TLAS are
* tightly stored after vk_accel_struct_serialization_header, making this
* serialized-memory a semi-opaque object. The application might be able
* to swap/replace these handles with other handles. In fact this is what
* dEQP-VK.ray_tracing_pipeline.acceleration_structures.header_bottom_address.*
* is doing.
*
* Therefore, if the application updates the handles, we need to replace
* the old handles in anv_instance_leaf with the new ones. To access
* anv_instance_leaf without traversing the TLAS, pointers to these
* anv_instance_leaf are stored in the acceleration structure at the byte
* offset (from the start of anv_accel_struct_header) recorded in the
* header's instance_leaves_offset field, allowing us to know where they
* are in the TLAS instantly.
*
* Although, the fact that the application can swap/replace new handles
* of BLAS without rebuilding the TLAS sounds a bit odd.
*/
/* Copy, serialize or deserialize an acceleration structure, selected by
 * args.mode.
 *
 * Every invocation copies 8-byte words from the source to the destination,
 * striding by the total number of invocations, until header.compacted_size
 * bytes have been copied. When serializing, invocation 0 additionally writes
 * the vk_accel_struct_serialization_header at args.dst_addr. When serializing
 * or deserializing, the invocation that copies an entry of the instance-leaf
 * pointer array also moves the corresponding BLAS handle between the
 * referenced anv_instance_leaf and the handle array that follows the
 * serialization header.
 */
void
main(void)
{
/* Index of this invocation along x. */
uint32_t global_id = gl_GlobalInvocationID.x;
/* Total number of invocations; 128 has to match local_size_x. */
uint32_t lanes = gl_NumWorkGroups.x * 128;
/* Byte stride between two consecutive words handled by one invocation. */
uint32_t increment = lanes * 8;
uint64_t copy_src_addr = args.src_addr;
uint64_t copy_dst_addr = args.dst_addr;
if (args.mode == ANV_COPY_MODE_DESERIALIZE) {
/* The source is serialized data: skip the serialization header and the
 * instance_count handles that follow it to reach the driver-specific part.
 */
copy_src_addr += SIZEOF(vk_accel_struct_serialization_header) +
DEREF(REF(vk_accel_struct_serialization_header)(args.src_addr)).instance_count * SIZEOF(uint64_t);
}
/* anv header located at the start of the data that gets copied. */
REF(anv_accel_struct_header) header = REF(anv_accel_struct_header)(copy_src_addr);
/* Address of the handle array that follows the serialization header. This
 * initial value is the one used for deserialization (handles are read from the
 * source); the serialize branch below redirects it to the destination.
 */
uint64_t instance_base = args.src_addr + SIZEOF(vk_accel_struct_serialization_header);
/* Byte offset, relative to the anv header, of the first instance-leaf pointer. */
uint64_t instance_offset = DEREF(header).instance_leaves_offset;
/* Byte offset one past the last instance-leaf pointer (one uint64_t per
 * instance). It stays 0 when there are no instances, which disables the
 * handle patching in the copy loop.
 */
uint64_t instance_end = DEREF(header).instance_count * SIZEOF(uint64_t);
if (instance_end > 0)
instance_end += instance_offset;
if (args.mode == ANV_COPY_MODE_SERIALIZE) {
/* The destination is serialized data: the copied bytes go after the
 * serialization header and the handle array. In this mode copy_src_addr is
 * still args.src_addr, so the header read here is the same one as `header`.
 */
copy_dst_addr += SIZEOF(vk_accel_struct_serialization_header) +
DEREF(REF(anv_accel_struct_header)(args.src_addr)).instance_count * SIZEOF(uint64_t);
/* Only one invocation fills in the serialization header. */
if (global_id == 0) {
REF(vk_accel_struct_serialization_header) ser_header =
REF(vk_accel_struct_serialization_header)(args.dst_addr);
DEREF(ser_header).serialization_size = DEREF(header).serialization_size;
DEREF(ser_header).deserialization_size = DEREF(header).compacted_size;
DEREF(ser_header).instance_count = DEREF(header).instance_count;
for (uint32_t offset = 0; offset < VK_UUID_SIZE; offset++) {
DEREF(ser_header).driver_uuid[offset] = args.driver_uuid[offset];
}
for (uint32_t offset = 0; offset < VK_UUID_SIZE; offset++) {
DEREF(ser_header).accel_struct_compat[offset] = args.accel_struct_compat[offset];
}
}
/* Handles are written to the destination when serializing. */
instance_base = args.dst_addr + SIZEOF(vk_accel_struct_serialization_header);
} else if (args.mode == ANV_COPY_MODE_COPY) {
/* A plain copy never touches handles: an empty range makes the patching
 * condition in the loop below always false.
 */
instance_end = 0;
}
/* Number of bytes to copy, as recorded in the source header. */
uint64_t size = DEREF(header).compacted_size;
for (uint64_t offset = global_id * 8; offset < size; offset += increment) {
/* copy 8 bytes per iteration */
DEREF(REF(uint64_t)(copy_dst_addr + offset)) =
DEREF(REF(uint64_t)(copy_src_addr + offset));
/* Do the adjustment inline in the same invocation that copies the data so that we don't have
 * to synchronize.
 */
/* The word just copied is an entry of the instance-leaf pointer array when
 * it lies inside [instance_offset, instance_end) on an 8-byte step from
 * instance_offset.
 */
if (offset < instance_end && offset >= instance_offset &&
(offset - instance_offset) % SIZEOF(uint64_t) == 0) {
/* Index of this instance, also used as the slot in the handle array. */
uint64_t idx = (offset - instance_offset) / SIZEOF(uint64_t);
if (args.mode == ANV_COPY_MODE_SERIALIZE) {
/* Indirectly access the anv_instance_leaf, and store the blas_ptrs after ser_header */
uint64_t instance_leaf_addr = DEREF(REF(uint64_t)(copy_src_addr + offset));
REF(anv_instance_leaf) instance_leaf = REF(anv_instance_leaf)(instance_leaf_addr);
uint64_t blas_ptr = DEREF(instance_leaf).part1.bvh_ptr;
DEREF(INDEX(uint64_t, instance_base, idx)) = blas_ptr;
} else { /* ANV_COPY_MODE_DESERIALIZE */
/* ANV_COPY_MODE_COPY cannot reach this branch because instance_end was
 * forced to 0 above.
 */
/* Indirectly access the anv_instance_leaf, and replace the bvh_ptr with the ones after ser_header */
/* The leaf address is the value this invocation has just written to the
 * destination.
 * NOTE(review): the leaf is written through that address, and this
 * shader does not show which invocation copies the leaf's own bytes;
 * confirm that the copy cannot overwrite the patched fields.
 */
uint64_t instance_leaf_addr = DEREF(REF(uint64_t)(copy_dst_addr + offset));
REF(anv_instance_leaf) instance_leaf = REF(anv_instance_leaf)(instance_leaf_addr);
uint64_t blas_ptr = DEREF(INDEX(uint64_t, instance_base, idx));
DEREF(instance_leaf).part1.bvh_ptr = blas_ptr;
/* set the startNodePtr to blas_ptr + ANV_RT_BVH_HEADER_SIZE */
uint64_t new_startNodePtr = blas_ptr + ANV_RT_BVH_HEADER_SIZE;
#if GFX_VERx10 >= 300
/* The whole 64-bit word is replaced. */
DEREF(instance_leaf).part0.QW_startNodePtr = new_startNodePtr;
#else
/* Only the low 48 bits are replaced; the upper 16 bits of the existing
 * word are preserved.
 */
uint64_t mask = 0x0000fffffffffffful;
/* clear bits and set */
DEREF(instance_leaf).part0.QW_startNodePtr =
(DEREF(instance_leaf).part0.QW_startNodePtr & ~mask) | (new_startNodePtr & mask);
#endif
}
}
}
}