anv/rt: Copy 16bytes at once instead of copying 8bytes
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

For a simple copy, we can copy the data as a uvec4 (16 bytes) at once.
In serialize/deserialize copy mode, we also want to copy out the
instance leaf addresses, which are 8 bytes wide, so we need to step with
an 8-byte stride instead of 16 bytes.

Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40966>
This commit is contained in:
Sagar Ghuge 2026-04-14 20:57:40 -07:00 committed by Marge Bot
parent 81feb2e7f1
commit 19d64d6f7d
2 changed files with 21 additions and 15 deletions

View file

@ -37,10 +37,10 @@ layout(push_constant) uniform CONSTS {
* dEQP-VK.ray_tracing_pipeline.acceleration_structures.header_bottom_address.*
* is doing.
*
* Therefore, if the application updates the handles, we need to replace
* the old handles in anv_instance_leaf with the new one. To access
* Therefore, if the application updates the handles, we need to replace the
* old handles in anv_instance_leaf with the new one. To access
* anv_instance_leaf without traversing the TLAS, pointers to these
* anv_instance_leaf are stored right after anv_accel_struct_header,
* anv_instance_leaf are stored at instance_leaves_offset in BVH layout,
* allowing us to know where they are in the TLAS instantly.
*
* Although, the fact that the application can swap/replace new handles
@ -52,7 +52,7 @@ main(void)
{
uint32_t global_id = gl_GlobalInvocationID.x;
uint32_t lanes = gl_NumWorkGroups.x * 128;
uint32_t increment = lanes * 8;
uint32_t increment = lanes * 16;
uint64_t copy_src_addr = args.src_addr;
uint64_t copy_dst_addr = args.dst_addr;
@ -99,27 +99,33 @@ main(void)
}
uint64_t size = DEREF(header).compacted_size;
for (uint64_t offset = global_id * 8; offset < size; offset += increment) {
/* copy 8 bytes per iteration */
DEREF(REF(uint64_t)(copy_dst_addr + offset)) =
DEREF(REF(uint64_t)(copy_src_addr + offset));
for (uint64_t offset = global_id * 16; offset < size; offset += increment) {
DEREF(REF(uvec4)(copy_dst_addr + offset)) =
DEREF(REF(uvec4)(copy_src_addr + offset));
/* Do the adjustment inline in the same invocation that copies the data so that we don't have
* to synchronize.
*/
if (offset < instance_end && offset >= instance_offset &&
(offset - instance_offset) % SIZEOF(uint64_t) == 0) {
uint64_t idx = (offset - instance_offset) / SIZEOF(uint64_t);
for (int i = 0; i < 2; i++) {
uint64_t sub_offset = offset + i * 8;
/* Check that the access to the instance leaf address is in bounds. */
if (sub_offset < instance_offset || sub_offset >= instance_end)
continue;
if ((sub_offset - instance_offset) % SIZEOF(uint64_t) != 0)
continue;
uint64_t idx = (sub_offset - instance_offset) / SIZEOF(uint64_t);
if (args.mode == ANV_COPY_MODE_SERIALIZE) {
/* Indirectly access the anv_instance_leaf, and store the blas_ptrs after ser_header */
uint64_t instance_leaf_addr = DEREF(REF(uint64_t)(copy_src_addr + offset));
uint64_t instance_leaf_addr = DEREF(REF(uint64_t)(copy_src_addr + sub_offset));
REF(anv_instance_leaf) instance_leaf = REF(anv_instance_leaf)(instance_leaf_addr);
uint64_t blas_ptr = DEREF(instance_leaf).part1.bvh_ptr;
DEREF(INDEX(uint64_t, instance_base, idx)) = blas_ptr;
} else { /* ANV_COPY_MODE_DESERIALIZE */
/* Indirectly access the anv_instance_leaf, and replace the bvh_ptr with the ones after ser_header */
uint64_t instance_leaf_addr = DEREF(REF(uint64_t)(copy_dst_addr + offset));
uint64_t instance_leaf_addr = DEREF(REF(uint64_t)(copy_dst_addr + sub_offset));
REF(anv_instance_leaf) instance_leaf = REF(anv_instance_leaf)(instance_leaf_addr);
uint64_t blas_ptr = DEREF(INDEX(uint64_t, instance_base, idx));
DEREF(instance_leaf).part1.bvh_ptr = blas_ptr;

View file

@ -34,11 +34,11 @@ main(void)
DEREF(args.dst).instance_count = args.instance_count;
DEREF(args.dst).instance_leaves_offset = args.instance_leaves_offset;
/* 128 is local_size_x in copy.comp shader, 8 is the amount of data
/* 128 is local_size_x in copy.comp shader, 16bytes is the amount of data
* copied by each iteration of that shader's loop
*/
DEREF(args.dst).copy_dispatch_size[0] =
uint32_t(DIV_ROUND_UP(compacted_size, 8 * 128));
uint32_t(DIV_ROUND_UP(compacted_size, 16 * 128));
DEREF(args.dst).copy_dispatch_size[1] = 1;
DEREF(args.dst).copy_dispatch_size[2] = 1;
#if GFX_VERx10 >= 300