/*
 * Copyright © 2021 Bas Nieuwenhuizen
 * Copyright © 2023 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "vk_acceleration_structure.h"

#include <stdlib.h>

#include "vk_alloc.h"
#include "vk_common_entrypoints.h"
#include "vk_device.h"
#include "vk_command_buffer.h"
#include "vk_log.h"
#include "vk_meta.h"

#include "bvh/vk_build_interface.h"
#include "bvh/vk_bvh.h"

#include "radix_sort/common/vk/barrier.h"
#include "radix_sort/shaders/push.h"

#include "util/u_string.h"

/* Embedded shader binaries. Each array is filled with the 32-bit words of a
 * generated header and is handed to vk_get_bvh_build_pipeline_spv() as shader
 * code by the build steps in this file.
 */

/* Default leaf shader for build_leaves(); a driver can replace it through
 * as_build_ops->leaf_spirv_override.
 */
static const uint32_t leaf_spv[] = {
#include "bvh/leaf.spv.h"
};

/* Shader used by morton_generate(). */
static const uint32_t morton_spv[] = {
#include "bvh/morton.spv.h"
};

/* Shader used by lbvh_build_internal(). */
static const uint32_t lbvh_main_spv[] = {
#include "bvh/lbvh_main.spv.h"
};

/* Words from bvh/lbvh_generate_ir.spv.h (presumably the second LBVH pass that
 * emits IR nodes; confirm at its use site).
 */
static const uint32_t lbvh_generate_ir_spv[] = {
#include "bvh/lbvh_generate_ir.spv.h"
};

/* Words from bvh/ploc_internal.spv.h (presumably the PLOC internal-node build
 * shader; confirm at its use site).
 */
static const uint32_t ploc_spv[] = {
#include "bvh/ploc_internal.spv.h"
};

/* Common implementation of vkCreateAccelerationStructureKHR.
 *
 * Allocates a vk_acceleration_structure object and records the backing
 * buffer, offset and size from pCreateInfo. No device memory is allocated
 * here; the object only describes a range of an existing buffer.
 *
 * Returns:
 *   VK_SUCCESS on success, with the new handle in *pAccelerationStructure.
 *   VK_ERROR_OUT_OF_HOST_MEMORY if the object allocation fails.
 *   VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS if a non-zero
 *   pCreateInfo->deviceAddress was requested and it does not match the
 *   address the acceleration structure actually ends up at.
 *
 * On any failure no object is left allocated and *pAccelerationStructure is
 * not written.
 */
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_CreateAccelerationStructureKHR(VkDevice _device,
                                         const VkAccelerationStructureCreateInfoKHR *pCreateInfo,
                                         const VkAllocationCallbacks *pAllocator,
                                         VkAccelerationStructureKHR *pAccelerationStructure)
{
   VK_FROM_HANDLE(vk_device, device, _device);
   VK_FROM_HANDLE(vk_buffer, buffer, pCreateInfo->buffer);

   struct vk_acceleration_structure *accel_struct = vk_object_alloc(
      device, pAllocator, sizeof(struct vk_acceleration_structure),
      VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR);

   if (!accel_struct)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   accel_struct->buffer = buffer;
   accel_struct->offset = pCreateInfo->offset;
   accel_struct->size = pCreateInfo->size;

   /* The capture/replay address can only be validated once buffer and offset
    * are filled in, so the object already exists at this point and has to be
    * released before bailing out.
    */
   if (pCreateInfo->deviceAddress &&
       vk_acceleration_structure_get_va(accel_struct) != pCreateInfo->deviceAddress) {
      vk_object_free(device, pAllocator, accel_struct);
      return vk_error(device, VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS);
   }

   *pAccelerationStructure = vk_acceleration_structure_to_handle(accel_struct);
   return VK_SUCCESS;
}

/* Common implementation of vkDestroyAccelerationStructureKHR.
 *
 * Releases the host object behind the handle. A null handle is accepted and
 * ignored.
 */
VKAPI_ATTR void VKAPI_CALL
vk_common_DestroyAccelerationStructureKHR(VkDevice _device,
                                     VkAccelerationStructureKHR accelerationStructure,
                                     const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(vk_acceleration_structure, as, accelerationStructure);
   VK_FROM_HANDLE(vk_device, dev, _device);

   if (as)
      vk_object_free(dev, pAllocator, as);
}

/* Common implementation of vkGetAccelerationStructureDeviceAddressKHR.
 *
 * Looks up the object behind pInfo->accelerationStructure and returns the
 * address reported by vk_acceleration_structure_get_va(). The device handle
 * is not consulted.
 */
VKAPI_ATTR VkDeviceAddress VKAPI_CALL
vk_common_GetAccelerationStructureDeviceAddressKHR(
   VkDevice _device, const VkAccelerationStructureDeviceAddressInfoKHR *pInfo)
{
   VK_FROM_HANDLE(vk_acceleration_structure, as, pInfo->accelerationStructure);

   const VkDeviceAddress va = vk_acceleration_structure_get_va(as);
   return va;
}

/* Presumably the size in bytes of one (morton key, node id) sort entry;
 * TODO confirm against the shaders.
 */
#define KEY_ID_PAIR_SIZE 8
/* Upper bound on the number of key bits that morton_sort() sorts; it is
 * clamped against the radix sort's keyval width to derive the pass count.
 */
#define MORTON_BIT_SIZE  24

/* Initializes `state` for one acceleration structure build.
 *
 * Records build_info and leaf_count, selects the internal build type, and
 * computes the scratch buffer layout (all offsets are relative to the start
 * of the build's scratch allocation).
 *
 * Build type selection:
 *   - 4 leaves or fewer: LBVH.
 *   - top-level structures: PLOC.
 *   - bottom-level without PREFER_FAST_BUILD and without ALLOW_UPDATE: PLOC.
 *   - everything else: LBVH.
 *   - bottom-level in UPDATE mode, when the driver provides update_as[0]:
 *     UPDATE (overrides the choice above).
 *
 * Scratch layout, in order:
 *   header | sort buffer 0 | sort buffer 1 |
 *   radix-sort internal data (aliased with PLOC / LBVH data) |
 *   IR leaf nodes | IR internal box nodes
 * The driver's encode scratch starts right after the header and overlaps the
 * sorting regions; the IR region is pushed back if the encode scratch would
 * otherwise reach into it.
 *
 * state:      output; config, scratch offsets and sizes are written.
 * device:     supplies the as_build_ops callbacks.
 * leaf_count: number of leaf nodes used for sizing.
 * build_info: the application's build description.
 * args:       supplies the radix sort instance used for size queries.
 */
static void
vk_acceleration_structure_build_state_init(struct vk_acceleration_structure_build_state *state,
                                           struct vk_device *device, uint32_t leaf_count,
                                           const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
                                           const struct vk_acceleration_structure_build_args *args)
{
   state->build_info = build_info;
   state->leaf_node_count = leaf_count;

   if (leaf_count <= 4)
      state->config.internal_type = VK_INTERNAL_BUILD_TYPE_LBVH;
   else if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR)
      state->config.internal_type = VK_INTERNAL_BUILD_TYPE_PLOC;
   else if (!(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_BUILD_BIT_KHR) &&
            !(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR))
      state->config.internal_type = VK_INTERNAL_BUILD_TYPE_PLOC;
   else
      state->config.internal_type = VK_INTERNAL_BUILD_TYPE_LBVH;

   /* Updates are only routed to the update path for bottom-level structures
    * and only when the driver implements an update pass.
    */
   if (build_info->mode == VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR &&
       build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR &&
       device->as_build_ops->update_as[0])
      state->config.internal_type = VK_INTERNAL_BUILD_TYPE_UPDATE;

   if ((build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR) &&
       build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR &&
       device->as_build_ops->update_as[0])
      state->config.updateable = true;

   /* Optional driver hook; it is called after the defaults above have been
    * chosen and before any scratch sizes are computed.
    */
   if (device->as_build_ops->get_build_config)
      device->as_build_ops->get_build_config(vk_device_to_handle(device), state);

   /* Never less than one internal node, even for zero or one leaf. */
   uint32_t internal_count = MAX2(leaf_count, 2) - 1;

   radix_sort_vk_memory_requirements_t requirements = {
      0,
   };
   radix_sort_vk_get_memory_requirements(args->radix_sort, leaf_count,
                                         &requirements);

   /* Size of one IR leaf node, which depends on the geometry type. */
   uint32_t ir_leaf_size;
   switch (vk_get_as_geometry_type(build_info)) {
   case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
      ir_leaf_size = sizeof(struct vk_ir_triangle_node);
      break;
   case VK_GEOMETRY_TYPE_AABBS_KHR:
      ir_leaf_size = sizeof(struct vk_ir_aabb_node);
      break;
   case VK_GEOMETRY_TYPE_INSTANCES_KHR:
      ir_leaf_size = sizeof(struct vk_ir_instance_node);
      break;
   default:
      unreachable("Unknown VkGeometryTypeKHR");
   }

   /* Running byte offset into the scratch buffer. */
   uint32_t offset = 0;

   /* Only one of these is non-zero, depending on the build type. */
   uint32_t ploc_scratch_space = 0;
   uint32_t lbvh_node_space = 0;

   if (state->config.internal_type == VK_INTERNAL_BUILD_TYPE_PLOC)
      ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition);
   else
      lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count;

   uint32_t encode_scratch_size = 0;
   if (device->as_build_ops->get_encode_scratch_size)
      encode_scratch_size = device->as_build_ops->get_encode_scratch_size(vk_device_to_handle(device), state);

   state->scratch.header_offset = offset;
   offset += sizeof(struct vk_ir_header);

   /* The encode passes should not need node sorting state. Reuse the space reserved for node sorting. */
   uint32_t encode_scratch_end = offset + encode_scratch_size;

   /* Two key/value buffers of equal size for the radix sort. */
   state->scratch.sort_buffer_offset[0] = offset;
   offset += requirements.keyvals_size;

   state->scratch.sort_buffer_offset[1] = offset;
   offset += requirements.keyvals_size;

   state->scratch.sort_internal_offset = offset;
   /* Internal sorting data is not needed when PLOC/LBVH are invoked,
    * save space by aliasing them */
   state->scratch.ploc_prefix_sum_partition_offset = offset;
   state->scratch.lbvh_node_offset = offset;
   offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space);

   /* Make sure encode scratch space does not overlap the BVH. */
   offset = MAX2(offset, encode_scratch_end);

   state->scratch.ir_offset = offset;
   offset += ir_leaf_size * leaf_count;

   state->scratch.internal_node_offset = offset;
   offset += sizeof(struct vk_ir_box_node) * internal_count;

   state->scratch.size = offset;

   /* With a driver update pass for bottom-level structures, the driver
    * reports the update scratch size; otherwise an update needs as much
    * scratch as a full build.
    */
   if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR &&
       device->as_build_ops->update_as[0]) {
      state->scratch.update_size = device->as_build_ops->get_update_scratch_size(vk_device_to_handle(device), state);
   } else {
      state->scratch.update_size = offset;
   }
}

/* Per-build bookkeeping used while recording a batch of builds. */
struct bvh_state {
   /* Common build state (config, scratch layout, leaf count). */
   struct vk_acceleration_structure_build_state vk;

   /* Offset in scratch of the sort buffer that later steps read ids from;
    * chosen by morton_sort() from the parity of the number of sort passes.
    */
   uint32_t scratch_offset;

   /* NOTE(review): presumably the number of internal nodes produced for this
    * build; its writers are outside the code reviewed here.
    */
   uint32_t internal_node_count;

   /* Radix sort state */
   /* Number of scatter workgroups, and the leaf count rounded up to a whole
    * number of scatter blocks.
    */
   uint32_t scatter_blocks;
   uint32_t count_ru_scatter;
   /* Number of histogram workgroups, and count_ru_scatter rounded up to a
    * whole number of histogram blocks.
    */
   uint32_t histo_blocks;
   uint32_t count_ru_histo;
   /* Push constants for the scatter passes; updated between passes. */
   struct rs_push_scatter push_scatter;

   /* NOTE(review): presumably tracks the most recent encode pass issued for
    * this build; confirm at its use site.
    */
   uint32_t last_encode_pass;
};

/* Summary flags for a batch of builds.
 *
 * NOTE(review): judging by the names, each flag records whether at least one
 * build in the batch has the given property (updateable / not updateable /
 * PLOC / LBVH / update), presumably so that steps with no work can be
 * skipped. The code that sets and reads them is outside the code reviewed
 * here; confirm there.
 */
struct bvh_batch_state {
   bool any_updateable;
   bool any_non_updateable;
   bool any_ploc;
   bool any_lbvh;
   bool any_update;
};

/* Cache key for pipeline layouts created by
 * vk_get_bvh_build_pipeline_layout(). The struct is handed to the meta cache
 * as raw bytes (pointer + sizeof), so its layout is part of the key.
 */
struct vk_bvh_build_pipeline_layout_key {
   /* Always VK_META_OBJECT_KEY_BVH_PIPELINE_LAYOUT. */
   enum vk_meta_object_key_type type;
   /* Push constant range size in bytes. */
   uint32_t size;
};

/* Cache key for pipelines created by vk_get_bvh_build_pipeline_spv(). The
 * struct is handed to the meta cache as raw bytes (pointer + sizeof), so its
 * layout is part of the key.
 */
struct vk_bvh_build_pipeline_key {
   /* Identifies which build shader the pipeline is for. */
   enum vk_meta_object_key_type type;
   /* Build flags the shader was specialized with. */
   uint32_t flags;
};

/* Fetches a compute pipeline layout with a single push constant range of
 * push_constant_size bytes and no descriptor set layout, through the meta
 * layer (keyed on the push constant size).
 *
 * Returns the result of vk_meta_get_pipeline_layout(); on success the layout
 * is stored in *layout.
 */
VkResult
vk_get_bvh_build_pipeline_layout(struct vk_device *device, struct vk_meta_device *meta,
                                 unsigned push_constant_size, VkPipelineLayout *layout)
{
   VkPushConstantRange range = {
      .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
      .offset = 0,
      .size = push_constant_size,
   };

   struct vk_bvh_build_pipeline_layout_key cache_key = {
      .type = VK_META_OBJECT_KEY_BVH_PIPELINE_LAYOUT,
      .size = push_constant_size,
   };

   return vk_meta_get_pipeline_layout(device, meta, NULL, &range,
                                      &cache_key, sizeof(cache_key), layout);
}

/* Returns a compute pipeline for one of the BVH build shaders, creating it on
 * first use.
 *
 * device, meta:        device and meta cache the pipeline lives in.
 * type:                meta object key type identifying the shader.
 * spv, spv_size:       SPIR-V code and its size in bytes.
 * push_constant_size:  size of the shader's push constant block; selects the
 *                      pipeline layout.
 * args:                supplies subgroup_size, bvh_bounds_offset and
 *                      root_flags_offset, which are passed to the shader as
 *                      specialization constants.
 * flags:               build flags, passed as a specialization constant and
 *                      used in the cache key.
 * pipeline:            receives the pipeline on success.
 *
 * Pipelines are cached on (type, flags) only; the values taken from args are
 * not part of the key.
 *
 * Returns VK_SUCCESS or the error from layout/pipeline creation.
 */
VkResult
vk_get_bvh_build_pipeline_spv(struct vk_device *device, struct vk_meta_device *meta,
                              enum vk_meta_object_key_type type, const uint32_t *spv,
                              uint32_t spv_size, unsigned push_constant_size,
                              const struct vk_acceleration_structure_build_args *args,
                              uint32_t flags, VkPipeline *pipeline)
{
   VkPipelineLayout layout;
   VkResult result = vk_get_bvh_build_pipeline_layout(device, meta, push_constant_size, &layout);
   if (result != VK_SUCCESS)
      return result;

   struct vk_bvh_build_pipeline_key key = {
      .type = type,
      .flags = flags,
   };

   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(meta, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline = pipeline_from_cache;
      return VK_SUCCESS;
   }

   VkShaderModuleCreateInfo module_info = {
      .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
      .pNext = NULL,
      .flags = 0,
      .codeSize = spv_size,
      .pCode = spv,
   };

   /* The map entries describe where each constant lives inside
    * spec_constants[] below, so every offset is the sum of the sizes of the
    * entries before it and the order must match the array.
    */
   VkSpecializationMapEntry spec_map[4] = {
      {
         .constantID = SUBGROUP_SIZE_ID,
         .offset = 0,
         .size = sizeof(args->subgroup_size),
      },
      {
         .constantID = BVH_BOUNDS_OFFSET_ID,
         .offset = sizeof(args->subgroup_size),
         .size = sizeof(args->bvh_bounds_offset),
      },
      {
         .constantID = BUILD_FLAGS_ID,
         .offset = sizeof(args->subgroup_size) + sizeof(args->bvh_bounds_offset),
         .size = sizeof(flags),
      },
      {
         .constantID = ROOT_FLAGS_OFFSET_ID,
         .offset = sizeof(args->subgroup_size) +
                   sizeof(args->bvh_bounds_offset) +
                   sizeof(flags),
         .size = sizeof(args->root_flags_offset),
      }
   };

   uint32_t spec_constants[4] = {
      args->subgroup_size,
      args->bvh_bounds_offset,
      flags,
      args->root_flags_offset,
   };

   VkSpecializationInfo spec_info = {
      .mapEntryCount = ARRAY_SIZE(spec_map),
      .pMapEntries = spec_map,
      .dataSize = sizeof(spec_constants),
      .pData = spec_constants,
   };

   /* The shader module create info is chained behind the required subgroup
    * size struct instead of creating a separate VkShaderModule; the stage's
    * module handle is left unset.
    */
   VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT rssci = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT,
      .pNext = &module_info,
      .requiredSubgroupSize = args->subgroup_size,
   };

   VkPipelineShaderStageCreateInfo shader_stage = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .pNext = &rssci,
      .flags = VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT,
      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
      .pName = "main",
      .pSpecializationInfo = &spec_info,
   };

   VkComputePipelineCreateInfo pipeline_info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .stage = shader_stage,
      .flags = 0,
      .layout = layout,
   };

   return vk_meta_create_compute_pipeline(device, meta, &pipeline_info,
                                          &key, sizeof(key), pipeline);
}

/* Combines a geometry index with the per-geometry flags the shaders care
 * about: VK_GEOMETRY_OPAQUE is OR'ed into the id when the geometry has
 * VK_GEOMETRY_OPAQUE_BIT_KHR set; all other flags are ignored.
 */
static uint32_t
pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags)
{
   if (!(flags & VK_GEOMETRY_OPAQUE_BIT_KHR))
      return geometry_id;

   return geometry_id | VK_GEOMETRY_OPAQUE;
}

/* Translates one application geometry plus its build range into the
 * vk_bvh_geometry_data consumed by the leaf shader.
 *
 * type:             type of the acceleration structure being built; only
 *                   used to assert that the geometry type is allowed in it.
 * first_id:         id of the first primitive of this geometry.
 * geom_index:       index of the geometry, packed together with its opaque
 *                   flag into geometry_id.
 * geometry:         the application's geometry description.
 * build_range_info: offsets into the geometry's buffers.
 *
 * Fields that do not apply to the geometry type are left zero.
 */
struct vk_bvh_geometry_data
vk_fill_geometry_data(VkAccelerationStructureTypeKHR type, uint32_t first_id, uint32_t geom_index,
                      const VkAccelerationStructureGeometryKHR *geometry,
                      const VkAccelerationStructureBuildRangeInfoKHR *build_range_info)
{
   const uint32_t prim_offset = build_range_info->primitiveOffset;

   struct vk_bvh_geometry_data data = {
      .first_id = first_id,
      .geometry_id = pack_geometry_id_and_flags(geom_index, geometry->flags),
      .geometry_type = geometry->geometryType,
   };

   switch (geometry->geometryType) {
   case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
      assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR);

      const bool indexed = geometry->geometry.triangles.indexType != VK_INDEX_TYPE_NONE_KHR;

      data.data = geometry->geometry.triangles.vertexData.deviceAddress +
                  build_range_info->firstVertex * geometry->geometry.triangles.vertexStride;
      data.indices = geometry->geometry.triangles.indexData.deviceAddress;

      /* primitiveOffset applies to the index buffer for indexed geometry and
       * to the vertex buffer otherwise.
       */
      if (indexed)
         data.indices += prim_offset;
      else
         data.data += prim_offset;

      /* A null transform address means "no transform" and must stay null. */
      data.transform = geometry->geometry.triangles.transformData.deviceAddress;
      if (data.transform)
         data.transform += build_range_info->transformOffset;

      data.stride = geometry->geometry.triangles.vertexStride;
      data.vertex_format = geometry->geometry.triangles.vertexFormat;
      data.index_format = geometry->geometry.triangles.indexType;
      break;
   }
   case VK_GEOMETRY_TYPE_AABBS_KHR:
      assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR);

      data.stride = geometry->geometry.aabbs.stride;
      data.data = geometry->geometry.aabbs.data.deviceAddress + prim_offset;
      break;
   case VK_GEOMETRY_TYPE_INSTANCES_KHR:
      assert(type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR);

      data.data = geometry->geometry.instances.data.deviceAddress + prim_offset;

      /* An array of pointers holds 8-byte addresses instead of inline
       * instance structs.
       */
      data.stride = geometry->geometry.instances.arrayOfPointers
                       ? 8
                       : sizeof(VkAccelerationStructureInstanceKHR);
      break;
   default:
      unreachable("Unknown geometryType");
   }

   return data;
}

/* Opens a debug marker region in the command buffer with a printf-style
 * formatted name, using CmdDebugMarkerBeginEXT from the device dispatch
 * table.
 *
 * commandBuffer: command buffer to record into.
 * step:          build step the marker belongs to; not used by this
 *                implementation.
 * format, ...:   printf-style format string and arguments for the name.
 *
 * If the name cannot be formatted, no marker is emitted.
 */
void
vk_accel_struct_cmd_begin_debug_marker(VkCommandBuffer commandBuffer,
                                       enum vk_acceleration_structure_build_step step,
                                       const char *format, ...)
{
   VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer);
   struct vk_device *device = cmd_buffer->base.device;

   va_list ap;
   va_start(ap, format);

   char *name;
   if (vasprintf(&name, format, ap) == -1) {
      va_end(ap);
      return;
   }

   va_end(ap);

   VkDebugMarkerMarkerInfoEXT marker = {
      .sType = VK_STRUCTURE_TYPE_DEBUG_MARKER_MARKER_INFO_EXT,
      .pMarkerName = name,
   };

   device->dispatch_table.CmdDebugMarkerBeginEXT(commandBuffer, &marker);

   /* vasprintf() allocated the string. The marker info is only required to
    * be valid for the duration of the call above, so release it here.
    */
   free(name);
}

/* Closes the debug marker region most recently opened in the command buffer,
 * using CmdDebugMarkerEndEXT from the device dispatch table.
 */
void
vk_accel_struct_cmd_end_debug_marker(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(vk_command_buffer, cmd, commandBuffer);

   cmd->base.device->dispatch_table.CmdDebugMarkerEndEXT(commandBuffer);
}

/* Records the leaf-building dispatches for every build in the batch whose
 * `updateable` config matches the `updateable` argument. Builds that take the
 * update path are skipped.
 *
 * One dispatch is issued per geometry with a non-zero primitive count, and
 * the build's leaf_node_count is advanced by that count so the next geometry
 * gets the following ids.
 *
 * Returns VK_SUCCESS, or the error from fetching the pipeline or layout (in
 * which case nothing has been recorded).
 */
static VkResult
build_leaves(VkCommandBuffer commandBuffer,
             struct vk_device *device, struct vk_meta_device *meta,
             const struct vk_acceleration_structure_build_args *args,
             uint32_t infoCount,
             const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
             const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos,
             struct bvh_state *bvh_states,
             bool updateable)
{
   /* Use the driver's leaf shader when it provides one. */
   const uint32_t *code = leaf_spv;
   size_t code_size = sizeof(leaf_spv);
   if (device->as_build_ops->leaf_spirv_override) {
      code = device->as_build_ops->leaf_spirv_override;
      code_size = device->as_build_ops->leaf_spirv_override_size;
   }

   /* Many apps are broken and will make inactive primitives active when
    * updating, even though this is disallowed by the spec.  To handle this,
    * updateable acceleration structures get a shader variant that passes
    * inactive leaf nodes through as if they were active, with an empty
    * bounding box. The driver or HW is then responsible for filtering out
    * inactive nodes.
    */
   uint32_t variant_flags = updateable ? VK_BUILD_FLAG_ALWAYS_ACTIVE : 0;
   if (args->propagate_cull_flags)
      variant_flags |= VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS;

   VkPipeline leaf_pipeline;
   VkResult result = vk_get_bvh_build_pipeline_spv(device, meta, VK_META_OBJECT_KEY_LEAF,
                                                   code, code_size, sizeof(struct leaf_args),
                                                   args, variant_flags,
                                                   &leaf_pipeline);
   if (result != VK_SUCCESS)
      return result;

   VkPipelineLayout leaf_layout;
   result = vk_get_bvh_build_pipeline_layout(device, meta, sizeof(struct leaf_args), &leaf_layout);
   if (result != VK_SUCCESS)
      return result;

   if (args->emit_markers) {
      device->as_build_ops->begin_debug_marker(commandBuffer,
                                               VK_ACCELERATION_STRUCTURE_BUILD_STEP_BUILD_LEAVES,
                                               "build_leaves");
   }

   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
   disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, leaf_pipeline);

   for (uint32_t i = 0; i < infoCount; ++i) {
      struct bvh_state *st = &bvh_states[i];
      const VkAccelerationStructureBuildGeometryInfoKHR *info = &pInfos[i];

      if (st->vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_UPDATE ||
          st->vk.config.updateable != updateable)
         continue;

      const VkDeviceAddress scratch = info->scratchData.deviceAddress;

      struct leaf_args leaf_consts = {
         .bvh = scratch + st->vk.scratch.ir_offset,
         .header = scratch + st->vk.scratch.header_offset,
         .ids = scratch + st->vk.scratch.sort_buffer_offset[0],
      };

      for (unsigned g = 0; g < info->geometryCount; ++g) {
         /* Geometries come either as an array or as an array of pointers. */
         const VkAccelerationStructureGeometryKHR *geom =
            info->pGeometries ? &info->pGeometries[g] : info->ppGeometries[g];
         const VkAccelerationStructureBuildRangeInfoKHR *range = &ppBuildRangeInfos[i][g];

         if (range->primitiveCount == 0)
            continue;

         leaf_consts.geom_data =
            vk_fill_geometry_data(info->type, st->vk.leaf_node_count, g, geom, range);

         disp->CmdPushConstants(commandBuffer, leaf_layout,
                                VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(leaf_consts), &leaf_consts);
         device->cmd_dispatch_unaligned(commandBuffer, range->primitiveCount, 1, 1);

         st->vk.leaf_node_count += range->primitiveCount;
      }
   }

   if (args->emit_markers)
      device->as_build_ops->end_debug_marker(commandBuffer);

   return VK_SUCCESS;
}

/* Records one morton-code dispatch per build in the batch, sized by the
 * build's leaf count. Builds that take the update path are skipped.
 *
 * Returns VK_SUCCESS, or the error from fetching the pipeline or layout (in
 * which case nothing has been recorded).
 */
static VkResult
morton_generate(VkCommandBuffer commandBuffer, struct vk_device *device,
                struct vk_meta_device *meta,
                const struct vk_acceleration_structure_build_args *args,
                uint32_t infoCount,
                const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
                struct bvh_state *bvh_states)
{
   VkPipeline morton_pipeline;
   VkResult result = vk_get_bvh_build_pipeline_spv(device, meta, VK_META_OBJECT_KEY_MORTON,
                                                   morton_spv, sizeof(morton_spv),
                                                   sizeof(struct morton_args), args, 0,
                                                   &morton_pipeline);
   if (result != VK_SUCCESS)
      return result;

   VkPipelineLayout morton_layout;
   result = vk_get_bvh_build_pipeline_layout(device, meta, sizeof(struct morton_args), &morton_layout);
   if (result != VK_SUCCESS)
      return result;

   if (args->emit_markers) {
      device->as_build_ops->begin_debug_marker(commandBuffer,
                                               VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_GENERATE,
                                               "morton_generate");
   }

   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
   disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, morton_pipeline);

   for (uint32_t i = 0; i < infoCount; ++i) {
      const struct bvh_state *st = &bvh_states[i];

      if (st->vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_UPDATE)
         continue;

      const VkDeviceAddress scratch = pInfos[i].scratchData.deviceAddress;

      const struct morton_args consts = {
         .bvh = scratch + st->vk.scratch.ir_offset,
         .header = scratch + st->vk.scratch.header_offset,
         .ids = scratch + st->vk.scratch.sort_buffer_offset[0],
      };

      disp->CmdPushConstants(commandBuffer, morton_layout,
                             VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);
      device->cmd_dispatch_unaligned(commandBuffer, st->vk.leaf_node_count, 1, 1);
   }

   if (args->emit_markers)
      device->as_build_ops->end_debug_marker(commandBuffer);

   return VK_SUCCESS;
}

/* Records a radix sort of the key/value pairs in sort buffer 0 for every
 * build in the batch.
 *
 * The work is recorded phase by phase across all builds (fills, histogram,
 * prefix, then each scatter pass), with one barrier between phases instead
 * of sorting the builds one after another. Builds with no leaves and builds
 * that take the update path are skipped in every phase.
 *
 * Side effects on bvh_states[i]: scratch_offset is set to the sort buffer
 * selected by the parity of the pass count, and the radix sort bookkeeping
 * fields (scatter_blocks, count_ru_scatter, histo_blocks, count_ru_histo,
 * push_scatter) are filled in.
 */
static void
morton_sort(VkCommandBuffer commandBuffer, struct vk_device *device,
            const struct vk_acceleration_structure_build_args *args,
            uint32_t infoCount,
            const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states)
{
   const struct vk_device_dispatch_table *disp = &device->dispatch_table;

   if (args->emit_markers) {
      device->as_build_ops->begin_debug_marker(commandBuffer,
                                               VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_SORT,
                                               "morton_sort");
   }

   /* Copyright 2019 The Fuchsia Authors. */
   const radix_sort_vk_t *rs = args->radix_sort;

   /*
    * OVERVIEW
    *
    *   1. Pad the keyvals in `scatter_even`.
    *   2. Zero the `histograms` and `partitions`.
    *      --- BARRIER ---
    *   3. HISTOGRAM is dispatched before PREFIX.
    *      --- BARRIER ---
    *   4. PREFIX is dispatched before the first SCATTER.
    *      --- BARRIER ---
    *   5. One or more SCATTER dispatches.
    *
    * Note that the `partitions` buffer can be zeroed anytime before the first
    * scatter.
    */

   /* How many passes? */
   /* Only MORTON_BIT_SIZE bits of the key are sorted (fewer if the keyval is
    * narrower); each pass handles RS_RADIX_LOG2 bits.
    */
   uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t);
   uint32_t keyval_bits = keyval_bytes * 8;
   uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits);
   uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2;

   /* Remember which sort buffer later steps should read: buffer
    * [passes & 1] for builds that are sorted, buffer 0 for empty builds.
    * NOTE(review): this presumably matches where the last scatter pass
    * writes, given that the passes alternate even/odd starting from even.
    */
   for (uint32_t i = 0; i < infoCount; ++i) {
      if (bvh_states[i].vk.leaf_node_count)
         bvh_states[i].scratch_offset = bvh_states[i].vk.scratch.sort_buffer_offset[passes & 1];
      else
         bvh_states[i].scratch_offset = bvh_states[i].vk.scratch.sort_buffer_offset[0];
   }

   /*
    * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS
    *
    * Pad fractional blocks with max-valued keyvals.
    *
    * Zero the histograms and partitions buffer.
    *
    * This assumes the partitions follow the histograms.
    */

   /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */

   /* How many scatter blocks? */
   uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2;
   uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows;

   /*
    * How many histogram blocks?
    *
    * Note that it's OK to have more max-valued digits counted by the histogram
    * than sorted by the scatters because the sort is stable.
    */
   uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2;
   uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows;

   /* Index of the first pass that is actually run; passes run from here up
    * to keyval_bytes, i.e. only the last `passes` pass slots are used.
    */
   uint32_t pass_idx = (keyval_bytes - passes);

   for (uint32_t i = 0; i < infoCount; ++i) {
      if (!bvh_states[i].vk.leaf_node_count)
         continue;
      if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_UPDATE)
         continue;

      uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.sort_buffer_offset[0];
      uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.sort_internal_offset;

      /* Round the leaf count up to whole scatter blocks, then to whole
       * histogram blocks.
       */
      bvh_states[i].scatter_blocks = (bvh_states[i].vk.leaf_node_count + scatter_block_kvs - 1) / scatter_block_kvs;
      bvh_states[i].count_ru_scatter = bvh_states[i].scatter_blocks * scatter_block_kvs;

      bvh_states[i].histo_blocks = (bvh_states[i].count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs;
      bvh_states[i].count_ru_histo = bvh_states[i].histo_blocks * histo_block_kvs;

      /* Fill with max values */
      if (bvh_states[i].count_ru_histo > bvh_states[i].vk.leaf_node_count) {
         device->cmd_fill_buffer_addr(commandBuffer, keyvals_even_addr +
                                      bvh_states[i].vk.leaf_node_count * keyval_bytes,
                                      (bvh_states[i].count_ru_histo - bvh_states[i].vk.leaf_node_count) * keyval_bytes,
                                      0xFFFFFFFF);
      }

      /*
       * Zero histograms and invalidate partitions.
       *
       * Note that the partition invalidation only needs to be performed once
       * because the even/odd scatter dispatches rely on the previous pass to
       * leave the partitions in an invalid state.
       *
       * Note that the last workgroup doesn't read/write a partition so it doesn't
       * need to be initialized.
       */
      uint32_t histo_partition_count = passes + bvh_states[i].scatter_blocks - 1;

      /* Skip the histograms of the passes that are not run. */
      uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t));

      device->cmd_fill_buffer_addr(commandBuffer,
                                   internal_addr + rs->internal.histograms.offset + fill_base,
                                   histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)) + keyval_bytes * sizeof(uint32_t), 0);
   }

   /*
    * Pipeline: HISTOGRAM
    *
    * TODO(allanmac): All subgroups should try to process approximately the same
    * number of blocks in order to minimize tail effects.  This was implemented
    * and reverted but should be reimplemented and benchmarked later.
    */
   vk_barrier_transfer_w_to_compute_r(commandBuffer);

   disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                         rs->pipelines.named.histogram);

   for (uint32_t i = 0; i < infoCount; ++i) {
      if (!bvh_states[i].vk.leaf_node_count)
         continue;
      if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_UPDATE)
         continue;

      uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.sort_buffer_offset[0];
      uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.sort_internal_offset;

      /* Dispatch histogram */
      struct rs_push_histogram push_histogram = {
         .devaddr_histograms = internal_addr + rs->internal.histograms.offset,
         .devaddr_keyvals = keyvals_even_addr,
         .passes = passes,
      };

      disp->CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0,
                             sizeof(push_histogram), &push_histogram);

      disp->CmdDispatch(commandBuffer, bvh_states[i].histo_blocks, 1, 1);
   }

   /*
    * Pipeline: PREFIX
    *
    * Launch one workgroup per pass.
    */
   vk_barrier_compute_w_to_compute_r(commandBuffer);

   disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                         rs->pipelines.named.prefix);

   for (uint32_t i = 0; i < infoCount; ++i) {
      if (!bvh_states[i].vk.leaf_node_count)
         continue;
      if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_UPDATE)
         continue;

      uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.sort_internal_offset;

      struct rs_push_prefix push_prefix = {
         .devaddr_histograms = internal_addr + rs->internal.histograms.offset,
      };

      disp->CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0,
                             sizeof(push_prefix), &push_prefix);

      disp->CmdDispatch(commandBuffer, passes, 1, 1);
   }

   /* Pipeline: SCATTER */
   vk_barrier_compute_w_to_compute_r(commandBuffer);

   uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t));

   /* Set up the per-build scatter push constants once; only pass_offset and
    * devaddr_histograms change between passes. This is done for every build,
    * including the ones that are skipped below.
    */
   for (uint32_t i = 0; i < infoCount; i++) {
      uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.sort_buffer_offset[0];
      uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.sort_buffer_offset[1];
      uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.sort_internal_offset;

      bvh_states[i].push_scatter = (struct rs_push_scatter){
         .devaddr_keyvals_even = keyvals_even_addr,
         .devaddr_keyvals_odd = keyvals_odd_addr,
         .devaddr_partitions = internal_addr + rs->internal.partitions.offset,
         .devaddr_histograms = internal_addr + rs->internal.histograms.offset + histogram_offset,
      };
   }

   /* The first scatter pass uses the "even" pipeline; passes alternate after
    * that.
    */
   bool is_even = true;

   while (true) {
      /* Scatter pipelines are selected per keyval dword (4 passes each). */
      uint32_t pass_dword = pass_idx / 4;

      /* Bind new pipeline */
      VkPipeline p =
         is_even ? rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd;
      disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p);

      /* Update push constants that changed */
      VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even
                                    : rs->pipeline_layouts.named.scatter[pass_dword].odd;

      for (uint32_t i = 0; i < infoCount; i++) {
         if (!bvh_states[i].vk.leaf_node_count)
            continue;
         if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_UPDATE)
            continue;

         bvh_states[i].push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2;

         disp->CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct rs_push_scatter),
                                &bvh_states[i].push_scatter);

         disp->CmdDispatch(commandBuffer, bvh_states[i].scatter_blocks, 1, 1);

         /* Advance to the next pass's histogram. */
         bvh_states[i].push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t));
      }

      /* Continue? */
      if (++pass_idx >= keyval_bytes)
         break;

      vk_barrier_compute_w_to_compute_r(commandBuffer);

      is_even ^= true;
   }

   if (args->emit_markers)
      device->as_build_ops->end_debug_marker(commandBuffer);
}

/*
 * Records the two LBVH compute passes for every build of the batch whose
 * internal build type is VK_INTERNAL_BUILD_TYPE_LBVH: the lbvh_main pipeline
 * first and, after a compute write -> compute read barrier, the
 * lbvh_generate_ir pipeline.
 *
 * The number of invocations dispatched per build is stored in
 * bvh_states[i].internal_node_count and reused by the second pass.
 *
 * Returns the first error reported while fetching a pipeline or pipeline
 * layout, VK_SUCCESS otherwise.
 */
static VkResult
lbvh_build_internal(VkCommandBuffer commandBuffer,
                    struct vk_device *device, struct vk_meta_device *meta,
                    const struct vk_acceleration_structure_build_args *args,
                    uint32_t infoCount,
                    const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states)
{
   const struct vk_device_dispatch_table *dispatch = &device->dispatch_table;
   const uint32_t build_flags = args->propagate_cull_flags ? VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS : 0;

   VkPipeline main_pipeline;
   VkPipelineLayout main_layout;
   VkResult status;

   status = vk_get_bvh_build_pipeline_spv(device, meta, VK_META_OBJECT_KEY_LBVH_MAIN,
                                          lbvh_main_spv, sizeof(lbvh_main_spv),
                                          sizeof(struct lbvh_main_args), args, build_flags,
                                          &main_pipeline);
   if (status != VK_SUCCESS)
      return status;

   status = vk_get_bvh_build_pipeline_layout(device, meta, sizeof(struct lbvh_main_args), &main_layout);
   if (status != VK_SUCCESS)
      return status;

   if (args->emit_markers) {
      device->as_build_ops->begin_debug_marker(commandBuffer,
                                               VK_ACCELERATION_STRUCTURE_BUILD_STEP_LBVH_BUILD_INTERNAL,
                                               "lbvh_build_internal");
   }

   /* First pass: lbvh_main, one invocation per internal node. */
   dispatch->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, main_pipeline);

   for (uint32_t idx = 0; idx < infoCount; idx++) {
      struct bvh_state *state = &bvh_states[idx];

      if (state->vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_LBVH) {
         const uint64_t scratch_va = pInfos[idx].scratchData.deviceAddress;
         const uint32_t ids_offset = state->scratch_offset;
         /* At least one internal node is dispatched, even for 0 or 1 leaves. */
         const uint32_t node_count = MAX2(state->vk.leaf_node_count, 2) - 1;

         const struct lbvh_main_args main_push = {
            .bvh = scratch_va + state->vk.scratch.ir_offset,
            .src_ids = scratch_va + ids_offset,
            .node_info = scratch_va + state->vk.scratch.lbvh_node_offset,
            .id_count = state->vk.leaf_node_count,
            .internal_node_base = state->vk.scratch.internal_node_offset - state->vk.scratch.ir_offset,
         };

         dispatch->CmdPushConstants(commandBuffer, main_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0,
                                    sizeof(main_push), &main_push);
         device->cmd_dispatch_unaligned(commandBuffer, node_count, 1, 1);

         /* Remembered for the generate_ir dispatch below. */
         state->internal_node_count = node_count;
      }
   }

   vk_barrier_compute_w_to_compute_r(commandBuffer);

   /* Second pass: lbvh_generate_ir over the same internal node count. */
   VkPipeline ir_pipeline;
   VkPipelineLayout ir_layout;

   status = vk_get_bvh_build_pipeline_spv(device, meta, VK_META_OBJECT_KEY_LBVH_GENERATE_IR,
                                          lbvh_generate_ir_spv, sizeof(lbvh_generate_ir_spv),
                                          sizeof(struct lbvh_generate_ir_args), args, build_flags,
                                          &ir_pipeline);
   if (status != VK_SUCCESS)
      return status;

   status = vk_get_bvh_build_pipeline_layout(device, meta, sizeof(struct lbvh_generate_ir_args), &ir_layout);
   if (status != VK_SUCCESS)
      return status;

   dispatch->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, ir_pipeline);

   for (uint32_t idx = 0; idx < infoCount; idx++) {
      const struct bvh_state *state = &bvh_states[idx];

      if (state->vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_LBVH) {
         const uint64_t scratch_va = pInfos[idx].scratchData.deviceAddress;

         const struct lbvh_generate_ir_args ir_push = {
            .bvh = scratch_va + state->vk.scratch.ir_offset,
            .node_info = scratch_va + state->vk.scratch.lbvh_node_offset,
            .header = scratch_va + state->vk.scratch.header_offset,
            .internal_node_base = state->vk.scratch.internal_node_offset - state->vk.scratch.ir_offset,
         };

         dispatch->CmdPushConstants(commandBuffer, ir_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0,
                                    sizeof(ir_push), &ir_push);
         device->cmd_dispatch_unaligned(commandBuffer, state->internal_node_count, 1, 1);
      }
   }

   if (args->emit_markers)
      device->as_build_ops->end_debug_marker(commandBuffer);

   return VK_SUCCESS;
}

/*
 * Records the PLOC compute dispatch for every build of the batch whose
 * internal build type is VK_INTERNAL_BUILD_TYPE_PLOC.
 *
 * The shader receives both sort buffers: ids_0 is the buffer currently
 * referenced by bvh_states[i].scratch_offset, ids_1 is the other one.
 *
 * Returns the first error reported while fetching the pipeline or pipeline
 * layout, VK_SUCCESS otherwise.
 */
static VkResult
ploc_build_internal(VkCommandBuffer commandBuffer,
                    struct vk_device *device, struct vk_meta_device *meta,
                    const struct vk_acceleration_structure_build_args *args,
                    uint32_t infoCount,
                    const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states)
{
   const struct vk_device_dispatch_table *dispatch = &device->dispatch_table;
   const uint32_t build_flags = args->propagate_cull_flags ? VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS : 0;

   VkPipeline ploc_pipeline;
   VkPipelineLayout ploc_layout;
   VkResult status;

   status = vk_get_bvh_build_pipeline_spv(device, meta, VK_META_OBJECT_KEY_PLOC, ploc_spv,
                                          sizeof(ploc_spv), sizeof(struct ploc_args),
                                          args, build_flags, &ploc_pipeline);
   if (status != VK_SUCCESS)
      return status;

   status = vk_get_bvh_build_pipeline_layout(device, meta, sizeof(struct ploc_args), &ploc_layout);
   if (status != VK_SUCCESS)
      return status;

   if (args->emit_markers) {
      device->as_build_ops->begin_debug_marker(commandBuffer,
                                               VK_ACCELERATION_STRUCTURE_BUILD_STEP_PLOC_BUILD_INTERNAL,
                                               "ploc_build_internal");
   }

   dispatch->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, ploc_pipeline);

   for (uint32_t idx = 0; idx < infoCount; idx++) {
      const struct bvh_state *state = &bvh_states[idx];

      if (state->vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_PLOC) {
         const uint64_t scratch_va = pInfos[idx].scratchData.deviceAddress;

         /* Pick whichever sort buffer is not the current source as the
          * second id buffer. */
         const uint32_t current_ids = state->scratch_offset;
         uint32_t other_ids;
         if (current_ids == state->vk.scratch.sort_buffer_offset[0])
            other_ids = state->vk.scratch.sort_buffer_offset[1];
         else
            other_ids = state->vk.scratch.sort_buffer_offset[0];

         const struct ploc_args push = {
            .bvh = scratch_va + state->vk.scratch.ir_offset,
            .header = scratch_va + state->vk.scratch.header_offset,
            .ids_0 = scratch_va + current_ids,
            .ids_1 = scratch_va + other_ids,
            .prefix_scan_partitions = scratch_va + state->vk.scratch.ploc_prefix_sum_partition_offset,
            .internal_node_offset = state->vk.scratch.internal_node_offset - state->vk.scratch.ir_offset,
         };

         dispatch->CmdPushConstants(commandBuffer, ploc_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0,
                                    sizeof(push), &push);

         /* Always launch at least one workgroup, even with no leaves. */
         dispatch->CmdDispatch(commandBuffer,
                               MAX2(DIV_ROUND_UP(state->vk.leaf_node_count, PLOC_WORKGROUP_SIZE), 1),
                               1, 1);
      }
   }

   if (args->emit_markers)
      device->as_build_ops->end_debug_marker(commandBuffer);

   return VK_SUCCESS;
}

void
vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer,
                                     struct vk_device *device,
                                     struct vk_meta_device *meta,
                                     uint32_t infoCount,
                                     const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
                                     const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos,
                                     const struct vk_acceleration_structure_build_args *args)
{
   VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer);
   const struct vk_acceleration_structure_build_ops *ops = device->as_build_ops;

   struct bvh_batch_state batch_state = {0};

   struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state));

   if (args->emit_markers) {
      uint32_t num_of_blas = 0;
      uint32_t num_of_tlas = 0;
      for (uint32_t i = 0; i < infoCount; ++i) {
         switch (pInfos[i].type) {
         case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR:
            num_of_tlas++;
            break;
         case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR:
            num_of_blas++;
            break;
         default:
            break;
         }
      }
      ops->begin_debug_marker(commandBuffer,
                              VK_ACCELERATION_STRUCTURE_BUILD_STEP_TOP,
                              "vkCmdBuildAccelerationStructuresKHR() TLAS(%u) BLAS(%u)",
                              num_of_tlas, num_of_blas);
   }

   for (uint32_t i = 0; i < infoCount; ++i) {
      uint32_t leaf_node_count = 0;
      for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) {
         leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount;
      }

      vk_acceleration_structure_build_state_init(&bvh_states[i].vk, cmd_buffer->base.device, leaf_node_count,
                                                 pInfos + i, args);

      bvh_states[i].vk.build_range_infos = ppBuildRangeInfos[i];
      /* The leaf node dispatch code uses leaf_node_count as a base index. */
      bvh_states[i].vk.leaf_node_count = 0;

      if (bvh_states[i].vk.config.updateable)
         batch_state.any_updateable = true;
      else
         batch_state.any_non_updateable = true;

      if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_PLOC) {
         batch_state.any_ploc = true;
      } else if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_LBVH) {
         batch_state.any_lbvh = true;
      } else if (bvh_states[i].vk.config.internal_type == VK_INTERNAL_BUILD_TYPE_UPDATE) {
         batch_state.any_update = true;
         /* For updates, the leaf node pass never runs, so set leaf_node_count here. */
         bvh_states[i].vk.leaf_node_count = leaf_node_count;
      } else {
         unreachable("Unknown internal_build_type");
      }

      if (bvh_states[i].vk.config.internal_type != VK_INTERNAL_BUILD_TYPE_UPDATE) {
         /* The internal node count is updated in lbvh_build_internal for LBVH
          * and from the PLOC shader for PLOC. */
         struct vk_ir_header header = {
            .min_bounds = {0x7fffffff, 0x7fffffff, 0x7fffffff},
            .max_bounds = {0x80000000, 0x80000000, 0x80000000},
            .dispatch_size_y = 1,
            .dispatch_size_z = 1,
            .sync_data =
               {
                  .current_phase_end_counter = TASK_INDEX_INVALID,
                  /* Will be updated by the first PLOC shader invocation */
                  .task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID},
               },
         };

         device->write_buffer_cp(commandBuffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].vk.scratch.header_offset,
                                 &header, sizeof(header));
      } else {
         ops->init_update_scratch(commandBuffer, &bvh_states[i].vk);
      }
   }

   /* Wait for the write_buffer_cp to land before using in compute shaders */
   device->flush_buffer_write_cp(commandBuffer);
   device->dispatch_table.CmdPipelineBarrier(commandBuffer,
                                             VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                                             VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                             0, /* dependencyFlags */
                                             1,
                                             &(VkMemoryBarrier) {
                                                .srcAccessMask = 0,
                                                .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
                                             }, 0, NULL, 0, NULL);

   if (batch_state.any_lbvh || batch_state.any_ploc) {
      VkResult result;

      if (batch_state.any_non_updateable) {
         result =
            build_leaves(commandBuffer, device, meta, args, infoCount, pInfos,
                         ppBuildRangeInfos, bvh_states, false);

         if (result != VK_SUCCESS) {
            free(bvh_states);
            vk_command_buffer_set_error(cmd_buffer, result);
            return;
         }
      }

      if (batch_state.any_updateable) {
         result =
            build_leaves(commandBuffer, device, meta, args, infoCount, pInfos,
                         ppBuildRangeInfos, bvh_states, true);

         if (result != VK_SUCCESS) {
            free(bvh_states);
            vk_command_buffer_set_error(cmd_buffer, result);
            return;
         }
      }

      vk_barrier_compute_w_to_compute_r(commandBuffer);

      result =
         morton_generate(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states);

      if (result != VK_SUCCESS) {
         free(bvh_states);
         vk_command_buffer_set_error(cmd_buffer, result);
         return;
      }

      vk_barrier_compute_w_to_compute_r(commandBuffer);

      morton_sort(commandBuffer, device, args, infoCount, pInfos, bvh_states);

      vk_barrier_compute_w_to_compute_r(commandBuffer);

      if (batch_state.any_lbvh) {
         result =
            lbvh_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states);

         if (result != VK_SUCCESS) {
            free(bvh_states);
            vk_command_buffer_set_error(cmd_buffer, result);
            return;
         }
      }

      if (batch_state.any_ploc) {
         result =
            ploc_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states);

         if (result != VK_SUCCESS) {
            vk_command_buffer_set_error(cmd_buffer, result);
            return;
         }
      }

      vk_barrier_compute_w_to_compute_r(commandBuffer);
      vk_barrier_compute_w_to_indirect_compute_r(commandBuffer);
   }

   /* Calculate number of leaves and internal nodes to encode */
   uint32_t num_leaves = 0;
   uint32_t num_internal_node = 0;
   for ( uint32_t i = 0; i < infoCount; i++) {
      num_leaves += bvh_states[i].vk.leaf_node_count;
      num_internal_node += bvh_states[i].internal_node_count;
   }

   if (args->emit_markers)
      device->as_build_ops->begin_debug_marker(commandBuffer,
                                               VK_ACCELERATION_STRUCTURE_BUILD_STEP_ENCODE,
                                               "encode_leaves=%u encode_ir_node=%u",
                                               num_leaves, num_internal_node);

   for (unsigned pass = 0; pass < ARRAY_SIZE(ops->encode_as); pass++) {
      if (!ops->encode_as[pass] && !ops->update_as[pass])
         break;

      bool progress;
      do {
         progress = false;

         bool update;
         uint32_t encode_key = 0;
         uint32_t update_key = 0;
         for (uint32_t i = 0; i < infoCount; ++i) {
            if (bvh_states[i].last_encode_pass == pass + 1)
               continue;

            if (!progress) {
               update = (bvh_states[i].vk.config.internal_type ==
                         VK_INTERNAL_BUILD_TYPE_UPDATE);
               if (update && !ops->update_as[pass])
                  continue;
               if (!update && !ops->encode_as[pass])
                  continue;
               encode_key = bvh_states[i].vk.config.encode_key[pass];
               update_key = bvh_states[i].vk.config.update_key[pass];
               progress = true;
               if (update)
                  ops->update_bind_pipeline[pass](commandBuffer, &bvh_states[i].vk);
               else
                  ops->encode_bind_pipeline[pass](commandBuffer, &bvh_states[i].vk);
            } else {
               if (update != (bvh_states[i].vk.config.internal_type ==
                              VK_INTERNAL_BUILD_TYPE_UPDATE) ||
                   encode_key != bvh_states[i].vk.config.encode_key[pass] ||
                   update_key != bvh_states[i].vk.config.update_key[pass])
                  continue;
            }

            if (update)
               ops->update_as[pass](commandBuffer, &bvh_states[i].vk);
            else
               ops->encode_as[pass](commandBuffer, &bvh_states[i].vk);

            bvh_states[i].last_encode_pass = pass + 1;
         }
      } while (progress);
   }

   if (args->emit_markers)
      device->as_build_ops->end_debug_marker(commandBuffer);

   if (args->emit_markers)
      device->as_build_ops->end_debug_marker(commandBuffer);

   free(bvh_states);
}

/*
 * Fills pSizeInfo with the acceleration structure size and the build/update
 * scratch sizes for the build described by pBuildInfo.
 *
 * The leaf count handed to the build state is the sum of
 * pMaxPrimitiveCounts[0 .. geometryCount - 1]. The acceleration structure
 * size comes from the driver's get_as_size hook; the scratch sizes are read
 * from the initialized build state.
 */
void
vk_get_as_build_sizes(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType,
                      const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,
                      const uint32_t *pMaxPrimitiveCounts,
                      VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo,
                      const struct vk_acceleration_structure_build_args *args)
{
   VK_FROM_HANDLE(vk_device, device, _device);

   struct vk_acceleration_structure_build_state build_state = { 0 };
   uint32_t max_leaves = 0;

   for (uint32_t geom = 0; geom != pBuildInfo->geometryCount; ++geom) {
      max_leaves += pMaxPrimitiveCounts[geom];
   }

   vk_acceleration_structure_build_state_init(&build_state, device, max_leaves, pBuildInfo, args);

   pSizeInfo->accelerationStructureSize = device->as_build_ops->get_as_size(_device, &build_state);
   pSizeInfo->updateScratchSize = build_state.scratch.update_size;
   pSizeInfo->buildScratchSize = build_state.scratch.size;
}

/* Tell whether the common framework can load vertices stored in the given
 * format. The list below must stay in sync with the formats handled by
 * load_vertices() on the GPU.
 */
bool
vk_acceleration_struct_vtx_format_supported(VkFormat format)
{
   static const VkFormat supported_formats[] = {
      VK_FORMAT_R32G32_SFLOAT,
      VK_FORMAT_R32G32B32_SFLOAT,
      VK_FORMAT_R32G32B32A32_SFLOAT,
      VK_FORMAT_R16G16_SFLOAT,
      VK_FORMAT_R16G16B16_SFLOAT,
      VK_FORMAT_R16G16B16A16_SFLOAT,
      VK_FORMAT_R16G16_SNORM,
      VK_FORMAT_R16G16_UNORM,
      VK_FORMAT_R16G16B16A16_SNORM,
      VK_FORMAT_R16G16B16A16_UNORM,
      VK_FORMAT_R8G8_SNORM,
      VK_FORMAT_R8G8_UNORM,
      VK_FORMAT_R8G8B8A8_SNORM,
      VK_FORMAT_R8G8B8A8_UNORM,
      VK_FORMAT_A2B10G10R10_UNORM_PACK32,
   };
   const unsigned format_count = sizeof(supported_formats) / sizeof(supported_formats[0]);

   for (unsigned idx = 0; idx < format_count; idx++) {
      if (supported_formats[idx] == format)
         return true;
   }

   return false;
}

/* Stubs of optional functions for drivers that don't implement them. */

/* Stub for vkCmdBuildAccelerationStructuresIndirectKHR: none of the
 * parameters are used, the body only flags the call as unreachable
 * ("Unimplemented").
 */
VKAPI_ATTR void VKAPI_CALL
vk_common_CmdBuildAccelerationStructuresIndirectKHR(VkCommandBuffer commandBuffer,
                                                    uint32_t infoCount,
                                                    const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
                                                    const VkDeviceAddress *pIndirectDeviceAddresses,
                                                    const uint32_t *pIndirectStrides,
                                                    const uint32_t *const *ppMaxPrimitiveCounts)
{
   unreachable("Unimplemented");
}

/* Stub for vkWriteAccelerationStructuresPropertiesKHR: flags the call as
 * unreachable ("Unimplemented"). The VK_ERROR_FEATURE_NOT_PRESENT return is
 * the result should execution continue past that marker.
 */
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_WriteAccelerationStructuresPropertiesKHR(VkDevice _device, uint32_t accelerationStructureCount,
                                                   const VkAccelerationStructureKHR *pAccelerationStructures,
                                                   VkQueryType queryType,
                                                   size_t dataSize,
                                                   void *pData,
                                                   size_t stride)
{
   VK_FROM_HANDLE(vk_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}

/* Stub for vkBuildAccelerationStructuresKHR (the entry point taking a
 * VkDevice and a deferred operation): flags the call as unreachable
 * ("Unimplemented"). The VK_ERROR_FEATURE_NOT_PRESENT return is the result
 * should execution continue past that marker.
 */
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_BuildAccelerationStructuresKHR(VkDevice _device,
                                         VkDeferredOperationKHR deferredOperation,
                                         uint32_t infoCount,
                                         const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
                                         const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos)
{
   VK_FROM_HANDLE(vk_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}

/* Stub for vkCopyAccelerationStructureKHR: flags the call as unreachable
 * ("Unimplemented"). The VK_ERROR_FEATURE_NOT_PRESENT return is the result
 * should execution continue past that marker.
 */
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_CopyAccelerationStructureKHR(VkDevice _device,
                                       VkDeferredOperationKHR deferredOperation,
                                       const VkCopyAccelerationStructureInfoKHR *pInfo)
{
   VK_FROM_HANDLE(vk_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}

/* Stub for vkCopyMemoryToAccelerationStructureKHR: flags the call as
 * unreachable ("Unimplemented"). The VK_ERROR_FEATURE_NOT_PRESENT return is
 * the result should execution continue past that marker.
 */
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_CopyMemoryToAccelerationStructureKHR(VkDevice _device,
                                               VkDeferredOperationKHR deferredOperation,
                                               const VkCopyMemoryToAccelerationStructureInfoKHR *pInfo)
{
   VK_FROM_HANDLE(vk_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}

/* Stub for vkCopyAccelerationStructureToMemoryKHR: flags the call as
 * unreachable ("Unimplemented"). The VK_ERROR_FEATURE_NOT_PRESENT return is
 * the result should execution continue past that marker.
 */
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_CopyAccelerationStructureToMemoryKHR(VkDevice _device,
                                               VkDeferredOperationKHR deferredOperation,
                                               const VkCopyAccelerationStructureToMemoryInfoKHR *pInfo)
{
   VK_FROM_HANDLE(vk_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}