/*
 * Copyright © 2021 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_acceleration_structure.h"
#include "radv_private.h"
#include "radv_shader.h"

#include "nir/nir.h"
#include "nir/nir_builder.h"

static VkRayTracingPipelineCreateInfoKHR
radv_create_merged_rt_create_info(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo)
{
   VkRayTracingPipelineCreateInfoKHR local_create_info = *pCreateInfo;
   uint32_t total_stages = pCreateInfo->stageCount;
   uint32_t total_groups = pCreateInfo->groupCount;

   if (pCreateInfo->pLibraryInfo) {
      for (unsigned i = 0; i < pCreateInfo->pLibraryInfo->libraryCount; ++i) {
         RADV_FROM_HANDLE(radv_pipeline, library, pCreateInfo->pLibraryInfo->pLibraries[i]);
         total_stages += library->library.stage_count;
         total_groups += library->library.group_count;
      }
   }

   VkPipelineShaderStageCreateInfo *stages = NULL;
   VkRayTracingShaderGroupCreateInfoKHR *groups = NULL;
   local_create_info.stageCount = total_stages;
   local_create_info.groupCount = total_groups;
   local_create_info.pStages = stages =
      malloc(sizeof(VkPipelineShaderStageCreateInfo) * total_stages);
   local_create_info.pGroups = groups =
      malloc(sizeof(VkRayTracingShaderGroupCreateInfoKHR) * total_groups);
   if (!local_create_info.pStages || !local_create_info.pGroups)
      return local_create_info;

   total_stages = pCreateInfo->stageCount;
   total_groups = pCreateInfo->groupCount;
   for (unsigned j = 0; j < pCreateInfo->stageCount; ++j)
      stages[j] = pCreateInfo->pStages[j];
   for (unsigned j = 0; j < pCreateInfo->groupCount; ++j)
      groups[j] = pCreateInfo->pGroups[j];

   if (pCreateInfo->pLibraryInfo) {
      for (unsigned i = 0; i < pCreateInfo->pLibraryInfo->libraryCount; ++i) {
         RADV_FROM_HANDLE(radv_pipeline, library, pCreateInfo->pLibraryInfo->pLibraries[i]);
         for (unsigned j = 0; j < library->library.stage_count; ++j)
            stages[total_stages + j] = library->library.stages[j];
         for (unsigned j = 0; j < library->library.group_count; ++j) {
            VkRayTracingShaderGroupCreateInfoKHR *dst = &groups[total_groups + j];
            *dst = library->library.groups[j];
            if (dst->generalShader != VK_SHADER_UNUSED_KHR)
               dst->generalShader += total_stages;
            if (dst->closestHitShader != VK_SHADER_UNUSED_KHR)
               dst->closestHitShader += total_stages;
            if (dst->anyHitShader != VK_SHADER_UNUSED_KHR)
               dst->anyHitShader += total_stages;
            if (dst->intersectionShader != VK_SHADER_UNUSED_KHR)
               dst->intersectionShader += total_stages;
         }
         total_stages += library->library.stage_count;
         total_groups += library->library.group_count;
      }
   }

   return local_create_info;
}
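/*
 * Illustration (example values, not part of the code above): if the parent pipeline provides
 * 3 stages and a library group references its own stage 1 as closestHitShader, the merged
 * create info rebases that reference to 3 + 1 = 4, i.e. the stage's position in the
 * concatenated pStages array. VK_SHADER_UNUSED_KHR entries are left untouched.
 */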
static VkResult
radv_rt_pipeline_library_create(VkDevice _device, VkPipelineCache _cache,
                                const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
                                const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   struct radv_pipeline *pipeline;

   pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
   pipeline->type = RADV_PIPELINE_LIBRARY;

   VkRayTracingPipelineCreateInfoKHR local_create_info =
      radv_create_merged_rt_create_info(pCreateInfo);
   if (!local_create_info.pStages || !local_create_info.pGroups)
      goto fail;

   if (local_create_info.stageCount) {
      size_t size = sizeof(VkPipelineShaderStageCreateInfo) * local_create_info.stageCount;
      pipeline->library.stage_count = local_create_info.stageCount;
      pipeline->library.stages = malloc(size);
      if (!pipeline->library.stages)
         goto fail;
      memcpy(pipeline->library.stages, local_create_info.pStages, size);
   }

   if (local_create_info.groupCount) {
      size_t size = sizeof(VkRayTracingShaderGroupCreateInfoKHR) * local_create_info.groupCount;
      pipeline->library.group_count = local_create_info.groupCount;
      pipeline->library.groups = malloc(size);
      if (!pipeline->library.groups)
         goto fail;
      memcpy(pipeline->library.groups, local_create_info.pGroups, size);
   }

   *pPipeline = radv_pipeline_to_handle(pipeline);

   free((void *)local_create_info.pGroups);
   free((void *)local_create_info.pStages);
   return VK_SUCCESS;
fail:
   free(pipeline->library.groups);
   free(pipeline->library.stages);
   free((void *)local_create_info.pGroups);
   free((void *)local_create_info.pStages);
   /* The partially initialized pipeline object is not returned on failure, so release it too. */
   vk_object_base_finish(&pipeline->base);
   vk_free2(&device->vk.alloc, pAllocator, pipeline);
   return VK_ERROR_OUT_OF_HOST_MEMORY;
}

/*
 * Global variables for an RT pipeline
 */
struct rt_variables {
   /* idx of the next shader to run in the next iteration of the main loop */
   nir_variable *idx;

   /* scratch offset of the argument area relative to stack_ptr */
   nir_variable *arg;

   nir_variable *stack_ptr;

   /* global address of the SBT entry used for the shader */
   nir_variable *shader_record_ptr;

   /* trace_ray arguments */
   nir_variable *accel_struct;
   nir_variable *flags;
   nir_variable *cull_mask;
   nir_variable *sbt_offset;
   nir_variable *sbt_stride;
   nir_variable *miss_index;
   nir_variable *origin;
   nir_variable *tmin;
   nir_variable *direction;
   nir_variable *tmax;

   /* from the TLAS instance currently being visited */
   nir_variable *custom_instance_and_mask;

   /* Properties of the primitive currently being visited. */
   nir_variable *primitive_id;
   nir_variable *geometry_id_and_flags;
   nir_variable *instance_id;
   nir_variable *instance_addr;
   nir_variable *hit_kind;
   nir_variable *opaque;

   /* Safeguard to ensure we don't end up in an infinite loop on a non-existing case. Should not
    * be needed but is extra anti-hang safety during bring-up. */
   nir_variable *main_loop_case_visited;

   /* Output variable for intersection & anyhit shaders. */
   nir_variable *ahit_status;

   /* Array of stack size structs recording the max stack size for each group. */
   struct radv_pipeline_shader_stack_size *stack_sizes;
   unsigned group_idx;
};
static struct rt_variables
create_rt_variables(nir_shader *shader, struct radv_pipeline_shader_stack_size *stack_sizes)
{
   struct rt_variables vars = {
      NULL,
   };
   vars.idx = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "idx");
   vars.arg = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "arg");
   vars.stack_ptr = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "stack_ptr");
   vars.shader_record_ptr =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "shader_record_ptr");

   const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
   vars.accel_struct =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "accel_struct");
   vars.flags = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "ray_flags");
   vars.cull_mask = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "cull_mask");
   vars.sbt_offset =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "sbt_offset");
   vars.sbt_stride =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "sbt_stride");
   vars.miss_index =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "miss_index");
   vars.origin = nir_variable_create(shader, nir_var_shader_temp, vec3_type, "ray_origin");
   vars.tmin = nir_variable_create(shader, nir_var_shader_temp, glsl_float_type(), "ray_tmin");
   vars.direction = nir_variable_create(shader, nir_var_shader_temp, vec3_type, "ray_direction");
   vars.tmax = nir_variable_create(shader, nir_var_shader_temp, glsl_float_type(), "ray_tmax");

   vars.custom_instance_and_mask = nir_variable_create(
      shader, nir_var_shader_temp, glsl_uint_type(), "custom_instance_and_mask");
   vars.primitive_id =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "primitive_id");
   vars.geometry_id_and_flags =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "geometry_id_and_flags");
   vars.instance_id =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "instance_id");
   vars.instance_addr =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "instance_addr");
   vars.hit_kind = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "hit_kind");
   vars.opaque = nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "opaque");

   vars.main_loop_case_visited =
      nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "main_loop_case_visited");
   vars.ahit_status =
      nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "ahit_status");

   vars.stack_sizes = stack_sizes;
   return vars;
}
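/*
 * Note (descriptive, not from the original comments): the combined shader owns one set of these
 * shader_temp globals. Each stage that gets inlined (see insert_rt_case below) first creates its
 * own rt_variables and then remaps them onto the combined shader's set via map_rt_variables, so
 * loads/stores in the inlined code end up touching the shared state.
 */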
/*
 * Remap all the variables between the two rt_variables structs for inlining.
 */
static void
map_rt_variables(struct hash_table *var_remap, struct rt_variables *src,
                 const struct rt_variables *dst)
{
   _mesa_hash_table_insert(var_remap, src->idx, dst->idx);
   _mesa_hash_table_insert(var_remap, src->arg, dst->arg);
   _mesa_hash_table_insert(var_remap, src->stack_ptr, dst->stack_ptr);
   _mesa_hash_table_insert(var_remap, src->shader_record_ptr, dst->shader_record_ptr);

   _mesa_hash_table_insert(var_remap, src->accel_struct, dst->accel_struct);
   _mesa_hash_table_insert(var_remap, src->flags, dst->flags);
   _mesa_hash_table_insert(var_remap, src->cull_mask, dst->cull_mask);
   _mesa_hash_table_insert(var_remap, src->sbt_offset, dst->sbt_offset);
   _mesa_hash_table_insert(var_remap, src->sbt_stride, dst->sbt_stride);
   _mesa_hash_table_insert(var_remap, src->miss_index, dst->miss_index);
   _mesa_hash_table_insert(var_remap, src->origin, dst->origin);
   _mesa_hash_table_insert(var_remap, src->tmin, dst->tmin);
   _mesa_hash_table_insert(var_remap, src->direction, dst->direction);
   _mesa_hash_table_insert(var_remap, src->tmax, dst->tmax);

   _mesa_hash_table_insert(var_remap, src->custom_instance_and_mask, dst->custom_instance_and_mask);
   _mesa_hash_table_insert(var_remap, src->primitive_id, dst->primitive_id);
   _mesa_hash_table_insert(var_remap, src->geometry_id_and_flags, dst->geometry_id_and_flags);
   _mesa_hash_table_insert(var_remap, src->instance_id, dst->instance_id);
   _mesa_hash_table_insert(var_remap, src->instance_addr, dst->instance_addr);
   _mesa_hash_table_insert(var_remap, src->hit_kind, dst->hit_kind);
   _mesa_hash_table_insert(var_remap, src->opaque, dst->opaque);
   _mesa_hash_table_insert(var_remap, src->ahit_status, dst->ahit_status);

   src->stack_sizes = dst->stack_sizes;
   src->group_idx = dst->group_idx;
}

/*
 * Create a copy of the global rt variables where the primitive/instance related variables are
 * independent. This is needed as we need to keep the old values of the global variables around
 * in case e.g. an anyhit shader rejects the intersection. So there are inner variables that get
 * copied to the outer variables once we commit to a better hit.
 */
static struct rt_variables
create_inner_vars(nir_builder *b, const struct rt_variables *vars)
{
   struct rt_variables inner_vars = *vars;
   inner_vars.idx =
      nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_idx");
   inner_vars.shader_record_ptr = nir_variable_create(
      b->shader, nir_var_shader_temp, glsl_uint64_t_type(), "inner_shader_record_ptr");
   inner_vars.primitive_id =
      nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_primitive_id");
   inner_vars.geometry_id_and_flags = nir_variable_create(
      b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_geometry_id_and_flags");
   inner_vars.tmax =
      nir_variable_create(b->shader, nir_var_shader_temp, glsl_float_type(), "inner_tmax");
   inner_vars.instance_id =
      nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_instance_id");
   inner_vars.instance_addr = nir_variable_create(b->shader, nir_var_shader_temp,
                                                  glsl_uint64_t_type(), "inner_instance_addr");
   inner_vars.hit_kind =
      nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_hit_kind");
   inner_vars.custom_instance_and_mask = nir_variable_create(
      b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_custom_instance_and_mask");

   return inner_vars;
}

/* The hit attributes are stored on the stack. This is the offset compared to the current stack
 * pointer of where the hit attrib is stored. */
const uint32_t RADV_HIT_ATTRIB_OFFSET = -(16 + RADV_MAX_HIT_ATTRIB_SIZE);
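/*
 * Rough sketch of the scratch frame the call lowering below maintains, seen from the callee's
 * stack_ptr (offsets follow from the constants used in lower_rt_instructions; the layout comment
 * itself is an illustration, not from the original code):
 *
 *    stack_ptr - 16 - RADV_MAX_HIT_ATTRIB_SIZE ... : hit attributes (RADV_HIT_ATTRIB_OFFSET)
 *    stack_ptr - 16                                : return idx, popped by insert_rt_return
 *    stack_ptr + arg                               : caller's argument/payload area
 *
 * arg is negative: it points back into the caller's frame, so the callee can address the payload
 * through the ordinary scratch load/store rewrites.
 */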
static void
insert_rt_return(nir_builder *b, const struct rt_variables *vars)
{
   nir_store_var(b, vars->stack_ptr,
                 nir_iadd(b, nir_load_var(b, vars->stack_ptr), nir_imm_int(b, -16)), 1);
   nir_store_var(b, vars->idx,
                 nir_load_scratch(b, 1, 32, nir_load_var(b, vars->stack_ptr), .align_mul = 16), 1);
}

enum sbt_type {
   SBT_RAYGEN,
   SBT_MISS,
   SBT_HIT,
   SBT_CALLABLE,
};

static nir_ssa_def *
get_sbt_ptr(nir_builder *b, nir_ssa_def *idx, enum sbt_type binding)
{
   nir_ssa_def *desc = nir_load_sbt_amd(b, 4, .binding = binding);
   nir_ssa_def *base_addr = nir_pack_64_2x32(b, nir_channels(b, desc, 0x3));
   nir_ssa_def *stride = nir_channel(b, desc, 2);

   nir_ssa_def *ret = nir_imul(b, idx, stride);
   ret = nir_iadd(b, base_addr, nir_u2u64(b, ret));

   return ret;
}

static void
load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_ssa_def *idx,
               enum sbt_type binding, unsigned offset)
{
   nir_ssa_def *addr = get_sbt_ptr(b, idx, binding);

   nir_ssa_def *load_addr = addr;
   if (offset)
      load_addr = nir_iadd(b, load_addr, nir_imm_int64(b, offset));
   nir_ssa_def *v_idx =
      nir_build_load_global(b, 1, 32, load_addr, .align_mul = 4, .align_offset = 0);

   nir_store_var(b, vars->idx, v_idx, 1);

   nir_ssa_def *record_addr = nir_iadd(b, addr, nir_imm_int64(b, RADV_RT_HANDLE_SIZE));
   nir_store_var(b, vars->shader_record_ptr, record_addr, 1);
}
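/*
 * For illustration (example numbers, not part of the code): with an SBT base of 0x1000 and a
 * stride of 64, load_sbt_entry(idx = 2) reads the 32-bit shader index from 0x1000 + 2 * 64
 * (plus the optional extra offset) and points shader_record_ptr at
 * 0x1000 + 2 * 64 + RADV_RT_HANDLE_SIZE, i.e. the application-visible shader record data that
 * follows the handle.
 */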
static nir_ssa_def *
nir_build_vec3_mat_mult(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *matrix[], bool translation)
{
   nir_ssa_def *result_components[3] = {
      nir_channel(b, matrix[0], 3),
      nir_channel(b, matrix[1], 3),
      nir_channel(b, matrix[2], 3),
   };
   for (unsigned i = 0; i < 3; ++i) {
      for (unsigned j = 0; j < 3; ++j) {
         nir_ssa_def *v =
            nir_fmul(b, nir_channels(b, vec, 1 << j), nir_channels(b, matrix[i], 1 << j));
         result_components[i] = (translation || j) ? nir_fadd(b, result_components[i], v) : v;
      }
   }
   return nir_vec(b, result_components, 3);
}

static nir_ssa_def *
nir_build_vec3_mat_mult_pre(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *matrix[])
{
   nir_ssa_def *result_components[3] = {
      nir_channel(b, matrix[0], 3),
      nir_channel(b, matrix[1], 3),
      nir_channel(b, matrix[2], 3),
   };
   return nir_build_vec3_mat_mult(b, nir_fsub(b, vec, nir_vec(b, result_components, 3)), matrix,
                                  false);
}

static void
nir_build_wto_matrix_load(nir_builder *b, nir_ssa_def *instance_addr, nir_ssa_def **out)
{
   unsigned offset = offsetof(struct radv_bvh_instance_node, wto_matrix);
   for (unsigned i = 0; i < 3; ++i) {
      out[i] = nir_build_load_global(b, 4, 32,
                                     nir_iadd(b, instance_addr, nir_imm_int64(b, offset + i * 16)),
                                     .align_mul = 64, .align_offset = offset + i * 16);
   }
}

/* This lowers all the RT instructions that we do not want to pass on to the combined shader and
 * that we can implement using the variables from the shader we are going to inline into. */
static void
lower_rt_instructions(nir_shader *shader, struct rt_variables *vars, unsigned call_idx_base)
{
   nir_builder b_shader;
   nir_builder_init(&b_shader, nir_shader_get_entrypoint(shader));

   nir_foreach_block (block, nir_shader_get_entrypoint(shader)) {
      nir_foreach_instr_safe (instr, block) {
         switch (instr->type) {
         case nir_instr_type_intrinsic: {
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
            switch (intr->intrinsic) {
            case nir_intrinsic_rt_execute_callable: {
               uint32_t size = align(nir_intrinsic_stack_size(intr), 16) + RADV_MAX_HIT_ATTRIB_SIZE;
               uint32_t ret = call_idx_base + nir_intrinsic_call_idx(intr) + 1;
               b_shader.cursor = nir_instr_remove(instr);

               nir_store_var(&b_shader, vars->stack_ptr,
                             nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr),
                                      nir_imm_int(&b_shader, size)),
                             1);
               nir_store_scratch(&b_shader, nir_imm_int(&b_shader, ret),
                                 nir_load_var(&b_shader, vars->stack_ptr), .align_mul = 16,
                                 .write_mask = 1);

               nir_store_var(&b_shader, vars->stack_ptr,
                             nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr),
                                      nir_imm_int(&b_shader, 16)),
                             1);
               load_sbt_entry(&b_shader, vars, intr->src[0].ssa, SBT_CALLABLE, 0);

               nir_store_var(
                  &b_shader, vars->arg,
                  nir_isub(&b_shader, intr->src[1].ssa, nir_imm_int(&b_shader, size + 16)), 1);

               vars->stack_sizes[vars->group_idx].recursive_size =
                  MAX2(vars->stack_sizes[vars->group_idx].recursive_size, size + 16);
               break;
            }
            case nir_intrinsic_rt_trace_ray: {
               uint32_t size = align(nir_intrinsic_stack_size(intr), 16) + RADV_MAX_HIT_ATTRIB_SIZE;
               uint32_t ret = call_idx_base + nir_intrinsic_call_idx(intr) + 1;
               b_shader.cursor = nir_instr_remove(instr);

               nir_store_var(&b_shader, vars->stack_ptr,
                             nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr),
                                      nir_imm_int(&b_shader, size)),
                             1);
               nir_store_scratch(&b_shader, nir_imm_int(&b_shader, ret),
                                 nir_load_var(&b_shader, vars->stack_ptr), .align_mul = 16,
                                 .write_mask = 1);

               nir_store_var(&b_shader, vars->stack_ptr,
                             nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr),
                                      nir_imm_int(&b_shader, 16)),
                             1);

               nir_store_var(&b_shader, vars->idx, nir_imm_int(&b_shader, 1), 1);
               nir_store_var(
                  &b_shader, vars->arg,
                  nir_isub(&b_shader, intr->src[10].ssa, nir_imm_int(&b_shader, size + 16)), 1);

               vars->stack_sizes[vars->group_idx].recursive_size =
                  MAX2(vars->stack_sizes[vars->group_idx].recursive_size, size + 16);

               /* Per the SPIR-V extension spec we have to ignore some bits for some arguments. */
               nir_store_var(&b_shader, vars->accel_struct, intr->src[0].ssa, 0x1);
               nir_store_var(&b_shader, vars->flags, intr->src[1].ssa, 0x1);
               nir_store_var(&b_shader, vars->cull_mask,
                             nir_iand(&b_shader, intr->src[2].ssa, nir_imm_int(&b_shader, 0xff)),
                             0x1);
               nir_store_var(&b_shader, vars->sbt_offset,
                             nir_iand(&b_shader, intr->src[3].ssa, nir_imm_int(&b_shader, 0xf)),
                             0x1);
               nir_store_var(&b_shader, vars->sbt_stride,
                             nir_iand(&b_shader, intr->src[4].ssa, nir_imm_int(&b_shader, 0xf)),
                             0x1);
               nir_store_var(&b_shader, vars->miss_index,
                             nir_iand(&b_shader, intr->src[5].ssa, nir_imm_int(&b_shader, 0xffff)),
                             0x1);
               nir_store_var(&b_shader, vars->origin, intr->src[6].ssa, 0x7);
               nir_store_var(&b_shader, vars->tmin, intr->src[7].ssa, 0x1);
               nir_store_var(&b_shader, vars->direction, intr->src[8].ssa, 0x7);
               nir_store_var(&b_shader, vars->tmax, intr->src[9].ssa, 0x1);
               break;
            }
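            /* A resume point: rt_resume pops the portion of the caller's frame that the matching
             * rt_execute_callable/rt_trace_ray lowering pushed above. The 16-byte return-idx
             * slot has already been popped by insert_rt_return on the callee side. */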
            case nir_intrinsic_rt_resume: {
               uint32_t size = align(nir_intrinsic_stack_size(intr), 16) + RADV_MAX_HIT_ATTRIB_SIZE;

               b_shader.cursor = nir_instr_remove(instr);
               nir_store_var(&b_shader, vars->stack_ptr,
                             nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr),
                                      nir_imm_int(&b_shader, -size)),
                             1);
               break;
            }
            case nir_intrinsic_rt_return_amd: {
               b_shader.cursor = nir_instr_remove(instr);

               if (shader->info.stage == MESA_SHADER_RAYGEN) {
                  nir_store_var(&b_shader, vars->idx, nir_imm_int(&b_shader, 0), 1);
                  break;
               }
               insert_rt_return(&b_shader, vars);
               break;
            }
            case nir_intrinsic_load_scratch: {
               b_shader.cursor = nir_before_instr(instr);
               nir_instr_rewrite_src_ssa(
                  instr, &intr->src[0],
                  nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr), intr->src[0].ssa));
               break;
            }
            case nir_intrinsic_store_scratch: {
               b_shader.cursor = nir_before_instr(instr);
               nir_instr_rewrite_src_ssa(
                  instr, &intr->src[1],
                  nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr), intr->src[1].ssa));
               break;
            }
            case nir_intrinsic_load_rt_arg_scratch_offset_amd: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->arg);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_shader_record_ptr: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->shader_record_ptr);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_ray_launch_id: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_global_invocation_id(&b_shader, 32);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_ray_t_min: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->tmin);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_ray_t_max: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->tmax);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_ray_world_origin: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->origin);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_ray_world_direction: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->direction);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_ray_instance_custom_index: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->custom_instance_and_mask);
               ret = nir_iand(&b_shader, ret, nir_imm_int(&b_shader, 0xFFFFFF));
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_primitive_id: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->primitive_id);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_ray_geometry_index: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->geometry_id_and_flags);
               ret = nir_iand(&b_shader, ret, nir_imm_int(&b_shader, 0xFFFFFFF));
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_instance_id: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->instance_id);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_ray_flags: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->flags);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_load_ray_hit_kind: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->hit_kind);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
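            /* The object<->world transforms are not tracked in rt_variables; only instance_addr
             * is. The matrices used below are (re)loaded from the radv_bvh_instance_node in the
             * acceleration structure each time one of these intrinsics is lowered. */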
            case nir_intrinsic_load_ray_world_to_object: {
               unsigned c = nir_intrinsic_column(intr);
               nir_ssa_def *instance_node_addr = nir_load_var(&b_shader, vars->instance_addr);
               nir_ssa_def *wto_matrix[3];
               nir_build_wto_matrix_load(&b_shader, instance_node_addr, wto_matrix);

               nir_ssa_def *vals[3];
               for (unsigned i = 0; i < 3; ++i)
                  vals[i] = nir_channel(&b_shader, wto_matrix[i], c);

               nir_ssa_def *val = nir_vec(&b_shader, vals, 3);
               if (c == 3)
                  val = nir_fneg(&b_shader,
                                 nir_build_vec3_mat_mult(&b_shader, val, wto_matrix, false));

               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, val);
               break;
            }
            case nir_intrinsic_load_ray_object_to_world: {
               unsigned c = nir_intrinsic_column(intr);
               nir_ssa_def *instance_node_addr = nir_load_var(&b_shader, vars->instance_addr);
               nir_ssa_def *val;
               if (c == 3) {
                  nir_ssa_def *wto_matrix[3];
                  nir_build_wto_matrix_load(&b_shader, instance_node_addr, wto_matrix);

                  nir_ssa_def *vals[3];
                  for (unsigned i = 0; i < 3; ++i)
                     vals[i] = nir_channel(&b_shader, wto_matrix[i], c);

                  val = nir_vec(&b_shader, vals, 3);
               } else {
                  val = nir_build_load_global(
                     &b_shader, 3, 32,
                     nir_iadd(&b_shader, instance_node_addr, nir_imm_int64(&b_shader, 92 + c * 12)),
                     .align_mul = 4, .align_offset = 0);
               }
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, val);
               break;
            }
            case nir_intrinsic_load_ray_object_origin: {
               nir_ssa_def *instance_node_addr = nir_load_var(&b_shader, vars->instance_addr);
               nir_ssa_def *wto_matrix[] = {
                  nir_build_load_global(
                     &b_shader, 4, 32,
                     nir_iadd(&b_shader, instance_node_addr, nir_imm_int64(&b_shader, 16)),
                     .align_mul = 64, .align_offset = 16),
                  nir_build_load_global(
                     &b_shader, 4, 32,
                     nir_iadd(&b_shader, instance_node_addr, nir_imm_int64(&b_shader, 32)),
                     .align_mul = 64, .align_offset = 32),
                  nir_build_load_global(
                     &b_shader, 4, 32,
                     nir_iadd(&b_shader, instance_node_addr, nir_imm_int64(&b_shader, 48)),
                     .align_mul = 64, .align_offset = 48)};
               nir_ssa_def *val = nir_build_vec3_mat_mult_pre(
                  &b_shader, nir_load_var(&b_shader, vars->origin), wto_matrix);
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, val);
               break;
            }
            case nir_intrinsic_load_ray_object_direction: {
               nir_ssa_def *instance_node_addr = nir_load_var(&b_shader, vars->instance_addr);
               nir_ssa_def *wto_matrix[3];
               nir_build_wto_matrix_load(&b_shader, instance_node_addr, wto_matrix);
               nir_ssa_def *val = nir_build_vec3_mat_mult(
                  &b_shader, nir_load_var(&b_shader, vars->direction), wto_matrix, false);
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, val);
               break;
            }
            case nir_intrinsic_load_intersection_opaque_amd: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_ssa_def *ret = nir_load_var(&b_shader, vars->opaque);
               nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
               break;
            }
            case nir_intrinsic_ignore_ray_intersection: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_store_var(&b_shader, vars->ahit_status, nir_imm_int(&b_shader, 1), 1);

               /* The if is a workaround to avoid having to fix up control flow manually */
               nir_push_if(&b_shader, nir_imm_true(&b_shader));
               nir_jump(&b_shader, nir_jump_return);
               nir_pop_if(&b_shader, NULL);
               break;
            }
            case nir_intrinsic_terminate_ray: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_store_var(&b_shader, vars->ahit_status, nir_imm_int(&b_shader, 2), 1);

               /* The if is a workaround to avoid having to fix up control flow manually */
               nir_push_if(&b_shader, nir_imm_true(&b_shader));
               nir_jump(&b_shader, nir_jump_return);
               nir_pop_if(&b_shader, NULL);
               break;
            }
            case nir_intrinsic_report_ray_intersection: {
               b_shader.cursor = nir_instr_remove(instr);
               nir_push_if(
                  &b_shader,
                  nir_iand(
                     &b_shader,
                     nir_flt(&b_shader, intr->src[0].ssa, nir_load_var(&b_shader, vars->tmax)),
                     nir_fge(&b_shader, intr->src[0].ssa, nir_load_var(&b_shader, vars->tmin))));
               {
                  nir_store_var(&b_shader, vars->ahit_status, nir_imm_int(&b_shader, 0), 1);
                  nir_store_var(&b_shader, vars->tmax, intr->src[0].ssa, 1);
                  nir_store_var(&b_shader, vars->hit_kind, intr->src[1].ssa, 1);
               }
               nir_pop_if(&b_shader, NULL);
               break;
            }
            default:
               break;
            }
            break;
         }
         case nir_instr_type_jump: {
            nir_jump_instr *jump = nir_instr_as_jump(instr);
            if (jump->type == nir_jump_halt) {
               b_shader.cursor = nir_instr_remove(instr);
               nir_jump(&b_shader, nir_jump_return);
            }
            break;
         }
         default:
            break;
         }
      }
   }

   nir_metadata_preserve(nir_shader_get_entrypoint(shader), nir_metadata_none);
}

static void
insert_rt_case(nir_builder *b, nir_shader *shader, const struct rt_variables *vars,
               nir_ssa_def *idx, uint32_t call_idx_base, uint32_t call_idx)
{
   struct hash_table *var_remap = _mesa_pointer_hash_table_create(NULL);

   nir_opt_dead_cf(shader);

   struct rt_variables src_vars = create_rt_variables(shader, vars->stack_sizes);
   map_rt_variables(var_remap, &src_vars, vars);

   NIR_PASS_V(shader, lower_rt_instructions, &src_vars, call_idx_base);

   NIR_PASS_V(shader, nir_opt_remove_phis);
   NIR_PASS_V(shader, nir_lower_returns);
   NIR_PASS_V(shader, nir_opt_dce);

   if (b->shader->info.stage == MESA_SHADER_ANY_HIT ||
       b->shader->info.stage == MESA_SHADER_INTERSECTION) {
      src_vars.stack_sizes[src_vars.group_idx].non_recursive_size =
         MAX2(src_vars.stack_sizes[src_vars.group_idx].non_recursive_size, shader->scratch_size);
   } else {
      src_vars.stack_sizes[src_vars.group_idx].recursive_size =
         MAX2(src_vars.stack_sizes[src_vars.group_idx].recursive_size, shader->scratch_size);
   }

   nir_push_if(b, nir_ieq(b, idx, nir_imm_int(b, call_idx)));
   nir_store_var(b, vars->main_loop_case_visited, nir_imm_bool(b, true), 1);
   nir_inline_function_impl(b, nir_shader_get_entrypoint(shader), NULL, var_remap);
   nir_pop_if(b, NULL);

   /* Adopt the instructions from the source shader, since they are merely moved, not cloned. */
   ralloc_adopt(ralloc_context(b->shader), ralloc_context(shader));

   ralloc_free(var_remap);
}
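/*
 * Outline only (the implementation below is still a stub): the helpers above are building
 * towards a single combined compute shader, roughly a loop that switches on vars.idx with one
 * insert_rt_case() per stage, where each case either loads the next SBT entry or returns to its
 * caller via the scratch stack. Indices 0 ("done", stored by rt_return_amd in raygen) and 1
 * (traversal, stored by rt_trace_ray) are handled outside the per-stage cases. This description
 * is inferred from the code in this file, not a finished design.
 */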
static nir_shader *
create_rt_shader(struct radv_device *device, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
                 struct radv_pipeline_shader_stack_size *stack_sizes)
{
   /* TODO */
   return NULL;
}

static VkResult
radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache,
                        const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
                        const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   VkResult result;
   struct radv_pipeline *pipeline = NULL;
   struct radv_pipeline_shader_stack_size *stack_sizes = NULL;

   if (pCreateInfo->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR)
      return radv_rt_pipeline_library_create(_device, _cache, pCreateInfo, pAllocator, pPipeline);

   VkRayTracingPipelineCreateInfoKHR local_create_info =
      radv_create_merged_rt_create_info(pCreateInfo);
   if (!local_create_info.pStages || !local_create_info.pGroups) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail;
   }

   stack_sizes = calloc(sizeof(*stack_sizes), local_create_info.groupCount);
   if (!stack_sizes) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail;
   }

   nir_shader *shader = create_rt_shader(device, &local_create_info, stack_sizes);
   VkComputePipelineCreateInfo compute_info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .pNext = NULL,
      .flags = pCreateInfo->flags,
      .stage =
         {
            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
            .stage = VK_SHADER_STAGE_COMPUTE_BIT,
            .module = vk_shader_module_handle_from_nir(shader),
            .pName = "main",
         },
      .layout = pCreateInfo->layout,
   };
   result = radv_compute_pipeline_create(_device, _cache, &compute_info, pAllocator, pPipeline);
   if (result != VK_SUCCESS)
      goto shader_fail;

   pipeline = radv_pipeline_from_handle(*pPipeline);
   pipeline->compute.rt_group_handles =
      calloc(sizeof(*pipeline->compute.rt_group_handles), local_create_info.groupCount);
   if (!pipeline->compute.rt_group_handles) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto shader_fail;
   }

   pipeline->compute.rt_stack_sizes = stack_sizes;
   stack_sizes = NULL;
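   /* Group handle contents: handles[0] holds the index used for general/closest-hit shaders and
    * handles[1] the one used for intersection/any-hit shaders. The +2 below keeps group indices
    * clear of 0 and 1, which the lowering above already uses (rt_return_amd in raygen stores 0,
    * rt_trace_ray stores 1). */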
   for (unsigned i = 0; i < local_create_info.groupCount; ++i) {
      const VkRayTracingShaderGroupCreateInfoKHR *group_info = &local_create_info.pGroups[i];
      switch (group_info->type) {
      case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:
         if (group_info->generalShader != VK_SHADER_UNUSED_KHR)
            pipeline->compute.rt_group_handles[i].handles[0] = i + 2;
         break;
      case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR:
         if (group_info->intersectionShader != VK_SHADER_UNUSED_KHR)
            pipeline->compute.rt_group_handles[i].handles[1] = i + 2;
         FALLTHROUGH;
      case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:
         if (group_info->closestHitShader != VK_SHADER_UNUSED_KHR)
            pipeline->compute.rt_group_handles[i].handles[0] = i + 2;
         if (group_info->anyHitShader != VK_SHADER_UNUSED_KHR)
            pipeline->compute.rt_group_handles[i].handles[1] = i + 2;
         break;
      case VK_RAY_TRACING_SHADER_GROUP_TYPE_MAX_ENUM_KHR:
         unreachable("VK_RAY_TRACING_SHADER_GROUP_TYPE_MAX_ENUM_KHR");
      }
   }

shader_fail:
   if (result != VK_SUCCESS && pipeline)
      radv_pipeline_destroy(device, pipeline, pAllocator);
   ralloc_free(shader);
fail:
   free((void *)local_create_info.pGroups);
   free((void *)local_create_info.pStages);
   free(stack_sizes);
   return result;
}

VkResult
radv_CreateRayTracingPipelinesKHR(VkDevice _device, VkDeferredOperationKHR deferredOperation,
                                  VkPipelineCache pipelineCache, uint32_t count,
                                  const VkRayTracingPipelineCreateInfoKHR *pCreateInfos,
                                  const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
{
   VkResult result = VK_SUCCESS;

   unsigned i = 0;
   for (; i < count; i++) {
      VkResult r;
      r = radv_rt_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator,
                                  &pPipelines[i]);
      if (r != VK_SUCCESS) {
         result = r;
         pPipelines[i] = VK_NULL_HANDLE;

         if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
            break;
      }
   }

   for (; i < count; ++i)
      pPipelines[i] = VK_NULL_HANDLE;

   return result;
}