radv,aco: Use function call structure for RT programs

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29580>
Natalie Vock 2025-02-17 18:42:48 +01:00 committed by Marge Bot
parent c5d796c902
commit 0a1911b220
17 changed files with 706 additions and 136 deletions


@@ -188,15 +188,12 @@ struct ac_shader_args {
/* RT */
struct {
struct ac_arg uniform_shader_addr;
struct ac_arg sbt_descriptors;
struct ac_arg launch_sizes[3];
struct ac_arg launch_size_addr;
struct ac_arg launch_ids[3];
struct ac_arg dynamic_callable_stack_base;
struct ac_arg traversal_shader_addr;
struct ac_arg shader_addr;
struct ac_arg shader_record;
struct ac_arg payload_offset;
} rt;
};


@@ -1837,7 +1837,8 @@ emit_program(Program* program, std::vector<uint32_t>& code, std::vector<struct a
(uint32_t*)(program->constant_data.data() + program->constant_data.size()));
program->config->scratch_bytes_per_wave =
align(program->config->scratch_bytes_per_wave, program->dev.scratch_alloc_granule);
align(program->config->scratch_bytes_per_wave + program->scratch_arg_size,
program->dev.scratch_alloc_granule);
program->config->wgp_mode = program->wgp_mode;
return exec_size;


@@ -575,6 +575,10 @@ kill(wait_imm& imm, depctr_wait& depctr, Instruction* instr, wait_ctx& ctx,
*/
force_waitcnt(ctx, imm);
}
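/* Rationale (assumed, not stated in the change): s_swappc_b64 transfers
 * control to separately compiled code that cannot know which of the caller's
 * memory operations are still outstanding, so every nonzero counter is waited
 * to zero before the call; counter_vs is presumably exempt because VMEM
 * stores return no data the callee could consume. */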
if (instr->opcode == aco_opcode::s_swappc_b64) {
u_foreach_bit (i, ctx.nonzero & ~counter_vs)
imm[i] = 0;
}
check_instr(ctx, imm, instr);


@@ -273,8 +273,8 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_
void
aco_compile_rt_prolog(const struct aco_compiler_options* options,
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
const struct ac_shader_args* out_args, aco_callback* build_prolog,
void** binary)
const struct ac_arg* descriptors, unsigned raygen_param_count,
nir_parameter* raygen_params, aco_callback* build_prolog, void** binary)
{
init();
@@ -285,7 +285,8 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options,
program->debug.func = NULL;
program->debug.private_data = NULL;
select_rt_prolog(program.get(), &config, options, info, in_args, out_args);
select_rt_prolog(program.get(), &config, options, info, in_args, descriptors, raygen_param_count,
raygen_params);
validate(program.get());
insert_waitcnt(program.get());
insert_NOPs(program.get());


@@ -18,6 +18,8 @@
extern "C" {
#endif
struct nir_parameter;
typedef struct nir_parameter nir_parameter;
struct ac_shader_config;
struct aco_shader_info;
struct aco_vs_prolog_info;
@@ -42,8 +44,8 @@ void aco_compile_shader(const struct aco_compiler_options* options,
void aco_compile_rt_prolog(const struct aco_compiler_options* options,
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
const struct ac_shader_args* out_args, aco_callback* build_prolog,
void** binary);
const struct ac_arg* descriptors, unsigned raygen_param_count,
nir_parameter* raygen_params, aco_callback* build_prolog, void** binary);
void aco_compile_vs_prolog(const struct aco_compiler_options* options,
const struct aco_shader_info* info,


@@ -26,6 +26,7 @@
#include <vector>
typedef struct nir_shader nir_shader;
typedef struct nir_parameter nir_parameter;
namespace aco {
@@ -2337,6 +2338,7 @@ public:
bool has_call = false;
ABI callee_abi = {};
RegisterDemand callee_param_demand = RegisterDemand();
unsigned scratch_arg_size = 0;
struct {
monotonic_buffer_resource memory;
@@ -2409,7 +2411,8 @@ void select_trap_handler_shader(Program* program, ac_shader_config* config,
void select_rt_prolog(Program* program, ac_shader_config* config,
const struct aco_compiler_options* options,
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
const struct ac_shader_args* out_args);
const struct ac_arg* descriptors, unsigned raygen_param_count,
nir_parameter* raygen_params);
void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo,
ac_shader_config* config, const struct aco_compiler_options* options,
const struct aco_shader_info* info, const struct ac_shader_args* args);


@@ -438,7 +438,10 @@ validate_ir(Program* program)
((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
(instr->isScratch() && i == 0) || (instr->isDS() && i == 0) ||
(instr->opcode == aco_opcode::p_init_scratch && i == 0) ||
(instr_disables_wqm(instr.get()) && i + 2 >= instr->operands.size());
(instr_disables_wqm(instr.get()) && i + 2 >= instr->operands.size()) ||
((instr->opcode == aco_opcode::p_return ||
instr->opcode == aco_opcode::p_reload_preserved) &&
i == 0);
check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
} else {
check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||


@@ -285,12 +285,18 @@ void create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_
const struct aco_export_mrt* mrt1);
Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset);
void build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs);
Instruction* add_startpgm(struct isel_context* ctx);
Instruction* add_startpgm(struct isel_context* ctx, bool is_callee = false);
void finish_program(isel_context* ctx);
struct callee_info get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count,
const nir_parameter* parameters, Program* program,
RegisterDemand reg_limit);
void load_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param,
Temp stack_ptr, unsigned scratch_param_size, Temp dst);
void store_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param,
Temp stack_ptr, unsigned scratch_param_size, Temp data);
void emit_reload_preserved(isel_context* ctx);
#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)


@@ -45,14 +45,8 @@ append_logical_end(isel_context* ctx, bool append_reload_preserved)
{
Builder bld(ctx->program, ctx->block);
if (append_reload_preserved && ctx->program->is_callee && ctx->block->loop_nest_depth == 0) {
Operand stack_ptr_op;
if (ctx->program->gfx_level >= GFX9)
stack_ptr_op = Operand(ctx->callee_info.stack_ptr.def.getTemp());
else
stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
bld.pseudo(aco_opcode::p_reload_preserved, bld.def(bld.lm), bld.def(s1, scc), stack_ptr_op);
}
if (append_reload_preserved && ctx->program->is_callee && ctx->block->loop_nest_depth == 0)
emit_reload_preserved(ctx);
bld.pseudo(aco_opcode::p_logical_end);
}
@@ -676,8 +670,10 @@ build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
}
Instruction*
add_startpgm(struct isel_context* ctx)
add_startpgm(struct isel_context* ctx, bool is_callee)
{
ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size;
unsigned def_count = 0;
for (unsigned i = 0; i < ctx->args->arg_count; i++) {
if (ctx->args->args[i].skip)
@@ -689,6 +685,15 @@ add_startpgm(struct isel_context* ctx)
def_count++;
}
if (is_callee) {
/* We do not support shader args in callees. */
assert(def_count == 0);
def_count += ctx->callee_info.reg_param_count;
/* Add system parameters separately - they aren't counted by reg_param_count */
assert(ctx->callee_info.stack_ptr.is_reg && ctx->callee_info.return_address.is_reg);
def_count += 2;
}
Instruction* startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
ctx->block->instructions.emplace_back(startpgm);
for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
@@ -721,6 +726,22 @@ add_startpgm(struct isel_context* ctx)
}
}
if (is_callee) {
unsigned def_idx = 0;
if (ctx->program->gfx_level >= GFX9)
ctx->program->stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
else
ctx->program->static_scratch_rsrc = ctx->callee_info.stack_ptr.def.getTemp();
startpgm->definitions[def_idx++] = ctx->callee_info.stack_ptr.def;
startpgm->definitions[def_idx++] = ctx->callee_info.return_address.def;
for (auto& info : ctx->callee_info.param_infos) {
if (!info.is_reg)
continue;
startpgm->definitions[def_idx++] = info.def;
}
}
/* epilog has no scratch */
if (ctx->args->scratch_offset.used) {
if (ctx->program->gfx_level < GFX9) {
@@ -1074,6 +1095,15 @@ get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count,
find_param_regs(program, abi, info, assignment_infos, reg_limit);
/* The call target parameters are special - they are marked as discardable to allow us
* to overwrite the parameter values within each callee for the divergent dispatch logic.
* However, we still need to explicitly write back the new values to the ABI-assigned registers
* when jumping to the next divergent callee/returning. Therefore, mark them as needing explicit
* preservation.
*/
info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC].needs_explicit_preservation = true;
info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC].needs_explicit_preservation = true;
/* Explicitly preserve the stack pointer. spill_preserved() can ensure correctness on its own,
* but it only can spill the initial stack pointer value to a linear VGPR, the inactive lanes of
* which would in turn need to be spilled to scratch. Explicitly preserving the stack pointer's
@@ -1084,4 +1114,16 @@ get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count,
return info;
}
void
emit_reload_preserved(isel_context* ctx)
{
Builder bld(ctx->program, ctx->block);
Operand stack_ptr_op;
if (ctx->program->gfx_level >= GFX9)
stack_ptr_op = Operand(ctx->program->stack_ptr);
else
stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
bld.pseudo(aco_opcode::p_reload_preserved, bld.def(bld.lm), Operand(), stack_ptr_op);
}
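/* Note (inference from the validator change above, which allows an undef
 * first operand on p_reload_preserved): the operand is deliberately left
 * undefined here; presumably a later pass that knows the set of preserved
 * registers fills it in. */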
} // namespace aco


@@ -6,6 +6,7 @@
#include "aco_instruction_selection.h"
#include "aco_interface.h"
#include "aco_nir_call_attribs.h"
#include "nir_builder.h"
#include "nir_control_flow.h"
@@ -238,8 +239,11 @@ setup_nir(isel_context* ctx, nir_shader* nir)
nir_opt_dce(nir);
}
nir_function_impl* func = nir_shader_get_entrypoint(nir);
nir_index_ssa_defs(func);
/* nir_shader_get_entrypoint returns NULL for RT shaders, but there should only be
* one impl at this stage.
*/
nir_foreach_function_impl (func, nir)
nir_index_ssa_defs(func);
}
/* Returns true if we can skip uniformization of a merge phi. This makes the destination divergent,
@@ -348,6 +352,13 @@ void
init_context(isel_context* ctx, nir_shader* shader)
{
nir_function_impl* impl = nir_shader_get_entrypoint(shader);
if (!impl) {
/* RT shaders have no NIR entrypoint, but only one function impl exists at this stage */
nir_foreach_function_impl (func, shader) {
impl = func;
break;
}
}
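/* The same "take the first and only impl" fallback appears in setup_nir above
 * and setup_isel_context below; a shared helper could look like this
 * (hypothetical sketch, not part of the change):
 *
 *    static nir_function_impl *
 *    first_function_impl(nir_shader *s)
 *    {
 *       nir_foreach_function_impl (impl, s)
 *          return impl;
 *       return NULL;
 *    }
 */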
ctx->shader = shader;
assert(shader->info.max_subgroup_size >= ctx->program->wave_size);
@@ -613,7 +624,17 @@ init_context(isel_context* ctx, nir_shader* shader)
case nir_intrinsic_ddx_fine:
case nir_intrinsic_ddy_fine:
case nir_intrinsic_ddx_coarse:
case nir_intrinsic_ddy_coarse: type = RegType::vgpr; break;
case nir_intrinsic_ddy_coarse:
case nir_intrinsic_load_return_param_amd: {
type = RegType::vgpr;
break;
}
case nir_intrinsic_load_param: {
nir_parameter* param =
&impl->function->params[nir_intrinsic_param_idx(intrinsic)];
type = param->is_uniform ? RegType::sgpr : RegType::vgpr;
break;
}
default:
for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
i++) {
@@ -773,8 +794,17 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
assert(ctx.program->config->lds_size <= ctx.program->dev.lds_limit);
unsigned nir_num_blocks = 0;
for (unsigned i = 0; i < shader_count; i++)
nir_num_blocks += nir_shader_get_entrypoint(shaders[i])->num_blocks;
for (unsigned i = 0; i < shader_count; i++) {
nir_function_impl* entrypoint = nir_shader_get_entrypoint(shaders[i]);
if (!entrypoint) {
/* RT shaders have no NIR entrypoint, but only one function impl exists at this stage */
nir_foreach_function_impl (func, shaders[i]) {
entrypoint = func;
break;
}
}
nir_num_blocks += entrypoint->num_blocks;
}
ctx.program->blocks.reserve(nir_num_blocks * 2);
ctx.block = ctx.program->create_and_insert_block();
ctx.block->kind = block_kind_top_level;


@@ -8,6 +8,7 @@
#include "aco_builder.h"
#include "aco_instruction_selection.h"
#include "aco_ir.h"
#include "aco_nir_call_attribs.h"
#include "amdgfxregs.h"
#include <array>
@@ -788,6 +789,141 @@ visit_jump(isel_context* ctx, nir_jump_instr* instr)
}
}
void
visit_call(isel_context* ctx, nir_call_instr* instr)
{
Builder bld(ctx->program, ctx->block);
ABI abi;
/* TODO: callable abi? */
switch (instr->callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) {
case ACO_NIR_CALL_ABI_RT_RECURSIVE: abi = rtRaygenABI; break;
case ACO_NIR_CALL_ABI_TRAVERSAL: abi = rtTraversalABI; break;
case ACO_NIR_CALL_ABI_AHIT_ISEC: abi = rtAnyHitABI; break;
default: UNREACHABLE("invalid abi");
}
RegisterDemand limit = get_addr_regs_from_waves(ctx->program, ctx->program->min_waves);
struct callee_info info =
get_callee_info(ctx->program->gfx_level, abi, instr->callee->num_params,
instr->callee->params, nullptr, limit);
std::vector<parameter_info> return_infos;
/* Before setting up the call itself, set up parameters stored in scratch memory.
* The stack layout during a call looks something like this:
* -------------------------------------------------------------------
* | caller stack area | callee's scratch params | callee stack area
* -------------------------------------------------------------------
* ^ caller's stack ptr ^ callee's stack ptr
*
* Since we don't know how big our own stack area is yet (spilling and register preservation may
* add to the stack size), we query the callee's stack pointer using p_callee_stack_ptr and use
* negative offsets to index into the scratch parameter area (similar to how the callee will load
* the parameters as well).
*/
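/* Illustrative example (hypothetical numbers): with info.scratch_param_size
 * == 16, a parameter at scratch_offset == 8 is stored at
 * param_stack_ptr + (8 - 16) == param_stack_ptr - 8, i.e. just below the
 * callee's stack pointer; load_scratch_param later reloads it with the same
 * negative offset. */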
Temp stack_ptr, param_stack_ptr;
if (info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9) {
param_stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), bld.def(s1, scc),
Operand::c32(info.scratch_param_size),
Operand(ctx->callee_info.stack_ptr.def.getTemp()));
stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
} else {
param_stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
Operand::c32(info.scratch_param_size));
stack_ptr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1), Operand::c32(0));
}
for (unsigned i = 0; i < info.param_infos.size(); ++i) {
if (info.param_infos[i].is_reg)
continue;
store_scratch_param(ctx, bld, info.param_infos[i], param_stack_ptr, info.scratch_param_size,
get_ssa_temp(ctx, instr->params[i].ssa));
}
unsigned extra_def_count = 1;
unsigned extra_param_count = 2;
unsigned param_size = info.scratch_param_size;
if (ctx->program->gfx_level < GFX9)
param_size *= ctx->program->wave_size;
assert(info.param_infos[0].is_reg);
Instruction* call_instr = create_instruction(aco_opcode::p_call, Format::PSEUDO_CALL,
info.reg_param_count + extra_param_count,
info.reg_discardable_param_count + extra_def_count);
call_instr->call().abi = abi;
if (ctx->program->gfx_level >= GFX9) {
call_instr->operands[0] = Operand(stack_ptr, info.stack_ptr.def.physReg());
} else {
call_instr->operands[0] = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
call_instr->operands[0].setPrecolored(info.stack_ptr.def.physReg());
}
call_instr->operands[1] = Operand::c32(param_size);
call_instr->definitions[0] = Definition(bld.tmp(s2), info.return_address.def.physReg());
/* Set up parameters stored in registers. Every parameter corresponds to an operand,
* and parameters that may have their value clobbered (i.e. discardable and return params)
* also have a definition.
*/
unsigned reg_param_idx = 0;
unsigned reg_discardable_param_idx = 0;
for (unsigned i = 0; i < info.param_infos.size(); ++i) {
if (!info.param_infos[i].is_reg) {
/* While setting up parameters, also capture information about where return parameters
* are stored, in order to reload them later.
* Since return_infos stores return parameters contiguously, and return parameters in
* scratch may be at any position in the parameter list, we need to add information about
* returned scratch parameters in the same loop as returned parameters stored in registers.
*/
if (instr->callee->params[i].is_return) {
parameter_info return_info = {};
return_info.is_reg = false;
return_info.scratch_offset = info.param_infos[i].scratch_offset;
return_infos.emplace_back(return_info);
}
continue;
}
Operand& op = call_instr->operands[reg_param_idx + extra_param_count];
op.setPrecolored(info.param_infos[i].def.physReg());
if (instr->callee->params[i].is_uniform)
op.setTemp(bld.as_uniform(get_ssa_temp(ctx, instr->params[i].ssa)));
else
op.setTemp(as_vgpr(ctx, get_ssa_temp(ctx, instr->params[i].ssa)));
if ((instr->callee->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE) ||
instr->callee->params[i].is_return) {
Definition def = bld.def(op.regClass(), op.physReg());
call_instr->definitions[extra_def_count + reg_discardable_param_idx++] = def;
if (instr->callee->params[i].is_return) {
assert(!instr->callee->params[i].is_uniform);
parameter_info return_info = {};
return_info.is_reg = true;
return_info.def = def;
return_infos.emplace_back(return_info);
}
}
++reg_param_idx;
}
ctx->block->instructions.emplace_back(static_cast<Instruction*>(call_instr));
ctx->call_infos.emplace_back(call_info{
instr,
call_instr,
std::move(return_infos),
info.scratch_param_size,
});
ctx->block->kind |= block_kind_contains_call;
ctx->program->has_call = true;
}
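/* Resulting p_call layout (derived from the code above):
 *   operands[0]      stack pointer (GFX9+) or scratch rsrc (GFX8 and earlier)
 *   operands[1]      scratch parameter area size in bytes
 *   operands[2..]    register parameters, precolored to their ABI registers
 *   definitions[0]   return address (s2)
 *   definitions[1..] clobbered (discardable/return) register parameters
 */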
void
visit_debug_info(isel_context* ctx, nir_instr_debug_info* instr_info)
{
@@ -839,6 +975,7 @@ visit_block(isel_context* ctx, nir_block* block)
case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
case nir_instr_type_deref: break;
case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
case nir_instr_type_call: visit_call(ctx, nir_instr_as_call(instr)); break;
default: isel_err(instr, "Unknown NIR instr type");
}
}
@@ -1152,32 +1289,52 @@ merged_wave_info_to_mask(isel_context* ctx, unsigned i)
}
void
insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
insert_return(isel_context& ctx)
{
unsigned src_count = 0;
for (unsigned i = 0; i < ctx.args->arg_count; i++)
src_count += !!BITSET_TEST(ctx.output_args, i);
assert(ctx.callee_info.stack_ptr.needs_explicit_preservation);
assert(
ctx.callee_info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC].needs_explicit_preservation);
assert(
ctx.callee_info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC].needs_explicit_preservation);
/* stack_ptr always needs to be explicitly preserved */
unsigned preserved_param_count = 1;
if (ctx.callee_info.return_address.needs_explicit_preservation)
++preserved_param_count;
for (auto param_info : ctx.callee_info.param_infos) {
if (!param_info.is_reg || !param_info.needs_explicit_preservation)
continue;
++preserved_param_count;
}
unsigned src_count = preserved_param_count + 1;
Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
ctx.block->instructions.emplace_back(ret);
src_count = 0;
for (unsigned i = 0; i < ctx.args->arg_count; i++) {
if (!BITSET_TEST(ctx.output_args, i))
unsigned def_idx = 0;
ret->operands[def_idx++] = Operand();
Operand stack_op = Operand(ctx.callee_info.stack_ptr.def.getTemp());
stack_op.setPrecolored(ctx.callee_info.stack_ptr.def.physReg());
ret->operands[def_idx++] = stack_op;
for (unsigned i = 0; i < ctx.callee_info.param_infos.size(); ++i) {
const auto& param_info = ctx.callee_info.param_infos[i];
if (!param_info.is_reg || !param_info.needs_explicit_preservation)
continue;
enum ac_arg_regfile file = ctx.args->args[i].file;
unsigned size = ctx.args->args[i].size;
unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
: Operand(PhysReg{reg}, type);
ret->operands[src_count] = op;
src_count++;
Temp param_temp = param_info.def.getTemp();
if (i == ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC)
param_temp = ctx.next_divergent_pc;
else if (i == ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC)
param_temp = ctx.next_pc;
Operand op = Operand(param_temp);
op.setPrecolored(param_info.def.physReg());
ret->operands[def_idx++] = op;
}
if (ctx.callee_info.return_address.needs_explicit_preservation) {
Operand op = Operand(ctx.callee_info.return_address.def.getTemp());
op.setPrecolored(ctx.callee_info.return_address.def.physReg());
ret->operands[def_idx++] = op;
}
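/* Resulting p_return operand layout (derived from the code above): [0] is
 * undef (permitted by the validator change), [1] is the stack pointer, then
 * each explicitly preserved register parameter (with next_pc and
 * next_divergent_pc substituted for the PC system args), and finally the
 * return address if it needs explicit preservation. */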
Builder bld(ctx.program, ctx.block);
bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
}
void
@@ -1194,20 +1351,45 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
init_context(&ctx, nir);
setup_fp_mode(&ctx, nir);
Instruction* startpgm = add_startpgm(&ctx);
nir_function_impl* impl = NULL;
nir_foreach_function_impl (func, nir) {
impl = func;
break;
}
ABI abi;
/* TODO: callable abi? */
switch (impl->function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) {
case ACO_NIR_CALL_ABI_RT_RECURSIVE: abi = rtRaygenABI; break;
case ACO_NIR_CALL_ABI_TRAVERSAL: abi = rtTraversalABI; break;
case ACO_NIR_CALL_ABI_AHIT_ISEC: abi = rtAnyHitABI; break;
default: UNREACHABLE("invalid abi");
}
RegisterDemand limit = get_addr_regs_from_waves(ctx.program, ctx.program->min_waves);
ctx.callee_abi = abi;
ctx.program->callee_abi = ctx.callee_abi;
ctx.callee_info =
get_callee_info(ctx.program->gfx_level, ctx.callee_abi, impl->function->num_params,
impl->function->params, ctx.program, limit);
ctx.program->is_callee = true;
Instruction* startpgm = add_startpgm(&ctx, true);
append_logical_start(ctx.block);
split_arguments(&ctx, startpgm);
visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
append_logical_end(&ctx);
visit_cf_list(&ctx, &impl->body);
/* This block doesn't need a p_reload_preserved, we add it manually after p_return */
append_logical_end(&ctx, false);
ctx.block->kind |= block_kind_uniform;
/* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
* shader without shader calls.
*/
if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
insert_rt_jump_next(ctx, args);
else
if (ctx.next_pc != Temp()) {
insert_return(ctx);
Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc));
} else {
Builder(ctx.program, ctx.block).sopp(aco_opcode::s_endpgm);
}
cleanup_context(&ctx);
}


@@ -1136,9 +1136,9 @@ get_buffer_store_op(unsigned bytes)
}
void
split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
Temp* write_datas, unsigned* offsets)
split_buffer_store(isel_context* ctx, unsigned align_mul, unsigned align_offset, bool smem,
RegType dst_type, Temp data, unsigned writemask, int swizzle_element_size,
unsigned* write_count, Temp* write_datas, unsigned* offsets)
{
unsigned write_count_with_skips = 0;
bool skips[16];
@@ -1168,11 +1168,9 @@ split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, Reg
byte = 8;
/* dword or larger stores have to be dword-aligned */
unsigned align_mul = nir_intrinsic_align_mul(instr);
unsigned align_offset = nir_intrinsic_align_offset(instr) + offset;
bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
if (!dword_aligned)
byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
byte = MIN2(byte, ((align_offset + offset) % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
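/* Worked example (hypothetical values): with align_mul == 4 and
 * align_offset == 2, a chunk at offset == 2 yields (2 + 2) % 4 == 0 and may
 * use dword stores, while a chunk at offset == 0 yields (2 + 0) % 4 == 2 and
 * is limited to 2-byte stores, since (2 + 0) % 2 == 0 && 4 % 2 == 0. */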
bytes[write_count_with_skips] = byte;
advance_write_mask(&todo, offset, byte);
@@ -2291,8 +2289,8 @@ visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, max_size, &write_count,
write_datas, offsets);
split_buffer_store(ctx, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), false,
RegType::vgpr, data, writemask, max_size, &write_count, write_datas, offsets);
/* GFX6-7 are affected by a hw bug that prevents address clamping to work
* correctly when the SGPR offset is used.
@@ -2457,8 +2455,8 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
write_datas, offsets);
split_buffer_store(ctx, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), false,
RegType::vgpr, data, writemask, 16, &write_count, write_datas, offsets);
Temp addr, offset;
uint32_t const_offset;
@@ -2830,7 +2828,8 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
split_buffer_store(ctx, intrin, false, RegType::vgpr, store_src, write_mask,
split_buffer_store(ctx, nir_intrinsic_align_mul(intrin), nir_intrinsic_align_offset(intrin),
false, RegType::vgpr, store_src, write_mask,
swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
write_datas, offsets);
@@ -3339,8 +3338,9 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
Temp write_datas[32];
unsigned offsets[32];
unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
&write_count, write_datas, offsets);
split_buffer_store(ctx, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), false,
RegType::vgpr, data, writemask, swizzle_component_size, &write_count,
write_datas, offsets);
if (ctx->program->gfx_level >= GFX9) {
uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
@@ -3890,6 +3890,106 @@ emit_ds_bvh_stack_push8_pop1_rtn(isel_context* ctx, nir_intrinsic_instr* instr,
} // namespace
void
load_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr,
unsigned scratch_param_size, Temp dst)
{
int32_t const_offset = param.scratch_offset - scratch_param_size;
LoadEmitInfo info = {Operand(v1), dst, dst.size(), 4};
info.align_mul = 4;
info.align_offset = 0;
info.cache = get_cache_flags(ctx, ACCESS_IS_SWIZZLED_AMD, ac_access_type_load);
info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
info.sync = memory_sync_info(storage_scratch, semantic_private);
if (ctx->program->gfx_level >= GFX9) {
if (const_offset < ctx->program->dev.scratch_global_offset_min) {
stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
Operand::c32(const_offset));
const_offset = 0;
}
info.offset = stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr);
info.const_offset = const_offset;
EmitLoadParameters params = scratch_flat_load_params;
params.max_const_offset = ctx->program->dev.scratch_global_offset_max;
emit_load(ctx, bld, info, params);
} else {
info.resource = load_scratch_resource(
ctx->program, bld, ctx->program->private_segment_buffers.size() - 1, false);
if (stack_ptr.id()) {
info.soffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), stack_ptr,
Operand::c32(-const_offset * ctx->program->wave_size));
} else {
info.soffset =
bld.copy(bld.def(s1), Operand::c32(-const_offset * ctx->program->wave_size));
}
emit_load(ctx, bld, info, scratch_mubuf_load_params);
}
}
void
store_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr,
unsigned scratch_param_size, Temp data)
{
int32_t const_base_offset = param.scratch_offset - scratch_param_size;
unsigned byte_size = data.bytes();
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
split_buffer_store(ctx, 4, 0, false, RegType::vgpr, as_vgpr(ctx, data),
u_bit_consecutive(0, byte_size), swizzle_component_size, &write_count,
write_datas, offsets);
if (ctx->program->gfx_level < GFX9) {
Temp scratch_rsrc = load_scratch_resource(ctx->program, bld, -1u, false);
for (unsigned i = 0; i < write_count; i++) {
Temp soffset;
if (stack_ptr.id()) {
soffset =
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), stack_ptr,
Operand::c32(-const_base_offset * ctx->program->wave_size + offsets[i]));
} else {
soffset =
bld.copy(bld.def(s1),
Operand::c32(-const_base_offset * ctx->program->wave_size + offsets[i]));
}
assert(write_datas[i].bytes() == 4);
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, scratch_rsrc, Operand(v1),
Operand(soffset), write_datas[i], 0, false);
instr->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
return;
}
for (unsigned i = 0; i < write_count; i++) {
int32_t const_offset = const_base_offset + offsets[i];
if (const_offset < ctx->program->dev.scratch_global_offset_min) {
stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
Operand::c32(const_offset));
const_offset = 0;
}
aco_opcode op;
switch (write_datas[i].bytes()) {
case 4: op = aco_opcode::scratch_store_dword; break;
case 8: op = aco_opcode::scratch_store_dwordx2; break;
case 12: op = aco_opcode::scratch_store_dwordx3; break;
case 16: op = aco_opcode::scratch_store_dwordx4; break;
default: UNREACHABLE("Unexpected param size");
}
bld.scratch(op, Operand(v1), stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr),
write_datas[i], (int16_t)const_offset,
memory_sync_info(storage_scratch, semantic_private));
}
}
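/* Note (inferred from the two paths above): on GFX8 and earlier, scratch is
 * accessed through a swizzled buffer descriptor, so the negative parameter
 * base offset is scaled by wave_size before being folded into soffset; on
 * GFX9+ the signed scratch_* instruction offset is used directly, with an
 * s_add_u32 fallback whenever it would fall below scratch_global_offset_min. */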
void
visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
{
@@ -4965,6 +5065,81 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
}
break;
}
case nir_intrinsic_set_next_call_pc_amd:
ctx->next_divergent_pc = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
ctx->next_pc = get_ssa_temp(ctx, instr->src[1].ssa);
break;
case nir_intrinsic_load_call_return_address_amd:
bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
Operand(ctx->callee_info.return_address.def.getTemp()));
break;
case nir_intrinsic_load_return_param_amd: {
call_info& info = ctx->call_infos[nir_intrinsic_call_idx(instr)];
unsigned idx = nir_intrinsic_param_idx(instr);
assert(idx < info.nir_instr->callee->num_params);
assert(info.nir_instr->callee->params[idx].is_return);
unsigned index_in_return_params = 0u;
for (unsigned i = 0; i < idx; ++i) {
if (info.nir_instr->callee->params[i].is_return)
++index_in_return_params;
}
if (info.return_info[index_in_return_params].is_reg) {
bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
Operand(info.return_info[index_in_return_params].def.getTemp()));
} else {
Temp stack_ptr;
if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9)
stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), bld.def(s1, scc),
Operand::c32(info.scratch_param_size),
Operand(ctx->callee_info.stack_ptr.def.getTemp()));
else
stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
Operand::c32(info.scratch_param_size));
load_scratch_param(ctx, bld, info.return_info[index_in_return_params], stack_ptr,
info.scratch_param_size, get_ssa_temp(ctx, &instr->def));
}
break;
}
case nir_intrinsic_load_param: {
const auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)];
Temp dst = get_ssa_temp(ctx, &instr->def);
if (param.is_reg) {
bld.copy(Definition(dst), Operand(param.def.getTemp()));
auto vec_it = ctx->allocated_vec.find(param.def.tempId());
if (vec_it != ctx->allocated_vec.end())
ctx->allocated_vec.emplace(dst.id(), vec_it->second);
} else {
Temp stack_ptr = Temp();
if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9)
stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
load_scratch_param(ctx, bld, param, stack_ptr, ctx->callee_info.scratch_param_size, dst);
}
break;
}
case nir_intrinsic_store_param_amd: {
nir_intrinsic_instr* parent = nir_def_as_intrinsic_or_null(instr->src[0].ssa);
if (parent && parent->intrinsic == nir_intrinsic_load_param &&
nir_intrinsic_param_idx(parent) == nir_intrinsic_param_idx(instr))
break;
auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)];
if (param.is_reg) {
param.def.setTemp(param.def.regClass().type() == RegType::vgpr
? as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa))
: bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)));
} else {
Temp stack_ptr = Temp();
if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9)
stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
store_scratch_param(ctx, bld, param, stack_ptr, ctx->callee_info.scratch_param_size,
get_ssa_temp(ctx, instr->src[0].ssa));
}
break;
}
default:
isel_err(&instr->instr, "Unimplemented intrinsic instr");
abort();


@@ -8,13 +8,18 @@
#include "aco_instruction_selection.h"
#include "aco_interface.h"
#include "aco_ir.h"
#include "aco_nir_call_attribs.h"
#include "ac_descriptors.h"
#include "sid.h"
namespace aco {
void
select_rt_prolog(Program* program, ac_shader_config* config,
const struct aco_compiler_options* options, const struct aco_shader_info* info,
const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
const struct ac_shader_args* in_args, const struct ac_arg* descriptors,
unsigned raygen_param_count, nir_parameter* raygen_params)
{
init_program(program, compute_cs, info, options, config);
Block* block = program->create_and_insert_block();
@@ -24,8 +29,13 @@ select_rt_prolog(Program* program, ac_shader_config* config,
calc_min_waves(program);
Builder bld(program, block);
block->instructions.reserve(32);
unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
unsigned num_sgprs = in_args->num_sgprs_used;
unsigned num_vgprs = in_args->num_vgprs_used;
RegisterDemand limit = get_addr_regs_from_waves(program, program->min_waves);
struct callee_info raygen_info = get_callee_info(program->gfx_level, rtRaygenABI,
raygen_param_count, raygen_params, NULL, limit);
/* Inputs:
* Ring offsets: s[0-1]
@@ -41,9 +51,12 @@ select_rt_prolog(Program* program, ac_shader_config* config,
* Local invocation IDs: v[0-2]
*/
PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
PhysReg in_descriptors = get_arg_reg(in_args, *descriptors);
PhysReg in_push_constants = get_arg_reg(in_args, in_args->push_constants);
PhysReg in_dynamic_descriptors = get_arg_reg(in_args, in_args->dynamic_descriptors);
PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
PhysReg in_traversal_addr = get_arg_reg(in_args, in_args->rt.traversal_shader_addr);
PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
PhysReg in_wg_id_x;
PhysReg in_wg_id_y;
PhysReg in_wg_id_z;
@@ -77,15 +90,48 @@ select_rt_prolog(Program* program, ac_shader_config* config,
* Shader VA: v[4-5]
* Shader Record Ptr: v[6-7]
*/
PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]);
PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]);
PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]);
assert(raygen_info.stack_ptr.is_reg);
assert(raygen_info.return_address.is_reg);
assert(raygen_info.param_infos[0].is_reg);
assert(raygen_info.param_infos[1].is_reg);
assert(raygen_info.param_infos[RT_ARG_LAUNCH_ID + 2].is_reg);
assert(raygen_info.param_infos[RT_ARG_LAUNCH_SIZE + 2].is_reg);
assert(raygen_info.param_infos[RT_ARG_DESCRIPTORS + 2].is_reg);
assert(raygen_info.param_infos[RT_ARG_PUSH_CONSTANTS + 2].is_reg);
assert(raygen_info.param_infos[RT_ARG_SBT_DESCRIPTORS + 2].is_reg);
assert(raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].is_reg);
assert(raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].is_reg);
PhysReg out_stack_ptr_param = raygen_info.stack_ptr.def.physReg();
PhysReg out_return_shader_addr = raygen_info.return_address.def.physReg();
PhysReg out_divergent_shader_addr = raygen_info.param_infos[0].def.physReg();
PhysReg out_uniform_shader_addr = raygen_info.param_infos[1].def.physReg();
PhysReg out_launch_size_x = raygen_info.param_infos[RT_ARG_LAUNCH_SIZE + 2].def.physReg();
PhysReg out_launch_size_y = out_launch_size_x.advance(4);
PhysReg out_launch_size_z = out_launch_size_y.advance(4);
PhysReg out_launch_ids[3];
for (unsigned i = 0; i < 3; i++)
out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]);
PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
out_launch_ids[0] = raygen_info.param_infos[RT_ARG_LAUNCH_ID + 2].def.physReg();
for (unsigned i = 1; i < 3; i++)
out_launch_ids[i] = out_launch_ids[i - 1].advance(4);
PhysReg out_descriptors = raygen_info.param_infos[RT_ARG_DESCRIPTORS + 2].def.physReg();
PhysReg out_push_constants = raygen_info.param_infos[RT_ARG_PUSH_CONSTANTS + 2].def.physReg();
PhysReg out_dynamic_descriptors =
raygen_info.param_infos[RT_ARG_DYNAMIC_DESCRIPTORS + 2].def.physReg();
PhysReg out_sbt_descriptors = raygen_info.param_infos[RT_ARG_SBT_DESCRIPTORS + 2].def.physReg();
PhysReg out_traversal_addr =
raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].def.physReg();
PhysReg out_record_ptr = raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].def.physReg();
unsigned param_idx = 0;
for (auto& param_info : raygen_info.param_infos) {
unsigned byte_size =
align(raygen_params[param_idx].bit_size, 32) / 8 * raygen_params[param_idx].num_components;
if (raygen_params[param_idx].is_uniform)
num_sgprs = std::max(num_sgprs, param_info.def.physReg().reg() + byte_size / 4);
else
num_vgprs = std::max(num_vgprs, param_info.def.physReg().reg() - 256 + byte_size / 4);
++param_idx;
}
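/* Example (hypothetical): a 3-component 32-bit VGPR parameter assigned to
 * v[8-10] has byte_size == 12, so num_vgprs becomes at least
 * (264 - 256) + 12 / 4 == 11. */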
num_sgprs = std::max(num_sgprs, raygen_info.stack_ptr.def.physReg().reg());
/* Temporaries: */
PhysReg tmp_wg_start_x = PhysReg{num_sgprs};
@@ -94,18 +140,26 @@ select_rt_prolog(Program* program, ac_shader_config* config,
num_sgprs++;
PhysReg tmp_swizzle_bound_y = PhysReg{num_sgprs};
num_sgprs++;
PhysReg tmp_wg_id_y;
if (program->gfx_level >= GFX12) {
tmp_wg_id_y = PhysReg{num_sgprs};
num_sgprs++;
} else {
tmp_wg_id_y = in_wg_id_y;
}
PhysReg tmp_wg_id_y = PhysReg{num_sgprs};
num_sgprs++;
num_sgprs = align(num_sgprs, 2);
PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
num_sgprs += 2;
PhysReg tmp_launch_size_addr = PhysReg{num_sgprs};
num_sgprs += 2;
PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
num_sgprs += 2;
PhysReg tmp_sbt_desc = PhysReg{num_sgprs};
if (program->gfx_level < GFX9)
num_sgprs += 2;
PhysReg tmp_traversal_addr = PhysReg{num_sgprs};
num_sgprs += 1;
PhysReg tmp_push_constants = PhysReg{num_sgprs};
num_sgprs++;
PhysReg tmp_descriptors = PhysReg{num_sgprs};
num_sgprs++;
PhysReg tmp_dynamic_descriptors = PhysReg{num_sgprs};
num_sgprs++;
PhysReg tmp_swizzled_id_x = PhysReg{256 + num_vgprs++};
PhysReg tmp_swizzled_id_y = PhysReg{256 + num_vgprs++};
@@ -113,40 +167,66 @@ select_rt_prolog(Program* program, ac_shader_config* config,
PhysReg tmp_swizzled_id_shifted_y = PhysReg{256 + num_vgprs++};
/* Confirm some assumptions about register aliasing */
assert(in_ring_offsets == out_uniform_shader_addr);
assert(get_arg_reg(in_args, in_args->push_constants) ==
get_arg_reg(out_args, out_args->push_constants));
assert(get_arg_reg(in_args, in_args->dynamic_descriptors) ==
get_arg_reg(out_args, out_args->dynamic_descriptors));
assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
get_arg_reg(out_args, out_args->rt.sbt_descriptors));
assert(get_arg_reg(in_args, in_args->rt.traversal_shader_addr) ==
get_arg_reg(out_args, out_args->rt.traversal_shader_addr));
assert(in_launch_size_addr == out_launch_size_x);
assert(in_stack_base == out_launch_size_z);
assert(in_local_id == out_launch_ids[0]);
/* <gfx9 reads in_scratch_offset at the end of the prolog to write out the scratch_offset
* arg. Make sure no other outputs have overwritten it by then.
*/
assert(options->gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used);
if (program->gfx_level >= GFX9) {
if (program->gfx_level < GFX12) {
assert(in_wg_id_z == out_launch_size_y);
assert(in_wg_id_y == out_launch_size_x);
}
assert(in_sbt_desc == out_sbt_descriptors);
assert(in_traversal_addr == out_descriptors);
} else {
assert(out_launch_size_x == in_wg_id_y);
assert(out_sbt_descriptors == in_launch_size_addr);
}
/* load raygen sbt */
bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
Operand::c32(0u));
bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_launch_size_addr, s2),
Operand(in_launch_size_addr, s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_traversal_addr, s1),
Operand(in_traversal_addr, s1));
/* On GFX8-, the out push constant/descriptor parameters alias WG IDs, so we copy these
* parameters only after we're done calculating the launch IDs.
*/
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_push_constants, s1),
Operand(in_push_constants, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_dynamic_descriptors, s1),
Operand(in_dynamic_descriptors, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_descriptors, s1), Operand(in_descriptors, s1));
if (options->gfx_level < GFX9)
bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_sbt_desc, s2), Operand(in_sbt_desc, s2));
/* init scratch */
if (options->gfx_level < GFX9) {
/* copy ring offsets to temporary location */
bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
Operand(in_ring_offsets, s2));
/* Unconditionally apply the scratch offset to scratch_rsrc so we just have
* to pass the rsrc through to callees.
*/
bld.sop2(aco_opcode::s_add_u32, Definition(tmp_ring_offsets, s1), Definition(scc, s1),
Operand(in_ring_offsets, s1), Operand(in_scratch_offset, s1));
bld.sop2(aco_opcode::s_addc_u32, Definition(tmp_ring_offsets.advance(4), s1),
Definition(scc, s1), Operand(in_ring_offsets.advance(4), s1), Operand::c32(0),
Operand(scc, s1));
} else if (options->gfx_level < GFX11) {
hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
Operand(in_scratch_offset, s1));
}
/* set stack ptr */
bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
/* Set up the Z launch ID and the workgroup Y ID. On GFX11 and earlier, the Y ID
 * must be backed up first, because the ray launch size load below overwrites it.
 */
if (options->gfx_level >= GFX12) {
bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
Operand(in_wg_id_y, s1));
bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(tmp_wg_id_y, s1), Operand(in_wg_id_y, s1),
Operand::c32(0));
} else {
bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_wg_id_y, s1), Operand(in_wg_id_y, s1));
}
/* load raygen address */
bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
@@ -156,22 +236,12 @@ select_rt_prolog(Program* program, ac_shader_config* config,
assert(out_launch_size_x.reg() % 4 == 0);
if (options->gfx_level >= GFX12) {
bld.smem(aco_opcode::s_load_dwordx3, Definition(out_launch_size_x, s3),
Operand(in_launch_size_addr, s2), Operand::c32(0u));
Operand(tmp_launch_size_addr, s2), Operand::c32(0u));
} else {
bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
Operand(in_launch_size_addr, s2), Operand::c32(8u));
Operand(tmp_launch_size_addr, s2), Operand::c32(8u));
bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
Operand(in_launch_size_addr, s2), Operand::c32(0u));
}
/* calculate ray launch ids */
if (options->gfx_level >= GFX12) {
bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
Operand(in_wg_id_y, s1));
bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(tmp_wg_id_y, s1), Operand(in_wg_id_y, s1),
Operand::c32(0));
} else {
bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
Operand(tmp_launch_size_addr, s2), Operand::c32(0u));
}
/* Swizzle ray launch IDs. We dispatch a 1D 32x1/64x1 workgroup natively. Many games dispatch
@@ -313,13 +383,61 @@ select_rt_prolog(Program* program, ac_shader_config* config,
bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
Operand(tmp_raygen_sbt.advance(4), s1));
if (options->gfx_level < GFX9) {
/* write scratch/ring offsets to outputs, if needed */
bld.sop1(aco_opcode::s_mov_b32,
Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
Operand(in_scratch_offset, s1));
bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
Operand(tmp_ring_offsets, s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr, s1),
Operand(tmp_traversal_addr, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr.advance(4), s1),
Operand::c32(options->address32_hi));
if (program->gfx_level < GFX8)
bld.vop3(aco_opcode::v_lshr_b64, Definition(out_divergent_shader_addr, v2),
Operand(out_uniform_shader_addr, s2), Operand::c32(0));
else
bld.vop3(aco_opcode::v_lshrrev_b64, Definition(out_divergent_shader_addr, v2),
Operand::c32(0), Operand(out_uniform_shader_addr, s2));
/* Launch IDs are calculated, so copy the push constant/sbt descriptor parameters.
* Do this here before other parameters overwrite the inputs.
*/
if (program->gfx_level < GFX9) {
bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors, s1),
Operand(tmp_sbt_desc, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors.advance(4), s1),
Operand(tmp_sbt_desc.advance(4), s1));
}
bld.sop1(aco_opcode::s_mov_b32, Definition(out_push_constants, s1),
Operand(tmp_push_constants, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_dynamic_descriptors, s1),
Operand(tmp_dynamic_descriptors, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_descriptors, s1), Operand(tmp_descriptors, s1));
bld.sop1(aco_opcode::s_mov_b64, Definition(out_return_shader_addr, s2), Operand::c32(0));
if (program->gfx_level >= GFX9) {
bld.sopk(aco_opcode::s_movk_i32, Definition(out_stack_ptr_param, s1), 0);
} else {
/* Construct the scratch_rsrc here and pass it to the callees to use directly. */
struct ac_buffer_state ac_state = {0};
uint32_t desc[4];
ac_state.size = 0xffffffff;
ac_state.format = PIPE_FORMAT_R32_FLOAT;
for (int i = 0; i < 4; i++)
ac_state.swizzle[i] = PIPE_SWIZZLE_0;
ac_state.element_size = 1u;
ac_state.index_stride = program->wave_size == 64 ? 3u : 2u;
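/* Assumed descriptor encoding: index_stride 3 selects a 64-element swizzle
 * and 2 a 32-element swizzle, i.e. one scratch slot per lane of the wave. */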
ac_state.add_tid = true;
ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
ac_build_buffer_descriptor(program->gfx_level, &ac_state, desc);
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param, s1),
Operand(tmp_ring_offsets, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param.advance(4), s1),
Operand(tmp_ring_offsets.advance(4), s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param.advance(8), s1),
Operand::c32(desc[2]));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param.advance(12), s1),
Operand::c32(desc[3]));
}
/* jump to raygen */


@@ -931,9 +931,13 @@ static void
compile_rt_prolog(struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct nir_function raygen_stub = {};
uint32_t push_constant_size = 0;
pipeline->prolog = radv_create_rt_prolog(device);
/* Create a dummy function signature for raygen shaders in order to pass parameter info to the prolog */
radv_nir_init_rt_function_params(&raygen_stub, MESA_SHADER_RAYGEN, 0);
radv_nir_lower_callee_signature(&raygen_stub);
pipeline->prolog = radv_create_rt_prolog(device, raygen_stub.num_params, raygen_stub.params);
/* create combined config */
struct ac_shader_config *config = &pipeline->prolog->config;


@@ -3408,13 +3408,12 @@ radv_aco_build_shader_part(void **bin, uint32_t num_sgprs, uint32_t num_vgprs, c
}
struct radv_shader *
radv_create_rt_prolog(struct radv_device *device)
radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count, nir_parameter *raygen_params)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_instance *instance = radv_physical_device_instance(pdev);
struct radv_shader *prolog;
struct radv_shader_args in_args = {0};
struct radv_shader_args out_args = {0};
struct radv_nir_compiler_options options = {0};
radv_fill_nir_compiler_options(&options, device, NULL, false, instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS,
radv_device_fault_detection_enabled(device), false);
@@ -3435,7 +3434,6 @@ radv_create_rt_prolog(struct radv_device *device)
info.cs.uses_block_id[i] = true;
radv_declare_shader_args(device, NULL, &info, MESA_SHADER_COMPUTE, MESA_SHADER_NONE, &in_args);
radv_declare_rt_shader_args(options.info->gfx_level, &out_args);
info.user_sgprs_locs = in_args.user_sgprs_locs;
#if AMD_LLVM_AVAILABLE
@@ -3449,8 +3447,8 @@ radv_create_rt_prolog(struct radv_device *device)
struct aco_compiler_options ac_opts;
radv_aco_convert_shader_info(&ac_info, &info, &in_args, &device->cache_key, options.info->gfx_level);
radv_aco_convert_opts(&ac_opts, &options, &in_args, &stage_key);
aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &out_args.ac, &radv_aco_build_shader_binary,
(void **)&binary);
aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &in_args.descriptors[0], raygen_param_count, raygen_params,
&radv_aco_build_shader_binary, (void **)&binary);
binary->info = info;
radv_postprocess_binary_config(device, binary, &in_args);


@@ -34,6 +34,8 @@ struct radv_shader_args;
struct radv_shader_args;
struct radv_serialized_shader_arena_block;
struct vk_pipeline_robustness_state;
struct nir_parameter;
typedef struct nir_parameter nir_parameter;
#define RADV_GRAPHICS_STAGE_BITS \
(VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT | VK_SHADER_STAGE_TASK_BIT_EXT)
@@ -549,7 +551,8 @@ void radv_free_shader_memory(struct radv_device *device, union radv_shader_arena
struct radv_shader *radv_create_trap_handler_shader(struct radv_device *device);
struct radv_shader *radv_create_rt_prolog(struct radv_device *device);
struct radv_shader *radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count,
nir_parameter *raygen_params);
struct radv_shader_part *radv_shader_part_create(struct radv_device *device, struct radv_shader_part_binary *binary,
unsigned wave_size);


@@ -362,6 +362,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_urb_input_handle_intel:
case nir_intrinsic_load_urb_output_handle_intel:
case nir_intrinsic_load_ray_query_global_intel:
case nir_intrinsic_load_call_return_address_amd:
is_divergent = false;
break;