radv,aco: Use function call structure for RT programs

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29580>
Natalie Vock 2025-02-17 18:42:48 +01:00 committed by Marge Bot
parent c5d796c902
commit 0a1911b220
17 changed files with 706 additions and 136 deletions


@@ -188,15 +188,12 @@ struct ac_shader_args {
/* RT */
struct {
struct ac_arg uniform_shader_addr;
struct ac_arg sbt_descriptors;
struct ac_arg launch_sizes[3];
struct ac_arg launch_size_addr;
struct ac_arg launch_ids[3];
struct ac_arg dynamic_callable_stack_base;
struct ac_arg traversal_shader_addr;
struct ac_arg shader_addr;
struct ac_arg shader_record;
struct ac_arg payload_offset;
} rt;
};


@@ -1837,7 +1837,8 @@ emit_program(Program* program, std::vector<uint32_t>& code, std::vector<struct a
(uint32_t*)(program->constant_data.data() + program->constant_data.size()));
program->config->scratch_bytes_per_wave =
align(program->config->scratch_bytes_per_wave, program->dev.scratch_alloc_granule);
align(program->config->scratch_bytes_per_wave + program->scratch_arg_size,
program->dev.scratch_alloc_granule);
program->config->wgp_mode = program->wgp_mode;
return exec_size;


@@ -575,6 +575,10 @@ kill(wait_imm& imm, depctr_wait& depctr, Instruction* instr, wait_ctx& ctx,
*/
force_waitcnt(ctx, imm);
}
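/* Rationale (assumed, not stated in the change): s_swappc_b64 transfers
 * control to separately compiled code that cannot know which of the caller's
 * memory operations are still outstanding, so every nonzero counter is waited
 * to zero before the call; counter_vs is presumably exempt because VMEM
 * stores return no data the callee could consume. */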
if (instr->opcode == aco_opcode::s_swappc_b64) {
u_foreach_bit (i, ctx.nonzero & ~counter_vs)
imm[i] = 0;
}
check_instr(ctx, imm, instr);


@@ -273,8 +273,8 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_
void
aco_compile_rt_prolog(const struct aco_compiler_options* options,
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
const struct ac_shader_args* out_args, aco_callback* build_prolog,
void** binary)
const struct ac_arg* descriptors, unsigned raygen_param_count,
nir_parameter* raygen_params, aco_callback* build_prolog, void** binary)
{
init();
@@ -285,7 +285,8 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options,
program->debug.func = NULL;
program->debug.private_data = NULL;
select_rt_prolog(program.get(), &config, options, info, in_args, out_args);
select_rt_prolog(program.get(), &config, options, info, in_args, descriptors, raygen_param_count,
raygen_params);
validate(program.get());
insert_waitcnt(program.get());
insert_NOPs(program.get());


@@ -18,6 +18,8 @@
extern "C" {
#endif
struct nir_parameter;
typedef struct nir_parameter nir_parameter;
struct ac_shader_config;
struct aco_shader_info;
struct aco_vs_prolog_info;
@@ -42,8 +44,8 @@ void aco_compile_shader(const struct aco_compiler_options* options,
void aco_compile_rt_prolog(const struct aco_compiler_options* options,
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
const struct ac_shader_args* out_args, aco_callback* build_prolog,
void** binary);
const struct ac_arg* descriptors, unsigned raygen_param_count,
nir_parameter* raygen_params, aco_callback* build_prolog, void** binary);
void aco_compile_vs_prolog(const struct aco_compiler_options* options,
const struct aco_shader_info* info,


@@ -26,6 +26,7 @@
#include <vector>
typedef struct nir_shader nir_shader;
typedef struct nir_parameter nir_parameter;
namespace aco {
@@ -2337,6 +2338,7 @@ public:
bool has_call = false;
ABI callee_abi = {};
RegisterDemand callee_param_demand = RegisterDemand();
unsigned scratch_arg_size = 0;
struct {
monotonic_buffer_resource memory;
@@ -2409,7 +2411,8 @@ void select_trap_handler_shader(Program* program, ac_shader_config* config,
void select_rt_prolog(Program* program, ac_shader_config* config,
const struct aco_compiler_options* options,
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
const struct ac_shader_args* out_args);
const struct ac_arg* descriptors, unsigned raygen_param_count,
nir_parameter* raygen_params);
void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo,
ac_shader_config* config, const struct aco_compiler_options* options,
const struct aco_shader_info* info, const struct ac_shader_args* args);


@@ -438,7 +438,10 @@ validate_ir(Program* program)
((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
(instr->isScratch() && i == 0) || (instr->isDS() && i == 0) ||
(instr->opcode == aco_opcode::p_init_scratch && i == 0) ||
(instr_disables_wqm(instr.get()) && i + 2 >= instr->operands.size());
(instr_disables_wqm(instr.get()) && i + 2 >= instr->operands.size()) ||
((instr->opcode == aco_opcode::p_return ||
instr->opcode == aco_opcode::p_reload_preserved) &&
i == 0);
check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
} else {
check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||


@@ -285,12 +285,18 @@ void create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_
const struct aco_export_mrt* mrt1);
Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset);
void build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs);
Instruction* add_startpgm(struct isel_context* ctx);
Instruction* add_startpgm(struct isel_context* ctx, bool is_callee = false);
void finish_program(isel_context* ctx);
struct callee_info get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count,
const nir_parameter* parameters, Program* program,
RegisterDemand reg_limit);
void load_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param,
Temp stack_ptr, unsigned scratch_param_size, Temp dst);
void store_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param,
Temp stack_ptr, unsigned scratch_param_size, Temp data);
void emit_reload_preserved(isel_context* ctx);
#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)


@@ -45,14 +45,8 @@ append_logical_end(isel_context* ctx, bool append_reload_preserved)
{
Builder bld(ctx->program, ctx->block);
if (append_reload_preserved && ctx->program->is_callee && ctx->block->loop_nest_depth == 0) {
Operand stack_ptr_op;
if (ctx->program->gfx_level >= GFX9)
stack_ptr_op = Operand(ctx->callee_info.stack_ptr.def.getTemp());
else
stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
bld.pseudo(aco_opcode::p_reload_preserved, bld.def(bld.lm), bld.def(s1, scc), stack_ptr_op);
}
if (append_reload_preserved && ctx->program->is_callee && ctx->block->loop_nest_depth == 0)
emit_reload_preserved(ctx);
bld.pseudo(aco_opcode::p_logical_end);
}
@@ -676,8 +670,10 @@ build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
}
Instruction*
add_startpgm(struct isel_context* ctx)
add_startpgm(struct isel_context* ctx, bool is_callee)
{
ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size;
unsigned def_count = 0;
for (unsigned i = 0; i < ctx->args->arg_count; i++) {
if (ctx->args->args[i].skip)
@@ -689,6 +685,15 @@ add_startpgm(struct isel_context* ctx)
def_count++;
}
if (is_callee) {
/* We do not support shader args in callees. */
assert(def_count == 0);
def_count += ctx->callee_info.reg_param_count;
/* Add system parameters separately - they aren't counted by reg_param_count */
assert(ctx->callee_info.stack_ptr.is_reg && ctx->callee_info.return_address.is_reg);
def_count += 2;
}
Instruction* startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
ctx->block->instructions.emplace_back(startpgm);
for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
@@ -721,6 +726,22 @@ add_startpgm(struct isel_context* ctx)
}
}
if (is_callee) {
unsigned def_idx = 0;
if (ctx->program->gfx_level >= GFX9)
ctx->program->stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
else
ctx->program->static_scratch_rsrc = ctx->callee_info.stack_ptr.def.getTemp();
startpgm->definitions[def_idx++] = ctx->callee_info.stack_ptr.def;
startpgm->definitions[def_idx++] = ctx->callee_info.return_address.def;
for (auto& info : ctx->callee_info.param_infos) {
if (!info.is_reg)
continue;
startpgm->definitions[def_idx++] = info.def;
}
}
/* epilog has no scratch */
if (ctx->args->scratch_offset.used) {
if (ctx->program->gfx_level < GFX9) {
@@ -1074,6 +1095,15 @@ get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count,
find_param_regs(program, abi, info, assignment_infos, reg_limit);
/* The call target parameters are special - they are marked as discardable to allow us
* to overwrite the parameter values within each callee for the divergent dispatch logic.
* However, we still need to explicitly write back the new values to the ABI-assigned registers
* when jumping to the next divergent callee/returning. Therefore, mark them as needing explicit
* preservation.
*/
info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC].needs_explicit_preservation = true;
info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC].needs_explicit_preservation = true;
/* Explicitly preserve the stack pointer. spill_preserved() can ensure correctness on its own,
* but it only can spill the initial stack pointer value to a linear VGPR, the inactive lanes of
* which would in turn need to be spilled to scratch. Explicitly preserving the stack pointer's
@@ -1084,4 +1114,16 @@ get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count,
return info;
}
void
emit_reload_preserved(isel_context* ctx)
{
Builder bld(ctx->program, ctx->block);
Operand stack_ptr_op;
if (ctx->program->gfx_level >= GFX9)
stack_ptr_op = Operand(ctx->program->stack_ptr);
else
stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
bld.pseudo(aco_opcode::p_reload_preserved, bld.def(bld.lm), Operand(), stack_ptr_op);
}
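/* Note (inference from the validator change above, which allows an undef
 * first operand on p_reload_preserved): the operand is deliberately left
 * undefined here; presumably a later pass that knows the set of preserved
 * registers fills it in. */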
} // namespace aco


@@ -6,6 +6,7 @@
#include "aco_instruction_selection.h"
#include "aco_interface.h"
#include "aco_nir_call_attribs.h"
#include "nir_builder.h"
#include "nir_control_flow.h"
@@ -238,8 +239,11 @@ setup_nir(isel_context* ctx, nir_shader* nir)
nir_opt_dce(nir);
}
nir_function_impl* func = nir_shader_get_entrypoint(nir);
nir_index_ssa_defs(func);
/* nir_shader_get_entrypoint returns NULL for RT shaders, but there should only be
* one impl at this stage.
*/
nir_foreach_function_impl (func, nir)
nir_index_ssa_defs(func);
}
/* Returns true if we can skip uniformization of a merge phi. This makes the destination divergent,
@@ -348,6 +352,13 @@ void
init_context(isel_context* ctx, nir_shader* shader)
{
nir_function_impl* impl = nir_shader_get_entrypoint(shader);
if (!impl) {
/* RT shaders have no NIR entrypoint, but only one function impl exists at this stage */
nir_foreach_function_impl (func, shader) {
impl = func;
break;
}
}
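/* The same "take the first and only impl" fallback appears in setup_nir above
 * and setup_isel_context below; a shared helper could look like this
 * (hypothetical sketch, not part of the change):
 *
 *    static nir_function_impl *
 *    first_function_impl(nir_shader *s)
 *    {
 *       nir_foreach_function_impl (impl, s)
 *          return impl;
 *       return NULL;
 *    }
 */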
ctx->shader = shader;
assert(shader->info.max_subgroup_size >= ctx->program->wave_size);
@@ -613,7 +624,17 @@ init_context(isel_context* ctx, nir_shader* shader)
case nir_intrinsic_ddx_fine:
case nir_intrinsic_ddy_fine:
case nir_intrinsic_ddx_coarse:
case nir_intrinsic_ddy_coarse: type = RegType::vgpr; break;
case nir_intrinsic_ddy_coarse:
case nir_intrinsic_load_return_param_amd: {
type = RegType::vgpr;
break;
}
case nir_intrinsic_load_param: {
nir_parameter* param =
&impl->function->params[nir_intrinsic_param_idx(intrinsic)];
type = param->is_uniform ? RegType::sgpr : RegType::vgpr;
break;
}
default:
for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
i++) {
@@ -773,8 +794,17 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
assert(ctx.program->config->lds_size <= ctx.program->dev.lds_limit);
unsigned nir_num_blocks = 0;
for (unsigned i = 0; i < shader_count; i++)
nir_num_blocks += nir_shader_get_entrypoint(shaders[i])->num_blocks;
for (unsigned i = 0; i < shader_count; i++) {
nir_function_impl* entrypoint = nir_shader_get_entrypoint(shaders[i]);
if (!entrypoint) {
/* RT shaders have no NIR entrypoint, but only one function impl exists at this stage */
nir_foreach_function_impl (func, shaders[i]) {
entrypoint = func;
break;
}
}
nir_num_blocks += entrypoint->num_blocks;
}
ctx.program->blocks.reserve(nir_num_blocks * 2);
ctx.block = ctx.program->create_and_insert_block();
ctx.block->kind = block_kind_top_level;


@@ -8,6 +8,7 @@
#include "aco_builder.h"
#include "aco_instruction_selection.h"
#include "aco_ir.h"
#include "aco_nir_call_attribs.h"
#include "amdgfxregs.h"
#include <array>
@@ -788,6 +789,141 @@ visit_jump(isel_context* ctx, nir_jump_instr* instr)
}
}
void
visit_call(isel_context* ctx, nir_call_instr* instr)
{
Builder bld(ctx->program, ctx->block);
ABI abi;
/* TODO: callable abi? */
switch (instr->callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) {
case ACO_NIR_CALL_ABI_RT_RECURSIVE: abi = rtRaygenABI; break;
case ACO_NIR_CALL_ABI_TRAVERSAL: abi = rtTraversalABI; break;
case ACO_NIR_CALL_ABI_AHIT_ISEC: abi = rtAnyHitABI; break;
default: UNREACHABLE("invalid abi");
}
RegisterDemand limit = get_addr_regs_from_waves(ctx->program, ctx->program->min_waves);
struct callee_info info =
get_callee_info(ctx->program->gfx_level, abi, instr->callee->num_params,
instr->callee->params, nullptr, limit);
std::vector<parameter_info> return_infos;
/* Before setting up the call itself, set up parameters stored in scratch memory.
* The stack layout during a call looks something like this:
* -------------------------------------------------------------------
* | caller stack area | callee's scratch params | callee stack area
* -------------------------------------------------------------------
* ^ caller's stack ptr ^ callee's stack ptr
*
* Since we don't know how big our own stack area is yet (spilling and register preservation may
* add to the stack size), we query the callee's stack pointer using p_callee_stack_ptr and use
* negative offsets to index into the scratch parameter area (similar to how the callee will load
* the parameters as well).
*/
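/* Illustrative example (hypothetical numbers): with info.scratch_param_size
 * == 16, a parameter at scratch_offset == 8 is stored at
 * param_stack_ptr + (8 - 16) == param_stack_ptr - 8, i.e. just below the
 * callee's stack pointer; load_scratch_param later reloads it with the same
 * negative offset. */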
Temp stack_ptr, param_stack_ptr;
if (info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9) {
param_stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), bld.def(s1, scc),
Operand::c32(info.scratch_param_size),
Operand(ctx->callee_info.stack_ptr.def.getTemp()));
stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
} else {
param_stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
Operand::c32(info.scratch_param_size));
stack_ptr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1), Operand::c32(0));
}
for (unsigned i = 0; i < info.param_infos.size(); ++i) {
if (info.param_infos[i].is_reg)
continue;
store_scratch_param(ctx, bld, info.param_infos[i], param_stack_ptr, info.scratch_param_size,
get_ssa_temp(ctx, instr->params[i].ssa));
}
unsigned extra_def_count = 1;
unsigned extra_param_count = 2;
unsigned param_size = info.scratch_param_size;
if (ctx->program->gfx_level < GFX9)
param_size *= ctx->program->wave_size;
assert(info.param_infos[0].is_reg);
Instruction* call_instr = create_instruction(aco_opcode::p_call, Format::PSEUDO_CALL,
info.reg_param_count + extra_param_count,
info.reg_discardable_param_count + extra_def_count);
call_instr->call().abi = abi;
if (ctx->program->gfx_level >= GFX9) {
call_instr->operands[0] = Operand(stack_ptr, info.stack_ptr.def.physReg());
} else {
call_instr->operands[0] = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
call_instr->operands[0].setPrecolored(info.stack_ptr.def.physReg());
}
call_instr->operands[1] = Operand::c32(param_size);
call_instr->definitions[0] = Definition(bld.tmp(s2), info.return_address.def.physReg());
/* Set up parameters stored in registers. Every parameter corresponds to an operand,
* and parameters that may have their value clobbered (i.e. discardable and return params)
* also have a definition.
*/
unsigned reg_param_idx = 0;
unsigned reg_discardable_param_idx = 0;
for (unsigned i = 0; i < info.param_infos.size(); ++i) {
if (!info.param_infos[i].is_reg) {
/* While setting up parameters, also capture information about where return parameters
* are stored, in order to reload them later.
* Since return_infos stores return parameters contiguously, and return parameters in
* scratch may be at any position in the parameter list, we need to add information about
* returned scratch parameters in the same loop as returned parameters stored in registers.
*/
if (instr->callee->params[i].is_return) {
parameter_info return_info = {};
return_info.is_reg = false;
return_info.scratch_offset = info.param_infos[i].scratch_offset;
return_infos.emplace_back(return_info);
}
continue;
}
Operand& op = call_instr->operands[reg_param_idx + extra_param_count];
op.setPrecolored(info.param_infos[i].def.physReg());
if (instr->callee->params[i].is_uniform)
op.setTemp(bld.as_uniform(get_ssa_temp(ctx, instr->params[i].ssa)));
else
op.setTemp(as_vgpr(ctx, get_ssa_temp(ctx, instr->params[i].ssa)));
if ((instr->callee->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE) ||
instr->callee->params[i].is_return) {
Definition def = bld.def(op.regClass(), op.physReg());
call_instr->definitions[extra_def_count + reg_discardable_param_idx++] = def;
if (instr->callee->params[i].is_return) {
assert(!instr->callee->params[i].is_uniform);
parameter_info return_info = {};
return_info.is_reg = true;
return_info.def = def;
return_infos.emplace_back(return_info);
}
}
++reg_param_idx;
}
ctx->block->instructions.emplace_back(static_cast<Instruction*>(call_instr));
ctx->call_infos.emplace_back(call_info{
instr,
call_instr,
std::move(return_infos),
info.scratch_param_size,
});
ctx->block->kind |= block_kind_contains_call;
ctx->program->has_call = true;
}
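/* Resulting p_call layout (derived from the code above):
 *   operands[0]      stack pointer (GFX9+) or scratch rsrc (GFX8 and earlier)
 *   operands[1]      scratch parameter area size in bytes
 *   operands[2..]    register parameters, precolored to their ABI registers
 *   definitions[0]   return address (s2)
 *   definitions[1..] clobbered (discardable/return) register parameters
 */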
void
visit_debug_info(isel_context* ctx, nir_instr_debug_info* instr_info)
{
@@ -839,6 +975,7 @@ visit_block(isel_context* ctx, nir_block* block)
case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
case nir_instr_type_deref: break;
case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
case nir_instr_type_call: visit_call(ctx, nir_instr_as_call(instr)); break;
default: isel_err(instr, "Unknown NIR instr type");
}
}
@@ -1152,32 +1289,52 @@ merged_wave_info_to_mask(isel_context* ctx, unsigned i)
}
void
insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
insert_return(isel_context& ctx)
{
unsigned src_count = 0;
for (unsigned i = 0; i < ctx.args->arg_count; i++)
src_count += !!BITSET_TEST(ctx.output_args, i);
assert(ctx.callee_info.stack_ptr.needs_explicit_preservation);
assert(
ctx.callee_info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC].needs_explicit_preservation);
assert(
ctx.callee_info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC].needs_explicit_preservation);
/* stack_ptr always needs to be explicitly preserved */
unsigned preserved_param_count = 1;
if (ctx.callee_info.return_address.needs_explicit_preservation)
++preserved_param_count;
for (auto param_info : ctx.callee_info.param_infos) {
if (!param_info.is_reg || !param_info.needs_explicit_preservation)
continue;
++preserved_param_count;
}
unsigned src_count = preserved_param_count + 1;
Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
ctx.block->instructions.emplace_back(ret);
src_count = 0;
for (unsigned i = 0; i < ctx.args->arg_count; i++) {
if (!BITSET_TEST(ctx.output_args, i))
unsigned def_idx = 0;
ret->operands[def_idx++] = Operand();
Operand stack_op = Operand(ctx.callee_info.stack_ptr.def.getTemp());
stack_op.setPrecolored(ctx.callee_info.stack_ptr.def.physReg());
ret->operands[def_idx++] = stack_op;
for (unsigned i = 0; i < ctx.callee_info.param_infos.size(); ++i) {
const auto& param_info = ctx.callee_info.param_infos[i];
if (!param_info.is_reg || !param_info.needs_explicit_preservation)
continue;
enum ac_arg_regfile file = ctx.args->args[i].file;
unsigned size = ctx.args->args[i].size;
unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
: Operand(PhysReg{reg}, type);
ret->operands[src_count] = op;
src_count++;
Temp param_temp = param_info.def.getTemp();
if (i == ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC)
param_temp = ctx.next_divergent_pc;
else if (i == ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC)
param_temp = ctx.next_pc;
Operand op = Operand(param_temp);
op.setPrecolored(param_info.def.physReg());
ret->operands[def_idx++] = op;
}
if (ctx.callee_info.return_address.needs_explicit_preservation) {
Operand op = Operand(ctx.callee_info.return_address.def.getTemp());
op.setPrecolored(ctx.callee_info.return_address.def.physReg());
ret->operands[def_idx++] = op;
}
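/* Resulting p_return operand layout (derived from the code above): [0] is
 * undef (permitted by the validator change), [1] is the stack pointer, then
 * each explicitly preserved register parameter (with next_pc and
 * next_divergent_pc substituted for the PC system args), and finally the
 * return address if it needs explicit preservation. */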
Builder bld(ctx.program, ctx.block);
bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
}
void
@@ -1194,20 +1351,45 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
init_context(&ctx, nir);
setup_fp_mode(&ctx, nir);
Instruction* startpgm = add_startpgm(&ctx);
nir_function_impl* impl = NULL;
nir_foreach_function_impl (func, nir) {
impl = func;
break;
}
ABI abi;
/* TODO: callable abi? */
switch (impl->function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) {
case ACO_NIR_CALL_ABI_RT_RECURSIVE: abi = rtRaygenABI; break;
case ACO_NIR_CALL_ABI_TRAVERSAL: abi = rtTraversalABI; break;
case ACO_NIR_CALL_ABI_AHIT_ISEC: abi = rtAnyHitABI; break;
default: UNREACHABLE("invalid abi");
}
RegisterDemand limit = get_addr_regs_from_waves(ctx.program, ctx.program->min_waves);
ctx.callee_abi = abi;
ctx.program->callee_abi = ctx.callee_abi;
ctx.callee_info =
get_callee_info(ctx.program->gfx_level, ctx.callee_abi, impl->function->num_params,
impl->function->params, ctx.program, limit);
ctx.program->is_callee = true;
Instruction* startpgm = add_startpgm(&ctx, true);
append_logical_start(ctx.block);
split_arguments(&ctx, startpgm);
visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
append_logical_end(&ctx);
visit_cf_list(&ctx, &impl->body);
/* This block doesn't need a p_reload_preserved, we add it manually after p_return */
append_logical_end(&ctx, false);
ctx.block->kind |= block_kind_uniform;
/* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
* shader without shader calls.
*/
if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
insert_rt_jump_next(ctx, args);
else
if (ctx.next_pc != Temp()) {
insert_return(ctx);
Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc));
} else {
Builder(ctx.program, ctx.block).sopp(aco_opcode::s_endpgm);
}
cleanup_context(&ctx);
}


@@ -1136,9 +1136,9 @@ get_buffer_store_op(unsigned bytes)
}
void
split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
Temp* write_datas, unsigned* offsets)
split_buffer_store(isel_context* ctx, unsigned align_mul, unsigned align_offset, bool smem,
RegType dst_type, Temp data, unsigned writemask, int swizzle_element_size,
unsigned* write_count, Temp* write_datas, unsigned* offsets)
{
unsigned write_count_with_skips = 0;
bool skips[16];
@@ -1168,11 +1168,9 @@ split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, Reg
byte = 8;
/* dword or larger stores have to be dword-aligned */
unsigned align_mul = nir_intrinsic_align_mul(instr);
unsigned align_offset = nir_intrinsic_align_offset(instr) + offset;
bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
if (!dword_aligned)
byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
byte = MIN2(byte, ((align_offset + offset) % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
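/* Worked example (hypothetical values): with align_mul == 4 and
 * align_offset == 2, a chunk at offset == 2 yields (2 + 2) % 4 == 0 and may
 * use dword stores, while a chunk at offset == 0 yields (2 + 0) % 4 == 2 and
 * is limited to 2-byte stores, since (2 + 0) % 2 == 0 && 4 % 2 == 0. */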
bytes[write_count_with_skips] = byte;
advance_write_mask(&todo, offset, byte);
@@ -2291,8 +2289,8 @@ visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, max_size, &write_count,
write_datas, offsets);
split_buffer_store(ctx, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), false,
RegType::vgpr, data, writemask, max_size, &write_count, write_datas, offsets);
/* GFX6-7 are affected by a hw bug that prevents address clamping to work
* correctly when the SGPR offset is used.
@@ -2457,8 +2455,8 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
write_datas, offsets);
split_buffer_store(ctx, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), false,
RegType::vgpr, data, writemask, 16, &write_count, write_datas, offsets);
Temp addr, offset;
uint32_t const_offset;
@@ -2830,7 +2828,8 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
split_buffer_store(ctx, intrin, false, RegType::vgpr, store_src, write_mask,
split_buffer_store(ctx, nir_intrinsic_align_mul(intrin), nir_intrinsic_align_offset(intrin),
false, RegType::vgpr, store_src, write_mask,
swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
write_datas, offsets);
@@ -3339,8 +3338,9 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
Temp write_datas[32];
unsigned offsets[32];
unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
&write_count, write_datas, offsets);
split_buffer_store(ctx, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), false,
RegType::vgpr, data, writemask, swizzle_component_size, &write_count,
write_datas, offsets);
if (ctx->program->gfx_level >= GFX9) {
uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
@@ -3890,6 +3890,106 @@ emit_ds_bvh_stack_push8_pop1_rtn(isel_context* ctx, nir_intrinsic_instr* instr,
} // namespace
void
load_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr,
unsigned scratch_param_size, Temp dst)
{
int32_t const_offset = param.scratch_offset - scratch_param_size;
LoadEmitInfo info = {Operand(v1), dst, dst.size(), 4};
info.align_mul = 4;
info.align_offset = 0;
info.cache = get_cache_flags(ctx, ACCESS_IS_SWIZZLED_AMD, ac_access_type_load);
info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
info.sync = memory_sync_info(storage_scratch, semantic_private);
if (ctx->program->gfx_level >= GFX9) {
if (const_offset < ctx->program->dev.scratch_global_offset_min) {
stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
Operand::c32(const_offset));
const_offset = 0;
}
info.offset = stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr);
info.const_offset = const_offset;
EmitLoadParameters params = scratch_flat_load_params;
params.max_const_offset = ctx->program->dev.scratch_global_offset_max;
emit_load(ctx, bld, info, params);
} else {
info.resource = load_scratch_resource(
ctx->program, bld, ctx->program->private_segment_buffers.size() - 1, false);
if (stack_ptr.id()) {
info.soffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), stack_ptr,
Operand::c32(-const_offset * ctx->program->wave_size));
} else {
info.soffset =
bld.copy(bld.def(s1), Operand::c32(-const_offset * ctx->program->wave_size));
}
emit_load(ctx, bld, info, scratch_mubuf_load_params);
}
}
void
store_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr,
unsigned scratch_param_size, Temp data)
{
int32_t const_base_offset = param.scratch_offset - scratch_param_size;
unsigned byte_size = data.bytes();
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
split_buffer_store(ctx, 4, 0, false, RegType::vgpr, as_vgpr(ctx, data),
u_bit_consecutive(0, byte_size), swizzle_component_size, &write_count,
write_datas, offsets);
if (ctx->program->gfx_level < GFX9) {
Temp scratch_rsrc = load_scratch_resource(ctx->program, bld, -1u, false);
for (unsigned i = 0; i < write_count; i++) {
Temp soffset;
if (stack_ptr.id()) {
soffset =
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), stack_ptr,
Operand::c32(-const_base_offset * ctx->program->wave_size + offsets[i]));
} else {
soffset =
bld.copy(bld.def(s1),
Operand::c32(-const_base_offset * ctx->program->wave_size + offsets[i]));
}
assert(write_datas[i].bytes() == 4);
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, scratch_rsrc, Operand(v1),
Operand(soffset), write_datas[i], 0, false);
instr->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
return;
}
for (unsigned i = 0; i < write_count; i++) {
int32_t const_offset = const_base_offset + offsets[i];
if (const_offset < ctx->program->dev.scratch_global_offset_min) {
stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
Operand::c32(const_offset));
const_offset = 0;
}
aco_opcode op;
switch (write_datas[i].bytes()) {
case 4: op = aco_opcode::scratch_store_dword; break;
case 8: op = aco_opcode::scratch_store_dwordx2; break;
case 12: op = aco_opcode::scratch_store_dwordx3; break;
case 16: op = aco_opcode::scratch_store_dwordx4; break;
default: UNREACHABLE("Unexpected param size");
}
bld.scratch(op, Operand(v1), stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr),
write_datas[i], (int16_t)const_offset,
memory_sync_info(storage_scratch, semantic_private));
}
}
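/* Note (inferred from the two paths above): on GFX8 and earlier, scratch is
 * accessed through a swizzled buffer descriptor, so the negative parameter
 * base offset is scaled by wave_size before being folded into soffset; on
 * GFX9+ the signed scratch_* instruction offset is used directly, with an
 * s_add_u32 fallback whenever it would fall below scratch_global_offset_min. */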
void
visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
{
@@ -4965,6 +5065,81 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
}
break;
}
case nir_intrinsic_set_next_call_pc_amd:
ctx->next_divergent_pc = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
ctx->next_pc = get_ssa_temp(ctx, instr->src[1].ssa);
break;
case nir_intrinsic_load_call_return_address_amd:
bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
Operand(ctx->callee_info.return_address.def.getTemp()));
break;
case nir_intrinsic_load_return_param_amd: {
call_info& info = ctx->call_infos[nir_intrinsic_call_idx(instr)];
unsigned idx = nir_intrinsic_param_idx(instr);
assert(idx < info.nir_instr->callee->num_params);
assert(info.nir_instr->callee->params[idx].is_return);
unsigned index_in_return_params = 0u;
for (unsigned i = 0; i < idx; ++i) {
if (info.nir_instr->callee->params[i].is_return)
++index_in_return_params;
}
if (info.return_info[index_in_return_params].is_reg) {
bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
Operand(info.return_info[index_in_return_params].def.getTemp()));
} else {
Temp stack_ptr;
if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9)
stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), bld.def(s1, scc),
Operand::c32(info.scratch_param_size),
Operand(ctx->callee_info.stack_ptr.def.getTemp()));
else
stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
Operand::c32(info.scratch_param_size));
load_scratch_param(ctx, bld, info.return_info[index_in_return_params], stack_ptr,
info.scratch_param_size, get_ssa_temp(ctx, &instr->def));
}
break;
}
case nir_intrinsic_load_param: {
const auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)];
Temp dst = get_ssa_temp(ctx, &instr->def);
if (param.is_reg) {
bld.copy(Definition(dst), Operand(param.def.getTemp()));
auto vec_it = ctx->allocated_vec.find(param.def.tempId());
if (vec_it != ctx->allocated_vec.end())
ctx->allocated_vec.emplace(dst.id(), vec_it->second);
} else {
Temp stack_ptr = Temp();
if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9)
stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
load_scratch_param(ctx, bld, param, stack_ptr, ctx->callee_info.scratch_param_size, dst);
}
break;
}
case nir_intrinsic_store_param_amd: {
nir_intrinsic_instr* parent = nir_def_as_intrinsic_or_null(instr->src[0].ssa);
if (parent && parent->intrinsic == nir_intrinsic_load_param &&
nir_intrinsic_param_idx(parent) == nir_intrinsic_param_idx(instr))
break;
auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)];
if (param.is_reg) {
param.def.setTemp(param.def.regClass().type() == RegType::vgpr
? as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa))
: bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)));
} else {
Temp stack_ptr = Temp();
if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9)
stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
store_scratch_param(ctx, bld, param, stack_ptr, ctx->callee_info.scratch_param_size,
get_ssa_temp(ctx, instr->src[0].ssa));
}
break;
}
default:
isel_err(&instr->instr, "Unimplemented intrinsic instr");
abort();


@@ -8,13 +8,18 @@
#include "aco_instruction_selection.h"
#include "aco_interface.h"
#include "aco_ir.h"
#include "aco_nir_call_attribs.h"
#include "ac_descriptors.h"
#include "sid.h"
namespace aco {
void
select_rt_prolog(Program* program, ac_shader_config* config,
const struct aco_compiler_options* options, const struct aco_shader_info* info,
const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
const struct ac_shader_args* in_args, const struct ac_arg* descriptors,
unsigned raygen_param_count, nir_parameter* raygen_params)
{
init_program(program, compute_cs, info, options, config);
Block* block = program->create_and_insert_block();
@@ -24,8 +29,13 @@ select_rt_prolog(Program* program, ac_shader_config* config,
calc_min_waves(program);
Builder bld(program, block);
block->instructions.reserve(32);
unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
unsigned num_sgprs = in_args->num_sgprs_used;
unsigned num_vgprs = in_args->num_vgprs_used;
RegisterDemand limit = get_addr_regs_from_waves(program, program->min_waves);
struct callee_info raygen_info = get_callee_info(program->gfx_level, rtRaygenABI,
raygen_param_count, raygen_params, NULL, limit);
/* Inputs:
* Ring offsets: s[0-1]
@@ -41,9 +51,12 @@ select_rt_prolog(Program* program, ac_shader_config* config,
* Local invocation IDs: v[0-2]
*/
PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
PhysReg in_descriptors = get_arg_reg(in_args, *descriptors);
PhysReg in_push_constants = get_arg_reg(in_args, in_args->push_constants);
PhysReg in_dynamic_descriptors = get_arg_reg(in_args, in_args->dynamic_descriptors);
PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
PhysReg in_traversal_addr = get_arg_reg(in_args, in_args->rt.traversal_shader_addr);
PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
PhysReg in_wg_id_x;
PhysReg in_wg_id_y;
PhysReg in_wg_id_z;
@@ -77,15 +90,48 @@ select_rt_prolog(Program* program, ac_shader_config* config,
* Shader VA: v[4-5]
* Shader Record Ptr: v[6-7]
*/
PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]);
PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]);
PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]);
assert(raygen_info.stack_ptr.is_reg);
assert(raygen_info.return_address.is_reg);
assert(raygen_info.param_infos[0].is_reg);
assert(raygen_info.param_infos[1].is_reg);
assert(raygen_info.param_infos[RT_ARG_LAUNCH_ID + 2].is_reg);
assert(raygen_info.param_infos[RT_ARG_LAUNCH_SIZE + 2].is_reg);
assert(raygen_info.param_infos[RT_ARG_DESCRIPTORS + 2].is_reg);
assert(raygen_info.param_infos[RT_ARG_PUSH_CONSTANTS + 2].is_reg);
assert(raygen_info.param_infos[RT_ARG_SBT_DESCRIPTORS + 2].is_reg);
assert(raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].is_reg);
assert(raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].is_reg);
PhysReg out_stack_ptr_param = raygen_info.stack_ptr.def.physReg();
PhysReg out_return_shader_addr = raygen_info.return_address.def.physReg();
PhysReg out_divergent_shader_addr = raygen_info.param_infos[0].def.physReg();
PhysReg out_uniform_shader_addr = raygen_info.param_infos[1].def.physReg();
PhysReg out_launch_size_x = raygen_info.param_infos[RT_ARG_LAUNCH_SIZE + 2].def.physReg();
PhysReg out_launch_size_y = out_launch_size_x.advance(4);
PhysReg out_launch_size_z = out_launch_size_y.advance(4);
PhysReg out_launch_ids[3];
for (unsigned i = 0; i < 3; i++)
out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]);
PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
out_launch_ids[0] = raygen_info.param_infos[RT_ARG_LAUNCH_ID + 2].def.physReg();
for (unsigned i = 1; i < 3; i++)
out_launch_ids[i] = out_launch_ids[i - 1].advance(4);
PhysReg out_descriptors = raygen_info.param_infos[RT_ARG_DESCRIPTORS + 2].def.physReg();
PhysReg out_push_constants = raygen_info.param_infos[RT_ARG_PUSH_CONSTANTS + 2].def.physReg();
PhysReg out_dynamic_descriptors =
raygen_info.param_infos[RT_ARG_DYNAMIC_DESCRIPTORS + 2].def.physReg();
PhysReg out_sbt_descriptors = raygen_info.param_infos[RT_ARG_SBT_DESCRIPTORS + 2].def.physReg();
PhysReg out_traversal_addr =
raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].def.physReg();
PhysReg out_record_ptr = raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].def.physReg();
unsigned param_idx = 0;
for (auto& param_info : raygen_info.param_infos) {
unsigned byte_size =
align(raygen_params[param_idx].bit_size, 32) / 8 * raygen_params[param_idx].num_components;
if (raygen_params[param_idx].is_uniform)
num_sgprs = std::max(num_sgprs, param_info.def.physReg().reg() + byte_size / 4);
else
num_vgprs = std::max(num_vgprs, param_info.def.physReg().reg() - 256 + byte_size / 4);
++param_idx;
}
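/* Example (hypothetical): a 3-component 32-bit VGPR parameter assigned to
 * v[8-10] has byte_size == 12, so num_vgprs becomes at least
 * (264 - 256) + 12 / 4 == 11. */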
num_sgprs = std::max(num_sgprs, raygen_info.stack_ptr.def.physReg().reg());
/* Temporaries: */
PhysReg tmp_wg_start_x = PhysReg{num_sgprs};
@@ -94,18 +140,26 @@ select_rt_prolog(Program* program, ac_shader_config* config,
num_sgprs++;
PhysReg tmp_swizzle_bound_y = PhysReg{num_sgprs};
num_sgprs++;
PhysReg tmp_wg_id_y;
if (program->gfx_level >= GFX12) {
tmp_wg_id_y = PhysReg{num_sgprs};
num_sgprs++;
} else {
tmp_wg_id_y = in_wg_id_y;
}
PhysReg tmp_wg_id_y = PhysReg{num_sgprs};
num_sgprs++;
num_sgprs = align(num_sgprs, 2);
PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
num_sgprs += 2;
PhysReg tmp_launch_size_addr = PhysReg{num_sgprs};
num_sgprs += 2;
PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
num_sgprs += 2;
PhysReg tmp_sbt_desc = PhysReg{num_sgprs};
if (program->gfx_level < GFX9)
num_sgprs += 2;
PhysReg tmp_traversal_addr = PhysReg{num_sgprs};
num_sgprs += 1;
PhysReg tmp_push_constants = PhysReg{num_sgprs};
num_sgprs++;
PhysReg tmp_descriptors = PhysReg{num_sgprs};
num_sgprs++;
PhysReg tmp_dynamic_descriptors = PhysReg{num_sgprs};
num_sgprs++;
PhysReg tmp_swizzled_id_x = PhysReg{256 + num_vgprs++};
PhysReg tmp_swizzled_id_y = PhysReg{256 + num_vgprs++};
@@ -113,40 +167,66 @@ select_rt_prolog(Program* program, ac_shader_config* config,
PhysReg tmp_swizzled_id_shifted_y = PhysReg{256 + num_vgprs++};
/* Confirm some assumptions about register aliasing */
assert(in_ring_offsets == out_uniform_shader_addr);
assert(get_arg_reg(in_args, in_args->push_constants) ==
get_arg_reg(out_args, out_args->push_constants));
assert(get_arg_reg(in_args, in_args->dynamic_descriptors) ==
get_arg_reg(out_args, out_args->dynamic_descriptors));
assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
get_arg_reg(out_args, out_args->rt.sbt_descriptors));
assert(get_arg_reg(in_args, in_args->rt.traversal_shader_addr) ==
get_arg_reg(out_args, out_args->rt.traversal_shader_addr));
assert(in_launch_size_addr == out_launch_size_x);
assert(in_stack_base == out_launch_size_z);
assert(in_local_id == out_launch_ids[0]);
/* <gfx9 reads in_scratch_offset at the end of the prolog to write out the scratch_offset
* arg. Make sure no other outputs have overwritten it by then.
*/
assert(options->gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used);
if (program->gfx_level >= GFX9) {
if (program->gfx_level < GFX12) {
assert(in_wg_id_z == out_launch_size_y);
assert(in_wg_id_y == out_launch_size_x);
}
assert(in_sbt_desc == out_sbt_descriptors);
assert(in_traversal_addr == out_descriptors);
} else {
assert(out_launch_size_x == in_wg_id_y);
assert(out_sbt_descriptors == in_launch_size_addr);
}
/* load raygen sbt */
bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
Operand::c32(0u));
bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_launch_size_addr, s2),
Operand(in_launch_size_addr, s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_traversal_addr, s1),
Operand(in_traversal_addr, s1));
/* On GFX8-, the out push constant/descriptor parameters alias WG IDs, so we copy these
* parameters only after we're done calculating the launch IDs.
*/
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_push_constants, s1),
Operand(in_push_constants, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_dynamic_descriptors, s1),
Operand(in_dynamic_descriptors, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_descriptors, s1), Operand(in_descriptors, s1));
if (options->gfx_level < GFX9)
bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_sbt_desc, s2), Operand(in_sbt_desc, s2));
/* init scratch */
if (options->gfx_level < GFX9) {
/* copy ring offsets to temporary location */
bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
Operand(in_ring_offsets, s2));
/* Unconditionally apply the scratch offset to scratch_rsrc so we just have
* to pass the rsrc through to callees.
*/
bld.sop2(aco_opcode::s_add_u32, Definition(tmp_ring_offsets, s1), Definition(scc, s1),
Operand(in_ring_offsets, s1), Operand(in_scratch_offset, s1));
bld.sop2(aco_opcode::s_addc_u32, Definition(tmp_ring_offsets.advance(4), s1),
Definition(scc, s1), Operand(in_ring_offsets.advance(4), s1), Operand::c32(0),
Operand(scc, s1));
} else if (options->gfx_level < GFX11) {
hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
Operand(in_scratch_offset, s1));
}
/* set stack ptr */
bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
/* Set up the Z launch ID and the workgroup Y ID. On GFX11 and earlier, the Y ID
 * must be backed up first, because the ray launch size load below overwrites it.
 */
if (options->gfx_level >= GFX12) {
bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
Operand(in_wg_id_y, s1));
bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(tmp_wg_id_y, s1), Operand(in_wg_id_y, s1),
Operand::c32(0));
} else {
bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_wg_id_y, s1), Operand(in_wg_id_y, s1));
}
/* load raygen address */
bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
@@ -156,22 +236,12 @@ select_rt_prolog(Program* program, ac_shader_config* config,
assert(out_launch_size_x.reg() % 4 == 0);
if (options->gfx_level >= GFX12) {
bld.smem(aco_opcode::s_load_dwordx3, Definition(out_launch_size_x, s3),
Operand(in_launch_size_addr, s2), Operand::c32(0u));
Operand(tmp_launch_size_addr, s2), Operand::c32(0u));
} else {
bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
Operand(in_launch_size_addr, s2), Operand::c32(8u));
Operand(tmp_launch_size_addr, s2), Operand::c32(8u));
bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
Operand(in_launch_size_addr, s2), Operand::c32(0u));
}
/* calculate ray launch ids */
if (options->gfx_level >= GFX12) {
bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
Operand(in_wg_id_y, s1));
bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(tmp_wg_id_y, s1), Operand(in_wg_id_y, s1),
Operand::c32(0));
} else {
bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
Operand(tmp_launch_size_addr, s2), Operand::c32(0u));
}
/* Swizzle ray launch IDs. We dispatch a 1D 32x1/64x1 workgroup natively. Many games dispatch
@@ -313,13 +383,61 @@ select_rt_prolog(Program* program, ac_shader_config* config,
bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
Operand(tmp_raygen_sbt.advance(4), s1));
if (options->gfx_level < GFX9) {
/* write scratch/ring offsets to outputs, if needed */
bld.sop1(aco_opcode::s_mov_b32,
Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
Operand(in_scratch_offset, s1));
bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
Operand(tmp_ring_offsets, s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr, s1),
Operand(tmp_traversal_addr, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr.advance(4), s1),
Operand::c32(options->address32_hi));
if (program->gfx_level < GFX8)
bld.vop3(aco_opcode::v_lshr_b64, Definition(out_divergent_shader_addr, v2),
Operand(out_uniform_shader_addr, s2), Operand::c32(0));
else
bld.vop3(aco_opcode::v_lshrrev_b64, Definition(out_divergent_shader_addr, v2),
Operand::c32(0), Operand(out_uniform_shader_addr, s2));
/* Launch IDs are calculated, so copy the push constant/sbt descriptor parameters.
* Do this here before other parameters overwrite the inputs.
*/
if (program->gfx_level < GFX9) {
bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors, s1),
Operand(tmp_sbt_desc, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors.advance(4), s1),
Operand(tmp_sbt_desc.advance(4), s1));
}
bld.sop1(aco_opcode::s_mov_b32, Definition(out_push_constants, s1),
Operand(tmp_push_constants, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_dynamic_descriptors, s1),
Operand(tmp_dynamic_descriptors, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_descriptors, s1), Operand(tmp_descriptors, s1));
bld.sop1(aco_opcode::s_mov_b64, Definition(out_return_shader_addr, s2), Operand::c32(0));
if (program->gfx_level >= GFX9) {
bld.sopk(aco_opcode::s_movk_i32, Definition(out_stack_ptr_param, s1), 0);
} else {
/* Construct the scratch_rsrc here and pass it to the callees to use directly. */
struct ac_buffer_state ac_state = {0};
uint32_t desc[4];
ac_state.size = 0xffffffff;
ac_state.format = PIPE_FORMAT_R32_FLOAT;
for (int i = 0; i < 4; i++)
ac_state.swizzle[i] = PIPE_SWIZZLE_0;
ac_state.element_size = 1u;
ac_state.index_stride = program->wave_size == 64 ? 3u : 2u;
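/* Assumed descriptor encoding: index_stride 3 selects a 64-element swizzle
 * and 2 a 32-element swizzle, i.e. one scratch slot per lane of the wave. */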
ac_state.add_tid = true;
ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
ac_build_buffer_descriptor(program->gfx_level, &ac_state, desc);
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param, s1),
Operand(tmp_ring_offsets, s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param.advance(4), s1),
Operand(tmp_ring_offsets.advance(4), s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param.advance(8), s1),
Operand::c32(desc[2]));
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param.advance(12), s1),
Operand::c32(desc[3]));
}
/* jump to raygen */


@@ -931,9 +931,13 @@ static void
compile_rt_prolog(struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct nir_function raygen_stub = {};
uint32_t push_constant_size = 0;
pipeline->prolog = radv_create_rt_prolog(device);
/* Create a dummy function signature for raygen shaders in order to pass parameter info to the prolog */
radv_nir_init_rt_function_params(&raygen_stub, MESA_SHADER_RAYGEN, 0);
radv_nir_lower_callee_signature(&raygen_stub);
pipeline->prolog = radv_create_rt_prolog(device, raygen_stub.num_params, raygen_stub.params);
/* create combined config */
struct ac_shader_config *config = &pipeline->prolog->config;


@@ -3408,13 +3408,12 @@ radv_aco_build_shader_part(void **bin, uint32_t num_sgprs, uint32_t num_vgprs, c
}
struct radv_shader *
radv_create_rt_prolog(struct radv_device *device)
radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count, nir_parameter *raygen_params)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_instance *instance = radv_physical_device_instance(pdev);
struct radv_shader *prolog;
struct radv_shader_args in_args = {0};
struct radv_shader_args out_args = {0};
struct radv_nir_compiler_options options = {0};
radv_fill_nir_compiler_options(&options, device, NULL, false, instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS,
radv_device_fault_detection_enabled(device), false);
@@ -3435,7 +3434,6 @@ radv_create_rt_prolog(struct radv_device *device)
info.cs.uses_block_id[i] = true;
radv_declare_shader_args(device, NULL, &info, MESA_SHADER_COMPUTE, MESA_SHADER_NONE, &in_args);
radv_declare_rt_shader_args(options.info->gfx_level, &out_args);
info.user_sgprs_locs = in_args.user_sgprs_locs;
#if AMD_LLVM_AVAILABLE
@@ -3449,8 +3447,8 @@ radv_create_rt_prolog(struct radv_device *device)
struct aco_compiler_options ac_opts;
radv_aco_convert_shader_info(&ac_info, &info, &in_args, &device->cache_key, options.info->gfx_level);
radv_aco_convert_opts(&ac_opts, &options, &in_args, &stage_key);
aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &out_args.ac, &radv_aco_build_shader_binary,
(void **)&binary);
aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &in_args.descriptors[0], raygen_param_count, raygen_params,
&radv_aco_build_shader_binary, (void **)&binary);
binary->info = info;
radv_postprocess_binary_config(device, binary, &in_args);


@@ -34,6 +34,8 @@ struct radv_shader_args;
struct radv_shader_args;
struct radv_serialized_shader_arena_block;
struct vk_pipeline_robustness_state;
struct nir_parameter;
typedef struct nir_parameter nir_parameter;
#define RADV_GRAPHICS_STAGE_BITS \
(VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT | VK_SHADER_STAGE_TASK_BIT_EXT)
@@ -549,7 +551,8 @@ void radv_free_shader_memory(struct radv_device *device, union radv_shader_arena
struct radv_shader *radv_create_trap_handler_shader(struct radv_device *device);
struct radv_shader *radv_create_rt_prolog(struct radv_device *device);
struct radv_shader *radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count,
nir_parameter *raygen_params);
struct radv_shader_part *radv_shader_part_create(struct radv_device *device, struct radv_shader_part_binary *binary,
unsigned wave_size);


@@ -362,6 +362,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_urb_input_handle_intel:
case nir_intrinsic_load_urb_output_handle_intel:
case nir_intrinsic_load_ray_query_global_intel:
case nir_intrinsic_load_call_return_address_amd:
is_divergent = false;
break;